iudex-da 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,32 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ bin/iudex-da-generate-test-data
7
+ bin/iudex-da-import
8
+ bin/iudex-da-simhash-dump
9
+ bin/iudex-migrate
10
+ config/config.rb
11
+ db/0010_base_urls.rb
12
+ db/0020_add_feed_metadata.rb
13
+ db/0021_more_feed_text.rb
14
+ db/0030_add_priority.rb
15
+ db/0040_add_visit_after.rb
16
+ db/0050_add_cache_location.rb
17
+ db/0060_url_indexes.rb
18
+ db/0070_add_created_at.rb
19
+ db/0080_add_simhash.rb
20
+ lib/iudex-da/base.rb
21
+ lib/iudex-da.rb
22
+ lib/iudex-da/ar.rb
23
+ lib/iudex-da/config.rb
24
+ lib/iudex-da/factory_helper.rb
25
+ lib/iudex-da/importer.rb
26
+ lib/iudex-da/key_helper.rb
27
+ lib/iudex-da/pool_data_source_factory.rb
28
+ test/setup.rb
29
+ test/test_migrate.rb
30
+ test/test_poll_work.rb
31
+ test/test_pool_factory.rb
32
+ lib/iudex-da/iudex-da-1.0.0.jar
data/README.rdoc ADDED
@@ -0,0 +1,30 @@
1
+ = iudex-da
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-da gem provides a PostgreSQL-based content
9
+ meta-data store and work priority queue.
10
+
11
+ == PostgreSQL Setup
12
+
13
+ % createuser iudex
14
+ % createdb iudex_test -O iudex
15
+
16
+ == License
17
+
18
+ Copyright (c) 2008-2011 David Kellum
19
+
20
+ Licensed under the Apache License, Version 2.0 (the "License"); you
21
+ may not use this file except in compliance with the License. You may
22
+ obtain a copy of the License at
23
+
24
+ http://www.apache.org/licenses/LICENSE-2.0
25
+
26
+ Unless required by applicable law or agreed to in writing, software
27
+ distributed under the License is distributed on an "AS IS" BASIS,
28
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
29
+ implied. See the License for the specific language governing
30
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,49 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+ require 'iudex-da/base'
5
+
6
+ require 'rubygems'
7
+ gem 'rjack-tarpit', '~> 1.2'
8
+ require 'rjack-tarpit'
9
+
10
+ t = RJack::TarPit.new( 'iudex-da',
11
+ Iudex::DA::VERSION,
12
+ :no_assembly, :java_platform )
13
+
14
+ t.specify do |h|
15
+ h.developer( "David Kellum", "dek-oss@gravitext.com" )
16
+
17
+ h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
18
+ [ 'activerecord', '~> 2.3.10' ],
19
+ [ 'jdbc-postgres', '>= 8.4.702', '< 9.1' ],
20
+ [ 'activerecord-jdbcpostgresql-adapter', '~> 1.1.0' ],
21
+ [ 'rjack-commons-dbcp', '~> 1.4.0' ],
22
+ [ 'rjack-commons-dbutils', '~> 1.3.0' ] ]
23
+
24
+ h.testlib = :minitest
25
+ h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
26
+ [ 'rjack-logback', '~> 1.0' ] ]
27
+ end
28
+
29
+ file 'Manifest.txt' => "lib/#{t.name}/base.rb"
30
+
31
+ task :check_pom_version do
32
+ t.test_line_match( 'pom.xml', /<version>/, /#{t.version}/ )
33
+ end
34
+ task :check_history_version do
35
+ t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
36
+ end
37
+ task :check_history_date do
38
+ t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
39
+ end
40
+
41
+ task :gem => [ :check_pom_version, :check_history_version ]
42
+ task :tag => [ :check_pom_version, :check_history_version, :check_history_date ]
43
+ task :push => [ :check_history_date ]
44
+
45
+ # Disable verbose warnings, which are a bit much with ActiveRecord
46
+ # 2.3.x at least.
47
+ Hoe::RUBY_FLAGS.sub!( /\-w(\s|$)/, '-W1\1' )
48
+
49
+ t.define_tasks
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+
23
+ require 'rjack-logback'
24
+ RJack::Logback.config_console( :thread => true )
25
+
26
+ require 'iudex-da'
27
+ require 'iudex-da/pool_data_source_factory'
28
+ require 'optparse'
29
+
30
+ class Generator
31
+ include Iudex::DA
32
+ include Gravitext::HTMap
33
+
34
+ import 'iudex.core.VisitURL'
35
+ import 'iudex.core.ContentKeys'
36
+ import 'org.apache.commons.dbutils.ResultSetHandler'
37
+ import 'org.apache.commons.dbutils.QueryRunner'
38
+ import 'iudex.da.ContentMapper'
39
+ import 'iudex.da.ContentWriter'
40
+
41
+ # 26^3 or 17,576 hosts
42
+ def hosts
43
+ h = []
44
+ r = ('a'..'z')
45
+ r.each do |i|
46
+ r.each do |j|
47
+ r.each do |k|
48
+ h << i.to_s + j + k + ".com"
49
+ end
50
+ end
51
+ end
52
+ h
53
+ end
54
+
55
+ def initialize
56
+ @factory = PoolDataSourceFactory.new
57
+ @data_source = @factory.create
58
+
59
+ @kmap = ContentMapper.new( [ ContentMapper::UHASH,
60
+ ContentMapper::HOST,
61
+ ContentKeys::URL,
62
+ ContentKeys::TYPE,
63
+ ContentKeys::PRIORITY,
64
+ ContentKeys::NEXT_VISIT_AFTER ] )
65
+
66
+ UniMap.define_accessors
67
+ end
68
+
69
+ def clear
70
+ qrun = QueryRunner.new( @data_source )
71
+ qrun.update( "DELETE from urls;" )
72
+ end
73
+
74
+ def write( parts = 3 )
75
+ writer = ContentWriter.new( @data_source, @kmap )
76
+ hs = hosts
77
+ sl = hs.length / parts
78
+ threads = []
79
+ while( ( h = hs.slice!( 0, sl ) ).length > 0 )
80
+ threads << Thread.new( h, writer ) do |hi, out|
81
+ now = Time.now
82
+ batch = []
83
+ hi.each do |host|
84
+ (rand(50) + 1).times do |i|
85
+ c = UniMap.new
86
+ c.priority = ( rand * 9.99 + 0.01 )
87
+ c.url = VisitURL.normalize( "http://#{host}/#{i}" )
88
+ c.type = "FEED"
89
+ c.next_visit_after = now + ( rand(3) * 60 * 60 * 24 * 100 ) # 2/3 are in future
90
+ batch << c
91
+ if batch.length >= 10_000
92
+ out.write( batch )
93
+ batch.clear
94
+ end
95
+ end
96
+ end
97
+ out.write( batch ) unless batch.empty?
98
+ end
99
+ end
100
+ threads.each { |t| t.join }
101
+ end
102
+
103
+ end
104
+
105
+ Hooker.log_with { |m| RJack::SLF4J[ 'iudex' ].info( m.rstrip ) }
106
+
107
+ OptionParser.new do |opts|
108
+ opts.on( "-s", "--set name=value", String,
109
+ "Set connect prop (ex: database=iudex)" ) do |nv|
110
+ name,value = nv.split('=').map { |t| t.strip }
111
+ Hooker.add( [ :iudex, :connect_props ] ) do
112
+ { name.to_sym => value }
113
+ end
114
+ end
115
+ opts.on( "-d", "--debug" ) do
116
+ RJack::Logback[ 'iudex.da' ].level = RJack::Logback::DEBUG
117
+ end
118
+ opts.on( "-v", "--version", "Display version and exit" ) do |file|
119
+ puts "iudex-da: #{Iudex::DA::VERSION}"
120
+ exit 1
121
+ end
122
+ Hooker.register_config( opts )
123
+ end.parse!
124
+
125
+ g = Generator.new
126
+ g.clear
127
+ g.write
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ module IudexBinScript
22
+
23
+ require 'rubygems'
24
+ require 'optparse'
25
+
26
+ require 'rjack-logback'
27
+ include RJack
28
+ Logback.config_console( :mdc => "uhash" )
29
+
30
+ require 'iudex-da'
31
+ require 'iudex-da/importer'
32
+
33
+ include Iudex
34
+
35
+ Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
36
+
37
+ OptionParser.new do |opts|
38
+ opts.banner = <<END
39
+ iudex-da: #{Iudex::DA::VERSION}
40
+ Usage: iudex-da-import [Options] ImportCSV...
41
+ END
42
+ opts.on( "-s", "--set name=value", String,
43
+ "Set connect prop (ex: database=iudex)" ) do |nv|
44
+ name,value = nv.split('=').map { |t| t.strip }
45
+ Hooker.add( [ :iudex, :connect_props ] ) do
46
+ { name.to_sym => value }
47
+ end
48
+ end
49
+ opts.on( "-d", "--debug" ) do
50
+ Logback[ 'iudex' ].level = Logback::DEBUG
51
+ end
52
+ opts.on( "-v", "--version", "Display version and exit" ) do |file|
53
+ puts "iudex-da: #{Iudex::DA::VERSION}"
54
+ exit 1
55
+ end
56
+ Hooker.register_config( opts )
57
+ end.parse!
58
+
59
+ importer = DA::Importer.new
60
+ importer.import_files( ARGV )
61
+
62
+ end
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+
23
+ require 'rjack-logback'
24
+ RJack::Logback.config_console
25
+
26
+ require 'iudex-da/pool_data_source_factory'
27
+ require 'optparse'
28
+
29
+ class Iudex::DA::SimHashDumper
30
+ import 'org.apache.commons.dbutils.ResultSetHandler'
31
+ import 'org.apache.commons.dbutils.QueryRunner'
32
+
33
+ include RJack
34
+
35
+ def initialize
36
+ @log = SLF4J[ self.class ]
37
+ end
38
+
39
+ def parse_options
40
+ Hooker.log_with { |m| @log.info( m.rstrip ) }
41
+
42
+ # Apply all config before including ar
43
+ parser = OptionParser.new do |opts|
44
+ opts.banner = <<END
45
+ Usage: iudex-da-simhash-dump [options] <output-file>
46
+ END
47
+ opts.on( "-s", "--set name=value", String,
48
+ "Set connect prop (ex: database=iudex)" ) do |nv|
49
+ name,value = nv.split('=').map { |t| t.strip }
50
+ Hooker.add( [ :iudex, :connect_props ] ) do
51
+ { name.to_sym => value }
52
+ end
53
+ end
54
+ opts.on( "-d", "--debug" ) do
55
+ Logback[ 'iudex.da' ].level = Logback::DEBUG
56
+ end
57
+ opts.on( "-v", "--version", "Display version and exit" ) do |file|
58
+ puts "iudex-da: #{Iudex::DA::VERSION}"
59
+ exit 1
60
+ end
61
+ Hooker.register_config( opts )
62
+ end
63
+ parser.parse!
64
+ end
65
+
66
+ def run
67
+ parse_options
68
+
69
+ unless ARGV.empty?
70
+ @factory = Iudex::DA::PoolDataSourceFactory.new
71
+ @data_source = @factory.create
72
+ dump
73
+ end
74
+ end
75
+
76
+ class UniWriter
77
+ include ResultSetHandler
78
+ def initialize( log, fout )
79
+ @fout = fout
80
+ @log = log
81
+ end
82
+ def handle( rset )
83
+ cnt = 0
84
+ while rset.next
85
+ @fout.puts rset.get_string( 1 )
86
+ cnt += 1
87
+ @log.info( "Wrote #{cnt} simhashes." ) if ( cnt % 100000 ) == 0
88
+ end
89
+ @log.info( "Wrote #{cnt} simhashes total." )
90
+ end
91
+ end
92
+
93
+ def dump( file = ARGV.first )
94
+
95
+ File.open( file, 'w' ) do |fout|
96
+ qrun = QueryRunner.new( @data_source )
97
+ hdlr = UniWriter.new( @log, fout )
98
+
99
+ qrun.query( <<-"SQL", hdlr )
100
+ SELECT lpad( to_hex( simhash ), 16, '0' ) AS sh
101
+ FROM urls
102
+ WHERE simhash IS NOT NULL AND status IN (200, 304)
103
+ ORDER BY last_visit ASC;
104
+ SQL
105
+
106
+ #FIXME: WHERE pass (accepted state) instead of status?
107
+ end
108
+ end
109
+
110
+ end
111
+
112
+ Iudex::DA::SimHashDumper.new.run
data/bin/iudex-migrate ADDED
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'optparse'
23
+
24
+ module IudexBinScript
25
+
26
+ require 'rjack-logback'
27
+ include RJack
28
+ Logback.config_console
29
+
30
+ # Note: Avoid loading iudex-da with its jar dependency which would
31
+ # make it hard to boostrap the db from source alone.
32
+ # Instead load only nessary core, base, config, and ar (post config):
33
+ require 'iudex-core'
34
+ require 'iudex-da/base'
35
+ require 'iudex-da/config'
36
+ include Iudex
37
+
38
+ Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
39
+
40
+ # Apply all config before including ar
41
+ OptionParser.new do |opts|
42
+ opts.banner = <<END
43
+ Usage: iudex-migrate [options] [target-migration-number]
44
+ END
45
+ opts.on( "-s", "--set name=value", String,
46
+ "Set connect prop (ex: database=iudex)" ) do |nv|
47
+ name,value = nv.split('=').map { |t| t.strip }
48
+ Hooker.add( [ :iudex, :connect_props ] ) do
49
+ { name.to_sym => value }
50
+ end
51
+ end
52
+ opts.on( "-d", "--debug" ) do
53
+ Logback[ 'iudex.da' ].level = Logback::DEBUG
54
+ end
55
+ opts.on( "-v", "--version", "Display version and exit" ) do |file|
56
+ puts "iudex-da: #{DA::VERSION}"
57
+ exit 1
58
+ end
59
+ Hooker.register_config( opts )
60
+ end.parse!
61
+
62
+ require 'iudex-da/ar'
63
+
64
+ DA::migrate( ARGV[0] && ARGV[0].to_i )
65
+
66
+ end