iudex-da 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,32 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ bin/iudex-da-generate-test-data
7
+ bin/iudex-da-import
8
+ bin/iudex-da-simhash-dump
9
+ bin/iudex-migrate
10
+ config/config.rb
11
+ db/0010_base_urls.rb
12
+ db/0020_add_feed_metadata.rb
13
+ db/0021_more_feed_text.rb
14
+ db/0030_add_priority.rb
15
+ db/0040_add_visit_after.rb
16
+ db/0050_add_cache_location.rb
17
+ db/0060_url_indexes.rb
18
+ db/0070_add_created_at.rb
19
+ db/0080_add_simhash.rb
20
+ lib/iudex-da/base.rb
21
+ lib/iudex-da.rb
22
+ lib/iudex-da/ar.rb
23
+ lib/iudex-da/config.rb
24
+ lib/iudex-da/factory_helper.rb
25
+ lib/iudex-da/importer.rb
26
+ lib/iudex-da/key_helper.rb
27
+ lib/iudex-da/pool_data_source_factory.rb
28
+ test/setup.rb
29
+ test/test_migrate.rb
30
+ test/test_poll_work.rb
31
+ test/test_pool_factory.rb
32
+ lib/iudex-da/iudex-da-1.0.0.jar
data/README.rdoc ADDED
@@ -0,0 +1,30 @@
1
+ = iudex-da
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-da gem provides a PostgreSQL-based content
9
+ meta-data store and work priority queue.
10
+
11
+ == PostgreSQL Setup
12
+
13
+ % createuser iudex
14
+ % createdb iudex_test -O iudex
15
+
16
+ == License
17
+
18
+ Copyright (c) 2008-2011 David Kellum
19
+
20
+ Licensed under the Apache License, Version 2.0 (the "License"); you
21
+ may not use this file except in compliance with the License. You may
22
+ obtain a copy of the License at
23
+
24
+ http://www.apache.org/licenses/LICENSE-2.0
25
+
26
+ Unless required by applicable law or agreed to in writing, software
27
+ distributed under the License is distributed on an "AS IS" BASIS,
28
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
29
+ implied. See the License for the specific language governing
30
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,49 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+ require 'iudex-da/base'
5
+
6
+ require 'rubygems'
7
+ gem 'rjack-tarpit', '~> 1.2'
8
+ require 'rjack-tarpit'
9
+
10
# Gem specification for iudex-da via rjack-tarpit, plus release sanity
# checks which are hooked in as prerequisites of the :gem, :tag and
# :push tasks.
t = RJack::TarPit.new( 'iudex-da',
                       Iudex::DA::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )

  h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
                    [ 'activerecord', '~> 2.3.10' ],
                    [ 'jdbc-postgres', '>= 8.4.702', '< 9.1' ],
                    [ 'activerecord-jdbcpostgresql-adapter', '~> 1.1.0' ],
                    [ 'rjack-commons-dbcp', '~> 1.4.0' ],
                    [ 'rjack-commons-dbutils', '~> 1.3.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

file 'Manifest.txt' => "lib/#{t.name}/base.rb"

# The version is matched literally: without Regexp.escape the dots in
# e.g. "1.0.0" would match any character.
# NOTE(review): test_line_match presumably checks that the line
# selected by the first pattern also matches the second -- confirm
# against rjack-tarpit.
version_literal = Regexp.escape( t.version.to_s )

task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{version_literal}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{version_literal} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

# Disable verbose warnings, which are a bit much with ActiveRecord
# 2.3.x at least.
Hoe::RUBY_FLAGS.sub!( /\-w(\s|$)/, '-W1\1' )

t.define_tasks
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+
23
+ require 'rjack-logback'
24
+ RJack::Logback.config_console( :thread => true )
25
+
26
+ require 'iudex-da'
27
+ require 'iudex-da/pool_data_source_factory'
28
+ require 'optparse'
29
+
30
+ class Generator
31
+ include Iudex::DA
32
+ include Gravitext::HTMap
33
+
34
+ import 'iudex.core.VisitURL'
35
+ import 'iudex.core.ContentKeys'
36
+ import 'org.apache.commons.dbutils.ResultSetHandler'
37
+ import 'org.apache.commons.dbutils.QueryRunner'
38
+ import 'iudex.da.ContentMapper'
39
+ import 'iudex.da.ContentWriter'
40
+
41
+ # 26^3 or 17,576 hosts
42
+ def hosts
43
+ h = []
44
+ r = ('a'..'z')
45
+ r.each do |i|
46
+ r.each do |j|
47
+ r.each do |k|
48
+ h << i.to_s + j + k + ".com"
49
+ end
50
+ end
51
+ end
52
+ h
53
+ end
54
+
55
+ def initialize
56
+ @factory = PoolDataSourceFactory.new
57
+ @data_source = @factory.create
58
+
59
+ @kmap = ContentMapper.new( [ ContentMapper::UHASH,
60
+ ContentMapper::HOST,
61
+ ContentKeys::URL,
62
+ ContentKeys::TYPE,
63
+ ContentKeys::PRIORITY,
64
+ ContentKeys::NEXT_VISIT_AFTER ] )
65
+
66
+ UniMap.define_accessors
67
+ end
68
+
69
+ def clear
70
+ qrun = QueryRunner.new( @data_source )
71
+ qrun.update( "DELETE from urls;" )
72
+ end
73
+
74
+ def write( parts = 3 )
75
+ writer = ContentWriter.new( @data_source, @kmap )
76
+ hs = hosts
77
+ sl = hs.length / parts
78
+ threads = []
79
+ while( ( h = hs.slice!( 0, sl ) ).length > 0 )
80
+ threads << Thread.new( h, writer ) do |hi, out|
81
+ now = Time.now
82
+ batch = []
83
+ hi.each do |host|
84
+ (rand(50) + 1).times do |i|
85
+ c = UniMap.new
86
+ c.priority = ( rand * 9.99 + 0.01 )
87
+ c.url = VisitURL.normalize( "http://#{host}/#{i}" )
88
+ c.type = "FEED"
89
+ c.next_visit_after = now + ( rand(3) * 60 * 60 * 24 * 100 ) # 2/3 are in future
90
+ batch << c
91
+ if batch.length >= 10_000
92
+ out.write( batch )
93
+ batch.clear
94
+ end
95
+ end
96
+ end
97
+ out.write( batch ) unless batch.empty?
98
+ end
99
+ end
100
+ threads.each { |t| t.join }
101
+ end
102
+
103
+ end
104
+
105
# Command line handling for iudex-da-generate-test-data: apply
# options, then clear the urls table and fill it with generated rows.
Hooker.log_with { |m| RJack::SLF4J[ 'iudex' ].info( m.rstrip ) }

OptionParser.new do |opts|
  opts.on( "-s", "--set name=value", String,
           "Set connect prop (ex: database=iudex)" ) do |nv|
    # Split on the first '=' only, so that a value may itself
    # contain '=' without being truncated.
    name, value = nv.split( '=', 2 ).map { |t| t.strip }
    Hooker.add( [ :iudex, :connect_props ] ) do
      { name.to_sym => value }
    end
  end
  opts.on( "-d", "--debug" ) do
    RJack::Logback[ 'iudex.da' ].level = RJack::Logback::DEBUG
  end
  opts.on( "-v", "--version", "Display version and exit" ) do
    puts "iudex-da: #{Iudex::DA::VERSION}"
    # NOTE(review): exits with status 1 although this is not an
    # error -- confirm intended.
    exit 1
  end
  Hooker.register_config( opts )
end.parse!

g = Generator.new
g.clear
g.write
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
# Script body of iudex-da-import: configures logging, applies command
# line options, then passes the remaining arguments (import files) to
# DA::Importer#import_files. Wrapped in a module so that the RJack
# and Iudex includes do not extend the top-level Object.
module IudexBinScript

  require 'rubygems'
  require 'optparse'

  require 'rjack-logback'
  include RJack
  Logback.config_console( :mdc => "uhash" )

  require 'iudex-da'
  require 'iudex-da/importer'

  include Iudex

  Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }

  OptionParser.new do |opts|
    opts.banner = <<END
iudex-da: #{Iudex::DA::VERSION}
Usage: iudex-da-import [Options] ImportCSV...
END
    opts.on( "-s", "--set name=value", String,
             "Set connect prop (ex: database=iudex)" ) do |nv|
      # Split on the first '=' only, so that a value may itself
      # contain '=' without being truncated.
      name, value = nv.split( '=', 2 ).map { |t| t.strip }
      Hooker.add( [ :iudex, :connect_props ] ) do
        { name.to_sym => value }
      end
    end
    opts.on( "-d", "--debug" ) do
      Logback[ 'iudex' ].level = Logback::DEBUG
    end
    opts.on( "-v", "--version", "Display version and exit" ) do
      puts "iudex-da: #{Iudex::DA::VERSION}"
      # NOTE(review): exits with status 1 although this is not an
      # error -- confirm intended.
      exit 1
    end
    Hooker.register_config( opts )
  end.parse!

  # parse! has removed the options, leaving only the files in ARGV.
  importer = DA::Importer.new
  importer.import_files( ARGV )

end
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+
23
+ require 'rjack-logback'
24
+ RJack::Logback.config_console
25
+
26
+ require 'iudex-da/pool_data_source_factory'
27
+ require 'optparse'
28
+
29
# Dumps the simhash column of the urls table to a text file, one
# zero-padded 16 digit hex value per line, ordered by last_visit.
class Iudex::DA::SimHashDumper
  import 'org.apache.commons.dbutils.ResultSetHandler'
  import 'org.apache.commons.dbutils.QueryRunner'

  include RJack

  def initialize
    @log = SLF4J[ self.class ]
  end

  # Parses and removes options from ARGV, applying connection
  # properties and other configuration via Hooker.
  def parse_options
    Hooker.log_with { |m| @log.info( m.rstrip ) }

    # Apply all config before including ar
    parser = OptionParser.new do |opts|
      opts.banner = <<END
Usage: iudex-da-simhash-dump [options] <output-file>
END
      opts.on( "-s", "--set name=value", String,
               "Set connect prop (ex: database=iudex)" ) do |nv|
        # Split on the first '=' only, so that a value may itself
        # contain '=' without being truncated.
        name, value = nv.split( '=', 2 ).map { |t| t.strip }
        Hooker.add( [ :iudex, :connect_props ] ) do
          { name.to_sym => value }
        end
      end
      opts.on( "-d", "--debug" ) do
        Logback[ 'iudex.da' ].level = Logback::DEBUG
      end
      opts.on( "-v", "--version", "Display version and exit" ) do
        puts "iudex-da: #{Iudex::DA::VERSION}"
        # NOTE(review): exits with status 1 although this is not an
        # error -- confirm intended.
        exit 1
      end
      Hooker.register_config( opts )
    end
    parser.parse!
  end

  # Parses options, then dumps to the first remaining argument. Does
  # nothing further when no output file argument was given.
  def run
    parse_options

    unless ARGV.empty?
      @factory = Iudex::DA::PoolDataSourceFactory.new
      @data_source = @factory.create
      dump
    end
  end

  # ResultSetHandler which writes the first column of each row as a
  # line to the given output, logging progress every 100,000 rows and
  # the total at the end.
  class UniWriter
    include ResultSetHandler

    def initialize( log, fout )
      @fout = fout
      @log = log
    end

    def handle( rset )
      cnt = 0
      while rset.next
        @fout.puts rset.get_string( 1 )
        cnt += 1
        @log.info( "Wrote #{cnt} simhashes." ) if ( cnt % 100000 ) == 0
      end
      @log.info( "Wrote #{cnt} simhashes total." )
    end
  end

  # Writes the simhashes of all urls rows having a non-null simhash
  # and status 200 or 304 to the named file (default: first argument),
  # truncating any existing file.
  def dump( file = ARGV.first )

    File.open( file, 'w' ) do |fout|
      qrun = QueryRunner.new( @data_source )
      hdlr = UniWriter.new( @log, fout )

      qrun.query( <<-"SQL", hdlr )
        SELECT lpad( to_hex( simhash ), 16, '0' ) AS sh
        FROM urls
        WHERE simhash IS NOT NULL AND status IN (200, 304)
        ORDER BY last_visit ASC;
      SQL

      #FIXME: WHERE pass (accepted state) instead of status?
    end
  end

end

Iudex::DA::SimHashDumper.new.run
data/bin/iudex-migrate ADDED
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'optparse'
23
+
24
# Script body of iudex-migrate: configures logging, applies command
# line options, then runs DA::migrate, optionally to the target
# migration number given as the first remaining argument.
module IudexBinScript

  require 'rjack-logback'
  include RJack
  Logback.config_console

  # Note: Avoid loading iudex-da with its jar dependency which would
  # make it hard to bootstrap the db from source alone.
  # Instead load only necessary core, base, config, and ar (post config):
  require 'iudex-core'
  require 'iudex-da/base'
  require 'iudex-da/config'
  include Iudex

  Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }

  # Apply all config before including ar
  OptionParser.new do |opts|
    opts.banner = <<END
Usage: iudex-migrate [options] [target-migration-number]
END
    opts.on( "-s", "--set name=value", String,
             "Set connect prop (ex: database=iudex)" ) do |nv|
      # Split on the first '=' only, so that a value may itself
      # contain '=' without being truncated.
      name, value = nv.split( '=', 2 ).map { |t| t.strip }
      Hooker.add( [ :iudex, :connect_props ] ) do
        { name.to_sym => value }
      end
    end
    opts.on( "-d", "--debug" ) do
      Logback[ 'iudex.da' ].level = Logback::DEBUG
    end
    opts.on( "-v", "--version", "Display version and exit" ) do
      puts "iudex-da: #{DA::VERSION}"
      # NOTE(review): exits with status 1 although this is not an
      # error -- confirm intended.
      exit 1
    end
    Hooker.register_config( opts )
  end.parse!

  require 'iudex-da/ar'

  # Passes nil when no target migration number was given.
  DA::migrate( ARGV[0] && ARGV[0].to_i )

end