iudex-da 1.2.1-java → 1.3.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc CHANGED
@@ -1,3 +1,33 @@
1
+ === 1.3.0 (2012-10-4)
2
+ * Expand to iudex-core [1.2.1,1.4)
3
+ * Rewrite of WorkPoller:
4
+ * Now in Ruby, for ease of customization.
5
+ * max_priority_urls now defaults off for common cases
6
+ * do_domain_group now selectable (default off, commonly unneeded)
7
+ * domain_depth_coef (deprecated host_depth_divisor reciprocal) now
8
+ defaults nil (domain window partitioning off)
9
+ * Added new priority aging feature (priority increases with current
10
+ age of next_visit_after), on by default.
11
+ * Added new domain_union feature, which allows domain filtering,
12
+ independent size control, and faster polling, as compared with
13
+ general domain depth prioritization, when a limited set of domains
14
+ is used.
15
+ * Added new uhash_slice feature, which allows selecting a specific
16
+ segment of the overall uhash space, and scaling to multiple worker
17
+ instances (and IPs) with a shared database. This form of sharding
18
+ offers best pseudo-random balancing of URLs at the expense of
19
+ single-domain-instance politeness control.
20
+ * Replace use of Activerecord (for migrations and tests) with Sequel:
21
+ * Smaller deps, faster load times (particularly under jruby)
22
+ * Better Primary Key, composite VisitURL, type field support (models
23
+ Url).
24
+ * Provide Activerecord -> Sequel migration support (from a complete
25
+ iudex-da 1.2.1 db); consolidate prior migrations to a single base
26
+ and profile migrations.
27
+ * Add new (Sequel) migration to set "C" locale collate (aka ASCII sort
28
+ order) for uhash (required for uhash_slice, see above.) iudex-da now
29
+ depends on PostgreSQL 9.1 for COLLATE support.
30
+
1
31
  === 1.2.1 (2012-9-15)
2
32
  * Upgrade to logback ~> 1.2 (dev)
3
33
 
data/Manifest.txt CHANGED
@@ -8,29 +8,23 @@ bin/iudex-da-import
8
8
  bin/iudex-da-simhash-dump
9
9
  bin/iudex-migrate
10
10
  config/config.rb
11
- db/0010_base_urls.rb
12
- db/0020_add_feed_metadata.rb
13
- db/0021_more_feed_text.rb
14
- db/0030_add_priority.rb
15
- db/0040_add_visit_after.rb
16
- db/0050_add_cache_location.rb
17
- db/0060_url_indexes.rb
18
- db/0070_add_created_at.rb
19
- db/0080_add_simhash.rb
20
- db/0081_remove_simhash_index.rb
21
- db/0110_host_to_domain.rb
22
- db/index_next_visit/0100_add_index_next_visit.rb
23
- db/simhash/0085_add_simhash_index.rb
11
+ db/20111012173757_base.rb
12
+ db/20120930173600_uhash_collation_order.rb
13
+ db/index_next_visit/21500000000101_add_index_next_visit.rb
14
+ db/simhash/21500000000001_add_simhash_index.rb
24
15
  lib/iudex-da/base.rb
25
16
  lib/iudex-da.rb
26
- lib/iudex-da/ar.rb
27
17
  lib/iudex-da/config.rb
28
18
  lib/iudex-da/factory_helper.rb
29
19
  lib/iudex-da/importer.rb
30
20
  lib/iudex-da/key_helper.rb
21
+ lib/iudex-da/models.rb
22
+ lib/iudex-da/orm.rb
31
23
  lib/iudex-da/pool_data_source_factory.rb
24
+ lib/iudex-da/work_poller.rb
32
25
  test/setup.rb
33
26
  test/test_migrate.rb
34
- test/test_poll_work.rb
35
27
  test/test_pool_factory.rb
36
- lib/iudex-da/iudex-da-1.2.1.jar
28
+ test/test_url_model.rb
29
+ test/test_work_poller.rb
30
+ lib/iudex-da/iudex-da-1.3.0.jar
data/bin/iudex-migrate CHANGED
@@ -56,7 +56,8 @@ END
56
56
  end
57
57
  end
58
58
  opts.on( "-d", "--debug" ) do
59
- Logback[ 'iudex.da' ].level = Logback::DEBUG
59
+ Logback[ 'iudex.da' ].level = :debug
60
+ Hooker.add( [ :iudex, :connect_props ] ) { { :log => true } }
60
61
  end
61
62
  opts.on( "-v", "--version", "Display version and exit" ) do
62
63
  puts "iudex-da: #{DA::VERSION}"
@@ -65,8 +66,11 @@ END
65
66
  Hooker.register_config( opts )
66
67
  end.parse!
67
68
 
68
- require 'iudex-da/ar'
69
+ require 'iudex-da/orm'
69
70
 
70
- DA::migrate( ARGV[0] && ARGV[0].to_i )
71
+ target = ARGV[0] && ARGV[0].to_i
72
+ opts = {}
73
+ opts[ :target ] = target if target
74
+ DA::ORM::migrate( opts )
71
75
 
72
76
  end
@@ -0,0 +1,117 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ Sequel.migration do
18
+ change do
19
+
20
+ # The main (and only) urls table, matching that of a default-setup
268
+ # iudex 1.1.0-1.2.1.
22
+ create_table( :urls ) do
23
+
24
+ twtz = "timestamp with time zone"
25
+ now = Sequel::CURRENT_TIMESTAMP
26
+
27
+ String :uhash, :null => false
28
+ # 23 byte ASCII PRIMARY KEY SHA-1 hash fragment of URL
29
+
30
+ String :url, :null => false
31
+ # Complete normalized url (exactly as used for uhash)
32
+
33
+ String :domain, :null => false
34
+ # Registration level domain from url host
35
+
36
+ String :type, :null => false
37
+ # FEED, PAGE, ROBOTS, SITEMAP
38
+ # Potentially speculative (i.e. "PAGE" before visited)
39
+ # FIXME: Or REDIRECT here instead of status?
40
+
41
+ String :etag
42
+ # HTTP ETag header used for subsequent conditional GET
43
+ # Should only be on 200 and related HTTP status, not redirect
44
+
45
+ DateTime :last_visit, :type => twtz
46
+ # Time of last visit (and thus last type,status,reason,etc.)
47
+
48
+ Integer :status
49
+ # HTTP status code or special (negative) status mapping
50
+ # null : Not yet visited
51
+ # -1 : Connection Failed
52
+ # 4xx : Permanent Failures
53
+ # 5xx : Transient server error
54
+ # 200 : Success
55
+ # 304 : Not Modified
56
+ # 301,302 : Redirect
57
+ # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
58
+
59
+ TrueClass :pass
60
+ # null : Not yet processed (i.e. visit failed)
61
+ # false : Rejected by processing (for reason), DELETE required
62
+ # true : Fully Processed
63
+
64
+ String :reason
65
+ # null : None
66
+ # DUPE : Duplicate of referent
67
+ # rejection filter (intended as key)
68
+
69
+ String :referent
70
+ # null : None
71
+ # uhash of url this is referring to
72
+ # (includes status:REDIRECT, reason:DUPE, etc.)
73
+
74
+ String :referer
75
+ # null : None
76
+ # uhash of url this was referred from. (i.e. the feed URL)
77
+
78
+ String :title
79
+ # PAGE,FEED title
80
+
81
+ DateTime :ref_pub_date, :type => twtz
82
+ # (Latest) published date as provided from feed (may be ahead of
83
+ # or set before pub_date, below).
84
+
85
+ DateTime :pub_date, :type => twtz
86
+ # (Latest) published date as processed
87
+
88
+ String :summary
89
+ # (Feed) summary
90
+
91
+ String :content
92
+ # (Feed) content
93
+
94
+ Float :priority, :type => "real", :default => 0.0, :null => false
95
+ # Prioritization of next visit, range -INF,+INF
96
+
97
+ DateTime :next_visit_after, :type => twtz, :default => now
98
+ # null: never visit (terminal result)
99
+ # Don't visit again before the specified date.
100
+
101
+ Integer :cache_file
102
+ # 32-bit file number
103
+
104
+ Bignum :cache_file_offset
105
+ # 64-bit byte offset within file
106
+
107
+ DateTime :created_at, :type => twtz, :default => now
108
+ # When inserted
109
+
110
+ Bignum :simhash
111
+ # A simhash signature as a signed 8-byte long (should be
112
+ # compatible with java long).
113
+
114
+ primary_key [ :uhash ]
115
+ end
116
+ end
117
+ end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -14,15 +14,21 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- class AddCreatedAt < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'created_at', 'timestamp with time zone' )
21
- execute 'ALTER TABLE urls ALTER COLUMN created_at SET DEFAULT now()'
17
+ Sequel.migration do
18
+ up do
19
+ run <<-DDL
20
+ ALTER TABLE urls
21
+ ALTER COLUMN uhash
22
+ SET DATA TYPE text COLLATE "C"
23
+ DDL
24
+ run "REINDEX INDEX urls_pkey"
22
25
  end
23
-
24
- def self.down
25
- remove_column( 'urls', 'created_at' )
26
+ down do
27
+ run <<-DDL
28
+ ALTER TABLE urls
29
+ ALTER COLUMN uhash
30
+ SET DATA TYPE text
31
+ DDL
32
+ run "REINDEX INDEX urls_pkey"
26
33
  end
27
-
28
34
  end
@@ -14,14 +14,10 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- class AddSimhashIndex < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_index( 'urls', [ 'simhash' ] )
21
- end
22
-
23
- def self.down
24
- remove_index( 'urls', 'simhash' )
17
+ Sequel.migration do
18
+ change do
19
+ alter_table( :urls ) do
20
+ add_index( :next_visit_after )
21
+ end
25
22
  end
26
-
27
23
  end
@@ -14,14 +14,10 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- class RemoveSimhashIndex < ActiveRecord::Migration
18
-
19
- def self.up
20
- remove_index( 'urls', 'simhash' )
21
- end
22
-
23
- def self.down
24
- add_index( 'urls', [ 'simhash' ] )
17
+ Sequel.migration do
18
+ change do
19
+ alter_table( :urls ) do
20
+ add_index( :simhash )
21
+ end
25
22
  end
26
-
27
23
  end
data/lib/iudex-da.rb CHANGED
@@ -29,8 +29,8 @@ module Iudex
29
29
 
30
30
  require "#{LIB_DIR}/iudex-da-#{VERSION}.jar"
31
31
 
32
- import 'iudex.da.WorkPoller'
33
32
  import 'iudex.da.ContentMapper'
33
+ import 'iudex.da.ContentReader'
34
34
 
35
35
  module Filters
36
36
  import 'iudex.da.filters.UpdateFilter'
@@ -39,3 +39,5 @@ module Iudex
39
39
 
40
40
  end
41
41
  end
42
+
43
+ require 'iudex-da/work_poller.rb'
data/lib/iudex-da/base.rb CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  module Iudex
18
18
  module DA
19
- VERSION = '1.2.1'
19
+ VERSION = '1.3.0'
20
20
 
21
21
  LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
22
22
  end
@@ -18,10 +18,10 @@
18
18
  module Iudex
19
19
 
20
20
  module DA
21
- # Default database connection configuration for both ActiveRecord
22
- # (migrations, testing) and PoolDataSourceFactory.
21
+ # Default database connection configuration for both Sequel
22
+ # (migrations, testing) and JDBC PoolDataSourceFactory.
23
23
  CONFIG = {
24
- :adapter => 'jdbcpostgresql',
24
+ :adapter => 'jdbc:postgresql',
25
25
  :host => 'localhost',
26
26
  :database => 'iudex_test',
27
27
  :username => 'iudex',
Binary file
@@ -0,0 +1,66 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-da/orm'
18
+
19
+ # Ensure setup has been run, as Sequel::Model needs the database
20
+ # connected for schema at model class creation
21
+ Iudex::DA::ORM.db
22
+
23
+ module Iudex::DA::ORM
24
+
25
+ # Url model (for urls table). Usage note: ORM::setup must be called
26
+ # before this can be loaded.
27
# Sequel model for rows of the urls table. The url, uhash and domain
# columns are additionally exposed as a single composed VisitURL
# value via #visit_url / #visit_url=.
class Url < ::Sequel::Model

  VisitURL = Iudex::Core::VisitURL

  plugin :composition

  # Compose: wrap the stored (already normalized) url column.
  compose_visit_url = proc { VisitURL.trust( url ) }

  # Decompose: when a VisitURL composition is present, copy its
  # parts back to the url, uhash and domain columns.
  decompose_visit_url = proc {
    vurl = compositions[ :visit_url ]
    if vurl
      self.url    = vurl.url
      self.uhash  = vurl.uhash
      self.domain = vurl.domain
    end
  }

  composition( :visit_url,
               :composer   => compose_visit_url,
               :decomposer => decompose_visit_url )

  # Assign the composed value. Anything that is not already a
  # VisitURL is first passed through VisitURL.normalize.
  def visit_url=( vurl )
    super( vurl.is_a?( VisitURL ) ? vurl : VisitURL.normalize( vurl ) )
  end

  # Look up a row by primary key, using the uhash of the given
  # VisitURL (or of the VisitURL.normalize result for other input).
  def self.find_by_url( vurl )
    key = vurl.is_a?( VisitURL ) ? vurl : VisitURL.normalize( vurl )
    self[ key.uhash ]
  end

  # Explicit accessors for the type column, defined here to avoid
  # deprecation warnings for the old ruby method of the same name.

  def type
    self[ :type ]
  end

  def type=( t )
    self[ :type ] = t
  end

end
65
+
66
+ end
@@ -0,0 +1,183 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'iudex-da/config'
19
+ require 'sequel'
20
+ require 'jdbc/postgres'
21
+ require 'hooker'
22
+
23
+ Sequel.extension :migration
24
+
25
+ module Iudex::DA
26
+
27
+ module ORM
28
+
29
+ class << self
30
+
31
+ # The Sequel::Database instance. #setup is called if necessary.
32
def db
  # Connect lazily: run #setup (with defaults) only when no
  # database has been set yet.
  return @db if @db
  setup
  @db
end
36
+
37
+ # Setup the ORM (Sequel) connection given CONFIG defaults, any
38
+ # passed opts, and connect_props config hooks.
39
# Connect (or reconnect) and return the Sequel database.
#
# opts:: Hash merged over CONFIG; the result is then passed through
#        the [ :iudex, :connect_props ] Hooker merge. A truthy :log
#        entry attaches the "iudex.da.sequel" logger to Sequel.
#
# Any previously set database is disconnected first. The password is
# masked in all log output produced here, so credentials do not end
# up in log files.
def setup( opts = {} )

  @db.disconnect if @db

  log = RJack::SLF4J[ "iudex.da.sequel" ]
  conf = CONFIG.merge( opts )
  conf = Hooker.merge( [ :iudex, :connect_props ], conf )

  conf[ :loggers ] = [ log ] if conf[ :log ]

  host_port = [ conf[ :host ], conf[ :port ] ].compact.join( ':' )
  cstr = ( "%s://%s/%s?%s" %
           [ conf[ :adapter ], host_port, conf[ :database ],
             params( conf ) ] )

  # Log only masked forms: cstr carries "password=..." as a query
  # parameter, and conf carries it under :password.
  log.info do
    "Connecting: #{ cstr.sub( /\bpassword=[^&]*/, 'password=***' ) }"
  end
  log.debug do
    shown = conf[ :password ] ? conf.merge( :password => '***' ) : conf
    "Full Params: #{ shown.inspect }"
  end

  @db = Sequel.connect( cstr, conf )

end
61
+
62
+ # Migrate the DB given opts, including :target version. For
63
+ # backward compatibility, opts may be a single Integer,
64
+ # interpreted as the :target version. If #setup has not been
65
+ # called, it is run implicitly (with defaults) via #db.
66
+ # See also opts for #migrate_ar_to_sequel
67
def migrate( opts = {} )
  # Normalize the legacy call forms: nil means no options, and a
  # bare Integer is taken as the :target version.
  opts = case opts
         when nil     then {}
         when Integer then { :target => opts }
         else              opts
         end

  raise "setup must be run before migrate" unless db

  # Profiles passed in opts may be adjusted by the
  # [ :iudex, :migration_profiles ] hook.
  requested = opts[ :profiles ] || []
  profiles  = Hooker.apply( [ :iudex, :migration_profiles ], requested )

  # Convert an ActiveRecord managed schema_migrations table first.
  migrate_ar_to_sequel( opts )

  ProfileMigrator.new( db, profiles, opts ).run
end
79
+
80
+ # Migrate from a iudex [1.1.0,1.3) database managed by
81
+ # activerecord to a 1.3.x database managed by Sequel. No-op if
82
+ # already Sequel.
83
+ # === Options
84
+ # :ar_to_sequel_migrations:: Hash<Integer,String> AR migration
85
+ # number to sequel filename
86
+ # (timestamp) map for extensions
87
+ # supported externally to iudex-da.
88
# Converts an ActiveRecord format schema_migrations table (a single
# :version column) to the Sequel timestamp format (a single
# :filename column), within one transaction. Does nothing when the
# table is absent or has any other column layout.
#
# opts:: Hash; :ar_to_sequel_migrations entries are merged over
#        AR_TO_SEQUEL_MIGRATIONS.
#
# Raises ARNotComplete if any of the AR_REQUIRED migrations has not
# been applied.
def migrate_ar_to_sequel( opts )
  return unless db.table_exists?( :schema_migrations )

  columns = db.schema( :schema_migrations ).map { |sr| sr[0] }
  return unless columns == [ :version ] # Old format AR schema_migrations

  db.transaction do
    versions = db.from( :schema_migrations ).
      map { |r| r[ :version ].to_i }

    # Use a set difference so the completeness check does not depend
    # on the order in which the (unordered) query returns rows.
    missing = AR_REQUIRED - versions
    unless missing.empty?
      raise( ARNotComplete,
             "Missing AR migrations #{missing.inspect}; " +
             "Use 'iudex-migrate _1.2.1_' first" )
    end

    migrations_map = AR_TO_SEQUEL_MIGRATIONS.
      merge( opts[ :ar_to_sequel_migrations ] || {} )

    db.drop_table( :schema_migrations )
    db.create_table( :schema_migrations ) do
      String :filename, :null => false
      primary_key [ :filename ]
    end

    sm = db[ :schema_migrations ]

    # All AR_REQUIRED migrations are recorded as the one base file.
    sm.insert( :filename => '20111012173757_base.rb' )

    # Optional (profile/extension) migrations carry over only if
    # they were applied under ActiveRecord.
    migrations_map.each do | version, filename |
      sm.insert( :filename => filename ) if versions.include?( version )
    end
  end
end
123
+
124
# Builds the query string part of the connection string from opts.
#
# opts:: Hash; only :username (emitted as "user") and :password are
#        used, and either is omitted when nil or false.
#
# Returns "name=value" pairs sorted by name and joined with "&", or
# an empty String when neither entry is set. Values are inserted
# verbatim; no escaping is applied.
def params( opts )
  pairs = []

  user = opts[ :username ]
  pairs << [ :user, user ] if user

  password = opts[ :password ]
  pairs << [ :password, password ] if password

  # Sort by parameter name so the resulting string is stable.
  pairs.sort_by { |name, _value| name.to_s }.
    map { |pair| pair.join( '=' ) }.
    join( '&' )
end
135
+
136
+ end
137
+
138
+ AR_TO_SEQUEL_MIGRATIONS = {
139
+ 85 => '21500000000001_add_simhash_index.rb',
140
+ 100 => '21500000000101_add_index_next_visit.rb'
141
+ }
142
+ AR_REQUIRED = [ 10, 20, 21, 30, 40, 50, 60, 70, 80, 81, 110 ]
143
+
144
+ ARNotComplete = Class.new(StandardError)
145
+
146
+ @db = nil
147
+
148
+ # Custom migrator handling "profile" directories (optional
149
+ # migrations)
150
+ class ProfileMigrator < Sequel::TimestampMigrator
151
+
152
# db::       database handed on to the superclass
# profiles:: names of optional migration directories; nil entries
#            are dropped. Names starting with "/" are used as given,
#            all others are taken relative to the base db directory.
# opts::     migrator options handed on to the superclass
def initialize( db, profiles, opts )

  base = File.join( LIB_DIR, '..', '..', 'db' )

  profile_dirs = profiles.compact.map do |profile|
    dir = profile.to_s
    ( dir =~ %r{^/} ) ? dir : File.join( base, dir )
  end

  paths = [ base ] + profile_dirs

  # A single directory is used as is; several directories are
  # combined into one "{a,b,...}" glob alternation.
  @pattern = ( paths.size > 1 ) ? ( '{' + paths.join( ',' ) + '}' ) : paths.first

  super( db, base, opts )
end
174
+
175
# Returns the paths of all migration files ("<digits>_<name>.rb")
# found in the directories covered by @pattern, ordered by
# ascending numeric filename prefix (ties broken by file name).
#
# Dir.glob order is platform dependent and, for a "{a,b}" pattern,
# grouped per directory. Sort explicitly, as the stock Sequel
# implementation this overrides returns its files in version order.
def get_migration_files
  files = Dir.glob( "#{@pattern}/[0-9]*_*.rb" )
  files.sort_by do |path|
    name = File.basename( path )
    [ name[ /\A\d+/ ].to_i, name ]
  end
end
178
+
179
+ end
180
+
181
+ end
182
+
183
+ end