iudex-da 1.2.1-java → 1.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc CHANGED
@@ -1,3 +1,33 @@
1
+ === 1.3.0 (2012-10-4)
2
+ * Expand to iudex-core [1.2.1,1.4)
3
+ * Rewrite of WorkPoller:
4
+ * Now in Ruby, for ease of customization.
5
+ * max_priority_urls now defaults off for common cases
6
+ * do_domain_group now selectable (default off, commonly unneeded)
7
+ * domain_depth_coef (deprecated host_depth_divisor reciprocal) now
8
+ defaults nil (domain window partitioning off)
9
+ * Added new priority aging feature (priority increases with current
10
+ age of next_visit_after), on by default.
11
+ * Added new domain_union feature, which allows domain filtering,
12
+ independent size control, and faster polling, as compared with
13
+ general domain depth prioritization, when a limited set of domains
14
+ is used.
15
+ * Added new uhash_slice feature, which allows selecting a specific
16
+ segment of the overall uhash space, and scaling to multiple worker
17
+ instances (and IPs) with a shared database. This form of sharding
18
+ offers best pseudo-random balancing of URLs at the expense of
19
+ single-domain-instance politeness control.
20
+ * Replace use of Activerecord (for migrations and tests) with Sequel:
21
+ * Smaller deps, faster load times (particularly under jruby)
22
+ * Better Primary Key, composite VisitURL, type field support (models
23
+ Url).
24
+ * Provide Activerecord -> Sequel migration support (from a complete
25
+ iudex-da 1.2.1 db); consolidate prior migrations to a single base
26
+ and profile migrations.
27
+ * Add new (Sequel) migration to set "C" locale collate (aka ASCII sort
28
+ order) for uhash (required for uhash_slice, see above.) iudex-da now
29
+ depends on PostgreSQL 9.1 for COLLATE support.
30
+
1
31
  === 1.2.1 (2012-9-15)
2
32
  * Upgrade to logback ~> 1.2 (dev)
3
33
 
data/Manifest.txt CHANGED
@@ -8,29 +8,23 @@ bin/iudex-da-import
8
8
  bin/iudex-da-simhash-dump
9
9
  bin/iudex-migrate
10
10
  config/config.rb
11
- db/0010_base_urls.rb
12
- db/0020_add_feed_metadata.rb
13
- db/0021_more_feed_text.rb
14
- db/0030_add_priority.rb
15
- db/0040_add_visit_after.rb
16
- db/0050_add_cache_location.rb
17
- db/0060_url_indexes.rb
18
- db/0070_add_created_at.rb
19
- db/0080_add_simhash.rb
20
- db/0081_remove_simhash_index.rb
21
- db/0110_host_to_domain.rb
22
- db/index_next_visit/0100_add_index_next_visit.rb
23
- db/simhash/0085_add_simhash_index.rb
11
+ db/20111012173757_base.rb
12
+ db/20120930173600_uhash_collation_order.rb
13
+ db/index_next_visit/21500000000101_add_index_next_visit.rb
14
+ db/simhash/21500000000001_add_simhash_index.rb
24
15
  lib/iudex-da/base.rb
25
16
  lib/iudex-da.rb
26
- lib/iudex-da/ar.rb
27
17
  lib/iudex-da/config.rb
28
18
  lib/iudex-da/factory_helper.rb
29
19
  lib/iudex-da/importer.rb
30
20
  lib/iudex-da/key_helper.rb
21
+ lib/iudex-da/models.rb
22
+ lib/iudex-da/orm.rb
31
23
  lib/iudex-da/pool_data_source_factory.rb
24
+ lib/iudex-da/work_poller.rb
32
25
  test/setup.rb
33
26
  test/test_migrate.rb
34
- test/test_poll_work.rb
35
27
  test/test_pool_factory.rb
36
- lib/iudex-da/iudex-da-1.2.1.jar
28
+ test/test_url_model.rb
29
+ test/test_work_poller.rb
30
+ lib/iudex-da/iudex-da-1.3.0.jar
data/bin/iudex-migrate CHANGED
@@ -56,7 +56,8 @@ END
56
56
  end
57
57
  end
58
58
  opts.on( "-d", "--debug" ) do
59
- Logback[ 'iudex.da' ].level = Logback::DEBUG
59
+ Logback[ 'iudex.da' ].level = :debug
60
+ Hooker.add( [ :iudex, :connect_props ] ) { { :log => true } }
60
61
  end
61
62
  opts.on( "-v", "--version", "Display version and exit" ) do
62
63
  puts "iudex-da: #{DA::VERSION}"
@@ -65,8 +66,11 @@ END
65
66
  Hooker.register_config( opts )
66
67
  end.parse!
67
68
 
68
- require 'iudex-da/ar'
69
+ require 'iudex-da/orm'
69
70
 
70
- DA::migrate( ARGV[0] && ARGV[0].to_i )
71
+ target = ARGV[0] && ARGV[0].to_i
72
+ opts = {}
73
+ opts[ :target ] = target if target
74
+ DA::ORM::migrate( opts )
71
75
 
72
76
  end
@@ -0,0 +1,117 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ Sequel.migration do
18
+ change do
19
+
20
+ # The main/only urls table, matching a default-setup iudex
21
+ # 1.1.0-1.2.1.
22
+ create_table( :urls ) do
23
+
24
+ twtz = "timestamp with time zone"
25
+ now = Sequel::CURRENT_TIMESTAMP
26
+
27
+ String :uhash, :null => false
28
+ # 23 byte ASCII PRIMARY KEY SHA-1 hash fragment of URL
29
+
30
+ String :url, :null => false
31
+ # Complete normalized url (exactly as used for uhash)
32
+
33
+ String :domain, :null => false
34
+ # Registration level domain from url host
35
+
36
+ String :type, :null => false
37
+ # FEED, PAGE, ROBOTS, SITEMAP
38
+ # Potentially speculative (i.e. "PAGE" before visited)
39
+ # FIXME: Or REDIRECT here instead of status?
40
+
41
+ String :etag
42
+ # HTTP ETag header used for subsequent conditional GET
43
+ # Should only be on 200 and related HTTP status, not redirect
44
+
45
+ DateTime :last_visit, :type => twtz
46
+ # Time of last visit (and thus last type,status,reason,etc.)
47
+
48
+ Integer :status
49
+ # HTTP status code or special (negative) status mapping
50
+ # null : Not yet visited
51
+ # -1 : Connection Failed
52
+ # 4xx : Permanent Failures
53
+ # 5xx : Transient server error
54
+ # 200 : Success
55
+ # 304 : Not Modified
56
+ # 301,302 : Redirect
57
+ # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
58
+
59
+ TrueClass :pass
60
+ # null : Not yet processed (i.e. visit failed)
61
+ # false : Rejected by processing (for reason), DELETE required
62
+ # true : Fully Processed
63
+
64
+ String :reason
65
+ # null : None
66
+ # DUPE : Duplicate of referent
67
+ # rejection filter (intended as key)
68
+
69
+ String :referent
70
+ # null : None
71
+ # uhash of url this is referring to
72
+ # (includes status:REDIRECT, reason:DUPE, etc.)
73
+
74
+ String :referer
75
+ # null : None
76
+ # uhash of url this was referred from. (i.e. the feed URL)
77
+
78
+ String :title
79
+ # PAGE,FEED title
80
+
81
+ DateTime :ref_pub_date, :type => twtz
82
+ # (Latest) published date as provided from feed (may be ahead of
83
+ # or set before pub_date, below).
84
+
85
+ DateTime :pub_date, :type => twtz
86
+ # (Latest) published date as processed
87
+
88
+ String :summary
89
+ # (Feed) summary
90
+
91
+ String :content
92
+ # (Feed) content
93
+
94
+ Float :priority, :type => "real", :default => 0.0, :null => false
95
+ # Prioritization of next visit, range -INF,+INF
96
+
97
+ DateTime :next_visit_after, :type => twtz, :default => now
98
+ # null: never visit (terminal result)
99
+ # Don't visit again before the specified date.
100
+
101
+ Integer :cache_file
102
+ # 32-bit file number
103
+
104
+ Bignum :cache_file_offset
105
+ # 64-bit byte offset within file
106
+
107
+ DateTime :created_at, :type => twtz, :default => now
108
+ # When inserted
109
+
110
+ Bignum :simhash
111
+ # A simhash signature as a signed 8-byte long (should be
112
+ # compatible with java long).
113
+
114
+ primary_key [ :uhash ]
115
+ end
116
+ end
117
+ end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -14,15 +14,21 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- class AddCreatedAt < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'created_at', 'timestamp with time zone' )
21
- execute 'ALTER TABLE urls ALTER COLUMN created_at SET DEFAULT now()'
17
+ Sequel.migration do
18
+ up do
19
+ run <<-DDL
20
+ ALTER TABLE urls
21
+ ALTER COLUMN uhash
22
+ SET DATA TYPE text COLLATE "C"
23
+ DDL
24
+ run "REINDEX INDEX urls_pkey"
22
25
  end
23
-
24
- def self.down
25
- remove_column( 'urls', 'created_at' )
26
+ down do
27
+ run <<-DDL
28
+ ALTER TABLE urls
29
+ ALTER COLUMN uhash
30
+ SET DATA TYPE text
31
+ DDL
32
+ run "REINDEX INDEX urls_pkey"
26
33
  end
27
-
28
34
  end
@@ -14,14 +14,10 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- class AddSimhashIndex < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_index( 'urls', [ 'simhash' ] )
21
- end
22
-
23
- def self.down
24
- remove_index( 'urls', 'simhash' )
17
+ Sequel.migration do
18
+ change do
19
+ alter_table( :urls ) do
20
+ add_index( :next_visit_after )
21
+ end
25
22
  end
26
-
27
23
  end
@@ -14,14 +14,10 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- class RemoveSimhashIndex < ActiveRecord::Migration
18
-
19
- def self.up
20
- remove_index( 'urls', 'simhash' )
21
- end
22
-
23
- def self.down
24
- add_index( 'urls', [ 'simhash' ] )
17
+ Sequel.migration do
18
+ change do
19
+ alter_table( :urls ) do
20
+ add_index( :simhash )
21
+ end
25
22
  end
26
-
27
23
  end
data/lib/iudex-da.rb CHANGED
@@ -29,8 +29,8 @@ module Iudex
29
29
 
30
30
  require "#{LIB_DIR}/iudex-da-#{VERSION}.jar"
31
31
 
32
- import 'iudex.da.WorkPoller'
33
32
  import 'iudex.da.ContentMapper'
33
+ import 'iudex.da.ContentReader'
34
34
 
35
35
  module Filters
36
36
  import 'iudex.da.filters.UpdateFilter'
@@ -39,3 +39,5 @@ module Iudex
39
39
 
40
40
  end
41
41
  end
42
+
43
+ require 'iudex-da/work_poller.rb'
data/lib/iudex-da/base.rb CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  module Iudex
18
18
  module DA
19
- VERSION = '1.2.1'
19
+ VERSION = '1.3.0'
20
20
 
21
21
  LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
22
22
  end
@@ -18,10 +18,10 @@
18
18
  module Iudex
19
19
 
20
20
  module DA
21
- # Default database connection configuration for both ActiveRecord
22
- # (migrations, testing) and PoolDataSourceFactory.
21
+ # Default database connection configuration for both Sequel
22
+ # (migrations, testing) and JDBC PoolDataSourceFactory.
23
23
  CONFIG = {
24
- :adapter => 'jdbcpostgresql',
24
+ :adapter => 'jdbc:postgresql',
25
25
  :host => 'localhost',
26
26
  :database => 'iudex_test',
27
27
  :username => 'iudex',
Binary file
@@ -0,0 +1,66 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-da/orm'
18
+
19
+ # Ensure setup has been run, as Sequel::Model needs the database
20
+ # connected for schema at model class creation
21
+ Iudex::DA::ORM.db
22
+
23
+ module Iudex::DA::ORM
24
+
25
+ # Url model (for urls table). Usage note: ORM::setup must be called
26
+ # before this can be loaded.
27
+ class Url < ::Sequel::Model
28
+
29
+ VisitURL = Iudex::Core::VisitURL
30
+
31
+ plugin :composition
32
+
33
+ composition( :visit_url,
34
+ :composer => proc { VisitURL.trust( url ) },
35
+ :decomposer => proc {
36
+ if v = compositions[ :visit_url ]
37
+ self.url = v.url
38
+ self.uhash = v.uhash
39
+ self.domain = v.domain
40
+ end
41
+ } )
42
+
43
+ def visit_url=( vurl )
44
+ vurl = VisitURL.normalize( vurl ) unless vurl.is_a?( VisitURL )
45
+ super( vurl )
46
+ end
47
+
48
+ def self.find_by_url( vurl )
49
+ vurl = VisitURL.normalize( vurl ) unless vurl.is_a?( VisitURL )
50
+ self[ vurl.uhash ]
51
+ end
52
+
53
+ # Specifically include type accessors to avoid deprecation warnings for
54
+ # old ruby method.
55
+
56
+ def type
57
+ self[ :type ]
58
+ end
59
+
60
+ def type=( t )
61
+ self[ :type ] = t
62
+ end
63
+
64
+ end
65
+
66
+ end
@@ -0,0 +1,183 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'iudex-da/config'
19
+ require 'sequel'
20
+ require 'jdbc/postgres'
21
+ require 'hooker'
22
+
23
+ Sequel.extension :migration
24
+
25
+ module Iudex::DA
26
+
27
+ module ORM
28
+
29
+ class << self
30
+
31
+ # The Sequel::Database instance. #setup is called if necessary.
32
+ def db
33
+ setup unless @db
34
+ @db
35
+ end
36
+
37
+ # Setup the ORM (Sequel) connection given CONFIG defaults, any
38
+ # passed opts, and connect_props config hooks.
39
+ def setup( opts = {} )
40
+
41
+ @db.disconnect if @db
42
+
43
+ log = RJack::SLF4J[ "iudex.da.sequel" ]
44
+ conf = CONFIG.merge( opts )
45
+ conf = Hooker.merge( [ :iudex, :connect_props ], conf )
46
+
47
+ conf[ :loggers ] = [ log ] if conf[ :log ]
48
+
49
+ cstr = ( "%s://%s/%s?%s" %
50
+ [ conf[ :adapter ],
51
+ [ conf[ :host ], conf[ :port ] ].compact.join( ':' ),
52
+ conf[ :database ],
53
+ params( conf ) ] )
54
+
55
+ log.info { "Connecting: #{cstr}" }
56
+ log.debug { "Full Params: #{ conf.inspect }" }
57
+
58
+ @db = Sequel.connect( cstr, conf )
59
+
60
+ end
61
+
62
+ # Migrate the DB given opts, including :target version. For
63
+ # backward compatibility, opts may be a single Integer,
64
+ # interpreted as the :target version. Setup must be called
65
+ # beforehand.
66
+ # See also opts for #migrate_ar_to_sequel
67
+ def migrate( opts = {} )
68
+ opts = {} if opts.nil?
69
+ opts = { :target => opts } if opts.is_a?( Integer )
70
+ raise "setup must be run before migrate" unless db
71
+ profiles = Hooker.apply( [ :iudex, :migration_profiles ],
72
+ opts[ :profiles ] || [] )
73
+
74
+ migrate_ar_to_sequel( opts )
75
+
76
+ pm = ProfileMigrator.new( db, profiles, opts )
77
+ pm.run
78
+ end
79
+
80
+ # Migrate from a iudex [1.1.0,1.3) database managed by
81
+ # activerecord to a 1.3.x database managed by Sequel. No-op if
82
+ # already Sequel.
83
+ # === Options
84
+ # :ar_to_sequel_migrations:: Hash<Integer,String> AR migration
85
+ # number to sequel filename
86
+ # (timestamp) map for extensions
87
+ # supported externally to iudex-da.
88
+ def migrate_ar_to_sequel( opts )
89
+
90
+ columns = ( db.table_exists?( :schema_migrations ) &&
91
+ db.schema( :schema_migrations ).map { |sr| sr[0] } )
92
+
93
+ if columns == [ :version ] # Old format AR schema_migrations
94
+ db.transaction do
95
+ versions = db.from( :schema_migrations ).
96
+ map { |r| r[ :version ].to_i }
97
+
98
+ if ( versions & AR_REQUIRED ) != AR_REQUIRED
99
+ missing = AR_REQUIRED - ( versions & AR_REQUIRED )
100
+ raise( ARNotComplete,
101
+ "Missing AR migrations #{missing.inspect}; " +
102
+ "Use 'iudex-migrate _1.2.1_' first" )
103
+ end
104
+
105
+ migrations_map = AR_TO_SEQUEL_MIGRATIONS.
106
+ merge( opts[ :ar_to_sequel_migrations ] || {} )
107
+
108
+ db.drop_table( :schema_migrations )
109
+ db.create_table( :schema_migrations ) do
110
+ String :filename, :null => false
111
+ primary_key [ :filename ]
112
+ end
113
+
114
+ sm = db[:schema_migrations]
115
+ sm.insert( :filename => '20111012173757_base.rb' )
116
+
117
+ migrations_map.each do | version, filename |
118
+ sm.insert( :filename => filename ) if versions.include?( version )
119
+ end
120
+ end
121
+ end
122
+ end
123
+
124
+ def params( opts )
125
+ pms = {}
126
+
127
+ u = opts[ :username ]
128
+ pms[ :user ] = u if u
129
+
130
+ p = opts[ :password ]
131
+ pms[ :password ] = p if p
132
+
133
+ pms.sort.map { |*p| p.join( '=' ) }.join( '&' )
134
+ end
135
+
136
+ end
137
+
138
+ AR_TO_SEQUEL_MIGRATIONS = {
139
+ 85 => '21500000000001_add_simhash_index.rb',
140
+ 100 => '21500000000101_add_index_next_visit.rb'
141
+ }
142
+ AR_REQUIRED = [ 10, 20, 21, 30, 40, 50, 60, 70, 80, 81, 110 ]
143
+
144
+ ARNotComplete = Class.new(StandardError)
145
+
146
+ @db = nil
147
+
148
+ # Custom migrator handling "profile" directories (optional
149
+ # migrations)
150
+ class ProfileMigrator < Sequel::TimestampMigrator
151
+
152
+ def initialize( db, profiles, opts )
153
+
154
+ base = File.join( LIB_DIR, '..', '..', 'db' )
155
+ paths = [ base ]
156
+
157
+ paths += profiles.compact.map do |p|
158
+ p = p.to_s
159
+ if p =~ %r{^/}
160
+ p
161
+ else
162
+ File.join( base, p )
163
+ end
164
+ end
165
+
166
+ @pattern = if paths.size > 1
167
+ '{' + paths.join( ',' ) + '}'
168
+ else
169
+ paths.first
170
+ end
171
+
172
+ super( db, base, opts )
173
+ end
174
+
175
+ def get_migration_files
176
+ Dir.glob( "#{@pattern}/[0-9]*_*.rb" )
177
+ end
178
+
179
+ end
180
+
181
+ end
182
+
183
+ end