iudex-da 1.3.2-java → 1.3.3-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc CHANGED
@@ -1,3 +1,23 @@
1
+ === 1.3.3 (2012-11-8)
2
+ * FactoryHelper.create_update_filter now prefers an options Hash
3
+ exposing greater control over what is updated and how. In
4
+ particular, :on_referer can be independently set.
5
+ * Add (Base)Transformer, ContentUpdater, UpdateFilter support for a
6
+ distinct REFERER filter chain. Now content, referer, and references
7
+ updates are all optional.
8
+ * BaseTransformer.merge now augments the updated map with the current
9
+ (database) contents instead of creating a temporary map on which
10
+ UpdateFilter chain mutations would be discarded. This change makes
11
+ it consistent with either new or updated content.
12
+ * Add new options Hash syntax and :type support to
13
+ WorkPoller.domain_union
14
+ * Fix WorkPoller uhash_slice range calculation for ruby 1.8
15
+ * Intern :type values on read in ContentMapper
16
+ * Upgrade/narrow to iudex-core ~> 1.3.0 (incl. gravitext-util ~> 1.7.0)
17
+ * Upgrade to logback ~> 1.5 (dev)
18
+ * Add WorkPoller logging and consolidate log from
19
+ GenericWorkPollStrategy.
20
+
1
21
  === 1.3.2 (2012-10-25)
2
22
  * Add migration to make index_next_visit partial over non-null
3
23
  next_visit_after rows. (Index rebuild may take a while.)
data/Manifest.txt CHANGED
@@ -28,4 +28,4 @@ test/test_migrate.rb
28
28
  test/test_pool_factory.rb
29
29
  test/test_url_model.rb
30
30
  test/test_work_poller.rb
31
- lib/iudex-da/iudex-da-1.3.2.jar
31
+ lib/iudex-da/iudex-da-1.3.3.jar
data/lib/iudex-da/base.rb CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  module Iudex
18
18
  module DA
19
- VERSION = '1.3.2'
19
+ VERSION = '1.3.3'
20
20
 
21
21
  LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
22
22
  end
@@ -31,22 +31,88 @@ module Iudex
31
31
  @data_source ||= PoolDataSourceFactory.new.create
32
32
  end
33
33
 
34
- # Create UpdateFilter given fields and filter list factory
35
- # methods
36
- def create_update_filter( fields,
37
- post_sym = nil,
38
- update_sym = nil,
39
- new_sym = nil )
40
-
41
- f = UpdateFilter.new( data_source, field_mapper( fields ) )
42
- create_chain( update_sym ) { |c| f.update_ref_filter = c }
43
- create_chain( new_sym ) { |c| f.new_ref_filter = c }
44
- create_chain( post_sym ) { |c| f.content_filter = c }
34
+ # Create an UpdateFilter given the provided options.
35
+ #
36
+ # === Options
37
+ #
38
+ # :fields:: The Array of fields (Symbol or Key) to (re-)read
39
+ # from the database and update. The required :uhash
40
+ # field is included automatically.
41
+ #
42
+ # :max_retries:: Maximum number of retries not including the
43
+ # initial attempt, in case of a database
44
+ # conflict (Default: 3)
45
+ #
46
+ # :isolation_level:: A transaction isolation constant as
47
+ # defined in java.sql.Connection
48
+ # (Default: REPEATABLE_READ 0x04)
49
+ #
50
+ # :on_content:: Filter option for current (content) UniMap.
51
+ #
52
+ # :on_ref_update:: Filter option for REFERENCES that are
53
+ # already existing in the database
54
+ #
55
+ # :on_ref_new:: Filter option for REFERENCES that are new (not
56
+ # found in db).
57
+ #
58
+ # :on_referer:: Filter option for the REFERER to the current
59
+ # content.
60
+ #
61
+ # The positional parameters equivalent to
62
+ # ( :fields, :on_content, :on_ref_update, :on_ref_new )
63
+ # as defined above are also supported but deprecated.
64
+ #
65
+ # === Filter options
66
+ #
67
+ # Each of the on_* filter options defined above may take a
68
+ # value of a Filter, Array of filters or a Symbol value. The
69
+ # following symbol values are special, all other Symbol values
70
+ # are passed as the :filters option to create_chain, and are
71
+ # interpreted as method names.
72
+ #
73
+ # :merge:: The default behavior of merging the in-memory state
74
+ # with the current database state. This is equivalent
75
+ # to providing a NoOpFilter.
76
+ #
77
+ # :ignore:: Do not update this element. This is equivalent to
78
+ # providing a filter that always rejects, but faster
79
+ # since it need not read the value from the db.
80
+ #
81
+ def create_update_filter( *args )
82
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
83
+
84
+ opts[ :fields ] ||= args.shift #deprecated
85
+ opts[ :on_content ] ||= args.shift
86
+ opts[ :on_ref_update ] ||= args.shift
87
+ opts[ :on_ref_new ] ||= args.shift
88
+
89
+ f = UpdateFilter.new( data_source, field_mapper( opts[ :fields ] ) )
90
+ updater_chain( opts[:on_ref_update]) { |c| f.update_ref_filter = c }
91
+ updater_chain( opts[ :on_ref_new ] ) { |c| f.new_ref_filter = c }
92
+ updater_chain( opts[ :on_content ] ) { |c| f.content_filter = c }
93
+ updater_chain( opts[ :on_referer ] ) { |c| f.referer_filter = c }
94
+
95
+ f.max_retries = opts[ :max_retries ] if opts[ :max_retries ]
96
+ f.isolation_level = opts[:isolation_level] if opts[:isolation_level]
97
+
45
98
  f
46
99
  end
47
100
 
48
- def create_read_filter( fields = [] )
49
- f = ReadFilter.new( data_source, field_mapper( fields ) )
101
+ # Create a ReadFilter given the provided options.
102
+ #
103
+ # === Options
104
+ #
105
+ # :fields:: The Array of fields (Symbol or Key) to read from
106
+ # the database.
107
+ #
108
+ # The positional parameter equivalent to ( :fields ) as
109
+ # defined above is also supported but deprecated.
110
+ #
111
+ def create_read_filter( *args )
112
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
113
+ opts[ :fields ] ||= args.shift #deprecated
114
+
115
+ ReadFilter.new( data_source, field_mapper( opts[ :fields ] ) )
50
116
  end
51
117
 
52
118
  def field_mapper( fields )
@@ -54,6 +120,18 @@ module Iudex
54
120
  ContentMapper.new( fields )
55
121
  end
56
122
 
123
+ def updater_chain( v, &block )
124
+ if v.is_a?( Iudex::Filter::Filter )
125
+ block.call( v )
126
+ elsif v == :merge
127
+ block.call( UpdateFilter::DEFAULT_MERGE )
128
+ elsif v == :ignore
129
+ block.call( nil )
130
+ else
131
+ create_chain( :filters => v, &block )
132
+ end
133
+ end
134
+
57
135
  end
58
136
 
59
137
  end
Binary file
@@ -90,19 +90,33 @@ module Iudex::DA
90
90
  age_coef_2 && age_coef_2 > 0.0 )
91
91
  end
92
92
 
93
- # An Array of [ domain, max_urls ] pairs where each domain is a
94
- # unique reqistration-level, normalized lower-case domain. A nil
95
- # domain applies to all domains not covered by another
96
- # row. Without a nil domain row, work is limited to the explicit
97
- # domains listed. If provided these max_urls values are used
98
- # instead of top level #max_urls. Domain depth should most likely
99
- # be avoided if this feature is used. (default: [], off)
93
+ # A table of option rows as defined below. A nil/unspecified
94
+ # domain and type row applies to all domains/types not covered by
95
+ # another row. Without such a row, work is limited to the explicit
96
+ # domains/types listed.
97
+ #
98
+ # ==== Options
99
+ #
100
+ # :domain:: The registration-level, normalized lower-case domain
101
+ # value.
102
+ #
103
+ # :type:: An (upper-case) TYPE value to be AND'd with a domain
104
+ # domain or may appear on its own, applying to all
105
+ # unconfigured domains.
106
+ #
107
+ # :max:: The maximum number of visit urls to obtain in one poll
108
+ # (instead of the top level #max_urls.) A zero :max
109
+ # value excludes this domain/type (efficiently).
110
+ #
111
+ # Also a [ domain, max ] alternative syntax is currently supported
112
+ # but deprecated.
113
+ #
100
114
  attr_accessor :domain_union
101
115
 
102
116
  # An array containing a zero-based position and a total number of
103
117
  # evenly divided segments within the range of possible uhash
104
118
  # values. If set only work with uhashes in the designated range
105
- # will be polled. Note that the uhash is indepedent of domain,
119
+ # will be polled. Note that the uhash is independent of domain,
106
120
  # being a hash on the entire URL. (default: nil, off)
107
121
  attr_accessor :uhash_slice
108
122
 
@@ -136,6 +150,11 @@ module Iudex::DA
136
150
  @data_source = data_source
137
151
  end
138
152
 
153
+ # Override GenericWorkPollStrategy
154
+ def log
155
+ @log.java_logger
156
+ end
157
+
139
158
  # Override GenericWorkPollStrategy
140
159
  def pollWorkImpl( visit_queue )
141
160
  visit_queue.add_all( poll )
@@ -147,6 +166,7 @@ module Iudex::DA
147
166
  # Raises SQLException
148
167
  def poll
149
168
  query, params = generate_query
169
+ @log.debug { "Poll query: #{query}; #{params.inspect}" }
150
170
  reader.select( query, *params )
151
171
  end
152
172
 
@@ -156,6 +176,16 @@ module Iudex::DA
156
176
  end
157
177
  end
158
178
 
179
+ def domain_union=( table )
180
+ @domain_union = table.map do | *args |
181
+ args = args.flatten.dup
182
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
183
+ opts[ :domain ] ||= args.shift
184
+ opts[ :max ] ||= args.shift
185
+ opts
186
+ end
187
+ end
188
+
159
189
  def generate_query
160
190
  criteria = [ "next_visit_after <= now()" ]
161
191
 
@@ -172,18 +202,34 @@ module Iudex::DA
172
202
  params = [ max_urls ]
173
203
  else
174
204
  subqueries = []
175
- @domain_union.each do | domain, dmax |
176
- next if dmax == 0
205
+ @domain_union.each do | opts |
206
+ opts = opts.dup
207
+ opts[ :max ] ||= @max_urls
208
+
209
+ next if opts[ :max ] == 0
210
+
177
211
  c = criteria.dup
178
- if domain.nil?
179
- c += @domain_union.map { |nd,_| nd }.
212
+ if opts[ :domain ].nil?
213
+ c += @domain_union.map { |r| r[ :domain ] }.
180
214
  compact.
215
+ uniq.
181
216
  map { |nd| "domain != '#{nd}'" }
182
217
  else
183
- c << "domain = '#{domain}'"
218
+ c << "domain = '#{opts[ :domain ]}'"
184
219
  end
220
+
221
+ if opts[ :type ].nil?
222
+ c += @domain_union.select { |r| r[ :domain ] == opts[ :domain ] }.
223
+ map { |r| r[ :type ] }.
224
+ compact.
225
+ uniq.
226
+ map { |nt| "type != '#{nt}'" }
227
+ elsif opts[ :type ]
228
+ c << "type = '#{opts[ :type ]}'"
229
+ end
230
+
185
231
  subqueries << generate_query_inner( c )
186
- params << dmax
232
+ params << opts[ :max ]
187
233
  end
188
234
  if subqueries.size == 1
189
235
  query = subqueries.first
@@ -291,7 +337,7 @@ module Iudex::DA
291
337
  high = ( period * (pos+1) ).round if (pos+1) < segments
292
338
 
293
339
  [ low, high ].map do |i|
294
- URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
340
+ URL64_ORDER[ i / 64 ].chr + URL64_ORDER[ i % 64 ].chr if i
295
341
  end
296
342
  end
297
343
 
data/pom.xml CHANGED
@@ -5,13 +5,13 @@
5
5
  <groupId>iudex</groupId>
6
6
  <artifactId>iudex-da</artifactId>
7
7
  <packaging>jar</packaging>
8
- <version>1.3.2</version>
8
+ <version>1.3.3</version>
9
9
  <name>Iudex Data Access</name>
10
10
 
11
11
  <parent>
12
12
  <groupId>iudex</groupId>
13
13
  <artifactId>iudex-parent</artifactId>
14
- <version>1.2.1</version>
14
+ <version>1.3.0</version>
15
15
  <relativePath>..</relativePath>
16
16
  </parent>
17
17
 
@@ -20,7 +20,7 @@
20
20
  <dependency>
21
21
  <groupId>iudex</groupId>
22
22
  <artifactId>iudex-core</artifactId>
23
- <version>[1.2.1,1.3.999)</version>
23
+ <version>[1.3.0,1.3.999)</version>
24
24
  </dependency>
25
25
 
26
26
  <dependency>
@@ -33,13 +33,13 @@ class TestWorkPoller < MiniTest::Unit::TestCase
33
33
 
34
34
  URLS = [ [ "http://foo.gravitext.com/bar/1", 11 ],
35
35
  [ "http://hometown.com/33", 10 ],
36
- [ "http://gravitext.com/2", 9 ] ]
36
+ [ "http://gravitext.com/2", 9, "ALT" ] ]
37
37
 
38
38
  def setup
39
39
  Url.truncate
40
40
 
41
- URLS.each do | u, p |
42
- Url.create( :visit_url => u, :priority => p, :type => "PAGE" )
41
+ URLS.each do | u, p, t |
42
+ Url.create( :visit_url => u, :priority => p, :type => t || "PAGE" )
43
43
  end
44
44
 
45
45
  @factory = PoolDataSourceFactory.new( :loglevel => 4 )
@@ -124,17 +124,78 @@ class TestWorkPoller < MiniTest::Unit::TestCase
124
124
  end
125
125
 
126
126
  def test_poll_domain_union_2
127
- poller.domain_union = [ [ 'gravitext.com', 15000 ],
128
- [ nil, 10000 ] ]
127
+ poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
128
+ { :max => 10000 } ]
129
129
 
130
130
  result = poller.poll
131
131
  assert_equal( 3, result.size )
132
132
  end
133
133
 
134
134
  def test_poll_domain_union_3
135
- poller.domain_union = [ [ 'gravitext.com', 1 ],
136
- [ 'hometown.com', 1 ],
137
- [ nil, 3 ] ]
135
+ poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
136
+ { :domain => 'hometown.com', :max => 1 },
137
+ { :max => 3 } ]
138
+
139
+ result = poller.poll
140
+ assert_equal( 2, result.size )
141
+ end
142
+
143
+ def test_poll_domain_union_type_1
144
+ poller.domain_union = [
145
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 15000 } ]
146
+
147
+ result = poller.poll
148
+ assert_equal( 1, result.size )
149
+ end
150
+
151
+ def test_poll_domain_union_type_2
152
+ poller.domain_union = [
153
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
154
+ { :domain => 'gravitext.com', :max => 1 } ]
155
+
156
+ result = poller.poll
157
+ assert_equal( 2, result.size )
158
+ end
159
+
160
+ def test_poll_domain_union_type_3
161
+ poller.domain_union = [
162
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
163
+ { :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
164
+ { :domain => 'gravitext.com', :max => 1 } ]
165
+
166
+ result = poller.poll
167
+ assert_equal( 2, result.size )
168
+ end
169
+
170
+ def test_poll_domain_union_type_4
171
+ poller.domain_union = [
172
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
173
+ { :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
174
+ { :domain => 'gravitext.com', :max => 1 },
175
+ { :domain => 'hometown.com', :max => 1 } ]
176
+
177
+ result = poller.poll
178
+ assert_equal( 3, result.size )
179
+ end
180
+
181
+ def test_poll_domain_union_type_5
182
+ poller.domain_union = [ { :type => 'ALT', :max => 15000 } ]
183
+
184
+ result = poller.poll
185
+ assert_equal( 1, result.size )
186
+ end
187
+
188
+ def test_poll_domain_union_type_6
189
+ poller.domain_union = [ { :type => 'ALT', :max => 2 },
190
+ { :max => 3 } ]
191
+
192
+ result = poller.poll
193
+ assert_equal( 3, result.size )
194
+ end
195
+
196
+ def test_poll_domain_union_type_7
197
+ poller.domain_union = [ { :type => 'ALT', :max => 2 },
198
+ { :max => 1 } ]
138
199
 
139
200
  result = poller.poll
140
201
  assert_equal( 2, result.size )
metadata CHANGED
@@ -2,34 +2,28 @@
2
2
  name: iudex-da
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.2
5
+ version: 1.3.3
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-25 00:00:00.000000000 Z
12
+ date: 2012-11-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
16
16
  version_requirements: !ruby/object:Gem::Requirement
17
17
  requirements:
18
- - - ! '>='
19
- - !ruby/object:Gem::Version
20
- version: 1.2.1
21
- - - <
18
+ - - ~>
22
19
  - !ruby/object:Gem::Version
23
- version: '1.4'
20
+ version: 1.3.0
24
21
  none: false
25
22
  requirement: !ruby/object:Gem::Requirement
26
23
  requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: 1.2.1
30
- - - <
24
+ - - ~>
31
25
  - !ruby/object:Gem::Version
32
- version: '1.4'
26
+ version: 1.3.0
33
27
  none: false
34
28
  prerelease: false
35
29
  type: :runtime
@@ -119,13 +113,13 @@ dependencies:
119
113
  requirements:
120
114
  - - ~>
121
115
  - !ruby/object:Gem::Version
122
- version: '1.2'
116
+ version: '1.5'
123
117
  none: false
124
118
  requirement: !ruby/object:Gem::Requirement
125
119
  requirements:
126
120
  - - ~>
127
121
  - !ruby/object:Gem::Version
128
- version: '1.2'
122
+ version: '1.5'
129
123
  none: false
130
124
  prerelease: false
131
125
  type: :development
@@ -188,7 +182,7 @@ files:
188
182
  - test/test_pool_factory.rb
189
183
  - test/test_url_model.rb
190
184
  - test/test_work_poller.rb
191
- - lib/iudex-da/iudex-da-1.3.2.jar
185
+ - lib/iudex-da/iudex-da-1.3.3.jar
192
186
  homepage: http://iudex.gravitext.com
193
187
  licenses: []
194
188
  post_install_message:
Binary file