iudex-da 1.3.2-java → 1.3.3-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc CHANGED
@@ -1,3 +1,23 @@
1
+ === 1.3.3 (2012-11-08)
2
+ * FactoryHelper.create_update_filter now prefers an options Hash
3
+ exposing greater control over what is updated and how. In
4
+ particular, :on_referer can be independently set.
5
+ * Add (Base)Transformer, ContentUpdater, UpdateFilter support for a
6
+ distinct REFERER filter chain. Now content, referer, and references
7
+ updates are all optional.
8
+ * BaseTransformer.merge now augments the updated map with the current
9
+ (database) contents instead of creating a temporary map on which
10
+ UpdateFilter chain mutations would be discarded. This change makes
11
+ it consistent with either new or updated content.
12
+ * Add new options Hash syntax and :type support to
13
+ WorkPoller.domain_union
14
+ * Fix WorkPoller uhash_slice range calculation for ruby 1.8
15
+ * Intern :type values on read in ContentMapper
16
+ * Upgrade/narrow to iudex-core ~> 1.3.0 (incl. gravitext-util ~> 1.7.0)
17
+ * Upgrade to logback ~> 1.5 (dev)
18
+ * Add WorkPoller logging and consolidate log from
19
+ GenericWorkPollStrategy.
20
+
1
21
  === 1.3.2 (2012-10-25)
2
22
  * Add migration to make index_next_visit partial over non-null
3
23
  next_visit_after rows. (Index rebuild may take a while.)
data/Manifest.txt CHANGED
@@ -28,4 +28,4 @@ test/test_migrate.rb
28
28
  test/test_pool_factory.rb
29
29
  test/test_url_model.rb
30
30
  test/test_work_poller.rb
31
- lib/iudex-da/iudex-da-1.3.2.jar
31
+ lib/iudex-da/iudex-da-1.3.3.jar
data/lib/iudex-da/base.rb CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  module Iudex
18
18
  module DA
19
- VERSION = '1.3.2'
19
+ VERSION = '1.3.3'
20
20
 
21
21
  LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
22
22
  end
@@ -31,22 +31,88 @@ module Iudex
31
31
  @data_source ||= PoolDataSourceFactory.new.create
32
32
  end
33
33
 
34
- # Create UpdateFilter given fields and filter list factory
35
- # methods
36
- def create_update_filter( fields,
37
- post_sym = nil,
38
- update_sym = nil,
39
- new_sym = nil )
40
-
41
- f = UpdateFilter.new( data_source, field_mapper( fields ) )
42
- create_chain( update_sym ) { |c| f.update_ref_filter = c }
43
- create_chain( new_sym ) { |c| f.new_ref_filter = c }
44
- create_chain( post_sym ) { |c| f.content_filter = c }
34
+ # Create an UpdateFilter given the provided options.
35
+ #
36
+ # === Options
37
+ #
38
+ # :fields:: The Array of fields (Symbol or Key) to (re-)read
39
+ # from the database and update. The required :uhash
40
+ # field is included automatically.
41
+ #
42
+ # :max_retries:: Maximum number of retries not including the
43
+ # initial attempt, in case of a database
44
+ # conflict (Default: 3)
45
+ #
46
+ # :isolation_level:: A transaction isolation constant as
47
+ # defined in java.sql.Connection
48
+ # (Default: REPEATABLE_READ 0x04)
49
+ #
50
+ # :on_content:: Filter option for current (content) UniMap.
51
+ #
52
+ # :on_ref_update:: Filter option for REFERENCES that are
53
+ # already existing in the database
54
+ #
55
+ # :on_ref_new:: Filter option for REFERENCES that are new (not
56
+ # found in db).
57
+ #
58
+ # :on_referer:: Filter option for the REFERER to the current
59
+ # content.
60
+ #
61
+ # The positional parameters equivalent to
62
+ # ( :fields, :on_content, :on_ref_update, :on_ref_new )
63
+ # as defined above are also supported but deprecated.
64
+ #
65
+ # === Filter options
66
+ #
67
+ # Each of the on_* filter options defined above may take a
68
+ # value of a Filter, Array of filters or a Symbol value. The
69
+ # following symbol values are special, all other Symbol values
70
+ # are passed as the :filters option to create_chain, and are
71
+ # interpreted as a method name.
72
+ #
73
+ # :merge:: The default behavior of merging the in-memory state
74
+ # with the current database state. This is equivalent
75
+ # to providing a NoOpFilter.
76
+ #
77
+ # :ignore:: Do not update this element. This is equivalent to
78
+ # providing a filter that always rejects, but faster
79
+ # since it need not read the value from the db.
80
+ #
81
+ def create_update_filter( *args )
82
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
83
+
84
+ opts[ :fields ] ||= args.shift #deprecated
85
+ opts[ :on_content ] ||= args.shift
86
+ opts[ :on_ref_update ] ||= args.shift
87
+ opts[ :on_ref_new ] ||= args.shift
88
+
89
+ f = UpdateFilter.new( data_source, field_mapper( opts[ :fields ] ) )
90
+ updater_chain( opts[:on_ref_update]) { |c| f.update_ref_filter = c }
91
+ updater_chain( opts[ :on_ref_new ] ) { |c| f.new_ref_filter = c }
92
+ updater_chain( opts[ :on_content ] ) { |c| f.content_filter = c }
93
+ updater_chain( opts[ :on_referer ] ) { |c| f.referer_filter = c }
94
+
95
+ f.max_retries = opts[ :max_retries ] if opts[ :max_retries ]
96
+ f.isolation_level = opts[:isolation_level] if opts[:isolation_level]
97
+
45
98
  f
46
99
  end
47
100
 
48
- def create_read_filter( fields = [] )
49
- f = ReadFilter.new( data_source, field_mapper( fields ) )
101
+ # Create a ReadFilter given the provided options.
102
+ #
103
+ # === Options
104
+ #
105
+ # :fields:: The Array of fields (Symbol or Key) to read from
106
+ # the database.
107
+ #
108
+ # The positional parameter equivalent to ( :fields ) as
109
+ # defined above is also supported but deprecated.
110
+ #
111
+ def create_read_filter( *args )
112
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
113
+ opts[ :fields ] ||= args.shift #deprecated
114
+
115
+ ReadFilter.new( data_source, field_mapper( opts[ :fields ] ) )
50
116
  end
51
117
 
52
118
  def field_mapper( fields )
@@ -54,6 +120,18 @@ module Iudex
54
120
  ContentMapper.new( fields )
55
121
  end
56
122
 
123
+ def updater_chain( v, &block )
124
+ if v.is_a?( Iudex::Filter::Filter )
125
+ block.call( v )
126
+ elsif v == :merge
127
+ block.call( UpdateFilter::DEFAULT_MERGE )
128
+ elsif v == :ignore
129
+ block.call( nil )
130
+ else
131
+ create_chain( :filters => v, &block )
132
+ end
133
+ end
134
+
57
135
  end
58
136
 
59
137
  end
Binary file
@@ -90,19 +90,33 @@ module Iudex::DA
90
90
  age_coef_2 && age_coef_2 > 0.0 )
91
91
  end
92
92
 
93
- # An Array of [ domain, max_urls ] pairs where each domain is a
94
- # unique reqistration-level, normalized lower-case domain. A nil
95
- # domain applies to all domains not covered by another
96
- # row. Without a nil domain row, work is limited to the explicit
97
- # domains listed. If provided these max_urls values are used
98
- # instead of top level #max_urls. Domain depth should most likely
99
- # be avoided if this feature is used. (default: [], off)
93
+ # A table of option rows as defined below. A nil/unspecified
94
+ # domain and type row applies to all domains/types not covered by
95
+ # another row. Without such a row, work is limited to the explicit
96
+ # domains/types listed.
97
+ #
98
+ # ==== Options
99
+ #
100
+ # :domain:: The registration-level, normalized lower-case domain
101
+ # value.
102
+ #
103
+ # :type:: An (upper-case) TYPE value to be AND'd with a domain
104
+ # or may appear on its own, applying to all
105
+ # unconfigured domains.
106
+ #
107
+ # :max:: The maximum number of visit urls to obtain in one poll
108
+ # (instead of the top level #max_urls.) A zero :max
109
+ # value excludes this domain/type (efficiently).
110
+ #
111
+ # Also a [ domain, max ] alternative syntax is currently supported
112
+ # but deprecated.
113
+ #
100
114
  attr_accessor :domain_union
101
115
 
102
116
  # An array containing a zero-based position and a total number of
103
117
  # evenly divided segments within the range of possible uhash
104
118
  # values. If set only work with uhashes in the designated range
105
- # will be polled. Note that the uhash is indepedent of domain,
119
+ # will be polled. Note that the uhash is independent of domain,
106
120
  # being a hash on the entire URL. (default: nil, off)
107
121
  attr_accessor :uhash_slice
108
122
 
@@ -136,6 +150,11 @@ module Iudex::DA
136
150
  @data_source = data_source
137
151
  end
138
152
 
153
+ # Override GenericWorkPollStrategy
154
+ def log
155
+ @log.java_logger
156
+ end
157
+
139
158
  # Override GenericWorkPollStrategy
140
159
  def pollWorkImpl( visit_queue )
141
160
  visit_queue.add_all( poll )
@@ -147,6 +166,7 @@ module Iudex::DA
147
166
  # Raises SQLException
148
167
  def poll
149
168
  query, params = generate_query
169
+ @log.debug { "Poll query: #{query}; #{params.inspect}" }
150
170
  reader.select( query, *params )
151
171
  end
152
172
 
@@ -156,6 +176,16 @@ module Iudex::DA
156
176
  end
157
177
  end
158
178
 
179
+ def domain_union=( table )
180
+ @domain_union = table.map do | *args |
181
+ args = args.flatten.dup
182
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
183
+ opts[ :domain ] ||= args.shift
184
+ opts[ :max ] ||= args.shift
185
+ opts
186
+ end
187
+ end
188
+
159
189
  def generate_query
160
190
  criteria = [ "next_visit_after <= now()" ]
161
191
 
@@ -172,18 +202,34 @@ module Iudex::DA
172
202
  params = [ max_urls ]
173
203
  else
174
204
  subqueries = []
175
- @domain_union.each do | domain, dmax |
176
- next if dmax == 0
205
+ @domain_union.each do | opts |
206
+ opts = opts.dup
207
+ opts[ :max ] ||= @max_urls
208
+
209
+ next if opts[ :max ] == 0
210
+
177
211
  c = criteria.dup
178
- if domain.nil?
179
- c += @domain_union.map { |nd,_| nd }.
212
+ if opts[ :domain ].nil?
213
+ c += @domain_union.map { |r| r[ :domain ] }.
180
214
  compact.
215
+ uniq.
181
216
  map { |nd| "domain != '#{nd}'" }
182
217
  else
183
- c << "domain = '#{domain}'"
218
+ c << "domain = '#{opts[ :domain ]}'"
184
219
  end
220
+
221
+ if opts[ :type ].nil?
222
+ c += @domain_union.select { |r| r[ :domain ] == opts[ :domain ] }.
223
+ map { |r| r[ :type ] }.
224
+ compact.
225
+ uniq.
226
+ map { |nt| "type != '#{nt}'" }
227
+ elsif opts[ :type ]
228
+ c << "type = '#{opts[ :type ]}'"
229
+ end
230
+
185
231
  subqueries << generate_query_inner( c )
186
- params << dmax
232
+ params << opts[ :max ]
187
233
  end
188
234
  if subqueries.size == 1
189
235
  query = subqueries.first
@@ -291,7 +337,7 @@ module Iudex::DA
291
337
  high = ( period * (pos+1) ).round if (pos+1) < segments
292
338
 
293
339
  [ low, high ].map do |i|
294
- URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
340
+ URL64_ORDER[ i / 64 ].chr + URL64_ORDER[ i % 64 ].chr if i
295
341
  end
296
342
  end
297
343
 
data/pom.xml CHANGED
@@ -5,13 +5,13 @@
5
5
  <groupId>iudex</groupId>
6
6
  <artifactId>iudex-da</artifactId>
7
7
  <packaging>jar</packaging>
8
- <version>1.3.2</version>
8
+ <version>1.3.3</version>
9
9
  <name>Iudex Data Access</name>
10
10
 
11
11
  <parent>
12
12
  <groupId>iudex</groupId>
13
13
  <artifactId>iudex-parent</artifactId>
14
- <version>1.2.1</version>
14
+ <version>1.3.0</version>
15
15
  <relativePath>..</relativePath>
16
16
  </parent>
17
17
 
@@ -20,7 +20,7 @@
20
20
  <dependency>
21
21
  <groupId>iudex</groupId>
22
22
  <artifactId>iudex-core</artifactId>
23
- <version>[1.2.1,1.3.999)</version>
23
+ <version>[1.3.0,1.3.999)</version>
24
24
  </dependency>
25
25
 
26
26
  <dependency>
@@ -33,13 +33,13 @@ class TestWorkPoller < MiniTest::Unit::TestCase
33
33
 
34
34
  URLS = [ [ "http://foo.gravitext.com/bar/1", 11 ],
35
35
  [ "http://hometown.com/33", 10 ],
36
- [ "http://gravitext.com/2", 9 ] ]
36
+ [ "http://gravitext.com/2", 9, "ALT" ] ]
37
37
 
38
38
  def setup
39
39
  Url.truncate
40
40
 
41
- URLS.each do | u, p |
42
- Url.create( :visit_url => u, :priority => p, :type => "PAGE" )
41
+ URLS.each do | u, p, t |
42
+ Url.create( :visit_url => u, :priority => p, :type => t || "PAGE" )
43
43
  end
44
44
 
45
45
  @factory = PoolDataSourceFactory.new( :loglevel => 4 )
@@ -124,17 +124,78 @@ class TestWorkPoller < MiniTest::Unit::TestCase
124
124
  end
125
125
 
126
126
  def test_poll_domain_union_2
127
- poller.domain_union = [ [ 'gravitext.com', 15000 ],
128
- [ nil, 10000 ] ]
127
+ poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
128
+ { :max => 10000 } ]
129
129
 
130
130
  result = poller.poll
131
131
  assert_equal( 3, result.size )
132
132
  end
133
133
 
134
134
  def test_poll_domain_union_3
135
- poller.domain_union = [ [ 'gravitext.com', 1 ],
136
- [ 'hometown.com', 1 ],
137
- [ nil, 3 ] ]
135
+ poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
136
+ { :domain => 'hometown.com', :max => 1 },
137
+ { :max => 3 } ]
138
+
139
+ result = poller.poll
140
+ assert_equal( 2, result.size )
141
+ end
142
+
143
+ def test_poll_domain_union_type_1
144
+ poller.domain_union = [
145
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 15000 } ]
146
+
147
+ result = poller.poll
148
+ assert_equal( 1, result.size )
149
+ end
150
+
151
+ def test_poll_domain_union_type_2
152
+ poller.domain_union = [
153
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
154
+ { :domain => 'gravitext.com', :max => 1 } ]
155
+
156
+ result = poller.poll
157
+ assert_equal( 2, result.size )
158
+ end
159
+
160
+ def test_poll_domain_union_type_3
161
+ poller.domain_union = [
162
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
163
+ { :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
164
+ { :domain => 'gravitext.com', :max => 1 } ]
165
+
166
+ result = poller.poll
167
+ assert_equal( 2, result.size )
168
+ end
169
+
170
+ def test_poll_domain_union_type_4
171
+ poller.domain_union = [
172
+ { :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
173
+ { :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
174
+ { :domain => 'gravitext.com', :max => 1 },
175
+ { :domain => 'hometown.com', :max => 1 } ]
176
+
177
+ result = poller.poll
178
+ assert_equal( 3, result.size )
179
+ end
180
+
181
+ def test_poll_domain_union_type_5
182
+ poller.domain_union = [ { :type => 'ALT', :max => 15000 } ]
183
+
184
+ result = poller.poll
185
+ assert_equal( 1, result.size )
186
+ end
187
+
188
+ def test_poll_domain_union_type_6
189
+ poller.domain_union = [ { :type => 'ALT', :max => 2 },
190
+ { :max => 3 } ]
191
+
192
+ result = poller.poll
193
+ assert_equal( 3, result.size )
194
+ end
195
+
196
+ def test_poll_domain_union_type_7
197
+ poller.domain_union = [ { :type => 'ALT', :max => 2 },
198
+ { :max => 1 } ]
138
199
 
139
200
  result = poller.poll
140
201
  assert_equal( 2, result.size )
metadata CHANGED
@@ -2,34 +2,28 @@
2
2
  name: iudex-da
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.2
5
+ version: 1.3.3
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-25 00:00:00.000000000 Z
12
+ date: 2012-11-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
16
16
  version_requirements: !ruby/object:Gem::Requirement
17
17
  requirements:
18
- - - ! '>='
19
- - !ruby/object:Gem::Version
20
- version: 1.2.1
21
- - - <
18
+ - - ~>
22
19
  - !ruby/object:Gem::Version
23
- version: '1.4'
20
+ version: 1.3.0
24
21
  none: false
25
22
  requirement: !ruby/object:Gem::Requirement
26
23
  requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: 1.2.1
30
- - - <
24
+ - - ~>
31
25
  - !ruby/object:Gem::Version
32
- version: '1.4'
26
+ version: 1.3.0
33
27
  none: false
34
28
  prerelease: false
35
29
  type: :runtime
@@ -119,13 +113,13 @@ dependencies:
119
113
  requirements:
120
114
  - - ~>
121
115
  - !ruby/object:Gem::Version
122
- version: '1.2'
116
+ version: '1.5'
123
117
  none: false
124
118
  requirement: !ruby/object:Gem::Requirement
125
119
  requirements:
126
120
  - - ~>
127
121
  - !ruby/object:Gem::Version
128
- version: '1.2'
122
+ version: '1.5'
129
123
  none: false
130
124
  prerelease: false
131
125
  type: :development
@@ -188,7 +182,7 @@ files:
188
182
  - test/test_pool_factory.rb
189
183
  - test/test_url_model.rb
190
184
  - test/test_work_poller.rb
191
- - lib/iudex-da/iudex-da-1.3.2.jar
185
+ - lib/iudex-da/iudex-da-1.3.3.jar
192
186
  homepage: http://iudex.gravitext.com
193
187
  licenses: []
194
188
  post_install_message:
Binary file