iudex-da 1.3.2-java → 1.3.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +20 -0
- data/Manifest.txt +1 -1
- data/lib/iudex-da/base.rb +1 -1
- data/lib/iudex-da/factory_helper.rb +91 -13
- data/lib/iudex-da/iudex-da-1.3.3.jar +0 -0
- data/lib/iudex-da/work_poller.rb +61 -15
- data/pom.xml +3 -3
- data/test/test_work_poller.rb +69 -8
- metadata +9 -15
- data/lib/iudex-da/iudex-da-1.3.2.jar +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,23 @@
|
|
1
|
+
=== 1.3.3 (2012-11-8)
|
2
|
+
* FactoryHelper.create_update_filter now prefers an options Hash
|
3
|
+
exposing greater control over what is updated and how. In
|
4
|
+
particular, :on_referer can be independently set.
|
5
|
+
* Add (Base)Transformer, ContentUpdater, UpdateFilter support for a
|
6
|
+
distinct REFERER filter chain. Now content, referer, and references
|
7
|
+
updates are all optional.
|
8
|
+
* BaseTransformer.merge now augments the updated map with the current
|
9
|
+
(database) contents instead of creating a temporary map on which
|
10
|
+
UpdateFilter chain mutations would be discarded. This change makes
|
11
|
+
it consistent with either new or updated content.
|
12
|
+
* Add new options Hash syntax and :type support to
|
13
|
+
WorkPoller.domain_union
|
14
|
+
* Fix WorkPoller uhash_slice range calculation for ruby 1.8
|
15
|
+
* Intern :type values on read in ContentMapper
|
16
|
+
* Upgrade/narrow to iudex-core ~> 1.3.0 (incl. gravitext-util ~> 1.7.0)
|
17
|
+
* Upgrade to logback ~> 1.5 (dev)
|
18
|
+
* Add WorkPoller logging and consolidate log from
|
19
|
+
GenericWorkPollStrategy.
|
20
|
+
|
1
21
|
=== 1.3.2 (2012-10-25)
|
2
22
|
* Add migration to make index_next_visit partial over non-null
|
3
23
|
next_visit_after rows. (Index rebuild may take a while.)
|
data/Manifest.txt
CHANGED
data/lib/iudex-da/base.rb
CHANGED
@@ -31,22 +31,88 @@ module Iudex
|
|
31
31
|
@data_source ||= PoolDataSourceFactory.new.create
|
32
32
|
end
|
33
33
|
|
34
|
-
# Create UpdateFilter given
|
35
|
-
#
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
34
|
+
# Create an UpdateFilter given the provided options.
|
35
|
+
#
|
36
|
+
# === Options
|
37
|
+
#
|
38
|
+
# :fields:: The Array of fields (Symbol or Key) to (re-)read
|
39
|
+
# from the database and update. The required :uhash
|
40
|
+
# field is included automatically.
|
41
|
+
#
|
42
|
+
# :max_retries:: Maximum number of retries not including the
|
43
|
+
# initial attempt, in case of a database
|
44
|
+
# conflict (Default: 3)
|
45
|
+
#
|
46
|
+
# :isolation_level:: A transaction isolation constant as
|
47
|
+
# defined in java.sql.Connection
|
48
|
+
# (Default: REPEATABLE_READ 0x04)
|
49
|
+
#
|
50
|
+
# :on_content:: Filter option for current (content) UniMap.
|
51
|
+
#
|
52
|
+
# :on_ref_update:: Filter option for REFERENCES that are
|
53
|
+
# already existing in the database
|
54
|
+
#
|
55
|
+
# :on_ref_new:: Filter option for REFERENCES that are new (not
|
56
|
+
# found in db).
|
57
|
+
#
|
58
|
+
# :on_referer:: Filter option for the REFERER to the current
|
59
|
+
# content.
|
60
|
+
#
|
61
|
+
# The positional parameters equivalent to
|
62
|
+
# ( :fields, :on_content, :on_ref_update, :on_ref_new )
|
63
|
+
# as defined above are also supported but deprecated.
|
64
|
+
#
|
65
|
+
# === Filter options
|
66
|
+
#
|
67
|
+
# Each of the on_* filter options defined above may take a
|
68
|
+
# value of a Filter, Array of filters or a Symbol value. The
|
69
|
+
# following symbol values are special; all other Symbol values
|
70
|
+
# are passed as the :filters option to create_chain, and are
|
71
|
+
# interpreted as a method name.
|
72
|
+
#
|
73
|
+
# :merge:: The default behavior of merging the in-memory state
|
74
|
+
# with the current database state. This is equivalent
|
75
|
+
# to providing a NoOpFilter.
|
76
|
+
#
|
77
|
+
# :ignore:: Do not update this element. This is equivalent to
|
78
|
+
# providing a filter that always rejects, but faster
|
79
|
+
# since it need not read the value from the db.
|
80
|
+
#
|
81
|
+
def create_update_filter( *args )
|
82
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
83
|
+
|
84
|
+
opts[ :fields ] ||= args.shift #deprecated
|
85
|
+
opts[ :on_content ] ||= args.shift
|
86
|
+
opts[ :on_ref_update ] ||= args.shift
|
87
|
+
opts[ :on_ref_new ] ||= args.shift
|
88
|
+
|
89
|
+
f = UpdateFilter.new( data_source, field_mapper( opts[ :fields ] ) )
|
90
|
+
updater_chain( opts[:on_ref_update]) { |c| f.update_ref_filter = c }
|
91
|
+
updater_chain( opts[ :on_ref_new ] ) { |c| f.new_ref_filter = c }
|
92
|
+
updater_chain( opts[ :on_content ] ) { |c| f.content_filter = c }
|
93
|
+
updater_chain( opts[ :on_referer ] ) { |c| f.referer_filter = c }
|
94
|
+
|
95
|
+
f.max_retries = opts[ :max_retries ] if opts[ :max_retries ]
|
96
|
+
f.isolation_level = opts[:isolation_level] if opts[:isolation_level]
|
97
|
+
|
45
98
|
f
|
46
99
|
end
|
47
100
|
|
48
|
-
|
49
|
-
|
101
|
+
# Create a ReadFilter given the provided options.
|
102
|
+
#
|
103
|
+
# === Options
|
104
|
+
#
|
105
|
+
# :fields:: The Array of fields (Symbol or Key) to read from
|
106
|
+
# the database.
|
107
|
+
#
|
108
|
+
# The positional parameter equivalent to ( :fields ) as
|
109
|
+
# defined above is also supported but deprecated.
|
110
|
+
#
|
111
|
+
def create_read_filter( *args )
|
112
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
113
|
+
opts[ :fields ] ||= args.shift #deprecated
|
114
|
+
|
115
|
+
ReadFilter.new( data_source, field_mapper( opts[ :fields ] ) )
|
50
116
|
end
|
51
117
|
|
52
118
|
def field_mapper( fields )
|
@@ -54,6 +120,18 @@ module Iudex
|
|
54
120
|
ContentMapper.new( fields )
|
55
121
|
end
|
56
122
|
|
123
|
+
def updater_chain( v, &block )
|
124
|
+
if v.is_a?( Iudex::Filter::Filter )
|
125
|
+
block.call( v )
|
126
|
+
elsif v == :merge
|
127
|
+
block.call( UpdateFilter::DEFAULT_MERGE )
|
128
|
+
elsif v == :ignore
|
129
|
+
block.call( nil )
|
130
|
+
else
|
131
|
+
create_chain( :filters => v, &block )
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
57
135
|
end
|
58
136
|
|
59
137
|
end
|
Binary file
|
data/lib/iudex-da/work_poller.rb
CHANGED
@@ -90,19 +90,33 @@ module Iudex::DA
|
|
90
90
|
age_coef_2 && age_coef_2 > 0.0 )
|
91
91
|
end
|
92
92
|
|
93
|
-
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
#
|
97
|
-
#
|
98
|
-
#
|
99
|
-
#
|
93
|
+
# A table of option rows as defined below. A nil/unspecified
|
94
|
+
# domain and type row applies to all domains/types not covered by
|
95
|
+
# another row. Without such a row, work is limited to the explicit
|
96
|
+
# domains/types listed.
|
97
|
+
#
|
98
|
+
# ==== Options
|
99
|
+
#
|
100
|
+
# :domain:: The registration-level, normalized lower-case domain
|
101
|
+
# value.
|
102
|
+
#
|
103
|
+
# :type:: An (upper-case) TYPE value to be AND'd with a domain
|
104
|
+
# or may appear on its own, applying to all
|
105
|
+
# unconfigured domains.
|
106
|
+
#
|
107
|
+
# :max:: The maximum number of visit urls to obtain in one poll
|
108
|
+
# (instead of the top level #max_urls.) A zero :max
|
109
|
+
# value excludes this domain/type (efficiently).
|
110
|
+
#
|
111
|
+
# Also a [ domain, max ] alternative syntax is currently supported
|
112
|
+
# but deprecated.
|
113
|
+
#
|
100
114
|
attr_accessor :domain_union
|
101
115
|
|
102
116
|
# An array containing a zero-based position and a total number of
|
103
117
|
# evenly divided segments within the range of possible uhash
|
104
118
|
# values. If set only work with uhashes in the designated range
|
105
|
-
# will be polled. Note that the uhash is
|
119
|
+
# will be polled. Note that the uhash is independent of domain,
|
106
120
|
# being a hash on the entire URL. (default: nil, off)
|
107
121
|
attr_accessor :uhash_slice
|
108
122
|
|
@@ -136,6 +150,11 @@ module Iudex::DA
|
|
136
150
|
@data_source = data_source
|
137
151
|
end
|
138
152
|
|
153
|
+
# Override GenericWorkPollStrategy
|
154
|
+
def log
|
155
|
+
@log.java_logger
|
156
|
+
end
|
157
|
+
|
139
158
|
# Override GenericWorkPollStrategy
|
140
159
|
def pollWorkImpl( visit_queue )
|
141
160
|
visit_queue.add_all( poll )
|
@@ -147,6 +166,7 @@ module Iudex::DA
|
|
147
166
|
# Raises SQLException
|
148
167
|
def poll
|
149
168
|
query, params = generate_query
|
169
|
+
@log.debug { "Poll query: #{query}; #{params.inspect}" }
|
150
170
|
reader.select( query, *params )
|
151
171
|
end
|
152
172
|
|
@@ -156,6 +176,16 @@ module Iudex::DA
|
|
156
176
|
end
|
157
177
|
end
|
158
178
|
|
179
|
+
def domain_union=( table )
|
180
|
+
@domain_union = table.map do | *args |
|
181
|
+
args = args.flatten.dup
|
182
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
183
|
+
opts[ :domain ] ||= args.shift
|
184
|
+
opts[ :max ] ||= args.shift
|
185
|
+
opts
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
159
189
|
def generate_query
|
160
190
|
criteria = [ "next_visit_after <= now()" ]
|
161
191
|
|
@@ -172,18 +202,34 @@ module Iudex::DA
|
|
172
202
|
params = [ max_urls ]
|
173
203
|
else
|
174
204
|
subqueries = []
|
175
|
-
@domain_union.each do |
|
176
|
-
|
205
|
+
@domain_union.each do | opts |
|
206
|
+
opts = opts.dup
|
207
|
+
opts[ :max ] ||= @max_urls
|
208
|
+
|
209
|
+
next if opts[ :max ] == 0
|
210
|
+
|
177
211
|
c = criteria.dup
|
178
|
-
if domain.nil?
|
179
|
-
c += @domain_union.map { |
|
212
|
+
if opts[ :domain ].nil?
|
213
|
+
c += @domain_union.map { |r| r[ :domain ] }.
|
180
214
|
compact.
|
215
|
+
uniq.
|
181
216
|
map { |nd| "domain != '#{nd}'" }
|
182
217
|
else
|
183
|
-
c << "domain = '#{domain}'"
|
218
|
+
c << "domain = '#{opts[ :domain ]}'"
|
184
219
|
end
|
220
|
+
|
221
|
+
if opts[ :type ].nil?
|
222
|
+
c += @domain_union.select { |r| r[ :domain ] == opts[ :domain ] }.
|
223
|
+
map { |r| r[ :type ] }.
|
224
|
+
compact.
|
225
|
+
uniq.
|
226
|
+
map { |nt| "type != '#{nt}'" }
|
227
|
+
elsif opts[ :type ]
|
228
|
+
c << "type = '#{opts[ :type ]}'"
|
229
|
+
end
|
230
|
+
|
185
231
|
subqueries << generate_query_inner( c )
|
186
|
-
params <<
|
232
|
+
params << opts[ :max ]
|
187
233
|
end
|
188
234
|
if subqueries.size == 1
|
189
235
|
query = subqueries.first
|
@@ -291,7 +337,7 @@ module Iudex::DA
|
|
291
337
|
high = ( period * (pos+1) ).round if (pos+1) < segments
|
292
338
|
|
293
339
|
[ low, high ].map do |i|
|
294
|
-
URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
|
340
|
+
URL64_ORDER[ i / 64 ].chr + URL64_ORDER[ i % 64 ].chr if i
|
295
341
|
end
|
296
342
|
end
|
297
343
|
|
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-da</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.3.
|
8
|
+
<version>1.3.3</version>
|
9
9
|
<name>Iudex Data Access</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.3.0</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -20,7 +20,7 @@
|
|
20
20
|
<dependency>
|
21
21
|
<groupId>iudex</groupId>
|
22
22
|
<artifactId>iudex-core</artifactId>
|
23
|
-
<version>[1.
|
23
|
+
<version>[1.3.0,1.3.999)</version>
|
24
24
|
</dependency>
|
25
25
|
|
26
26
|
<dependency>
|
data/test/test_work_poller.rb
CHANGED
@@ -33,13 +33,13 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
33
33
|
|
34
34
|
URLS = [ [ "http://foo.gravitext.com/bar/1", 11 ],
|
35
35
|
[ "http://hometown.com/33", 10 ],
|
36
|
-
[ "http://gravitext.com/2", 9 ] ]
|
36
|
+
[ "http://gravitext.com/2", 9, "ALT" ] ]
|
37
37
|
|
38
38
|
def setup
|
39
39
|
Url.truncate
|
40
40
|
|
41
|
-
URLS.each do | u, p |
|
42
|
-
Url.create( :visit_url => u, :priority => p, :type => "PAGE" )
|
41
|
+
URLS.each do | u, p, t |
|
42
|
+
Url.create( :visit_url => u, :priority => p, :type => t || "PAGE" )
|
43
43
|
end
|
44
44
|
|
45
45
|
@factory = PoolDataSourceFactory.new( :loglevel => 4 )
|
@@ -124,17 +124,78 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
124
124
|
end
|
125
125
|
|
126
126
|
def test_poll_domain_union_2
|
127
|
-
poller.domain_union = [
|
128
|
-
|
127
|
+
poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
|
128
|
+
{ :max => 10000 } ]
|
129
129
|
|
130
130
|
result = poller.poll
|
131
131
|
assert_equal( 3, result.size )
|
132
132
|
end
|
133
133
|
|
134
134
|
def test_poll_domain_union_3
|
135
|
-
poller.domain_union = [
|
136
|
-
|
137
|
-
|
135
|
+
poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
|
136
|
+
{ :domain => 'hometown.com', :max => 1 },
|
137
|
+
{ :max => 3 } ]
|
138
|
+
|
139
|
+
result = poller.poll
|
140
|
+
assert_equal( 2, result.size )
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_poll_domain_union_type_1
|
144
|
+
poller.domain_union = [
|
145
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 15000 } ]
|
146
|
+
|
147
|
+
result = poller.poll
|
148
|
+
assert_equal( 1, result.size )
|
149
|
+
end
|
150
|
+
|
151
|
+
def test_poll_domain_union_type_2
|
152
|
+
poller.domain_union = [
|
153
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
|
154
|
+
{ :domain => 'gravitext.com', :max => 1 } ]
|
155
|
+
|
156
|
+
result = poller.poll
|
157
|
+
assert_equal( 2, result.size )
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_poll_domain_union_type_3
|
161
|
+
poller.domain_union = [
|
162
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
|
163
|
+
{ :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
|
164
|
+
{ :domain => 'gravitext.com', :max => 1 } ]
|
165
|
+
|
166
|
+
result = poller.poll
|
167
|
+
assert_equal( 2, result.size )
|
168
|
+
end
|
169
|
+
|
170
|
+
def test_poll_domain_union_type_4
|
171
|
+
poller.domain_union = [
|
172
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
|
173
|
+
{ :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
|
174
|
+
{ :domain => 'gravitext.com', :max => 1 },
|
175
|
+
{ :domain => 'hometown.com', :max => 1 } ]
|
176
|
+
|
177
|
+
result = poller.poll
|
178
|
+
assert_equal( 3, result.size )
|
179
|
+
end
|
180
|
+
|
181
|
+
def test_poll_domain_union_type_5
|
182
|
+
poller.domain_union = [ { :type => 'ALT', :max => 15000 } ]
|
183
|
+
|
184
|
+
result = poller.poll
|
185
|
+
assert_equal( 1, result.size )
|
186
|
+
end
|
187
|
+
|
188
|
+
def test_poll_domain_union_type_6
|
189
|
+
poller.domain_union = [ { :type => 'ALT', :max => 2 },
|
190
|
+
{ :max => 3 } ]
|
191
|
+
|
192
|
+
result = poller.poll
|
193
|
+
assert_equal( 3, result.size )
|
194
|
+
end
|
195
|
+
|
196
|
+
def test_poll_domain_union_type_7
|
197
|
+
poller.domain_union = [ { :type => 'ALT', :max => 2 },
|
198
|
+
{ :max => 1 } ]
|
138
199
|
|
139
200
|
result = poller.poll
|
140
201
|
assert_equal( 2, result.size )
|
metadata
CHANGED
@@ -2,34 +2,28 @@
|
|
2
2
|
name: iudex-da
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.
|
5
|
+
version: 1.3.3
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: iudex-core
|
16
16
|
version_requirements: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: 1.2.1
|
21
|
-
- - <
|
18
|
+
- - ~>
|
22
19
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
20
|
+
version: 1.3.0
|
24
21
|
none: false
|
25
22
|
requirement: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 1.2.1
|
30
|
-
- - <
|
24
|
+
- - ~>
|
31
25
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
26
|
+
version: 1.3.0
|
33
27
|
none: false
|
34
28
|
prerelease: false
|
35
29
|
type: :runtime
|
@@ -119,13 +113,13 @@ dependencies:
|
|
119
113
|
requirements:
|
120
114
|
- - ~>
|
121
115
|
- !ruby/object:Gem::Version
|
122
|
-
version: '1.
|
116
|
+
version: '1.5'
|
123
117
|
none: false
|
124
118
|
requirement: !ruby/object:Gem::Requirement
|
125
119
|
requirements:
|
126
120
|
- - ~>
|
127
121
|
- !ruby/object:Gem::Version
|
128
|
-
version: '1.
|
122
|
+
version: '1.5'
|
129
123
|
none: false
|
130
124
|
prerelease: false
|
131
125
|
type: :development
|
@@ -188,7 +182,7 @@ files:
|
|
188
182
|
- test/test_pool_factory.rb
|
189
183
|
- test/test_url_model.rb
|
190
184
|
- test/test_work_poller.rb
|
191
|
-
- lib/iudex-da/iudex-da-1.3.
|
185
|
+
- lib/iudex-da/iudex-da-1.3.3.jar
|
192
186
|
homepage: http://iudex.gravitext.com
|
193
187
|
licenses: []
|
194
188
|
post_install_message:
|
Binary file
|