iudex-da 1.3.2-java → 1.3.3-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +20 -0
- data/Manifest.txt +1 -1
- data/lib/iudex-da/base.rb +1 -1
- data/lib/iudex-da/factory_helper.rb +91 -13
- data/lib/iudex-da/iudex-da-1.3.3.jar +0 -0
- data/lib/iudex-da/work_poller.rb +61 -15
- data/pom.xml +3 -3
- data/test/test_work_poller.rb +69 -8
- metadata +9 -15
- data/lib/iudex-da/iudex-da-1.3.2.jar +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,23 @@
|
|
1
|
+
=== 1.3.3 (2012-11-8)
|
2
|
+
* FactoryHelper.create_update_filter now prefers an options Hash
|
3
|
+
exposing greater control over what is updated and how. In
|
4
|
+
particular, :on_referer can be independently set.
|
5
|
+
* Add (Base)Transformer, ContentUpdater, UpdateFilter support for a
|
6
|
+
distinct REFERER filter chain. Now content, referer, and references
|
7
|
+
updates are all optional.
|
8
|
+
* BaseTransformer.merge now augments the updated map with the current
|
9
|
+
(database) contents instead of creating a temporary map on which
|
10
|
+
UpdateFilter chain mutations would be discarded. This change makes
|
11
|
+
it consistent with either new or updated content.
|
12
|
+
* Add new options Hash syntax and :type support to
|
13
|
+
WorkPoller.domain_union
|
14
|
+
* Fix WorkPoller uhash_slice range calculation for ruby 1.8
|
15
|
+
* Intern :type values on read in ContentMapper
|
16
|
+
* Upgrade/narrow to iudex-core ~> 1.3.0 (incl. gravitext-util ~> 1.7.0)
|
17
|
+
* Upgrade to logback ~> 1.5 (dev)
|
18
|
+
* Add WorkPoller logging and consolidate log from
|
19
|
+
GenericWorkPollStrategy.
|
20
|
+
|
1
21
|
=== 1.3.2 (2012-10-25)
|
2
22
|
* Add migration to make index_next_visit partial over non-null
|
3
23
|
next_visit_after rows. (Index rebuild may take a while.)
|
data/Manifest.txt
CHANGED
data/lib/iudex-da/base.rb
CHANGED
@@ -31,22 +31,88 @@ module Iudex
|
|
31
31
|
@data_source ||= PoolDataSourceFactory.new.create
|
32
32
|
end
|
33
33
|
|
34
|
-
# Create UpdateFilter given
|
35
|
-
#
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
34
|
+
# Create an UpdateFilter given the provided options.
|
35
|
+
#
|
36
|
+
# === Options
|
37
|
+
#
|
38
|
+
# :fields:: The Array of fields (Symbol or Key) to (re-)read
|
39
|
+
# from the database and update. The required :uhash
|
40
|
+
# field is included automatically.
|
41
|
+
#
|
42
|
+
# :max_retries:: Maximum number of retries not including the
|
43
|
+
# initial attempt, in case of a database
|
44
|
+
# conflict (Default: 3)
|
45
|
+
#
|
46
|
+
# :isolation_level:: A transaction isolation constant as
|
47
|
+
# defined in java.sql.Connection
|
48
|
+
# (Default: REPEATABLE_READ 0x04)
|
49
|
+
#
|
50
|
+
# :on_content:: Filter option for current (content) UniMap.
|
51
|
+
#
|
52
|
+
# :on_ref_update:: Filter option for REFERENCES that are
|
53
|
+
# already existing in the database
|
54
|
+
#
|
55
|
+
# :on_ref_new:: Filter option for REFERENCES that are new (not
|
56
|
+
# found in db).
|
57
|
+
#
|
58
|
+
# :on_referer:: Filter option for the REFERER to the current
|
59
|
+
# content.
|
60
|
+
#
|
61
|
+
# The positional parameters equivalent to
|
62
|
+
# ( :fields, :on_content, :on_ref_update, :on_ref_new )
|
63
|
+
# as defined above are also supported but deprecated.
|
64
|
+
#
|
65
|
+
# === Filter options
|
66
|
+
#
|
67
|
+
# Each of the on_* filter options defined above may take a
|
68
|
+
# value of a Filter, Array of filters or a Symbol value. The
|
69
|
+
# following symbol values are special, all other Symbol values
|
70
|
+
# are passed as the :filters option to create_chain, and are
|
71
|
+
# interpreted as a method name.
|
72
|
+
#
|
73
|
+
# :merge:: The default behavior of merging the in-memory state
|
74
|
+
# with the current database state. This is equivalent
|
75
|
+
# to providing a NoOpFilter.
|
76
|
+
#
|
77
|
+
# :ignore:: Do not update this element. This is equivalent to
|
78
|
+
# providing a filter that always rejects, but faster
|
79
|
+
# since it need not read the value from the db.
|
80
|
+
#
|
81
|
+
def create_update_filter( *args )
|
82
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
83
|
+
|
84
|
+
opts[ :fields ] ||= args.shift #deprecated
|
85
|
+
opts[ :on_content ] ||= args.shift
|
86
|
+
opts[ :on_ref_update ] ||= args.shift
|
87
|
+
opts[ :on_ref_new ] ||= args.shift
|
88
|
+
|
89
|
+
f = UpdateFilter.new( data_source, field_mapper( opts[ :fields ] ) )
|
90
|
+
updater_chain( opts[:on_ref_update]) { |c| f.update_ref_filter = c }
|
91
|
+
updater_chain( opts[ :on_ref_new ] ) { |c| f.new_ref_filter = c }
|
92
|
+
updater_chain( opts[ :on_content ] ) { |c| f.content_filter = c }
|
93
|
+
updater_chain( opts[ :on_referer ] ) { |c| f.referer_filter = c }
|
94
|
+
|
95
|
+
f.max_retries = opts[ :max_retries ] if opts[ :max_retries ]
|
96
|
+
f.isolation_level = opts[:isolation_level] if opts[:isolation_level]
|
97
|
+
|
45
98
|
f
|
46
99
|
end
|
47
100
|
|
48
|
-
|
49
|
-
|
101
|
+
# Create a ReadFilter given the provided options.
|
102
|
+
#
|
103
|
+
# === Options
|
104
|
+
#
|
105
|
+
# :fields:: The Array of fields (Symbol or Key) to read from
|
106
|
+
# the database.
|
107
|
+
#
|
108
|
+
# The positional parameter equivalent to ( :fields ) as
|
109
|
+
# defined above is also supported but deprecated.
|
110
|
+
#
|
111
|
+
def create_read_filter( *args )
|
112
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
113
|
+
opts[ :fields ] ||= args.shift #deprecated
|
114
|
+
|
115
|
+
ReadFilter.new( data_source, field_mapper( opts[ :fields ] ) )
|
50
116
|
end
|
51
117
|
|
52
118
|
def field_mapper( fields )
|
@@ -54,6 +120,18 @@ module Iudex
|
|
54
120
|
ContentMapper.new( fields )
|
55
121
|
end
|
56
122
|
|
123
|
+
def updater_chain( v, &block )
|
124
|
+
if v.is_a?( Iudex::Filter::Filter )
|
125
|
+
block.call( v )
|
126
|
+
elsif v == :merge
|
127
|
+
block.call( UpdateFilter::DEFAULT_MERGE )
|
128
|
+
elsif v == :ignore
|
129
|
+
block.call( nil )
|
130
|
+
else
|
131
|
+
create_chain( :filters => v, &block )
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
57
135
|
end
|
58
136
|
|
59
137
|
end
|
Binary file
|
data/lib/iudex-da/work_poller.rb
CHANGED
@@ -90,19 +90,33 @@ module Iudex::DA
|
|
90
90
|
age_coef_2 && age_coef_2 > 0.0 )
|
91
91
|
end
|
92
92
|
|
93
|
-
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
#
|
97
|
-
#
|
98
|
-
#
|
99
|
-
#
|
93
|
+
# A table of option rows as defined below. A nil/unspecified
|
94
|
+
# domain and type row applies to all domains/types not covered by
|
95
|
+
# another row. Without such a row, work is limited to the explicit
|
96
|
+
# domains/types listed.
|
97
|
+
#
|
98
|
+
# ==== Options
|
99
|
+
#
|
100
|
+
# :domain:: The registration-level, normalized lower-case domain
|
101
|
+
# value.
|
102
|
+
#
|
103
|
+
# :type:: An (upper-case) TYPE value to be AND'd with a domain
|
104
|
+
# or may appear on its own, applying to all
|
105
|
+
# unconfigured domains.
|
106
|
+
#
|
107
|
+
# :max:: The maximum number of visit urls to obtain in one poll
|
108
|
+
# (instead of the top level #max_urls.) A zero :max
|
109
|
+
# value excludes this domain/type (efficiently).
|
110
|
+
#
|
111
|
+
# Also a [ domain, max ] alternative syntax is currently supported
|
112
|
+
# but deprecated.
|
113
|
+
#
|
100
114
|
attr_accessor :domain_union
|
101
115
|
|
102
116
|
# An array containing a zero-based position and a total number of
|
103
117
|
# evenly divided segments within the range of possible uhash
|
104
118
|
# values. If set only work with uhashes in the designated range
|
105
|
-
# will be polled. Note that the uhash is
|
119
|
+
# will be polled. Note that the uhash is independent of domain,
|
106
120
|
# being a hash on the entire URL. (default: nil, off)
|
107
121
|
attr_accessor :uhash_slice
|
108
122
|
|
@@ -136,6 +150,11 @@ module Iudex::DA
|
|
136
150
|
@data_source = data_source
|
137
151
|
end
|
138
152
|
|
153
|
+
# Override GenericWorkPollStrategy
|
154
|
+
def log
|
155
|
+
@log.java_logger
|
156
|
+
end
|
157
|
+
|
139
158
|
# Override GenericWorkPollStrategy
|
140
159
|
def pollWorkImpl( visit_queue )
|
141
160
|
visit_queue.add_all( poll )
|
@@ -147,6 +166,7 @@ module Iudex::DA
|
|
147
166
|
# Raises SQLException
|
148
167
|
def poll
|
149
168
|
query, params = generate_query
|
169
|
+
@log.debug { "Poll query: #{query}; #{params.inspect}" }
|
150
170
|
reader.select( query, *params )
|
151
171
|
end
|
152
172
|
|
@@ -156,6 +176,16 @@ module Iudex::DA
|
|
156
176
|
end
|
157
177
|
end
|
158
178
|
|
179
|
+
def domain_union=( table )
|
180
|
+
@domain_union = table.map do | *args |
|
181
|
+
args = args.flatten.dup
|
182
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
183
|
+
opts[ :domain ] ||= args.shift
|
184
|
+
opts[ :max ] ||= args.shift
|
185
|
+
opts
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
159
189
|
def generate_query
|
160
190
|
criteria = [ "next_visit_after <= now()" ]
|
161
191
|
|
@@ -172,18 +202,34 @@ module Iudex::DA
|
|
172
202
|
params = [ max_urls ]
|
173
203
|
else
|
174
204
|
subqueries = []
|
175
|
-
@domain_union.each do |
|
176
|
-
|
205
|
+
@domain_union.each do | opts |
|
206
|
+
opts = opts.dup
|
207
|
+
opts[ :max ] ||= @max_urls
|
208
|
+
|
209
|
+
next if opts[ :max ] == 0
|
210
|
+
|
177
211
|
c = criteria.dup
|
178
|
-
if domain.nil?
|
179
|
-
c += @domain_union.map { |
|
212
|
+
if opts[ :domain ].nil?
|
213
|
+
c += @domain_union.map { |r| r[ :domain ] }.
|
180
214
|
compact.
|
215
|
+
uniq.
|
181
216
|
map { |nd| "domain != '#{nd}'" }
|
182
217
|
else
|
183
|
-
c << "domain = '#{domain}'"
|
218
|
+
c << "domain = '#{opts[ :domain ]}'"
|
184
219
|
end
|
220
|
+
|
221
|
+
if opts[ :type ].nil?
|
222
|
+
c += @domain_union.select { |r| r[ :domain ] == opts[ :domain ] }.
|
223
|
+
map { |r| r[ :type ] }.
|
224
|
+
compact.
|
225
|
+
uniq.
|
226
|
+
map { |nt| "type != '#{nt}'" }
|
227
|
+
elsif opts[ :type ]
|
228
|
+
c << "type = '#{opts[ :type ]}'"
|
229
|
+
end
|
230
|
+
|
185
231
|
subqueries << generate_query_inner( c )
|
186
|
-
params <<
|
232
|
+
params << opts[ :max ]
|
187
233
|
end
|
188
234
|
if subqueries.size == 1
|
189
235
|
query = subqueries.first
|
@@ -291,7 +337,7 @@ module Iudex::DA
|
|
291
337
|
high = ( period * (pos+1) ).round if (pos+1) < segments
|
292
338
|
|
293
339
|
[ low, high ].map do |i|
|
294
|
-
URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
|
340
|
+
URL64_ORDER[ i / 64 ].chr + URL64_ORDER[ i % 64 ].chr if i
|
295
341
|
end
|
296
342
|
end
|
297
343
|
|
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-da</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.3.
|
8
|
+
<version>1.3.3</version>
|
9
9
|
<name>Iudex Data Access</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.3.0</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -20,7 +20,7 @@
|
|
20
20
|
<dependency>
|
21
21
|
<groupId>iudex</groupId>
|
22
22
|
<artifactId>iudex-core</artifactId>
|
23
|
-
<version>[1.
|
23
|
+
<version>[1.3.0,1.3.999)</version>
|
24
24
|
</dependency>
|
25
25
|
|
26
26
|
<dependency>
|
data/test/test_work_poller.rb
CHANGED
@@ -33,13 +33,13 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
33
33
|
|
34
34
|
URLS = [ [ "http://foo.gravitext.com/bar/1", 11 ],
|
35
35
|
[ "http://hometown.com/33", 10 ],
|
36
|
-
[ "http://gravitext.com/2", 9 ] ]
|
36
|
+
[ "http://gravitext.com/2", 9, "ALT" ] ]
|
37
37
|
|
38
38
|
def setup
|
39
39
|
Url.truncate
|
40
40
|
|
41
|
-
URLS.each do | u, p |
|
42
|
-
Url.create( :visit_url => u, :priority => p, :type => "PAGE" )
|
41
|
+
URLS.each do | u, p, t |
|
42
|
+
Url.create( :visit_url => u, :priority => p, :type => t || "PAGE" )
|
43
43
|
end
|
44
44
|
|
45
45
|
@factory = PoolDataSourceFactory.new( :loglevel => 4 )
|
@@ -124,17 +124,78 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
124
124
|
end
|
125
125
|
|
126
126
|
def test_poll_domain_union_2
|
127
|
-
poller.domain_union = [
|
128
|
-
|
127
|
+
poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
|
128
|
+
{ :max => 10000 } ]
|
129
129
|
|
130
130
|
result = poller.poll
|
131
131
|
assert_equal( 3, result.size )
|
132
132
|
end
|
133
133
|
|
134
134
|
def test_poll_domain_union_3
|
135
|
-
poller.domain_union = [
|
136
|
-
|
137
|
-
|
135
|
+
poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
|
136
|
+
{ :domain => 'hometown.com', :max => 1 },
|
137
|
+
{ :max => 3 } ]
|
138
|
+
|
139
|
+
result = poller.poll
|
140
|
+
assert_equal( 2, result.size )
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_poll_domain_union_type_1
|
144
|
+
poller.domain_union = [
|
145
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 15000 } ]
|
146
|
+
|
147
|
+
result = poller.poll
|
148
|
+
assert_equal( 1, result.size )
|
149
|
+
end
|
150
|
+
|
151
|
+
def test_poll_domain_union_type_2
|
152
|
+
poller.domain_union = [
|
153
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
|
154
|
+
{ :domain => 'gravitext.com', :max => 1 } ]
|
155
|
+
|
156
|
+
result = poller.poll
|
157
|
+
assert_equal( 2, result.size )
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_poll_domain_union_type_3
|
161
|
+
poller.domain_union = [
|
162
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
|
163
|
+
{ :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
|
164
|
+
{ :domain => 'gravitext.com', :max => 1 } ]
|
165
|
+
|
166
|
+
result = poller.poll
|
167
|
+
assert_equal( 2, result.size )
|
168
|
+
end
|
169
|
+
|
170
|
+
def test_poll_domain_union_type_4
|
171
|
+
poller.domain_union = [
|
172
|
+
{ :domain => 'gravitext.com', :type => 'ALT', :max => 1 },
|
173
|
+
{ :domain => 'gravitext.com', :type => 'NOT', :max => 1 },
|
174
|
+
{ :domain => 'gravitext.com', :max => 1 },
|
175
|
+
{ :domain => 'hometown.com', :max => 1 } ]
|
176
|
+
|
177
|
+
result = poller.poll
|
178
|
+
assert_equal( 3, result.size )
|
179
|
+
end
|
180
|
+
|
181
|
+
def test_poll_domain_union_type_5
|
182
|
+
poller.domain_union = [ { :type => 'ALT', :max => 15000 } ]
|
183
|
+
|
184
|
+
result = poller.poll
|
185
|
+
assert_equal( 1, result.size )
|
186
|
+
end
|
187
|
+
|
188
|
+
def test_poll_domain_union_type_6
|
189
|
+
poller.domain_union = [ { :type => 'ALT', :max => 2 },
|
190
|
+
{ :max => 3 } ]
|
191
|
+
|
192
|
+
result = poller.poll
|
193
|
+
assert_equal( 3, result.size )
|
194
|
+
end
|
195
|
+
|
196
|
+
def test_poll_domain_union_type_7
|
197
|
+
poller.domain_union = [ { :type => 'ALT', :max => 2 },
|
198
|
+
{ :max => 1 } ]
|
138
199
|
|
139
200
|
result = poller.poll
|
140
201
|
assert_equal( 2, result.size )
|
metadata
CHANGED
@@ -2,34 +2,28 @@
|
|
2
2
|
name: iudex-da
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.
|
5
|
+
version: 1.3.3
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: iudex-core
|
16
16
|
version_requirements: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: 1.2.1
|
21
|
-
- - <
|
18
|
+
- - ~>
|
22
19
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
20
|
+
version: 1.3.0
|
24
21
|
none: false
|
25
22
|
requirement: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 1.2.1
|
30
|
-
- - <
|
24
|
+
- - ~>
|
31
25
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
26
|
+
version: 1.3.0
|
33
27
|
none: false
|
34
28
|
prerelease: false
|
35
29
|
type: :runtime
|
@@ -119,13 +113,13 @@ dependencies:
|
|
119
113
|
requirements:
|
120
114
|
- - ~>
|
121
115
|
- !ruby/object:Gem::Version
|
122
|
-
version: '1.
|
116
|
+
version: '1.5'
|
123
117
|
none: false
|
124
118
|
requirement: !ruby/object:Gem::Requirement
|
125
119
|
requirements:
|
126
120
|
- - ~>
|
127
121
|
- !ruby/object:Gem::Version
|
128
|
-
version: '1.
|
122
|
+
version: '1.5'
|
129
123
|
none: false
|
130
124
|
prerelease: false
|
131
125
|
type: :development
|
@@ -188,7 +182,7 @@ files:
|
|
188
182
|
- test/test_pool_factory.rb
|
189
183
|
- test/test_url_model.rb
|
190
184
|
- test/test_work_poller.rb
|
191
|
-
- lib/iudex-da/iudex-da-1.3.
|
185
|
+
- lib/iudex-da/iudex-da-1.3.3.jar
|
192
186
|
homepage: http://iudex.gravitext.com
|
193
187
|
licenses: []
|
194
188
|
post_install_message:
|
Binary file
|