iudex-da 1.2.1-java → 1.3.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +30 -0
- data/Manifest.txt +10 -16
- data/bin/iudex-migrate +7 -3
- data/db/20111012173757_base.rb +117 -0
- data/db/{0070_add_created_at.rb → 20120930173600_uhash_collation_order.rb} +16 -10
- data/db/{simhash/0085_add_simhash_index.rb → index_next_visit/21500000000101_add_index_next_visit.rb} +5 -9
- data/db/{0081_remove_simhash_index.rb → simhash/21500000000001_add_simhash_index.rb} +5 -9
- data/lib/iudex-da.rb +3 -1
- data/lib/iudex-da/base.rb +1 -1
- data/lib/iudex-da/config.rb +3 -3
- data/lib/iudex-da/iudex-da-1.3.0.jar +0 -0
- data/lib/iudex-da/models.rb +66 -0
- data/lib/iudex-da/orm.rb +183 -0
- data/lib/iudex-da/work_poller.rb +307 -0
- data/pom.xml +2 -2
- data/test/setup.rb +7 -5
- data/test/test_migrate.rb +8 -22
- data/test/test_pool_factory.rb +24 -13
- data/test/test_url_model.rb +52 -0
- data/test/test_work_poller.rb +157 -0
- metadata +210 -185
- data/db/0010_base_urls.rb +0 -84
- data/db/0020_add_feed_metadata.rb +0 -37
- data/db/0021_more_feed_text.rb +0 -29
- data/db/0030_add_priority.rb +0 -28
- data/db/0040_add_visit_after.rb +0 -30
- data/db/0050_add_cache_location.rb +0 -32
- data/db/0060_url_indexes.rb +0 -41
- data/db/0080_add_simhash.rb +0 -33
- data/db/0110_host_to_domain.rb +0 -36
- data/db/index_next_visit/0100_add_index_next_visit.rb +0 -27
- data/lib/iudex-da/ar.rb +0 -66
- data/lib/iudex-da/iudex-da-1.2.1.jar +0 -0
- data/test/test_poll_work.rb +0 -132
@@ -0,0 +1,307 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-da'
|
18
|
+
require 'iudex-da/key_helper'
|
19
|
+
require 'rjack-slf4j'
|
20
|
+
|
21
|
+
module Iudex::DA
|
22
|
+
|
23
|
+
# A SQL based WorkPoller. Builds a SELECT against the urls table from
# the options below and hands the resulting rows to the visit queue.
class WorkPoller < Java::iudex.core.GenericWorkPollStrategy
  include Iudex::Filter::KeyHelper
  include Gravitext::HTMap

  import 'java.sql.SQLException'

  # If set > 0.0 group by domain and reduce priority for subsequent
  # urls within a common (registration level) domain (coefficient of
  # depth). This increases crawl throughput when many domains are
  # available. (default: nil, off)
  attr_accessor :domain_depth_coef

  # True if #domain_depth_coef is set to a value > 0.0.
  def domain_depth?
    domain_depth_coef && domain_depth_coef > 0.0
  end

  # Deprecated, use #domain_depth_coef (the reciprocal). Returns nil
  # when #domain_depth_coef is unset (the default), instead of
  # failing on division by nil.
  def host_depth_divisor
    1.0 / domain_depth_coef if domain_depth_coef
  end

  # Deprecated, use #domain_depth_coef= (reciprocal). A nil divisor
  # unsets #domain_depth_coef.
  def host_depth_divisor=( dv )
    @domain_depth_coef = dv ? ( 1.0 / dv ) : nil
  end

  # If #domain_depth_coef is set, this sets maximum urls for any
  # single (registration level) domain (default: 10_000)
  attr_accessor :max_domain_urls

  # Deprecated, use #max_domain_urls
  alias :max_host_urls :max_domain_urls

  # Deprecated, use #max_domain_urls=
  alias :max_host_urls= :max_domain_urls=

  # The limit of urls to obtain in a single poll (across all
  # domains) (default: 50_000)
  attr_accessor :max_urls

  # A secondary limit on the number of urls to consider, taking the
  # N highest basic priority urls. This is only ever applied when
  # #domain_depth_coef is set. (default: nil, off)
  attr_accessor :max_priority_urls

  # If set true, provide the final work list ordered in domain,
  # priority order (default: false)
  attr_writer :do_domain_group

  # True if the final work list should be ordered by domain, priority.
  def domain_group?
    @do_domain_group
  end

  # First age coefficient. If set > 0.0, adjust priority by the
  # equation:
  #
  #   priority + age_coef_1 * sqrt( age_coef_2 * age )
  #
  # Where age is ( now - next_visit_after ) in seconds.
  # (default: 0.2)
  attr_accessor :age_coef_1

  # Second age coefficient (default: 0.1)
  attr_accessor :age_coef_2

  # Truthy only if both age coefficients are set > 0.0.
  def aged_priority?
    ( age_coef_1 && age_coef_1 > 0.0 &&
      age_coef_2 && age_coef_2 > 0.0 )
  end

  # An Array of [ domain, max_urls ] pairs where each domain is a
  # unique registration-level, normalized lower-case domain. A nil
  # domain applies to all domains not covered by another
  # row. Without a nil domain row, work is limited to the explicit
  # domains listed. If provided these max_urls values are used
  # instead of top level #max_urls. Domain depth should most likely
  # be avoided if this feature is used. (default: [], off)
  #
  # NOTE(review): domain values are interpolated directly into the
  # generated SQL (see #generate_query), so they must come from
  # trusted configuration.
  attr_accessor :domain_union

  # An array containing a zero-based position and a total number of
  # evenly divided segments within the range of possible uhash
  # values. If set only work with uhashes in the designated range
  # will be polled. Note that the uhash is independent of domain,
  # being a hash on the entire URL. (default: nil, off)
  attr_accessor :uhash_slice

  # Create a poller reading via data_source using the given mapper.
  # Raises RuntimeError if the mapper's fields lack any of the url,
  # priority or next_visit_after keys.
  def initialize( data_source, mapper )
    super()

    @domain_depth_coef = nil
    @do_domain_group   = false

    @max_priority_urls = nil
    @max_domain_urls   = 10_000
    @max_urls          = 50_000

    @age_coef_1        = 0.2
    @age_coef_2        = 0.1

    @domain_union      = []

    @uhash_slice       = nil

    @log = RJack::SLF4J[ self.class ]
    #FIXME: Add accessor for log in GenericWorkPollStrategy

    keys( :url, :priority, :next_visit_after ).each do |k|
      unless mapper.fields.include?( k )
        # Report the missing key (the block variable is k).
        raise "WorkPoller needs mapper with #{k.name} included."
      end
    end

    @mapper = mapper
    @reader = ContentReader.new( data_source, mapper )
  end

  # Override GenericWorkPollStrategy. Adds the polled work to
  # visit_queue. A SQLException is logged rather than propagated, in
  # which case nothing is added.
  def pollWorkImpl( visit_queue )
    visit_queue.add_all( poll )
  rescue SQLException => x
    @log.error( "On poll: ", x )
  end

  # Poll work and return as List<UniMap>
  # Raises SQLException
  def poll
    query, params = generate_query
    @reader.select( query, *params )
  end

  # Returns [ query, params ]: the complete SQL text (whitespace
  # normalized) and the values for its "LIMIT ?" placeholders, one
  # per (sub)query, in order.
  def generate_query
    criteria = [ "next_visit_after <= now()" ]

    if uhash_slice
      min, max = url64_range( *uhash_slice )
      criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
      criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
    end

    params = []

    if @domain_union.empty?
      query = generate_query_inner( criteria )
      params = [ max_urls ]
    else
      subqueries = []
      @domain_union.each do | domain, dmax |
        next if dmax == 0
        c = criteria.dup
        if domain.nil?
          # The nil row covers everything not explicitly listed, so
          # exclude each of the named domains.
          c += @domain_union.map { |nd,_| nd }.
            compact.
            map { |nd| "domain != '#{nd}'" }
        else
          c << "domain = '#{domain}'"
        end
        subqueries << generate_query_inner( c )
        params << dmax
      end
      if subqueries.size == 1
        query = subqueries.first
      else
        query = "(" + subqueries.join( ") UNION ALL (" ) + ")"
      end
    end

    query = wrap_domain_group_query( fields, query ) if domain_group?

    query = query.gsub( /\s+/, ' ').strip

    [ query, params ]
  end

  # Build a single SELECT for the given criteria, ending in
  # "ORDER BY ... DESC LIMIT ?". With domain depth enabled the
  # ordering is on adj_priority, otherwise on priority.
  def generate_query_inner( criteria )

    query = filter_query(
      fields( ( :domain if domain_depth? || domain_group? ) ),
      ( max_priority_urls if domain_depth? ),
      criteria )

    if domain_depth?
      flds = fields( ( :domain if domain_group? ) )
      query = wrap_domain_partition_query( flds, query )
    end

    limit_priority = domain_depth? ? :adj_priority : :priority
    query += <<-SQL
      ORDER BY #{limit_priority} DESC
      LIMIT ?
    SQL

    query
  end

  # Wrap sub so that rows are numbered per domain in priority order
  # (dpos), rows beyond #max_domain_urls per domain are dropped, and
  # adj_priority is priority reduced by #domain_depth_coef for each
  # preceding row of the same domain.
  def wrap_domain_partition_query( flds, sub )
    <<-SQL
      SELECT #{clist flds}
      FROM ( SELECT #{clist flds},
                    ( priority - ( #{domain_depth_coef}::REAL * ( dpos - 1 ) )
                    )::REAL AS adj_priority
             FROM ( SELECT #{clist flds},
                           row_number() OVER (
                             PARTITION BY domain
                             ORDER BY priority DESC ) AS dpos
                    FROM ( #{ sub } ) AS subP
                  ) AS subH
             WHERE dpos <= #{max_domain_urls}
           ) AS subA
    SQL
  end

  # Basic SELECT of flds from urls matching all criteria. If
  # #aged_priority?, the priority column is replaced by the age
  # adjusted expression (aliased back to priority). If max is given,
  # the result is limited to the max highest priority rows.
  def filter_query( flds, max, criteria )

    if aged_priority?
      flds = flds.dup
      # Replace :priority in place, or append if not among flds.
      i = flds.index( :priority ) || flds.size
      flds[ i ] = <<-SQL
        ( priority +
          #{age_coef_1}::REAL *
          SQRT( #{age_coef_2}::REAL *
                EXTRACT( EPOCH FROM ( now() - next_visit_after ) ) )::REAL
        ) AS priority
      SQL
    end

    sql = <<-SQL
      SELECT #{clist flds}
      FROM urls
      WHERE #{and_list criteria}
    SQL

    sql += <<-SQL if max
      ORDER BY priority DESC
      LIMIT #{max}
    SQL

    sql
  end

  # Wrap sub to yield flds ordered by domain, then descending
  # priority.
  def wrap_domain_group_query( flds, sub )
    <<-SQL
      SELECT #{clist flds}
      FROM ( #{sub} ) AS subDG
      ORDER BY domain, priority DESC
    SQL
  end

  # URL 64 lexicon, ASCII or "C" LOCALE ordered
  URL64_ORDER = "-0123456789ABCDEFGHIJKLMNOPQRSTU" +
                "VWXYZ_abcdefghijklmnopqrstuvwxyz"

  # Given a zero-based position within some number of segments,
  # returns [ min, max ] bounds where min will be nil at pos=0, and
  # max will be nil at pos=segments-1. Non nil values are two
  # character uhash prefixes that can be used as selection criteria.
  # Raises RuntimeError unless 0 <= pos < segments.
  def url64_range( pos, segments )
    unless pos >= 0 && segments > pos
      raise "Invalid url64_range: 0 <= #{pos} < #{segments}"
    end

    # Two URL64 characters give 64 * 64 possible prefixes.
    period = ( 64 * 64 / segments.to_f )
    low  = ( period *  pos    ).round if  pos    > 0
    high = ( period * (pos+1) ).round if (pos+1) < segments

    [ low, high ].map do |i|
      URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
    end
  end

  # The mapper's field names as symbols, plus any additional (non-nil)
  # ksyms not already included.
  def fields( *ksyms )
    ( @mapper.fields.map { |k| k.name.to_sym } |
      ksyms.flatten.compact.map { |s| s.to_sym } )
  end

  # Join the non-nil elements of l as a comma separated list.
  def clist( l )
    l.compact.join( ', ' )
  end

  # Join the non-nil elements of l with " AND ".
  def and_list( l )
    l.compact.join( " AND " )
  end

end
|
306
|
+
|
307
|
+
end
|
data/pom.xml
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-da</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.
|
8
|
+
<version>1.3.0</version>
|
9
9
|
<name>Iudex Data Access</name>
|
10
10
|
|
11
11
|
<parent>
|
@@ -20,7 +20,7 @@
|
|
20
20
|
<dependency>
|
21
21
|
<groupId>iudex</groupId>
|
22
22
|
<artifactId>iudex-core</artifactId>
|
23
|
-
<version>[1.2.1,1.
|
23
|
+
<version>[1.2.1,1.3.999)</version>
|
24
24
|
</dependency>
|
25
25
|
|
26
26
|
<dependency>
|
data/test/setup.rb
CHANGED
@@ -28,17 +28,19 @@ module TestSetup
|
|
28
28
|
include RJack
|
29
29
|
Logback.config_console( :stderr => true, :thread => true )
|
30
30
|
|
31
|
-
|
32
|
-
|
31
|
+
VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
|
32
|
+
|
33
|
+
if VERBOSE
|
33
34
|
class TestOut
|
34
35
|
def print( *a ); $stdout.puts( *a ); end
|
35
36
|
def puts( *a ); $stdout.puts( *a ); end
|
36
37
|
end
|
37
38
|
MiniTest::Unit.output = TestOut.new
|
38
|
-
|
39
|
+
|
39
40
|
Logback.root.level = Logback::DEBUG
|
41
|
+
else
|
42
|
+
Logback[ 'iudex.da.sequel' ].level = :warn
|
43
|
+
Logback[ 'iudex.da.PoolDataSourceFactory' ].level = :warn
|
40
44
|
end
|
41
45
|
|
42
|
-
ARGV.delete( '--debug' )
|
43
|
-
|
44
46
|
end
|
data/test/test_migrate.rb
CHANGED
@@ -20,24 +20,20 @@
|
|
20
20
|
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
21
|
|
22
22
|
require 'iudex-da'
|
23
|
-
require 'iudex-da/
|
23
|
+
require 'iudex-da/orm'
|
24
24
|
|
25
25
|
class TestMigrate < MiniTest::Unit::TestCase
|
26
26
|
include Iudex::DA
|
27
27
|
include RJack
|
28
28
|
|
29
|
-
VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
|
30
|
-
|
31
29
|
def setup
|
32
|
-
unless VERBOSE
|
33
|
-
Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN
|
34
|
-
end
|
30
|
+
Logback[ 'iudex.da.sequel' ].level = :warn unless TestSetup::VERBOSE
|
35
31
|
end
|
36
32
|
|
37
33
|
def teardown
|
38
34
|
Hooker.send( :clear )
|
39
|
-
|
40
|
-
Logback[ 'iudex.da.
|
35
|
+
ORM.migrate
|
36
|
+
Logback[ 'iudex.da.sequel' ].level = nil
|
41
37
|
end
|
42
38
|
|
43
39
|
def test_default
|
@@ -55,20 +51,10 @@ class TestMigrate < MiniTest::Unit::TestCase
|
|
55
51
|
end
|
56
52
|
|
57
53
|
def check_up_down
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
pass
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def suppress_messages?( &block )
|
67
|
-
if VERBOSE
|
68
|
-
block.call
|
69
|
-
else
|
70
|
-
ActiveRecord::Migration.suppress_messages( &block )
|
71
|
-
end
|
54
|
+
ORM.migrate
|
55
|
+
pass
|
56
|
+
ORM.migrate( 0 )
|
57
|
+
pass
|
72
58
|
end
|
73
59
|
|
74
60
|
end
|
data/test/test_pool_factory.rb
CHANGED
@@ -19,18 +19,19 @@
|
|
19
19
|
require File.join( File.dirname( __FILE__ ), "setup" )
|
20
20
|
|
21
21
|
require 'iudex-core'
|
22
|
-
require 'iudex-da/ar'
|
23
22
|
|
24
23
|
require 'iudex-da'
|
25
24
|
require 'iudex-da/pool_data_source_factory'
|
26
25
|
|
27
26
|
class TestPoolFactory < MiniTest::Unit::TestCase
|
28
27
|
include Iudex::DA
|
28
|
+
include Iudex::Core
|
29
|
+
|
29
30
|
import 'org.apache.commons.dbutils.ResultSetHandler'
|
30
31
|
import 'org.apache.commons.dbutils.QueryRunner'
|
31
32
|
|
32
33
|
def setup
|
33
|
-
@factory = PoolDataSourceFactory.new( :loglevel =>
|
34
|
+
@factory = PoolDataSourceFactory.new( :loglevel => 4 )
|
34
35
|
@data_source = @factory.create
|
35
36
|
end
|
36
37
|
|
@@ -39,21 +40,31 @@ class TestPoolFactory < MiniTest::Unit::TestCase
|
|
39
40
|
@data_source = nil
|
40
41
|
end
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
# Really just want to test the factory and data_source but this
|
44
|
+
# makes a fine demonstration of dbutils query runner "just working"
|
45
|
+
# via ruby.
|
46
|
+
def test_query_runner
|
47
|
+
assert( @data_source )
|
48
|
+
qrun = QueryRunner.new( @data_source )
|
49
|
+
|
50
|
+
url = VisitURL.normalize( "http://gravitext.com/test" )
|
51
|
+
|
52
|
+
qrun.update( "TRUNCATE urls;" )
|
53
|
+
|
54
|
+
c = qrun.update( "INSERT into urls (uhash, url, domain, type ) " +
|
55
|
+
"VALUES (?,?,?,?);",
|
56
|
+
url.uhash, url.url, url.domain, "PAGE" )
|
57
|
+
assert_equal( 1, c )
|
58
|
+
|
59
|
+
out_domain = nil
|
60
|
+
qrun.query( "SELECT * FROM urls WHERE uhash = ?", url.uhash ) do |rs|
|
45
61
|
while rs.next
|
46
|
-
|
62
|
+
out_domain = rs.string( 'domain' )
|
47
63
|
end
|
48
|
-
nil
|
49
64
|
end
|
50
|
-
|
65
|
+
assert_equal( url.domain, out_domain )
|
51
66
|
|
52
|
-
|
53
|
-
refute( @data_source.nil? )
|
54
|
-
qrun = QueryRunner.new( @data_source )
|
55
|
-
qrun.query( "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );",
|
56
|
-
TestHandler.new )
|
67
|
+
assert_equal( 1, qrun.update( "DELETE from urls;" ) )
|
57
68
|
end
|
58
69
|
|
59
70
|
end
|