iudex-da 1.2.1-java → 1.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +30 -0
- data/Manifest.txt +10 -16
- data/bin/iudex-migrate +7 -3
- data/db/20111012173757_base.rb +117 -0
- data/db/{0070_add_created_at.rb → 20120930173600_uhash_collation_order.rb} +16 -10
- data/db/{simhash/0085_add_simhash_index.rb → index_next_visit/21500000000101_add_index_next_visit.rb} +5 -9
- data/db/{0081_remove_simhash_index.rb → simhash/21500000000001_add_simhash_index.rb} +5 -9
- data/lib/iudex-da.rb +3 -1
- data/lib/iudex-da/base.rb +1 -1
- data/lib/iudex-da/config.rb +3 -3
- data/lib/iudex-da/iudex-da-1.3.0.jar +0 -0
- data/lib/iudex-da/models.rb +66 -0
- data/lib/iudex-da/orm.rb +183 -0
- data/lib/iudex-da/work_poller.rb +307 -0
- data/pom.xml +2 -2
- data/test/setup.rb +7 -5
- data/test/test_migrate.rb +8 -22
- data/test/test_pool_factory.rb +24 -13
- data/test/test_url_model.rb +52 -0
- data/test/test_work_poller.rb +157 -0
- metadata +210 -185
- data/db/0010_base_urls.rb +0 -84
- data/db/0020_add_feed_metadata.rb +0 -37
- data/db/0021_more_feed_text.rb +0 -29
- data/db/0030_add_priority.rb +0 -28
- data/db/0040_add_visit_after.rb +0 -30
- data/db/0050_add_cache_location.rb +0 -32
- data/db/0060_url_indexes.rb +0 -41
- data/db/0080_add_simhash.rb +0 -33
- data/db/0110_host_to_domain.rb +0 -36
- data/db/index_next_visit/0100_add_index_next_visit.rb +0 -27
- data/lib/iudex-da/ar.rb +0 -66
- data/lib/iudex-da/iudex-da-1.2.1.jar +0 -0
- data/test/test_poll_work.rb +0 -132
data/lib/iudex-da/work_poller.rb
ADDED
@@ -0,0 +1,307 @@
+#--
+# Copyright (c) 2008-2012 David Kellum
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You may
+# obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#++
+
+require 'iudex-da'
+require 'iudex-da/key_helper'
+require 'rjack-slf4j'
+
+module Iudex::DA
+
+  # A SQL based WorkPoller
+  class WorkPoller < Java::iudex.core.GenericWorkPollStrategy
+    include Iudex::Filter::KeyHelper
+    include Gravitext::HTMap
+
+    import 'java.sql.SQLException'
+
+    # If set > 0.0 group by domain and reduce priority for subsequent
+    # urls within a common (registration level) domain (coefficient of
+    # depth). This increases crawl throughput when many domains are
+    # available. (default: nil, off)
+    attr_accessor :domain_depth_coef
+
+    def domain_depth?
+      domain_depth_coef && domain_depth_coef > 0.0
+    end
+
+    # Deprecated, use #domain_depth_coef (the reciprocal)
+    def host_depth_divisor
+      1.0 / domain_depth_coef
+    end
+
+    # Deprecated, use #domain_depth_coef= (reciprocal)
+    def host_depth_divisor=( dv )
+      @domain_depth_coef = 1.0 / dv
+    end
+
+    # If #domain_depth_coef is set, this sets maximum urls for any
+    # single (registration level) domain (default: 10_000)
+    attr_accessor :max_domain_urls
+
+    # Deprecated, use #max_domain_urls
+    alias :max_host_urls :max_domain_urls
+
+    # Deprecated, use #max_domain_urls=
+    alias :max_host_urls= :max_domain_urls=
+
+    # The limit of urls to obtain in a single poll (across all
+    # domains) (default: 50_000)
+    attr_accessor :max_urls
+
+    # A secondary limit on the number of urls to consider, taking the
+    # N high basic priority urls. This is only ever applied when
+    # #domain_depth_coef is set. (default: nil, off)
+    attr_accessor :max_priority_urls
+
+    # If set true, provide the final work list ordered in domain,
+    # priority order (default: false)
+    attr_writer :do_domain_group
+
+    def domain_group?
+      @do_domain_group
+    end
+
+    # First age coefficient. If set > 0.0, adjust priority by the
+    # equation:
+    #
+    #   priority + age_coef_1 * sqrt( age_coef_2 * age )
+    #
+    # Where age is now - next_visit_after the (default: 0.2)
+    attr_accessor :age_coef_1
+
+    # Second age coefficient (default: 0.1)
+    attr_accessor :age_coef_2
+
+    def aged_priority?
+      ( age_coef_1 && age_coef_1 > 0.0 &&
+        age_coef_2 && age_coef_2 > 0.0 )
+    end
+
+    # An Array of [ domain, max_urls ] pairs where each domain is a
+    # unique reqistration-level, normalized lower-case domain. A nil
+    # domain applies to all domains not covered by another
+    # row. Without a nil domain row, work is limited to the explicit
+    # domains listed. If provided these max_urls values are used
+    # instead of top level #max_urls. Domain depth should most likely
+    # be avoided if this feature is used. (default: [], off)
+    attr_accessor :domain_union
+
+    # An array containing a zero-based position and a total number of
+    # evenly divided segments within the range of possible uhash
+    # values. If set only work with uhashes in the designated range
+    # will be polled. Note that the uhash is indepedent of domain,
+    # being a hash on the entire URL. (default: nil, off)
+    attr_accessor :uhash_slice
+
+    def initialize( data_source, mapper )
+      super()
+
+      @domain_depth_coef = nil
+      @do_domain_group = false
+
+      @max_priority_urls = nil
+      @max_domain_urls = 10_000
+      @max_urls = 50_000
+
+      @age_coef_1 = 0.2
+      @age_coef_2 = 0.1
+
+      @domain_union = []
+
+      @uhash_slice = nil
+
+      @log = RJack::SLF4J[ self.class ]
+      #FIXME: Add accessor for log in GenericWorkPollStrategy
+
+      keys( :url, :priority, :next_visit_after ).each do |k|
+        unless mapper.fields.include?( k )
+          raise "WorkPoller needs mapper with #{key.name} included."
+        end
+      end
+
+      @mapper = mapper
+      @reader = ContentReader.new( data_source, mapper )
+    end
+
+    # Override GenericWorkPollStrategy
+    def pollWorkImpl( visit_queue )
+      visit_queue.add_all( poll )
+    rescue SQLException => x
+      @log.error( "On poll: ", x )
+    end
+
+    # Poll work and return as List<UniMap>
+    # Raises SQLException
+    def poll
+      query, params = generate_query
+      @reader.select( query, *params )
+    end
+
+    def generate_query
+      criteria = [ "next_visit_after <= now()" ]
+
+      if uhash_slice
+        min, max = url64_range( *uhash_slice )
+        criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
+        criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
+      end
+
+      params = []
+
+      if @domain_union.empty?
+        query = generate_query_inner( criteria )
+        params = [ max_urls ]
+      else
+        subqueries = []
+        @domain_union.each do | domain, dmax |
+          next if dmax == 0
+          c = criteria.dup
+          if domain.nil?
+            c += @domain_union.map { |nd,_| nd }.
+              compact.
+              map { |nd| "domain != '#{nd}'" }
+          else
+            c << "domain = '#{domain}'"
+          end
+          subqueries << generate_query_inner( c )
+          params << dmax
+        end
+        if subqueries.size == 1
+          query = subqueries.first
+        else
+          query = "(" + subqueries.join( ") UNION ALL (" ) + ")"
+        end
+      end
+
+      query = wrap_domain_group_query( fields, query ) if domain_group?
+
+      query = query.gsub( /\s+/, ' ').strip
+
+      [ query, params ]
+    end
+
+    def generate_query_inner( criteria )
+
+      query = filter_query(
+        fields( ( :domain if domain_depth? || domain_group? ) ),
+        ( max_priority_urls if domain_depth? ),
+        criteria )
+
+      if domain_depth?
+        flds = fields( ( :domain if domain_group? ) )
+        query = wrap_domain_partition_query( flds, query )
+      end
+
+      limit_priority = domain_depth? ? :adj_priority : :priority
+      query += <<-SQL
+        ORDER BY #{limit_priority} DESC
+        LIMIT ?
+      SQL
+
+      query
+    end
+
+    def wrap_domain_partition_query( flds, sub )
+      <<-SQL
+        SELECT #{clist flds}
+        FROM ( SELECT #{clist flds},
+                 ( priority - ( #{domain_depth_coef}::REAL * ( dpos - 1 ) )
+                 )::REAL AS adj_priority
+               FROM ( SELECT #{clist flds},
+                        row_number() OVER (
+                          PARTITION BY domain
+                          ORDER BY priority DESC ) AS dpos
+                      FROM ( #{ sub } ) AS subP
+                    ) AS subH
+               WHERE dpos <= #{max_domain_urls}
+             ) AS subA
+      SQL
+    end
+
+    def filter_query( flds, max, criteria )
+
+      if aged_priority?
+        flds = flds.dup
+        i = flds.index( :priority ) || flds.size
+        flds[ i ] = <<-SQL
+          ( priority +
+            #{age_coef_1}::REAL *
+            SQRT( #{age_coef_2}::REAL *
+                  EXTRACT( EPOCH FROM ( now() - next_visit_after ) ) )::REAL
+          ) AS priority
+        SQL
+      end
+
+      sql = <<-SQL
+        SELECT #{clist flds}
+        FROM urls
+        WHERE #{and_list criteria}
+      SQL
+
+      sql += <<-SQL if max
+        ORDER BY priority DESC
+        LIMIT #{max}
+      SQL
+
+      sql
+    end
+
+    def wrap_domain_group_query( flds, sub )
+      <<-SQL
+        SELECT #{clist flds}
+        FROM ( #{sub} ) AS subDG
+        ORDER BY domain, priority DESC
+      SQL
+    end
+
+    # URL 64 lexicon, ASCII or "C" LOCALE ordered
+    URL64_ORDER = "-0123456789ABCDEFGHIJKLMNOPQRSTU" +
+                  "VWXYZ_abcdefghijklmnopqrstuvwxyz"
+
+    # Given a zero-based position within some number of segments,
+    # returns [ min, max ] bounds where min will be nil at pos=0, and
+    # max will be nil at pos=segments-1. Non nil values are uhash
+    # prefixes that can be used as selection criteria.
+    def url64_range( pos, segments )
+      unless pos >= 0 && segments > pos
+        raise "Invalid url64_range: 0 <= #{pos} < #{segments}"
+      end
+
+      period = ( 64 * 64 / segments.to_f )
+      low  = ( period * pos ).round if pos > 0
+      high = ( period * (pos+1) ).round if (pos+1) < segments
+
+      [ low, high ].map do |i|
+        URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
+      end
+    end
+
+    def fields( *ksyms )
+      ( @mapper.fields.map { |k| k.name.to_sym } |
+        ksyms.flatten.compact.map { |s| s.to_sym } )
+    end
+
+    def clist( l )
+      l.compact.join( ', ' )
+    end
+
+    def and_list( l )
+      l.compact.join( " AND " )
+    end
+
+  end
+
+end
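For a sense of how the default age coefficients bias polling toward overdue URLs, here is the priority + age_coef_1 * sqrt( age_coef_2 * age ) adjustment documented above, reduced to plain Ruby arithmetic. This is only an illustrative sketch: the package evaluates the same expression in SQL via SQRT and EXTRACT( EPOCH ... ), and the helper name, base priority 1.0, and sample ages below are made up for the example.

    # Adjusted priority for a row whose next_visit_after lies age_seconds in the
    # past, using the defaults from WorkPoller#initialize (0.2 and 0.1).
    def aged_priority( priority, age_seconds, age_coef_1 = 0.2, age_coef_2 = 0.1 )
      priority + age_coef_1 * Math.sqrt( age_coef_2 * age_seconds )
    end

    [ 60, 3600, 86_400 ].each do |age|
      printf( "overdue %6ds -> adjusted priority %.2f\n", age, aged_priority( 1.0, age ) )
    end
    # overdue     60s -> adjusted priority 1.49
    # overdue   3600s -> adjusted priority 4.79
    # overdue  86400s -> adjusted priority 19.59

So a URL with base priority 1.0 that is a day overdue ranks roughly alongside a fresh URL of priority 19.6, which is what lets long-neglected URLs eventually win a slot in the ORDER BY priority DESC LIMIT query.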
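The new uhash_slice tunable partitions polling work by a two-character prefix of the URL hash. Below is a minimal standalone sketch of the url64_range arithmetic from the new work_poller.rb (plain Ruby, no iudex gems or database required); the four-segment split is an illustrative choice, while URL64_ORDER and the bounds math are copied from the diff above.

    URL64_ORDER = "-0123456789ABCDEFGHIJKLMNOPQRSTU" +
                  "VWXYZ_abcdefghijklmnopqrstuvwxyz"

    # Returns [ min, max ] uhash prefixes for segment pos of segments total;
    # min is nil for the first segment and max is nil for the last.
    def url64_range( pos, segments )
      unless pos >= 0 && segments > pos
        raise "Invalid url64_range: 0 <= #{pos} < #{segments}"
      end

      period = ( 64 * 64 / segments.to_f )
      low  = ( period * pos ).round if pos > 0
      high = ( period * (pos+1) ).round if (pos+1) < segments

      [ low, high ].map do |i|
        URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
      end
    end

    4.times { |pos| p url64_range( pos, 4 ) }
    # => [nil, "F-"], ["F-", "V-"], ["V-", "k-"], ["k-", nil]
    #
    # A poller configured with uhash_slice = [ 1, 4 ] would therefore add
    #   uhash > ( 'F-' COLLATE "C" ) AND uhash < ( 'V-' COLLATE "C" )
    # to the WHERE clause built in generate_query.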
data/pom.xml
CHANGED
@@ -5,7 +5,7 @@
   <groupId>iudex</groupId>
   <artifactId>iudex-da</artifactId>
   <packaging>jar</packaging>
-  <version>1.2.1</version>
+  <version>1.3.0</version>
   <name>Iudex Data Access</name>

   <parent>
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>iudex</groupId>
       <artifactId>iudex-core</artifactId>
-      <version>[1.2.1,1.
+      <version>[1.2.1,1.3.999)</version>
     </dependency>

     <dependency>
data/test/setup.rb
CHANGED
@@ -28,17 +28,19 @@ module TestSetup
   include RJack
   Logback.config_console( :stderr => true, :thread => true )

-
-
+  VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
+
+  if VERBOSE
     class TestOut
       def print( *a ); $stdout.puts( *a ); end
       def puts( *a ); $stdout.puts( *a ); end
     end
     MiniTest::Unit.output = TestOut.new
-
+
     Logback.root.level = Logback::DEBUG
+  else
+    Logback[ 'iudex.da.sequel' ].level = :warn
+    Logback[ 'iudex.da.PoolDataSourceFactory' ].level = :warn
   end

-  ARGV.delete( '--debug' )
-
 end
data/test/test_migrate.rb
CHANGED
@@ -20,24 +20,20 @@
 require File.join( File.dirname( __FILE__ ), "setup" )

 require 'iudex-da'
-require 'iudex-da/ar'
+require 'iudex-da/orm'

 class TestMigrate < MiniTest::Unit::TestCase
   include Iudex::DA
   include RJack

-  VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
-
   def setup
-    unless VERBOSE
-      Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN
-    end
+    Logback[ 'iudex.da.sequel' ].level = :warn unless TestSetup::VERBOSE
   end

   def teardown
     Hooker.send( :clear )
-
-    Logback[ 'iudex.da.
+    ORM.migrate
+    Logback[ 'iudex.da.sequel' ].level = nil
   end

   def test_default
@@ -55,20 +51,10 @@ class TestMigrate < MiniTest::Unit::TestCase
   end

   def check_up_down
-
-
-
-
-      pass
-    end
-  end
-
-  def suppress_messages?( &block )
-    if VERBOSE
-      block.call
-    else
-      ActiveRecord::Migration.suppress_messages( &block )
-    end
+    ORM.migrate
+    pass
+    ORM.migrate( 0 )
+    pass
   end

 end
data/test/test_pool_factory.rb
CHANGED
@@ -19,18 +19,19 @@
 require File.join( File.dirname( __FILE__ ), "setup" )

 require 'iudex-core'
-require 'iudex-da/ar'

 require 'iudex-da'
 require 'iudex-da/pool_data_source_factory'

 class TestPoolFactory < MiniTest::Unit::TestCase
   include Iudex::DA
+  include Iudex::Core
+
   import 'org.apache.commons.dbutils.ResultSetHandler'
   import 'org.apache.commons.dbutils.QueryRunner'

   def setup
-    @factory = PoolDataSourceFactory.new( :loglevel =>
+    @factory = PoolDataSourceFactory.new( :loglevel => 4 )
     @data_source = @factory.create
   end

@@ -39,21 +40,31 @@ class TestPoolFactory < MiniTest::Unit::TestCase
     @data_source = nil
   end

-
-
-
+  # Really just want to test the factory and data_source but this
+  # makes a fine demonstration of dbutils query runner "just working"
+  # via ruby.
+  def test_query_runner
+    assert( @data_source )
+    qrun = QueryRunner.new( @data_source )
+
+    url = VisitURL.normalize( "http://gravitext.com/test" )
+
+    qrun.update( "TRUNCATE urls;" )
+
+    c = qrun.update( "INSERT into urls (uhash, url, domain, type ) " +
+                     "VALUES (?,?,?,?);",
+                     url.uhash, url.url, url.domain, "PAGE" )
+    assert_equal( 1, c )
+
+    out_domain = nil
+    qrun.query( "SELECT * FROM urls WHERE uhash = ?", url.uhash ) do |rs|
       while rs.next
-
+        out_domain = rs.string( 'domain' )
       end
-        nil
     end
-
+    assert_equal( url.domain, out_domain )

-
-    refute( @data_source.nil? )
-    qrun = QueryRunner.new( @data_source )
-    qrun.query( "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );",
-                TestHandler.new )
+    assert_equal( 1, qrun.update( "DELETE from urls;" ) )
   end

 end