iudex-da 1.2.1-java → 1.3.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,307 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-da'
18
+ require 'iudex-da/key_helper'
19
+ require 'rjack-slf4j'
20
+
21
+ module Iudex::DA
22
+
23
+ # A SQL based WorkPoller
24
+ class WorkPoller < Java::iudex.core.GenericWorkPollStrategy
25
+ include Iudex::Filter::KeyHelper
26
+ include Gravitext::HTMap
27
+
28
+ import 'java.sql.SQLException'
29
+
30
+ # If set > 0.0 group by domain and reduce priority for subsequent
31
+ # urls within a common (registration level) domain (coefficient of
32
+ # depth). This increases crawl throughput when many domains are
33
+ # available. (default: nil, off)
34
+ attr_accessor :domain_depth_coef
35
+
36
+ def domain_depth?
37
+ domain_depth_coef && domain_depth_coef > 0.0
38
+ end
39
+
40
+ # Deprecated, use #domain_depth_coef (the reciprocal)
41
+ def host_depth_divisor
42
+ 1.0 / domain_depth_coef
43
+ end
44
+
45
+ # Deprecated, use #domain_depth_coef= (reciprocal)
46
+ def host_depth_divisor=( dv )
47
+ @domain_depth_coef = 1.0 / dv
48
+ end
49
+
50
+ # If #domain_depth_coef is set, this sets maximum urls for any
51
+ # single (registration level) domain (default: 10_000)
52
+ attr_accessor :max_domain_urls
53
+
54
+ # Deprecated, use #max_domain_urls
55
+ alias :max_host_urls :max_domain_urls
56
+
57
+ # Deprecated, use #max_domain_urls=
58
+ alias :max_host_urls= :max_domain_urls=
59
+
60
+ # The limit of urls to obtain in a single poll (across all
61
+ # domains) (default: 50_000)
62
+ attr_accessor :max_urls
63
+
64
+ # A secondary limit on the number of urls to consider, taking the
65
+ # N highest basic priority urls. This is only ever applied when
66
+ # #domain_depth_coef is set. (default: nil, off)
67
+ attr_accessor :max_priority_urls
68
+
69
+ # If set true, provide the final work list ordered in domain,
70
+ # priority order (default: false)
71
+ attr_writer :do_domain_group
72
+
73
+ def domain_group?
74
+ @do_domain_group
75
+ end
76
+
77
+ # First age coefficient. If set > 0.0, adjust priority by the
78
+ # equation:
79
+ #
80
+ # priority + age_coef_1 * sqrt( age_coef_2 * age )
81
+ #
82
+ # Where age is ( now - next_visit_after ). (default: 0.2)
83
+ attr_accessor :age_coef_1
84
+
85
+ # Second age coefficient (default: 0.1)
86
+ attr_accessor :age_coef_2
87
+
88
+ def aged_priority?
89
+ ( age_coef_1 && age_coef_1 > 0.0 &&
90
+ age_coef_2 && age_coef_2 > 0.0 )
91
+ end
92
+
93
+ # An Array of [ domain, max_urls ] pairs where each domain is a
94
+ # unique registration-level, normalized lower-case domain. A nil
95
+ # domain applies to all domains not covered by another
96
+ # row. Without a nil domain row, work is limited to the explicit
97
+ # domains listed. If provided these max_urls values are used
98
+ # instead of top level #max_urls. Domain depth should most likely
99
+ # be avoided if this feature is used. (default: [], off)
100
+ attr_accessor :domain_union
101
+
102
+ # An array containing a zero-based position and a total number of
103
+ # evenly divided segments within the range of possible uhash
104
+ # values. If set only work with uhashes in the designated range
105
+ # will be polled. Note that the uhash is independent of domain,
106
+ # being a hash on the entire URL. (default: nil, off)
107
+ attr_accessor :uhash_slice
108
+
109
+ def initialize( data_source, mapper )
110
+ super()
111
+
112
+ @domain_depth_coef = nil
113
+ @do_domain_group = false
114
+
115
+ @max_priority_urls = nil
116
+ @max_domain_urls = 10_000
117
+ @max_urls = 50_000
118
+
119
+ @age_coef_1 = 0.2
120
+ @age_coef_2 = 0.1
121
+
122
+ @domain_union = []
123
+
124
+ @uhash_slice = nil
125
+
126
+ @log = RJack::SLF4J[ self.class ]
127
+ #FIXME: Add accessor for log in GenericWorkPollStrategy
128
+
129
+ keys( :url, :priority, :next_visit_after ).each do |k|
130
+ unless mapper.fields.include?( k )
131
+ raise "WorkPoller needs mapper with #{key.name} included."
132
+ end
133
+ end
134
+
135
+ @mapper = mapper
136
+ @reader = ContentReader.new( data_source, mapper )
137
+ end
138
+
139
+ # Override GenericWorkPollStrategy
140
+ def pollWorkImpl( visit_queue )
141
+ visit_queue.add_all( poll )
142
+ rescue SQLException => x
143
+ @log.error( "On poll: ", x )
144
+ end
145
+
146
+ # Poll work and return as List<UniMap>
147
+ # Raises SQLException
148
+ def poll
149
+ query, params = generate_query
150
+ @reader.select( query, *params )
151
+ end
152
+
153
+ def generate_query
154
+ criteria = [ "next_visit_after <= now()" ]
155
+
156
+ if uhash_slice
157
+ min, max = url64_range( *uhash_slice )
158
+ criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
159
+ criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
160
+ end
161
+
162
+ params = []
163
+
164
+ if @domain_union.empty?
165
+ query = generate_query_inner( criteria )
166
+ params = [ max_urls ]
167
+ else
168
+ subqueries = []
169
+ @domain_union.each do | domain, dmax |
170
+ next if dmax == 0
171
+ c = criteria.dup
172
+ if domain.nil?
173
+ c += @domain_union.map { |nd,_| nd }.
174
+ compact.
175
+ map { |nd| "domain != '#{nd}'" }
176
+ else
177
+ c << "domain = '#{domain}'"
178
+ end
179
+ subqueries << generate_query_inner( c )
180
+ params << dmax
181
+ end
182
+ if subqueries.size == 1
183
+ query = subqueries.first
184
+ else
185
+ query = "(" + subqueries.join( ") UNION ALL (" ) + ")"
186
+ end
187
+ end
188
+
189
+ query = wrap_domain_group_query( fields, query ) if domain_group?
190
+
191
+ query = query.gsub( /\s+/, ' ').strip
192
+
193
+ [ query, params ]
194
+ end
195
+
196
+ def generate_query_inner( criteria )
197
+
198
+ query = filter_query(
199
+ fields( ( :domain if domain_depth? || domain_group? ) ),
200
+ ( max_priority_urls if domain_depth? ),
201
+ criteria )
202
+
203
+ if domain_depth?
204
+ flds = fields( ( :domain if domain_group? ) )
205
+ query = wrap_domain_partition_query( flds, query )
206
+ end
207
+
208
+ limit_priority = domain_depth? ? :adj_priority : :priority
209
+ query += <<-SQL
210
+ ORDER BY #{limit_priority} DESC
211
+ LIMIT ?
212
+ SQL
213
+
214
+ query
215
+ end
216
+
217
+ def wrap_domain_partition_query( flds, sub )
218
+ <<-SQL
219
+ SELECT #{clist flds}
220
+ FROM ( SELECT #{clist flds},
221
+ ( priority - ( #{domain_depth_coef}::REAL * ( dpos - 1 ) )
222
+ )::REAL AS adj_priority
223
+ FROM ( SELECT #{clist flds},
224
+ row_number() OVER (
225
+ PARTITION BY domain
226
+ ORDER BY priority DESC ) AS dpos
227
+ FROM ( #{ sub } ) AS subP
228
+ ) AS subH
229
+ WHERE dpos <= #{max_domain_urls}
230
+ ) AS subA
231
+ SQL
232
+ end
233
+
234
+ def filter_query( flds, max, criteria )
235
+
236
+ if aged_priority?
237
+ flds = flds.dup
238
+ i = flds.index( :priority ) || flds.size
239
+ flds[ i ] = <<-SQL
240
+ ( priority +
241
+ #{age_coef_1}::REAL *
242
+ SQRT( #{age_coef_2}::REAL *
243
+ EXTRACT( EPOCH FROM ( now() - next_visit_after ) ) )::REAL
244
+ ) AS priority
245
+ SQL
246
+ end
247
+
248
+ sql = <<-SQL
249
+ SELECT #{clist flds}
250
+ FROM urls
251
+ WHERE #{and_list criteria}
252
+ SQL
253
+
254
+ sql += <<-SQL if max
255
+ ORDER BY priority DESC
256
+ LIMIT #{max}
257
+ SQL
258
+
259
+ sql
260
+ end
261
+
262
+ def wrap_domain_group_query( flds, sub )
263
+ <<-SQL
264
+ SELECT #{clist flds}
265
+ FROM ( #{sub} ) AS subDG
266
+ ORDER BY domain, priority DESC
267
+ SQL
268
+ end
269
+
270
+ # URL 64 lexicon, ASCII or "C" LOCALE ordered
271
+ URL64_ORDER = "-0123456789ABCDEFGHIJKLMNOPQRSTU" +
272
+ "VWXYZ_abcdefghijklmnopqrstuvwxyz"
273
+
274
+ # Given a zero-based position within some number of segments,
275
+ # returns [ min, max ] bounds where min will be nil at pos=0, and
276
+ # max will be nil at pos=segments-1. Non nil values are uhash
277
+ # prefixes that can be used as selection criteria.
278
+ def url64_range( pos, segments )
279
+ unless pos >= 0 && segments > pos
280
+ raise "Invalid url64_range: 0 <= #{pos} < #{segments}"
281
+ end
282
+
283
+ period = ( 64 * 64 / segments.to_f )
284
+ low = ( period * pos ).round if pos > 0
285
+ high = ( period * (pos+1) ).round if (pos+1) < segments
286
+
287
+ [ low, high ].map do |i|
288
+ URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
289
+ end
290
+ end
291
+
292
+ def fields( *ksyms )
293
+ ( @mapper.fields.map { |k| k.name.to_sym } |
294
+ ksyms.flatten.compact.map { |s| s.to_sym } )
295
+ end
296
+
297
+ def clist( l )
298
+ l.compact.join( ', ' )
299
+ end
300
+
301
+ def and_list( l )
302
+ l.compact.join( " AND " )
303
+ end
304
+
305
+ end
306
+
307
+ end
data/pom.xml CHANGED
@@ -5,7 +5,7 @@
5
5
  <groupId>iudex</groupId>
6
6
  <artifactId>iudex-da</artifactId>
7
7
  <packaging>jar</packaging>
8
- <version>1.2.1</version>
8
+ <version>1.3.0</version>
9
9
  <name>Iudex Data Access</name>
10
10
 
11
11
  <parent>
@@ -20,7 +20,7 @@
20
20
  <dependency>
21
21
  <groupId>iudex</groupId>
22
22
  <artifactId>iudex-core</artifactId>
23
- <version>[1.2.1,1.2.999)</version>
23
+ <version>[1.2.1,1.3.999)</version>
24
24
  </dependency>
25
25
 
26
26
  <dependency>
data/test/setup.rb CHANGED
@@ -28,17 +28,19 @@ module TestSetup
28
28
  include RJack
29
29
  Logback.config_console( :stderr => true, :thread => true )
30
30
 
31
- if ( ARGV & %w[ -v --verbose --debug ] ).empty?
32
- # Make test output logging compatible: no partial lines.
31
+ VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
32
+
33
+ if VERBOSE
33
34
  class TestOut
34
35
  def print( *a ); $stdout.puts( *a ); end
35
36
  def puts( *a ); $stdout.puts( *a ); end
36
37
  end
37
38
  MiniTest::Unit.output = TestOut.new
38
- else
39
+
39
40
  Logback.root.level = Logback::DEBUG
41
+ else
42
+ Logback[ 'iudex.da.sequel' ].level = :warn
43
+ Logback[ 'iudex.da.PoolDataSourceFactory' ].level = :warn
40
44
  end
41
45
 
42
- ARGV.delete( '--debug' )
43
-
44
46
  end
data/test/test_migrate.rb CHANGED
@@ -20,24 +20,20 @@
20
20
  require File.join( File.dirname( __FILE__ ), "setup" )
21
21
 
22
22
  require 'iudex-da'
23
- require 'iudex-da/ar'
23
+ require 'iudex-da/orm'
24
24
 
25
25
  class TestMigrate < MiniTest::Unit::TestCase
26
26
  include Iudex::DA
27
27
  include RJack
28
28
 
29
- VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
30
-
31
29
  def setup
32
- unless VERBOSE
33
- Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN
34
- end
30
+ Logback[ 'iudex.da.sequel' ].level = :warn unless TestSetup::VERBOSE
35
31
  end
36
32
 
37
33
  def teardown
38
34
  Hooker.send( :clear )
39
- suppress_messages? { migrate }
40
- Logback[ 'iudex.da.ActiveRecord' ].level = nil
35
+ ORM.migrate
36
+ Logback[ 'iudex.da.sequel' ].level = nil
41
37
  end
42
38
 
43
39
  def test_default
@@ -55,20 +51,10 @@ class TestMigrate < MiniTest::Unit::TestCase
55
51
  end
56
52
 
57
53
  def check_up_down
58
- suppress_messages? do
59
- migrate
60
- pass
61
- migrate( 0 )
62
- pass
63
- end
64
- end
65
-
66
- def suppress_messages?( &block )
67
- if VERBOSE
68
- block.call
69
- else
70
- ActiveRecord::Migration.suppress_messages( &block )
71
- end
54
+ ORM.migrate
55
+ pass
56
+ ORM.migrate( 0 )
57
+ pass
72
58
  end
73
59
 
74
60
  end
@@ -19,18 +19,19 @@
19
19
  require File.join( File.dirname( __FILE__ ), "setup" )
20
20
 
21
21
  require 'iudex-core'
22
- require 'iudex-da/ar'
23
22
 
24
23
  require 'iudex-da'
25
24
  require 'iudex-da/pool_data_source_factory'
26
25
 
27
26
  class TestPoolFactory < MiniTest::Unit::TestCase
28
27
  include Iudex::DA
28
+ include Iudex::Core
29
+
29
30
  import 'org.apache.commons.dbutils.ResultSetHandler'
30
31
  import 'org.apache.commons.dbutils.QueryRunner'
31
32
 
32
33
  def setup
33
- @factory = PoolDataSourceFactory.new( :loglevel => 2 )
34
+ @factory = PoolDataSourceFactory.new( :loglevel => 4 )
34
35
  @data_source = @factory.create
35
36
  end
36
37
 
@@ -39,21 +40,31 @@ class TestPoolFactory < MiniTest::Unit::TestCase
39
40
  @data_source = nil
40
41
  end
41
42
 
42
- class TestHandler
43
- include ResultSetHandler
44
- def handle( rs )
43
+ # Really just want to test the factory and data_source but this
44
+ # makes a fine demonstration of dbutils query runner "just working"
45
+ # via ruby.
46
+ def test_query_runner
47
+ assert( @data_source )
48
+ qrun = QueryRunner.new( @data_source )
49
+
50
+ url = VisitURL.normalize( "http://gravitext.com/test" )
51
+
52
+ qrun.update( "TRUNCATE urls;" )
53
+
54
+ c = qrun.update( "INSERT into urls (uhash, url, domain, type ) " +
55
+ "VALUES (?,?,?,?);",
56
+ url.uhash, url.url, url.domain, "PAGE" )
57
+ assert_equal( 1, c )
58
+
59
+ out_domain = nil
60
+ qrun.query( "SELECT * FROM urls WHERE uhash = ?", url.uhash ) do |rs|
45
61
  while rs.next
46
- p [ rs.string( 'url' ) ]
62
+ out_domain = rs.string( 'domain' )
47
63
  end
48
- nil
49
64
  end
50
- end
65
+ assert_equal( url.domain, out_domain )
51
66
 
52
- def test_query
53
- refute( @data_source.nil? )
54
- qrun = QueryRunner.new( @data_source )
55
- qrun.query( "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );",
56
- TestHandler.new )
67
+ assert_equal( 1, qrun.update( "DELETE from urls;" ) )
57
68
  end
58
69
 
59
70
  end