iudex-da 1.2.1-java → 1.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ #--
2
+ # Copyright (c) 2008-2012 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-da'
18
+ require 'iudex-da/key_helper'
19
+ require 'rjack-slf4j'
20
+
21
+ module Iudex::DA
22
+
23
+ # A SQL based WorkPoller
24
+ class WorkPoller < Java::iudex.core.GenericWorkPollStrategy
25
+ include Iudex::Filter::KeyHelper
26
+ include Gravitext::HTMap
27
+
28
+ import 'java.sql.SQLException'
29
+
30
+ # If set > 0.0 group by domain and reduce priority for subsequent
31
+ # urls within a common (registration level) domain (coefficient of
32
+ # depth). This increases crawl throughput when many domains are
33
+ # available. (default: nil, off)
34
+ attr_accessor :domain_depth_coef
35
+
36
+ def domain_depth?
37
+ domain_depth_coef && domain_depth_coef > 0.0
38
+ end
39
+
40
+ # Deprecated, use #domain_depth_coef (the reciprocal)
41
+ def host_depth_divisor
42
+ 1.0 / domain_depth_coef
43
+ end
44
+
45
+ # Deprecated, use #domain_depth_coef= (reciprocal)
46
+ def host_depth_divisor=( dv )
47
+ @domain_depth_coef = 1.0 / dv
48
+ end
49
+
50
+ # If #domain_depth_coef is set, this sets maximum urls for any
51
+ # single (registration level) domain (default: 10_000)
52
+ attr_accessor :max_domain_urls
53
+
54
+ # Deprecated, use #max_domain_urls
55
+ alias :max_host_urls :max_domain_urls
56
+
57
+ # Deprecated, use #max_domain_urls=
58
+ alias :max_host_urls= :max_domain_urls=
59
+
60
+ # The limit of urls to obtain in a single poll (across all
61
+ # domains) (default: 50_000)
62
+ attr_accessor :max_urls
63
+
64
+ # A secondary limit on the number of urls to consider, taking the
65
+ # N highest basic priority urls. This is only ever applied when
66
+ # #domain_depth_coef is set. (default: nil, off)
67
+ attr_accessor :max_priority_urls
68
+
69
+ # If set true, provide the final work list ordered in domain,
70
+ # priority order (default: false)
71
+ attr_writer :do_domain_group
72
+
73
+ def domain_group?
74
+ @do_domain_group
75
+ end
76
+
77
+ # First age coefficient. If set > 0.0, adjust priority by the
78
+ # equation:
79
+ #
80
+ # priority + age_coef_1 * sqrt( age_coef_2 * age )
81
+ #
82
+ # Where age is now - next_visit_after, in seconds (default: 0.2)
83
+ attr_accessor :age_coef_1
84
+
85
+ # Second age coefficient (default: 0.1)
86
+ attr_accessor :age_coef_2
87
+
88
+ def aged_priority?
89
+ ( age_coef_1 && age_coef_1 > 0.0 &&
90
+ age_coef_2 && age_coef_2 > 0.0 )
91
+ end
92
+
93
+ # An Array of [ domain, max_urls ] pairs where each domain is a
94
+ # unique registration-level, normalized lower-case domain. A nil
95
+ # domain applies to all domains not covered by another
96
+ # row. Without a nil domain row, work is limited to the explicit
97
+ # domains listed. If provided these max_urls values are used
98
+ # instead of top level #max_urls. Domain depth should most likely
99
+ # be avoided if this feature is used. (default: [], off)
100
+ attr_accessor :domain_union
101
+
102
+ # An array containing a zero-based position and a total number of
103
+ # evenly divided segments within the range of possible uhash
104
+ # values. If set only work with uhashes in the designated range
105
+ # will be polled. Note that the uhash is independent of domain,
106
+ # being a hash on the entire URL. (default: nil, off)
107
+ attr_accessor :uhash_slice
108
+
109
+ def initialize( data_source, mapper )
110
+ super()
111
+
112
+ @domain_depth_coef = nil
113
+ @do_domain_group = false
114
+
115
+ @max_priority_urls = nil
116
+ @max_domain_urls = 10_000
117
+ @max_urls = 50_000
118
+
119
+ @age_coef_1 = 0.2
120
+ @age_coef_2 = 0.1
121
+
122
+ @domain_union = []
123
+
124
+ @uhash_slice = nil
125
+
126
+ @log = RJack::SLF4J[ self.class ]
127
+ #FIXME: Add accessor for log in GenericWorkPollStrategy
128
+
129
+ keys( :url, :priority, :next_visit_after ).each do |k|
130
+ unless mapper.fields.include?( k )
131
+ raise "WorkPoller needs mapper with #{key.name} included."
132
+ end
133
+ end
134
+
135
+ @mapper = mapper
136
+ @reader = ContentReader.new( data_source, mapper )
137
+ end
138
+
139
+ # Override GenericWorkPollStrategy
140
+ def pollWorkImpl( visit_queue )
141
+ visit_queue.add_all( poll )
142
+ rescue SQLException => x
143
+ @log.error( "On poll: ", x )
144
+ end
145
+
146
+ # Poll work and return as List<UniMap>
147
+ # Raises SQLException
148
+ def poll
149
+ query, params = generate_query
150
+ @reader.select( query, *params )
151
+ end
152
+
153
+ def generate_query
154
+ criteria = [ "next_visit_after <= now()" ]
155
+
156
+ if uhash_slice
157
+ min, max = url64_range( *uhash_slice )
158
+ criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
159
+ criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
160
+ end
161
+
162
+ params = []
163
+
164
+ if @domain_union.empty?
165
+ query = generate_query_inner( criteria )
166
+ params = [ max_urls ]
167
+ else
168
+ subqueries = []
169
+ @domain_union.each do | domain, dmax |
170
+ next if dmax == 0
171
+ c = criteria.dup
172
+ if domain.nil?
173
+ c += @domain_union.map { |nd,_| nd }.
174
+ compact.
175
+ map { |nd| "domain != '#{nd}'" }
176
+ else
177
+ c << "domain = '#{domain}'"
178
+ end
179
+ subqueries << generate_query_inner( c )
180
+ params << dmax
181
+ end
182
+ if subqueries.size == 1
183
+ query = subqueries.first
184
+ else
185
+ query = "(" + subqueries.join( ") UNION ALL (" ) + ")"
186
+ end
187
+ end
188
+
189
+ query = wrap_domain_group_query( fields, query ) if domain_group?
190
+
191
+ query = query.gsub( /\s+/, ' ').strip
192
+
193
+ [ query, params ]
194
+ end
195
+
196
+ def generate_query_inner( criteria )
197
+
198
+ query = filter_query(
199
+ fields( ( :domain if domain_depth? || domain_group? ) ),
200
+ ( max_priority_urls if domain_depth? ),
201
+ criteria )
202
+
203
+ if domain_depth?
204
+ flds = fields( ( :domain if domain_group? ) )
205
+ query = wrap_domain_partition_query( flds, query )
206
+ end
207
+
208
+ limit_priority = domain_depth? ? :adj_priority : :priority
209
+ query += <<-SQL
210
+ ORDER BY #{limit_priority} DESC
211
+ LIMIT ?
212
+ SQL
213
+
214
+ query
215
+ end
216
+
217
+ def wrap_domain_partition_query( flds, sub )
218
+ <<-SQL
219
+ SELECT #{clist flds}
220
+ FROM ( SELECT #{clist flds},
221
+ ( priority - ( #{domain_depth_coef}::REAL * ( dpos - 1 ) )
222
+ )::REAL AS adj_priority
223
+ FROM ( SELECT #{clist flds},
224
+ row_number() OVER (
225
+ PARTITION BY domain
226
+ ORDER BY priority DESC ) AS dpos
227
+ FROM ( #{ sub } ) AS subP
228
+ ) AS subH
229
+ WHERE dpos <= #{max_domain_urls}
230
+ ) AS subA
231
+ SQL
232
+ end
233
+
234
+ def filter_query( flds, max, criteria )
235
+
236
+ if aged_priority?
237
+ flds = flds.dup
238
+ i = flds.index( :priority ) || flds.size
239
+ flds[ i ] = <<-SQL
240
+ ( priority +
241
+ #{age_coef_1}::REAL *
242
+ SQRT( #{age_coef_2}::REAL *
243
+ EXTRACT( EPOCH FROM ( now() - next_visit_after ) ) )::REAL
244
+ ) AS priority
245
+ SQL
246
+ end
247
+
248
+ sql = <<-SQL
249
+ SELECT #{clist flds}
250
+ FROM urls
251
+ WHERE #{and_list criteria}
252
+ SQL
253
+
254
+ sql += <<-SQL if max
255
+ ORDER BY priority DESC
256
+ LIMIT #{max}
257
+ SQL
258
+
259
+ sql
260
+ end
261
+
262
+ def wrap_domain_group_query( flds, sub )
263
+ <<-SQL
264
+ SELECT #{clist flds}
265
+ FROM ( #{sub} ) AS subDG
266
+ ORDER BY domain, priority DESC
267
+ SQL
268
+ end
269
+
270
+ # URL 64 lexicon, ASCII or "C" LOCALE ordered
271
+ URL64_ORDER = "-0123456789ABCDEFGHIJKLMNOPQRSTU" +
272
+ "VWXYZ_abcdefghijklmnopqrstuvwxyz"
273
+
274
+ # Given a zero-based position within some number of segments,
275
+ # returns [ min, max ] bounds where min will be nil at pos=0, and
276
+ # max will be nil at pos=segments-1. Non nil values are uhash
277
+ # prefixes that can be used as selection criteria.
278
+ def url64_range( pos, segments )
279
+ unless pos >= 0 && segments > pos
280
+ raise "Invalid url64_range: 0 <= #{pos} < #{segments}"
281
+ end
282
+
283
+ period = ( 64 * 64 / segments.to_f )
284
+ low = ( period * pos ).round if pos > 0
285
+ high = ( period * (pos+1) ).round if (pos+1) < segments
286
+
287
+ [ low, high ].map do |i|
288
+ URL64_ORDER[ i / 64 ] + URL64_ORDER[ i % 64 ] if i
289
+ end
290
+ end
291
+
292
+ def fields( *ksyms )
293
+ ( @mapper.fields.map { |k| k.name.to_sym } |
294
+ ksyms.flatten.compact.map { |s| s.to_sym } )
295
+ end
296
+
297
+ def clist( l )
298
+ l.compact.join( ', ' )
299
+ end
300
+
301
+ def and_list( l )
302
+ l.compact.join( " AND " )
303
+ end
304
+
305
+ end
306
+
307
+ end
data/pom.xml CHANGED
@@ -5,7 +5,7 @@
5
5
  <groupId>iudex</groupId>
6
6
  <artifactId>iudex-da</artifactId>
7
7
  <packaging>jar</packaging>
8
- <version>1.2.1</version>
8
+ <version>1.3.0</version>
9
9
  <name>Iudex Data Access</name>
10
10
 
11
11
  <parent>
@@ -20,7 +20,7 @@
20
20
  <dependency>
21
21
  <groupId>iudex</groupId>
22
22
  <artifactId>iudex-core</artifactId>
23
- <version>[1.2.1,1.2.999)</version>
23
+ <version>[1.2.1,1.3.999)</version>
24
24
  </dependency>
25
25
 
26
26
  <dependency>
data/test/setup.rb CHANGED
@@ -28,17 +28,19 @@ module TestSetup
28
28
  include RJack
29
29
  Logback.config_console( :stderr => true, :thread => true )
30
30
 
31
- if ( ARGV & %w[ -v --verbose --debug ] ).empty?
32
- # Make test output logging compatible: no partial lines.
31
+ VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
32
+
33
+ if VERBOSE
33
34
  class TestOut
34
35
  def print( *a ); $stdout.puts( *a ); end
35
36
  def puts( *a ); $stdout.puts( *a ); end
36
37
  end
37
38
  MiniTest::Unit.output = TestOut.new
38
- else
39
+
39
40
  Logback.root.level = Logback::DEBUG
41
+ else
42
+ Logback[ 'iudex.da.sequel' ].level = :warn
43
+ Logback[ 'iudex.da.PoolDataSourceFactory' ].level = :warn
40
44
  end
41
45
 
42
- ARGV.delete( '--debug' )
43
-
44
46
  end
data/test/test_migrate.rb CHANGED
@@ -20,24 +20,20 @@
20
20
  require File.join( File.dirname( __FILE__ ), "setup" )
21
21
 
22
22
  require 'iudex-da'
23
- require 'iudex-da/ar'
23
+ require 'iudex-da/orm'
24
24
 
25
25
  class TestMigrate < MiniTest::Unit::TestCase
26
26
  include Iudex::DA
27
27
  include RJack
28
28
 
29
- VERBOSE = ! ( ARGV & %w[ -v --verbose ] ).empty?
30
-
31
29
  def setup
32
- unless VERBOSE
33
- Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN
34
- end
30
+ Logback[ 'iudex.da.sequel' ].level = :warn unless TestSetup::VERBOSE
35
31
  end
36
32
 
37
33
  def teardown
38
34
  Hooker.send( :clear )
39
- suppress_messages? { migrate }
40
- Logback[ 'iudex.da.ActiveRecord' ].level = nil
35
+ ORM.migrate
36
+ Logback[ 'iudex.da.sequel' ].level = nil
41
37
  end
42
38
 
43
39
  def test_default
@@ -55,20 +51,10 @@ class TestMigrate < MiniTest::Unit::TestCase
55
51
  end
56
52
 
57
53
  def check_up_down
58
- suppress_messages? do
59
- migrate
60
- pass
61
- migrate( 0 )
62
- pass
63
- end
64
- end
65
-
66
- def suppress_messages?( &block )
67
- if VERBOSE
68
- block.call
69
- else
70
- ActiveRecord::Migration.suppress_messages( &block )
71
- end
54
+ ORM.migrate
55
+ pass
56
+ ORM.migrate( 0 )
57
+ pass
72
58
  end
73
59
 
74
60
  end
@@ -19,18 +19,19 @@
19
19
  require File.join( File.dirname( __FILE__ ), "setup" )
20
20
 
21
21
  require 'iudex-core'
22
- require 'iudex-da/ar'
23
22
 
24
23
  require 'iudex-da'
25
24
  require 'iudex-da/pool_data_source_factory'
26
25
 
27
26
  class TestPoolFactory < MiniTest::Unit::TestCase
28
27
  include Iudex::DA
28
+ include Iudex::Core
29
+
29
30
  import 'org.apache.commons.dbutils.ResultSetHandler'
30
31
  import 'org.apache.commons.dbutils.QueryRunner'
31
32
 
32
33
  def setup
33
- @factory = PoolDataSourceFactory.new( :loglevel => 2 )
34
+ @factory = PoolDataSourceFactory.new( :loglevel => 4 )
34
35
  @data_source = @factory.create
35
36
  end
36
37
 
@@ -39,21 +40,31 @@ class TestPoolFactory < MiniTest::Unit::TestCase
39
40
  @data_source = nil
40
41
  end
41
42
 
42
- class TestHandler
43
- include ResultSetHandler
44
- def handle( rs )
43
+ # Really just want to test the factory and data_source but this
44
+ # makes a fine demonstration of dbutils query runner "just working"
45
+ # via ruby.
46
+ def test_query_runner
47
+ assert( @data_source )
48
+ qrun = QueryRunner.new( @data_source )
49
+
50
+ url = VisitURL.normalize( "http://gravitext.com/test" )
51
+
52
+ qrun.update( "TRUNCATE urls;" )
53
+
54
+ c = qrun.update( "INSERT into urls (uhash, url, domain, type ) " +
55
+ "VALUES (?,?,?,?);",
56
+ url.uhash, url.url, url.domain, "PAGE" )
57
+ assert_equal( 1, c )
58
+
59
+ out_domain = nil
60
+ qrun.query( "SELECT * FROM urls WHERE uhash = ?", url.uhash ) do |rs|
45
61
  while rs.next
46
- p [ rs.string( 'url' ) ]
62
+ out_domain = rs.string( 'domain' )
47
63
  end
48
- nil
49
64
  end
50
- end
65
+ assert_equal( url.domain, out_domain )
51
66
 
52
- def test_query
53
- refute( @data_source.nil? )
54
- qrun = QueryRunner.new( @data_source )
55
- qrun.query( "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );",
56
- TestHandler.new )
67
+ assert_equal( 1, qrun.update( "DELETE from urls;" ) )
57
68
  end
58
69
 
59
70
  end