iudex-da 1.3.3-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc CHANGED
@@ -1,3 +1,31 @@
1
+ === 1.4.0 (2013-10-29)
2
+ * Add experimental WorkPoller.do_reserve support, which utilizes
3
+ PostgreSQL 9.1+ writable Common Table Expressions (CTEs) to
4
+ efficiently(?) set a reserved column and instance (identifier) on
5
+ poll that guards against polling by another or the same iudex-worker
6
+ process. This enables multiple workers to cover the same urls/orders
7
+ as well as concurrent polling. Previously only an instance unique,
8
+ uhash_slice single-tenant partitioning scheme was supported for
9
+ multi-worker distribution.
10
+ * Add experimental support for concurrent (no discard) WorkPoller (in
11
+ collaboration with iudex-core 1.4.0 VisitManager), enabled by setting
12
+ do_reserve = true, do_discard = false.
13
+ * When enabled, polled max_urls is adjusted by current order_count
14
+ for subsequent polls.
15
+ * New max_reserved_time will force discard of possibly long reserved
16
+ tenure based on time since the queue was last empty.
17
+ * New max_discard_ratio will also force a discard if
18
+ order_count/max_urls exceeds this ratio.
19
+ * Add iudex-da-unreserve utility for cleaning reservations left due to
20
+ failure, based on age.
21
+ * Add DAKeys INSTANCE (String identifier) and RESERVED (date)
22
+ * Add migrations for instance and reserved columns; removal of
23
+ reserved from index_next_visit (profile)
24
+ * Upgrade to dbutils ~> 1.5.0, rjack-jdbc-postgres ~> 9.2.1002,
25
+ sequel ~> 3.46 (but not upcoming 4.x)
26
+ * Upgrade to iudex-* ~> 1.4.0 dependencies
27
+ * Upgrade to minitest ~> 4.7.4 (dev)
28
+
1
29
  === 1.3.3 (2012-11-8)
2
30
  * FactoryHelper.create_update_filter now prefers an options Hash
3
31
  exposing greater control over what is updated and how. In
data/Manifest.txt CHANGED
@@ -6,12 +6,16 @@ pom.xml
6
6
  bin/iudex-da-generate-test-data
7
7
  bin/iudex-da-import
8
8
  bin/iudex-da-simhash-dump
9
+ bin/iudex-da-unreserve
9
10
  bin/iudex-migrate
10
11
  config/config.rb
11
12
  db/20111012173757_base.rb
12
13
  db/20120930173600_uhash_collation_order.rb
14
+ db/20130419090000_instance_column.rb
15
+ db/20130419095500_reserved_column.rb
13
16
  db/index_next_visit/21500000000101_add_index_next_visit.rb
14
17
  db/index_next_visit/21500000000110_index_next_visit_partial.rb
18
+ db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb
15
19
  db/simhash/21500000000001_add_simhash_index.rb
16
20
  lib/iudex-da/base.rb
17
21
  lib/iudex-da.rb
@@ -28,4 +32,4 @@ test/test_migrate.rb
28
32
  test/test_pool_factory.rb
29
33
  test/test_url_model.rb
30
34
  test/test_work_poller.rb
31
- lib/iudex-da/iudex-da-1.3.3.jar
35
+ lib/iudex-da/iudex-da-1.4.0.jar
data/README.rdoc CHANGED
@@ -16,7 +16,7 @@ meta-data store and work priority queue.
16
16
 
17
17
  == License
18
18
 
19
- Copyright (c) 2008-2012 David Kellum
19
+ Copyright (c) 2008-2013 David Kellum
20
20
 
21
21
  Licensed under the Apache License, Version 2.0 (the "License"); you
22
22
  may not use this file except in compliance with the License. You may
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2008-2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You
data/bin/iudex-da-import CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2008-2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2008-2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2013 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'optparse'
23
+
24
+ module IudexBinScript
25
+
26
+ require 'rjack-logback'
27
+ include RJack
28
+ Logback.config_console
29
+
30
+ require 'iudex-core'
31
+ require 'iudex-da/base'
32
+ require 'iudex-da/config'
33
+ include Iudex
34
+
35
+ Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
36
+
37
+ interval = '60 minutes'
38
+
39
+ # Apply all config before including orm
40
+ OptionParser.new do |opts|
41
+ opts.banner = <<END
42
+ Usage: iudex-da-unreserve [options]
43
+ END
44
+ opts.on( "-s", "--set name=value", String,
45
+ "Set connect prop (ex: database=iudex)" ) do |nv|
46
+ name,value = nv.split('=').map { |t| t.strip }
47
+ Hooker.add( [ :iudex, :connect_props ] ) do
48
+ { name.to_sym => value }
49
+ end
50
+ end
51
+ opts.on( "-p", "--profile NAME", String,
52
+ "Add a migration profile (ex: simhash)" ) do |p|
53
+ Hooker.add( [ :iudex, :migration_profiles ] ) do |profiles|
54
+ profiles << p
55
+ end
56
+ end
57
+ opts.on( "-d", "--debug" ) do
58
+ Logback[ 'iudex.da' ].level = :debug
59
+ Hooker.add( [ :iudex, :connect_props ] ) { { :log => true } }
60
+ end
61
+ opts.on( "-v", "--version", "Display version and exit" ) do
62
+ puts "iudex-da: #{DA::VERSION}"
63
+ exit 1
64
+ end
65
+ opts.on( "-l", "--long", "Log in long form, to STDERR" ) do
66
+ Logback.config_console( :full => true, :stderr => true )
67
+ end
68
+ opts.on( "-a", "--age AGE",
69
+ "Age to unreserve, in PG interval syntax " +
70
+ "(default: '60 minutes')" ) do |age|
71
+ interval = age
72
+ end
73
+ Hooker.register_config( opts )
74
+ end.parse!
75
+
76
+ require 'iudex-da/orm'
77
+
78
+ class Runner
79
+ include Iudex::DA
80
+
81
+ def unreserve( age )
82
+ ORM.db.transaction( :isolation => :repeatable,
83
+ :retry_on => [ Sequel::SerializationFailure ] ) do
84
+ ds = ORM::db.fetch <<-SQL
85
+ WITH unreserve AS (
86
+ UPDATE urls
87
+ SET reserved = NULL
88
+ WHERE reserved < ( now() - interval '#{age}' )
89
+ RETURNING instance )
90
+ SELECT count(*),instance FROM unreserve
91
+ GROUP BY instance
92
+ ORDER BY count DESC
93
+ SQL
94
+
95
+ total = ds.inject( 0 ) do |m,row|
96
+ puts( "%30s %7d" % [ row[:instance], row[:count] ] )
97
+ m + row[:count]
98
+ end
99
+ puts( "%30s %7d" % [ 'TOTAL', total ] ) if total > 0
100
+
101
+ end
102
+ end
103
+
104
+ end
105
+
106
+ Runner.new.unreserve( interval )
107
+ end
data/bin/iudex-migrate CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2008-2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You
@@ -29,7 +29,7 @@ module IudexBinScript
29
29
 
30
30
  # Note: Avoid loading iudex-da with its jar dependency which would
31
31
  # make it hard to bootstrap the db from source alone.
32
- # Instead load only nessary core, base, config, and ar (post config):
32
+ # Instead load only necessary core, base, config, and orm (post config):
33
33
  require 'iudex-core'
34
34
  require 'iudex-da/base'
35
35
  require 'iudex-da/config'
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -0,0 +1,27 @@
1
+ #--
2
+ # Copyright (c) 2008-2013 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ Sequel.migration do
18
+ change do
19
+ alter_table( :urls ) do
20
+ # A short string uniquely identifying the iudex worker instance
21
+ # that has last operated on this URL (for example processed, or
22
+ # in future: reserved). A hostname or IP may be a suitable
23
+ # value.
24
+ add_column :instance, String
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,23 @@
1
+ #--
2
+ # Copyright (c) 2008-2013 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ Sequel.migration do
18
+ change do
19
+ alter_table( :urls ) do
20
+ add_column :reserved, "timestamp with time zone"
21
+ end
22
+ end
23
+ end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -0,0 +1,35 @@
1
+ #--
2
+ # Copyright (c) 2008-2013 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ Sequel.migration do
18
+
19
+ up do
20
+ alter_table( :urls ) do
21
+ drop_index( :next_visit_after )
22
+ add_index( :next_visit_after,
23
+ :where => 'next_visit_after IS NOT NULL AND reserved IS NULL' )
24
+ end
25
+ end
26
+
27
+ down do
28
+ alter_table( :urls ) do
29
+ drop_index( :next_visit_after )
30
+ add_index( :next_visit_after,
31
+ :where => 'next_visit_after is not null' )
32
+ end
33
+ end
34
+
35
+ end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
data/lib/iudex-da.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
data/lib/iudex-da/base.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -16,7 +16,7 @@
16
16
 
17
17
  module Iudex
18
18
  module DA
19
- VERSION = '1.3.3'
19
+ VERSION = '1.4.0'
20
20
 
21
21
  LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
22
22
  end
@@ -1,6 +1,6 @@
1
1
 
2
2
  #--
3
- # Copyright (c) 2008-2012 David Kellum
3
+ # Copyright (c) 2008-2013 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
Binary file
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
data/lib/iudex-da/orm.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -17,7 +17,7 @@
17
17
  require 'rjack-slf4j'
18
18
  require 'iudex-da/config'
19
19
  require 'sequel'
20
- require 'jdbc/postgres'
20
+ require 'rjack-jdbc-postgres'
21
21
  require 'hooker'
22
22
 
23
23
  Sequel.extension :migration
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -17,7 +17,7 @@
17
17
  require 'iudex-da'
18
18
  require 'rjack-slf4j'
19
19
  require 'java'
20
- require 'jdbc/postgres'
20
+ require 'rjack-jdbc-postgres'
21
21
 
22
22
  module Iudex::DA
23
23
 
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -70,10 +70,53 @@ module Iudex::DA
70
70
  # priority order (default: false)
71
71
  attr_writer :do_domain_group
72
72
 
73
+ # If set true, UPDATE reserved date (and instance, if specified)
74
+ # (Default: false)
75
+ attr_writer :do_reserve
76
+
77
+ # If set true, discards old queue at every poll, even if
78
+ # do_reserve could make queue re-fill a safe operation.
79
+ # (Default: true)
80
+ attr_writer :do_discard
81
+
82
+ # The ratio of current order_count to max_urls above which the old
83
+ # queue will be discarded as a safety to avoid starvation (Default: 0.667)
84
+ attr_accessor :max_discard_ratio
85
+
86
+ # The maximum amount of time in milliseconds that the oldest order
87
+ # can remain reserved before a discard is required. This is only
88
+ # relevant when do_reserve is true and do_discard is set false,
89
+ # and typically would be set as a multiple of max_poll_interval
90
+ # (ms). Note that max_poll_interval is interpreted as the worst
91
+ # case next discard opportunity for this purpose. The next poll
92
+ # made to an empty queue, either by prior discard or completion,
93
+ # resets the time tracking. (Default: nil, off)
94
+ def max_reserved_time
95
+ @max_reserved_time_s && ( @max_reserved_time_s * 1000.0 ).round
96
+ end
97
+
98
+ def max_reserved_time=( ms )
99
+ @max_reserved_time_s = ms / 1000.0
100
+ end
101
+
102
+ attr_reader :max_reserved_time_s
103
+
73
104
  def domain_group?
74
105
  @do_domain_group
75
106
  end
76
107
 
108
+ def reserve?
109
+ @do_reserve
110
+ end
111
+
112
+ def discard?
113
+ @do_discard
114
+ end
115
+
116
+ # String uniquely identifying this worker instance. Only used here
117
+ # with do_reserve.
118
+ attr_accessor :instance
119
+
77
120
  # First age coefficient. If set > 0.0, adjust priority by the
78
121
  # equation:
79
122
  #
@@ -125,10 +168,16 @@ module Iudex::DA
125
168
 
126
169
  @domain_depth_coef = nil
127
170
  @do_domain_group = false
171
+ @do_reserve = false
172
+ @do_discard = true
173
+ @instance = nil
128
174
 
129
175
  @max_priority_urls = nil
130
176
  @max_domain_urls = 10_000
131
177
  @max_urls = 50_000
178
+ @max_discard_ratio = 2.0/3.0
179
+ @max_reserved_time_s = nil
180
+ @last_none_reserved = Time.now
132
181
 
133
182
  @age_coef_1 = 0.2
134
183
  @age_coef_2 = 0.1
@@ -138,7 +187,6 @@ module Iudex::DA
138
187
  @uhash_slice = nil
139
188
 
140
189
  @log = RJack::SLF4J[ self.class ]
141
- #FIXME: Add accessor for log in GenericWorkPollStrategy
142
190
 
143
191
  keys( :url, :priority, :next_visit_after ).each do |k|
144
192
  unless mapper.fields.include?( k )
@@ -157,22 +205,68 @@ module Iudex::DA
157
205
 
158
206
  # Override GenericWorkPollStrategy
159
207
  def pollWorkImpl( visit_queue )
160
- visit_queue.add_all( poll )
208
+ visit_queue.add_all( poll( visit_queue.order_count ) )
161
209
  rescue SQLException => x
162
210
  @log.error( "On poll: ", x )
163
211
  end
164
212
 
165
213
  # Poll work and return as List<UniMap>
166
214
  # Raises SQLException
167
- def poll
168
- query, params = generate_query
169
- @log.debug { "Poll query: #{query}; #{params.inspect}" }
170
- reader.select( query, *params )
215
+ def poll( current_urls = 0 )
216
+ @last_none_reserved = Time.now if max_reserved_time_s && current_urls == 0
217
+ query = generate_query( current_urls )
218
+ @log.debug { "Poll query: #{query}" }
219
+ reader.select_with_retry( query )
220
+ end
221
+
222
+ # Override GenericWorkPollStrategy
223
+ def shouldReplaceQueue( visit_queue )
224
+ ( !reserve? || discard? ||
225
+ ( ( visit_queue.order_count.to_f / max_urls ) > max_discard_ratio ) ||
226
+ ( max_reserved_time_s && next_reserve_time > max_reserved_time_s ) )
227
+ end
228
+
229
+ def next_reserve_time( now = Time.now )
230
+ now - @last_none_reserved + ( max_poll_interval / 1000.0 )
231
+ end
232
+
233
+ # Override GenericWorkPollStrategy to discard old VisitQueue
234
+ # contents when do_reserve is enabled.
235
+ def discard( visit_queue )
236
+ if reserve? && visit_queue.order_count > 0
237
+ orders = visit_queue.hosts.inject( [] ) do |a, hq|
238
+ a.concat( hq.orders.to_a )
239
+ end
240
+ if orders.length > 0
241
+ n = reader.unreserve( orders )
242
+ @log.info { "Unreserved #{n} orders on discard" }
243
+ end
244
+ end
245
+ rescue SQLException => x
246
+ @log.error( "On discard: ", x )
247
+ end
248
+
249
+ # Unreserve any orders that are reserved by the current instance.
250
+ # No-op unless do_reserve and instance are set.
251
+ def instance_unreserve
252
+ if reserve? && instance
253
+ n = reader.update( <<-SQL )
254
+ UPDATE urls
255
+ SET reserved = NULL
256
+ WHERE reserved IS NOT NULL AND
257
+ instance = '#{instance}'
258
+ SQL
259
+ @log.info { "Unreserved #{n} orders for instance #{instance}" }
260
+ n
261
+ end
262
+ rescue SQLException => x
263
+ @log.error( "On instance_unreserve: ", x )
171
264
  end
172
265
 
173
266
  def reader
174
267
  @reader ||= ContentReader.new( @data_source, @mapper ).tap do |r|
175
268
  r.priority_adjusted = aged_priority?
269
+ r.max_retries = 10
176
270
  end
177
271
  end
178
272
 
@@ -186,25 +280,33 @@ module Iudex::DA
186
280
  end
187
281
  end
188
282
 
189
- def generate_query
283
+ def domain_union?
284
+ !@domain_union.empty?
285
+ end
286
+
287
+ def generate_query( current_urls )
190
288
  criteria = [ "next_visit_after <= now()" ]
191
289
 
290
+ criteria << "reserved IS NULL" if reserve?
291
+
192
292
  if uhash_slice
193
293
  min, max = url64_range( *uhash_slice )
194
294
  criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
195
295
  criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
196
296
  end
197
297
 
198
- params = []
199
-
200
- if @domain_union.empty?
201
- query = generate_query_inner( criteria )
202
- params = [ max_urls ]
298
+ unless domain_union?
299
+ query = generate_query_inner( criteria, ( max_urls - current_urls ) )
203
300
  else
204
301
  subqueries = []
205
302
  @domain_union.each do | opts |
206
303
  opts = opts.dup
207
- opts[ :max ] ||= @max_urls
304
+ if opts[ :max ]
305
+ opts[ :max ] = ( opts[ :max ] * ( max_urls - current_urls ) /
306
+ max_urls.to_f ).floor
307
+ else
308
+ opts[ :max ] = ( max_urls - current_urls )
309
+ end
208
310
 
209
311
  next if opts[ :max ] == 0
210
312
 
@@ -228,8 +330,7 @@ module Iudex::DA
228
330
  c << "type = '#{opts[ :type ]}'"
229
331
  end
230
332
 
231
- subqueries << generate_query_inner( c )
232
- params << opts[ :max ]
333
+ subqueries << generate_query_inner( c, opts[ :max ] )
233
334
  end
234
335
  if subqueries.size == 1
235
336
  query = subqueries.first
@@ -238,17 +339,20 @@ module Iudex::DA
238
339
  end
239
340
  end
240
341
 
342
+ query = wrap_with_update( fields, query ) if reserve?
343
+
241
344
  query = wrap_domain_group_query( fields, query ) if domain_group?
242
345
 
243
346
  query = query.gsub( /\s+/, ' ').strip
244
347
 
245
- [ query, params ]
348
+ query
246
349
  end
247
350
 
248
- def generate_query_inner( criteria )
351
+ def generate_query_inner( criteria, max_urls )
249
352
 
250
353
  query = filter_query(
251
- fields( ( :domain if domain_depth? || domain_group? ) ),
354
+ fields( ( :domain if domain_depth? || domain_group? ),
355
+ ( :uhash if reserve? ) ),
252
356
  ( max_priority_urls if domain_depth? ),
253
357
  criteria )
254
358
 
@@ -260,7 +364,7 @@ module Iudex::DA
260
364
  limit_priority = domain_depth? ? :adj_priority : :priority
261
365
  query += <<-SQL
262
366
  ORDER BY #{limit_priority} DESC
263
- LIMIT ?
367
+ LIMIT #{max_urls}
264
368
  SQL
265
369
 
266
370
  query
@@ -288,7 +392,7 @@ module Iudex::DA
288
392
  if aged_priority?
289
393
  flds = flds.dup
290
394
  i = flds.index( :priority ) || flds.size
291
- flds[ i ] = <<-SQL
395
+ flds[ i ] = <<-SQL.strip
292
396
  ( priority +
293
397
  #{age_coef_1}::REAL *
294
398
  SQRT( #{age_coef_2}::REAL *
@@ -311,6 +415,24 @@ module Iudex::DA
311
415
  sql
312
416
  end
313
417
 
418
+ def wrap_with_update( flds, sub )
419
+ sflds = [ "reserved = now()" ]
420
+ sflds << "instance = '#{instance}'" if instance
421
+
422
+ # Use ..FOR UPDATE unless not supported by query specific
423
+ # options with PostgreSQL <= 9.1
424
+ sub += " FOR UPDATE" unless domain_depth? || domain_union?
425
+
426
+ <<-SQL
427
+ WITH work AS ( #{sub} ),
428
+ reserve AS (
429
+ UPDATE urls
430
+ SET #{clist sflds}
431
+ WHERE uhash IN ( SELECT uhash FROM work ) )
432
+ SELECT #{clist flds} FROM work
433
+ SQL
434
+ end
435
+
314
436
  def wrap_domain_group_query( flds, sub )
315
437
  <<-SQL
316
438
  SELECT #{clist flds}
data/pom.xml CHANGED
@@ -5,13 +5,13 @@
5
5
  <groupId>iudex</groupId>
6
6
  <artifactId>iudex-da</artifactId>
7
7
  <packaging>jar</packaging>
8
- <version>1.3.3</version>
8
+ <version>1.4.0</version>
9
9
  <name>Iudex Data Access</name>
10
10
 
11
11
  <parent>
12
12
  <groupId>iudex</groupId>
13
13
  <artifactId>iudex-parent</artifactId>
14
- <version>1.3.0</version>
14
+ <version>1.4.0</version>
15
15
  <relativePath>..</relativePath>
16
16
  </parent>
17
17
 
@@ -20,13 +20,13 @@
20
20
  <dependency>
21
21
  <groupId>iudex</groupId>
22
22
  <artifactId>iudex-core</artifactId>
23
- <version>[1.3.0,1.3.999)</version>
23
+ <version>[1.4.0,1.4.999)</version>
24
24
  </dependency>
25
25
 
26
26
  <dependency>
27
27
  <groupId>commons-dbutils</groupId>
28
28
  <artifactId>commons-dbutils</artifactId>
29
- <version>1.4</version>
29
+ <version>1.5</version>
30
30
  </dependency>
31
31
 
32
32
  <dependency>
@@ -38,7 +38,7 @@
38
38
  <dependency>
39
39
  <groupId>commons-pool</groupId>
40
40
  <artifactId>commons-pool</artifactId>
41
- <version>[1.5.4,1.5.6]</version>
41
+ <version>[1.5.7,1.5.999]</version>
42
42
  </dependency>
43
43
 
44
44
  <dependency>
data/test/setup.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
data/test/test_migrate.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
@@ -25,6 +25,7 @@ require 'iudex-da/pool_data_source_factory'
25
25
  require 'iudex-da/models'
26
26
 
27
27
  class TestWorkPoller < MiniTest::Unit::TestCase
28
+ include Iudex::Core
28
29
  include Iudex::Filter::KeyHelper
29
30
  include Iudex::DA
30
31
  include Iudex::DA::ORM
@@ -66,6 +67,31 @@ class TestWorkPoller < MiniTest::Unit::TestCase
66
67
  assert_equal( 3, pos )
67
68
  end
68
69
 
70
+ def test_poll_with_reserve
71
+ poller.do_reserve = true
72
+ poller.max_urls = 2
73
+ poller.instance = 'test'
74
+
75
+ polled = poller.poll
76
+ polled.each_with_index do |map,i|
77
+ assert_equal( URLS[ i ][ 0 ], map.url.url )
78
+ end
79
+ assert_equal( 2, polled.size )
80
+ reserved = polled
81
+
82
+ polled = poller.poll
83
+ assert_equal( 1, polled.size )
84
+ assert_equal( URLS[2][0], polled.first.url.url )
85
+ reserved += polled
86
+
87
+ RJack::Logback[ 'iudex.da.WorkPoller' ].with_level( :warn ) do
88
+ poller.discard( VisitQueue.new.tap { |q| q.add_all( reserved ) } )
89
+ end
90
+ poller.max_urls = 3
91
+
92
+ assert_equal( 3, poller.poll.size )
93
+ end
94
+
69
95
  def test_poll_with_max_priority_urls
70
96
  poller.max_priority_urls = 4
71
97
 
@@ -89,6 +115,25 @@ class TestWorkPoller < MiniTest::Unit::TestCase
89
115
  assert_equal( 3, pos )
90
116
  end
91
117
 
118
+ def test_poll_with_domain_depth_reserve
119
+ poller.domain_depth_coef = 0.125
120
+ poller.max_priority_urls = 4
121
+ poller.do_reserve = true
122
+ poller.instance = 'test'
123
+
124
+ pos = 0
125
+ poller.poll.each do |map|
126
+ assert_equal( URLS[ pos ][ 0 ], map.url.url )
127
+ pos += 1
128
+ end
129
+ assert_equal( 3, pos )
130
+ RJack::Logback[ 'iudex.da.WorkPoller' ].with_level( :warn ) do
131
+ assert_equal( 3, poller.instance_unreserve )
132
+ end
133
+ assert_equal( 3, poller.poll.size )
134
+ assert_equal( 0, poller.poll.size )
135
+ end
136
+
92
137
  def test_poll_with_domain_depth_only
93
138
  poller.domain_depth_coef = 0.125
94
139
  poller.age_coef_1 = 0.0
@@ -131,6 +176,15 @@ class TestWorkPoller < MiniTest::Unit::TestCase
131
176
  assert_equal( 3, result.size )
132
177
  end
133
178
 
179
+ def test_poll_domain_union_2_reserve
180
+ poller.do_reserve = true
181
+ poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
182
+ { :max => 10000 } ]
183
+
184
+ assert_equal( 3, poller.poll.size )
185
+ assert_equal( 0, poller.poll.size )
186
+ end
187
+
134
188
  def test_poll_domain_union_3
135
189
  poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
136
190
  { :domain => 'hometown.com', :max => 1 },
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: iudex-da
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.3
5
+ version: 1.4.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-08 00:00:00.000000000 Z
12
+ date: 2013-10-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
@@ -17,13 +17,13 @@ dependencies:
17
17
  requirements:
18
18
  - - ~>
19
19
  - !ruby/object:Gem::Version
20
- version: 1.3.0
20
+ version: 1.4.0
21
21
  none: false
22
22
  requirement: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
- version: 1.3.0
26
+ version: 1.4.0
27
27
  none: false
28
28
  prerelease: false
29
29
  type: :runtime
@@ -33,29 +33,29 @@ dependencies:
33
33
  requirements:
34
34
  - - ~>
35
35
  - !ruby/object:Gem::Version
36
- version: 3.40.0
36
+ version: '3.46'
37
37
  none: false
38
38
  requirement: !ruby/object:Gem::Requirement
39
39
  requirements:
40
40
  - - ~>
41
41
  - !ruby/object:Gem::Version
42
- version: 3.40.0
42
+ version: '3.46'
43
43
  none: false
44
44
  prerelease: false
45
45
  type: :runtime
46
46
  - !ruby/object:Gem::Dependency
47
- name: jdbc-postgres
47
+ name: rjack-jdbc-postgres
48
48
  version_requirements: !ruby/object:Gem::Requirement
49
49
  requirements:
50
50
  - - ~>
51
51
  - !ruby/object:Gem::Version
52
- version: 9.1.901
52
+ version: 9.2.1002
53
53
  none: false
54
54
  requirement: !ruby/object:Gem::Requirement
55
55
  requirements:
56
56
  - - ~>
57
57
  - !ruby/object:Gem::Version
58
- version: 9.1.901
58
+ version: 9.2.1002
59
59
  none: false
60
60
  prerelease: false
61
61
  type: :runtime
@@ -81,13 +81,13 @@ dependencies:
81
81
  requirements:
82
82
  - - ~>
83
83
  - !ruby/object:Gem::Version
84
- version: 1.4.0
84
+ version: 1.5.0
85
85
  none: false
86
86
  requirement: !ruby/object:Gem::Requirement
87
87
  requirements:
88
88
  - - ~>
89
89
  - !ruby/object:Gem::Version
90
- version: 1.4.0
90
+ version: 1.5.0
91
91
  none: false
92
92
  prerelease: false
93
93
  type: :runtime
@@ -97,13 +97,13 @@ dependencies:
97
97
  requirements:
98
98
  - - ~>
99
99
  - !ruby/object:Gem::Version
100
- version: '2.3'
100
+ version: 4.7.4
101
101
  none: false
102
102
  requirement: !ruby/object:Gem::Requirement
103
103
  requirements:
104
104
  - - ~>
105
105
  - !ruby/object:Gem::Version
106
- version: '2.3'
106
+ version: 4.7.4
107
107
  none: false
108
108
  prerelease: false
109
109
  type: :development
@@ -146,6 +146,7 @@ executables:
146
146
  - iudex-da-generate-test-data
147
147
  - iudex-da-import
148
148
  - iudex-da-simhash-dump
149
+ - iudex-da-unreserve
149
150
  - iudex-migrate
150
151
  extensions: []
151
152
  extra_rdoc_files:
@@ -160,12 +161,16 @@ files:
160
161
  - bin/iudex-da-generate-test-data
161
162
  - bin/iudex-da-import
162
163
  - bin/iudex-da-simhash-dump
164
+ - bin/iudex-da-unreserve
163
165
  - bin/iudex-migrate
164
166
  - config/config.rb
165
167
  - db/20111012173757_base.rb
166
168
  - db/20120930173600_uhash_collation_order.rb
169
+ - db/20130419090000_instance_column.rb
170
+ - db/20130419095500_reserved_column.rb
167
171
  - db/index_next_visit/21500000000101_add_index_next_visit.rb
168
172
  - db/index_next_visit/21500000000110_index_next_visit_partial.rb
173
+ - db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb
169
174
  - db/simhash/21500000000001_add_simhash_index.rb
170
175
  - lib/iudex-da/base.rb
171
176
  - lib/iudex-da.rb
@@ -182,7 +187,7 @@ files:
182
187
  - test/test_pool_factory.rb
183
188
  - test/test_url_model.rb
184
189
  - test/test_work_poller.rb
185
- - lib/iudex-da/iudex-da-1.3.3.jar
190
+ - lib/iudex-da/iudex-da-1.4.0.jar
186
191
  homepage: http://iudex.gravitext.com
187
192
  licenses: []
188
193
  post_install_message:
Binary file