iudex-da 1.3.3-java → 1.4.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +28 -0
- data/Manifest.txt +5 -1
- data/README.rdoc +1 -1
- data/bin/iudex-da-generate-test-data +1 -1
- data/bin/iudex-da-import +1 -1
- data/bin/iudex-da-simhash-dump +1 -1
- data/bin/iudex-da-unreserve +107 -0
- data/bin/iudex-migrate +2 -2
- data/db/20111012173757_base.rb +1 -1
- data/db/20120930173600_uhash_collation_order.rb +1 -1
- data/db/20130419090000_instance_column.rb +27 -0
- data/db/20130419095500_reserved_column.rb +23 -0
- data/db/index_next_visit/21500000000101_add_index_next_visit.rb +1 -1
- data/db/index_next_visit/21500000000110_index_next_visit_partial.rb +1 -1
- data/db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb +35 -0
- data/db/simhash/21500000000001_add_simhash_index.rb +1 -1
- data/lib/iudex-da.rb +1 -1
- data/lib/iudex-da/base.rb +2 -2
- data/lib/iudex-da/config.rb +1 -1
- data/lib/iudex-da/factory_helper.rb +1 -1
- data/lib/iudex-da/importer.rb +1 -1
- data/lib/iudex-da/iudex-da-1.4.0.jar +0 -0
- data/lib/iudex-da/key_helper.rb +1 -1
- data/lib/iudex-da/models.rb +1 -1
- data/lib/iudex-da/orm.rb +2 -2
- data/lib/iudex-da/pool_data_source_factory.rb +2 -2
- data/lib/iudex-da/work_poller.rb +143 -21
- data/pom.xml +5 -5
- data/test/setup.rb +1 -1
- data/test/test_migrate.rb +1 -1
- data/test/test_pool_factory.rb +1 -1
- data/test/test_url_model.rb +1 -1
- data/test/test_work_poller.rb +55 -1
- metadata +19 -14
- data/lib/iudex-da/iudex-da-1.3.3.jar +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
=== 1.4.0 (2013-10-29)
|
2
|
+
* Add experimental WorkPoller.do_reserve support, which utilizes
|
3
|
+
PostgreSQL 9.1+ writable Common Table Expressions (CTEs) to
|
4
|
+
efficiently(?) set a reserved column and instance (identifier) on
|
5
|
+
poll that guards against polling by another or the same iudex-worker
|
6
|
+
process. This enables multiple workers to cover the same urls/orders
|
7
|
+
as well as concurrent polling. Previously only an instance unique,
|
8
|
+
uhash_slice single-tenant partitioning scheme was supported for
|
9
|
+
multi-worker distribution.
|
10
|
+
* Add experimental support for concurrent (no discard) WorkPoller (in
|
11
|
+
collaboration with iudex-core 1.4.0 VisitManager), enabled when set
|
12
|
+
do_reserve = true, do_discard = false.
|
13
|
+
* When enabled, polled max_urls is adjusted by current order_count
|
14
|
+
for subsequent polls.
|
15
|
+
* New max_reserved_time will force discard of possibly long reserved
|
16
|
+
tenure based on time since the queue was last empty.
|
17
|
+
* New max_discard_ratio will also force a discard if
|
18
|
+
order_count/max_urls exceeds this ratio.
|
19
|
+
* Add iudex-da-unreserve utility for cleaning reservations left due to
|
20
|
+
failure, based on age.
|
21
|
+
* Add DAKeys INSTANCE (String identifier) and RESERVED (date)
|
22
|
+
* Add migrations for instance and reserved columns; removal of
|
23
|
+
reserved from index_next_visit (profile)
|
24
|
+
* Upgrade to dbutils ~> 1.5.0, rjack-jdbc-postgres ~> 9.2.1002,
|
25
|
+
sequel ~> 3.46 (but not upcoming 4.x)
|
26
|
+
* Upgrade to iudex-* ~> 1.4.0 dependencies
|
27
|
+
* Upgrade to minitest ~> 4.7.4 (dev)
|
28
|
+
|
1
29
|
=== 1.3.3 (2012-11-8)
|
2
30
|
* FactoryHelper.create_update_filter now prefers an options Hash
|
3
31
|
exposing greater control over what is updated and how. In
|
data/Manifest.txt
CHANGED
@@ -6,12 +6,16 @@ pom.xml
|
|
6
6
|
bin/iudex-da-generate-test-data
|
7
7
|
bin/iudex-da-import
|
8
8
|
bin/iudex-da-simhash-dump
|
9
|
+
bin/iudex-da-unreserve
|
9
10
|
bin/iudex-migrate
|
10
11
|
config/config.rb
|
11
12
|
db/20111012173757_base.rb
|
12
13
|
db/20120930173600_uhash_collation_order.rb
|
14
|
+
db/20130419090000_instance_column.rb
|
15
|
+
db/20130419095500_reserved_column.rb
|
13
16
|
db/index_next_visit/21500000000101_add_index_next_visit.rb
|
14
17
|
db/index_next_visit/21500000000110_index_next_visit_partial.rb
|
18
|
+
db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb
|
15
19
|
db/simhash/21500000000001_add_simhash_index.rb
|
16
20
|
lib/iudex-da/base.rb
|
17
21
|
lib/iudex-da.rb
|
@@ -28,4 +32,4 @@ test/test_migrate.rb
|
|
28
32
|
test/test_pool_factory.rb
|
29
33
|
test/test_url_model.rb
|
30
34
|
test/test_work_poller.rb
|
31
|
-
lib/iudex-da/iudex-da-1.3.3.jar
|
35
|
+
lib/iudex-da/iudex-da-1.4.0.jar
|
data/README.rdoc
CHANGED
@@ -16,7 +16,7 @@ meta-data store and work priority queue.
|
|
16
16
|
|
17
17
|
== License
|
18
18
|
|
19
|
-
Copyright (c) 2008-
|
19
|
+
Copyright (c) 2008-2013 David Kellum
|
20
20
|
|
21
21
|
Licensed under the Apache License, Version 2.0 (the "License"); you
|
22
22
|
may not use this file except in compliance with the License. You may
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
data/bin/iudex-da-import
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
data/bin/iudex-da-simhash-dump
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#--
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
|
+
# may not use this file except in compliance with the License. You
|
8
|
+
# may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
15
|
+
# implied. See the License for the specific language governing
|
16
|
+
# permissions and limitations under the License.
|
17
|
+
#++
|
18
|
+
|
19
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
20
|
+
|
21
|
+
require 'rubygems'
|
22
|
+
require 'optparse'
|
23
|
+
|
24
|
+
module IudexBinScript
|
25
|
+
|
26
|
+
require 'rjack-logback'
|
27
|
+
include RJack
|
28
|
+
Logback.config_console
|
29
|
+
|
30
|
+
require 'iudex-core'
|
31
|
+
require 'iudex-da/base'
|
32
|
+
require 'iudex-da/config'
|
33
|
+
include Iudex
|
34
|
+
|
35
|
+
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
36
|
+
|
37
|
+
interval = '60 minutes'
|
38
|
+
|
39
|
+
# Apply all config before including ar
|
40
|
+
OptionParser.new do |opts|
|
41
|
+
opts.banner = <<END
|
42
|
+
Usage: iudex-da-unreserve [options]
|
43
|
+
END
|
44
|
+
opts.on( "-s", "--set name=value", String,
|
45
|
+
"Set connect prop (ex: database=iudex)" ) do |nv|
|
46
|
+
name,value = nv.split('=').map { |t| t.strip }
|
47
|
+
Hooker.add( [ :iudex, :connect_props ] ) do
|
48
|
+
{ name.to_sym => value }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
opts.on( "-p", "--profile NAME", String,
|
52
|
+
"Add a migration profile (ex: simhash)" ) do |p|
|
53
|
+
Hooker.add( [ :iudex, :migration_profiles ] ) do |profiles|
|
54
|
+
profiles << p
|
55
|
+
end
|
56
|
+
end
|
57
|
+
opts.on( "-d", "--debug" ) do
|
58
|
+
Logback[ 'iudex.da' ].level = :debug
|
59
|
+
Hooker.add( [ :iudex, :connect_props ] ) { { :log => true } }
|
60
|
+
end
|
61
|
+
opts.on( "-v", "--version", "Display version and exit" ) do
|
62
|
+
puts "iudex-da: #{DA::VERSION}"
|
63
|
+
exit 1
|
64
|
+
end
|
65
|
+
opts.on( "-l", "--long", "Log in long form, to STDERR" ) do
|
66
|
+
Logback.config_console( :full => true, :stderr => true )
|
67
|
+
end
|
68
|
+
opts.on( "-a", "--age AGE",
|
69
|
+
"Age to unreserve, in PG interval syntax " +
|
70
|
+
"(default: '60 minutes')" ) do |age|
|
71
|
+
interval = age
|
72
|
+
end
|
73
|
+
Hooker.register_config( opts )
|
74
|
+
end.parse!
|
75
|
+
|
76
|
+
require 'iudex-da/orm'
|
77
|
+
|
78
|
+
class Runner
|
79
|
+
include Iudex::DA
|
80
|
+
|
81
|
+
def unreserve( age )
|
82
|
+
ORM.db.transaction( :isolation => :repeatable,
|
83
|
+
:retry_on => [ Sequel::SerializationFailure ] ) do
|
84
|
+
ds = ORM::db.fetch <<-SQL
|
85
|
+
WITH unreserve AS (
|
86
|
+
UPDATE urls
|
87
|
+
SET reserved = NULL
|
88
|
+
WHERE reserved < ( now() - interval '#{age}' )
|
89
|
+
RETURNING instance )
|
90
|
+
SELECT count(*),instance FROM unreserve
|
91
|
+
GROUP BY instance
|
92
|
+
ORDER BY count DESC
|
93
|
+
SQL
|
94
|
+
|
95
|
+
total = ds.inject( 0 ) do |m,row|
|
96
|
+
puts( "%30s %7d" % [ row[:instance], row[:count] ] )
|
97
|
+
m + row[:count]
|
98
|
+
end
|
99
|
+
puts( "%30s %7d" % [ 'TOTAL', total ] ) if total > 0
|
100
|
+
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
end
|
105
|
+
|
106
|
+
Runner.new.unreserve( interval )
|
107
|
+
end
|
data/bin/iudex-migrate
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
@@ -29,7 +29,7 @@ module IudexBinScript
|
|
29
29
|
|
30
30
|
# Note: Avoid loading iudex-da with its jar dependency which would
|
31
31
|
# make it hard to bootstrap the db from source alone.
|
32
|
-
# Instead load only
|
32
|
+
# Instead load only necessary core, base, config, and orm (post config):
|
33
33
|
require 'iudex-core'
|
34
34
|
require 'iudex-da/base'
|
35
35
|
require 'iudex-da/config'
|
data/db/20111012173757_base.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
Sequel.migration do
|
18
|
+
change do
|
19
|
+
alter_table( :urls ) do
|
20
|
+
# A short string uniquely identifying the iudex worker instance
|
21
|
+
# that has last operated on this URL (for example processed, or
|
22
|
+
# in future: reserved). A hostname or IP may be a suitable
|
23
|
+
# value.
|
24
|
+
add_column :instance, String
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
Sequel.migration do
|
18
|
+
change do
|
19
|
+
alter_table( :urls ) do
|
20
|
+
add_column :reserved, "timestamp with time zone"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
Sequel.migration do
|
18
|
+
|
19
|
+
up do
|
20
|
+
alter_table( :urls ) do
|
21
|
+
drop_index( :next_visit_after )
|
22
|
+
add_index( :next_visit_after,
|
23
|
+
:where => 'next_visit_after IS NOT NULL AND reserved IS NULL' )
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
down do
|
28
|
+
alter_table( :urls ) do
|
29
|
+
drop_index( :next_visit_after )
|
30
|
+
add_index( :next_visit_after,
|
31
|
+
:where => 'next_visit_after is not null' )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
data/lib/iudex-da.rb
CHANGED
data/lib/iudex-da/base.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
module Iudex
|
18
18
|
module DA
|
19
|
-
VERSION = '1.3.3'
|
19
|
+
VERSION = '1.4.0'
|
20
20
|
|
21
21
|
LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
|
22
22
|
end
|
data/lib/iudex-da/config.rb
CHANGED
data/lib/iudex-da/importer.rb
CHANGED
Binary file
|
data/lib/iudex-da/key_helper.rb
CHANGED
data/lib/iudex-da/models.rb
CHANGED
data/lib/iudex-da/orm.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -17,7 +17,7 @@
|
|
17
17
|
require 'rjack-slf4j'
|
18
18
|
require 'iudex-da/config'
|
19
19
|
require 'sequel'
|
20
|
-
require 'jdbc
|
20
|
+
require 'rjack-jdbc-postgres'
|
21
21
|
require 'hooker'
|
22
22
|
|
23
23
|
Sequel.extension :migration
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -17,7 +17,7 @@
|
|
17
17
|
require 'iudex-da'
|
18
18
|
require 'rjack-slf4j'
|
19
19
|
require 'java'
|
20
|
-
require 'jdbc
|
20
|
+
require 'rjack-jdbc-postgres'
|
21
21
|
|
22
22
|
module Iudex::DA
|
23
23
|
|
data/lib/iudex-da/work_poller.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You may
|
@@ -70,10 +70,53 @@ module Iudex::DA
|
|
70
70
|
# priority order (default: false)
|
71
71
|
attr_writer :do_domain_group
|
72
72
|
|
73
|
+
# If set true, UPDATE reserved date (and instance, if specified)
|
74
|
+
# (Default: false)
|
75
|
+
attr_writer :do_reserve
|
76
|
+
|
77
|
+
# If set true, discards old queue at every poll, even if
|
78
|
+
# do_reserve could make queue re-fill a safe operation.
|
79
|
+
# (Default: true)
|
80
|
+
attr_writer :do_discard
|
81
|
+
|
82
|
+
# The maximum ratio of current to max_urls where the old queue
|
83
|
+
# will be discarded as a safety to avoid starvation (Default: 0.667)
|
84
|
+
attr_accessor :max_discard_ratio
|
85
|
+
|
86
|
+
# The maximum amount of time in milliseconds that the oldest order
|
87
|
+
# can remain reserved before a discard is required. This is only
|
88
|
+
# relevant when do_reserve is true and do_discard is set false,
|
89
|
+
# and typically would be set as a multiple of max_poll_interval
|
90
|
+
# (ms). Note that max_poll_interval is interpreted as the worst
|
91
|
+
# case next discard opportunity for this purpose. The next poll
|
92
|
+
# made to an empty queue, either by prior discard or completion,
|
93
|
+
# resets the time tracking. (Default: nil, off)
|
94
|
+
def max_reserved_time
|
95
|
+
@max_reserved_time_s && ( @max_reserved_time_s * 1000.0 ).round
|
96
|
+
end
|
97
|
+
|
98
|
+
def max_reserved_time=( ms )
|
99
|
+
@max_reserved_time_s = ms / 1000.0
|
100
|
+
end
|
101
|
+
|
102
|
+
attr_reader :max_reserved_time_s
|
103
|
+
|
73
104
|
def domain_group?
|
74
105
|
@do_domain_group
|
75
106
|
end
|
76
107
|
|
108
|
+
def reserve?
|
109
|
+
@do_reserve
|
110
|
+
end
|
111
|
+
|
112
|
+
def discard?
|
113
|
+
@do_discard
|
114
|
+
end
|
115
|
+
|
116
|
+
# String uniquely identifying this worker instance. Only used here
|
117
|
+
# with do_reserve.
|
118
|
+
attr_accessor :instance
|
119
|
+
|
77
120
|
# First age coefficient. If set > 0.0, adjust priority by the
|
78
121
|
# equation:
|
79
122
|
#
|
@@ -125,10 +168,16 @@ module Iudex::DA
|
|
125
168
|
|
126
169
|
@domain_depth_coef = nil
|
127
170
|
@do_domain_group = false
|
171
|
+
@do_reserve = false
|
172
|
+
@do_discard = true
|
173
|
+
@instance = nil
|
128
174
|
|
129
175
|
@max_priority_urls = nil
|
130
176
|
@max_domain_urls = 10_000
|
131
177
|
@max_urls = 50_000
|
178
|
+
@max_discard_ratio = 2.0/3.0
|
179
|
+
@max_reserved_time_s = nil
|
180
|
+
@last_none_reserved = Time.now
|
132
181
|
|
133
182
|
@age_coef_1 = 0.2
|
134
183
|
@age_coef_2 = 0.1
|
@@ -138,7 +187,6 @@ module Iudex::DA
|
|
138
187
|
@uhash_slice = nil
|
139
188
|
|
140
189
|
@log = RJack::SLF4J[ self.class ]
|
141
|
-
#FIXME: Add accessor for log in GenericWorkPollStrategy
|
142
190
|
|
143
191
|
keys( :url, :priority, :next_visit_after ).each do |k|
|
144
192
|
unless mapper.fields.include?( k )
|
@@ -157,22 +205,68 @@ module Iudex::DA
|
|
157
205
|
|
158
206
|
# Override GenericWorkPollStrategy
|
159
207
|
def pollWorkImpl( visit_queue )
|
160
|
-
visit_queue.add_all( poll )
|
208
|
+
visit_queue.add_all( poll( visit_queue.order_count ) )
|
161
209
|
rescue SQLException => x
|
162
210
|
@log.error( "On poll: ", x )
|
163
211
|
end
|
164
212
|
|
165
213
|
# Poll work and return as List<UniMap>
|
166
214
|
# Raises SQLException
|
167
|
-
def poll
|
168
|
-
|
169
|
-
|
170
|
-
|
215
|
+
def poll( current_urls = 0 )
|
216
|
+
@last_none_reserved = Time.now if max_reserved_time_s && current_urls == 0
|
217
|
+
query = generate_query( current_urls )
|
218
|
+
@log.debug { "Poll query: #{query}" }
|
219
|
+
reader.select_with_retry( query )
|
220
|
+
end
|
221
|
+
|
222
|
+
# Override GenericWorkPollStrategy
|
223
|
+
def shouldReplaceQueue( visit_queue )
|
224
|
+
( !reserve? || discard? ||
|
225
|
+
( ( visit_queue.order_count.to_f / max_urls ) > max_discard_ratio ) ||
|
226
|
+
( max_reserved_time_s && next_reserve_time > max_reserved_time_s ) )
|
227
|
+
end
|
228
|
+
|
229
|
+
def next_reserve_time( now = Time.now )
|
230
|
+
now - @last_none_reserved + ( max_poll_interval / 1000.0 )
|
231
|
+
end
|
232
|
+
|
233
|
+
# Override GenericWorkPollStrategy to discard old VisitQueue
|
234
|
+
# contents when do_reserve is enabled.
|
235
|
+
def discard( visit_queue )
|
236
|
+
if reserve? && visit_queue.order_count > 0
|
237
|
+
orders = visit_queue.hosts.inject( [] ) do |a, hq|
|
238
|
+
a.concat( hq.orders.to_a )
|
239
|
+
end
|
240
|
+
if orders.length > 0
|
241
|
+
n = reader.unreserve( orders )
|
242
|
+
@log.info { "Unreserved #{n} orders on discard" }
|
243
|
+
end
|
244
|
+
end
|
245
|
+
rescue SQLException => x
|
246
|
+
@log.error( "On discard: ", x )
|
247
|
+
end
|
248
|
+
|
249
|
+
# Unreserve any orders that are reserved by the current instance.
|
250
|
+
# No-op unless do_reserve and instance are set.
|
251
|
+
def instance_unreserve
|
252
|
+
if reserve? && instance
|
253
|
+
n = reader.update( <<-SQL )
|
254
|
+
UPDATE urls
|
255
|
+
SET reserved = NULL
|
256
|
+
WHERE reserved IS NOT NULL AND
|
257
|
+
instance = '#{instance}'
|
258
|
+
SQL
|
259
|
+
@log.info { "Unreserved #{n} orders for instance #{instance}" }
|
260
|
+
n
|
261
|
+
end
|
262
|
+
rescue SQLException => x
|
263
|
+
@log.error( "On instance_unreserve: ", x )
|
171
264
|
end
|
172
265
|
|
173
266
|
def reader
|
174
267
|
@reader ||= ContentReader.new( @data_source, @mapper ).tap do |r|
|
175
268
|
r.priority_adjusted = aged_priority?
|
269
|
+
r.max_retries = 10
|
176
270
|
end
|
177
271
|
end
|
178
272
|
|
@@ -186,25 +280,33 @@ module Iudex::DA
|
|
186
280
|
end
|
187
281
|
end
|
188
282
|
|
189
|
-
def
|
283
|
+
def domain_union?
|
284
|
+
!@domain_union.empty?
|
285
|
+
end
|
286
|
+
|
287
|
+
def generate_query( current_urls )
|
190
288
|
criteria = [ "next_visit_after <= now()" ]
|
191
289
|
|
290
|
+
criteria << "reserved IS NULL" if reserve?
|
291
|
+
|
192
292
|
if uhash_slice
|
193
293
|
min, max = url64_range( *uhash_slice )
|
194
294
|
criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
|
195
295
|
criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
|
196
296
|
end
|
197
297
|
|
198
|
-
|
199
|
-
|
200
|
-
if @domain_union.empty?
|
201
|
-
query = generate_query_inner( criteria )
|
202
|
-
params = [ max_urls ]
|
298
|
+
unless domain_union?
|
299
|
+
query = generate_query_inner( criteria, ( max_urls - current_urls ) )
|
203
300
|
else
|
204
301
|
subqueries = []
|
205
302
|
@domain_union.each do | opts |
|
206
303
|
opts = opts.dup
|
207
|
-
opts[ :max ]
|
304
|
+
if opts[ :max ]
|
305
|
+
opts[ :max ] = ( opts[ :max ] * ( max_urls - current_urls ) /
|
306
|
+
max_urls.to_f ).floor
|
307
|
+
else
|
308
|
+
opts[ :max ] = ( max_urls - current_urls )
|
309
|
+
end
|
208
310
|
|
209
311
|
next if opts[ :max ] == 0
|
210
312
|
|
@@ -228,8 +330,7 @@ module Iudex::DA
|
|
228
330
|
c << "type = '#{opts[ :type ]}'"
|
229
331
|
end
|
230
332
|
|
231
|
-
subqueries << generate_query_inner( c )
|
232
|
-
params << opts[ :max ]
|
333
|
+
subqueries << generate_query_inner( c, opts[ :max ] )
|
233
334
|
end
|
234
335
|
if subqueries.size == 1
|
235
336
|
query = subqueries.first
|
@@ -238,17 +339,20 @@ module Iudex::DA
|
|
238
339
|
end
|
239
340
|
end
|
240
341
|
|
342
|
+
query = wrap_with_update( fields, query ) if reserve?
|
343
|
+
|
241
344
|
query = wrap_domain_group_query( fields, query ) if domain_group?
|
242
345
|
|
243
346
|
query = query.gsub( /\s+/, ' ').strip
|
244
347
|
|
245
|
-
|
348
|
+
query
|
246
349
|
end
|
247
350
|
|
248
|
-
def generate_query_inner( criteria )
|
351
|
+
def generate_query_inner( criteria, max_urls )
|
249
352
|
|
250
353
|
query = filter_query(
|
251
|
-
fields( ( :domain if domain_depth? || domain_group? )
|
354
|
+
fields( ( :domain if domain_depth? || domain_group? ),
|
355
|
+
( :uhash if reserve? ) ),
|
252
356
|
( max_priority_urls if domain_depth? ),
|
253
357
|
criteria )
|
254
358
|
|
@@ -260,7 +364,7 @@ module Iudex::DA
|
|
260
364
|
limit_priority = domain_depth? ? :adj_priority : :priority
|
261
365
|
query += <<-SQL
|
262
366
|
ORDER BY #{limit_priority} DESC
|
263
|
-
LIMIT
|
367
|
+
LIMIT #{max_urls}
|
264
368
|
SQL
|
265
369
|
|
266
370
|
query
|
@@ -288,7 +392,7 @@ module Iudex::DA
|
|
288
392
|
if aged_priority?
|
289
393
|
flds = flds.dup
|
290
394
|
i = flds.index( :priority ) || flds.size
|
291
|
-
flds[ i ] = <<-SQL
|
395
|
+
flds[ i ] = <<-SQL.strip
|
292
396
|
( priority +
|
293
397
|
#{age_coef_1}::REAL *
|
294
398
|
SQRT( #{age_coef_2}::REAL *
|
@@ -311,6 +415,24 @@ module Iudex::DA
|
|
311
415
|
sql
|
312
416
|
end
|
313
417
|
|
418
|
+
def wrap_with_update( flds, sub )
|
419
|
+
sflds = [ "reserved = now()" ]
|
420
|
+
sflds << "instance = '#{instance}'" if instance
|
421
|
+
|
422
|
+
# Use ..FOR UPDATE unless not supported by query specific
|
423
|
+
# options with PostgreSQL <= 9.1
|
424
|
+
sub += " FOR UPDATE" unless domain_depth? || domain_union?
|
425
|
+
|
426
|
+
<<-SQL
|
427
|
+
WITH work AS ( #{sub} ),
|
428
|
+
reserve AS (
|
429
|
+
UPDATE urls
|
430
|
+
SET #{clist sflds}
|
431
|
+
WHERE uhash IN ( SELECT uhash FROM work ) )
|
432
|
+
SELECT #{clist flds} FROM work
|
433
|
+
SQL
|
434
|
+
end
|
435
|
+
|
314
436
|
def wrap_domain_group_query( flds, sub )
|
315
437
|
<<-SQL
|
316
438
|
SELECT #{clist flds}
|
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-da</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.3.3</version>
|
8
|
+
<version>1.4.0</version>
|
9
9
|
<name>Iudex Data Access</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.4.0</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -20,13 +20,13 @@
|
|
20
20
|
<dependency>
|
21
21
|
<groupId>iudex</groupId>
|
22
22
|
<artifactId>iudex-core</artifactId>
|
23
|
-
<version>[1.
|
23
|
+
<version>[1.4.0,1.4.999)</version>
|
24
24
|
</dependency>
|
25
25
|
|
26
26
|
<dependency>
|
27
27
|
<groupId>commons-dbutils</groupId>
|
28
28
|
<artifactId>commons-dbutils</artifactId>
|
29
|
-
<version>1.
|
29
|
+
<version>1.5</version>
|
30
30
|
</dependency>
|
31
31
|
|
32
32
|
<dependency>
|
@@ -38,7 +38,7 @@
|
|
38
38
|
<dependency>
|
39
39
|
<groupId>commons-pool</groupId>
|
40
40
|
<artifactId>commons-pool</artifactId>
|
41
|
-
<version>[1.5.
|
41
|
+
<version>[1.5.7,1.5.999]</version>
|
42
42
|
</dependency>
|
43
43
|
|
44
44
|
<dependency>
|
data/test/setup.rb
CHANGED
data/test/test_migrate.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You may
|
data/test/test_pool_factory.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_url_model.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You may
|
data/test/test_work_poller.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You may
|
@@ -25,6 +25,7 @@ require 'iudex-da/pool_data_source_factory'
|
|
25
25
|
require 'iudex-da/models'
|
26
26
|
|
27
27
|
class TestWorkPoller < MiniTest::Unit::TestCase
|
28
|
+
include Iudex::Core
|
28
29
|
include Iudex::Filter::KeyHelper
|
29
30
|
include Iudex::DA
|
30
31
|
include Iudex::DA::ORM
|
@@ -66,6 +67,31 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
66
67
|
assert_equal( 3, pos )
|
67
68
|
end
|
68
69
|
|
70
|
+
def test_poll_with_reserve
|
71
|
+
poller.do_reserve = true
|
72
|
+
poller.max_urls = 2
|
73
|
+
poller.instance = 'test'
|
74
|
+
|
75
|
+
polled = poller.poll
|
76
|
+
polled.each_with_index do |map,i|
|
77
|
+
assert_equal( URLS[ i ][ 0 ], map.url.url )
|
78
|
+
end
|
79
|
+
assert_equal( 2, polled.size )
|
80
|
+
reserved = polled
|
81
|
+
|
82
|
+
polled = poller.poll
|
83
|
+
assert_equal( 1, polled.size )
|
84
|
+
assert_equal( URLS[2][0], polled.first.url.url )
|
85
|
+
reserved += polled
|
86
|
+
|
87
|
+
RJack::Logback[ 'iudex.da.WorkPoller' ].with_level( :warn ) do
|
88
|
+
poller.discard( VisitQueue.new.tap { |q| q.add_all( reserved ) } )
|
89
|
+
end
|
90
|
+
poller.max_urls = 3
|
91
|
+
|
92
|
+
assert_equal( 3, poller.poll.size )
|
93
|
+
end
|
94
|
+
|
69
95
|
def test_poll_with_max_priority_urls
|
70
96
|
poller.max_priority_urls = 4
|
71
97
|
|
@@ -89,6 +115,25 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
89
115
|
assert_equal( 3, pos )
|
90
116
|
end
|
91
117
|
|
118
|
+
def test_poll_with_domain_depth_reserve
|
119
|
+
poller.domain_depth_coef = 0.125
|
120
|
+
poller.max_priority_urls = 4
|
121
|
+
poller.do_reserve = true
|
122
|
+
poller.instance = 'test'
|
123
|
+
|
124
|
+
pos = 0
|
125
|
+
poller.poll.each do |map|
|
126
|
+
assert_equal( URLS[ pos ][ 0 ], map.url.url )
|
127
|
+
pos += 1
|
128
|
+
end
|
129
|
+
assert_equal( 3, pos )
|
130
|
+
RJack::Logback[ 'iudex.da.WorkPoller' ].with_level( :warn ) do
|
131
|
+
assert_equal( 3, poller.instance_unreserve )
|
132
|
+
end
|
133
|
+
assert_equal( 3, poller.poll.size )
|
134
|
+
assert_equal( 0, poller.poll.size )
|
135
|
+
end
|
136
|
+
|
92
137
|
def test_poll_with_domain_depth_only
|
93
138
|
poller.domain_depth_coef = 0.125
|
94
139
|
poller.age_coef_1 = 0.0
|
@@ -131,6 +176,15 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
131
176
|
assert_equal( 3, result.size )
|
132
177
|
end
|
133
178
|
|
179
|
+
def test_poll_domain_union_2_reserve
|
180
|
+
poller.do_reserve = true
|
181
|
+
poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
|
182
|
+
{ :max => 10000 } ]
|
183
|
+
|
184
|
+
assert_equal( 3, poller.poll.size )
|
185
|
+
assert_equal( 0, poller.poll.size )
|
186
|
+
end
|
187
|
+
|
134
188
|
def test_poll_domain_union_3
|
135
189
|
poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
|
136
190
|
{ :domain => 'hometown.com', :max => 1 },
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: iudex-da
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.3
|
5
|
+
version: 1.4.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-10-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: iudex-core
|
@@ -17,13 +17,13 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - ~>
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 1.
|
20
|
+
version: 1.4.0
|
21
21
|
none: false
|
22
22
|
requirement: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.
|
26
|
+
version: 1.4.0
|
27
27
|
none: false
|
28
28
|
prerelease: false
|
29
29
|
type: :runtime
|
@@ -33,29 +33,29 @@ dependencies:
|
|
33
33
|
requirements:
|
34
34
|
- - ~>
|
35
35
|
- !ruby/object:Gem::Version
|
36
|
-
version: 3.
|
36
|
+
version: '3.46'
|
37
37
|
none: false
|
38
38
|
requirement: !ruby/object:Gem::Requirement
|
39
39
|
requirements:
|
40
40
|
- - ~>
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 3.
|
42
|
+
version: '3.46'
|
43
43
|
none: false
|
44
44
|
prerelease: false
|
45
45
|
type: :runtime
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name: jdbc-postgres
|
47
|
+
name: rjack-jdbc-postgres
|
48
48
|
version_requirements: !ruby/object:Gem::Requirement
|
49
49
|
requirements:
|
50
50
|
- - ~>
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version: 9.
|
52
|
+
version: 9.2.1002
|
53
53
|
none: false
|
54
54
|
requirement: !ruby/object:Gem::Requirement
|
55
55
|
requirements:
|
56
56
|
- - ~>
|
57
57
|
- !ruby/object:Gem::Version
|
58
|
-
version: 9.
|
58
|
+
version: 9.2.1002
|
59
59
|
none: false
|
60
60
|
prerelease: false
|
61
61
|
type: :runtime
|
@@ -81,13 +81,13 @@ dependencies:
|
|
81
81
|
requirements:
|
82
82
|
- - ~>
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 1.
|
84
|
+
version: 1.5.0
|
85
85
|
none: false
|
86
86
|
requirement: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - ~>
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: 1.
|
90
|
+
version: 1.5.0
|
91
91
|
none: false
|
92
92
|
prerelease: false
|
93
93
|
type: :runtime
|
@@ -97,13 +97,13 @@ dependencies:
|
|
97
97
|
requirements:
|
98
98
|
- - ~>
|
99
99
|
- !ruby/object:Gem::Version
|
100
|
-
version:
|
100
|
+
version: 4.7.4
|
101
101
|
none: false
|
102
102
|
requirement: !ruby/object:Gem::Requirement
|
103
103
|
requirements:
|
104
104
|
- - ~>
|
105
105
|
- !ruby/object:Gem::Version
|
106
|
-
version:
|
106
|
+
version: 4.7.4
|
107
107
|
none: false
|
108
108
|
prerelease: false
|
109
109
|
type: :development
|
@@ -146,6 +146,7 @@ executables:
|
|
146
146
|
- iudex-da-generate-test-data
|
147
147
|
- iudex-da-import
|
148
148
|
- iudex-da-simhash-dump
|
149
|
+
- iudex-da-unreserve
|
149
150
|
- iudex-migrate
|
150
151
|
extensions: []
|
151
152
|
extra_rdoc_files:
|
@@ -160,12 +161,16 @@ files:
|
|
160
161
|
- bin/iudex-da-generate-test-data
|
161
162
|
- bin/iudex-da-import
|
162
163
|
- bin/iudex-da-simhash-dump
|
164
|
+
- bin/iudex-da-unreserve
|
163
165
|
- bin/iudex-migrate
|
164
166
|
- config/config.rb
|
165
167
|
- db/20111012173757_base.rb
|
166
168
|
- db/20120930173600_uhash_collation_order.rb
|
169
|
+
- db/20130419090000_instance_column.rb
|
170
|
+
- db/20130419095500_reserved_column.rb
|
167
171
|
- db/index_next_visit/21500000000101_add_index_next_visit.rb
|
168
172
|
- db/index_next_visit/21500000000110_index_next_visit_partial.rb
|
173
|
+
- db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb
|
169
174
|
- db/simhash/21500000000001_add_simhash_index.rb
|
170
175
|
- lib/iudex-da/base.rb
|
171
176
|
- lib/iudex-da.rb
|
@@ -182,7 +187,7 @@ files:
|
|
182
187
|
- test/test_pool_factory.rb
|
183
188
|
- test/test_url_model.rb
|
184
189
|
- test/test_work_poller.rb
|
185
|
-
- lib/iudex-da/iudex-da-1.3.3.jar
|
190
|
+
- lib/iudex-da/iudex-da-1.4.0.jar
|
186
191
|
homepage: http://iudex.gravitext.com
|
187
192
|
licenses: []
|
188
193
|
post_install_message:
|
Binary file
|