iudex-da 1.3.3-java → 1.4.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +28 -0
- data/Manifest.txt +5 -1
- data/README.rdoc +1 -1
- data/bin/iudex-da-generate-test-data +1 -1
- data/bin/iudex-da-import +1 -1
- data/bin/iudex-da-simhash-dump +1 -1
- data/bin/iudex-da-unreserve +107 -0
- data/bin/iudex-migrate +2 -2
- data/db/20111012173757_base.rb +1 -1
- data/db/20120930173600_uhash_collation_order.rb +1 -1
- data/db/20130419090000_instance_column.rb +27 -0
- data/db/20130419095500_reserved_column.rb +23 -0
- data/db/index_next_visit/21500000000101_add_index_next_visit.rb +1 -1
- data/db/index_next_visit/21500000000110_index_next_visit_partial.rb +1 -1
- data/db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb +35 -0
- data/db/simhash/21500000000001_add_simhash_index.rb +1 -1
- data/lib/iudex-da.rb +1 -1
- data/lib/iudex-da/base.rb +2 -2
- data/lib/iudex-da/config.rb +1 -1
- data/lib/iudex-da/factory_helper.rb +1 -1
- data/lib/iudex-da/importer.rb +1 -1
- data/lib/iudex-da/iudex-da-1.4.0.jar +0 -0
- data/lib/iudex-da/key_helper.rb +1 -1
- data/lib/iudex-da/models.rb +1 -1
- data/lib/iudex-da/orm.rb +2 -2
- data/lib/iudex-da/pool_data_source_factory.rb +2 -2
- data/lib/iudex-da/work_poller.rb +143 -21
- data/pom.xml +5 -5
- data/test/setup.rb +1 -1
- data/test/test_migrate.rb +1 -1
- data/test/test_pool_factory.rb +1 -1
- data/test/test_url_model.rb +1 -1
- data/test/test_work_poller.rb +55 -1
- metadata +19 -14
- data/lib/iudex-da/iudex-da-1.3.3.jar +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
=== 1.4.0 (2013-10-29)
|
2
|
+
* Add experimental WorkPoller.do_reserve support, which utilizes
|
3
|
+
PostgreSQL 9.1+ writable Common Table Expressions (CTEs) to
|
4
|
+
efficiently(?) set a reserved column and instance (identifier) on
|
5
|
+
poll that guards against polling by another or the same iudex-worker
|
6
|
+
process. This enables multiple workers to cover the same urls/orders
|
7
|
+
as well as concurrent polling. Previously only an instance unique,
|
8
|
+
uhash_slice single-tenant partitioning scheme was supported for
|
9
|
+
multi-worker distribution.
|
10
|
+
* Add experimental support for concurrent (no discard) WorkPoller (in
|
11
|
+
collaboration with iudex-core 1.4.0 VisitManager), enabled by setting
|
12
|
+
do_reserve = true, do_discard = false.
|
13
|
+
* When enabled, polled max_urls is adjusted by current order_count
|
14
|
+
for subsequent polls.
|
15
|
+
* New max_reserved_time will force discard of possibly long reserved
|
16
|
+
tenure based on time since the queue was last empty.
|
17
|
+
* New max_discard_ratio will also force a discard if
|
18
|
+
order_count/max_urls exceeds this ratio.
|
19
|
+
* Add iudex-da-unreserve utility for cleaning reservations left due to
|
20
|
+
failure, based on age.
|
21
|
+
* Add DAKeys INSTANCE (String identifier) and RESERVED (date)
|
22
|
+
* Add migrations for instance and reserved columns; removal of
|
23
|
+
reserved from index_next_visit (profile)
|
24
|
+
* Upgrade to dbutils ~> 1.5.0, rjack-jdbc-postgres ~> 9.2.1002,
|
25
|
+
sequel ~> 3.46 (but not upcoming 4.x)
|
26
|
+
* Upgrade to iudex-* ~> 1.4.0 dependencies
|
27
|
+
* Upgrade to minitest ~> 4.7.4 (dev)
|
28
|
+
|
1
29
|
=== 1.3.3 (2012-11-8)
|
2
30
|
* FactoryHelper.create_update_filter now prefers an options Hash
|
3
31
|
exposing greater control over what is updated and how. In
|
data/Manifest.txt
CHANGED
@@ -6,12 +6,16 @@ pom.xml
|
|
6
6
|
bin/iudex-da-generate-test-data
|
7
7
|
bin/iudex-da-import
|
8
8
|
bin/iudex-da-simhash-dump
|
9
|
+
bin/iudex-da-unreserve
|
9
10
|
bin/iudex-migrate
|
10
11
|
config/config.rb
|
11
12
|
db/20111012173757_base.rb
|
12
13
|
db/20120930173600_uhash_collation_order.rb
|
14
|
+
db/20130419090000_instance_column.rb
|
15
|
+
db/20130419095500_reserved_column.rb
|
13
16
|
db/index_next_visit/21500000000101_add_index_next_visit.rb
|
14
17
|
db/index_next_visit/21500000000110_index_next_visit_partial.rb
|
18
|
+
db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb
|
15
19
|
db/simhash/21500000000001_add_simhash_index.rb
|
16
20
|
lib/iudex-da/base.rb
|
17
21
|
lib/iudex-da.rb
|
@@ -28,4 +32,4 @@ test/test_migrate.rb
|
|
28
32
|
test/test_pool_factory.rb
|
29
33
|
test/test_url_model.rb
|
30
34
|
test/test_work_poller.rb
|
31
|
-
lib/iudex-da/iudex-da-1.3.3.jar
|
35
|
+
lib/iudex-da/iudex-da-1.4.0.jar
|
data/README.rdoc
CHANGED
@@ -16,7 +16,7 @@ meta-data store and work priority queue.
|
|
16
16
|
|
17
17
|
== License
|
18
18
|
|
19
|
-
Copyright (c) 2008-
|
19
|
+
Copyright (c) 2008-2013 David Kellum
|
20
20
|
|
21
21
|
Licensed under the Apache License, Version 2.0 (the "License"); you
|
22
22
|
may not use this file except in compliance with the License. You may
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
data/bin/iudex-da-import
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
data/bin/iudex-da-simhash-dump
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#--
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
|
+
# may not use this file except in compliance with the License. You
|
8
|
+
# may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
15
|
+
# implied. See the License for the specific language governing
|
16
|
+
# permissions and limitations under the License.
|
17
|
+
#++
|
18
|
+
|
19
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
20
|
+
|
21
|
+
require 'rubygems'
|
22
|
+
require 'optparse'
|
23
|
+
|
24
|
+
module IudexBinScript
|
25
|
+
|
26
|
+
require 'rjack-logback'
|
27
|
+
include RJack
|
28
|
+
Logback.config_console
|
29
|
+
|
30
|
+
require 'iudex-core'
|
31
|
+
require 'iudex-da/base'
|
32
|
+
require 'iudex-da/config'
|
33
|
+
include Iudex
|
34
|
+
|
35
|
+
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
36
|
+
|
37
|
+
interval = '60 minutes'
|
38
|
+
|
39
|
+
# Apply all config before loading the ORM (iudex-da/orm)
|
40
|
+
OptionParser.new do |opts|
|
41
|
+
opts.banner = <<END
|
42
|
+
Usage: iudex-da-unreserve [options]
|
43
|
+
END
|
44
|
+
opts.on( "-s", "--set name=value", String,
|
45
|
+
"Set connect prop (ex: database=iudex)" ) do |nv|
|
46
|
+
name,value = nv.split('=').map { |t| t.strip }
|
47
|
+
Hooker.add( [ :iudex, :connect_props ] ) do
|
48
|
+
{ name.to_sym => value }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
opts.on( "-p", "--profile NAME", String,
|
52
|
+
"Add a migration profile (ex: simhash)" ) do |p|
|
53
|
+
Hooker.add( [ :iudex, :migration_profiles ] ) do |profiles|
|
54
|
+
profiles << p
|
55
|
+
end
|
56
|
+
end
|
57
|
+
opts.on( "-d", "--debug" ) do
|
58
|
+
Logback[ 'iudex.da' ].level = :debug
|
59
|
+
Hooker.add( [ :iudex, :connect_props ] ) { { :log => true } }
|
60
|
+
end
|
61
|
+
opts.on( "-v", "--version", "Display version and exit" ) do
|
62
|
+
puts "iudex-da: #{DA::VERSION}"
|
63
|
+
exit 1
|
64
|
+
end
|
65
|
+
opts.on( "-l", "--long", "Log in long form, to STDERR" ) do
|
66
|
+
Logback.config_console( :full => true, :stderr => true )
|
67
|
+
end
|
68
|
+
opts.on( "-a", "--age AGE",
|
69
|
+
"Age to unreserve, in PG interval syntax " +
|
70
|
+
"(default: '60 minutes')" ) do |age|
|
71
|
+
interval = age
|
72
|
+
end
|
73
|
+
Hooker.register_config( opts )
|
74
|
+
end.parse!
|
75
|
+
|
76
|
+
require 'iudex-da/orm'
|
77
|
+
|
78
|
+
class Runner
|
79
|
+
include Iudex::DA
|
80
|
+
|
81
|
+
def unreserve( age )
|
82
|
+
ORM.db.transaction( :isolation => :repeatable,
|
83
|
+
:retry_on => [ Sequel::SerializationFailure ] ) do
|
84
|
+
ds = ORM::db.fetch <<-SQL
|
85
|
+
WITH unreserve AS (
|
86
|
+
UPDATE urls
|
87
|
+
SET reserved = NULL
|
88
|
+
WHERE reserved < ( now() - interval '#{age}' )
|
89
|
+
RETURNING instance )
|
90
|
+
SELECT count(*),instance FROM unreserve
|
91
|
+
GROUP BY instance
|
92
|
+
ORDER BY count DESC
|
93
|
+
SQL
|
94
|
+
|
95
|
+
total = ds.inject( 0 ) do |m,row|
|
96
|
+
puts( "%30s %7d" % [ row[:instance], row[:count] ] )
|
97
|
+
m + row[:count]
|
98
|
+
end
|
99
|
+
puts( "%30s %7d" % [ 'TOTAL', total ] ) if total > 0
|
100
|
+
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
end
|
105
|
+
|
106
|
+
Runner.new.unreserve( interval )
|
107
|
+
end
|
data/bin/iudex-migrate
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
@@ -29,7 +29,7 @@ module IudexBinScript
|
|
29
29
|
|
30
30
|
# Note: Avoid loading iudex-da with its jar dependency which would
|
31
31
|
# make it hard to bootstrap the db from source alone.
|
32
|
-
# Instead load only
|
32
|
+
# Instead load only necessary core, base, config, and orm (post config):
|
33
33
|
require 'iudex-core'
|
34
34
|
require 'iudex-da/base'
|
35
35
|
require 'iudex-da/config'
|
data/db/20111012173757_base.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
Sequel.migration do
|
18
|
+
change do
|
19
|
+
alter_table( :urls ) do
|
20
|
+
# A short string uniquely identifying the iudex worker instance
|
21
|
+
# that has last operated on this URL (for example processed, or
|
22
|
+
# in future: reserved). A hostname or IP may be a suitable
|
23
|
+
# value.
|
24
|
+
add_column :instance, String
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
Sequel.migration do
|
18
|
+
change do
|
19
|
+
alter_table( :urls ) do
|
20
|
+
add_column :reserved, "timestamp with time zone"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
Sequel.migration do
|
18
|
+
|
19
|
+
up do
|
20
|
+
alter_table( :urls ) do
|
21
|
+
drop_index( :next_visit_after )
|
22
|
+
add_index( :next_visit_after,
|
23
|
+
:where => 'next_visit_after IS NOT NULL AND reserved IS NULL' )
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
down do
|
28
|
+
alter_table( :urls ) do
|
29
|
+
drop_index( :next_visit_after )
|
30
|
+
add_index( :next_visit_after,
|
31
|
+
:where => 'next_visit_after is not null' )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
data/lib/iudex-da.rb
CHANGED
data/lib/iudex-da/base.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
module Iudex
|
18
18
|
module DA
|
19
|
-
VERSION = '1.3.3'
|
19
|
+
VERSION = '1.4.0'
|
20
20
|
|
21
21
|
LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
|
22
22
|
end
|
data/lib/iudex-da/config.rb
CHANGED
data/lib/iudex-da/importer.rb
CHANGED
Binary file
|
data/lib/iudex-da/key_helper.rb
CHANGED
data/lib/iudex-da/models.rb
CHANGED
data/lib/iudex-da/orm.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -17,7 +17,7 @@
|
|
17
17
|
require 'rjack-slf4j'
|
18
18
|
require 'iudex-da/config'
|
19
19
|
require 'sequel'
|
20
|
-
require 'jdbc
|
20
|
+
require 'rjack-jdbc-postgres'
|
21
21
|
require 'hooker'
|
22
22
|
|
23
23
|
Sequel.extension :migration
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -17,7 +17,7 @@
|
|
17
17
|
require 'iudex-da'
|
18
18
|
require 'rjack-slf4j'
|
19
19
|
require 'java'
|
20
|
-
require 'jdbc
|
20
|
+
require 'rjack-jdbc-postgres'
|
21
21
|
|
22
22
|
module Iudex::DA
|
23
23
|
|
data/lib/iudex-da/work_poller.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You may
|
@@ -70,10 +70,53 @@ module Iudex::DA
|
|
70
70
|
# priority order (default: false)
|
71
71
|
attr_writer :do_domain_group
|
72
72
|
|
73
|
+
# If set true, UPDATE reserved date (and instance, if specified)
|
74
|
+
# (Default: false)
|
75
|
+
attr_writer :do_reserve
|
76
|
+
|
77
|
+
# If set true, discards old queue at every poll, even if
|
78
|
+
# do_reserve could make queue re-fill a safe operation.
|
79
|
+
# (Default: true)
|
80
|
+
attr_writer :do_discard
|
81
|
+
|
82
|
+
# The ratio of current order_count to max_urls above which the old queue
|
83
|
+
# will be discarded as a safety to avoid starvation (Default: 0.667)
|
84
|
+
attr_accessor :max_discard_ratio
|
85
|
+
|
86
|
+
# The maximum amount of time in milliseconds that the oldest order
|
87
|
+
# can remain reserved before a discard is required. This is only
|
88
|
+
# relevant when do_reserve is true and do_discard is set false,
|
89
|
+
# and typically would be set as a multiple of max_poll_interval
|
90
|
+
# (ms). Note that max_poll_interval is interpreted as the worst
|
91
|
+
# case next discard opportunity for this purpose. The next poll
|
92
|
+
# made to an empty queue, either by prior discard or completion,
|
93
|
+
# resets the time tracking. (Default: nil, off)
|
94
|
+
def max_reserved_time
|
95
|
+
@max_reserved_time_s && ( @max_reserved_time_s * 1000.0 ).round
|
96
|
+
end
|
97
|
+
|
98
|
+
def max_reserved_time=( ms )
|
99
|
+
@max_reserved_time_s = ms / 1000.0
|
100
|
+
end
|
101
|
+
|
102
|
+
attr_reader :max_reserved_time_s
|
103
|
+
|
73
104
|
def domain_group?
|
74
105
|
@do_domain_group
|
75
106
|
end
|
76
107
|
|
108
|
+
def reserve?
|
109
|
+
@do_reserve
|
110
|
+
end
|
111
|
+
|
112
|
+
def discard?
|
113
|
+
@do_discard
|
114
|
+
end
|
115
|
+
|
116
|
+
# String uniquely identifying this worker instance. Only used here
|
117
|
+
# with do_reserve.
|
118
|
+
attr_accessor :instance
|
119
|
+
|
77
120
|
# First age coefficient. If set > 0.0, adjust priority by the
|
78
121
|
# equation:
|
79
122
|
#
|
@@ -125,10 +168,16 @@ module Iudex::DA
|
|
125
168
|
|
126
169
|
@domain_depth_coef = nil
|
127
170
|
@do_domain_group = false
|
171
|
+
@do_reserve = false
|
172
|
+
@do_discard = true
|
173
|
+
@instance = nil
|
128
174
|
|
129
175
|
@max_priority_urls = nil
|
130
176
|
@max_domain_urls = 10_000
|
131
177
|
@max_urls = 50_000
|
178
|
+
@max_discard_ratio = 2.0/3.0
|
179
|
+
@max_reserved_time_s = nil
|
180
|
+
@last_none_reserved = Time.now
|
132
181
|
|
133
182
|
@age_coef_1 = 0.2
|
134
183
|
@age_coef_2 = 0.1
|
@@ -138,7 +187,6 @@ module Iudex::DA
|
|
138
187
|
@uhash_slice = nil
|
139
188
|
|
140
189
|
@log = RJack::SLF4J[ self.class ]
|
141
|
-
#FIXME: Add accessor for log in GenericWorkPollStrategy
|
142
190
|
|
143
191
|
keys( :url, :priority, :next_visit_after ).each do |k|
|
144
192
|
unless mapper.fields.include?( k )
|
@@ -157,22 +205,68 @@ module Iudex::DA
|
|
157
205
|
|
158
206
|
# Override GenericWorkPollStrategy
|
159
207
|
def pollWorkImpl( visit_queue )
|
160
|
-
visit_queue.add_all( poll )
|
208
|
+
visit_queue.add_all( poll( visit_queue.order_count ) )
|
161
209
|
rescue SQLException => x
|
162
210
|
@log.error( "On poll: ", x )
|
163
211
|
end
|
164
212
|
|
165
213
|
# Poll work and return as List<UniMap>
|
166
214
|
# Raises SQLException
|
167
|
-
def poll
|
168
|
-
|
169
|
-
|
170
|
-
|
215
|
+
def poll( current_urls = 0 )
|
216
|
+
@last_none_reserved = Time.now if max_reserved_time_s && current_urls == 0
|
217
|
+
query = generate_query( current_urls )
|
218
|
+
@log.debug { "Poll query: #{query}" }
|
219
|
+
reader.select_with_retry( query )
|
220
|
+
end
|
221
|
+
|
222
|
+
# Override GenericWorkPollStrategy
|
223
|
+
def shouldReplaceQueue( visit_queue )
|
224
|
+
( !reserve? || discard? ||
|
225
|
+
( ( visit_queue.order_count.to_f / max_urls ) > max_discard_ratio ) ||
|
226
|
+
( max_reserved_time_s && next_reserve_time > max_reserved_time_s ) )
|
227
|
+
end
|
228
|
+
|
229
|
+
def next_reserve_time( now = Time.now )
|
230
|
+
now - @last_none_reserved + ( max_poll_interval / 1000.0 )
|
231
|
+
end
|
232
|
+
|
233
|
+
# Override GenericWorkPollStrategy to unreserve old VisitQueue
|
234
|
+
# contents when do_reserve is enabled.
|
235
|
+
def discard( visit_queue )
|
236
|
+
if reserve? && visit_queue.order_count > 0
|
237
|
+
orders = visit_queue.hosts.inject( [] ) do |a, hq|
|
238
|
+
a.concat( hq.orders.to_a )
|
239
|
+
end
|
240
|
+
if orders.length > 0
|
241
|
+
n = reader.unreserve( orders )
|
242
|
+
@log.info { "Unreserved #{n} orders on discard" }
|
243
|
+
end
|
244
|
+
end
|
245
|
+
rescue SQLException => x
|
246
|
+
@log.error( "On discard: ", x )
|
247
|
+
end
|
248
|
+
|
249
|
+
# Unreserve any orders that are reserved by the current instance.
|
250
|
+
# No-op unless do_reserve and instance are set.
|
251
|
+
def instance_unreserve
|
252
|
+
if reserve? && instance
|
253
|
+
n = reader.update( <<-SQL )
|
254
|
+
UPDATE urls
|
255
|
+
SET reserved = NULL
|
256
|
+
WHERE reserved IS NOT NULL AND
|
257
|
+
instance = '#{instance}'
|
258
|
+
SQL
|
259
|
+
@log.info { "Unreserved #{n} orders for instance #{instance}" }
|
260
|
+
n
|
261
|
+
end
|
262
|
+
rescue SQLException => x
|
263
|
+
@log.error( "On instance_unreserve: ", x )
|
171
264
|
end
|
172
265
|
|
173
266
|
def reader
|
174
267
|
@reader ||= ContentReader.new( @data_source, @mapper ).tap do |r|
|
175
268
|
r.priority_adjusted = aged_priority?
|
269
|
+
r.max_retries = 10
|
176
270
|
end
|
177
271
|
end
|
178
272
|
|
@@ -186,25 +280,33 @@ module Iudex::DA
|
|
186
280
|
end
|
187
281
|
end
|
188
282
|
|
189
|
-
def
|
283
|
+
def domain_union?
|
284
|
+
!@domain_union.empty?
|
285
|
+
end
|
286
|
+
|
287
|
+
def generate_query( current_urls )
|
190
288
|
criteria = [ "next_visit_after <= now()" ]
|
191
289
|
|
290
|
+
criteria << "reserved IS NULL" if reserve?
|
291
|
+
|
192
292
|
if uhash_slice
|
193
293
|
min, max = url64_range( *uhash_slice )
|
194
294
|
criteria << "uhash > ( '#{min}' COLLATE \"C\" )" if min
|
195
295
|
criteria << "uhash < ( '#{max}' COLLATE \"C\" )" if max
|
196
296
|
end
|
197
297
|
|
198
|
-
|
199
|
-
|
200
|
-
if @domain_union.empty?
|
201
|
-
query = generate_query_inner( criteria )
|
202
|
-
params = [ max_urls ]
|
298
|
+
unless domain_union?
|
299
|
+
query = generate_query_inner( criteria, ( max_urls - current_urls ) )
|
203
300
|
else
|
204
301
|
subqueries = []
|
205
302
|
@domain_union.each do | opts |
|
206
303
|
opts = opts.dup
|
207
|
-
opts[ :max ]
|
304
|
+
if opts[ :max ]
|
305
|
+
opts[ :max ] = ( opts[ :max ] * ( max_urls - current_urls ) /
|
306
|
+
max_urls.to_f ).floor
|
307
|
+
else
|
308
|
+
opts[ :max ] = ( max_urls - current_urls )
|
309
|
+
end
|
208
310
|
|
209
311
|
next if opts[ :max ] == 0
|
210
312
|
|
@@ -228,8 +330,7 @@ module Iudex::DA
|
|
228
330
|
c << "type = '#{opts[ :type ]}'"
|
229
331
|
end
|
230
332
|
|
231
|
-
subqueries << generate_query_inner( c )
|
232
|
-
params << opts[ :max ]
|
333
|
+
subqueries << generate_query_inner( c, opts[ :max ] )
|
233
334
|
end
|
234
335
|
if subqueries.size == 1
|
235
336
|
query = subqueries.first
|
@@ -238,17 +339,20 @@ module Iudex::DA
|
|
238
339
|
end
|
239
340
|
end
|
240
341
|
|
342
|
+
query = wrap_with_update( fields, query ) if reserve?
|
343
|
+
|
241
344
|
query = wrap_domain_group_query( fields, query ) if domain_group?
|
242
345
|
|
243
346
|
query = query.gsub( /\s+/, ' ').strip
|
244
347
|
|
245
|
-
|
348
|
+
query
|
246
349
|
end
|
247
350
|
|
248
|
-
def generate_query_inner( criteria )
|
351
|
+
def generate_query_inner( criteria, max_urls )
|
249
352
|
|
250
353
|
query = filter_query(
|
251
|
-
fields( ( :domain if domain_depth? || domain_group? )
|
354
|
+
fields( ( :domain if domain_depth? || domain_group? ),
|
355
|
+
( :uhash if reserve? ) ),
|
252
356
|
( max_priority_urls if domain_depth? ),
|
253
357
|
criteria )
|
254
358
|
|
@@ -260,7 +364,7 @@ module Iudex::DA
|
|
260
364
|
limit_priority = domain_depth? ? :adj_priority : :priority
|
261
365
|
query += <<-SQL
|
262
366
|
ORDER BY #{limit_priority} DESC
|
263
|
-
LIMIT
|
367
|
+
LIMIT #{max_urls}
|
264
368
|
SQL
|
265
369
|
|
266
370
|
query
|
@@ -288,7 +392,7 @@ module Iudex::DA
|
|
288
392
|
if aged_priority?
|
289
393
|
flds = flds.dup
|
290
394
|
i = flds.index( :priority ) || flds.size
|
291
|
-
flds[ i ] = <<-SQL
|
395
|
+
flds[ i ] = <<-SQL.strip
|
292
396
|
( priority +
|
293
397
|
#{age_coef_1}::REAL *
|
294
398
|
SQRT( #{age_coef_2}::REAL *
|
@@ -311,6 +415,24 @@ module Iudex::DA
|
|
311
415
|
sql
|
312
416
|
end
|
313
417
|
|
418
|
+
def wrap_with_update( flds, sub )
|
419
|
+
sflds = [ "reserved = now()" ]
|
420
|
+
sflds << "instance = '#{instance}'" if instance
|
421
|
+
|
422
|
+
# Use ..FOR UPDATE unless not supported by query specific
|
423
|
+
# options with PostgreSQL <= 9.1
|
424
|
+
sub += " FOR UPDATE" unless domain_depth? || domain_union?
|
425
|
+
|
426
|
+
<<-SQL
|
427
|
+
WITH work AS ( #{sub} ),
|
428
|
+
reserve AS (
|
429
|
+
UPDATE urls
|
430
|
+
SET #{clist sflds}
|
431
|
+
WHERE uhash IN ( SELECT uhash FROM work ) )
|
432
|
+
SELECT #{clist flds} FROM work
|
433
|
+
SQL
|
434
|
+
end
|
435
|
+
|
314
436
|
def wrap_domain_group_query( flds, sub )
|
315
437
|
<<-SQL
|
316
438
|
SELECT #{clist flds}
|
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-da</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.3.3</version>
|
8
|
+
<version>1.4.0</version>
|
9
9
|
<name>Iudex Data Access</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.4.0</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -20,13 +20,13 @@
|
|
20
20
|
<dependency>
|
21
21
|
<groupId>iudex</groupId>
|
22
22
|
<artifactId>iudex-core</artifactId>
|
23
|
-
<version>[1.
|
23
|
+
<version>[1.4.0,1.4.999)</version>
|
24
24
|
</dependency>
|
25
25
|
|
26
26
|
<dependency>
|
27
27
|
<groupId>commons-dbutils</groupId>
|
28
28
|
<artifactId>commons-dbutils</artifactId>
|
29
|
-
<version>1.
|
29
|
+
<version>1.5</version>
|
30
30
|
</dependency>
|
31
31
|
|
32
32
|
<dependency>
|
@@ -38,7 +38,7 @@
|
|
38
38
|
<dependency>
|
39
39
|
<groupId>commons-pool</groupId>
|
40
40
|
<artifactId>commons-pool</artifactId>
|
41
|
-
<version>[1.5.
|
41
|
+
<version>[1.5.7,1.5.999]</version>
|
42
42
|
</dependency>
|
43
43
|
|
44
44
|
<dependency>
|
data/test/setup.rb
CHANGED
data/test/test_migrate.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You may
|
data/test/test_pool_factory.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_url_model.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You may
|
data/test/test_work_poller.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You may
|
@@ -25,6 +25,7 @@ require 'iudex-da/pool_data_source_factory'
|
|
25
25
|
require 'iudex-da/models'
|
26
26
|
|
27
27
|
class TestWorkPoller < MiniTest::Unit::TestCase
|
28
|
+
include Iudex::Core
|
28
29
|
include Iudex::Filter::KeyHelper
|
29
30
|
include Iudex::DA
|
30
31
|
include Iudex::DA::ORM
|
@@ -66,6 +67,31 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
66
67
|
assert_equal( 3, pos )
|
67
68
|
end
|
68
69
|
|
70
|
+
def test_poll_with_reserve
|
71
|
+
poller.do_reserve = true
|
72
|
+
poller.max_urls = 2
|
73
|
+
poller.instance = 'test'
|
74
|
+
|
75
|
+
polled = poller.poll
|
76
|
+
polled.each_with_index do |map,i|
|
77
|
+
assert_equal( URLS[ i ][ 0 ], map.url.url )
|
78
|
+
end
|
79
|
+
assert_equal( 2, polled.size )
|
80
|
+
reserved = polled
|
81
|
+
|
82
|
+
polled = poller.poll
|
83
|
+
assert_equal( 1, polled.size )
|
84
|
+
assert_equal( URLS[2][0], polled.first.url.url )
|
85
|
+
reserved += polled
|
86
|
+
|
87
|
+
RJack::Logback[ 'iudex.da.WorkPoller' ].with_level( :warn ) do
|
88
|
+
poller.discard( VisitQueue.new.tap { |q| q.add_all( reserved ) } )
|
89
|
+
end
|
90
|
+
poller.max_urls = 3
|
91
|
+
|
92
|
+
assert_equal( 3, poller.poll.size )
|
93
|
+
end
|
94
|
+
|
69
95
|
def test_poll_with_max_priority_urls
|
70
96
|
poller.max_priority_urls = 4
|
71
97
|
|
@@ -89,6 +115,25 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
89
115
|
assert_equal( 3, pos )
|
90
116
|
end
|
91
117
|
|
118
|
+
def test_poll_with_domain_depth_reserve
|
119
|
+
poller.domain_depth_coef = 0.125
|
120
|
+
poller.max_priority_urls = 4
|
121
|
+
poller.do_reserve = true
|
122
|
+
poller.instance = 'test'
|
123
|
+
|
124
|
+
pos = 0
|
125
|
+
poller.poll.each do |map|
|
126
|
+
assert_equal( URLS[ pos ][ 0 ], map.url.url )
|
127
|
+
pos += 1
|
128
|
+
end
|
129
|
+
assert_equal( 3, pos )
|
130
|
+
RJack::Logback[ 'iudex.da.WorkPoller' ].with_level( :warn ) do
|
131
|
+
assert_equal( 3, poller.instance_unreserve )
|
132
|
+
end
|
133
|
+
assert_equal( 3, poller.poll.size )
|
134
|
+
assert_equal( 0, poller.poll.size )
|
135
|
+
end
|
136
|
+
|
92
137
|
def test_poll_with_domain_depth_only
|
93
138
|
poller.domain_depth_coef = 0.125
|
94
139
|
poller.age_coef_1 = 0.0
|
@@ -131,6 +176,15 @@ class TestWorkPoller < MiniTest::Unit::TestCase
|
|
131
176
|
assert_equal( 3, result.size )
|
132
177
|
end
|
133
178
|
|
179
|
+
def test_poll_domain_union_2_reserve
|
180
|
+
poller.do_reserve = true
|
181
|
+
poller.domain_union = [ { :domain => 'gravitext.com', :max => 15000 },
|
182
|
+
{ :max => 10000 } ]
|
183
|
+
|
184
|
+
assert_equal( 3, poller.poll.size )
|
185
|
+
assert_equal( 0, poller.poll.size )
|
186
|
+
end
|
187
|
+
|
134
188
|
def test_poll_domain_union_3
|
135
189
|
poller.domain_union = [ { :domain => 'gravitext.com', :max => 1 },
|
136
190
|
{ :domain => 'hometown.com', :max => 1 },
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: iudex-da
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.3
|
5
|
+
version: 1.4.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-10-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: iudex-core
|
@@ -17,13 +17,13 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - ~>
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 1.
|
20
|
+
version: 1.4.0
|
21
21
|
none: false
|
22
22
|
requirement: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.
|
26
|
+
version: 1.4.0
|
27
27
|
none: false
|
28
28
|
prerelease: false
|
29
29
|
type: :runtime
|
@@ -33,29 +33,29 @@ dependencies:
|
|
33
33
|
requirements:
|
34
34
|
- - ~>
|
35
35
|
- !ruby/object:Gem::Version
|
36
|
-
version: 3.
|
36
|
+
version: '3.46'
|
37
37
|
none: false
|
38
38
|
requirement: !ruby/object:Gem::Requirement
|
39
39
|
requirements:
|
40
40
|
- - ~>
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 3.
|
42
|
+
version: '3.46'
|
43
43
|
none: false
|
44
44
|
prerelease: false
|
45
45
|
type: :runtime
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name: jdbc-postgres
|
47
|
+
name: rjack-jdbc-postgres
|
48
48
|
version_requirements: !ruby/object:Gem::Requirement
|
49
49
|
requirements:
|
50
50
|
- - ~>
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version: 9.
|
52
|
+
version: 9.2.1002
|
53
53
|
none: false
|
54
54
|
requirement: !ruby/object:Gem::Requirement
|
55
55
|
requirements:
|
56
56
|
- - ~>
|
57
57
|
- !ruby/object:Gem::Version
|
58
|
-
version: 9.
|
58
|
+
version: 9.2.1002
|
59
59
|
none: false
|
60
60
|
prerelease: false
|
61
61
|
type: :runtime
|
@@ -81,13 +81,13 @@ dependencies:
|
|
81
81
|
requirements:
|
82
82
|
- - ~>
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 1.
|
84
|
+
version: 1.5.0
|
85
85
|
none: false
|
86
86
|
requirement: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - ~>
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: 1.
|
90
|
+
version: 1.5.0
|
91
91
|
none: false
|
92
92
|
prerelease: false
|
93
93
|
type: :runtime
|
@@ -97,13 +97,13 @@ dependencies:
|
|
97
97
|
requirements:
|
98
98
|
- - ~>
|
99
99
|
- !ruby/object:Gem::Version
|
100
|
-
version:
|
100
|
+
version: 4.7.4
|
101
101
|
none: false
|
102
102
|
requirement: !ruby/object:Gem::Requirement
|
103
103
|
requirements:
|
104
104
|
- - ~>
|
105
105
|
- !ruby/object:Gem::Version
|
106
|
-
version:
|
106
|
+
version: 4.7.4
|
107
107
|
none: false
|
108
108
|
prerelease: false
|
109
109
|
type: :development
|
@@ -146,6 +146,7 @@ executables:
|
|
146
146
|
- iudex-da-generate-test-data
|
147
147
|
- iudex-da-import
|
148
148
|
- iudex-da-simhash-dump
|
149
|
+
- iudex-da-unreserve
|
149
150
|
- iudex-migrate
|
150
151
|
extensions: []
|
151
152
|
extra_rdoc_files:
|
@@ -160,12 +161,16 @@ files:
|
|
160
161
|
- bin/iudex-da-generate-test-data
|
161
162
|
- bin/iudex-da-import
|
162
163
|
- bin/iudex-da-simhash-dump
|
164
|
+
- bin/iudex-da-unreserve
|
163
165
|
- bin/iudex-migrate
|
164
166
|
- config/config.rb
|
165
167
|
- db/20111012173757_base.rb
|
166
168
|
- db/20120930173600_uhash_collation_order.rb
|
169
|
+
- db/20130419090000_instance_column.rb
|
170
|
+
- db/20130419095500_reserved_column.rb
|
167
171
|
- db/index_next_visit/21500000000101_add_index_next_visit.rb
|
168
172
|
- db/index_next_visit/21500000000110_index_next_visit_partial.rb
|
173
|
+
- db/index_next_visit/21500000000120_index_next_visit_not_reserved.rb
|
169
174
|
- db/simhash/21500000000001_add_simhash_index.rb
|
170
175
|
- lib/iudex-da/base.rb
|
171
176
|
- lib/iudex-da.rb
|
@@ -182,7 +187,7 @@ files:
|
|
182
187
|
- test/test_pool_factory.rb
|
183
188
|
- test/test_url_model.rb
|
184
189
|
- test/test_work_poller.rb
|
185
|
-
- lib/iudex-da/iudex-da-1.3.3.jar
|
190
|
+
- lib/iudex-da/iudex-da-1.4.0.jar
|
186
191
|
homepage: http://iudex.gravitext.com
|
187
192
|
licenses: []
|
188
193
|
post_install_message:
|
Binary file
|