iudex-worker 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +17 -0
- data/README.rdoc +25 -0
- data/Rakefile +41 -0
- data/bin/iudex-worker-fg +50 -0
- data/config/config.rb +46 -0
- data/init/iudex-worker +46 -0
- data/lib/iudex-worker/agent.rb +87 -0
- data/lib/iudex-worker/base.rb +21 -0
- data/lib/iudex-worker/fetch_helper.rb +79 -0
- data/lib/iudex-worker/filter_chain_factory.rb +201 -0
- data/lib/iudex-worker/prioritizer.rb +152 -0
- data/lib/iudex-worker.rb +20 -0
- data/test/setup.rb +34 -0
- data/test/test_agent.rb +69 -0
- data/test/test_filter_chain_factory.rb +63 -0
- data/test/test_prioritizer.rb +105 -0
- metadata +182 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
bin/iudex-worker-fg
|
6
|
+
config/config.rb
|
7
|
+
init/iudex-worker
|
8
|
+
lib/iudex-worker/base.rb
|
9
|
+
lib/iudex-worker.rb
|
10
|
+
lib/iudex-worker/agent.rb
|
11
|
+
lib/iudex-worker/fetch_helper.rb
|
12
|
+
lib/iudex-worker/filter_chain_factory.rb
|
13
|
+
lib/iudex-worker/prioritizer.rb
|
14
|
+
test/setup.rb
|
15
|
+
test/test_agent.rb
|
16
|
+
test/test_filter_chain_factory.rb
|
17
|
+
test/test_prioritizer.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= iudex-worker
|
2
|
+
|
3
|
+
* http://github.com/dekellum/iudex
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Iudex is a general purpose web crawler and feed processor in
|
8
|
+
ruby/java. The iudex-worker gem provides a worker daemon for feed/page
|
9
|
+
processing.
|
10
|
+
|
11
|
+
== License
|
12
|
+
|
13
|
+
Copyright (c) 2008-2011 David Kellum
|
14
|
+
|
15
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
16
|
+
may not use this file except in compliance with the License. You may
|
17
|
+
obtain a copy of the License at
|
18
|
+
|
19
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
20
|
+
|
21
|
+
Unless required by applicable law or agreed to in writing, software
|
22
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
23
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
24
|
+
implied. See the License for the specific language governing
|
25
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
$LOAD_PATH << './lib'
|
4
|
+
require 'iudex-worker/base'
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
gem 'rjack-tarpit', '~> 1.2'
|
8
|
+
require 'rjack-tarpit'
|
9
|
+
|
10
|
+
t = RJack::TarPit.new( 'iudex-worker', Iudex::Worker::VERSION, :java_platform )
|
11
|
+
|
12
|
+
t.specify do |h|
|
13
|
+
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
14
|
+
|
15
|
+
h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
|
16
|
+
[ 'rjack-logback', '~> 1.0' ],
|
17
|
+
[ 'iudex-da', '~> 1.0.0' ],
|
18
|
+
[ 'iudex-rome', '~> 1.0.0' ],
|
19
|
+
[ 'iudex-html', '~> 1.0.0' ],
|
20
|
+
[ 'iudex-simhash', '~> 1.0.0' ],
|
21
|
+
[ 'iudex-httpclient-3', '~> 1.0.0' ] ]
|
22
|
+
|
23
|
+
h.testlib = :minitest
|
24
|
+
h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ] ]
|
25
|
+
end
|
26
|
+
|
27
|
+
task :chk_hist_vers do
|
28
|
+
t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
|
29
|
+
end
|
30
|
+
task :chk_init_v do
|
31
|
+
t.test_line_match( 'init/iudex-worker', /^gem.+#{t.name}/, /= #{t.version}/ )
|
32
|
+
end
|
33
|
+
task :chk_hist_date do
|
34
|
+
t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
|
35
|
+
end
|
36
|
+
|
37
|
+
task :gem => [ :chk_hist_vers, :chk_init_v ]
|
38
|
+
task :tag => [ :chk_hist_vers, :chk_init_v, :chk_hist_date ]
|
39
|
+
task :push => [ :chk_hist_date ]
|
40
|
+
|
41
|
+
t.define_tasks
|
data/bin/iudex-worker-fg
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
21
|
+
|
22
|
+
require 'optparse'
|
23
|
+
|
24
|
+
# Foreground launcher for the iudex worker: configures console logging,
# parses the command line options, then runs a Worker::Agent.
module IudexBinScript

  require 'rubygems'
  require 'rjack-logback'

  include RJack
  Logback.config_console( :mdc => "uhash", :thread => true )

  require 'iudex-worker'
  include Iudex

  # Route Hooker's own log output through the 'iudex' SLF4J logger.
  Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }

  OptionParser.new do |opts|
    opts.on( "-v", "--version", "Display version" ) do
      puts "iudex-worker: #{ Worker::VERSION }"
      # Printing the version is a successful outcome, so report success
      # (status 0) to the calling shell rather than a failure code.
      exit 0
    end
    opts.on( "-d", "--debug", "Enable verbose DEBUG logging" ) do
      Logback[ 'iudex' ].level = Logback::DEBUG
    end
    # Hooker adds its own option(s) to this parser; see the Hooker docs.
    Hooker.register_config( opts )
  end.parse!

  Worker::Agent.new.run

end
|
data/config/config.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
|
2
|
+
RJack::Logback[ 'iudex.filter.core.FilterChain.agent' ].level =
|
3
|
+
RJack::Logback::DEBUG
|
4
|
+
|
5
|
+
Iudex.configure do |c|
|
6
|
+
|
7
|
+
threads = 3
|
8
|
+
|
9
|
+
c.setup_connect_props do
|
10
|
+
{ :ds_pool => { :max_active => threads / 3 * 2,
|
11
|
+
:max_idle => threads / 3 },
|
12
|
+
:loglevel => 1 }
|
13
|
+
end
|
14
|
+
|
15
|
+
c.setup_http_client_3 do |mgr|
|
16
|
+
mgr.manager_params.max_total_connections = threads * 10
|
17
|
+
end
|
18
|
+
|
19
|
+
c.setup_visit_executor do |vx|
|
20
|
+
vx.max_threads = threads
|
21
|
+
vx.min_host_delay = 100 #ms
|
22
|
+
end
|
23
|
+
|
24
|
+
c.setup_work_poller do |wp|
|
25
|
+
wp.min_order_remaining_ratio = 0.30
|
26
|
+
wp.max_check_interval = 100 #ms
|
27
|
+
wp.min_poll_interval = 2_000 #ms
|
28
|
+
end
|
29
|
+
|
30
|
+
c.setup_filter_factory do |ff|
|
31
|
+
|
32
|
+
def ff.barc_writer
|
33
|
+
bw = super
|
34
|
+
bw.do_compress = false
|
35
|
+
bw
|
36
|
+
end
|
37
|
+
|
38
|
+
def ff.barc_directory
|
39
|
+
bdir = super
|
40
|
+
bdir.target_length = 2 * ( 1024 ** 2 )
|
41
|
+
bdir
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
data/init/iudex-worker
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#. hashdot.profile += daemon
|
4
|
+
#. hashdot.pid_file = ./iudex-worker.pid
|
5
|
+
#. hashdot.io_redirect.file = ./iudex-worker.log
|
6
|
+
#. hashdot.vm.options += -Xmx1g
|
7
|
+
#. hashdot.vm.options += -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled
|
8
|
+
|
9
|
+
#--
|
10
|
+
# Copyright (c) 2008-2011 David Kellum
|
11
|
+
#
|
12
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
13
|
+
# may not use this file except in compliance with the License. You
|
14
|
+
# may obtain a copy of the License at
|
15
|
+
#
|
16
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
17
|
+
#
|
18
|
+
# Unless required by applicable law or agreed to in writing, software
|
19
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
20
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
21
|
+
# implied. See the License for the specific language governing
|
22
|
+
# permissions and limitations under the License.
|
23
|
+
#++
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
|
27
|
+
gem( "iudex-worker", "= 1.0.0" )
|
28
|
+
|
29
|
+
module IudexInitScript
|
30
|
+
|
31
|
+
require 'rjack-logback'
|
32
|
+
include RJack
|
33
|
+
Logback.config_console( :full => true, :thread => true, :mdc => "uhash" )
|
34
|
+
|
35
|
+
require 'iudex-worker'
|
36
|
+
include Iudex
|
37
|
+
|
38
|
+
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
39
|
+
|
40
|
+
if File.exist?( './config.rb' )
|
41
|
+
Hooker.load_file( './config.rb' )
|
42
|
+
end
|
43
|
+
|
44
|
+
Worker::Agent.new.run
|
45
|
+
|
46
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-da'
|
18
|
+
require 'iudex-da/key_helper'
|
19
|
+
require 'iudex-da/pool_data_source_factory'
|
20
|
+
|
21
|
+
require 'iudex-httpclient-3'
|
22
|
+
|
23
|
+
require 'iudex-worker'
|
24
|
+
require 'iudex-worker/filter_chain_factory'
|
25
|
+
|
26
|
+
require 'hooker'
|
27
|
+
|
28
|
+
module Iudex
|
29
|
+
module Worker
|
30
|
+
|
31
|
+
class Agent
|
32
|
+
include Iudex::DA
|
33
|
+
include Iudex::Filter::KeyHelper
|
34
|
+
include Iudex::Core
|
35
|
+
include Iudex::Worker
|
36
|
+
include Gravitext::HTMap
|
37
|
+
|
38
|
+
def initialize
|
39
|
+
Hooker.apply( [ :iudex, :worker ], self )
|
40
|
+
end
|
41
|
+
|
42
|
+
def poll_keys
|
43
|
+
[ :url, :type, :priority, :next_visit_after, :last_visit, :etag ]
|
44
|
+
end
|
45
|
+
|
46
|
+
# Note: derived classes can (and do) override this to supply a different factory.
|
47
|
+
def filter_chain_factory
|
48
|
+
FilterChainFactory.new( 'agent' )
|
49
|
+
end
|
50
|
+
|
51
|
+
# Run the worker until the visit executor finishes (normally when
# interrupted).
#
# Within the :iudex Hooker scope this creates the pooled data source, the
# work poller, the HTTP client manager and the filter chain, applying the
# :work_poller, :filter_factory and :visit_executor hooks along the way,
# then starts the VisitExecutor and joins it.
#
# The HTTP manager and the data source pool are released in ensure
# clauses, so they are shut down (manager first, then the pool) even when
# setup or the visit loop raises.
def run
  Hooker.with( :iudex ) do
    dsf = PoolDataSourceFactory.new
    data_source = dsf.create
    begin
      cmapper = ContentMapper.new( keys( poll_keys ) )
      wpoller = WorkPoller.new( data_source, cmapper )
      Hooker.apply( :work_poller, wpoller )

      mgr = HTTPClient3.create_manager
      mgr.start
      begin
        http_client = HTTPClient3::HTTPClient3.new( mgr.client )

        fcf = filter_chain_factory
        fcf.http_client = http_client
        fcf.data_source = data_source

        Hooker.apply( :filter_factory, fcf )

        fcf.filter do |chain|
          vexec = VisitExecutor.new( chain, wpoller )
          Hooker.apply( :visit_executor, vexec )

          Hooker.log_not_applied # All hooks should be used by now

          vexec.start
          vexec.join #Run until interrupted
        end # fcf closes
      ensure
        # Only reached once the manager was started successfully.
        mgr.shutdown
      end
    ensure
      # Only reached once dsf.create succeeded.
      dsf.close
    end
  end
end
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
|
18
|
+
module Worker
|
19
|
+
VERSION = '1.0.0'
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-worker'
|
18
|
+
|
19
|
+
module Iudex
|
20
|
+
module Worker
|
21
|
+
|
22
|
+
module FetchHelper
|
23
|
+
include Iudex::HTTP
|
24
|
+
include Iudex::Core::Filters
|
25
|
+
|
26
|
+
def create_content_fetcher( accept_types, receiver_sym )
|
27
|
+
cf = ContentFetcher.new( http_client, create_chain( receiver_sym ) )
|
28
|
+
|
29
|
+
alist = accept_list( accept_types )
|
30
|
+
cf.accepted_content_types = alist unless alist.include?( '*/*' )
|
31
|
+
|
32
|
+
headers = [ [ 'User-Agent', http_user_agent ],
|
33
|
+
[ 'Accept', accept_header( accept_types ) ] ]
|
34
|
+
|
35
|
+
cf.request_headers = headers.map { |kv| Header.new( *kv ) }
|
36
|
+
|
37
|
+
cf
|
38
|
+
end
|
39
|
+
|
40
|
+
def http_user_agent
|
41
|
+
( "Mozilla/5.0 (compatible; " +
|
42
|
+
"Iudex #{Iudex::Worker::VERSION}; " +
|
43
|
+
"+http://gravitext.com/iudex)" )
|
44
|
+
end
|
45
|
+
|
46
|
+
def feed_mime_types
|
47
|
+
# List of accepted mime types, grouped and ordered in descending
|
48
|
+
# order of preference.
|
49
|
+
[ %w[ application/atom+xml application/rss+xml ],
|
50
|
+
%w[ application/rdf+xml application/xml ],
|
51
|
+
%w[ text/xml ],
|
52
|
+
%w[ text/* ],
|
53
|
+
%w[ */* ] ]
|
54
|
+
end
|
55
|
+
|
56
|
+
def page_mime_types
|
57
|
+
[ %w[ application/xhtml+xml text/html ],
|
58
|
+
%w[ application/xml ],
|
59
|
+
%w[ text/* ] ]
|
60
|
+
end
|
61
|
+
|
62
|
+
# Build an HTTP Accept header value from groups of mime types.
#
# types:: Array of Arrays of mime type Strings, most preferred group
#         first (see feed_mime_types and page_mime_types).
#
# The first group is emitted without a quality value (implying q=1.0);
# each following group gets a q value one tenth lower than the previous
# one: 0.9, 0.8, ... The value is tracked as an integer number of tenths
# rather than a Float, because repeatedly subtracting 0.1 from a Float
# renders as e.g. "0.7000000000000001" when interpolated on Ruby 1.9+.
# It is clamped at 0.1 so that long lists never yield q=0 or a negative
# quality.
#
# Returns the comma-joined header value String ("" for an empty list).
def accept_header( types )
  tenths = 10
  ts = types.map do |tgrp|
    tgrp = tgrp.map { |m| "#{m};q=0.#{tenths}" } if tenths < 10
    tenths = [ tenths - 1, 1 ].max
    tgrp
  end
  ts.flatten.join( ',' )
end
|
71
|
+
|
72
|
+
def accept_list( types )
|
73
|
+
types.flatten
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-filter'
|
18
|
+
require 'iudex-filter/filter_chain_factory'
|
19
|
+
|
20
|
+
require 'iudex-barc'
|
21
|
+
|
22
|
+
require 'iudex-core'
|
23
|
+
|
24
|
+
require 'iudex-da'
|
25
|
+
require 'iudex-da/factory_helper'
|
26
|
+
|
27
|
+
require 'iudex-rome'
|
28
|
+
|
29
|
+
require 'iudex-html'
|
30
|
+
require 'iudex-html/factory_helper'
|
31
|
+
|
32
|
+
require 'iudex-simhash'
|
33
|
+
require 'iudex-simhash/factory_helper'
|
34
|
+
|
35
|
+
require 'iudex-worker'
|
36
|
+
require 'iudex-worker/fetch_helper'
|
37
|
+
require 'iudex-worker/prioritizer'
|
38
|
+
|
39
|
+
module Iudex
|
40
|
+
module Worker
|
41
|
+
|
42
|
+
class FilterChainFactory < Iudex::Filter::Core::FilterChainFactory
|
43
|
+
include Iudex::Filter::Core
|
44
|
+
include Iudex::BARC
|
45
|
+
include Iudex::Core
|
46
|
+
include Iudex::Core::Filters
|
47
|
+
include Iudex::ROME
|
48
|
+
|
49
|
+
include Iudex::DA::Filters::FactoryHelper
|
50
|
+
include Iudex::HTML::Filters::FactoryHelper
|
51
|
+
include Iudex::SimHash::Filters::FactoryHelper
|
52
|
+
include FetchHelper
|
53
|
+
|
54
|
+
attr_accessor :http_client
|
55
|
+
attr_accessor :data_source
|
56
|
+
|
57
|
+
def initialize( name )
|
58
|
+
super
|
59
|
+
setup_reporters
|
60
|
+
end
|
61
|
+
|
62
|
+
def setup_reporters
|
63
|
+
add_summary_reporter
|
64
|
+
add_by_filter_reporter
|
65
|
+
end
|
66
|
+
|
67
|
+
def filters
|
68
|
+
[ UHashMDCSetter.new,
|
69
|
+
DefaultFilter.new,
|
70
|
+
super,
|
71
|
+
type_switch ].flatten
|
72
|
+
end
|
73
|
+
|
74
|
+
def listeners
|
75
|
+
super + [ MDCUnsetter.new( "uhash" ) ]
|
76
|
+
end
|
77
|
+
|
78
|
+
def type_map
|
79
|
+
{ "FEED" => feed_fetcher,
|
80
|
+
"PAGE" => page_fetcher }
|
81
|
+
end
|
82
|
+
|
83
|
+
def type_switch( tmap = type_map )
|
84
|
+
create_switch( :type.to_k, tmap )
|
85
|
+
end
|
86
|
+
|
87
|
+
def feed_fetcher
|
88
|
+
[ create_content_fetcher( feed_mime_types, :feed_receiver ) ]
|
89
|
+
end
|
90
|
+
|
91
|
+
def page_fetcher
|
92
|
+
[ create_content_fetcher( page_mime_types, :page_receiver ) ]
|
93
|
+
end
|
94
|
+
|
95
|
+
def feed_receiver
|
96
|
+
[ RomeFeedParser.new,
|
97
|
+
DefaultFilter.new,
|
98
|
+
DateChangeFilter.new( false ),
|
99
|
+
feed_updater ]
|
100
|
+
end
|
101
|
+
|
102
|
+
def feed_updater
|
103
|
+
create_update_filter( keys( feed_update_keys ),
|
104
|
+
:feed_post, :feed_ref_update, :feed_ref_new )
|
105
|
+
end
|
106
|
+
|
107
|
+
def feed_ref_new
|
108
|
+
[ UHashMDCSetter.new,
|
109
|
+
ref_common_cleanup,
|
110
|
+
Prioritizer.new( "feed-ref-new",
|
111
|
+
:constant => 50,
|
112
|
+
:min_next => 0.0 ) ].flatten
|
113
|
+
end
|
114
|
+
|
115
|
+
def feed_ref_update
|
116
|
+
[ UHashMDCSetter.new,
|
117
|
+
DateChangeFilter.new( true ),
|
118
|
+
ref_common_cleanup,
|
119
|
+
Prioritizer.new( "feed-ref-update",
|
120
|
+
:constant => 10,
|
121
|
+
:min_next => 0.0 ) ].flatten
|
122
|
+
end
|
123
|
+
|
124
|
+
# Note: *_post is run possibly twice, once for both base content
|
125
|
+
# map and referer map.
|
126
|
+
def feed_post
|
127
|
+
[ UHashMDCSetter.new,
|
128
|
+
ref_common_cleanup,
|
129
|
+
Prioritizer.new( "feed-post",
|
130
|
+
:constant => 30,
|
131
|
+
:visiting_now => true ),
|
132
|
+
last_visit_setter ].flatten
|
133
|
+
end
|
134
|
+
|
135
|
+
def ref_common_cleanup
|
136
|
+
[ ref_html_filters,
|
137
|
+
TextCtrlWSFilter.new( :title.to_k ),
|
138
|
+
FutureDateFilter.new( :pub_date.to_k ) ].flatten
|
139
|
+
end
|
140
|
+
|
141
|
+
def ref_html_filters
|
142
|
+
[ html_clean_filters( :title ),
|
143
|
+
html_clean_filters( :summary ),
|
144
|
+
html_clean_filters( :content ),
|
145
|
+
html_write_filter( :summary ),
|
146
|
+
html_write_filter( :content ) ].flatten
|
147
|
+
end
|
148
|
+
|
149
|
+
def feed_update_keys
|
150
|
+
page_update_keys + [ :title, :summary, :content ]
|
151
|
+
end
|
152
|
+
|
153
|
+
def page_receiver
|
154
|
+
[ html_clean_filters( :source ),
|
155
|
+
simhash_generator,
|
156
|
+
page_updater ].flatten
|
157
|
+
end
|
158
|
+
|
159
|
+
def barc_writer
|
160
|
+
bw = BARCWriter.new( barc_directory )
|
161
|
+
bw.do_compress = true
|
162
|
+
bw
|
163
|
+
end
|
164
|
+
|
165
|
+
def barc_directory
|
166
|
+
bdir = BARCDirectory.new( Java::java.io.File.new( "./barc" ) )
|
167
|
+
bdir
|
168
|
+
end
|
169
|
+
|
170
|
+
def page_updater
|
171
|
+
create_update_filter( keys( page_update_keys ), :page_post )
|
172
|
+
end
|
173
|
+
|
174
|
+
# Note: *_post is run possibly twice, once for both base content
|
175
|
+
# map and referer map.
|
176
|
+
def page_post
|
177
|
+
[ UHashMDCSetter.new,
|
178
|
+
barc_writer, # Not run in 302 referer case, since no SOURCE.
|
179
|
+
Prioritizer.new( "page-post",
|
180
|
+
:constant => 0,
|
181
|
+
:min_next => ( 30 * 60.0 ),
|
182
|
+
:visiting_now => true ),
|
183
|
+
last_visit_setter ]
|
184
|
+
end
|
185
|
+
|
186
|
+
def page_update_keys
|
187
|
+
[ :uhash, :host, :url, :type,
|
188
|
+
:ref_pub_date, :pub_date,
|
189
|
+
:priority, :last_visit, :next_visit_after,
|
190
|
+
:status, :etag, :reason, :referer, :referent,
|
191
|
+
:cache_file, :cache_file_offset, :simhash ]
|
192
|
+
end
|
193
|
+
|
194
|
+
def last_visit_setter
|
195
|
+
Copier.new( *keys( :visit_start, :last_visit ) )
|
196
|
+
end
|
197
|
+
|
198
|
+
end
|
199
|
+
|
200
|
+
end
|
201
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#--
|
3
|
+
# Copyright (c) 2008-2011 David Kellum
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
6
|
+
# may not use this file except in compliance with the License. You may
|
7
|
+
# obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
14
|
+
# implied. See the License for the specific language governing
|
15
|
+
# permissions and limitations under the License.
|
16
|
+
#++
|
17
|
+
|
18
|
+
module Iudex
|
19
|
+
module Worker
|
20
|
+
|
21
|
+
class Prioritizer < Iudex::Filter::FilterBase
|
22
|
+
include Math
|
23
|
+
|
24
|
+
attr_accessor :constant
|
25
|
+
attr_accessor :impedance
|
26
|
+
attr_accessor :min_next
|
27
|
+
attr_accessor :min_next_unmodified
|
28
|
+
attr_accessor :factors
|
29
|
+
attr_accessor :visiting_now
|
30
|
+
|
31
|
+
WWW_BEGINS = Time.utc( 1991, "aug", 6, 20,0,0 ) # WWW begins
|
32
|
+
MINUTE = 60.0
|
33
|
+
HOUR = 60.0 * 60.0
|
34
|
+
|
35
|
+
def initialize( name, opts = {} )
|
36
|
+
@name = name
|
37
|
+
|
38
|
+
@constant = 0.0
|
39
|
+
@impedance = 2.0
|
40
|
+
@min_next_unmodified = 5 * MINUTE
|
41
|
+
@min_next = 10 * MINUTE
|
42
|
+
@visiting_now = false
|
43
|
+
|
44
|
+
@factors = [ [ 30.0, :ref_change_rate ],
|
45
|
+
[ -1.0, :log_pub_age ] ]
|
46
|
+
|
47
|
+
@log = RJack::SLF4J[ self.class ]
|
48
|
+
|
49
|
+
opts.each { |k,v| send( k.to_s + '=', v ) }
|
50
|
+
yield self if block_given?
|
51
|
+
|
52
|
+
@min_next_unmodified = [ @min_next_unmodified, @min_next ].min
|
53
|
+
@constant = @constant.to_f
|
54
|
+
end
|
55
|
+
|
56
|
+
def describe
|
57
|
+
[ @name, @constant, @min_next ]
|
58
|
+
end
|
59
|
+
|
60
|
+
def filter( map )
|
61
|
+
|
62
|
+
map.priority, delta = adjust( map, map.priority )
|
63
|
+
|
64
|
+
map.next_visit_after = ( as_time( map.visit_start ) + delta if delta )
|
65
|
+
|
66
|
+
true
|
67
|
+
end
|
68
|
+
|
69
|
+
# Compute the blended priority and the delay before the next visit.
#
# map::      content map; last_visit and status are read here, and each
#            configured factor method is called with it
# priority:: previous priority, or nil (treated as 0.0)
# delta::    ignored, it is always recomputed below; retained so that
#            existing three-argument callers keep working
#
# The new priority is @constant plus the sum of weight * factor over
# @factors, blended with the previous priority using @impedance as the
# weight of the old value.
#
# Returns [ priority, delta ], with delta in seconds.
def adjust( map, priority, delta = 0.0 )

  old_priority = priority

  # The memo of contributing terms is only collected when debug logging
  # is enabled at this point; it stays nil otherwise.
  memo = ( ( ( @constant != 0.0 ) ? [ @constant ] : [] ) if @log.debug? )

  new_priority = @factors.inject( @constant ) do | p, (w,func)|
    comp = ( w * send( func, map ) )
    ( memo << "%.1f:%s" % [ comp.to_f, func ] ) if memo && comp != 0.0
    p + comp
  end

  #FIXME: new_priority = [ 0.0, new_priority ].max

  priority = ( ( ( priority || 0.0 ) * @impedance + new_priority ) /
               ( @impedance + 1 ) )

  if map.last_visit || visiting_now
    # Already visited (or being visited now): wait the minimum interval,
    # using the dedicated interval for 304 Not Modified responses.
    delta = ( map.status == 304 ) ? @min_next_unmodified : @min_next
  else
    delta = 0.0
  end

  # Guard on memo rather than relying on the logger: if debug was off when
  # memo was prepared but the logger evaluates the block anyway (e.g. the
  # level changed in between), memo.join would fail on nil.
  if memo
    @log.debug do
      memo.join( ' + ' ) +
        ( " :: %.1f -> %.1f = %.1f in %.1fs" %
          ( [ old_priority, new_priority,
              priority, delta ].map { |f| f.to_f } ) )
    end
  end

  [ priority, delta ]
end
|
100
|
+
|
101
|
+
def log_pub_age( map )
|
102
|
+
diff = sdiff( ( map.pub_date || WWW_BEGINS ), map.visit_start ) / MINUTE
|
103
|
+
diff = 1.0 / MINUTE if diff < 1.0 / MINUTE
|
104
|
+
( log( diff ) - log( 1.0 / MINUTE ) )
|
105
|
+
end
|
106
|
+
|
107
|
+
# FIXME: Useful?
|
108
|
+
# def ref_pub_age( map )
|
109
|
+
# map.visit_start - ( map.ref_pub_date || WWW_BEGINS )
|
110
|
+
# end
|
111
|
+
|
112
|
+
# References per hour, with updates rated at 1/4 a new reference.
|
113
|
+
def ref_change_rate( map )
|
114
|
+
s = since( map )
|
115
|
+
if s.nil? || s == 0.0
|
116
|
+
0.0
|
117
|
+
else
|
118
|
+
( ( ( map.new_references || 0.0 ) +
|
119
|
+
( map.updated_references || 0.0 ) / 4.0 ) /
|
120
|
+
s *
|
121
|
+
HOUR )
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
def since( map )
|
126
|
+
sdiff( map.last_visit || oldest( map.references ),
|
127
|
+
map.visit_start )
|
128
|
+
end
|
129
|
+
|
130
|
+
def oldest( refs )
|
131
|
+
( refs.map { |r| r.pub_date }.compact.min ) if refs
|
132
|
+
end
|
133
|
+
|
134
|
+
def sdiff( prev, now )
|
135
|
+
diff = as_time( now ) - as_time( prev || WWW_BEGINS )
|
136
|
+
( diff < 0.0 ) ? 0.0 : diff
|
137
|
+
end
|
138
|
+
|
139
|
+
# FIXME: Generalize?
|
140
|
+
def as_time( torj )
|
141
|
+
if torj.is_a?( Time )
|
142
|
+
torj
|
143
|
+
else
|
144
|
+
ms = torj.time
|
145
|
+
Time.at( ms / 1_000, ( ms % 1_000 ) * 1_000 ) # s, µs
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
150
|
+
|
151
|
+
end
|
152
|
+
end
|
data/lib/iudex-worker.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
|
19
|
+
require 'iudex-worker/base'
|
20
|
+
require 'iudex-worker/agent'
|
data/test/setup.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
|
20
|
+
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
21
|
+
|
22
|
+
require 'rubygems'
|
23
|
+
require 'rjack-logback'
|
24
|
+
RJack::Logback.config_console( :stderr => true )
|
25
|
+
|
26
|
+
require 'minitest/unit'
|
27
|
+
require 'minitest/autorun'
|
28
|
+
|
29
|
+
# Make test output logging compatible: no partial lines.
|
30
|
+
class TestOut
|
31
|
+
def print( *a ); $stdout.puts( *a ); end
|
32
|
+
def puts( *a ); $stdout.puts( *a ); end
|
33
|
+
end
|
34
|
+
MiniTest::Unit.output = TestOut.new
|
data/test/test_agent.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-worker'
|
23
|
+
|
24
|
+
class TestAgent < MiniTest::Unit::TestCase
|
25
|
+
include Iudex::Worker
|
26
|
+
include RJack
|
27
|
+
|
28
|
+
def setup
|
29
|
+
Logback[ 'iudex.worker.FilterChainFactory' ].level = Logback::WARN
|
30
|
+
end
|
31
|
+
|
32
|
+
def teardown
|
33
|
+
Logback[ 'iudex.worker.FilterChainFactory' ].level = nil
|
34
|
+
Hooker.send( :clear )
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_agent_default
|
38
|
+
assert_agent
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_agent_with_sample_config
|
42
|
+
# Test out the sample config
|
43
|
+
Hooker.load_file( File.join( File.dirname( __FILE__ ),
|
44
|
+
'..', 'config', 'config.rb' ) )
|
45
|
+
|
46
|
+
assert_agent
|
47
|
+
end
|
48
|
+
|
49
|
+
def assert_agent
|
50
|
+
|
51
|
+
# Stub VisitExecutor.start to allow agent.run to return early.
|
52
|
+
Hooker.add( [ :iudex, :visit_executor ] ) do |vexec|
|
53
|
+
def vexec.start
|
54
|
+
#disable
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
agent = Agent.new
|
59
|
+
agent.run
|
60
|
+
pass
|
61
|
+
|
62
|
+
Hooker.check_not_applied do |*args|
|
63
|
+
flunk( "Hooks not applied: " + args.inspect )
|
64
|
+
end
|
65
|
+
pass
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
|
23
|
+
|
24
|
+
RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
|
25
|
+
|
26
|
+
require 'iudex-httpclient-3'
|
27
|
+
|
28
|
+
require 'iudex-da'
|
29
|
+
require 'iudex-da/pool_data_source_factory'
|
30
|
+
|
31
|
+
require 'iudex-worker'
|
32
|
+
require 'iudex-worker/filter_chain_factory'
|
33
|
+
|
34
|
+
class TestFilterChainFactory < MiniTest::Unit::TestCase
|
35
|
+
include Iudex
|
36
|
+
include Gravitext::HTMap
|
37
|
+
|
38
|
+
# Run a FEED content map through a complete worker filter chain twice,
# asserting that the chain accepts it both times.
#
# The HTTP manager and the data source pool are released in ensure
# clauses so that a failed assertion or a filter error does not leave
# them open for the rest of the test run.
def test_filter
  fcf = Worker::FilterChainFactory.new( "test" )

  mgr = HTTPClient3.create_manager
  mgr.start
  begin
    fcf.http_client = HTTPClient3::HTTPClient3.new( mgr.client )

    dsf = DA::PoolDataSourceFactory.new
    fcf.data_source = dsf.create
    begin
      fcf.filter do |chain|
        # Run twice (assume new the first time, updates the second).
        2.times do
          content = UniMap.new
          content.url = Core::VisitURL.normalize( "http://gravitext.com/atom.xml" )
          content.type = "FEED"
          content.priority = 1.0
          assert( chain.filter( content ) )
        end
      end
    ensure
      # Only reached once dsf.create succeeded.
      dsf.close
    end
  ensure
    mgr.shutdown
  end
end
|
62
|
+
|
63
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
|
23
|
+
|
24
|
+
RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
|
25
|
+
|
26
|
+
require 'iudex-worker'
|
27
|
+
require 'iudex-worker/prioritizer'
|
28
|
+
|
29
|
+
# Unit tests for Iudex::Worker::Prioritizer.
class TestPrioritizer < MiniTest::Unit::TestCase
  include Iudex::Worker
  include Gravitext::HTMap
  JDate = Java::java.util.Date

  UniMap.define_accessors

  # A constant of 3.2 with no factors and zero impedance should yield
  # a priority of about 3.2 and leave next_visit_after equal to
  # visit_start.
  def test_identity
    content = new_map
    prio = Prioritizer.new( "test",
                            :constant  => 3.2,
                            :factors   => [],
                            :impedance => 0 )

    assert( prio.filter( content ) )
    assert_equal_fuzzy( 3.2, content.priority )
    assert_equal( content.visit_start, content.next_visit_after )
  end

  # With :visiting_now set, next_visit_after should land about
  # min_next seconds after visit_start.
  def test_visiting_now
    content = new_map
    prio = Prioritizer.new( "test", :visiting_now => true )

    assert( prio.filter( content ) )

    expected_secs = content.visit_start.time/1000.0 + prio.min_next
    actual_secs   = content.next_visit_after.time/1000.0
    assert_equal_fuzzy( expected_secs, actual_secs )
  end

  # The oldest pub_date among the references is found; one reference
  # carries a nil pub_date.
  def test_oldest
    content = new_map

    oldest = Time.utc( 2010, "jul", 17, 18,0,0 )
    pub_dates = [ Time.utc( 2010, "jul", 17, 19,0,0 ),
                  oldest,
                  Time.utc( 2010, "jul", 17, 20,0,0 ),
                  nil ]

    content.references = pub_dates.map do |pub_date|
      reference = UniMap.new
      reference.pub_date = pub_date
      reference
    end

    prio = prioritizer
    found = prio.oldest( content.references )
    assert_equal( oldest, prio.as_time( found ) )
  end

  # A map last visited one minute before visit_start reports 60.0.
  def test_since_last
    content = one_minute_last_map
    assert_equal( 60.0, prioritizer.since( content ) )
  end

  # One new and four updated references on the one-minute map are
  # expected to give a rate of about 120.
  def test_ref_change_rate
    content = one_minute_last_map
    content.new_references = 1
    content.updated_references = 4

    rate = prioritizer.ref_change_rate( content )
    assert_equal_fuzzy( 120, rate )
  end

  # Returns a map whose last_visit is exactly 60 seconds before its
  # visit_start.
  def one_minute_last_map
    content = new_map
    started = JDate.new
    content.visit_start = started
    content.last_visit = JDate.new( started.time - ( 60 * 1_000 ) )
    content
  end

  # Asserts that l and r differ by less than 0.1.
  def assert_equal_fuzzy( l, r )
    delta = ( l - r ).abs
    assert( delta < 0.1, "#{l} ~!= #{r}" )
  end

  # Returns a fresh UniMap with visit_start set to the current time.
  def new_map
    content = UniMap.new
    content.visit_start = JDate.new
    content
  end

  # Returns a Prioritizer named "test" with no options supplied.
  def prioritizer
    Prioritizer.new( "test" )
  end

end
|
metadata
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-worker
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-core
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rjack-logback
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ~>
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: "1.0"
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: iudex-da
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ~>
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.0.0
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: iudex-rome
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ~>
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: 1.0.0
|
58
|
+
type: :runtime
|
59
|
+
version_requirements: *id004
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: iudex-html
|
62
|
+
prerelease: false
|
63
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.0.0
|
69
|
+
type: :runtime
|
70
|
+
version_requirements: *id005
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: iudex-simhash
|
73
|
+
prerelease: false
|
74
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ~>
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 1.0.0
|
80
|
+
type: :runtime
|
81
|
+
version_requirements: *id006
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: iudex-httpclient-3
|
84
|
+
prerelease: false
|
85
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ~>
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 1.0.0
|
91
|
+
type: :runtime
|
92
|
+
version_requirements: *id007
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: minitest
|
95
|
+
prerelease: false
|
96
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.7.1
|
102
|
+
- - <
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: "2.1"
|
105
|
+
type: :development
|
106
|
+
version_requirements: *id008
|
107
|
+
- !ruby/object:Gem::Dependency
|
108
|
+
name: rjack-tarpit
|
109
|
+
prerelease: false
|
110
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ~>
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: 1.3.0
|
116
|
+
type: :development
|
117
|
+
version_requirements: *id009
|
118
|
+
description: |-
|
119
|
+
Iudex is a general purpose web crawler and feed processor in
|
120
|
+
ruby/java. The iudex-worker gem provides a worker daemon for feed/page
|
121
|
+
processing.
|
122
|
+
email:
|
123
|
+
- dek-oss@gravitext.com
|
124
|
+
executables:
|
125
|
+
- iudex-worker-fg
|
126
|
+
extensions: []
|
127
|
+
|
128
|
+
extra_rdoc_files:
|
129
|
+
- Manifest.txt
|
130
|
+
- History.rdoc
|
131
|
+
- README.rdoc
|
132
|
+
files:
|
133
|
+
- History.rdoc
|
134
|
+
- Manifest.txt
|
135
|
+
- README.rdoc
|
136
|
+
- Rakefile
|
137
|
+
- bin/iudex-worker-fg
|
138
|
+
- config/config.rb
|
139
|
+
- init/iudex-worker
|
140
|
+
- lib/iudex-worker/base.rb
|
141
|
+
- lib/iudex-worker.rb
|
142
|
+
- lib/iudex-worker/agent.rb
|
143
|
+
- lib/iudex-worker/fetch_helper.rb
|
144
|
+
- lib/iudex-worker/filter_chain_factory.rb
|
145
|
+
- lib/iudex-worker/prioritizer.rb
|
146
|
+
- test/setup.rb
|
147
|
+
- test/test_agent.rb
|
148
|
+
- test/test_filter_chain_factory.rb
|
149
|
+
- test/test_prioritizer.rb
|
150
|
+
has_rdoc: true
|
151
|
+
homepage: http://github.com/dekellum/iudex
|
152
|
+
licenses: []
|
153
|
+
|
154
|
+
post_install_message:
|
155
|
+
rdoc_options:
|
156
|
+
- --main
|
157
|
+
- README.rdoc
|
158
|
+
require_paths:
|
159
|
+
- lib
|
160
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
162
|
+
requirements:
|
163
|
+
- - ">="
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: "0"
|
166
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
|
+
none: false
|
168
|
+
requirements:
|
169
|
+
- - ">="
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: "0"
|
172
|
+
requirements: []
|
173
|
+
|
174
|
+
rubyforge_project: iudex-worker
|
175
|
+
rubygems_version: 1.5.1
|
176
|
+
signing_key:
|
177
|
+
specification_version: 3
|
178
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
179
|
+
test_files:
|
180
|
+
- test/test_agent.rb
|
181
|
+
- test/test_filter_chain_factory.rb
|
182
|
+
- test/test_prioritizer.rb
|