iudex-worker 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +17 -0
- data/README.rdoc +25 -0
- data/Rakefile +41 -0
- data/bin/iudex-worker-fg +50 -0
- data/config/config.rb +46 -0
- data/init/iudex-worker +46 -0
- data/lib/iudex-worker/agent.rb +87 -0
- data/lib/iudex-worker/base.rb +21 -0
- data/lib/iudex-worker/fetch_helper.rb +79 -0
- data/lib/iudex-worker/filter_chain_factory.rb +201 -0
- data/lib/iudex-worker/prioritizer.rb +152 -0
- data/lib/iudex-worker.rb +20 -0
- data/test/setup.rb +34 -0
- data/test/test_agent.rb +69 -0
- data/test/test_filter_chain_factory.rb +63 -0
- data/test/test_prioritizer.rb +105 -0
- metadata +182 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
bin/iudex-worker-fg
|
6
|
+
config/config.rb
|
7
|
+
init/iudex-worker
|
8
|
+
lib/iudex-worker/base.rb
|
9
|
+
lib/iudex-worker.rb
|
10
|
+
lib/iudex-worker/agent.rb
|
11
|
+
lib/iudex-worker/fetch_helper.rb
|
12
|
+
lib/iudex-worker/filter_chain_factory.rb
|
13
|
+
lib/iudex-worker/prioritizer.rb
|
14
|
+
test/setup.rb
|
15
|
+
test/test_agent.rb
|
16
|
+
test/test_filter_chain_factory.rb
|
17
|
+
test/test_prioritizer.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= iudex-worker
|
2
|
+
|
3
|
+
* http://github.com/dekellum/iudex
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Iudex is a general purpose web crawler and feed processor in
|
8
|
+
ruby/java. The iudex-worker gem provides a worker daemon for feed/page
|
9
|
+
processing.
|
10
|
+
|
11
|
+
== License
|
12
|
+
|
13
|
+
Copyright (c) 2008-2011 David Kellum
|
14
|
+
|
15
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
16
|
+
may not use this file except in compliance with the License. You may
|
17
|
+
obtain a copy of the License at
|
18
|
+
|
19
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
20
|
+
|
21
|
+
Unless required by applicable law or agreed to in writing, software
|
22
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
23
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
24
|
+
implied. See the License for the specific language governing
|
25
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
$LOAD_PATH << './lib'
|
4
|
+
require 'iudex-worker/base'
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
gem 'rjack-tarpit', '~> 1.2'
|
8
|
+
require 'rjack-tarpit'
|
9
|
+
|
10
|
+
t = RJack::TarPit.new( 'iudex-worker', Iudex::Worker::VERSION, :java_platform )
|
11
|
+
|
12
|
+
t.specify do |h|
|
13
|
+
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
14
|
+
|
15
|
+
h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
|
16
|
+
[ 'rjack-logback', '~> 1.0' ],
|
17
|
+
[ 'iudex-da', '~> 1.0.0' ],
|
18
|
+
[ 'iudex-rome', '~> 1.0.0' ],
|
19
|
+
[ 'iudex-html', '~> 1.0.0' ],
|
20
|
+
[ 'iudex-simhash', '~> 1.0.0' ],
|
21
|
+
[ 'iudex-httpclient-3', '~> 1.0.0' ] ]
|
22
|
+
|
23
|
+
h.testlib = :minitest
|
24
|
+
h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ] ]
|
25
|
+
end
|
26
|
+
|
27
|
+
task :chk_hist_vers do
|
28
|
+
t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
|
29
|
+
end
|
30
|
+
task :chk_init_v do
|
31
|
+
t.test_line_match( 'init/iudex-worker', /^gem.+#{t.name}/, /= #{t.version}/ )
|
32
|
+
end
|
33
|
+
task :chk_hist_date do
|
34
|
+
t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
|
35
|
+
end
|
36
|
+
|
37
|
+
task :gem => [ :chk_hist_vers, :chk_init_v ]
|
38
|
+
task :tag => [ :chk_hist_vers, :chk_init_v, :chk_hist_date ]
|
39
|
+
task :push => [ :chk_hist_date ]
|
40
|
+
|
41
|
+
t.define_tasks
|
data/bin/iudex-worker-fg
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
21
|
+
|
22
|
+
require 'optparse'
|
23
|
+
|
24
|
+
module IudexBinScript
|
25
|
+
|
26
|
+
require 'rubygems'
|
27
|
+
require 'rjack-logback'
|
28
|
+
|
29
|
+
include RJack
|
30
|
+
Logback.config_console( :mdc => "uhash", :thread => true )
|
31
|
+
|
32
|
+
require 'iudex-worker'
|
33
|
+
include Iudex
|
34
|
+
|
35
|
+
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
36
|
+
|
37
|
+
OptionParser.new do |opts|
|
38
|
+
opts.on( "-v", "--version", "Display version" ) do |file|
|
39
|
+
puts "iudex-worker: #{ Worker::VERSION }"
|
40
|
+
exit 1
|
41
|
+
end
|
42
|
+
opts.on( "-d", "--debug", "Enable verbose DEBUG logging" ) do |file|
|
43
|
+
Logback[ 'iudex' ].level = Logback::DEBUG
|
44
|
+
end
|
45
|
+
Hooker.register_config( opts )
|
46
|
+
end.parse!
|
47
|
+
|
48
|
+
Worker::Agent.new.run
|
49
|
+
|
50
|
+
end
|
data/config/config.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
|
2
|
+
RJack::Logback[ 'iudex.filter.core.FilterChain.agent' ].level =
|
3
|
+
RJack::Logback::DEBUG
|
4
|
+
|
5
|
+
Iudex.configure do |c|
|
6
|
+
|
7
|
+
threads = 3
|
8
|
+
|
9
|
+
c.setup_connect_props do
|
10
|
+
{ :ds_pool => { :max_active => threads / 3 * 2,
|
11
|
+
:max_idle => threads / 3 },
|
12
|
+
:loglevel => 1 }
|
13
|
+
end
|
14
|
+
|
15
|
+
c.setup_http_client_3 do |mgr|
|
16
|
+
mgr.manager_params.max_total_connections = threads * 10
|
17
|
+
end
|
18
|
+
|
19
|
+
c.setup_visit_executor do |vx|
|
20
|
+
vx.max_threads = threads
|
21
|
+
vx.min_host_delay = 100 #ms
|
22
|
+
end
|
23
|
+
|
24
|
+
c.setup_work_poller do |wp|
|
25
|
+
wp.min_order_remaining_ratio = 0.30
|
26
|
+
wp.max_check_interval = 100 #ms
|
27
|
+
wp.min_poll_interval = 2_000 #ms
|
28
|
+
end
|
29
|
+
|
30
|
+
c.setup_filter_factory do |ff|
|
31
|
+
|
32
|
+
def ff.barc_writer
|
33
|
+
bw = super
|
34
|
+
bw.do_compress = false
|
35
|
+
bw
|
36
|
+
end
|
37
|
+
|
38
|
+
def ff.barc_directory
|
39
|
+
bdir = super
|
40
|
+
bdir.target_length = 2 * ( 1024 ** 2 )
|
41
|
+
bdir
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
data/init/iudex-worker
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#. hashdot.profile += daemon
|
4
|
+
#. hashdot.pid_file = ./iudex-worker.pid
|
5
|
+
#. hashdot.io_redirect.file = ./iudex-worker.log
|
6
|
+
#. hashdot.vm.options += -Xmx1g
|
7
|
+
#. hashdot.vm.options += -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled
|
8
|
+
|
9
|
+
#--
|
10
|
+
# Copyright (c) 2008-2011 David Kellum
|
11
|
+
#
|
12
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
13
|
+
# may not use this file except in compliance with the License. You
|
14
|
+
# may obtain a copy of the License at
|
15
|
+
#
|
16
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
17
|
+
#
|
18
|
+
# Unless required by applicable law or agreed to in writing, software
|
19
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
20
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
21
|
+
# implied. See the License for the specific language governing
|
22
|
+
# permissions and limitations under the License.
|
23
|
+
#++
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
|
27
|
+
gem( "iudex-worker", "= 1.0.0" )
|
28
|
+
|
29
|
+
module IudexInitScript
|
30
|
+
|
31
|
+
require 'rjack-logback'
|
32
|
+
include RJack
|
33
|
+
Logback.config_console( :full => true, :thread => true, :mdc => "uhash" )
|
34
|
+
|
35
|
+
require 'iudex-worker'
|
36
|
+
include Iudex
|
37
|
+
|
38
|
+
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
39
|
+
|
40
|
+
if File.exist?( './config.rb' )
|
41
|
+
Hooker.load_file( './config.rb' )
|
42
|
+
end
|
43
|
+
|
44
|
+
Worker::Agent.new.run
|
45
|
+
|
46
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-da'
|
18
|
+
require 'iudex-da/key_helper'
|
19
|
+
require 'iudex-da/pool_data_source_factory'
|
20
|
+
|
21
|
+
require 'iudex-httpclient-3'
|
22
|
+
|
23
|
+
require 'iudex-worker'
|
24
|
+
require 'iudex-worker/filter_chain_factory'
|
25
|
+
|
26
|
+
require 'hooker'
|
27
|
+
|
28
|
+
module Iudex
|
29
|
+
module Worker
|
30
|
+
|
31
|
+
class Agent
|
32
|
+
include Iudex::DA
|
33
|
+
include Iudex::Filter::KeyHelper
|
34
|
+
include Iudex::Core
|
35
|
+
include Iudex::Worker
|
36
|
+
include Gravitext::HTMap
|
37
|
+
|
38
|
+
def initialize
|
39
|
+
Hooker.apply( [ :iudex, :worker ], self )
|
40
|
+
end
|
41
|
+
|
42
|
+
def poll_keys
|
43
|
+
[ :url, :type, :priority, :next_visit_after, :last_visit, :etag ]
|
44
|
+
end
|
45
|
+
|
46
|
+
# Note: this can be (and is) overridden in derived classes to supply a different factory.
|
47
|
+
def filter_chain_factory
|
48
|
+
FilterChainFactory.new( 'agent' )
|
49
|
+
end
|
50
|
+
|
51
|
+
def run
|
52
|
+
Hooker.with( :iudex ) do
|
53
|
+
dsf = PoolDataSourceFactory.new
|
54
|
+
data_source = dsf.create
|
55
|
+
|
56
|
+
cmapper = ContentMapper.new( keys( poll_keys ) )
|
57
|
+
wpoller = WorkPoller.new( data_source, cmapper )
|
58
|
+
Hooker.apply( :work_poller, wpoller )
|
59
|
+
|
60
|
+
mgr = HTTPClient3.create_manager
|
61
|
+
mgr.start
|
62
|
+
http_client = HTTPClient3::HTTPClient3.new( mgr.client )
|
63
|
+
|
64
|
+
fcf = filter_chain_factory
|
65
|
+
fcf.http_client = http_client
|
66
|
+
fcf.data_source = data_source
|
67
|
+
|
68
|
+
Hooker.apply( :filter_factory, fcf )
|
69
|
+
|
70
|
+
fcf.filter do |chain|
|
71
|
+
vexec = VisitExecutor.new( chain, wpoller )
|
72
|
+
Hooker.apply( :visit_executor, vexec )
|
73
|
+
|
74
|
+
Hooker.log_not_applied # All hooks should be used by now
|
75
|
+
|
76
|
+
vexec.start
|
77
|
+
vexec.join #Run until interrupted
|
78
|
+
end # fcf closes
|
79
|
+
|
80
|
+
mgr.shutdown
|
81
|
+
dsf.close
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
|
18
|
+
module Worker
|
19
|
+
VERSION = '1.0.0'
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-worker'
|
18
|
+
|
19
|
+
module Iudex
|
20
|
+
module Worker
|
21
|
+
|
22
|
+
module FetchHelper
|
23
|
+
include Iudex::HTTP
|
24
|
+
include Iudex::Core::Filters
|
25
|
+
|
26
|
+
def create_content_fetcher( accept_types, receiver_sym )
|
27
|
+
cf = ContentFetcher.new( http_client, create_chain( receiver_sym ) )
|
28
|
+
|
29
|
+
alist = accept_list( accept_types )
|
30
|
+
cf.accepted_content_types = alist unless alist.include?( '*/*' )
|
31
|
+
|
32
|
+
headers = [ [ 'User-Agent', http_user_agent ],
|
33
|
+
[ 'Accept', accept_header( accept_types ) ] ]
|
34
|
+
|
35
|
+
cf.request_headers = headers.map { |kv| Header.new( *kv ) }
|
36
|
+
|
37
|
+
cf
|
38
|
+
end
|
39
|
+
|
40
|
+
def http_user_agent
|
41
|
+
( "Mozilla/5.0 (compatible; " +
|
42
|
+
"Iudex #{Iudex::Worker::VERSION}; " +
|
43
|
+
"+http://gravitext.com/iudex)" )
|
44
|
+
end
|
45
|
+
|
46
|
+
def feed_mime_types
|
47
|
+
# List of accepted mime types, grouped and ordered in descending
|
48
|
+
# order of preference.
|
49
|
+
[ %w[ application/atom+xml application/rss+xml ],
|
50
|
+
%w[ application/rdf+xml application/xml ],
|
51
|
+
%w[ text/xml ],
|
52
|
+
%w[ text/* ],
|
53
|
+
%w[ */* ] ]
|
54
|
+
end
|
55
|
+
|
56
|
+
def page_mime_types
|
57
|
+
[ %w[ application/xhtml+xml text/html ],
|
58
|
+
%w[ application/xml ],
|
59
|
+
%w[ text/* ] ]
|
60
|
+
end
|
61
|
+
|
62
|
+
def accept_header( types )
|
63
|
+
q = 1.0
|
64
|
+
ts = types.map do |tgrp|
|
65
|
+
tgrp = tgrp.map { |m| "#{m};q=#{q}" } if q < 1.0
|
66
|
+
q -= 0.1
|
67
|
+
tgrp
|
68
|
+
end
|
69
|
+
ts.flatten.join( ',' )
|
70
|
+
end
|
71
|
+
|
72
|
+
def accept_list( types )
|
73
|
+
types.flatten
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-filter'
|
18
|
+
require 'iudex-filter/filter_chain_factory'
|
19
|
+
|
20
|
+
require 'iudex-barc'
|
21
|
+
|
22
|
+
require 'iudex-core'
|
23
|
+
|
24
|
+
require 'iudex-da'
|
25
|
+
require 'iudex-da/factory_helper'
|
26
|
+
|
27
|
+
require 'iudex-rome'
|
28
|
+
|
29
|
+
require 'iudex-html'
|
30
|
+
require 'iudex-html/factory_helper'
|
31
|
+
|
32
|
+
require 'iudex-simhash'
|
33
|
+
require 'iudex-simhash/factory_helper'
|
34
|
+
|
35
|
+
require 'iudex-worker'
|
36
|
+
require 'iudex-worker/fetch_helper'
|
37
|
+
require 'iudex-worker/prioritizer'
|
38
|
+
|
39
|
+
module Iudex
|
40
|
+
module Worker
|
41
|
+
|
42
|
+
class FilterChainFactory < Iudex::Filter::Core::FilterChainFactory
|
43
|
+
include Iudex::Filter::Core
|
44
|
+
include Iudex::BARC
|
45
|
+
include Iudex::Core
|
46
|
+
include Iudex::Core::Filters
|
47
|
+
include Iudex::ROME
|
48
|
+
|
49
|
+
include Iudex::DA::Filters::FactoryHelper
|
50
|
+
include Iudex::HTML::Filters::FactoryHelper
|
51
|
+
include Iudex::SimHash::Filters::FactoryHelper
|
52
|
+
include FetchHelper
|
53
|
+
|
54
|
+
attr_accessor :http_client
|
55
|
+
attr_accessor :data_source
|
56
|
+
|
57
|
+
def initialize( name )
|
58
|
+
super
|
59
|
+
setup_reporters
|
60
|
+
end
|
61
|
+
|
62
|
+
def setup_reporters
|
63
|
+
add_summary_reporter
|
64
|
+
add_by_filter_reporter
|
65
|
+
end
|
66
|
+
|
67
|
+
def filters
|
68
|
+
[ UHashMDCSetter.new,
|
69
|
+
DefaultFilter.new,
|
70
|
+
super,
|
71
|
+
type_switch ].flatten
|
72
|
+
end
|
73
|
+
|
74
|
+
def listeners
|
75
|
+
super + [ MDCUnsetter.new( "uhash" ) ]
|
76
|
+
end
|
77
|
+
|
78
|
+
def type_map
|
79
|
+
{ "FEED" => feed_fetcher,
|
80
|
+
"PAGE" => page_fetcher }
|
81
|
+
end
|
82
|
+
|
83
|
+
def type_switch( tmap = type_map )
|
84
|
+
create_switch( :type.to_k, tmap )
|
85
|
+
end
|
86
|
+
|
87
|
+
def feed_fetcher
|
88
|
+
[ create_content_fetcher( feed_mime_types, :feed_receiver ) ]
|
89
|
+
end
|
90
|
+
|
91
|
+
def page_fetcher
|
92
|
+
[ create_content_fetcher( page_mime_types, :page_receiver ) ]
|
93
|
+
end
|
94
|
+
|
95
|
+
def feed_receiver
|
96
|
+
[ RomeFeedParser.new,
|
97
|
+
DefaultFilter.new,
|
98
|
+
DateChangeFilter.new( false ),
|
99
|
+
feed_updater ]
|
100
|
+
end
|
101
|
+
|
102
|
+
def feed_updater
|
103
|
+
create_update_filter( keys( feed_update_keys ),
|
104
|
+
:feed_post, :feed_ref_update, :feed_ref_new )
|
105
|
+
end
|
106
|
+
|
107
|
+
def feed_ref_new
|
108
|
+
[ UHashMDCSetter.new,
|
109
|
+
ref_common_cleanup,
|
110
|
+
Prioritizer.new( "feed-ref-new",
|
111
|
+
:constant => 50,
|
112
|
+
:min_next => 0.0 ) ].flatten
|
113
|
+
end
|
114
|
+
|
115
|
+
def feed_ref_update
|
116
|
+
[ UHashMDCSetter.new,
|
117
|
+
DateChangeFilter.new( true ),
|
118
|
+
ref_common_cleanup,
|
119
|
+
Prioritizer.new( "feed-ref-update",
|
120
|
+
:constant => 10,
|
121
|
+
:min_next => 0.0 ) ].flatten
|
122
|
+
end
|
123
|
+
|
124
|
+
# Note: *_post is possibly run twice: once for the base content
|
125
|
+
# map and once for the referer map.
|
126
|
+
def feed_post
|
127
|
+
[ UHashMDCSetter.new,
|
128
|
+
ref_common_cleanup,
|
129
|
+
Prioritizer.new( "feed-post",
|
130
|
+
:constant => 30,
|
131
|
+
:visiting_now => true ),
|
132
|
+
last_visit_setter ].flatten
|
133
|
+
end
|
134
|
+
|
135
|
+
def ref_common_cleanup
|
136
|
+
[ ref_html_filters,
|
137
|
+
TextCtrlWSFilter.new( :title.to_k ),
|
138
|
+
FutureDateFilter.new( :pub_date.to_k ) ].flatten
|
139
|
+
end
|
140
|
+
|
141
|
+
def ref_html_filters
|
142
|
+
[ html_clean_filters( :title ),
|
143
|
+
html_clean_filters( :summary ),
|
144
|
+
html_clean_filters( :content ),
|
145
|
+
html_write_filter( :summary ),
|
146
|
+
html_write_filter( :content ) ].flatten
|
147
|
+
end
|
148
|
+
|
149
|
+
def feed_update_keys
|
150
|
+
page_update_keys + [ :title, :summary, :content ]
|
151
|
+
end
|
152
|
+
|
153
|
+
def page_receiver
|
154
|
+
[ html_clean_filters( :source ),
|
155
|
+
simhash_generator,
|
156
|
+
page_updater ].flatten
|
157
|
+
end
|
158
|
+
|
159
|
+
def barc_writer
|
160
|
+
bw = BARCWriter.new( barc_directory )
|
161
|
+
bw.do_compress = true
|
162
|
+
bw
|
163
|
+
end
|
164
|
+
|
165
|
+
def barc_directory
|
166
|
+
bdir = BARCDirectory.new( Java::java.io.File.new( "./barc" ) )
|
167
|
+
bdir
|
168
|
+
end
|
169
|
+
|
170
|
+
def page_updater
|
171
|
+
create_update_filter( keys( page_update_keys ), :page_post )
|
172
|
+
end
|
173
|
+
|
174
|
+
# Note: *_post is possibly run twice: once for the base content
|
175
|
+
# map and once for the referer map.
|
176
|
+
def page_post
|
177
|
+
[ UHashMDCSetter.new,
|
178
|
+
barc_writer, # Not run in 302 referer case, since no SOURCE.
|
179
|
+
Prioritizer.new( "page-post",
|
180
|
+
:constant => 0,
|
181
|
+
:min_next => ( 30 * 60.0 ),
|
182
|
+
:visiting_now => true ),
|
183
|
+
last_visit_setter ]
|
184
|
+
end
|
185
|
+
|
186
|
+
def page_update_keys
|
187
|
+
[ :uhash, :host, :url, :type,
|
188
|
+
:ref_pub_date, :pub_date,
|
189
|
+
:priority, :last_visit, :next_visit_after,
|
190
|
+
:status, :etag, :reason, :referer, :referent,
|
191
|
+
:cache_file, :cache_file_offset, :simhash ]
|
192
|
+
end
|
193
|
+
|
194
|
+
def last_visit_setter
|
195
|
+
Copier.new( *keys( :visit_start, :last_visit ) )
|
196
|
+
end
|
197
|
+
|
198
|
+
end
|
199
|
+
|
200
|
+
end
|
201
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#--
|
3
|
+
# Copyright (c) 2008-2011 David Kellum
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
6
|
+
# may not use this file except in compliance with the License. You may
|
7
|
+
# obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
14
|
+
# implied. See the License for the specific language governing
|
15
|
+
# permissions and limitations under the License.
|
16
|
+
#++
|
17
|
+
|
18
|
+
module Iudex
|
19
|
+
module Worker
|
20
|
+
|
21
|
+
class Prioritizer < Iudex::Filter::FilterBase
|
22
|
+
include Math
|
23
|
+
|
24
|
+
attr_accessor :constant
|
25
|
+
attr_accessor :impedance
|
26
|
+
attr_accessor :min_next
|
27
|
+
attr_accessor :min_next_unmodified
|
28
|
+
attr_accessor :factors
|
29
|
+
attr_accessor :visiting_now
|
30
|
+
|
31
|
+
WWW_BEGINS = Time.utc( 1991, "aug", 6, 20,0,0 ) # WWW begins
|
32
|
+
MINUTE = 60.0
|
33
|
+
HOUR = 60.0 * 60.0
|
34
|
+
|
35
|
+
def initialize( name, opts = {} )
|
36
|
+
@name = name
|
37
|
+
|
38
|
+
@constant = 0.0
|
39
|
+
@impedance = 2.0
|
40
|
+
@min_next_unmodified = 5 * MINUTE
|
41
|
+
@min_next = 10 * MINUTE
|
42
|
+
@visiting_now = false
|
43
|
+
|
44
|
+
@factors = [ [ 30.0, :ref_change_rate ],
|
45
|
+
[ -1.0, :log_pub_age ] ]
|
46
|
+
|
47
|
+
@log = RJack::SLF4J[ self.class ]
|
48
|
+
|
49
|
+
opts.each { |k,v| send( k.to_s + '=', v ) }
|
50
|
+
yield self if block_given?
|
51
|
+
|
52
|
+
@min_next_unmodified = [ @min_next_unmodified, @min_next ].min
|
53
|
+
@constant = @constant.to_f
|
54
|
+
end
|
55
|
+
|
56
|
+
def describe
|
57
|
+
[ @name, @constant, @min_next ]
|
58
|
+
end
|
59
|
+
|
60
|
+
def filter( map )
|
61
|
+
|
62
|
+
map.priority, delta = adjust( map, map.priority )
|
63
|
+
|
64
|
+
map.next_visit_after = ( as_time( map.visit_start ) + delta if delta )
|
65
|
+
|
66
|
+
true
|
67
|
+
end
|
68
|
+
|
69
|
+
def adjust( map, priority, delta = 0.0 )
|
70
|
+
|
71
|
+
old_priority = priority
|
72
|
+
memo = ( ( ( @constant != 0.0 ) ? [ @constant ] : [] ) if @log.debug? )
|
73
|
+
|
74
|
+
new_priority = @factors.inject( @constant ) do | p, (w,func)|
|
75
|
+
comp = ( w * send( func, map ) )
|
76
|
+
( memo << "%.1f:%s" % [ comp.to_f, func ] ) if memo && comp != 0.0
|
77
|
+
p + comp
|
78
|
+
end
|
79
|
+
|
80
|
+
#FIXME: new_priority = [ 0.0, new_priority ].max
|
81
|
+
|
82
|
+
priority = ( ( ( priority || 0.0 ) * @impedance + new_priority ) /
|
83
|
+
( @impedance + 1 ) )
|
84
|
+
|
85
|
+
if map.last_visit || visiting_now
|
86
|
+
delta = ( map.status == 304 ) ? @min_next_unmodified : @min_next
|
87
|
+
else
|
88
|
+
delta = 0.0
|
89
|
+
end
|
90
|
+
|
91
|
+
@log.debug do
|
92
|
+
memo.join( ' + ' ) +
|
93
|
+
( " :: %.1f -> %.1f = %.1f in %.1fs" %
|
94
|
+
( [ old_priority, new_priority,
|
95
|
+
priority, delta ].map { |f| f.to_f } ) )
|
96
|
+
end
|
97
|
+
|
98
|
+
[ priority, delta ]
|
99
|
+
end
|
100
|
+
|
101
|
+
def log_pub_age( map )
|
102
|
+
diff = sdiff( ( map.pub_date || WWW_BEGINS ), map.visit_start ) / MINUTE
|
103
|
+
diff = 1.0 / MINUTE if diff < 1.0 / MINUTE
|
104
|
+
( log( diff ) - log( 1.0 / MINUTE ) )
|
105
|
+
end
|
106
|
+
|
107
|
+
# FIXME: Useful?
|
108
|
+
# def ref_pub_age( map )
|
109
|
+
# map.visit_start - ( map.ref_pub_date || WWW_BEGINS )
|
110
|
+
# end
|
111
|
+
|
112
|
+
# References per hour, with updates rated at 1/4 a new reference.
|
113
|
+
def ref_change_rate( map )
|
114
|
+
s = since( map )
|
115
|
+
if s.nil? || s == 0.0
|
116
|
+
0.0
|
117
|
+
else
|
118
|
+
( ( ( map.new_references || 0.0 ) +
|
119
|
+
( map.updated_references || 0.0 ) / 4.0 ) /
|
120
|
+
s *
|
121
|
+
HOUR )
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
def since( map )
|
126
|
+
sdiff( map.last_visit || oldest( map.references ),
|
127
|
+
map.visit_start )
|
128
|
+
end
|
129
|
+
|
130
|
+
def oldest( refs )
|
131
|
+
( refs.map { |r| r.pub_date }.compact.min ) if refs
|
132
|
+
end
|
133
|
+
|
134
|
+
def sdiff( prev, now )
|
135
|
+
diff = as_time( now ) - as_time( prev || WWW_BEGINS )
|
136
|
+
( diff < 0.0 ) ? 0.0 : diff
|
137
|
+
end
|
138
|
+
|
139
|
+
# FIXME: Generalize?
|
140
|
+
def as_time( torj )
|
141
|
+
if torj.is_a?( Time )
|
142
|
+
torj
|
143
|
+
else
|
144
|
+
ms = torj.time
|
145
|
+
Time.at( ms / 1_000, ( ms % 1_000 ) * 1_000 ) # s, µs
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
150
|
+
|
151
|
+
end
|
152
|
+
end
|
data/lib/iudex-worker.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
|
19
|
+
require 'iudex-worker/base'
|
20
|
+
require 'iudex-worker/agent'
|
data/test/setup.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
|
20
|
+
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
21
|
+
|
22
|
+
require 'rubygems'
|
23
|
+
require 'rjack-logback'
|
24
|
+
RJack::Logback.config_console( :stderr => true )
|
25
|
+
|
26
|
+
require 'minitest/unit'
|
27
|
+
require 'minitest/autorun'
|
28
|
+
|
29
|
+
# Make test output logging compatible: no partial lines.
|
30
|
+
class TestOut
|
31
|
+
def print( *a ); $stdout.puts( *a ); end
|
32
|
+
def puts( *a ); $stdout.puts( *a ); end
|
33
|
+
end
|
34
|
+
MiniTest::Unit.output = TestOut.new
|
data/test/test_agent.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-worker'
|
23
|
+
|
24
|
+
class TestAgent < MiniTest::Unit::TestCase
|
25
|
+
include Iudex::Worker
|
26
|
+
include RJack
|
27
|
+
|
28
|
+
def setup
|
29
|
+
Logback[ 'iudex.worker.FilterChainFactory' ].level = Logback::WARN
|
30
|
+
end
|
31
|
+
|
32
|
+
def teardown
|
33
|
+
Logback[ 'iudex.worker.FilterChainFactory' ].level = nil
|
34
|
+
Hooker.send( :clear )
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_agent_default
|
38
|
+
assert_agent
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_agent_with_sample_config
|
42
|
+
# Test out the sample config
|
43
|
+
Hooker.load_file( File.join( File.dirname( __FILE__ ),
|
44
|
+
'..', 'config', 'config.rb' ) )
|
45
|
+
|
46
|
+
assert_agent
|
47
|
+
end
|
48
|
+
|
49
|
+
def assert_agent
|
50
|
+
|
51
|
+
# Stub VisitExecutor.start to allow agent.run to return early.
|
52
|
+
Hooker.add( [ :iudex, :visit_executor ] ) do |vexec|
|
53
|
+
def vexec.start
|
54
|
+
#disable
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
agent = Agent.new
|
59
|
+
agent.run
|
60
|
+
pass
|
61
|
+
|
62
|
+
Hooker.check_not_applied do |*args|
|
63
|
+
flunk( "Hooks not applied: " + args.inspect )
|
64
|
+
end
|
65
|
+
pass
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
|
23
|
+
|
24
|
+
RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
|
25
|
+
|
26
|
+
require 'iudex-httpclient-3'
|
27
|
+
|
28
|
+
require 'iudex-da'
|
29
|
+
require 'iudex-da/pool_data_source_factory'
|
30
|
+
|
31
|
+
require 'iudex-worker'
|
32
|
+
require 'iudex-worker/filter_chain_factory'
|
33
|
+
|
34
|
+
class TestFilterChainFactory < MiniTest::Unit::TestCase
|
35
|
+
include Iudex
|
36
|
+
include Gravitext::HTMap
|
37
|
+
|
38
|
+
def test_filter
|
39
|
+
fcf = Worker::FilterChainFactory.new( "test" )
|
40
|
+
|
41
|
+
mgr = HTTPClient3.create_manager
|
42
|
+
mgr.start
|
43
|
+
fcf.http_client = HTTPClient3::HTTPClient3.new( mgr.client )
|
44
|
+
|
45
|
+
dsf = DA::PoolDataSourceFactory.new
|
46
|
+
fcf.data_source = dsf.create
|
47
|
+
|
48
|
+
fcf.filter do |chain|
|
49
|
+
# Run twice (assume new the first time, updates the second).
|
50
|
+
2.times do
|
51
|
+
content = UniMap.new
|
52
|
+
content.url = Core::VisitURL.normalize( "http://gravitext.com/atom.xml" )
|
53
|
+
content.type = "FEED"
|
54
|
+
content.priority = 1.0
|
55
|
+
assert( chain.filter( content ) )
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
mgr.shutdown
|
60
|
+
dsf.close
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
|
23
|
+
|
24
|
+
RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
|
25
|
+
|
26
|
+
require 'iudex-worker'
|
27
|
+
require 'iudex-worker/prioritizer'
|
28
|
+
|
29
|
+
# Tests for Prioritizer (from Iudex::Worker): the priority and next
# visit time it assigns when filtering a map, plus its oldest, since
# and ref_change_rate helpers.
class TestPrioritizer < MiniTest::Unit::TestCase
  include Iudex::Worker
  include Gravitext::HTMap
  JDate = Java::java.util.Date

  UniMap.define_accessors

  # With a constant of 3.2, no factors and zero impedance, the priority
  # should come out as ~3.2 and next_visit_after should equal
  # visit_start.
  def test_identity
    content = new_map
    prio = Prioritizer.new( "test",
                            :constant  => 3.2,
                            :factors   => [],
                            :impedance => 0 )

    assert( prio.filter( content ) )
    assert_equal_fuzzy( 3.2, content.priority )
    assert_equal( content.visit_start, content.next_visit_after )
  end

  # With :visiting_now set, next_visit_after should land min_next
  # seconds after visit_start (both compared in seconds).
  def test_visiting_now
    content = new_map
    prio = Prioritizer.new( "test", :visiting_now => true )

    assert( prio.filter( content ) )

    expected_secs = content.visit_start.time/1000.0 + prio.min_next
    actual_secs   = content.next_visit_after.time/1000.0
    assert_equal_fuzzy( expected_secs, actual_secs )
  end

  # oldest should pick the earliest pub_date among the references,
  # where one reference has a nil pub_date.
  def test_oldest
    content = new_map

    oldest = Time.utc( 2010, "jul", 17, 18,0,0 )
    pub_dates = [ Time.utc( 2010, "jul", 17, 19,0,0 ),
                  oldest,
                  Time.utc( 2010, "jul", 17, 20,0,0 ),
                  nil ]

    content.references = pub_dates.map { |date|
      UniMap.new.tap { |ref| ref.pub_date = date }
    }

    prio = prioritizer
    found = prio.oldest( content.references )
    assert_equal( oldest, prio.as_time( found ) )
  end

  # since should report 60.0 for a map last visited one minute before
  # its visit_start.
  def test_since_last
    elapsed = prioritizer.since( one_minute_last_map )
    assert_equal( 60.0, elapsed )
  end

  # With 1 new and 4 updated references over a one minute interval,
  # ref_change_rate is expected to be ~120.
  def test_ref_change_rate
    content = one_minute_last_map
    content.new_references = 1
    content.updated_references = 4

    rate = prioritizer.ref_change_rate( content )
    assert_equal_fuzzy( 120, rate )
  end

  # Returns a map whose last_visit is exactly 60,000 ms before its
  # visit_start.
  def one_minute_last_map
    content = new_map
    start = JDate.new
    content.visit_start = start
    content.last_visit = JDate.new( start.time - ( 60 * 1_000 ) )
    content
  end

  # Asserts that l and r differ by less than 0.1.
  def assert_equal_fuzzy( l, r )
    difference = ( l - r ).abs
    assert( difference < 0.1, "#{l} ~!= #{r}" )
  end

  # Returns a fresh UniMap with visit_start set to the current time.
  def new_map
    UniMap.new.tap { |content| content.visit_start = JDate.new }
  end

  # Returns a Prioritizer named "test" with no options given.
  def prioritizer
    Prioritizer.new( "test" )
  end

end
|
metadata
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-worker
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-core
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rjack-logback
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ~>
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: "1.0"
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: iudex-da
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ~>
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.0.0
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: iudex-rome
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ~>
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: 1.0.0
|
58
|
+
type: :runtime
|
59
|
+
version_requirements: *id004
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: iudex-html
|
62
|
+
prerelease: false
|
63
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.0.0
|
69
|
+
type: :runtime
|
70
|
+
version_requirements: *id005
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: iudex-simhash
|
73
|
+
prerelease: false
|
74
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ~>
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 1.0.0
|
80
|
+
type: :runtime
|
81
|
+
version_requirements: *id006
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: iudex-httpclient-3
|
84
|
+
prerelease: false
|
85
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ~>
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 1.0.0
|
91
|
+
type: :runtime
|
92
|
+
version_requirements: *id007
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: minitest
|
95
|
+
prerelease: false
|
96
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.7.1
|
102
|
+
- - <
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: "2.1"
|
105
|
+
type: :development
|
106
|
+
version_requirements: *id008
|
107
|
+
- !ruby/object:Gem::Dependency
|
108
|
+
name: rjack-tarpit
|
109
|
+
prerelease: false
|
110
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ~>
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: 1.3.0
|
116
|
+
type: :development
|
117
|
+
version_requirements: *id009
|
118
|
+
description: |-
|
119
|
+
Iudex is a general purpose web crawler and feed processor in
|
120
|
+
ruby/java. The iudex-worker gem provides a worker daemon for feed/page
|
121
|
+
processing.
|
122
|
+
email:
|
123
|
+
- dek-oss@gravitext.com
|
124
|
+
executables:
|
125
|
+
- iudex-worker-fg
|
126
|
+
extensions: []
|
127
|
+
|
128
|
+
extra_rdoc_files:
|
129
|
+
- Manifest.txt
|
130
|
+
- History.rdoc
|
131
|
+
- README.rdoc
|
132
|
+
files:
|
133
|
+
- History.rdoc
|
134
|
+
- Manifest.txt
|
135
|
+
- README.rdoc
|
136
|
+
- Rakefile
|
137
|
+
- bin/iudex-worker-fg
|
138
|
+
- config/config.rb
|
139
|
+
- init/iudex-worker
|
140
|
+
- lib/iudex-worker/base.rb
|
141
|
+
- lib/iudex-worker.rb
|
142
|
+
- lib/iudex-worker/agent.rb
|
143
|
+
- lib/iudex-worker/fetch_helper.rb
|
144
|
+
- lib/iudex-worker/filter_chain_factory.rb
|
145
|
+
- lib/iudex-worker/prioritizer.rb
|
146
|
+
- test/setup.rb
|
147
|
+
- test/test_agent.rb
|
148
|
+
- test/test_filter_chain_factory.rb
|
149
|
+
- test/test_prioritizer.rb
|
150
|
+
has_rdoc: true
|
151
|
+
homepage: http://github.com/dekellum/iudex
|
152
|
+
licenses: []
|
153
|
+
|
154
|
+
post_install_message:
|
155
|
+
rdoc_options:
|
156
|
+
- --main
|
157
|
+
- README.rdoc
|
158
|
+
require_paths:
|
159
|
+
- lib
|
160
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
162
|
+
requirements:
|
163
|
+
- - ">="
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: "0"
|
166
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
|
+
none: false
|
168
|
+
requirements:
|
169
|
+
- - ">="
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: "0"
|
172
|
+
requirements: []
|
173
|
+
|
174
|
+
rubyforge_project: iudex-worker
|
175
|
+
rubygems_version: 1.5.1
|
176
|
+
signing_key:
|
177
|
+
specification_version: 3
|
178
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
179
|
+
test_files:
|
180
|
+
- test/test_agent.rb
|
181
|
+
- test/test_filter_chain_factory.rb
|
182
|
+
- test/test_prioritizer.rb
|