iudex-worker 1.0.0-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +12 -0
- data/Manifest.txt +2 -0
- data/Rakefile +11 -8
- data/bin/iudex-worker-fg +2 -2
- data/config/config.rb +16 -9
- data/config/config_async_http.rb +13 -0
- data/config/config_jetty_http.rb +11 -0
- data/init/iudex-worker +1 -1
- data/lib/iudex-worker/agent.rb +55 -12
- data/lib/iudex-worker/base.rb +1 -1
- data/lib/iudex-worker/fetch_helper.rb +9 -3
- data/lib/iudex-worker/filter_chain_factory.rb +47 -22
- data/test/setup.rb +1 -1
- data/test/test_agent.rb +26 -3
- data/test/test_filter_chain_factory.rb +42 -10
- metadata +50 -19
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
@@ -1,2 +1,14 @@
|
|
1
|
+
=== 1.1.0 (2011-11-13)
|
2
|
+
* Update to iudex-core, -da, -rome, -html, -simhash ~> 1.1.0
|
3
|
+
* Changes for VisitManager, VisitCounter, RedirectHandler, Revisitor
|
4
|
+
* Use ContentTypeSet in ContentFetcher
|
5
|
+
* Generalize Agent to work for all three HTTP clients; all are now
|
6
|
+
optional/dev dependencies
|
7
|
+
* Add iudex-char-detector ~> 1.1.0 dep and use CharDetectFilter in
|
8
|
+
FCF.page_receiver
|
9
|
+
* Improved setup error logging in Agent
|
10
|
+
* Enable :main listeners in filter_chain_factory
|
11
|
+
* Update to minitest ~> 2.3
|
12
|
+
|
1
13
|
=== 1.0.0 (2011-04-04)
|
2
14
|
* Initial release.
|
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
|
|
4
4
|
require 'iudex-worker/base'
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
|
-
gem 'rjack-tarpit', '~> 1.
|
7
|
+
gem 'rjack-tarpit', '~> 1.4'
|
8
8
|
require 'rjack-tarpit'
|
9
9
|
|
10
10
|
t = RJack::TarPit.new( 'iudex-worker', Iudex::Worker::VERSION, :java_platform )
|
@@ -12,16 +12,19 @@ t = RJack::TarPit.new( 'iudex-worker', Iudex::Worker::VERSION, :java_platform )
|
|
12
12
|
t.specify do |h|
|
13
13
|
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
14
14
|
|
15
|
-
h.extra_deps += [ [ 'iudex-core', '~> 1.
|
15
|
+
h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
|
16
16
|
[ 'rjack-logback', '~> 1.0' ],
|
17
|
-
[ 'iudex-da', '~> 1.
|
18
|
-
[ 'iudex-rome', '~> 1.
|
19
|
-
[ 'iudex-html', '~> 1.
|
20
|
-
[ 'iudex-simhash', '~> 1.
|
21
|
-
[ 'iudex-
|
17
|
+
[ 'iudex-da', '~> 1.1.0' ],
|
18
|
+
[ 'iudex-rome', '~> 1.1.0' ],
|
19
|
+
[ 'iudex-html', '~> 1.1.0' ],
|
20
|
+
[ 'iudex-simhash', '~> 1.1.0' ],
|
21
|
+
[ 'iudex-char-detector', '~> 1.1.0' ] ]
|
22
22
|
|
23
23
|
h.testlib = :minitest
|
24
|
-
h.extra_dev_deps += [ [ 'minitest', '
|
24
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
|
25
|
+
[ 'iudex-httpclient-3', '~> 1.1.0' ],
|
26
|
+
[ 'iudex-jetty-httpclient', '~> 1.1.0' ],
|
27
|
+
[ 'iudex-async-httpclient', '~> 1.1.0' ] ]
|
25
28
|
end
|
26
29
|
|
27
30
|
task :chk_hist_vers do
|
data/bin/iudex-worker-fg
CHANGED
@@ -35,11 +35,11 @@ module IudexBinScript
|
|
35
35
|
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
36
36
|
|
37
37
|
OptionParser.new do |opts|
|
38
|
-
opts.on( "-v", "--version", "Display version" ) do
|
38
|
+
opts.on( "-v", "--version", "Display version" ) do
|
39
39
|
puts "iudex-worker: #{ Worker::VERSION }"
|
40
40
|
exit 1
|
41
41
|
end
|
42
|
-
opts.on( "-d", "--debug", "Enable verbose DEBUG logging" ) do
|
42
|
+
opts.on( "-d", "--debug", "Enable verbose DEBUG logging" ) do
|
43
43
|
Logback[ 'iudex' ].level = Logback::DEBUG
|
44
44
|
end
|
45
45
|
Hooker.register_config( opts )
|
data/config/config.rb
CHANGED
@@ -7,7 +7,8 @@ Iudex.configure do |c|
|
|
7
7
|
threads = 3
|
8
8
|
|
9
9
|
c.setup_connect_props do
|
10
|
-
{ :
|
10
|
+
{ :database => 'iudex_test',
|
11
|
+
:ds_pool => { :max_active => threads / 3 * 2,
|
11
12
|
:max_idle => threads / 3 },
|
12
13
|
:loglevel => 1 }
|
13
14
|
end
|
@@ -16,9 +17,15 @@ Iudex.configure do |c|
|
|
16
17
|
mgr.manager_params.max_total_connections = threads * 10
|
17
18
|
end
|
18
19
|
|
19
|
-
c.
|
20
|
+
c.setup_visit_manager do |vx|
|
20
21
|
vx.max_threads = threads
|
21
|
-
|
22
|
+
end
|
23
|
+
|
24
|
+
c.setup_visit_queue do |q|
|
25
|
+
q.default_min_host_delay = 100 #ms
|
26
|
+
q.default_max_access_per_host = 1
|
27
|
+
|
28
|
+
q.configure_host( "gravitext.com", 100, 2 ) # 100ms, 2 connections
|
22
29
|
end
|
23
30
|
|
24
31
|
c.setup_work_poller do |wp|
|
@@ -30,15 +37,15 @@ Iudex.configure do |c|
|
|
30
37
|
c.setup_filter_factory do |ff|
|
31
38
|
|
32
39
|
def ff.barc_writer
|
33
|
-
|
34
|
-
|
35
|
-
|
40
|
+
super.tap do |w|
|
41
|
+
w.do_compress = false
|
42
|
+
end
|
36
43
|
end
|
37
44
|
|
38
45
|
def ff.barc_directory
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
super.tap do |bdir|
|
47
|
+
bdir.target_length = 2 * ( 1024 ** 2 )
|
48
|
+
end
|
42
49
|
end
|
43
50
|
|
44
51
|
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'iudex-async-httpclient'
|
2
|
+
|
3
|
+
Iudex.configure do |c|
|
4
|
+
|
5
|
+
c.setup_async_httpclient do
|
6
|
+
{ :connection_timeout_in_ms => 5_000,
|
7
|
+
:request_timeout_in_ms => 10_000,
|
8
|
+
:idle_connection_timeout_in_ms => 6_000,
|
9
|
+
:maximum_connections_total => 200,
|
10
|
+
:maximum_connections_per_host => 5 }
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
data/init/iudex-worker
CHANGED
data/lib/iudex-worker/agent.rb
CHANGED
@@ -18,8 +18,6 @@ require 'iudex-da'
|
|
18
18
|
require 'iudex-da/key_helper'
|
19
19
|
require 'iudex-da/pool_data_source_factory'
|
20
20
|
|
21
|
-
require 'iudex-httpclient-3'
|
22
|
-
|
23
21
|
require 'iudex-worker'
|
24
22
|
require 'iudex-worker/filter_chain_factory'
|
25
23
|
|
@@ -36,6 +34,8 @@ module Iudex
|
|
36
34
|
include Gravitext::HTMap
|
37
35
|
|
38
36
|
def initialize
|
37
|
+
@log = RJack::SLF4J[ self.class ]
|
38
|
+
@http_manager = nil
|
39
39
|
Hooker.apply( [ :iudex, :worker ], self )
|
40
40
|
end
|
41
41
|
|
@@ -48,28 +48,66 @@ module Iudex
|
|
48
48
|
FilterChainFactory.new( 'agent' )
|
49
49
|
end
|
50
50
|
|
51
|
+
def http_client( executor )
|
52
|
+
if defined?( JettyHTTPClient )
|
53
|
+
@log.info "Setting up JettyHTTPClient"
|
54
|
+
JettyHTTPClient.create_client.tap do |c|
|
55
|
+
c.executor = executor
|
56
|
+
c.start
|
57
|
+
end
|
58
|
+
elsif defined?( AsyncHTTPClient )
|
59
|
+
@log.info "Setting up AsyncHTTPClient"
|
60
|
+
AsyncHTTPClient.create_client( :executor_service => executor )
|
61
|
+
else
|
62
|
+
gem 'iudex-httpclient-3', '~> 1.1.0'
|
63
|
+
require 'iudex-httpclient-3'
|
64
|
+
@log.info "Setting up HTTPClient3"
|
65
|
+
@http_manager = HTTPClient3.create_manager
|
66
|
+
@http_manager.start
|
67
|
+
HTTPClient3::HTTPClient3.new( @http_manager.client )
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def visit_manager( wpoller )
|
72
|
+
vexec = VisitManager.new( wpoller )
|
73
|
+
Hooker.apply( [ :iudex, :visit_manager ], vexec )
|
74
|
+
end
|
75
|
+
|
76
|
+
def work_poller( data_source )
|
77
|
+
cmapper = ContentMapper.new( keys( poll_keys ) )
|
78
|
+
wpoller = WorkPoller.new( data_source, cmapper )
|
79
|
+
|
80
|
+
visit_q = Hooker.apply( [ :iudex, :visit_queue ], VisitQueue.new )
|
81
|
+
|
82
|
+
wpoller.visit_queue_factory = VisitQueueFactory.new( visit_q )
|
83
|
+
|
84
|
+
Hooker.apply( [ :iudex, :work_poller ], wpoller )
|
85
|
+
end
|
86
|
+
|
51
87
|
def run
|
52
88
|
Hooker.with( :iudex ) do
|
53
89
|
dsf = PoolDataSourceFactory.new
|
54
90
|
data_source = dsf.create
|
55
91
|
|
56
|
-
|
57
|
-
|
58
|
-
|
92
|
+
wpoller = work_poller( data_source )
|
93
|
+
vexec = visit_manager( wpoller )
|
94
|
+
vexec.start_executor
|
59
95
|
|
60
|
-
|
61
|
-
mgr.start
|
62
|
-
http_client = HTTPClient3::HTTPClient3.new( mgr.client )
|
96
|
+
hclient = http_client( vexec.executor )
|
63
97
|
|
64
98
|
fcf = filter_chain_factory
|
65
|
-
fcf.http_client =
|
99
|
+
fcf.http_client = hclient
|
66
100
|
fcf.data_source = data_source
|
101
|
+
fcf.visit_counter = vexec
|
102
|
+
|
103
|
+
# FilterChain's executor is the same executor, unless using
|
104
|
+
# HTTPClient3, where executor is best not used
|
105
|
+
fcf.executor = vexec.executor unless @http_manager
|
67
106
|
|
68
107
|
Hooker.apply( :filter_factory, fcf )
|
69
108
|
|
70
109
|
fcf.filter do |chain|
|
71
|
-
vexec =
|
72
|
-
Hooker.apply( :visit_executor, vexec )
|
110
|
+
vexec.filter_chain = chain
|
73
111
|
|
74
112
|
Hooker.log_not_applied # All hooks should be used by now
|
75
113
|
|
@@ -77,10 +115,15 @@ module Iudex
|
|
77
115
|
vexec.join #Run until interrupted
|
78
116
|
end # fcf closes
|
79
117
|
|
80
|
-
|
118
|
+
hclient.close if hclient.respond_to?( :close )
|
119
|
+
@http_manager.shutdown if @http_manager
|
120
|
+
|
81
121
|
dsf.close
|
82
122
|
end
|
123
|
+
rescue => e
|
124
|
+
@log.error( "On run: ", e )
|
83
125
|
end
|
126
|
+
|
84
127
|
end
|
85
128
|
|
86
129
|
end
|
data/lib/iudex-worker/base.rb
CHANGED
@@ -23,11 +23,17 @@ module Iudex
|
|
23
23
|
include Iudex::HTTP
|
24
24
|
include Iudex::Core::Filters
|
25
25
|
|
26
|
-
def create_content_fetcher( accept_types,
|
27
|
-
cf = ContentFetcher.new( http_client,
|
26
|
+
def create_content_fetcher( accept_types, receiver, listener = nil )
|
27
|
+
cf = ContentFetcher.new( http_client,
|
28
|
+
visit_counter,
|
29
|
+
create_chain( receiver, nil, listener ) )
|
30
|
+
|
31
|
+
cf.executor = executor if executor
|
28
32
|
|
29
33
|
alist = accept_list( accept_types )
|
30
|
-
|
34
|
+
unless alist.include?( '*/*' )
|
35
|
+
cf.accepted_content_types = ContentTypeSet.new( alist )
|
36
|
+
end
|
31
37
|
|
32
38
|
headers = [ [ 'User-Agent', http_user_agent ],
|
33
39
|
[ 'Accept', accept_header( accept_types ) ] ]
|
@@ -26,6 +26,8 @@ require 'iudex-da/factory_helper'
|
|
26
26
|
|
27
27
|
require 'iudex-rome'
|
28
28
|
|
29
|
+
require 'iudex-char-detector'
|
30
|
+
|
29
31
|
require 'iudex-html'
|
30
32
|
require 'iudex-html/factory_helper'
|
31
33
|
|
@@ -45,6 +47,7 @@ module Iudex
|
|
45
47
|
include Iudex::Core
|
46
48
|
include Iudex::Core::Filters
|
47
49
|
include Iudex::ROME
|
50
|
+
include Iudex::CharDetector
|
48
51
|
|
49
52
|
include Iudex::DA::Filters::FactoryHelper
|
50
53
|
include Iudex::HTML::Filters::FactoryHelper
|
@@ -53,6 +56,8 @@ module Iudex
|
|
53
56
|
|
54
57
|
attr_accessor :http_client
|
55
58
|
attr_accessor :data_source
|
59
|
+
attr_accessor :visit_counter
|
60
|
+
attr_accessor :executor
|
56
61
|
|
57
62
|
def initialize( name )
|
58
63
|
super
|
@@ -60,15 +65,14 @@ module Iudex
|
|
60
65
|
end
|
61
66
|
|
62
67
|
def setup_reporters
|
63
|
-
|
64
|
-
add_by_filter_reporter
|
68
|
+
# Use default, preserved for overrides
|
65
69
|
end
|
66
70
|
|
67
71
|
def filters
|
68
72
|
[ UHashMDCSetter.new,
|
69
73
|
DefaultFilter.new,
|
70
74
|
super,
|
71
|
-
type_switch ]
|
75
|
+
type_switch ]
|
72
76
|
end
|
73
77
|
|
74
78
|
def listeners
|
@@ -76,8 +80,8 @@ module Iudex
|
|
76
80
|
end
|
77
81
|
|
78
82
|
def type_map
|
79
|
-
{ "FEED" => feed_fetcher,
|
80
|
-
"PAGE" => page_fetcher }
|
83
|
+
{ "FEED" => [ feed_fetcher, :main ],
|
84
|
+
"PAGE" => [ page_fetcher, :main ] }
|
81
85
|
end
|
82
86
|
|
83
87
|
def type_switch( tmap = type_map )
|
@@ -85,15 +89,17 @@ module Iudex
|
|
85
89
|
end
|
86
90
|
|
87
91
|
def feed_fetcher
|
88
|
-
[ create_content_fetcher( feed_mime_types, :feed_receiver ) ]
|
92
|
+
[ create_content_fetcher( feed_mime_types, :feed_receiver, :main ) ]
|
89
93
|
end
|
90
94
|
|
91
95
|
def page_fetcher
|
92
|
-
[ create_content_fetcher( page_mime_types, :page_receiver ) ]
|
96
|
+
[ create_content_fetcher( page_mime_types, :page_receiver, :main ) ]
|
93
97
|
end
|
94
98
|
|
95
99
|
def feed_receiver
|
96
|
-
[
|
100
|
+
[ RedirectHandler.new,
|
101
|
+
Revisitor.new( visit_counter ),
|
102
|
+
RomeFeedParser.new,
|
97
103
|
DefaultFilter.new,
|
98
104
|
DateChangeFilter.new( false ),
|
99
105
|
feed_updater ]
|
@@ -109,7 +115,7 @@ module Iudex
|
|
109
115
|
ref_common_cleanup,
|
110
116
|
Prioritizer.new( "feed-ref-new",
|
111
117
|
:constant => 50,
|
112
|
-
:min_next => 0.0 ) ]
|
118
|
+
:min_next => 0.0 ) ]
|
113
119
|
end
|
114
120
|
|
115
121
|
def feed_ref_update
|
@@ -118,24 +124,32 @@ module Iudex
|
|
118
124
|
ref_common_cleanup,
|
119
125
|
Prioritizer.new( "feed-ref-update",
|
120
126
|
:constant => 10,
|
121
|
-
:min_next => 0.0 ) ]
|
122
|
-
end
|
123
|
-
|
124
|
-
#
|
125
|
-
#
|
127
|
+
:min_next => 0.0 ) ]
|
128
|
+
end
|
129
|
+
|
130
|
+
# Filters to apply for feed update.
|
131
|
+
#
|
132
|
+
# Notes:
|
133
|
+
#
|
134
|
+
# * This is run possibly twice, for both base content map and
|
135
|
+
# referer map if present.
|
136
|
+
#
|
137
|
+
# * If this is an update then these filters act on a *new* map,
|
138
|
+
# thus any changes made here will not be visible after exit
|
139
|
+
# from the update_filter.
|
126
140
|
def feed_post
|
127
141
|
[ UHashMDCSetter.new,
|
128
142
|
ref_common_cleanup,
|
129
143
|
Prioritizer.new( "feed-post",
|
130
144
|
:constant => 30,
|
131
145
|
:visiting_now => true ),
|
132
|
-
last_visit_setter ]
|
146
|
+
last_visit_setter ]
|
133
147
|
end
|
134
148
|
|
135
149
|
def ref_common_cleanup
|
136
150
|
[ ref_html_filters,
|
137
151
|
TextCtrlWSFilter.new( :title.to_k ),
|
138
|
-
FutureDateFilter.new( :pub_date.to_k ) ]
|
152
|
+
FutureDateFilter.new( :pub_date.to_k ) ]
|
139
153
|
end
|
140
154
|
|
141
155
|
def ref_html_filters
|
@@ -143,7 +157,7 @@ module Iudex
|
|
143
157
|
html_clean_filters( :summary ),
|
144
158
|
html_clean_filters( :content ),
|
145
159
|
html_write_filter( :summary ),
|
146
|
-
html_write_filter( :content ) ]
|
160
|
+
html_write_filter( :content ) ]
|
147
161
|
end
|
148
162
|
|
149
163
|
def feed_update_keys
|
@@ -151,9 +165,12 @@ module Iudex
|
|
151
165
|
end
|
152
166
|
|
153
167
|
def page_receiver
|
154
|
-
[
|
168
|
+
[ RedirectHandler.new,
|
169
|
+
Revisitor.new( visit_counter ),
|
170
|
+
CharDetectFilter.new,
|
171
|
+
html_clean_filters( :source ),
|
155
172
|
simhash_generator,
|
156
|
-
page_updater ]
|
173
|
+
page_updater ]
|
157
174
|
end
|
158
175
|
|
159
176
|
def barc_writer
|
@@ -171,8 +188,16 @@ module Iudex
|
|
171
188
|
create_update_filter( keys( page_update_keys ), :page_post )
|
172
189
|
end
|
173
190
|
|
174
|
-
#
|
175
|
-
#
|
191
|
+
# Filters to apply during page update
|
192
|
+
#
|
193
|
+
# Notes:
|
194
|
+
#
|
195
|
+
# * This is run possibly twice, for both base content map and
|
196
|
+
# referer map if present.
|
197
|
+
#
|
198
|
+
# * If this is an update then these filters act on a *new* map,
|
199
|
+
# thus any changes made here will not be visible after exit
|
200
|
+
# from the update_filter.
|
176
201
|
def page_post
|
177
202
|
[ UHashMDCSetter.new,
|
178
203
|
barc_writer, # Not run in 302 referer case, since no SOURCE.
|
@@ -184,7 +209,7 @@ module Iudex
|
|
184
209
|
end
|
185
210
|
|
186
211
|
def page_update_keys
|
187
|
-
[ :uhash, :
|
212
|
+
[ :uhash, :domain, :url, :type,
|
188
213
|
:ref_pub_date, :pub_date,
|
189
214
|
:priority, :last_visit, :next_visit_after,
|
190
215
|
:status, :etag, :reason, :referer, :referent,
|
data/test/setup.rb
CHANGED
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
|
21
21
|
|
22
22
|
require 'rubygems'
|
23
23
|
require 'rjack-logback'
|
24
|
-
RJack::Logback.config_console( :stderr => true )
|
24
|
+
RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
|
25
25
|
|
26
26
|
require 'minitest/unit'
|
27
27
|
require 'minitest/autorun'
|
data/test/test_agent.rb
CHANGED
@@ -27,6 +27,7 @@ class TestAgent < MiniTest::Unit::TestCase
|
|
27
27
|
|
28
28
|
def setup
|
29
29
|
Logback[ 'iudex.worker.FilterChainFactory' ].level = Logback::WARN
|
30
|
+
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
30
31
|
end
|
31
32
|
|
32
33
|
def teardown
|
@@ -46,11 +47,33 @@ class TestAgent < MiniTest::Unit::TestCase
|
|
46
47
|
assert_agent
|
47
48
|
end
|
48
49
|
|
50
|
+
def test_agent_with_sample_config_jetty_http
|
51
|
+
Hooker.load_file( File.join( File.dirname( __FILE__ ),
|
52
|
+
'..', 'config', 'config_jetty_http.rb' ) )
|
53
|
+
|
54
|
+
assert_agent
|
55
|
+
ensure
|
56
|
+
# Hack to avoid interference in test of other configs, given
|
57
|
+
# require iudex-jetty-httpclient. This only works once, but
|
58
|
+
# that's enough for testing.
|
59
|
+
Iudex.send( :remove_const, :JettyHTTPClient )
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_agent_with_sample_config_async_http
|
63
|
+
Hooker.load_file( File.join( File.dirname( __FILE__ ),
|
64
|
+
'..', 'config', 'config_async_http.rb' ) )
|
65
|
+
|
66
|
+
assert_agent
|
67
|
+
ensure
|
68
|
+
# Same hack as above.
|
69
|
+
Iudex.send( :remove_const, :AsyncHTTPClient )
|
70
|
+
end
|
71
|
+
|
49
72
|
def assert_agent
|
50
73
|
|
51
|
-
# Stub
|
52
|
-
Hooker.add( [ :iudex, :
|
53
|
-
def
|
74
|
+
# Stub VisitManager.start to allow agent.run to return early.
|
75
|
+
Hooker.add( [ :iudex, :visit_manager ] ) do |vm|
|
76
|
+
def vm.start
|
54
77
|
#disable
|
55
78
|
end
|
56
79
|
end
|
@@ -19,10 +19,6 @@
|
|
19
19
|
|
20
20
|
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
21
|
|
22
|
-
RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
|
23
|
-
|
24
|
-
RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
|
25
|
-
|
26
22
|
require 'iudex-httpclient-3'
|
27
23
|
|
28
24
|
require 'iudex-da'
|
@@ -30,34 +26,70 @@ require 'iudex-da/pool_data_source_factory'
|
|
30
26
|
|
31
27
|
require 'iudex-worker'
|
32
28
|
require 'iudex-worker/filter_chain_factory'
|
29
|
+
require 'iudex-httpclient-3'
|
33
30
|
|
34
31
|
class TestFilterChainFactory < MiniTest::Unit::TestCase
|
35
32
|
include Iudex
|
36
33
|
include Gravitext::HTMap
|
37
34
|
|
35
|
+
def setup
|
36
|
+
RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
|
37
|
+
end
|
38
|
+
|
39
|
+
def teardown
|
40
|
+
RJack::Logback[ 'iudex' ].level = nil
|
41
|
+
end
|
42
|
+
|
43
|
+
import 'iudex.core.VisitCounter'
|
44
|
+
class TestVisitCounter
|
45
|
+
include VisitCounter
|
46
|
+
|
47
|
+
attr_reader :released
|
48
|
+
|
49
|
+
def initialize
|
50
|
+
super()
|
51
|
+
@released = []
|
52
|
+
end
|
53
|
+
|
54
|
+
def release( acquired, newOrder )
|
55
|
+
@released << acquired
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
38
59
|
def test_filter
|
39
60
|
fcf = Worker::FilterChainFactory.new( "test" )
|
40
61
|
|
41
62
|
mgr = HTTPClient3.create_manager
|
42
63
|
mgr.start
|
43
64
|
fcf.http_client = HTTPClient3::HTTPClient3.new( mgr.client )
|
65
|
+
fcf.visit_counter = counter = TestVisitCounter.new
|
44
66
|
|
45
67
|
dsf = DA::PoolDataSourceFactory.new
|
46
68
|
fcf.data_source = dsf.create
|
47
69
|
|
48
70
|
fcf.filter do |chain|
|
49
71
|
# Run twice (assume new the first time, updates the second).
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
assert( chain.filter( content ) )
|
72
|
+
|
73
|
+
order = UniMap.new.tap do |o|
|
74
|
+
o.url = Core::VisitURL.normalize( "http://gravitext.com/atom.xml" )
|
75
|
+
o.type = "FEED"
|
76
|
+
o.priority = 1.0
|
56
77
|
end
|
78
|
+
|
79
|
+
orders = [ order, order.clone ]
|
80
|
+
|
81
|
+
orders.each do |o|
|
82
|
+
assert( chain.filter( o ) )
|
83
|
+
end
|
84
|
+
|
85
|
+
# Note this only works timing wise because of blocking
|
86
|
+
# HTTPClient.
|
87
|
+
assert_equal( orders, counter.released )
|
57
88
|
end
|
58
89
|
|
59
90
|
mgr.shutdown
|
60
91
|
dsf.close
|
92
|
+
|
61
93
|
end
|
62
94
|
|
63
95
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: iudex-worker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.
|
5
|
+
version: 1.1.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,8 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-11-13 00:00:00 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: iudex-core
|
@@ -21,7 +20,7 @@ dependencies:
|
|
21
20
|
requirements:
|
22
21
|
- - ~>
|
23
22
|
- !ruby/object:Gem::Version
|
24
|
-
version: 1.
|
23
|
+
version: 1.1.0
|
25
24
|
type: :runtime
|
26
25
|
version_requirements: *id001
|
27
26
|
- !ruby/object:Gem::Dependency
|
@@ -43,7 +42,7 @@ dependencies:
|
|
43
42
|
requirements:
|
44
43
|
- - ~>
|
45
44
|
- !ruby/object:Gem::Version
|
46
|
-
version: 1.
|
45
|
+
version: 1.1.0
|
47
46
|
type: :runtime
|
48
47
|
version_requirements: *id003
|
49
48
|
- !ruby/object:Gem::Dependency
|
@@ -54,7 +53,7 @@ dependencies:
|
|
54
53
|
requirements:
|
55
54
|
- - ~>
|
56
55
|
- !ruby/object:Gem::Version
|
57
|
-
version: 1.
|
56
|
+
version: 1.1.0
|
58
57
|
type: :runtime
|
59
58
|
version_requirements: *id004
|
60
59
|
- !ruby/object:Gem::Dependency
|
@@ -65,7 +64,7 @@ dependencies:
|
|
65
64
|
requirements:
|
66
65
|
- - ~>
|
67
66
|
- !ruby/object:Gem::Version
|
68
|
-
version: 1.
|
67
|
+
version: 1.1.0
|
69
68
|
type: :runtime
|
70
69
|
version_requirements: *id005
|
71
70
|
- !ruby/object:Gem::Dependency
|
@@ -76,18 +75,18 @@ dependencies:
|
|
76
75
|
requirements:
|
77
76
|
- - ~>
|
78
77
|
- !ruby/object:Gem::Version
|
79
|
-
version: 1.
|
78
|
+
version: 1.1.0
|
80
79
|
type: :runtime
|
81
80
|
version_requirements: *id006
|
82
81
|
- !ruby/object:Gem::Dependency
|
83
|
-
name: iudex-
|
82
|
+
name: iudex-char-detector
|
84
83
|
prerelease: false
|
85
84
|
requirement: &id007 !ruby/object:Gem::Requirement
|
86
85
|
none: false
|
87
86
|
requirements:
|
88
87
|
- - ~>
|
89
88
|
- !ruby/object:Gem::Version
|
90
|
-
version: 1.
|
89
|
+
version: 1.1.0
|
91
90
|
type: :runtime
|
92
91
|
version_requirements: *id007
|
93
92
|
- !ruby/object:Gem::Dependency
|
@@ -96,25 +95,55 @@ dependencies:
|
|
96
95
|
requirement: &id008 !ruby/object:Gem::Requirement
|
97
96
|
none: false
|
98
97
|
requirements:
|
99
|
-
- -
|
100
|
-
- !ruby/object:Gem::Version
|
101
|
-
version: 1.7.1
|
102
|
-
- - <
|
98
|
+
- - ~>
|
103
99
|
- !ruby/object:Gem::Version
|
104
|
-
version: "2.
|
100
|
+
version: "2.3"
|
105
101
|
type: :development
|
106
102
|
version_requirements: *id008
|
107
103
|
- !ruby/object:Gem::Dependency
|
108
|
-
name:
|
104
|
+
name: iudex-httpclient-3
|
109
105
|
prerelease: false
|
110
106
|
requirement: &id009 !ruby/object:Gem::Requirement
|
111
107
|
none: false
|
112
108
|
requirements:
|
113
109
|
- - ~>
|
114
110
|
- !ruby/object:Gem::Version
|
115
|
-
version: 1.
|
111
|
+
version: 1.1.0
|
116
112
|
type: :development
|
117
113
|
version_requirements: *id009
|
114
|
+
- !ruby/object:Gem::Dependency
|
115
|
+
name: iudex-jetty-httpclient
|
116
|
+
prerelease: false
|
117
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
118
|
+
none: false
|
119
|
+
requirements:
|
120
|
+
- - ~>
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 1.1.0
|
123
|
+
type: :development
|
124
|
+
version_requirements: *id010
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: iudex-async-httpclient
|
127
|
+
prerelease: false
|
128
|
+
requirement: &id011 !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ~>
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: 1.1.0
|
134
|
+
type: :development
|
135
|
+
version_requirements: *id011
|
136
|
+
- !ruby/object:Gem::Dependency
|
137
|
+
name: rjack-tarpit
|
138
|
+
prerelease: false
|
139
|
+
requirement: &id012 !ruby/object:Gem::Requirement
|
140
|
+
none: false
|
141
|
+
requirements:
|
142
|
+
- - ~>
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: 1.4.0
|
145
|
+
type: :development
|
146
|
+
version_requirements: *id012
|
118
147
|
description: |-
|
119
148
|
Iudex is a general purpose web crawler and feed processor in
|
120
149
|
ruby/java. The iudex-worker gem provides a worker daemon for feed/page
|
@@ -136,6 +165,8 @@ files:
|
|
136
165
|
- Rakefile
|
137
166
|
- bin/iudex-worker-fg
|
138
167
|
- config/config.rb
|
168
|
+
- config/config_async_http.rb
|
169
|
+
- config/config_jetty_http.rb
|
139
170
|
- init/iudex-worker
|
140
171
|
- lib/iudex-worker/base.rb
|
141
172
|
- lib/iudex-worker.rb
|
@@ -147,7 +178,7 @@ files:
|
|
147
178
|
- test/test_agent.rb
|
148
179
|
- test/test_filter_chain_factory.rb
|
149
180
|
- test/test_prioritizer.rb
|
150
|
-
|
181
|
+
- .gemtest
|
151
182
|
homepage: http://github.com/dekellum/iudex
|
152
183
|
licenses: []
|
153
184
|
|
@@ -172,7 +203,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
172
203
|
requirements: []
|
173
204
|
|
174
205
|
rubyforge_project: iudex-worker
|
175
|
-
rubygems_version: 1.
|
206
|
+
rubygems_version: 1.8.9
|
176
207
|
signing_key:
|
177
208
|
specification_version: 3
|
178
209
|
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|