iudex-worker 1.3.1-java → 1.3.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +10 -0
- data/config/config.rb +2 -4
- data/init/iudex-worker +1 -1
- data/lib/iudex-worker/base.rb +1 -1
- data/lib/iudex-worker/fetch_helper.rb +76 -13
- data/lib/iudex-worker/filter_chain_factory.rb +14 -27
- metadata +36 -84
data/History.rdoc
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 1.3.2 (2012-11-8)
|
2
|
+
* Upgrade/narrow to iudex-core, -da, -rome, -html, -simhash ~> 1.3.0
|
3
|
+
* FetchHelper.create_content_fetcher now prefers an options Hash,
|
4
|
+
exposing settings for :types, :client, :user_agent,
|
5
|
+
:request_headers, etc. and passing additional options to
|
6
|
+
create_chain (:filters, etc.)
|
7
|
+
* Various updates to FilterChainFactory and config sample using new
|
8
|
+
options, etc.
|
9
|
+
* Upgrade to logback ~> 1.5
|
10
|
+
|
1
11
|
=== 1.3.1 (2012-10-25)
|
2
12
|
* Improve shutdown reliability by adding Agent.run_safe with ensure'd
|
3
13
|
close calls.
|
data/config/config.rb
CHANGED
@@ -22,10 +22,8 @@ Iudex.configure do |c|
|
|
22
22
|
end
|
23
23
|
|
24
24
|
c.setup_visit_queue do |q|
|
25
|
-
q.
|
26
|
-
q.
|
27
|
-
|
28
|
-
q.configure_host( "gravitext.com", 100, 2 ) # 100ms, 2 connections
|
25
|
+
q.config( :rate => 5.0, :cons => 1 )
|
26
|
+
q.config( :domain => "gravitext.com", :rate => 10.0, :cons => 2 )
|
29
27
|
end
|
30
28
|
|
31
29
|
c.setup_work_poller do |wp|
|
data/init/iudex-worker
CHANGED
data/lib/iudex-worker/base.rb
CHANGED
data/lib/iudex-worker/fetch_helper.rb
CHANGED
@@ -23,26 +23,79 @@ module Iudex
|
|
23
23
|
include Iudex::HTTP
|
24
24
|
include Iudex::Core::Filters
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
26
|
+
# Create a ContentFetcher including a filter chain to receive
|
27
|
+
# the fetch result.
|
28
|
+
#
|
29
|
+
# === Options
|
30
|
+
#
|
31
|
+
# Options support literal values, or a Proc, Method, or a Symbol
|
32
|
+
# to self send unless otherwise noted.
|
33
|
+
#
|
34
|
+
# :types:: An Array or table of Mime types to use for the Accept header in
|
35
|
+
# default :request_headers and to restrict returned
|
36
|
+
# results on. (Default: #page_mime_types)
|
37
|
+
#
|
38
|
+
# :client:: The Java::iudex.http.HTTPClient implementation to
|
39
|
+
# use (Default: :http_client)
|
40
|
+
#
|
41
|
+
# :user_agent:: The HTTP User-Agent for default
|
42
|
+
# :request_headers. Proc's will receive the
|
43
|
+
# options Hash as parameter (Default: #http_user_agent)
|
44
|
+
#
|
45
|
+
# :visit_counter:: The Java::iudex.core.VisitCounter
|
46
|
+
# implementation. (Default: :visit_counter)
|
47
|
+
#
|
48
|
+
# :executor:: The java.util.concurrent.Executor to use for
|
49
|
+
# running the receiver filter chain. (Default: :executor)
|
50
|
+
#
|
51
|
+
# :request_headers:: HTTP Request headers as Array<iudex.http.Header>
|
52
|
+
# (Default: #http_request_headers)
|
53
|
+
#
|
54
|
+
# All options (including the required :filters option, and
|
55
|
+
# :listener with default :main) are also passed to
|
56
|
+
# self.create_chain for creating the receiver filter chain
|
57
|
+
#
|
58
|
+
# The positional parameters equivalent to ( :types, :filters,
|
59
|
+
# :listener ) as defined above are also supported, but
|
60
|
+
# deprecated.
|
61
|
+
#
|
62
|
+
def create_content_fetcher( *args )
|
63
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
64
|
+
|
65
|
+
opts[ :types ] ||= args.shift
|
66
|
+
opts[ :filters ] ||= args.shift
|
67
|
+
opts[ :listener ] ||= args.shift
|
68
|
+
|
69
|
+
opts = { :types => :page_mime_types,
|
70
|
+
:listener => :main,
|
71
|
+
:client => :http_client,
|
72
|
+
:user_agent => :http_user_agent,
|
73
|
+
:visit_counter => :visit_counter,
|
74
|
+
:executor => :executor,
|
75
|
+
:request_headers => :http_request_headers
|
76
|
+
}.merge( opts )
|
77
|
+
|
78
|
+
cf = ContentFetcher.new( call_if( opts[ :client ] ),
|
79
|
+
call_if( opts[ :visit_counter ] ),
|
80
|
+
create_chain( opts ) )
|
81
|
+
|
82
|
+
cf.executor = call_if( opts[ :executor ] )
|
83
|
+
|
84
|
+
alist = accept_list( call_if( opts[ :types ] ) )
|
34
85
|
unless alist.include?( '*/*' )
|
35
86
|
cf.accepted_content_types = ContentTypeSet.new( alist )
|
36
87
|
end
|
37
88
|
|
38
|
-
|
39
|
-
[ 'Accept', accept_header( accept_types ) ] ]
|
40
|
-
|
41
|
-
cf.request_headers = headers.map { |kv| Header.new( *kv ) }
|
42
|
-
|
89
|
+
cf.request_headers = call_if( opts[ :request_headers ], opts )
|
43
90
|
cf
|
44
91
|
end
|
45
92
|
|
93
|
+
def http_request_headers( opts )
|
94
|
+
[ [ 'User-Agent', call_if( opts[ :user_agent ] ) ],
|
95
|
+
[ 'Accept', accept_header( call_if( opts[ :types ] ) ) ]
|
96
|
+
].map { |kv| Header.new( *kv ) }
|
97
|
+
end
|
98
|
+
|
46
99
|
def http_user_agent
|
47
100
|
( "Mozilla/5.0 (compatible; " +
|
48
101
|
"Iudex #{Iudex::Worker::VERSION}; " +
|
@@ -79,6 +132,16 @@ module Iudex
|
|
79
132
|
types.flatten
|
80
133
|
end
|
81
134
|
|
135
|
+
def call_if( v, *args )
|
136
|
+
if v.is_a?( Proc ) || v.is_a?( Method )
|
137
|
+
v.call( *args )
|
138
|
+
elsif v.is_a?( Symbol )
|
139
|
+
send( v, *args )
|
140
|
+
else
|
141
|
+
v
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
82
145
|
end
|
83
146
|
|
84
147
|
end
|
data/lib/iudex-worker/filter_chain_factory.rb
CHANGED
@@ -89,11 +89,13 @@ module Iudex
|
|
89
89
|
end
|
90
90
|
|
91
91
|
def feed_fetcher
|
92
|
-
[ create_content_fetcher(
|
92
|
+
[ create_content_fetcher( :types => :feed_mime_types,
|
93
|
+
:filters => :feed_receiver ) ]
|
93
94
|
end
|
94
95
|
|
95
96
|
def page_fetcher
|
96
|
-
[ create_content_fetcher(
|
97
|
+
[ create_content_fetcher( :types => :page_mime_types,
|
98
|
+
:filters => :page_receiver ) ]
|
97
99
|
end
|
98
100
|
|
99
101
|
def feed_receiver
|
@@ -106,8 +108,11 @@ module Iudex
|
|
106
108
|
end
|
107
109
|
|
108
110
|
def feed_updater
|
109
|
-
create_update_filter( feed_update_keys,
|
110
|
-
:
|
111
|
+
create_update_filter( :fields => feed_update_keys,
|
112
|
+
:on_content => :feed_post,
|
113
|
+
:on_referer => :feed_post,
|
114
|
+
:on_ref_update => :feed_ref_update,
|
115
|
+
:on_ref_new => :feed_ref_new )
|
111
116
|
end
|
112
117
|
|
113
118
|
def feed_ref_new
|
@@ -127,16 +132,6 @@ module Iudex
|
|
127
132
|
:min_next => 0.0 ) ]
|
128
133
|
end
|
129
134
|
|
130
|
-
# Filters to apply for feed update.
|
131
|
-
#
|
132
|
-
# Notes:
|
133
|
-
#
|
134
|
-
# * This is run possibly twice, for both base content map and
|
135
|
-
# referer map if present.
|
136
|
-
#
|
137
|
-
# * If this is an update then these filters act on a *new* map,
|
138
|
-
# thus any changes made here will not be visible after exit
|
139
|
-
# from the update_filter.
|
140
135
|
def feed_post
|
141
136
|
[ UHashMDCSetter.new,
|
142
137
|
ref_common_cleanup,
|
@@ -185,19 +180,11 @@ module Iudex
|
|
185
180
|
end
|
186
181
|
|
187
182
|
def page_updater
|
188
|
-
create_update_filter( page_update_keys,
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
# Notes:
|
194
|
-
#
|
195
|
-
# * This is run possibly twice, for both base content map and
|
196
|
-
# referer map if present.
|
197
|
-
#
|
198
|
-
# * If this is an update then these filters act on a *new* map,
|
199
|
-
# thus any changes made here will not be visible after exit
|
200
|
-
# from the update_filter.
|
183
|
+
create_update_filter( :fields => page_update_keys,
|
184
|
+
:on_content => :page_post,
|
185
|
+
:on_referer => :page_post )
|
186
|
+
end
|
187
|
+
|
201
188
|
def page_post
|
202
189
|
[ UHashMDCSetter.new,
|
203
190
|
barc_writer, # Not run in 302 referer case, since no SOURCE.
|
metadata
CHANGED
@@ -2,34 +2,28 @@
|
|
2
2
|
name: iudex-worker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.
|
5
|
+
version: 1.3.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: iudex-core
|
16
16
|
version_requirements: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: 1.2.1
|
21
|
-
- - <
|
18
|
+
- - ~>
|
22
19
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
20
|
+
version: 1.3.0
|
24
21
|
none: false
|
25
22
|
requirement: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 1.2.1
|
30
|
-
- - <
|
24
|
+
- - ~>
|
31
25
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
26
|
+
version: 1.3.0
|
33
27
|
none: false
|
34
28
|
prerelease: false
|
35
29
|
type: :runtime
|
@@ -53,21 +47,15 @@ dependencies:
|
|
53
47
|
name: iudex-rome
|
54
48
|
version_requirements: !ruby/object:Gem::Requirement
|
55
49
|
requirements:
|
56
|
-
- -
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
version: 1.2.1
|
59
|
-
- - <
|
50
|
+
- - ~>
|
60
51
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
52
|
+
version: 1.3.0
|
62
53
|
none: false
|
63
54
|
requirement: !ruby/object:Gem::Requirement
|
64
55
|
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: 1.2.1
|
68
|
-
- - <
|
56
|
+
- - ~>
|
69
57
|
- !ruby/object:Gem::Version
|
70
|
-
version:
|
58
|
+
version: 1.3.0
|
71
59
|
none: false
|
72
60
|
prerelease: false
|
73
61
|
type: :runtime
|
@@ -75,21 +63,15 @@ dependencies:
|
|
75
63
|
name: iudex-html
|
76
64
|
version_requirements: !ruby/object:Gem::Requirement
|
77
65
|
requirements:
|
78
|
-
- -
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: 1.2.1
|
81
|
-
- - <
|
66
|
+
- - ~>
|
82
67
|
- !ruby/object:Gem::Version
|
83
|
-
version:
|
68
|
+
version: 1.3.0
|
84
69
|
none: false
|
85
70
|
requirement: !ruby/object:Gem::Requirement
|
86
71
|
requirements:
|
87
|
-
- -
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: 1.2.1
|
90
|
-
- - <
|
72
|
+
- - ~>
|
91
73
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
74
|
+
version: 1.3.0
|
93
75
|
none: false
|
94
76
|
prerelease: false
|
95
77
|
type: :runtime
|
@@ -97,21 +79,15 @@ dependencies:
|
|
97
79
|
name: iudex-simhash
|
98
80
|
version_requirements: !ruby/object:Gem::Requirement
|
99
81
|
requirements:
|
100
|
-
- -
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
version: 1.2.1
|
103
|
-
- - <
|
82
|
+
- - ~>
|
104
83
|
- !ruby/object:Gem::Version
|
105
|
-
version:
|
84
|
+
version: 1.3.0
|
106
85
|
none: false
|
107
86
|
requirement: !ruby/object:Gem::Requirement
|
108
87
|
requirements:
|
109
|
-
- -
|
110
|
-
- !ruby/object:Gem::Version
|
111
|
-
version: 1.2.1
|
112
|
-
- - <
|
88
|
+
- - ~>
|
113
89
|
- !ruby/object:Gem::Version
|
114
|
-
version:
|
90
|
+
version: 1.3.0
|
115
91
|
none: false
|
116
92
|
prerelease: false
|
117
93
|
type: :runtime
|
@@ -119,21 +95,15 @@ dependencies:
|
|
119
95
|
name: iudex-char-detector
|
120
96
|
version_requirements: !ruby/object:Gem::Requirement
|
121
97
|
requirements:
|
122
|
-
- -
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: 1.2.1
|
125
|
-
- - <
|
98
|
+
- - ~>
|
126
99
|
- !ruby/object:Gem::Version
|
127
|
-
version:
|
100
|
+
version: 1.3.0
|
128
101
|
none: false
|
129
102
|
requirement: !ruby/object:Gem::Requirement
|
130
103
|
requirements:
|
131
|
-
- -
|
132
|
-
- !ruby/object:Gem::Version
|
133
|
-
version: 1.2.1
|
134
|
-
- - <
|
104
|
+
- - ~>
|
135
105
|
- !ruby/object:Gem::Version
|
136
|
-
version:
|
106
|
+
version: 1.3.0
|
137
107
|
none: false
|
138
108
|
prerelease: false
|
139
109
|
type: :runtime
|
@@ -143,13 +113,13 @@ dependencies:
|
|
143
113
|
requirements:
|
144
114
|
- - ~>
|
145
115
|
- !ruby/object:Gem::Version
|
146
|
-
version: '1.
|
116
|
+
version: '1.5'
|
147
117
|
none: false
|
148
118
|
requirement: !ruby/object:Gem::Requirement
|
149
119
|
requirements:
|
150
120
|
- - ~>
|
151
121
|
- !ruby/object:Gem::Version
|
152
|
-
version: '1.
|
122
|
+
version: '1.5'
|
153
123
|
none: false
|
154
124
|
prerelease: false
|
155
125
|
type: :runtime
|
@@ -173,21 +143,15 @@ dependencies:
|
|
173
143
|
name: iudex-httpclient-3
|
174
144
|
version_requirements: !ruby/object:Gem::Requirement
|
175
145
|
requirements:
|
176
|
-
- -
|
177
|
-
- !ruby/object:Gem::Version
|
178
|
-
version: 1.2.1
|
179
|
-
- - <
|
146
|
+
- - ~>
|
180
147
|
- !ruby/object:Gem::Version
|
181
|
-
version:
|
148
|
+
version: 1.3.0
|
182
149
|
none: false
|
183
150
|
requirement: !ruby/object:Gem::Requirement
|
184
151
|
requirements:
|
185
|
-
- -
|
186
|
-
- !ruby/object:Gem::Version
|
187
|
-
version: 1.2.1
|
188
|
-
- - <
|
152
|
+
- - ~>
|
189
153
|
- !ruby/object:Gem::Version
|
190
|
-
version:
|
154
|
+
version: 1.3.0
|
191
155
|
none: false
|
192
156
|
prerelease: false
|
193
157
|
type: :development
|
@@ -195,21 +159,15 @@ dependencies:
|
|
195
159
|
name: iudex-jetty-httpclient
|
196
160
|
version_requirements: !ruby/object:Gem::Requirement
|
197
161
|
requirements:
|
198
|
-
- -
|
199
|
-
- !ruby/object:Gem::Version
|
200
|
-
version: 1.2.1
|
201
|
-
- - <
|
162
|
+
- - ~>
|
202
163
|
- !ruby/object:Gem::Version
|
203
|
-
version:
|
164
|
+
version: 1.3.0
|
204
165
|
none: false
|
205
166
|
requirement: !ruby/object:Gem::Requirement
|
206
167
|
requirements:
|
207
|
-
- -
|
208
|
-
- !ruby/object:Gem::Version
|
209
|
-
version: 1.2.1
|
210
|
-
- - <
|
168
|
+
- - ~>
|
211
169
|
- !ruby/object:Gem::Version
|
212
|
-
version:
|
170
|
+
version: 1.3.0
|
213
171
|
none: false
|
214
172
|
prerelease: false
|
215
173
|
type: :development
|
@@ -217,21 +175,15 @@ dependencies:
|
|
217
175
|
name: iudex-async-httpclient
|
218
176
|
version_requirements: !ruby/object:Gem::Requirement
|
219
177
|
requirements:
|
220
|
-
- -
|
221
|
-
- !ruby/object:Gem::Version
|
222
|
-
version: 1.2.1
|
223
|
-
- - <
|
178
|
+
- - ~>
|
224
179
|
- !ruby/object:Gem::Version
|
225
|
-
version:
|
180
|
+
version: 1.3.0
|
226
181
|
none: false
|
227
182
|
requirement: !ruby/object:Gem::Requirement
|
228
183
|
requirements:
|
229
|
-
- -
|
230
|
-
- !ruby/object:Gem::Version
|
231
|
-
version: 1.2.1
|
232
|
-
- - <
|
184
|
+
- - ~>
|
233
185
|
- !ruby/object:Gem::Version
|
234
|
-
version:
|
186
|
+
version: 1.3.0
|
235
187
|
none: false
|
236
188
|
prerelease: false
|
237
189
|
type: :development
|