iudex-worker 1.3.1-java → 1.3.2-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +10 -0
- data/config/config.rb +2 -4
- data/init/iudex-worker +1 -1
- data/lib/iudex-worker/base.rb +1 -1
- data/lib/iudex-worker/fetch_helper.rb +76 -13
- data/lib/iudex-worker/filter_chain_factory.rb +14 -27
- metadata +36 -84
data/History.rdoc
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 1.3.2 (2012-11-8)
|
2
|
+
* Upgrade/narrow to iudex-core, -da, -rome, -html, -simhash ~> 1.3.0
|
3
|
+
* FetchHelper.create_content_fetcher now prefers an options Hash,
|
4
|
+
exposing settings for :types, :client, :user_agent,
|
5
|
+
:request_headers, etc. and passing additional options to
|
6
|
+
create_chain (:filters, etc.)
|
7
|
+
* Various updates to FilterChainFactory and config sample using new
|
8
|
+
options, etc.
|
9
|
+
* Upgrade to logback ~> 1.5
|
10
|
+
|
1
11
|
=== 1.3.1 (2012-10-25)
|
2
12
|
* Improve shutdown reliability by adding Agent.run_safe with ensure'd
|
3
13
|
close calls.
|
data/config/config.rb
CHANGED
@@ -22,10 +22,8 @@ Iudex.configure do |c|
|
|
22
22
|
end
|
23
23
|
|
24
24
|
c.setup_visit_queue do |q|
|
25
|
-
q.
|
26
|
-
q.
|
27
|
-
|
28
|
-
q.configure_host( "gravitext.com", 100, 2 ) # 100ms, 2 connections
|
25
|
+
q.config( :rate => 5.0, :cons => 1 )
|
26
|
+
q.config( :domain => "gravitext.com", :rate => 10.0, :cons => 2 )
|
29
27
|
end
|
30
28
|
|
31
29
|
c.setup_work_poller do |wp|
|
data/init/iudex-worker
CHANGED
data/lib/iudex-worker/base.rb
CHANGED
@@ -23,26 +23,79 @@ module Iudex
|
|
23
23
|
include Iudex::HTTP
|
24
24
|
include Iudex::Core::Filters
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
26
|
+
# Create a ContentFetcher including a filter chain to receive
|
27
|
+
# the fetch result.
|
28
|
+
#
|
29
|
+
# === Options
|
30
|
+
#
|
31
|
+
# Options support literal values, or a Proc, Method, or a Symbol
|
32
|
+
# to send to self, unless otherwise noted.
|
33
|
+
#
|
34
|
+
# :types:: An Array or table of Mime types to use for the Accept header in
|
35
|
+
# default :request_headers and to restrict returned
|
36
|
+
# results on. (Default: #page_mime_types)
|
37
|
+
#
|
38
|
+
# :client:: The Java::iudex.http.HTTPClient implementation to
|
39
|
+
# use (Default: :http_client)
|
40
|
+
#
|
41
|
+
# :user_agent:: The HTTP User-Agent for default
|
42
|
+
# :request_headers. Procs will receive the
|
43
|
+
# options Hash as parameter (Default: #http_user_agent)
|
44
|
+
#
|
45
|
+
# :visit_counter:: The Java::iudex.core.VisitCounter
|
46
|
+
# implementation. (Default: :visit_counter)
|
47
|
+
#
|
48
|
+
# :executor:: The java.util.concurrent.Executor to use for
|
49
|
+
# running the receiver filter chain. (Default: :executor)
|
50
|
+
#
|
51
|
+
# :request_headers:: HTTP Request headers as Array<iudex.http.Header>
|
52
|
+
# (Default: #http_request_headers)
|
53
|
+
#
|
54
|
+
# All options (including the required :filters option, and
|
55
|
+
# :listener with default :main) are also passed to
|
56
|
+
# self.create_chain for creating the receiver filter chain
|
57
|
+
#
|
58
|
+
# The positional parameters equivalent to ( :types, :filters,
|
59
|
+
# :listener ) as defined above are also supported, but
|
60
|
+
# deprecated.
|
61
|
+
#
|
62
|
+
def create_content_fetcher( *args )
|
63
|
+
opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
|
64
|
+
|
65
|
+
opts[ :types ] ||= args.shift
|
66
|
+
opts[ :filters ] ||= args.shift
|
67
|
+
opts[ :listener ] ||= args.shift
|
68
|
+
|
69
|
+
opts = { :types => :page_mime_types,
|
70
|
+
:listener => :main,
|
71
|
+
:client => :http_client,
|
72
|
+
:user_agent => :http_user_agent,
|
73
|
+
:visit_counter => :visit_counter,
|
74
|
+
:executor => :executor,
|
75
|
+
:request_headers => :http_request_headers
|
76
|
+
}.merge( opts )
|
77
|
+
|
78
|
+
cf = ContentFetcher.new( call_if( opts[ :client ] ),
|
79
|
+
call_if( opts[ :visit_counter ] ),
|
80
|
+
create_chain( opts ) )
|
81
|
+
|
82
|
+
cf.executor = call_if( opts[ :executor ] )
|
83
|
+
|
84
|
+
alist = accept_list( call_if( opts[ :types ] ) )
|
34
85
|
unless alist.include?( '*/*' )
|
35
86
|
cf.accepted_content_types = ContentTypeSet.new( alist )
|
36
87
|
end
|
37
88
|
|
38
|
-
|
39
|
-
[ 'Accept', accept_header( accept_types ) ] ]
|
40
|
-
|
41
|
-
cf.request_headers = headers.map { |kv| Header.new( *kv ) }
|
42
|
-
|
89
|
+
cf.request_headers = call_if( opts[ :request_headers ], opts )
|
43
90
|
cf
|
44
91
|
end
|
45
92
|
|
93
|
+
def http_request_headers( opts )
|
94
|
+
[ [ 'User-Agent', call_if( opts[ :user_agent ] ) ],
|
95
|
+
[ 'Accept', accept_header( call_if( opts[ :types ] ) ) ]
|
96
|
+
].map { |kv| Header.new( *kv ) }
|
97
|
+
end
|
98
|
+
|
46
99
|
def http_user_agent
|
47
100
|
( "Mozilla/5.0 (compatible; " +
|
48
101
|
"Iudex #{Iudex::Worker::VERSION}; " +
|
@@ -79,6 +132,16 @@ module Iudex
|
|
79
132
|
types.flatten
|
80
133
|
end
|
81
134
|
|
135
|
+
def call_if( v, *args )
|
136
|
+
if v.is_a?( Proc ) || v.is_a?( Method )
|
137
|
+
v.call( *args )
|
138
|
+
elsif v.is_a?( Symbol )
|
139
|
+
send( v, *args )
|
140
|
+
else
|
141
|
+
v
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
82
145
|
end
|
83
146
|
|
84
147
|
end
|
@@ -89,11 +89,13 @@ module Iudex
|
|
89
89
|
end
|
90
90
|
|
91
91
|
def feed_fetcher
|
92
|
-
[ create_content_fetcher(
|
92
|
+
[ create_content_fetcher( :types => :feed_mime_types,
|
93
|
+
:filters => :feed_receiver ) ]
|
93
94
|
end
|
94
95
|
|
95
96
|
def page_fetcher
|
96
|
-
[ create_content_fetcher(
|
97
|
+
[ create_content_fetcher( :types => :page_mime_types,
|
98
|
+
:filters => :page_receiver ) ]
|
97
99
|
end
|
98
100
|
|
99
101
|
def feed_receiver
|
@@ -106,8 +108,11 @@ module Iudex
|
|
106
108
|
end
|
107
109
|
|
108
110
|
def feed_updater
|
109
|
-
create_update_filter( feed_update_keys,
|
110
|
-
:
|
111
|
+
create_update_filter( :fields => feed_update_keys,
|
112
|
+
:on_content => :feed_post,
|
113
|
+
:on_referer => :feed_post,
|
114
|
+
:on_ref_update => :feed_ref_update,
|
115
|
+
:on_ref_new => :feed_ref_new )
|
111
116
|
end
|
112
117
|
|
113
118
|
def feed_ref_new
|
@@ -127,16 +132,6 @@ module Iudex
|
|
127
132
|
:min_next => 0.0 ) ]
|
128
133
|
end
|
129
134
|
|
130
|
-
# Filters to apply for feed update.
|
131
|
-
#
|
132
|
-
# Notes:
|
133
|
-
#
|
134
|
-
# * This is run possibly twice, for both base content map and
|
135
|
-
# referer map if present.
|
136
|
-
#
|
137
|
-
# * If this is an update then these filters act on a *new* map,
|
138
|
-
# thus any changes made here will not be visible after exit
|
139
|
-
# from the update_filter.
|
140
135
|
def feed_post
|
141
136
|
[ UHashMDCSetter.new,
|
142
137
|
ref_common_cleanup,
|
@@ -185,19 +180,11 @@ module Iudex
|
|
185
180
|
end
|
186
181
|
|
187
182
|
def page_updater
|
188
|
-
create_update_filter( page_update_keys,
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
# Notes:
|
194
|
-
#
|
195
|
-
# * This is run possibly twice, for both base content map and
|
196
|
-
# referer map if present.
|
197
|
-
#
|
198
|
-
# * If this is an update then these filters act on a *new* map,
|
199
|
-
# thus any changes made here will not be visible after exit
|
200
|
-
# from the update_filter.
|
183
|
+
create_update_filter( :fields => page_update_keys,
|
184
|
+
:on_content => :page_post,
|
185
|
+
:on_referer => :page_post )
|
186
|
+
end
|
187
|
+
|
201
188
|
def page_post
|
202
189
|
[ UHashMDCSetter.new,
|
203
190
|
barc_writer, # Not run in 302 referer case, since no SOURCE.
|
metadata
CHANGED
@@ -2,34 +2,28 @@
|
|
2
2
|
name: iudex-worker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.
|
5
|
+
version: 1.3.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: iudex-core
|
16
16
|
version_requirements: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: 1.2.1
|
21
|
-
- - <
|
18
|
+
- - ~>
|
22
19
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
20
|
+
version: 1.3.0
|
24
21
|
none: false
|
25
22
|
requirement: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 1.2.1
|
30
|
-
- - <
|
24
|
+
- - ~>
|
31
25
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
26
|
+
version: 1.3.0
|
33
27
|
none: false
|
34
28
|
prerelease: false
|
35
29
|
type: :runtime
|
@@ -53,21 +47,15 @@ dependencies:
|
|
53
47
|
name: iudex-rome
|
54
48
|
version_requirements: !ruby/object:Gem::Requirement
|
55
49
|
requirements:
|
56
|
-
- -
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
version: 1.2.1
|
59
|
-
- - <
|
50
|
+
- - ~>
|
60
51
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
52
|
+
version: 1.3.0
|
62
53
|
none: false
|
63
54
|
requirement: !ruby/object:Gem::Requirement
|
64
55
|
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: 1.2.1
|
68
|
-
- - <
|
56
|
+
- - ~>
|
69
57
|
- !ruby/object:Gem::Version
|
70
|
-
version:
|
58
|
+
version: 1.3.0
|
71
59
|
none: false
|
72
60
|
prerelease: false
|
73
61
|
type: :runtime
|
@@ -75,21 +63,15 @@ dependencies:
|
|
75
63
|
name: iudex-html
|
76
64
|
version_requirements: !ruby/object:Gem::Requirement
|
77
65
|
requirements:
|
78
|
-
- -
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: 1.2.1
|
81
|
-
- - <
|
66
|
+
- - ~>
|
82
67
|
- !ruby/object:Gem::Version
|
83
|
-
version:
|
68
|
+
version: 1.3.0
|
84
69
|
none: false
|
85
70
|
requirement: !ruby/object:Gem::Requirement
|
86
71
|
requirements:
|
87
|
-
- -
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: 1.2.1
|
90
|
-
- - <
|
72
|
+
- - ~>
|
91
73
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
74
|
+
version: 1.3.0
|
93
75
|
none: false
|
94
76
|
prerelease: false
|
95
77
|
type: :runtime
|
@@ -97,21 +79,15 @@ dependencies:
|
|
97
79
|
name: iudex-simhash
|
98
80
|
version_requirements: !ruby/object:Gem::Requirement
|
99
81
|
requirements:
|
100
|
-
- -
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
version: 1.2.1
|
103
|
-
- - <
|
82
|
+
- - ~>
|
104
83
|
- !ruby/object:Gem::Version
|
105
|
-
version:
|
84
|
+
version: 1.3.0
|
106
85
|
none: false
|
107
86
|
requirement: !ruby/object:Gem::Requirement
|
108
87
|
requirements:
|
109
|
-
- -
|
110
|
-
- !ruby/object:Gem::Version
|
111
|
-
version: 1.2.1
|
112
|
-
- - <
|
88
|
+
- - ~>
|
113
89
|
- !ruby/object:Gem::Version
|
114
|
-
version:
|
90
|
+
version: 1.3.0
|
115
91
|
none: false
|
116
92
|
prerelease: false
|
117
93
|
type: :runtime
|
@@ -119,21 +95,15 @@ dependencies:
|
|
119
95
|
name: iudex-char-detector
|
120
96
|
version_requirements: !ruby/object:Gem::Requirement
|
121
97
|
requirements:
|
122
|
-
- -
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: 1.2.1
|
125
|
-
- - <
|
98
|
+
- - ~>
|
126
99
|
- !ruby/object:Gem::Version
|
127
|
-
version:
|
100
|
+
version: 1.3.0
|
128
101
|
none: false
|
129
102
|
requirement: !ruby/object:Gem::Requirement
|
130
103
|
requirements:
|
131
|
-
- -
|
132
|
-
- !ruby/object:Gem::Version
|
133
|
-
version: 1.2.1
|
134
|
-
- - <
|
104
|
+
- - ~>
|
135
105
|
- !ruby/object:Gem::Version
|
136
|
-
version:
|
106
|
+
version: 1.3.0
|
137
107
|
none: false
|
138
108
|
prerelease: false
|
139
109
|
type: :runtime
|
@@ -143,13 +113,13 @@ dependencies:
|
|
143
113
|
requirements:
|
144
114
|
- - ~>
|
145
115
|
- !ruby/object:Gem::Version
|
146
|
-
version: '1.
|
116
|
+
version: '1.5'
|
147
117
|
none: false
|
148
118
|
requirement: !ruby/object:Gem::Requirement
|
149
119
|
requirements:
|
150
120
|
- - ~>
|
151
121
|
- !ruby/object:Gem::Version
|
152
|
-
version: '1.
|
122
|
+
version: '1.5'
|
153
123
|
none: false
|
154
124
|
prerelease: false
|
155
125
|
type: :runtime
|
@@ -173,21 +143,15 @@ dependencies:
|
|
173
143
|
name: iudex-httpclient-3
|
174
144
|
version_requirements: !ruby/object:Gem::Requirement
|
175
145
|
requirements:
|
176
|
-
- -
|
177
|
-
- !ruby/object:Gem::Version
|
178
|
-
version: 1.2.1
|
179
|
-
- - <
|
146
|
+
- - ~>
|
180
147
|
- !ruby/object:Gem::Version
|
181
|
-
version:
|
148
|
+
version: 1.3.0
|
182
149
|
none: false
|
183
150
|
requirement: !ruby/object:Gem::Requirement
|
184
151
|
requirements:
|
185
|
-
- -
|
186
|
-
- !ruby/object:Gem::Version
|
187
|
-
version: 1.2.1
|
188
|
-
- - <
|
152
|
+
- - ~>
|
189
153
|
- !ruby/object:Gem::Version
|
190
|
-
version:
|
154
|
+
version: 1.3.0
|
191
155
|
none: false
|
192
156
|
prerelease: false
|
193
157
|
type: :development
|
@@ -195,21 +159,15 @@ dependencies:
|
|
195
159
|
name: iudex-jetty-httpclient
|
196
160
|
version_requirements: !ruby/object:Gem::Requirement
|
197
161
|
requirements:
|
198
|
-
- -
|
199
|
-
- !ruby/object:Gem::Version
|
200
|
-
version: 1.2.1
|
201
|
-
- - <
|
162
|
+
- - ~>
|
202
163
|
- !ruby/object:Gem::Version
|
203
|
-
version:
|
164
|
+
version: 1.3.0
|
204
165
|
none: false
|
205
166
|
requirement: !ruby/object:Gem::Requirement
|
206
167
|
requirements:
|
207
|
-
- -
|
208
|
-
- !ruby/object:Gem::Version
|
209
|
-
version: 1.2.1
|
210
|
-
- - <
|
168
|
+
- - ~>
|
211
169
|
- !ruby/object:Gem::Version
|
212
|
-
version:
|
170
|
+
version: 1.3.0
|
213
171
|
none: false
|
214
172
|
prerelease: false
|
215
173
|
type: :development
|
@@ -217,21 +175,15 @@ dependencies:
|
|
217
175
|
name: iudex-async-httpclient
|
218
176
|
version_requirements: !ruby/object:Gem::Requirement
|
219
177
|
requirements:
|
220
|
-
- -
|
221
|
-
- !ruby/object:Gem::Version
|
222
|
-
version: 1.2.1
|
223
|
-
- - <
|
178
|
+
- - ~>
|
224
179
|
- !ruby/object:Gem::Version
|
225
|
-
version:
|
180
|
+
version: 1.3.0
|
226
181
|
none: false
|
227
182
|
requirement: !ruby/object:Gem::Requirement
|
228
183
|
requirements:
|
229
|
-
- -
|
230
|
-
- !ruby/object:Gem::Version
|
231
|
-
version: 1.2.1
|
232
|
-
- - <
|
184
|
+
- - ~>
|
233
185
|
- !ruby/object:Gem::Version
|
234
|
-
version:
|
186
|
+
version: 1.3.0
|
235
187
|
none: false
|
236
188
|
prerelease: false
|
237
189
|
type: :development
|