iudex-worker 1.3.1-java → 1.3.2-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc CHANGED
@@ -1,3 +1,13 @@
1
+ === 1.3.2 (2012-11-8)
2
+ * Upgrade/narrow to iudex-core, -da, -rome, -html, -simhash ~> 1.3.0
3
+ * FetchHelper.create_content_fetcher now prefers an options Hash,
4
+ exposing settings for :types, :client, :user_agent,
5
+ :request_headers, etc. and passing additional options to
6
+ create_chain (:filters, etc.)
7
+ * Various updates to FilterChainFactory and config sample using new
8
+ options, etc.
9
+ * Upgrade to logback ~> 1.5
10
+
1
11
  === 1.3.1 (2012-10-25)
2
12
  * Improve shutdown reliability by adding Agent.run_safe with ensure'd
3
13
  close calls.
data/config/config.rb CHANGED
@@ -22,10 +22,8 @@ Iudex.configure do |c|
22
22
  end
23
23
 
24
24
  c.setup_visit_queue do |q|
25
- q.default_min_host_delay = 100 #ms
26
- q.default_max_access_per_host = 1
27
-
28
- q.configure_host( "gravitext.com", 100, 2 ) # 100ms, 2 connections
25
+ q.config( :rate => 5.0, :cons => 1 )
26
+ q.config( :domain => "gravitext.com", :rate => 10.0, :cons => 2 )
29
27
  end
30
28
 
31
29
  c.setup_work_poller do |wp|
data/init/iudex-worker CHANGED
@@ -24,7 +24,7 @@
24
24
 
25
25
  require 'rubygems'
26
26
 
27
- gem( "iudex-worker", "= 1.3.1" )
27
+ gem( "iudex-worker", "= 1.3.2" )
28
28
 
29
29
  module IudexInitScript
30
30
 
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module Worker
19
- VERSION = '1.3.1'
19
+ VERSION = '1.3.2'
20
20
  end
21
21
  end
@@ -23,26 +23,79 @@ module Iudex
23
23
  include Iudex::HTTP
24
24
  include Iudex::Core::Filters
25
25
 
26
- def create_content_fetcher( accept_types, receiver, listener = nil )
27
- cf = ContentFetcher.new( http_client,
28
- visit_counter,
29
- create_chain( receiver, nil, listener ) )
30
-
31
- cf.executor = executor if executor
32
-
33
- alist = accept_list( accept_types )
26
+ # Create a ContentFetcher including a filter chain to receive
27
+ # the fetch result.
28
+ #
29
+ # === Options
30
+ #
31
+ # Options support literal values, or a Proc, Method, or a Symbol
32
+ # to self send unless otherwise noted.
33
+ #
34
+ # :types:: An Array or table of MIME types to use for the Accept header in
35
+ # default :request_headers and to restrict returned
36
+ # results on. (Default: #page_mime_types)
37
+ #
38
+ # :client:: The Java::iudex.http.HTTPClient implementation to
39
+ # use (Default: :http_client)
40
+ #
41
+ # :user_agent:: The HTTP User-Agent for default
42
+ # :request_headers. Procs will receive the
43
+ # options Hash as parameter (Default: #http_user_agent)
44
+ #
45
+ # :visit_counter:: The Java::iudex.core.VisitCounter
46
+ # implementation. (Default: :visit_counter)
47
+ #
48
+ # :executor:: The java.util.concurrent.Executor to use for
49
+ # running the receiver filter chain. (Default: :executor)
50
+ #
51
+ # :request_headers:: HTTP Request headers as Array<iudex.http.Header>
52
+ # (Default: #http_request_headers) A Proc, Method, or Symbol value is invoked with the options Hash.
53
+ #
54
+ # All options (including the required :filters option, and
55
+ # :listener with default :main) are also passed to
56
+ # self.create_chain for creating the receiver filter chain
57
+ #
58
+ # The positional parameters equivalent to ( :types, :filters,
59
+ # :listener ) as defined above are also supported, but
60
+ # deprecated.
61
+ #
62
+ def create_content_fetcher( *args )
63
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
64
+
65
+ opts[ :types ] ||= args.shift
66
+ opts[ :filters ] ||= args.shift
67
+ opts[ :listener ] ||= args.shift
68
+
69
+ opts = { :types => :page_mime_types,
70
+ :listener => :main,
71
+ :client => :http_client,
72
+ :user_agent => :http_user_agent,
73
+ :visit_counter => :visit_counter,
74
+ :executor => :executor,
75
+ :request_headers => :http_request_headers
76
+ }.merge( opts )
77
+
78
+ cf = ContentFetcher.new( call_if( opts[ :client ] ),
79
+ call_if( opts[ :visit_counter ] ),
80
+ create_chain( opts ) )
81
+
82
+ cf.executor = call_if( opts[ :executor ] )
83
+
84
+ alist = accept_list( call_if( opts[ :types ] ) )
34
85
  unless alist.include?( '*/*' )
35
86
  cf.accepted_content_types = ContentTypeSet.new( alist )
36
87
  end
37
88
 
38
- headers = [ [ 'User-Agent', http_user_agent ],
39
- [ 'Accept', accept_header( accept_types ) ] ]
40
-
41
- cf.request_headers = headers.map { |kv| Header.new( *kv ) }
42
-
89
+ cf.request_headers = call_if( opts[ :request_headers ], opts )
43
90
  cf
44
91
  end
45
92
 
93
+ def http_request_headers( opts )
94
+ [ [ 'User-Agent', call_if( opts[ :user_agent ] ) ],
95
+ [ 'Accept', accept_header( call_if( opts[ :types ] ) ) ]
96
+ ].map { |kv| Header.new( *kv ) }
97
+ end
98
+
46
99
  def http_user_agent
47
100
  ( "Mozilla/5.0 (compatible; " +
48
101
  "Iudex #{Iudex::Worker::VERSION}; " +
@@ -79,6 +132,16 @@ module Iudex
79
132
  types.flatten
80
133
  end
81
134
 
135
+ def call_if( v, *args )
136
+ if v.is_a?( Proc ) || v.is_a?( Method )
137
+ v.call( *args )
138
+ elsif v.is_a?( Symbol )
139
+ send( v, *args )
140
+ else
141
+ v
142
+ end
143
+ end
144
+
82
145
  end
83
146
 
84
147
  end
@@ -89,11 +89,13 @@ module Iudex
89
89
  end
90
90
 
91
91
  def feed_fetcher
92
- [ create_content_fetcher( feed_mime_types, :feed_receiver, :main ) ]
92
+ [ create_content_fetcher( :types => :feed_mime_types,
93
+ :filters => :feed_receiver ) ]
93
94
  end
94
95
 
95
96
  def page_fetcher
96
- [ create_content_fetcher( page_mime_types, :page_receiver, :main ) ]
97
+ [ create_content_fetcher( :types => :page_mime_types,
98
+ :filters => :page_receiver ) ]
97
99
  end
98
100
 
99
101
  def feed_receiver
@@ -106,8 +108,11 @@ module Iudex
106
108
  end
107
109
 
108
110
  def feed_updater
109
- create_update_filter( feed_update_keys,
110
- :feed_post, :feed_ref_update, :feed_ref_new )
111
+ create_update_filter( :fields => feed_update_keys,
112
+ :on_content => :feed_post,
113
+ :on_referer => :feed_post,
114
+ :on_ref_update => :feed_ref_update,
115
+ :on_ref_new => :feed_ref_new )
111
116
  end
112
117
 
113
118
  def feed_ref_new
@@ -127,16 +132,6 @@ module Iudex
127
132
  :min_next => 0.0 ) ]
128
133
  end
129
134
 
130
- # Filters to apply for feed update.
131
- #
132
- # Notes:
133
- #
134
- # * This is run possibly twice, for both base content map and
135
- # referer map if present.
136
- #
137
- # * If this is an update then these filters act on a *new* map,
138
- # thus any changes made here will not be visible after exit
139
- # from the update_filter.
140
135
  def feed_post
141
136
  [ UHashMDCSetter.new,
142
137
  ref_common_cleanup,
@@ -185,19 +180,11 @@ module Iudex
185
180
  end
186
181
 
187
182
  def page_updater
188
- create_update_filter( page_update_keys, :page_post )
189
- end
190
-
191
- # Filters to apply during page update
192
- #
193
- # Notes:
194
- #
195
- # * This is run possibly twice, for both base content map and
196
- # referer map if present.
197
- #
198
- # * If this is an update then these filters act on a *new* map,
199
- # thus any changes made here will not be visible after exit
200
- # from the update_filter.
183
+ create_update_filter( :fields => page_update_keys,
184
+ :on_content => :page_post,
185
+ :on_referer => :page_post )
186
+ end
187
+
201
188
  def page_post
202
189
  [ UHashMDCSetter.new,
203
190
  barc_writer, # Not run in 302 referer case, since no SOURCE.
metadata CHANGED
@@ -2,34 +2,28 @@
2
2
  name: iudex-worker
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.1
5
+ version: 1.3.2
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-25 00:00:00.000000000 Z
12
+ date: 2012-11-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
16
16
  version_requirements: !ruby/object:Gem::Requirement
17
17
  requirements:
18
- - - ! '>='
19
- - !ruby/object:Gem::Version
20
- version: 1.2.1
21
- - - <
18
+ - - ~>
22
19
  - !ruby/object:Gem::Version
23
- version: '1.4'
20
+ version: 1.3.0
24
21
  none: false
25
22
  requirement: !ruby/object:Gem::Requirement
26
23
  requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: 1.2.1
30
- - - <
24
+ - - ~>
31
25
  - !ruby/object:Gem::Version
32
- version: '1.4'
26
+ version: 1.3.0
33
27
  none: false
34
28
  prerelease: false
35
29
  type: :runtime
@@ -53,21 +47,15 @@ dependencies:
53
47
  name: iudex-rome
54
48
  version_requirements: !ruby/object:Gem::Requirement
55
49
  requirements:
56
- - - ! '>='
57
- - !ruby/object:Gem::Version
58
- version: 1.2.1
59
- - - <
50
+ - - ~>
60
51
  - !ruby/object:Gem::Version
61
- version: '1.4'
52
+ version: 1.3.0
62
53
  none: false
63
54
  requirement: !ruby/object:Gem::Requirement
64
55
  requirements:
65
- - - ! '>='
66
- - !ruby/object:Gem::Version
67
- version: 1.2.1
68
- - - <
56
+ - - ~>
69
57
  - !ruby/object:Gem::Version
70
- version: '1.4'
58
+ version: 1.3.0
71
59
  none: false
72
60
  prerelease: false
73
61
  type: :runtime
@@ -75,21 +63,15 @@ dependencies:
75
63
  name: iudex-html
76
64
  version_requirements: !ruby/object:Gem::Requirement
77
65
  requirements:
78
- - - ! '>='
79
- - !ruby/object:Gem::Version
80
- version: 1.2.1
81
- - - <
66
+ - - ~>
82
67
  - !ruby/object:Gem::Version
83
- version: '1.4'
68
+ version: 1.3.0
84
69
  none: false
85
70
  requirement: !ruby/object:Gem::Requirement
86
71
  requirements:
87
- - - ! '>='
88
- - !ruby/object:Gem::Version
89
- version: 1.2.1
90
- - - <
72
+ - - ~>
91
73
  - !ruby/object:Gem::Version
92
- version: '1.4'
74
+ version: 1.3.0
93
75
  none: false
94
76
  prerelease: false
95
77
  type: :runtime
@@ -97,21 +79,15 @@ dependencies:
97
79
  name: iudex-simhash
98
80
  version_requirements: !ruby/object:Gem::Requirement
99
81
  requirements:
100
- - - ! '>='
101
- - !ruby/object:Gem::Version
102
- version: 1.2.1
103
- - - <
82
+ - - ~>
104
83
  - !ruby/object:Gem::Version
105
- version: '1.4'
84
+ version: 1.3.0
106
85
  none: false
107
86
  requirement: !ruby/object:Gem::Requirement
108
87
  requirements:
109
- - - ! '>='
110
- - !ruby/object:Gem::Version
111
- version: 1.2.1
112
- - - <
88
+ - - ~>
113
89
  - !ruby/object:Gem::Version
114
- version: '1.4'
90
+ version: 1.3.0
115
91
  none: false
116
92
  prerelease: false
117
93
  type: :runtime
@@ -119,21 +95,15 @@ dependencies:
119
95
  name: iudex-char-detector
120
96
  version_requirements: !ruby/object:Gem::Requirement
121
97
  requirements:
122
- - - ! '>='
123
- - !ruby/object:Gem::Version
124
- version: 1.2.1
125
- - - <
98
+ - - ~>
126
99
  - !ruby/object:Gem::Version
127
- version: '1.4'
100
+ version: 1.3.0
128
101
  none: false
129
102
  requirement: !ruby/object:Gem::Requirement
130
103
  requirements:
131
- - - ! '>='
132
- - !ruby/object:Gem::Version
133
- version: 1.2.1
134
- - - <
104
+ - - ~>
135
105
  - !ruby/object:Gem::Version
136
- version: '1.4'
106
+ version: 1.3.0
137
107
  none: false
138
108
  prerelease: false
139
109
  type: :runtime
@@ -143,13 +113,13 @@ dependencies:
143
113
  requirements:
144
114
  - - ~>
145
115
  - !ruby/object:Gem::Version
146
- version: '1.2'
116
+ version: '1.5'
147
117
  none: false
148
118
  requirement: !ruby/object:Gem::Requirement
149
119
  requirements:
150
120
  - - ~>
151
121
  - !ruby/object:Gem::Version
152
- version: '1.2'
122
+ version: '1.5'
153
123
  none: false
154
124
  prerelease: false
155
125
  type: :runtime
@@ -173,21 +143,15 @@ dependencies:
173
143
  name: iudex-httpclient-3
174
144
  version_requirements: !ruby/object:Gem::Requirement
175
145
  requirements:
176
- - - ! '>='
177
- - !ruby/object:Gem::Version
178
- version: 1.2.1
179
- - - <
146
+ - - ~>
180
147
  - !ruby/object:Gem::Version
181
- version: '1.4'
148
+ version: 1.3.0
182
149
  none: false
183
150
  requirement: !ruby/object:Gem::Requirement
184
151
  requirements:
185
- - - ! '>='
186
- - !ruby/object:Gem::Version
187
- version: 1.2.1
188
- - - <
152
+ - - ~>
189
153
  - !ruby/object:Gem::Version
190
- version: '1.4'
154
+ version: 1.3.0
191
155
  none: false
192
156
  prerelease: false
193
157
  type: :development
@@ -195,21 +159,15 @@ dependencies:
195
159
  name: iudex-jetty-httpclient
196
160
  version_requirements: !ruby/object:Gem::Requirement
197
161
  requirements:
198
- - - ! '>='
199
- - !ruby/object:Gem::Version
200
- version: 1.2.1
201
- - - <
162
+ - - ~>
202
163
  - !ruby/object:Gem::Version
203
- version: '1.4'
164
+ version: 1.3.0
204
165
  none: false
205
166
  requirement: !ruby/object:Gem::Requirement
206
167
  requirements:
207
- - - ! '>='
208
- - !ruby/object:Gem::Version
209
- version: 1.2.1
210
- - - <
168
+ - - ~>
211
169
  - !ruby/object:Gem::Version
212
- version: '1.4'
170
+ version: 1.3.0
213
171
  none: false
214
172
  prerelease: false
215
173
  type: :development
@@ -217,21 +175,15 @@ dependencies:
217
175
  name: iudex-async-httpclient
218
176
  version_requirements: !ruby/object:Gem::Requirement
219
177
  requirements:
220
- - - ! '>='
221
- - !ruby/object:Gem::Version
222
- version: 1.2.1
223
- - - <
178
+ - - ~>
224
179
  - !ruby/object:Gem::Version
225
- version: '1.4'
180
+ version: 1.3.0
226
181
  none: false
227
182
  requirement: !ruby/object:Gem::Requirement
228
183
  requirements:
229
- - - ! '>='
230
- - !ruby/object:Gem::Version
231
- version: 1.2.1
232
- - - <
184
+ - - ~>
233
185
  - !ruby/object:Gem::Version
234
- version: '1.4'
186
+ version: 1.3.0
235
187
  none: false
236
188
  prerelease: false
237
189
  type: :development