iudex-worker 1.3.1-java → 1.3.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc CHANGED
@@ -1,3 +1,13 @@
1
+ === 1.3.2 (2012-11-8)
2
+ * Upgrade/narrow to iudex-core, -da, -rome, -html, -simhash ~> 1.3.0
3
+ * FetchHelper.create_content_fetcher now prefers an options Hash,
4
+ exposing settings for :types, :client, :user_agent,
5
+ :request_headers, etc. and passing additional options to
6
+ create_chain (:filters, etc.)
7
+ * Various updates to FilterChainFactory and config sample using new
8
+ options, etc.
9
+ * Upgrade to logback ~> 1.5
10
+
1
11
  === 1.3.1 (2012-10-25)
2
12
  * Improve shutdown reliability by adding Agent.run_safe with ensure'd
3
13
  close calls.
data/config/config.rb CHANGED
@@ -22,10 +22,8 @@ Iudex.configure do |c|
22
22
  end
23
23
 
24
24
  c.setup_visit_queue do |q|
25
- q.default_min_host_delay = 100 #ms
26
- q.default_max_access_per_host = 1
27
-
28
- q.configure_host( "gravitext.com", 100, 2 ) # 100ms, 2 connections
25
+ q.config( :rate => 5.0, :cons => 1 )
26
+ q.config( :domain => "gravitext.com", :rate => 10.0, :cons => 2 )
29
27
  end
30
28
 
31
29
  c.setup_work_poller do |wp|
data/init/iudex-worker CHANGED
@@ -24,7 +24,7 @@
24
24
 
25
25
  require 'rubygems'
26
26
 
27
- gem( "iudex-worker", "= 1.3.1" )
27
+ gem( "iudex-worker", "= 1.3.2" )
28
28
 
29
29
  module IudexInitScript
30
30
 
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module Worker
19
- VERSION = '1.3.1'
19
+ VERSION = '1.3.2'
20
20
  end
21
21
  end
@@ -23,26 +23,79 @@ module Iudex
23
23
  include Iudex::HTTP
24
24
  include Iudex::Core::Filters
25
25
 
26
- def create_content_fetcher( accept_types, receiver, listener = nil )
27
- cf = ContentFetcher.new( http_client,
28
- visit_counter,
29
- create_chain( receiver, nil, listener ) )
30
-
31
- cf.executor = executor if executor
32
-
33
- alist = accept_list( accept_types )
26
+ # Create a ContentFetcher including a filter chain to receive
27
+ # the fetch result.
28
+ #
29
+ # === Options
30
+ #
31
+ # Options accept literal values, or a Proc, Method, or Symbol
32
+ # (sent to self), unless otherwise noted.
33
+ #
34
+ # :types:: An Array or table of MIME types, used for the Accept
35
+ # header in default :request_headers and to restrict
36
+ # returned results. (Default: #page_mime_types)
37
+ #
38
+ # :client:: The Java::iudex.http.HTTPClient implementation to
39
+ # use (Default: :http_client)
40
+ #
41
+ # :user_agent:: The HTTP User-Agent for default
42
+ # :request_headers. Procs will receive the
43
+ # options Hash as parameter (Default: #http_user_agent)
44
+ #
45
+ # :visit_counter:: The Java::iudex.core.VisitCounter
46
+ # implementation. (Default: :visit_counter)
47
+ #
48
+ # :executor:: The java.util.concurrent.Executor to use for
49
+ # running the receiver filter chain. (Default: :executor)
50
+ #
51
+ # :request_headers:: HTTP Request headers as Array<iudex.http.Header>
52
+ # (Default: #http_request_headers)
53
+ #
54
+ # All options (including the required :filters option, and
55
+ # :listener with default :main) are also passed to
56
+ # self.create_chain for creating the receiver filter chain.
57
+ #
58
+ # The positional parameters equivalent to ( :types, :filters,
59
+ # :listener ) as defined above are also supported, but
60
+ # deprecated.
61
+ #
62
+ def create_content_fetcher( *args )
63
+ opts = args.last.is_a?( Hash ) ? args.pop.dup : {}
64
+
65
+ opts[ :types ] ||= args.shift
66
+ opts[ :filters ] ||= args.shift
67
+ opts[ :listener ] ||= args.shift
68
+
69
+ opts = { :types => :page_mime_types,
70
+ :listener => :main,
71
+ :client => :http_client,
72
+ :user_agent => :http_user_agent,
73
+ :visit_counter => :visit_counter,
74
+ :executor => :executor,
75
+ :request_headers => :http_request_headers
76
+ }.merge( opts )
77
+
78
+ cf = ContentFetcher.new( call_if( opts[ :client ] ),
79
+ call_if( opts[ :visit_counter ] ),
80
+ create_chain( opts ) )
81
+
82
+ cf.executor = call_if( opts[ :executor ] )
83
+
84
+ alist = accept_list( call_if( opts[ :types ] ) )
34
85
  unless alist.include?( '*/*' )
35
86
  cf.accepted_content_types = ContentTypeSet.new( alist )
36
87
  end
37
88
 
38
- headers = [ [ 'User-Agent', http_user_agent ],
39
- [ 'Accept', accept_header( accept_types ) ] ]
40
-
41
- cf.request_headers = headers.map { |kv| Header.new( *kv ) }
42
-
89
+ cf.request_headers = call_if( opts[ :request_headers ], opts )
43
90
  cf
44
91
  end
45
92
 
93
+ def http_request_headers( opts )
94
+ [ [ 'User-Agent', call_if( opts[ :user_agent ] ) ],
95
+ [ 'Accept', accept_header( call_if( opts[ :types ] ) ) ]
96
+ ].map { |kv| Header.new( *kv ) }
97
+ end
98
+
46
99
  def http_user_agent
47
100
  ( "Mozilla/5.0 (compatible; " +
48
101
  "Iudex #{Iudex::Worker::VERSION}; " +
@@ -79,6 +132,16 @@ module Iudex
79
132
  types.flatten
80
133
  end
81
134
 
135
+ def call_if( v, *args )
136
+ if v.is_a?( Proc ) || v.is_a?( Method )
137
+ v.call( *args )
138
+ elsif v.is_a?( Symbol )
139
+ send( v, *args )
140
+ else
141
+ v
142
+ end
143
+ end
144
+
82
145
  end
83
146
 
84
147
  end
@@ -89,11 +89,13 @@ module Iudex
89
89
  end
90
90
 
91
91
  def feed_fetcher
92
- [ create_content_fetcher( feed_mime_types, :feed_receiver, :main ) ]
92
+ [ create_content_fetcher( :types => :feed_mime_types,
93
+ :filters => :feed_receiver ) ]
93
94
  end
94
95
 
95
96
  def page_fetcher
96
- [ create_content_fetcher( page_mime_types, :page_receiver, :main ) ]
97
+ [ create_content_fetcher( :types => :page_mime_types,
98
+ :filters => :page_receiver ) ]
97
99
  end
98
100
 
99
101
  def feed_receiver
@@ -106,8 +108,11 @@ module Iudex
106
108
  end
107
109
 
108
110
  def feed_updater
109
- create_update_filter( feed_update_keys,
110
- :feed_post, :feed_ref_update, :feed_ref_new )
111
+ create_update_filter( :fields => feed_update_keys,
112
+ :on_content => :feed_post,
113
+ :on_referer => :feed_post,
114
+ :on_ref_update => :feed_ref_update,
115
+ :on_ref_new => :feed_ref_new )
111
116
  end
112
117
 
113
118
  def feed_ref_new
@@ -127,16 +132,6 @@ module Iudex
127
132
  :min_next => 0.0 ) ]
128
133
  end
129
134
 
130
- # Filters to apply for feed update.
131
- #
132
- # Notes:
133
- #
134
- # * This is run possibly twice, for both base content map and
135
- # referer map if present.
136
- #
137
- # * If this is an update then these filters act on a *new* map,
138
- # thus any changes made here will not be visible after exit
139
- # from the update_filter.
140
135
  def feed_post
141
136
  [ UHashMDCSetter.new,
142
137
  ref_common_cleanup,
@@ -185,19 +180,11 @@ module Iudex
185
180
  end
186
181
 
187
182
  def page_updater
188
- create_update_filter( page_update_keys, :page_post )
189
- end
190
-
191
- # Filters to apply during page update
192
- #
193
- # Notes:
194
- #
195
- # * This is run possibly twice, for both base content map and
196
- # referer map if present.
197
- #
198
- # * If this is an update then these filters act on a *new* map,
199
- # thus any changes made here will not be visible after exit
200
- # from the update_filter.
183
+ create_update_filter( :fields => page_update_keys,
184
+ :on_content => :page_post,
185
+ :on_referer => :page_post )
186
+ end
187
+
201
188
  def page_post
202
189
  [ UHashMDCSetter.new,
203
190
  barc_writer, # Not run in 302 referer case, since no SOURCE.
metadata CHANGED
@@ -2,34 +2,28 @@
2
2
  name: iudex-worker
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.1
5
+ version: 1.3.2
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-25 00:00:00.000000000 Z
12
+ date: 2012-11-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
16
16
  version_requirements: !ruby/object:Gem::Requirement
17
17
  requirements:
18
- - - ! '>='
19
- - !ruby/object:Gem::Version
20
- version: 1.2.1
21
- - - <
18
+ - - ~>
22
19
  - !ruby/object:Gem::Version
23
- version: '1.4'
20
+ version: 1.3.0
24
21
  none: false
25
22
  requirement: !ruby/object:Gem::Requirement
26
23
  requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: 1.2.1
30
- - - <
24
+ - - ~>
31
25
  - !ruby/object:Gem::Version
32
- version: '1.4'
26
+ version: 1.3.0
33
27
  none: false
34
28
  prerelease: false
35
29
  type: :runtime
@@ -53,21 +47,15 @@ dependencies:
53
47
  name: iudex-rome
54
48
  version_requirements: !ruby/object:Gem::Requirement
55
49
  requirements:
56
- - - ! '>='
57
- - !ruby/object:Gem::Version
58
- version: 1.2.1
59
- - - <
50
+ - - ~>
60
51
  - !ruby/object:Gem::Version
61
- version: '1.4'
52
+ version: 1.3.0
62
53
  none: false
63
54
  requirement: !ruby/object:Gem::Requirement
64
55
  requirements:
65
- - - ! '>='
66
- - !ruby/object:Gem::Version
67
- version: 1.2.1
68
- - - <
56
+ - - ~>
69
57
  - !ruby/object:Gem::Version
70
- version: '1.4'
58
+ version: 1.3.0
71
59
  none: false
72
60
  prerelease: false
73
61
  type: :runtime
@@ -75,21 +63,15 @@ dependencies:
75
63
  name: iudex-html
76
64
  version_requirements: !ruby/object:Gem::Requirement
77
65
  requirements:
78
- - - ! '>='
79
- - !ruby/object:Gem::Version
80
- version: 1.2.1
81
- - - <
66
+ - - ~>
82
67
  - !ruby/object:Gem::Version
83
- version: '1.4'
68
+ version: 1.3.0
84
69
  none: false
85
70
  requirement: !ruby/object:Gem::Requirement
86
71
  requirements:
87
- - - ! '>='
88
- - !ruby/object:Gem::Version
89
- version: 1.2.1
90
- - - <
72
+ - - ~>
91
73
  - !ruby/object:Gem::Version
92
- version: '1.4'
74
+ version: 1.3.0
93
75
  none: false
94
76
  prerelease: false
95
77
  type: :runtime
@@ -97,21 +79,15 @@ dependencies:
97
79
  name: iudex-simhash
98
80
  version_requirements: !ruby/object:Gem::Requirement
99
81
  requirements:
100
- - - ! '>='
101
- - !ruby/object:Gem::Version
102
- version: 1.2.1
103
- - - <
82
+ - - ~>
104
83
  - !ruby/object:Gem::Version
105
- version: '1.4'
84
+ version: 1.3.0
106
85
  none: false
107
86
  requirement: !ruby/object:Gem::Requirement
108
87
  requirements:
109
- - - ! '>='
110
- - !ruby/object:Gem::Version
111
- version: 1.2.1
112
- - - <
88
+ - - ~>
113
89
  - !ruby/object:Gem::Version
114
- version: '1.4'
90
+ version: 1.3.0
115
91
  none: false
116
92
  prerelease: false
117
93
  type: :runtime
@@ -119,21 +95,15 @@ dependencies:
119
95
  name: iudex-char-detector
120
96
  version_requirements: !ruby/object:Gem::Requirement
121
97
  requirements:
122
- - - ! '>='
123
- - !ruby/object:Gem::Version
124
- version: 1.2.1
125
- - - <
98
+ - - ~>
126
99
  - !ruby/object:Gem::Version
127
- version: '1.4'
100
+ version: 1.3.0
128
101
  none: false
129
102
  requirement: !ruby/object:Gem::Requirement
130
103
  requirements:
131
- - - ! '>='
132
- - !ruby/object:Gem::Version
133
- version: 1.2.1
134
- - - <
104
+ - - ~>
135
105
  - !ruby/object:Gem::Version
136
- version: '1.4'
106
+ version: 1.3.0
137
107
  none: false
138
108
  prerelease: false
139
109
  type: :runtime
@@ -143,13 +113,13 @@ dependencies:
143
113
  requirements:
144
114
  - - ~>
145
115
  - !ruby/object:Gem::Version
146
- version: '1.2'
116
+ version: '1.5'
147
117
  none: false
148
118
  requirement: !ruby/object:Gem::Requirement
149
119
  requirements:
150
120
  - - ~>
151
121
  - !ruby/object:Gem::Version
152
- version: '1.2'
122
+ version: '1.5'
153
123
  none: false
154
124
  prerelease: false
155
125
  type: :runtime
@@ -173,21 +143,15 @@ dependencies:
173
143
  name: iudex-httpclient-3
174
144
  version_requirements: !ruby/object:Gem::Requirement
175
145
  requirements:
176
- - - ! '>='
177
- - !ruby/object:Gem::Version
178
- version: 1.2.1
179
- - - <
146
+ - - ~>
180
147
  - !ruby/object:Gem::Version
181
- version: '1.4'
148
+ version: 1.3.0
182
149
  none: false
183
150
  requirement: !ruby/object:Gem::Requirement
184
151
  requirements:
185
- - - ! '>='
186
- - !ruby/object:Gem::Version
187
- version: 1.2.1
188
- - - <
152
+ - - ~>
189
153
  - !ruby/object:Gem::Version
190
- version: '1.4'
154
+ version: 1.3.0
191
155
  none: false
192
156
  prerelease: false
193
157
  type: :development
@@ -195,21 +159,15 @@ dependencies:
195
159
  name: iudex-jetty-httpclient
196
160
  version_requirements: !ruby/object:Gem::Requirement
197
161
  requirements:
198
- - - ! '>='
199
- - !ruby/object:Gem::Version
200
- version: 1.2.1
201
- - - <
162
+ - - ~>
202
163
  - !ruby/object:Gem::Version
203
- version: '1.4'
164
+ version: 1.3.0
204
165
  none: false
205
166
  requirement: !ruby/object:Gem::Requirement
206
167
  requirements:
207
- - - ! '>='
208
- - !ruby/object:Gem::Version
209
- version: 1.2.1
210
- - - <
168
+ - - ~>
211
169
  - !ruby/object:Gem::Version
212
- version: '1.4'
170
+ version: 1.3.0
213
171
  none: false
214
172
  prerelease: false
215
173
  type: :development
@@ -217,21 +175,15 @@ dependencies:
217
175
  name: iudex-async-httpclient
218
176
  version_requirements: !ruby/object:Gem::Requirement
219
177
  requirements:
220
- - - ! '>='
221
- - !ruby/object:Gem::Version
222
- version: 1.2.1
223
- - - <
178
+ - - ~>
224
179
  - !ruby/object:Gem::Version
225
- version: '1.4'
180
+ version: 1.3.0
226
181
  none: false
227
182
  requirement: !ruby/object:Gem::Requirement
228
183
  requirements:
229
- - - ! '>='
230
- - !ruby/object:Gem::Version
231
- version: 1.2.1
232
- - - <
184
+ - - ~>
233
185
  - !ruby/object:Gem::Version
234
- version: '1.4'
186
+ version: 1.3.0
235
187
  none: false
236
188
  prerelease: false
237
189
  type: :development