pioneer 0.0.1.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/.travis.yml +2 -0
- data/CHANGELOG +11 -0
- data/Gemfile +6 -0
- data/LICENSE +1 -0
- data/README.md +83 -0
- data/Rakefile +11 -0
- data/lib/patch/fiber_periodic_timer_iterator.rb +39 -0
- data/lib/patch/iterator.rb +314 -0
- data/lib/pioneer.rb +16 -0
- data/lib/pioneer/base.rb +83 -0
- data/lib/pioneer/crawler.rb +12 -0
- data/lib/pioneer/http_header.rb +273 -0
- data/lib/pioneer/request.rb +65 -0
- data/lib/pioneer/version.rb +3 -0
- data/pioneer.gemspec +24 -0
- data/spec/pioneer/base_spec.rb +5 -0
- data/spec/pioneer/request_spec.rb +66 -0
- data/spec/spec_helper.rb +51 -0
- metadata +88 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CHANGELOG
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
This is the Licence. Isn't it?
|
data/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# Pioneer
|
2
|
+
|
3
|
+
Pioneer is a simple async HTTP crawler based on em-synchrony
|
4
|
+
|
5
|
+
And it is very alpha right now.
|
6
|
+
|
7
|
+
# Install
|
8
|
+
|
9
|
+
```bash
|
10
|
+
gem install pioneer
|
11
|
+
```
|
12
|
+
|
13
|
+
# Usage
|
14
|
+
|
15
|
+
To use `Pioneer` you should specify a class with two methods: `locations` and `processing(req)`.
|
16
|
+
|
17
|
+
The first one should return an enumerable object and the second one will accept a request object.
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class Crawler < Pioneer::Base
|
21
|
+
def locations
|
22
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
23
|
+
end
|
24
|
+
|
25
|
+
def processing(req)
|
26
|
+
File.open(req.url, "w+") do |f|
|
27
|
+
f << req.response.response
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
Crawler.new.start
|
33
|
+
```
|
34
|
+
|
35
|
+
In this example we are saving two files with html of those two sites.
|
36
|
+
|
37
|
+
The `start` method will start iterating over the urls and return an Array of what the `processing` method returns.
|
38
|
+
|
39
|
+
# Handling request, response errors and statuses
|
40
|
+
|
41
|
+
In case of a request or response error, `Pioneer` will raise an exception. Alternatively, you can handle these errors yourself this way:
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
class Crawler < Pioneer::Base
|
45
|
+
def locations
|
46
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
47
|
+
end
|
48
|
+
|
49
|
+
def processing(req)
|
50
|
+
File.open(req.url, "w+") do |f|
|
51
|
+
f << req.response.response
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def if_request_error(req)
|
56
|
+
puts "Request error: #{req.error}"
|
57
|
+
end
|
58
|
+
|
59
|
+
def if_response_error(req)
|
60
|
+
puts "Response error: #{req.response.error}"
|
61
|
+
end
|
62
|
+
|
63
|
+
def if_status_203(req)
|
64
|
+
puts "He is trying to redirect me"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
```
|
68
|
+
|
69
|
+
You can also write `if_status_not_200` to handle all statuses other than 200, or `if_status_XXX` for any status you want.
|
70
|
+
|
71
|
+
# Overriding behavior
|
72
|
+
|
73
|
+
You can override all methods on the fly:
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
crawler = Pioneer::Crawler.new # base simple crawler
|
77
|
+
crawler.locations = [url1, url2]
|
78
|
+
crawler.processing = proc{ |req| req.response.response_header.status }
|
79
|
+
crawler.if_status_404 = proc{ |req| "Oups" }
|
80
|
+
```
|
81
|
+
|
82
|
+
|
83
|
+
... to be continued
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
module EventMachine
  module Synchrony

    # Iterator that runs every item inside its own Fiber and keeps the start
    # of consecutive items at least `timeout` seconds apart.
    class FiberPeriodicTimerIterator < EM::Synchrony::Iterator

      # Set the timeout and the start point.
      # Each Fiber will be started no more often than once per timeout.
      #
      # list        - collection handed to the parent iterator
      # concurrency - concurrency handed to the parent iterator
      # timeout     - minimal delay (seconds) between two starts; 0 disables it
      def initialize(list, concurrency=1, timeout=0)
        @timeout = timeout
        @next_start = Time.now
        super list, concurrency
      end

      # Execute each iterator block within its own fiber at a particular time
      # offset and auto-advance the iterator after each call.
      def each(foreach=nil, after=nil, &blk)
        fe = Proc.new do |obj, iter|
          Fiber.new do
            # throttle before doing the actual work (see #sleep below)
            sleep
            (foreach || blk).call(obj); iter.next
          end.resume
        end
        super(fe, after)
      end

      # Sleep if the last request was started recently (less than the timeout
      # period ago). Does nothing when the timeout is not positive.
      def sleep
        return unless @timeout > 0

        # Take a single timestamp so the computed delay and the next start
        # slot are based on the same instant.
        now = Time.now
        sleep_time = @next_start - now
        sleep_time = 0 if sleep_time < 0
        # Reserve the next slot before yielding so concurrent fibers queue up
        # behind this one.
        @next_start = now + sleep_time + @timeout
        EM::Synchrony.sleep(sleep_time) if sleep_time > 0
      end

    end
  end
end
|
@@ -0,0 +1,314 @@
|
|
1
|
+
module EventMachine
  # A simple iterator for concurrent asynchronous work.
  #
  # Unlike ruby's built-in iterators, the end of the current iteration cycle is signaled manually,
  # instead of happening automatically after the yielded block finishes executing. For example:
  #
  #   (0..10).each{ |num| }
  #
  # becomes:
  #
  #   EM::Iterator.new(0..10).each{ |num,iter| iter.next }
  #
  # This is especially useful when doing asynchronous work via reactor libraries and
  # functions. For example, given a sync and async http api:
  #
  #   response = sync_http_get(url); ...
  #   async_http_get(url){ |response| ... }
  #
  # a synchronous iterator such as:
  #
  #   responses = urls.map{ |url| sync_http_get(url) }
  #   ...
  #   puts 'all done!'
  #
  # could be written as:
  #
  #   EM::Iterator.new(urls).map(proc{ |url,iter|
  #     async_http_get(url){ |res|
  #       iter.return(res)
  #     }
  #   }, proc{ |responses|
  #     ...
  #     puts 'all done!'
  #   })
  #
  # Now, you can take advantage of the asynchronous api to issue requests in parallel. For example,
  # to fetch 10 urls at a time, simply pass in a concurrency of 10:
  #
  #   EM::Iterator.new(urls, 10).each do |url,iter|
  #     async_http_get(url){ iter.next }
  #   end
  #

  # Support for Enumerable in Ruby 1.9+
  module IteratorWithEnumerable
    # Turn the given collection into an Enumerator.
    # Raises ArgumentError when the argument does not respond to #each.
    def setup_list(list)
      raise ArgumentError, 'argument must be an Enumerable' unless list.respond_to?(:each)
      list.to_enum
    end

    # The item fetched by the most recent successful call to #next?.
    def next_item
      @next_item
    end

    # We can't check just next_item as far as it can return nil in two cases:
    # when our enumerator is stopped and when it stores nil value.
    #
    # NOTE: every call advances the enumerator and stores the fetched value
    # for #next_item. Any error other than StopIteration propagates.
    def next?
      @next_item = @list.next
      true
    rescue StopIteration
      false
    end
  end

  # Ruby 1.8 uses continuations in Enumerable, so we should use Arrays
  module IteratorWithArray
    # Copy the given collection into an Array.
    # Raises ArgumentError when the argument does not respond to #to_a.
    def setup_list(list)
      raise ArgumentError, 'argument must be an array' unless list.respond_to?(:to_a)
      list.dup.to_a
    end

    # Remove and return the first remaining item.
    def next_item
      @list.shift
    end

    # True while there are items left. Uses empty? rather than any? because
    # the list may legitimately contain nil or false items.
    def next?
      !@list.empty?
    end
  end

  class Iterator
    if defined?(Fiber)
      include IteratorWithEnumerable
    else
      include IteratorWithArray
    end

    # Create a new parallel async iterator with specified concurrency.
    #
    #   i = EM::Iterator.new(1..100, 10)
    #
    # will create an iterator over the range that processes 10 items at a time. Iteration
    # is started via #each, #map or #inject
    #
    def initialize(list, concurrency = 1)
      @list = setup_list(list)
      @concurrency = concurrency

      @started = false
      @ended = false
    end

    # Change the concurrency of this iterator. Workers will automatically be spawned or destroyed
    # to accommodate the new concurrency level.
    #
    def concurrency=(val)
      old = @concurrency
      @concurrency = val

      # extra workers are only needed when the limit grew during a running iteration;
      # surplus workers retire themselves inside @process_next
      spawn_workers if val > old && @started && !@ended
    end
    attr_reader :concurrency

    # Iterate over a set of items using the specified block or proc.
    #
    #   EM::Iterator.new(1..100).each do |num, iter|
    #     puts num
    #     iter.next
    #   end
    #
    # An optional second proc is invoked after the iteration is complete.
    #
    #   EM::Iterator.new(1..100).each(
    #     proc{ |num,iter| iter.next },
    #     proc{ puts 'all done' }
    #   )
    #
    # Raises ArgumentError without a proc/block and RuntimeError when the
    # iterator has already been started. Returns self.
    #
    def each(foreach=nil, after=nil, &blk)
      raise ArgumentError, 'proc or block required for iteration' unless foreach ||= blk
      raise RuntimeError, 'cannot iterate over an iterator more than once' if @started || @ended

      @started = true
      @pending = 0
      @workers = 0

      # fires the completion callback once the list is exhausted and no item is in flight
      all_done = proc{
        after.call if after && @ended && @pending == 0
      }

      @process_next = proc{
        # p [:process_next, :pending=, @pending, :workers=, @workers, :ended=, @ended, :concurrency=, @concurrency, :list=, @list]
        if @ended || @workers > @concurrency
          # iteration finished or concurrency was lowered: this worker retires
          @workers -= 1
        elsif next?
          item = next_item
          @pending += 1

          is_done = false
          on_done = proc{
            raise RuntimeError, 'already completed this iteration' if is_done
            is_done = true

            @pending -= 1

            if @ended
              all_done.call
            else
              EM.next_tick(@process_next)
            end
          }
          # expose the completion proc to the caller as `iter.next`
          class << on_done
            alias :next :call
          end

          foreach.call(item, on_done)
        else
          @ended = true
          @workers -= 1
          all_done.call
        end
      }

      spawn_workers

      self
    end

    # Collect the results of an asynchronous iteration into an array.
    #
    #   EM::Iterator.new(%w[ pwd uptime uname date ], 2).map(proc{ |cmd,iter|
    #     EM.system(cmd){ |output,status|
    #       iter.return(output)
    #     }
    #   }, proc{ |results|
    #     p results
    #   })
    #
    def map(foreach, after)
      index = 0

      inject([], proc{ |results,item,iter|
        # remember this item's position so results keep the input order
        i = index
        index += 1

        is_done = false
        on_done = proc{ |res|
          raise RuntimeError, 'already returned a value for this iteration' if is_done
          is_done = true

          results[i] = res
          iter.return(results)
        }
        class << on_done
          alias :return :call
          def next
            raise NoMethodError, 'must call #return on a map iterator'
          end
        end

        foreach.call(item, on_done)
      }, proc{ |results|
        after.call(results)
      })
    end

    # Inject the results of an asynchronous iteration onto a given object.
    #
    #   EM::Iterator.new(%w[ pwd uptime uname date ], 2).inject({}, proc{ |hash,cmd,iter|
    #     EM.system(cmd){ |output,status|
    #       hash[cmd] = status.exitstatus == 0 ? output.strip : nil
    #       iter.return(hash)
    #     }
    #   }, proc{ |results|
    #     p results
    #   })
    #
    def inject(obj, foreach, after)
      each(proc{ |item,iter|
        is_done = false
        on_done = proc{ |res|
          raise RuntimeError, 'already returned a value for this iteration' if is_done
          is_done = true

          # the returned value becomes the accumulator for the next item
          obj = res
          iter.next
        }
        class << on_done
          alias :return :call
          def next
            raise NoMethodError, 'must call #return on an inject iterator'
          end
        end

        foreach.call(obj, item, on_done)
      }, proc{
        after.call(obj)
      })
    end

    private

    # Spawn workers to consume items from the iterator's enumerator based on the current concurrency level.
    #
    def spawn_workers
      # one worker is started per reactor tick until the concurrency level is reached
      EM.next_tick(start_worker = proc{
        if @workers < @concurrency && !@ended
          # p [:spawning_worker, :workers=, @workers, :concurrency=, @concurrency, :ended=, @ended]
          @workers += 1
          @process_next.call
          EM.next_tick(start_worker)
        end
      })
      nil
    end
  end
end
|
270
|
+
|
271
|
+
# Manual smoke test: runs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..')
  require 'eventmachine'

  # TODO: real tests
  # TODO: pass in one object instead of two? .each{ |iter| puts iter.current; iter.next }
  # TODO: support iter.pause/resume/stop/break/continue?
  # TODO: create some exceptions instead of using RuntimeError
  # TODO: support proc instead of enumerable? EM::Iterator.new(proc{ return queue.pop })

  EM.run do
    # plain iteration, with and without surplus concurrency
    EM::Iterator.new(1..50).each { |num, iter| p num; iter.next }
    EM::Iterator.new([1, 2, 3], 10).each { |num, iter| p num; iter.next }

    # iteration with a completion callback whose concurrency is changed on the fly
    i = EM::Iterator.new(1..100, 5)
    print_item = proc do |num, iter|
      p num.to_s
      iter.next
    end
    print_done = proc { p :done }
    i.each(print_item, print_done)

    EM.add_timer(0.03) do
      i.concurrency = 1
    end
    EM.add_timer(0.04) do
      i.concurrency = 3
    end

    # map: collect delayed results
    delayed_return = proc do |num, iter|
      EM.add_timer(0.01) { iter.return(num) }
    end
    print_results = proc { |results| p results }
    EM::Iterator.new(100..150).map(delayed_return, print_results)

    # inject: build a hash of command outputs
    collect_output = proc do |hash, cmd, iter|
      EM.system(cmd) do |output, status|
        hash[cmd] = status.exitstatus == 0 ? output.strip : nil
        iter.return(hash)
      end
    end
    print_hash = proc { |results| p results }
    EM::Iterator.new(%w[ pwd uptime uname date ], 2).inject({}, collect_output, print_hash)
  end
end
|
data/lib/pioneer.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Eventmachine
|
2
|
+
require "em-synchrony"
|
3
|
+
require "em-synchrony/em-http"
|
4
|
+
require "em-synchrony/fiber_iterator"
|
5
|
+
# patch - to remove! maybe pull to em-synchrony?
|
6
|
+
require "patch/iterator"
|
7
|
+
require "patch/fiber_periodic_timer_iterator"
|
8
|
+
# other
|
9
|
+
require "logger"
|
10
|
+
require 'uri'
|
11
|
+
# Code
|
12
|
+
require "pioneer/version"
|
13
|
+
require "pioneer/base"
|
14
|
+
require "pioneer/request"
|
15
|
+
require "pioneer/http_header"
|
16
|
+
require "pioneer/crawler"
|
data/lib/pioneer/base.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Pioneer
  class UndefinedLocations < RuntimeError; end
  class LocationsNotEnumerable < RuntimeError; end
  class UndefinedProcessing < RuntimeError; end
  class LocationsNotEnumerator < RuntimeError; end
  class HttpRequestError < RuntimeError; end
  class HttpResponseError < RuntimeError; end

  # Base crawler. A usable crawler must provide a `locations` method
  # (returning something that responds to `each`) and a `processing` method;
  # both are checked in #initialize.
  class Base
    # Setter-style method names that #method_missing turns into per-instance
    # overrides: `locations=`, `processing=` and `if_xxx=`.
    OVERRIDE_SETTER = /locations.*=|processing.*=|if_.+=/

    attr_reader :name, :concurrency, :sleep, :log_level, :redirects
    # `redirect` is the reader name that was declared originally; it is kept
    # as an alias so existing callers keep working.
    alias redirect redirects

    # opts - Hash of options (all optional):
    #        :name          - crawler name, also used for the log file name ("crawler")
    #        :concurrency   - value passed to the iterator as concurrency (10)
    #        :sleep         - delay between request starts, 1/RPS (0)
    #        :log_enabled   - false disables logging to a file (true)
    #        :log_level     - Logger level (Logger::DEBUG)
    #        :random_header - use a random header from HttpHeader (false)
    #        :header        - explicit header, wins over :random_header (nil)
    #        :redirects     - value passed through as the :redirects http option (nil)
    #
    # Raises UndefinedLocations / UndefinedProcessing / LocationsNotEnumerator
    # when the crawler contract is not fulfilled.
    def initialize(opts = {})
      raise UndefinedLocations, "you should specify `locations` method in your `self.class`" unless respond_to?(:locations)
      raise UndefinedProcessing, "you should specify `processing` method in your `self.class`" unless respond_to?(:processing)
      raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless locations.respond_to?(:each)
      @name = opts[:name] || "crawler"
      @concurrency = opts[:concurrency] || 10
      @sleep = opts[:sleep] || 0 # sleep is reversed RPS (1/RPS) - frequency of requests.
      # Logger is enabled by default; `|| true` would make an explicit
      # `false` impossible, so only a missing/nil option falls back to true.
      log_enabled = opts[:log_enabled]
      @log_enabled = log_enabled.nil? ? true : log_enabled
      @log_level = opts[:log_level] || Logger::DEBUG
      @random_header = opts[:random_header] || false
      @header = opts[:header]
      @redirects = opts[:redirects]
    end

    # Run the crawler: perform a Request for every location inside an
    # EM.synchrony block and return the collected results as an Array.
    #
    # Raises LocationsNotEnumerable when `locations` does not respond to `each`.
    def start
      raise LocationsNotEnumerable, "location should respond to `each`" unless locations.respond_to? :each
      result = []
      EM.synchrony do
        # Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
        # In case @sleep is 0 it behaves like standard FiberIterator
        EM::Synchrony::FiberPeriodicTimerIterator.new(locations, concurrency, sleep).map do |url|
          result << Request.new(url, self).perform
        end
        EM.stop
      end
      result
    end

    # Lazily built Logger writing to "<name>.log", or to the null device when
    # logging is disabled.
    def logger
      @logger ||= begin
        logger = @log_enabled ? Logger.new("#{name}.log") : Logger.new(File::NULL)
        logger.level = log_level
        logger
      end
    end

    # Options hash for the http request, built from the configured header and
    # redirects settings. An explicit :header overrides the random one.
    def http_opts
      opts = {}
      opts[:head] = random_header if @random_header
      opts[:head] = @header if @header
      opts[:redirects] = @redirects if @redirects
      opts
    end

    def random_header
      HttpHeader.random
    end

    # we should override only our methods: locations, processing, if_XXX
    #
    # `crawler.locations = value` / `crawler.if_status_404 = proc{ |req| ... }`
    # define a singleton method of the same name without the "=".
    def method_missing(method_name, *args, &block)
      if OVERRIDE_SETTER =~ method_name.to_s
        override_method(method_name.to_s.delete("=").to_sym, args.first)
      else
        super
      end
    end

    # Keep respond_to? consistent with the setters handled by #method_missing.
    def respond_to_missing?(method_name, include_private = false)
      (OVERRIDE_SETTER =~ method_name.to_s) ? true : super
    end

    # Define `method_name` on this instance only. A Proc argument becomes the
    # method body (called with the request); any other value is returned as is.
    def override_method(method_name, arg)
      if arg.is_a?(Proc)
        define_singleton_method(method_name) { |req| arg.call(req) }
      else
        define_singleton_method(method_name) { arg }
      end
    end
  end
end
|
@@ -0,0 +1,273 @@
|
|
1
|
+
module Pioneer
|
2
|
+
module HttpHeader
|
3
|
+
extend self
|
4
|
+
|
5
|
+
# Build a set of request headers with a randomly picked User-Agent.
#
# Returns a Hash with the 'Referer', 'User-Agent', 'Accept' and 'Connection'
# entries; the User-Agent is sampled from the list returned by `headers`.
def random
  user_agent = headers.sample
  {
    'Referer' => 'http://www.google.com/',
    'User-Agent' => user_agent,
    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection' => 'keep-alive'
  }
end
|
14
|
+
|
15
|
+
# Get more on http://www.useragentstring.com/pages/useragentstring.php
|
16
|
+
def headers
|
17
|
+
[
|
18
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0a2) Gecko/20111101 Firefox/9.0a2',
|
19
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
|
20
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
|
21
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110612 Firefox/6.0a2',
|
22
|
+
'Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0',
|
23
|
+
'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
|
24
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0 FirePHP/0.6',
|
25
|
+
'Mozilla/5.0 (Windows NT 5.0; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
|
26
|
+
'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:5.0a2) Gecko/20110524 Firefox/5.0a2',
|
27
|
+
'Mozilla/5.0 (Windows NT 6.1; U; ru; rv:5.0.1.6) Gecko/20110501 Firefox/5.0.1 Firefox/5.0.1',
|
28
|
+
'mozilla/3.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/5.0.1',
|
29
|
+
'Mozilla/5.0 (X11; U; Linux i586; de; rv:5.0) Gecko/20100101 Firefox/5.0',
|
30
|
+
'Mozilla/5.0 (X11; U; Linux amd64; rv:5.0) Gecko/20100101 Firefox/5.0 (Debian)',
|
31
|
+
'Mozilla/5.0 (X11; U; Linux amd64; en-US; rv:5.0) Gecko/20110619 Firefox/5.0',
|
32
|
+
'Mozilla/5.0 (X11; Linux) Gecko Firefox/5.0',
|
33
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:5.0) Gecko/20100101 Firefox/5.0 FirePHP/0.5',
|
34
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:5.0) Gecko/20100101 Firefox/5.0 Firefox/5.0',
|
35
|
+
'Mozilla/5.0 (X11; Linux x86_64) Gecko Firefox/5.0',
|
36
|
+
'Mozilla/5.0 (X11; Linux ppc; rv:5.0) Gecko/20100101 Firefox/5.0',
|
37
|
+
'Mozilla/5.0 (X11; Linux AMD64) Gecko Firefox/5.0',
|
38
|
+
'Mozilla/5.0 (X11; FreeBSD amd64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
39
|
+
'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
40
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:5.0) Gecko/20110619 Firefox/5.0',
|
41
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
42
|
+
'Mozilla/5.0 (Windows NT 6.1.1; rv:5.0) Gecko/20100101 Firefox/5.0',
|
43
|
+
'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
44
|
+
'Mozilla/5.0 (Windows NT 5.1; U; rv:5.0) Gecko/20100101 Firefox/5.0',
|
45
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/5.0',
|
46
|
+
'Mozilla/5.0 (Windows NT 5.0; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
47
|
+
'Mozilla/5.0 (Windows NT 5.0; rv:5.0) Gecko/20100101 Firefox/5.0',
|
48
|
+
'Mozilla/5.0 (U; Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
|
49
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20110324 Firefox/4.2a1pre',
|
50
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20100101 Firefox/4.2a1pre',
|
51
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110324 Firefox/4.2a1pre',
|
52
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110323 Firefox/4.2a1pre',
|
53
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110208 Firefox/4.2a1pre',
|
54
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b9pre) Gecko/20110111 Firefox/4.0b9pre',
|
55
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b9pre) Gecko/20101228 Firefox/4.0b9pre',
|
56
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:2.0b9pre) Gecko/20110105 Firefox/4.0b9pre',
|
57
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b8pre) Gecko/20101114 Firefox/4.0b8pre',
|
58
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b8pre) Gecko/20101213 Firefox/4.0b8pre',
|
59
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b8pre) Gecko/20101128 Firefox/4.0b8pre',
|
60
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b8pre) Gecko/20101114 Firefox/4.0b8pre',
|
61
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:2.0b8pre) Gecko/20101127 Firefox/4.0b8pre',
|
62
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0',
|
63
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
64
|
+
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
65
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
66
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.6 (KHTML, like Gecko) Chrome/16.0.897.0 Safari/535.6',
|
67
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2',
|
68
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Ubuntu/11.10 Chromium/15.0.874.120 Chrome/15.0.874.120 Safari/535.2',
|
69
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
|
70
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2',
|
71
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2',
|
72
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2',
|
73
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.860.0 Safari/535.2',
|
74
|
+
'Chrome/15.0.860.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/15.0.860.0',
|
75
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/10.04 Chromium/14.0.813.0 Chrome/14.0.813.0 Safari/535.1',
|
76
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
77
|
+
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
78
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
79
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
80
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.814.0 Chrome/14.0.814.0 Safari/535.1',
|
81
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.814.0 Safari/535.1',
|
82
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
83
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.803.0 Chrome/14.0.803.0 Safari/535.1',
|
84
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
85
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
86
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
87
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
88
|
+
'Mozilla/5.0 Slackware/13.37 (X11; U; Linux x86_64; en-US) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41',
|
89
|
+
'Mozilla/5.0 ArchLinux (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
90
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/13.0.782.41 Chrome/13.0.782.41 Safari/535.1',
|
91
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
92
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
93
|
+
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
94
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
95
|
+
'Mozilla/5.0 (Windows NT 5.2; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
96
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
97
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
98
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_3) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
99
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
100
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
|
101
|
+
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
|
102
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
|
103
|
+
'Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00',
|
104
|
+
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
|
105
|
+
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52',
|
106
|
+
'Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51',
|
107
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51',
|
108
|
+
'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50',
|
109
|
+
'Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50',
|
110
|
+
'Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11',
|
111
|
+
'Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11',
|
112
|
+
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11',
|
113
|
+
'Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01',
|
114
|
+
'Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01',
|
115
|
+
'Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01',
|
116
|
+
'Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
|
117
|
+
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01',
|
118
|
+
'Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01',
|
119
|
+
'Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01',
|
120
|
+
'Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01',
|
121
|
+
'Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01',
|
122
|
+
'Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01',
|
123
|
+
'Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01',
|
124
|
+
'Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01',
|
125
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
|
126
|
+
'Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01',
|
127
|
+
'Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01',
|
128
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01',
|
129
|
+
'Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10',
|
130
|
+
'Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10',
|
131
|
+
'Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10',
|
132
|
+
'Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1',
|
133
|
+
'Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00',
|
134
|
+
'Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00',
|
135
|
+
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00',
|
136
|
+
'Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00',
|
137
|
+
'Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00',
|
138
|
+
'Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00',
|
139
|
+
'Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00',
|
140
|
+
'Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00',
|
141
|
+
'Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00',
|
142
|
+
'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00',
|
143
|
+
'Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00',
|
144
|
+
'Opera/9.80 (Windows NT 5.1; U; it) Presto/2.7.62 Version/11.00',
|
145
|
+
'Mozilla/5.0 (Windows NT 6.0; U; ja; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00',
|
146
|
+
'Mozilla/5.0 (Windows NT 5.1; U; pl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00',
|
147
|
+
'Mozilla/5.0 (Windows NT 5.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00',
|
148
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; X11; Linux x86_64; pl) Opera 11.00',
|
149
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; fr) Opera 11.00',
|
150
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; ja) Opera 11.00',
|
151
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; en) Opera 11.00',
|
152
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; pl) Opera 11.00',
|
153
|
+
'Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.6.31 Version/10.70',
|
154
|
+
'Mozilla/5.0 (Windows NT 5.2; U; ru; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.70',
|
155
|
+
'Mozilla/5.0 (Windows NT 5.1; U; zh-cn; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.70',
|
156
|
+
'Opera/9.80 (X11; Linux i686; U; en-GB) Presto/2.5.24 Version/10.53',
|
157
|
+
'Mozilla/5.0 (Windows NT 5.1; U; zh-cn; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
158
|
+
'Mozilla/5.0 (Windows NT 5.1; U; Firefox/5.0; en; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
159
|
+
'Mozilla/5.0 (Windows NT 5.1; U; Firefox/4.5; en; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
160
|
+
'Mozilla/5.0 (Windows NT 5.1; U; Firefox/3.5; en; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
161
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; ko) Opera 10.53',
|
162
|
+
'Opera/9.80 (X11; U; Linux i686; en-US; rv:1.9.2.3) Presto/2.2.15 Version/10.10',
|
163
|
+
'Opera/9.80 (X11; Linux x86_64; U; it) Presto/2.2.15 Version/10.10',
|
164
|
+
'Opera/9.80 (Windows NT 6.1; U; de) Presto/2.2.15 Version/10.10',
|
165
|
+
'Opera/9.80 (Windows NT 6.0; U; Gecko/20100115; pl) Presto/2.2.15 Version/10.10',
|
166
|
+
'Opera/9.80 (Windows NT 6.0; U; en) Presto/2.2.15 Version/10.10',
|
167
|
+
'Opera/9.80 (Windows NT 5.1; U; de) Presto/2.2.15 Version/10.10',
|
168
|
+
'Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.2.15 Version/10.10',
|
169
|
+
'Mozilla/5.0 (Windows NT 6.0; U; tr; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 10.10',
|
170
|
+
'Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; de) Opera 10.10',
|
171
|
+
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 6.0; tr) Opera 10.10',
|
172
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
|
173
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
|
174
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
|
175
|
+
'Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
|
176
|
+
'Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)',
|
177
|
+
'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))',
|
178
|
+
'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
|
179
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)',
|
180
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)',
|
181
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7',
|
182
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)',
|
183
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)',
|
184
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
|
185
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
|
186
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)',
|
187
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0',
|
188
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)',
|
189
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)',
|
190
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)',
|
191
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)',
|
192
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)',
|
193
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
|
194
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)',
|
195
|
+
'Mozilla/5.0 ( ; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
|
196
|
+
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; FDM; MSIECrawler; Media Center PC 5.0)',
|
197
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
|
198
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)',
|
199
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)',
|
200
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)',
|
201
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
202
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)',
|
203
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)',
|
204
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)',
|
205
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
|
206
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)',
|
207
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8',
|
208
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)',
|
209
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)',
|
210
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)',
|
211
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)',
|
212
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; msn OptimizedIE8;ZHCN)',
|
213
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; InfoPath.3; .NET4.0C; .NET4.0E) chromeframe/8.0.552.224',
|
214
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; Zune 4.7; InfoPath.3)',
|
215
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; Zune 4.7)',
|
216
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8)',
|
217
|
+
'Mozilla/4.0(compatible; MSIE 7.0b; Windows NT 6.0)',
|
218
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
|
219
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
|
220
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; Media Center PC 3.0; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1)',
|
221
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; FDM; .NET CLR 1.1.4322)',
|
222
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727)',
|
223
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1)',
|
224
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; Alexa Toolbar; .NET CLR 2.0.50727)',
|
225
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; Alexa Toolbar)',
|
226
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
227
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.40607)',
|
228
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322)',
|
229
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.0.3705; Media Center PC 3.1; Alexa Toolbar; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
230
|
+
'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
|
231
|
+
'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; el-GR)',
|
232
|
+
'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 5.2)',
|
233
|
+
'Mozilla/5.0 (MSIE 7.0; Macintosh; U; SunOS; X11; gu; SV1; InfoPath.2; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
|
234
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; WOW64; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; c .NET CLR 3.0.04506; .NET CLR 3.5.30707; InfoPath.1; el-GR)',
|
235
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; c .NET CLR 3.0.04506; .NET CLR 3.5.30707; InfoPath.1; el-GR)',
|
236
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; fr-FR)',
|
237
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; en-US)',
|
238
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.2; WOW64; .NET CLR 2.0.50727)',
|
239
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows 98; SpamBlockerUtility 6.3.91; SpamBlockerUtility 6.2.91; .NET CLR 4.1.89;GB)',
|
240
|
+
'Mozilla/4.79 [en] (compatible; MSIE 7.0; Windows NT 5.0; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
|
241
|
+
'Mozilla/4.0 (Windows; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
242
|
+
'Mozilla/4.0 (Mozilla/4.0; MSIE 7.0; Windows NT 5.1; FDM; SV1; .NET CLR 3.0.04506.30)',
|
243
|
+
'Mozilla/4.0 (Mozilla/4.0; MSIE 7.0; Windows NT 5.1; FDM; SV1)',
|
244
|
+
'Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)',
|
245
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)',
|
246
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; InfoPath.3)',
|
247
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; chromeframe/12.0.742.100)',
|
248
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)',
|
249
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
|
250
|
+
'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
251
|
+
'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
252
|
+
'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)',
|
253
|
+
'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
254
|
+
'Mozilla/45.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
255
|
+
'Mozilla/4.08 (compatible; MSIE 6.0; Windows NT 5.1)',
|
256
|
+
'Mozilla/4.01 (compatible; MSIE 6.0; Windows NT 5.1)',
|
257
|
+
'Mozilla/4.0 (X11; MSIE 6.0; i686; .NET CLR 1.1.4322; .NET CLR 2.0.50727; FDM)',
|
258
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 6.0)',
|
259
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.2)',
|
260
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.0)',
|
261
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
262
|
+
'Mozilla/4.0 (MSIE 6.0; Windows NT 5.1)',
|
263
|
+
'Mozilla/4.0 (MSIE 6.0; Windows NT 5.0)',
|
264
|
+
'Mozilla/4.0 (compatible;MSIE 6.0;Windows 98;Q312461)',
|
265
|
+
'Mozilla/4.0 (Compatible; Windows NT 5.1; MSIE 6.0) (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
266
|
+
'Mozilla/4.0 (compatible; U; MSIE 6.0; Windows NT 5.1) (Compatible; ; ; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
|
267
|
+
'Mozilla/4.0 (compatible; U; MSIE 6.0; Windows NT 5.1)',
|
268
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3; Tablet PC 2.0)',
|
269
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB6.5; QQDownload 534; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729)'
|
270
|
+
]
|
271
|
+
end
|
272
|
+
end
|
273
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Pioneer
  # A single HTTP GET issued on behalf of a crawler (the "pioneer").
  #
  # The crawler must provide +logger+, +http_opts+ and +processing(request)+.
  # It may also respond to any of the optional callbacks below; each one
  # receives this request and its return value becomes the request result:
  #
  # * +if_request_error+   - the HTTP call itself raised
  # * +if_response_error+  - the response object reports an error
  # * +if_status_<code>+   - a specific non-200 status (e.g. +if_status_404+)
  # * +if_status_not_200+  - any non-200 status without a dedicated callback
  class Request
    # Matches a leading URI scheme such as "http://" or "https://".
    URL_SCHEME = %r{\A[a-z][a-z0-9+.\-]*://}i

    attr_reader :pioneer, :url, :result, :response, :error

    # url     - String address; "http://" is prepended when it has no scheme.
    # pioneer - the crawler that owns this request.
    def initialize(url, pioneer)
      @pioneer = pioneer
      @url = normalize_url(url)
    end

    # Logs the target URL, runs the request and stores the outcome in +result+.
    def perform
      pioneer.logger.info("going to #{url}")
      @result = handle_request_error_or_return_result
    end

    # Handle base fatal request error.
    #
    # Only the HTTP call is guarded by the rescue; errors raised by the later
    # stages propagate untouched.
    #
    # Raises HttpRequestError when the call fails and the crawler does not
    # define +if_request_error+.
    def handle_request_error_or_return_result
      begin
        @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
      rescue => e
        @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
        pioneer.logger.fatal(error)
        return pioneer.send(:if_request_error, self) if pioneer.respond_to?(:if_request_error)

        raise HttpRequestError, error
      end
      handle_response_error_or_return_result
    end

    # Handle http error reported by the response object.
    #
    # The message is stored in @error so that callbacks can read it through
    # +error+, exactly like the request-error path does.
    #
    # Raises HttpResponseError when the response has an error and the crawler
    # does not define +if_response_error+.
    def handle_response_error_or_return_result
      return handle_status_or_return_result unless response.error

      @error = "Response for #{url} get an error: #{response.error}"
      pioneer.logger.error(error)
      return pioneer.send(:if_response_error, self) if pioneer.respond_to?(:if_response_error)

      raise HttpResponseError, error
    end

    # Dispatches on the HTTP status: 200 goes to +processing+, anything else
    # is logged and routed to the most specific status callback available.
    # Returns nil when no callback handles a non-200 status.
    def handle_status_or_return_result
      status = response.response_header.status
      return pioneer.processing(self) if status == 200

      pioneer.logger.error("This #{url} returns this http status: #{status}")
      callback = :"if_status_#{status}"
      if pioneer.respond_to?(callback)
        pioneer.send(callback, self)
      elsif pioneer.respond_to?(:if_status_not_200)
        pioneer.send(:if_status_not_200, self)
      end
    end

    private

    # Adds a default "http://" scheme when none is present and percent-escapes
    # characters that are not allowed in a URI.
    def normalize_url(raw)
      with_scheme = (raw =~ URL_SCHEME) ? raw : "http://" + raw
      # URI.escape was removed in Ruby 3.0; the RFC 2396 parser offers the
      # same escaping on both old and new rubies.
      parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
      parser.escape(with_scheme)
    end
  end
end
|
data/pioneer.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "pioneer/version"
|
4
|
+
|
5
|
+
# Gem specification for pioneer. File lists are taken from the git index, so
# the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name    = "pioneer"
  spec.version = Pioneer::VERSION

  # Authorship
  spec.authors  = ["Petr"]
  spec.email    = ["pedro.yanoviches@gmail.com"]
  spec.homepage = ""

  # Description
  spec.summary     = "HTTP crawler"
  spec.description = "Simple async HTTP crawler based on em-synchrony"

  spec.rubyforge_project = "pioneer"

  # Packaged files
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_development_dependency "yajl-ruby"
  spec.add_runtime_dependency "em-synchrony"
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'yajl'
|
4
|
+
#
|
5
|
+
# TODO:
|
6
|
+
# Rewrite real live examples with StubServer
|
7
|
+
#
|
8
|
+
|
9
|
+
# Integration specs for Pioneer::Request. These examples hit live web sites,
# so they need network access.
describe Pioneer::Request do
  before do
    @pioneer1 = CustomCrawler1.new(name: "Custom crawler 1")
    @pioneer2 = Pioneer::Crawler.new(name: "Base crawler 2")
    @pioneer3 = Pioneer::Crawler.new(name: "Base crawler 3")
  end

  it "should return two 200 response statuses" do
    statuses = @pioneer1.start
    statuses.must_equal [200, 200]
  end

  it "should redefine methods" do
    # First run: a custom processing callback on two reachable hosts.
    @pioneer2.processing = proc { |req| req.response.response_header.status + 1 }
    @pioneer2.locations = ["www.apple.com", "www.amazon.com"]
    @pioneer2.start.must_equal [201, 201]

    # Second run: an unreachable host handled by a custom error callback.
    @pioneer2.locations = ["www.ru.erro"]
    @pioneer2.if_response_error = proc { |req| "fail" }
    @pioneer2.start.must_equal ["fail"]
  end

  it "should execute if_status_xxx" do
    on_redirect = proc { |req| "redirected" }
    on_missing = proc { |req| "notfound" }
    @pioneer2.locations = ["google.com/redirectmeplease", "http://www.amazon.com/notfoundpage"]
    @pioneer2.if_status_301 = on_redirect
    @pioneer2.if_status_404 = on_missing
    @pioneer2.start.must_equal ["redirected", "notfound"]
  end

  it "should execute if_status_not_200 if another colback is not defined" do
    fallback = proc { "something goes wrong" }
    on_redirect = proc { |req| "redirected" }
    @pioneer3.locations = ["google.com/redirectmeplease", "http://www.amazon.com/notfoundpage"]
    @pioneer3.if_status_301 = on_redirect
    @pioneer3.if_status_not_200 = fallback
    @pioneer3.start.must_equal ["redirected", "something goes wrong"]
  end

  # Last.fm API test
  it "should return similar artists for a number of them" do
    lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
    expected_artists = LastfmEnum.const_get(:ARTISTS).sort
    lastfm_pioneer.start.sort.must_equal expected_artists
  end

  it "should use headers" do
    plain = KinopoiskCrawler.new(random_header: false)
    following = KinopoiskCrawler.new(random_header: false, redirects: 1)
    disguised = KinopoiskCrawler.new(random_header: true)

    # This one will be redirected.
    plain.start.must_equal [nil]

    # This one follows the redirect but gets a restricted page (it needs real headers).
    restricted_size = following.start.first
    (restricted_size < 10000).must_equal true

    # And this one gets the full page.
    full_size = disguised.start.first
    (full_size > 10000).must_equal true
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'pioneer'
|
2
|
+
require 'minitest/spec'
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# saving two pages
|
7
|
+
# Crawler fixture that requests the same page twice and reports each
# response's HTTP status.
class CustomCrawler1 < Pioneer::Base
  # Returns the list of URLs to visit.
  def locations
    Array.new(2) { "http://www.ru" }
  end

  # Returns the HTTP status of the finished request.
  def processing(req)
    header = req.response.response_header
    header.status
  end
end
|
16
|
+
|
17
|
+
# LastFM test
|
18
|
+
# Enumerable of Last.fm "artist.getsimilar" request URLs, one per entry in
# ARTISTS.
class LastfmEnum
  include Enumerable

  ARTISTS = ["Cher", "Madonna", "Rolling Stones", "The Beatles", "Muse"].freeze

  # Yields one request URL per artist, in ARTISTS order.
  # Returns an Enumerator when called without a block, as Enumerable
  # implementations conventionally do.
  def each
    return enum_for(:each) unless block_given?

    ARTISTS.each do |artist|
      # The artist name is interpolated as-is (spaces included); any URL
      # escaping is left to whoever consumes the URL.
      yield "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
    end
  end
end
|
30
|
+
|
31
|
+
# Crawler fixture that walks the Last.fm URLs and extracts, from each JSON
# reply, the artist name recorded in the reply's attributes.
class LastfmCrawler < Pioneer::Base
  # Returns the enumerable of Last.fm request URLs.
  def locations
    LastfmEnum.new
  end

  # Parses the response body as JSON and returns the "artist" attribute of
  # the "similarartists" section.
  def processing(req)
    payload = Yajl::Parser.parse(req.response.response)
    attributes = payload["similarartists"]["@attr"]
    attributes["artist"]
  end
end
|
41
|
+
|
42
|
+
# Kinopoisk
|
43
|
+
# Crawler fixture that fetches a single Kinopoisk film page and reports the
# size of the response body.
class KinopoiskCrawler < Pioneer::Base
  # Returns the single URL to visit.
  def locations
    ["http://www.kinopoisk.ru/level/1/film/614667/"]
  end

  # Returns the size of the response body.
  def processing(req)
    body = req.response.response
    body.size
  end
end
|
metadata
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pioneer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1.alpha
|
5
|
+
prerelease: 6
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Petr
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-22 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: yajl-ruby
|
16
|
+
requirement: &74894120 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *74894120
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: em-synchrony
|
27
|
+
requirement: &74893910 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *74893910
|
36
|
+
description: Simple async HTTP crawler based on em-synchrony
|
37
|
+
email:
|
38
|
+
- pedro.yanoviches@gmail.com
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- .travis.yml
|
45
|
+
- CHANGELOG
|
46
|
+
- Gemfile
|
47
|
+
- LICENSE
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- lib/patch/fiber_periodic_timer_iterator.rb
|
51
|
+
- lib/patch/iterator.rb
|
52
|
+
- lib/pioneer.rb
|
53
|
+
- lib/pioneer/base.rb
|
54
|
+
- lib/pioneer/crawler.rb
|
55
|
+
- lib/pioneer/http_header.rb
|
56
|
+
- lib/pioneer/request.rb
|
57
|
+
- lib/pioneer/version.rb
|
58
|
+
- pioneer.gemspec
|
59
|
+
- spec/pioneer/base_spec.rb
|
60
|
+
- spec/pioneer/request_spec.rb
|
61
|
+
- spec/spec_helper.rb
|
62
|
+
- tmp/just_for_test/railscasts.txt
|
63
|
+
homepage: ''
|
64
|
+
licenses: []
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ! '>='
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>'
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 1.3.1
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project: pioneer
|
83
|
+
rubygems_version: 1.8.15
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: HTTP crawler
|
87
|
+
test_files: []
|
88
|
+
has_rdoc:
|