pioneer 0.0.1.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/.travis.yml +2 -0
- data/CHANGELOG +11 -0
- data/Gemfile +6 -0
- data/LICENSE +1 -0
- data/README.md +83 -0
- data/Rakefile +11 -0
- data/lib/patch/fiber_periodic_timer_iterator.rb +39 -0
- data/lib/patch/iterator.rb +314 -0
- data/lib/pioneer.rb +16 -0
- data/lib/pioneer/base.rb +83 -0
- data/lib/pioneer/crawler.rb +12 -0
- data/lib/pioneer/http_header.rb +273 -0
- data/lib/pioneer/request.rb +65 -0
- data/lib/pioneer/version.rb +3 -0
- data/pioneer.gemspec +24 -0
- data/spec/pioneer/base_spec.rb +5 -0
- data/spec/pioneer/request_spec.rb +66 -0
- data/spec/spec_helper.rb +51 -0
- metadata +88 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CHANGELOG
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
This is the Licence. Isn't it?
|
data/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# Pioneer
|
2
|
+
|
3
|
+
Pioneer is a simple async HTTP crawler based on em-synchrony
|
4
|
+
|
5
|
+
And it is very alpha right now.
|
6
|
+
|
7
|
+
# Install
|
8
|
+
|
9
|
+
```bash
|
10
|
+
gem install pioneer
|
11
|
+
```
|
12
|
+
|
13
|
+
# Usage
|
14
|
+
|
15
|
+
To use `Pioneer` you should specify a class with two methods: `locations` and `processing(req)`.
|
16
|
+
|
17
|
+
The first should return an enumerable object, and the second accepts a request object.
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class Crawler < Pioneer::Base
|
21
|
+
def locations
|
22
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
23
|
+
end
|
24
|
+
|
25
|
+
def processing(req)
|
26
|
+
File.open(req.url, "w+") do |f|
|
27
|
+
f << req.response.response
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
Crawler.new.start
|
33
|
+
```
|
34
|
+
|
35
|
+
In this example we are saving two files with html of those two sites.
|
36
|
+
|
37
|
+
The `start` method iterates over the urls and returns an Array of the values returned by the `processing` method.
|
38
|
+
|
39
|
+
# Handling request, response errors and statuses
|
40
|
+
|
41
|
+
In case of a request or response error `Pioneer` will raise an error. Alternatively, you can handle such errors yourself this way:
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
class Crawler < Pioneer::Base
|
45
|
+
def locations
|
46
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
47
|
+
end
|
48
|
+
|
49
|
+
def processing(req)
|
50
|
+
File.open(req.url, "w+") do |f|
|
51
|
+
f << req.response.response
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def if_request_error(req)
|
56
|
+
puts "Request error: #{req.error}"
|
57
|
+
end
|
58
|
+
|
59
|
+
def if_response_error(req)
|
60
|
+
puts "Response error: #{req.response.error}"
|
61
|
+
end
|
62
|
+
|
63
|
+
def if_status_203(req)
|
64
|
+
puts "He is trying to redirect me"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
```
|
68
|
+
|
69
|
+
You can also define `if_status_not_200` to handle every status other than 200, or `if_status_XXX` for any specific status you want.
|
70
|
+
|
71
|
+
# Overriding behavior
|
72
|
+
|
73
|
+
You can override all methods on the fly:
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
crawler = Pioneer::Crawler.new # base simple crawler
|
77
|
+
crawler.locations = [url1, url2]
|
78
|
+
crawler.processing = proc{ |req| req.response.response_header.status }
|
79
|
+
crawler.if_status_404 = proc{ |req| "Oups" }
|
80
|
+
```
|
81
|
+
|
82
|
+
|
83
|
+
... to be continued
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
module EventMachine
  module Synchrony

    # Iterator that runs every iteration block inside its own Fiber and
    # throttles how often iterations may start: each fiber calls #sleep before
    # invoking the user block, which spaces iteration starts at least
    # +timeout+ seconds apart. With a timeout of 0 no throttling is applied.
    class FiberPeriodicTimerIterator < EM::Synchrony::Iterator

      # Set the timeout and the starting point of the schedule.
      #
      # @param list        collection to iterate over (forwarded to the parent class)
      # @param concurrency [Integer] concurrency level (forwarded to the parent class)
      # @param timeout     [Numeric] minimum number of seconds between two iteration starts
      def initialize(list, concurrency=1, timeout=0)
        @timeout = timeout
        @next_start = Time.now
        super list, concurrency
      end

      # Execute each iterator block within its own fiber at a particular time
      # offset and auto-advance the iterator after each call.
      #
      # @param foreach [Proc, nil] callable invoked with each item (takes precedence over the block)
      # @param after   [Proc, nil] forwarded unchanged to the parent #each
      def each(foreach=nil, after=nil, &blk)
        fe = Proc.new do |obj, iter|
          Fiber.new do
            sleep # throttle: wait for this fiber's reserved start slot
            (foreach || blk).call(obj); iter.next
          end.resume
        end
        super(fe, after)
      end

      # Sleep if the last request was started recently (less than the timeout
      # period ago), and reserve the next start slot for the following caller.
      #
      # NOTE: this zero-argument method shadows Kernel#sleep for instances of
      # this class.
      def sleep
        return unless @timeout > 0

        # Take a single timestamp so the computed delay and the reserved slot
        # are derived from the same instant.
        now = Time.now
        sleep_time = @next_start - now
        sleep_time = 0 if sleep_time < 0
        @next_start = now + sleep_time + @timeout
        EM::Synchrony.sleep(sleep_time) if sleep_time > 0
      end

    end
  end
end
|
@@ -0,0 +1,314 @@
|
|
1
|
+
module EventMachine
|
2
|
+
# A simple iterator for concurrent asynchronous work.
|
3
|
+
#
|
4
|
+
# Unlike ruby's built-in iterators, the end of the current iteration cycle is signaled manually,
|
5
|
+
# instead of happening automatically after the yielded block finishes executing. For example:
|
6
|
+
#
|
7
|
+
# (0..10).each{ |num| }
|
8
|
+
#
|
9
|
+
# becomes:
|
10
|
+
#
|
11
|
+
# EM::Iterator.new(0..10).each{ |num,iter| iter.next }
|
12
|
+
#
|
13
|
+
# This is especially useful when doing asynchronous work via reactor libraries and
|
14
|
+
# functions. For example, given a sync and async http api:
|
15
|
+
#
|
16
|
+
# response = sync_http_get(url); ...
|
17
|
+
# async_http_get(url){ |response| ... }
|
18
|
+
#
|
19
|
+
# a synchronous iterator such as:
|
20
|
+
#
|
21
|
+
# responses = urls.map{ |url| sync_http_get(url) }
|
22
|
+
# ...
|
23
|
+
# puts 'all done!'
|
24
|
+
#
|
25
|
+
# could be written as:
|
26
|
+
#
|
27
|
+
# EM::Iterator.new(urls).map(proc{ |url,iter|
|
28
|
+
# async_http_get(url){ |res|
|
29
|
+
# iter.return(res)
|
30
|
+
# }
|
31
|
+
# }, proc{ |responses|
|
32
|
+
# ...
|
33
|
+
# puts 'all done!'
|
34
|
+
# })
|
35
|
+
#
|
36
|
+
# Now, you can take advantage of the asynchronous api to issue requests in parallel. For example,
|
37
|
+
# to fetch 10 urls at a time, simply pass in a concurrency of 10:
|
38
|
+
#
|
39
|
+
# EM::Iterator.new(urls, 10).each do |url,iter|
|
40
|
+
# async_http_get(url){ iter.next }
|
41
|
+
# end
|
42
|
+
#
|
43
|
+
|
44
|
+
# Support for Enumerable in Ruby 1.9+.
#
# List strategy backed by an Enumerator. The including class is expected to
# store the result of #setup_list in @list.
module IteratorWithEnumerable
  # Validate the list and turn it into an Enumerator.
  #
  # @param list [#each] the collection to iterate over
  # @return [Enumerator]
  # @raise [ArgumentError] when +list+ does not respond to #each
  def setup_list(list)
    raise ArgumentError, 'argument must be an Enumerable' unless list.respond_to?(:each)
    list.to_enum
  end

  # @return [Object] the item fetched by the most recent successful #next? call
  def next_item
    @next_item
  end

  # Advance the enumerator and remember the fetched item for #next_item.
  #
  # We can't just check next_item, because it can be nil in two cases:
  # when the enumerator is exhausted and when it actually stores a nil value.
  # Errors other than StopIteration propagate to the caller untouched.
  #
  # @return [Boolean] true when an item was fetched, false when the list is exhausted
  def next?
    @next_item = @list.next
    true
  rescue StopIteration
    false
  end
end
|
68
|
+
|
69
|
+
# Ruby 1.8 uses continuations in Enumerable, so we should use Arrays.
#
# List strategy backed by a private Array copy that is consumed with #shift.
# The including class is expected to store the result of #setup_list in @list.
module IteratorWithArray
  # Validate the list and copy it into an Array the iterator may consume.
  #
  # @param list [#to_a] the collection to iterate over
  # @return [Array] a copy, so shifting items never mutates the caller's list
  # @raise [ArgumentError] when +list+ does not respond to #to_a
  def setup_list(list)
    raise ArgumentError, 'argument must be an array' unless list.respond_to?(:to_a)
    list.dup.to_a
  end

  # Remove and return the next item.
  def next_item
    @list.shift
  end

  # @return [Boolean] true while items remain
  def next?
    # Array#any? without a block tests truthiness of the elements, so it would
    # report "no more items" for a remainder made only of nil/false values.
    !@list.empty?
  end
end
|
84
|
+
|
85
|
+
class Iterator
  # Choose the list strategy once: Enumerator-backed where Fiber is defined,
  # Array-backed otherwise.
  if defined? Fiber
    include IteratorWithEnumerable
  else
    include IteratorWithArray
  end

  # Create a new parallel async iterator with specified concurrency.
  #
  #   i = EM::Iterator.new(1..100, 10)
  #
  # will create an iterator over the range that processes 10 items at a time. Iteration
  # is started via #each, #map or #inject
  #
  def initialize(list, concurrency = 1)
    @list = setup_list(list)
    @concurrency = concurrency

    @started = false
    @ended = false
  end

  # Change the concurrency of this iterator. Workers will automatically be spawned or destroyed
  # to accommodate the new concurrency level.
  #
  def concurrency=(val)
    previous = @concurrency
    @concurrency = val

    # Extra workers are only needed when the level grew during a running iteration;
    # surplus workers retire themselves inside @process_next.
    spawn_workers if val > previous && @started && !@ended
  end
  attr_reader :concurrency

  # Iterate over a set of items using the specified block or proc.
  #
  #   EM::Iterator.new(1..100).each do |num, iter|
  #     puts num
  #     iter.next
  #   end
  #
  # An optional second proc is invoked after the iteration is complete.
  #
  #   EM::Iterator.new(1..100).each(
  #     proc{ |num,iter| iter.next },
  #     proc{ puts 'all done' }
  #   )
  #
  def each(foreach=nil, after=nil, &blk)
    foreach ||= blk
    raise ArgumentError, 'proc or block required for iteration' unless foreach
    raise RuntimeError, 'cannot iterate over an iterator more than once' if @started || @ended

    @started = true
    @pending = 0
    @workers = 0

    # Fires the completion callback once the list is exhausted and no item is in flight.
    all_done = proc {
      after.call if after && @ended && @pending == 0
    }

    @process_next = proc {
      if @ended || @workers > @concurrency
        # Nothing left to do, or concurrency was lowered: this worker retires.
        @workers -= 1
      elsif next?
        item = next_item
        @pending += 1

        is_done = false
        on_done = proc {
          raise RuntimeError, 'already completed this iteration' if is_done
          is_done = true

          @pending -= 1

          if @ended
            all_done.call
          else
            EM.next_tick(@process_next)
          end
        }
        # Expose the completion proc to user code as +iter.next+.
        class << on_done
          alias :next :call
        end

        foreach.call(item, on_done)
      else
        @ended = true
        @workers -= 1
        all_done.call
      end
    }

    spawn_workers

    self
  end

  # Collect the results of an asynchronous iteration into an array.
  #
  #   EM::Iterator.new(%w[ pwd uptime uname date ], 2).map(proc{ |cmd,iter|
  #     EM.system(cmd){ |output,status|
  #       iter.return(output)
  #     }
  #   }, proc{ |results|
  #     p results
  #   })
  #
  def map(foreach, after)
    index = 0

    collect = proc { |results, item, iter|
      # Remember this item's position so results keep the input order
      # regardless of the order in which iterations complete.
      slot = index
      index += 1

      is_done = false
      on_done = proc { |res|
        raise RuntimeError, 'already returned a value for this iteration' if is_done
        is_done = true

        results[slot] = res
        iter.return(results)
      }
      class << on_done
        alias :return :call
        def next
          raise NoMethodError, 'must call #return on a map iterator'
        end
      end

      foreach.call(item, on_done)
    }
    finish = proc { |results|
      after.call(results)
    }

    inject([], collect, finish)
  end

  # Inject the results of an asynchronous iteration onto a given object.
  #
  #   EM::Iterator.new(%w[ pwd uptime uname date ], 2).inject({}, proc{ |hash,cmd,iter|
  #     EM.system(cmd){ |output,status|
  #       hash[cmd] = status.exitstatus == 0 ? output.strip : nil
  #       iter.return(hash)
  #     }
  #   }, proc{ |results|
  #     p results
  #   })
  #
  def inject(obj, foreach, after)
    step = proc { |item, iter|
      is_done = false
      on_done = proc { |res|
        raise RuntimeError, 'already returned a value for this iteration' if is_done
        is_done = true

        # The returned value becomes the accumulator handed to the next item.
        obj = res
        iter.next
      }
      class << on_done
        alias :return :call
        def next
          raise NoMethodError, 'must call #return on an inject iterator'
        end
      end

      foreach.call(obj, item, on_done)
    }
    finish = proc {
      after.call(obj)
    }

    each(step, finish)
  end

  private

  # Spawn workers to consume items from the iterator's enumerator based on the current concurrency level.
  #
  def spawn_workers
    # One worker is started per reactor tick until the concurrency level is reached.
    start_worker = proc {
      if @workers < @concurrency && !@ended
        @workers += 1
        @process_next.call
        EM.next_tick(start_worker)
      end
    }
    EM.next_tick(start_worker)
    nil
  end
end
|
269
|
+
end
|
270
|
+
|
271
|
+
# Manual smoke run, executed only when this file is run directly.
# No EM.stop is called here, so the reactor keeps running until interrupted.
if __FILE__ == $PROGRAM_NAME
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..')
  require 'eventmachine'

  # TODO: real tests
  # TODO: pass in one object instead of two? .each{ |iter| puts iter.current; iter.next }
  # TODO: support iter.pause/resume/stop/break/continue?
  # TODO: create some exceptions instead of using RuntimeError
  # TODO: support proc instead of enumerable? EM::Iterator.new(proc{ return queue.pop })

  EM.run do
    EM::Iterator.new(1..50).each do |num, iter|
      p num
      iter.next
    end
    EM::Iterator.new([1,2,3], 10).each do |num, iter|
      p num
      iter.next
    end

    # Change the concurrency while the iteration is in flight.
    i = EM::Iterator.new(1..100, 5)
    print_item = proc { |num, iter|
      p num.to_s
      iter.next
    }
    print_done = proc { p :done }
    i.each(print_item, print_done)
    EM.add_timer(0.03) { i.concurrency = 1 }
    EM.add_timer(0.04) { i.concurrency = 3 }

    delayed_return = proc { |num, iter|
      EM.add_timer(0.01) { iter.return(num) }
    }
    EM::Iterator.new(100..150).map(delayed_return, proc { |results| p results })

    run_command = proc { |hash, cmd, iter|
      EM.system(cmd) do |output, status|
        hash[cmd] = status.exitstatus == 0 ? output.strip : nil
        iter.return(hash)
      end
    }
    EM::Iterator.new(%w[ pwd uptime uname date ], 2).inject({}, run_command, proc { |results| p results })
  end
end
|
data/lib/pioneer.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Eventmachine
|
2
|
+
require "em-synchrony"
|
3
|
+
require "em-synchrony/em-http"
|
4
|
+
require "em-synchrony/fiber_iterator"
|
5
|
+
# patch - to remove! maybe pull to em-synchrony?
|
6
|
+
require "patch/iterator"
|
7
|
+
require "patch/fiber_periodic_timer_iterator"
|
8
|
+
# other
|
9
|
+
require "logger"
|
10
|
+
require 'uri'
|
11
|
+
# Code
|
12
|
+
require "pioneer/version"
|
13
|
+
require "pioneer/base"
|
14
|
+
require "pioneer/request"
|
15
|
+
require "pioneer/http_header"
|
16
|
+
require "pioneer/crawler"
|
data/lib/pioneer/base.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Pioneer
  class UndefinedLocations < RuntimeError; end
  class LocationsNotEnumerable < RuntimeError; end
  class UndefinedProcessing < RuntimeError; end
  class LocationsNotEnumerator < RuntimeError; end
  class HttpRequestError < RuntimeError; end
  class HttpResponseError < RuntimeError; end

  # Base crawler. Subclasses (or instances, via the `name = value` overrides
  # handled in #method_missing) must provide `locations`, returning an object
  # that responds to `each`, and `processing(req)`.
  class Base
    # Setter-style method names that #method_missing turns into overrides.
    OVERRIDE_SETTER = /locations.*=|processing.*=|if_.+=/

    attr_reader :name, :concurrency, :sleep, :log_level, :redirects
    # `redirect` used to read an instance variable that was never assigned;
    # keep the name for existing callers but return the configured value.
    alias_method :redirect, :redirects

    # @param opts [Hash] options
    #   :name          log file base name (default "crawler")
    #   :concurrency   iterator concurrency (default 10)
    #   :sleep         reversed RPS (1/RPS) - frequency of requests (default 0)
    #   :log_enabled   pass false to disable logging (default true)
    #   :log_level     Logger severity (default Logger::DEBUG)
    #   :random_header use a random header set from HttpHeader (default false)
    #   :header        explicit header, takes precedence over :random_header
    #   :redirects     copied into the :redirects http option when set
    # @raise [UndefinedLocations]     when `locations` is not defined
    # @raise [UndefinedProcessing]    when `processing` is not defined
    # @raise [LocationsNotEnumerator] when `locations` does not respond to `each`
    def initialize(opts = {})
      raise UndefinedLocations, "you should specify `locations` method in your `#{self.class}`" unless respond_to?(:locations)
      raise UndefinedProcessing, "you should specify `processing` method in your `#{self.class}`" unless respond_to?(:processing)
      raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless locations.respond_to?(:each)
      @name = opts[:name] || "crawler"
      @concurrency = opts[:concurrency] || 10
      @sleep = opts[:sleep] || 0 # sleep is reversed RPS (1/RPS) - frequency of requests.
      # `opts[:log_enabled] || true` could never be false; only an explicit
      # false disables the logger, everything else keeps it enabled.
      @log_enabled = opts[:log_enabled] != false
      @log_level = opts[:log_level] || Logger::DEBUG
      @random_header = opts[:random_header] || false
      @header = opts[:header]
      @redirects = opts[:redirects]
    end

    # Run the crawl inside an EM.synchrony reactor.
    #
    # @return [Array] the value of Request#perform for every location
    # @raise [LocationsNotEnumerable] when `locations` does not respond to `each`
    def start
      raise LocationsNotEnumerable, "location should respond to `each`" unless locations.respond_to? :each
      result = []
      EM.synchrony do
        # Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
        # In case @sleep is 0 it behaves like standard FiberIterator
        EM::Synchrony::FiberPeriodicTimerIterator.new(locations, concurrency, sleep).map do |url|
          result << Request.new(url, self).perform
        end
        EM.stop
      end
      result
    end

    # Lazily built logger writing to "<name>.log", or to /dev/null when
    # logging is disabled.
    def logger
      @logger ||= begin
        logger = @log_enabled ? Logger.new("#{name}.log") : Logger.new("/dev/null")
        logger.level = log_level
        logger
      end
    end

    # Options hash built from the header/redirect settings.
    # An explicit :header wins over a random one.
    def http_opts
      opts = {}
      opts[:head] = random_header if @random_header
      opts[:head] = @header if @header
      opts[:redirects] = @redirects if @redirects
      opts
    end

    def random_header
      HttpHeader.random
    end

    # we should override only our methods: locations, processing, if_XXX
    #
    # `crawler.locations = value` / `crawler.processing = proc` /
    # `crawler.if_xxx = proc` define a singleton method named without the "=".
    def method_missing(method_name, *args, &block)
      case method_name
      when OVERRIDE_SETTER
        override_method(method_name.to_s.delete("=").to_sym, args.first)
      else
        super
      end
    end

    # Keep respond_to? in sync with the setters accepted by #method_missing.
    def respond_to_missing?(method_name, include_private = false)
      !(method_name.to_s =~ OVERRIDE_SETTER).nil? || super
    end

    # Define a singleton method +method_name+ on this instance.
    # A Proc becomes a one-argument method delegating to it; any other value
    # becomes a zero-argument method returning that value.
    def override_method(method_name, arg)
      if Proc === arg
        self.define_singleton_method method_name do |req|
          arg.call(req)
        end
      else
        self.define_singleton_method method_name do
          arg
        end
      end
    end
  end
end
|
@@ -0,0 +1,273 @@
|
|
1
|
+
module Pioneer
|
2
|
+
module HttpHeader
|
3
|
+
extend self
|
4
|
+
|
5
|
+
# Build a set of request headers with a User-Agent picked at random from
# the list returned by #headers.
#
# @return [Hash{String => String}] 'Referer', 'User-Agent', 'Accept' and
#   'Connection' entries
def random
  # Return the hash directly: the former local named `headers` shadowed the
  # #headers method and was never read.
  {
    'Referer' => 'http://www.google.com/',
    'User-Agent' => headers.sample,
    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection' => 'keep-alive'
  }
end
|
14
|
+
|
15
|
+
# Get more on http://www.useragentstring.com/pages/useragentstring.php
|
16
|
+
def headers
|
17
|
+
[
|
18
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0a2) Gecko/20111101 Firefox/9.0a2',
|
19
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
|
20
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
|
21
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110612 Firefox/6.0a2',
|
22
|
+
'Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0',
|
23
|
+
'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
|
24
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0 FirePHP/0.6',
|
25
|
+
'Mozilla/5.0 (Windows NT 5.0; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
|
26
|
+
'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:5.0a2) Gecko/20110524 Firefox/5.0a2',
|
27
|
+
'Mozilla/5.0 (Windows NT 6.1; U; ru; rv:5.0.1.6) Gecko/20110501 Firefox/5.0.1 Firefox/5.0.1',
|
28
|
+
'mozilla/3.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/5.0.1',
|
29
|
+
'Mozilla/5.0 (X11; U; Linux i586; de; rv:5.0) Gecko/20100101 Firefox/5.0',
|
30
|
+
'Mozilla/5.0 (X11; U; Linux amd64; rv:5.0) Gecko/20100101 Firefox/5.0 (Debian)',
|
31
|
+
'Mozilla/5.0 (X11; U; Linux amd64; en-US; rv:5.0) Gecko/20110619 Firefox/5.0',
|
32
|
+
'Mozilla/5.0 (X11; Linux) Gecko Firefox/5.0',
|
33
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:5.0) Gecko/20100101 Firefox/5.0 FirePHP/0.5',
|
34
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:5.0) Gecko/20100101 Firefox/5.0 Firefox/5.0',
|
35
|
+
'Mozilla/5.0 (X11; Linux x86_64) Gecko Firefox/5.0',
|
36
|
+
'Mozilla/5.0 (X11; Linux ppc; rv:5.0) Gecko/20100101 Firefox/5.0',
|
37
|
+
'Mozilla/5.0 (X11; Linux AMD64) Gecko Firefox/5.0',
|
38
|
+
'Mozilla/5.0 (X11; FreeBSD amd64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
39
|
+
'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
40
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:5.0) Gecko/20110619 Firefox/5.0',
|
41
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
42
|
+
'Mozilla/5.0 (Windows NT 6.1.1; rv:5.0) Gecko/20100101 Firefox/5.0',
|
43
|
+
'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
44
|
+
'Mozilla/5.0 (Windows NT 5.1; U; rv:5.0) Gecko/20100101 Firefox/5.0',
|
45
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/5.0',
|
46
|
+
'Mozilla/5.0 (Windows NT 5.0; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0',
|
47
|
+
'Mozilla/5.0 (Windows NT 5.0; rv:5.0) Gecko/20100101 Firefox/5.0',
|
48
|
+
'Mozilla/5.0 (U; Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
|
49
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20110324 Firefox/4.2a1pre',
|
50
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20100101 Firefox/4.2a1pre',
|
51
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110324 Firefox/4.2a1pre',
|
52
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110323 Firefox/4.2a1pre',
|
53
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110208 Firefox/4.2a1pre',
|
54
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b9pre) Gecko/20110111 Firefox/4.0b9pre',
|
55
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b9pre) Gecko/20101228 Firefox/4.0b9pre',
|
56
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:2.0b9pre) Gecko/20110105 Firefox/4.0b9pre',
|
57
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b8pre) Gecko/20101114 Firefox/4.0b8pre',
|
58
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b8pre) Gecko/20101213 Firefox/4.0b8pre',
|
59
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b8pre) Gecko/20101128 Firefox/4.0b8pre',
|
60
|
+
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b8pre) Gecko/20101114 Firefox/4.0b8pre',
|
61
|
+
'Mozilla/5.0 (Windows NT 5.1; rv:2.0b8pre) Gecko/20101127 Firefox/4.0b8pre',
|
62
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0',
|
63
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
64
|
+
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
65
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
|
66
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.6 (KHTML, like Gecko) Chrome/16.0.897.0 Safari/535.6',
|
67
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2',
|
68
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Ubuntu/11.10 Chromium/15.0.874.120 Chrome/15.0.874.120 Safari/535.2',
|
69
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
|
70
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2',
|
71
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2',
|
72
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2',
|
73
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.860.0 Safari/535.2',
|
74
|
+
'Chrome/15.0.860.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/15.0.860.0',
|
75
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/10.04 Chromium/14.0.813.0 Chrome/14.0.813.0 Safari/535.1',
|
76
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
77
|
+
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
78
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
79
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
|
80
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.814.0 Chrome/14.0.814.0 Safari/535.1',
|
81
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.814.0 Safari/535.1',
|
82
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
83
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.803.0 Chrome/14.0.803.0 Safari/535.1',
|
84
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
85
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
86
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
87
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1',
|
88
|
+
'Mozilla/5.0 Slackware/13.37 (X11; U; Linux x86_64; en-US) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41',
|
89
|
+
'Mozilla/5.0 ArchLinux (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
90
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/13.0.782.41 Chrome/13.0.782.41 Safari/535.1',
|
91
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
92
|
+
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
93
|
+
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
94
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
95
|
+
'Mozilla/5.0 (Windows NT 5.2; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
96
|
+
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
97
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
98
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_3) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
99
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
|
100
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
|
101
|
+
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
|
102
|
+
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
|
103
|
+
'Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00',
|
104
|
+
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
|
105
|
+
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52',
|
106
|
+
'Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51',
|
107
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51',
|
108
|
+
'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50',
|
109
|
+
'Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50',
|
110
|
+
'Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11',
|
111
|
+
'Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11',
|
112
|
+
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11',
|
113
|
+
'Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01',
|
114
|
+
'Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01',
|
115
|
+
'Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01',
|
116
|
+
'Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
|
117
|
+
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01',
|
118
|
+
'Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01',
|
119
|
+
'Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01',
|
120
|
+
'Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01',
|
121
|
+
'Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01',
|
122
|
+
'Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01',
|
123
|
+
'Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01',
|
124
|
+
'Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01',
|
125
|
+
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01',
|
126
|
+
'Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01',
|
127
|
+
'Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01',
|
128
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01',
|
129
|
+
'Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10',
|
130
|
+
'Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10',
|
131
|
+
'Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10',
|
132
|
+
'Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1',
|
133
|
+
'Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00',
|
134
|
+
'Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00',
|
135
|
+
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00',
|
136
|
+
'Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00',
|
137
|
+
'Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00',
|
138
|
+
'Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00',
|
139
|
+
'Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00',
|
140
|
+
'Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00',
|
141
|
+
'Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00',
|
142
|
+
'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00',
|
143
|
+
'Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00',
|
144
|
+
'Opera/9.80 (Windows NT 5.1; U; it) Presto/2.7.62 Version/11.00',
|
145
|
+
'Mozilla/5.0 (Windows NT 6.0; U; ja; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00',
|
146
|
+
'Mozilla/5.0 (Windows NT 5.1; U; pl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00',
|
147
|
+
'Mozilla/5.0 (Windows NT 5.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00',
|
148
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; X11; Linux x86_64; pl) Opera 11.00',
|
149
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; fr) Opera 11.00',
|
150
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; ja) Opera 11.00',
|
151
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; en) Opera 11.00',
|
152
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; pl) Opera 11.00',
|
153
|
+
'Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.6.31 Version/10.70',
|
154
|
+
'Mozilla/5.0 (Windows NT 5.2; U; ru; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.70',
|
155
|
+
'Mozilla/5.0 (Windows NT 5.1; U; zh-cn; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.70',
|
156
|
+
'Opera/9.80 (X11; Linux i686; U; en-GB) Presto/2.5.24 Version/10.53',
|
157
|
+
'Mozilla/5.0 (Windows NT 5.1; U; zh-cn; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
158
|
+
'Mozilla/5.0 (Windows NT 5.1; U; Firefox/5.0; en; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
159
|
+
'Mozilla/5.0 (Windows NT 5.1; U; Firefox/4.5; en; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
160
|
+
'Mozilla/5.0 (Windows NT 5.1; U; Firefox/3.5; en; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 10.53',
|
161
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; ko) Opera 10.53',
|
162
|
+
'Opera/9.80 (X11; U; Linux i686; en-US; rv:1.9.2.3) Presto/2.2.15 Version/10.10',
|
163
|
+
'Opera/9.80 (X11; Linux x86_64; U; it) Presto/2.2.15 Version/10.10',
|
164
|
+
'Opera/9.80 (Windows NT 6.1; U; de) Presto/2.2.15 Version/10.10',
|
165
|
+
'Opera/9.80 (Windows NT 6.0; U; Gecko/20100115; pl) Presto/2.2.15 Version/10.10',
|
166
|
+
'Opera/9.80 (Windows NT 6.0; U; en) Presto/2.2.15 Version/10.10',
|
167
|
+
'Opera/9.80 (Windows NT 5.1; U; de) Presto/2.2.15 Version/10.10',
|
168
|
+
'Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.2.15 Version/10.10',
|
169
|
+
'Mozilla/5.0 (Windows NT 6.0; U; tr; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 10.10',
|
170
|
+
'Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; de) Opera 10.10',
|
171
|
+
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 6.0; tr) Opera 10.10',
|
172
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
|
173
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
|
174
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
|
175
|
+
'Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
|
176
|
+
'Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)',
|
177
|
+
'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))',
|
178
|
+
'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
|
179
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)',
|
180
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)',
|
181
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7',
|
182
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)',
|
183
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)',
|
184
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
|
185
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
|
186
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)',
|
187
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0',
|
188
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)',
|
189
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)',
|
190
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)',
|
191
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)',
|
192
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)',
|
193
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
|
194
|
+
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)',
|
195
|
+
'Mozilla/5.0 ( ; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
|
196
|
+
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; FDM; MSIECrawler; Media Center PC 5.0)',
|
197
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
|
198
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)',
|
199
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)',
|
200
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)',
|
201
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
202
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)',
|
203
|
+
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)',
|
204
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)',
|
205
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
|
206
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)',
|
207
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8',
|
208
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)',
|
209
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)',
|
210
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)',
|
211
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)',
|
212
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; msn OptimizedIE8;ZHCN)',
|
213
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; InfoPath.3; .NET4.0C; .NET4.0E) chromeframe/8.0.552.224',
|
214
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; Zune 4.7; InfoPath.3)',
|
215
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; Zune 4.7)',
|
216
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8)',
|
217
|
+
'Mozilla/4.0(compatible; MSIE 7.0b; Windows NT 6.0)',
|
218
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
|
219
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
|
220
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; Media Center PC 3.0; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1)',
|
221
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; FDM; .NET CLR 1.1.4322)',
|
222
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727)',
|
223
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1)',
|
224
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; Alexa Toolbar; .NET CLR 2.0.50727)',
|
225
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; Alexa Toolbar)',
|
226
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
227
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.40607)',
|
228
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322)',
|
229
|
+
'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.0.3705; Media Center PC 3.1; Alexa Toolbar; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
230
|
+
'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
|
231
|
+
'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; el-GR)',
|
232
|
+
'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 5.2)',
|
233
|
+
'Mozilla/5.0 (MSIE 7.0; Macintosh; U; SunOS; X11; gu; SV1; InfoPath.2; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
|
234
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; WOW64; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; c .NET CLR 3.0.04506; .NET CLR 3.5.30707; InfoPath.1; el-GR)',
|
235
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; c .NET CLR 3.0.04506; .NET CLR 3.5.30707; InfoPath.1; el-GR)',
|
236
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; fr-FR)',
|
237
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; en-US)',
|
238
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.2; WOW64; .NET CLR 2.0.50727)',
|
239
|
+
'Mozilla/5.0 (compatible; MSIE 7.0; Windows 98; SpamBlockerUtility 6.3.91; SpamBlockerUtility 6.2.91; .NET CLR 4.1.89;GB)',
|
240
|
+
'Mozilla/4.79 [en] (compatible; MSIE 7.0; Windows NT 5.0; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
|
241
|
+
'Mozilla/4.0 (Windows; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
242
|
+
'Mozilla/4.0 (Mozilla/4.0; MSIE 7.0; Windows NT 5.1; FDM; SV1; .NET CLR 3.0.04506.30)',
|
243
|
+
'Mozilla/4.0 (Mozilla/4.0; MSIE 7.0; Windows NT 5.1; FDM; SV1)',
|
244
|
+
'Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)',
|
245
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)',
|
246
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; InfoPath.3)',
|
247
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; chromeframe/12.0.742.100)',
|
248
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)',
|
249
|
+
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
|
250
|
+
'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
251
|
+
'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
252
|
+
'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)',
|
253
|
+
'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
254
|
+
'Mozilla/45.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
255
|
+
'Mozilla/4.08 (compatible; MSIE 6.0; Windows NT 5.1)',
|
256
|
+
'Mozilla/4.01 (compatible; MSIE 6.0; Windows NT 5.1)',
|
257
|
+
'Mozilla/4.0 (X11; MSIE 6.0; i686; .NET CLR 1.1.4322; .NET CLR 2.0.50727; FDM)',
|
258
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 6.0)',
|
259
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.2)',
|
260
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.0)',
|
261
|
+
'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
|
262
|
+
'Mozilla/4.0 (MSIE 6.0; Windows NT 5.1)',
|
263
|
+
'Mozilla/4.0 (MSIE 6.0; Windows NT 5.0)',
|
264
|
+
'Mozilla/4.0 (compatible;MSIE 6.0;Windows 98;Q312461)',
|
265
|
+
'Mozilla/4.0 (Compatible; Windows NT 5.1; MSIE 6.0) (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
266
|
+
'Mozilla/4.0 (compatible; U; MSIE 6.0; Windows NT 5.1) (Compatible; ; ; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
|
267
|
+
'Mozilla/4.0 (compatible; U; MSIE 6.0; Windows NT 5.1)',
|
268
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3; Tablet PC 2.0)',
|
269
|
+
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB6.5; QQDownload 534; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729)'
|
270
|
+
]
|
271
|
+
end
|
272
|
+
end
|
273
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Pioneer
  # One HTTP GET performed on behalf of a crawler (the "pioneer").
  #
  # The pioneer object is expected to provide +logger+, +http_opts+ and
  # +processing(request)+. It may optionally provide the callbacks
  # +if_request_error+, +if_response_error+, +if_status_<code>+ and
  # +if_status_not_200+, each of which receives this request.
  class Request
    # Matches a URL that already starts with an explicit http:// or https://
    # scheme. Anchored so that "http" appearing later in the string (for
    # example in a path) does not count as a scheme.
    HTTP_SCHEME = %r{\Ahttps?://}i

    attr_reader :pioneer, :url, :result, :response, :error

    # url     - address to fetch; "http://" is prepended when no http/https
    #           scheme is present, then the URL is escaped.
    # pioneer - the crawler that owns this request.
    def initialize(url, pioneer)
      @pioneer = pioneer
      url = "http://" + url unless url =~ HTTP_SCHEME
      # URI.escape was removed in Ruby 3.0; it delegated to this parser call.
      @url = URI::DEFAULT_PARSER.escape(url)
    end

    # Runs the request and stores the outcome in +result+.
    # Returns whatever the processing method or the matching callback returned.
    def perform
      pioneer.logger.info("going to #{url}")
      @result = handle_request_error_or_return_result
    end

    # Issues the GET. If issuing it raises, the failure is recorded in +error+
    # and logged as fatal; then +if_request_error+ is called when the pioneer
    # defines it, otherwise HttpRequestError is raised.
    # On success, continues with the response-level checks.
    def handle_request_error_or_return_result
      begin
        @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
      rescue => e
        @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
        pioneer.logger.fatal(error)
        if pioneer.respond_to? :if_request_error
          return pioneer.send(:if_request_error, self)
        else
          raise HttpRequestError, @error
        end
      end
      handle_response_error_or_return_result
    end

    # Checks the response object for an error. When one is present it is
    # recorded in +error+ (so callbacks can read it from the request) and
    # logged; then +if_response_error+ is called when the pioneer defines it,
    # otherwise HttpResponseError is raised.
    # Without an error, continues with the status check.
    def handle_response_error_or_return_result
      if response.error
        @error = "Response for #{url} get an error: #{response.error}"
        pioneer.logger.error(error)
        if pioneer.respond_to? :if_response_error
          return pioneer.send(:if_response_error, self)
        else
          raise HttpResponseError, error
        end
      end
      handle_status_or_return_result
    end

    # Dispatches on the HTTP status: 200 goes to +pioneer.processing+; any
    # other status is logged and handed to +if_status_<code>+, falling back
    # to +if_status_not_200+. Returns nil when neither callback is defined.
    def handle_status_or_return_result
      status = response.response_header.status
      case status
      when 200
        pioneer.processing(self)
      else
        pioneer.logger.error("This #{url} returns this http status: #{status}")
        if pioneer.respond_to? "if_status_#{status}".to_sym
          pioneer.send("if_status_#{status}", self)
        elsif pioneer.respond_to? :if_status_not_200
          pioneer.send(:if_status_not_200, self)
        else
          nil # no handler defined for this status
        end
      end
    end
  end
end
|
data/pioneer.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "pioneer/version"
|
4
|
+
|
5
|
+
# Gem specification for pioneer: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = "pioneer"
  gem.version     = Pioneer::VERSION
  gem.authors     = ["Petr"]
  gem.email       = ["pedro.yanoviches@gmail.com"]
  gem.homepage    = ""
  gem.summary     = "HTTP crawler"
  gem.description = "Simple async HTTP crawler based on em-synchrony"

  gem.rubyforge_project = "pioneer"

  # File lists are taken from git at build time.
  tracked     = %x(git ls-files)
  test_paths  = %x(git ls-files -- {test,spec,features}/*)
  binaries    = %x(git ls-files -- bin/*)

  gem.files         = tracked.split("\n")
  gem.test_files    = test_paths.split("\n")
  gem.executables   = binaries.split("\n").map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Dependencies: yajl-ruby for development only, em-synchrony at runtime.
  gem.add_development_dependency "yajl-ruby"
  gem.add_runtime_dependency "em-synchrony"
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'yajl'
|
4
|
+
#
|
5
|
+
# TODO:
|
6
|
+
# Rewrite real live examples with StubServer
|
7
|
+
#
|
8
|
+
|
9
|
+
# Integration specs for Pioneer::Request. These hit live web sites, so they
# need network access (see the TODO at the top of the file about StubServer).
describe Pioneer::Request do
  before do
    @pioneer1 = CustomCrawler1.new(name: "Custom crawler 1")
    @pioneer2 = Pioneer::Crawler.new(name: "Base crawler 2")
    @pioneer3 = Pioneer::Crawler.new(name: "Base crawler 3")
  end

  it "should return two 200 response statuses" do
    @pioneer1.start.must_equal [200, 200]
  end

  it "should redefine methods" do
    # processing can be replaced with a proc at runtime
    processing = proc{ |req| req.response.response_header.status + 1 }
    @pioneer2.processing = processing
    @pioneer2.locations = ["www.apple.com", "www.amazon.com"]
    @pioneer2.start.must_equal [201, 201]
    # an unreachable host goes through the if_response_error callback
    @pioneer2.locations = ["www.ru.erro"]
    if_response_error = proc{ |req| "fail" }
    @pioneer2.if_response_error = if_response_error
    @pioneer2.start.must_equal ["fail"]
  end

  it "should execute if_status_xxx" do
    redirector = proc{ |req| "redirected" }
    error404 = proc{ |req| "notfound" }
    @pioneer2.locations = ["google.com/redirectmeplease", "http://www.amazon.com/notfoundpage"]
    @pioneer2.if_status_301 = redirector
    @pioneer2.if_status_404 = error404
    @pioneer2.start.must_equal ["redirected", "notfound"]
  end

  it "should execute if_status_not_200 if another callback is not defined" do
    not_200 = proc{ "something goes wrong" }
    redirector = proc{ |req| "redirected" }
    @pioneer3.locations = ["google.com/redirectmeplease", "http://www.amazon.com/notfoundpage"]
    @pioneer3.if_status_301 = redirector
    @pioneer3.if_status_not_200 = not_200
    @pioneer3.start.must_equal ["redirected", "something goes wrong"]
  end

  # LAST FM API TEST
  it "should return similar artists for a number of them" do
    @lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
    @lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
  end

  it "should use headers" do
    @crawler1 = KinopoiskCrawler.new(random_header: false)
    @crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
    @crawler3 = KinopoiskCrawler.new(random_header: true)
    # this one will redirect
    @crawler1.start.must_equal [nil]
    # this one will return some restrictions (it needs real headers);
    # must_be reports the actual size on failure instead of "expected true"
    @crawler2.start.first.must_be :<, 10000
    # and this one will fire up
    @crawler3.start.first.must_be :>, 10000
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'pioneer'
|
2
|
+
require 'minitest/spec'
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# Crawler fixture: requests http://www.ru twice and reports each HTTP status.
class CustomCrawler1 < Pioneer::Base
  # Target URLs: the same address listed twice.
  def locations
    Array.new(2) { "http://www.ru" }
  end

  # Result for one finished request: the status from its response header.
  def processing(req)
    header = req.response.response_header
    header.status
  end
end
|
16
|
+
|
17
|
+
# LastFM test
|
18
|
+
# Enumerable source of Last.fm "artist.getsimilar" API URLs, one per artist
# in ARTISTS.
class LastfmEnum
  include Enumerable

  # Artists to query. Frozen so the fixture cannot be mutated by accident.
  ARTISTS = ["Cher", "Madonna", "Rolling Stones", "The Beatles", "Muse"].freeze

  # Yields one request URL per artist, in ARTISTS order.
  # Returns an Enumerator when called without a block, per the usual
  # Enumerable convention (previously this raised LocalJumpError).
  #
  # The artist name is interpolated as-is (spaces included); no escaping is
  # applied here.
  # NOTE(review): the API key is hard-coded in the source; consider reading
  # it from the environment instead.
  def each
    return to_enum(:each) unless block_given?

    ARTISTS.each do |artist|
      yield "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
    end
  end
end
|
30
|
+
|
31
|
+
# Crawler fixture that walks the URLs produced by LastfmEnum.
class LastfmCrawler < Pioneer::Base
  # A fresh LastfmEnum supplies the request URLs.
  def locations
    LastfmEnum.new
  end

  # Parses the JSON response body and returns the value stored under
  # "similarartists" -> "@attr" -> "artist".
  def processing(req)
    body = req.response.response
    payload = Yajl::Parser.parse(body)
    similar = payload["similarartists"]
    similar["@attr"]["artist"]
  end
end
|
41
|
+
|
42
|
+
# Kinopoisk
|
43
|
+
# Crawler fixture that fetches a single kinopoisk.ru film page.
class KinopoiskCrawler < Pioneer::Base
  # The single page to request.
  def locations
    film_page = "http://www.kinopoisk.ru/level/1/film/614667/"
    [film_page]
  end

  # Result for one finished request: the size of the response body.
  def processing(req)
    body = req.response.response
    body.size
  end
end
|
metadata
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pioneer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1.alpha
|
5
|
+
prerelease: 6
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Petr
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-22 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: yajl-ruby
|
16
|
+
requirement: &74894120 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *74894120
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: em-synchrony
|
27
|
+
requirement: &74893910 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *74893910
|
36
|
+
description: Simple async HTTP crawler based on em-synchrony
|
37
|
+
email:
|
38
|
+
- pedro.yanoviches@gmail.com
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- .travis.yml
|
45
|
+
- CHANGELOG
|
46
|
+
- Gemfile
|
47
|
+
- LICENSE
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- lib/patch/fiber_periodic_timer_iterator.rb
|
51
|
+
- lib/patch/iterator.rb
|
52
|
+
- lib/pioneer.rb
|
53
|
+
- lib/pioneer/base.rb
|
54
|
+
- lib/pioneer/crawler.rb
|
55
|
+
- lib/pioneer/http_header.rb
|
56
|
+
- lib/pioneer/request.rb
|
57
|
+
- lib/pioneer/version.rb
|
58
|
+
- pioneer.gemspec
|
59
|
+
- spec/pioneer/base_spec.rb
|
60
|
+
- spec/pioneer/request_spec.rb
|
61
|
+
- spec/spec_helper.rb
|
62
|
+
- tmp/just_for_test/railscasts.txt
|
63
|
+
homepage: ''
|
64
|
+
licenses: []
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ! '>='
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>'
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 1.3.1
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project: pioneer
|
83
|
+
rubygems_version: 1.8.15
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: HTTP crawler
|
87
|
+
test_files: []
|
88
|
+
has_rdoc:
|