pioneer 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -1
- data/lib/pioneer/base.rb +35 -6
- data/lib/pioneer/request.rb +11 -8
- data/lib/pioneer/version.rb +1 -1
- metadata +10 -10
data/CHANGELOG
CHANGED
@@ -23,4 +23,9 @@
|
|
23
23
|
|
24
24
|
* Added skip functionality
|
25
25
|
* Added some docs
|
26
|
-
* Added "response_body" and "response_header" shortcuts
|
26
|
+
* Added "response_body" and "response_header" shortcuts
|
27
|
+
|
28
|
+
## v0.0.7
|
29
|
+
|
30
|
+
* Rescuing of the Retry exception has been moved to the Base class, so it is now triggered in the context of the main loop (the retry will be executed with the global sleep timeout)
|
31
|
+
* Added `headers` callback support
|
data/lib/pioneer/base.rb
CHANGED
@@ -16,7 +16,7 @@ module Pioneer
|
|
16
16
|
def initialize(opts = {})
|
17
17
|
raise UndefinedLocations, "you should specify `locations` method in your `self.class`" unless self.methods.include? :locations
|
18
18
|
raise UndefinedProcessing, "you should specify `processing` method in your `self.class`" unless self.methods.include? :processing
|
19
|
-
raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
|
19
|
+
# raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
|
20
20
|
@name = opts[:name] || "crawler"
|
21
21
|
@concurrency = opts[:concurrency] || 10
|
22
22
|
@sleep = opts[:sleep] || 0 # sleep is reversed RPS (1/RPS) - frequency of requests.
|
@@ -25,18 +25,24 @@ module Pioneer
|
|
25
25
|
@random_header = opts[:random_header] || false
|
26
26
|
@header = opts[:header] || nil
|
27
27
|
@redirects = opts[:redirects] || nil
|
28
|
+
@headers = opts[:headers] #|| nil
|
28
29
|
end
|
29
30
|
|
31
|
+
#
|
32
|
+
# Main method: starting crawling through locations
|
33
|
+
#
|
30
34
|
def start
|
31
|
-
raise LocationsNotEnumerable, "location should respond to `each`" unless locations.respond_to? :each
|
32
35
|
result = []
|
33
36
|
EM.synchrony do
|
34
|
-
# Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
|
35
|
-
# In case @sleep is 0 it behaves like standart FiberIterator
|
36
37
|
EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
|
37
|
-
|
38
|
+
counter = 0
|
38
39
|
begin
|
39
|
-
|
40
|
+
sleep
|
41
|
+
result << Request.new(url, self, counter).perform
|
42
|
+
rescue Pioneer::HttpRetryRequest => e
|
43
|
+
# return to our loop
|
44
|
+
counter += 1
|
45
|
+
retry
|
40
46
|
rescue Pioneer::HttpSkipRequest => e
|
41
47
|
nil # do nothing?
|
42
48
|
end
|
@@ -46,7 +52,9 @@ module Pioneer
|
|
46
52
|
result
|
47
53
|
end
|
48
54
|
|
55
|
+
#
|
49
56
|
# Sleep if the last request was made recently (less than the timeout period ago)
|
57
|
+
#
|
50
58
|
def sleep
|
51
59
|
@next_start ||= Time.now
|
52
60
|
if @sleep > 0
|
@@ -58,6 +66,9 @@ module Pioneer
|
|
58
66
|
end
|
59
67
|
end
|
60
68
|
|
69
|
+
#
|
70
|
+
# Default Pioneer logger
|
71
|
+
#
|
61
72
|
def logger
|
62
73
|
@logger ||= begin
|
63
74
|
logger = Logger.new(STDOUT)
|
@@ -66,6 +77,9 @@ module Pioneer
|
|
66
77
|
end
|
67
78
|
end
|
68
79
|
|
80
|
+
#
|
81
|
+
# Set headers, such as redirects, cookies etc
|
82
|
+
#
|
69
83
|
def http_opts
|
70
84
|
opts = {}
|
71
85
|
opts[:head] = random_header if @random_header
|
@@ -74,11 +88,23 @@ module Pioneer
|
|
74
88
|
opts
|
75
89
|
end
|
76
90
|
|
91
|
+
#
|
92
|
+
# Generate random header for request
|
93
|
+
#
|
77
94
|
def random_header
|
78
95
|
HttpHeader.random
|
79
96
|
end
|
80
97
|
|
98
|
+
#
|
99
|
+
# Headers callback
|
100
|
+
#
|
101
|
+
def headers
|
102
|
+
@headers
|
103
|
+
end
|
104
|
+
|
105
|
+
#
|
81
106
|
# we should override only our methods: locations, processing, if_XXX
|
107
|
+
#
|
82
108
|
def method_missing(method_name, *args, &block)
|
83
109
|
case method_name
|
84
110
|
when /locations.*=|processing.*=|if_.+=/
|
@@ -89,6 +115,9 @@ module Pioneer
|
|
89
115
|
end
|
90
116
|
end
|
91
117
|
|
118
|
+
#
|
119
|
+
# Overriding methods as singletons so they are available only for the current instance of the crawler
|
120
|
+
#
|
92
121
|
def override_method(method_name, arg)
|
93
122
|
if Proc === arg
|
94
123
|
self.define_singleton_method method_name do |req|
|
data/lib/pioneer/request.rb
CHANGED
@@ -3,10 +3,10 @@ module Pioneer
|
|
3
3
|
class Request
|
4
4
|
attr_reader :pioneer, :url, :result, :response, :error, :counter
|
5
5
|
|
6
|
-
def initialize(url, pioneer)
|
6
|
+
def initialize(url, pioneer, counter=0)
|
7
7
|
@pioneer = pioneer
|
8
8
|
@url = parse_url(url)
|
9
|
-
@counter =
|
9
|
+
@counter = counter
|
10
10
|
end
|
11
11
|
|
12
12
|
#
|
@@ -24,7 +24,13 @@ module Pioneer
|
|
24
24
|
#
|
25
25
|
def handle_request_error_or_return_result
|
26
26
|
begin
|
27
|
-
|
27
|
+
req = EM::HttpRequest.new(url).aget pioneer.http_opts
|
28
|
+
if pioneer.headers
|
29
|
+
req.headers{
|
30
|
+
pioneer.headers.call(req)
|
31
|
+
}
|
32
|
+
end
|
33
|
+
@response = EM::Synchrony.sync req
|
28
34
|
rescue => e
|
29
35
|
@error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
30
36
|
pioneer.logger.fatal(@error)
|
@@ -35,8 +41,6 @@ module Pioneer
|
|
35
41
|
end
|
36
42
|
end
|
37
43
|
handle_response_error_or_return_result
|
38
|
-
rescue Pioneer::HttpRetryRequest => e
|
39
|
-
retry
|
40
44
|
end
|
41
45
|
|
42
46
|
#
|
@@ -87,8 +91,7 @@ module Pioneer
|
|
87
91
|
#
|
88
92
|
def retry(count=nil)
|
89
93
|
if count
|
90
|
-
@counter
|
91
|
-
skip if @counter > count
|
94
|
+
skip if @counter >= count
|
92
95
|
end
|
93
96
|
raise Pioneer::HttpRetryRequest
|
94
97
|
end
|
@@ -105,7 +108,7 @@ module Pioneer
|
|
105
108
|
# We should parse the url before sending the request
|
106
109
|
# We use URI.escape for escaping
|
107
110
|
# IMPORTANT: We should replace ampersand (&) in params with "&" !!!
|
108
|
-
# Pluses (+)
|
111
|
+
# Pluses (+) will be replaced with "%2B"
|
109
112
|
#
|
110
113
|
def parse_url(url)
|
111
114
|
url = "http://" + url unless url =~ /http/
|
data/lib/pioneer/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &77891380 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *77891380
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: nokogiri
|
27
|
-
requirement: &
|
27
|
+
requirement: &77891160 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *77891160
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-synchrony
|
38
|
-
requirement: &
|
38
|
+
requirement: &77890920 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *77890920
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: em-http-request
|
49
|
-
requirement: &
|
49
|
+
requirement: &77890690 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *77890690
|
58
58
|
description: Simple async HTTP crawler based on em-synchrony
|
59
59
|
email:
|
60
60
|
- pedro.yanoviches@gmail.com
|