pioneer 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -1
- data/lib/pioneer/base.rb +35 -6
- data/lib/pioneer/request.rb +11 -8
- data/lib/pioneer/version.rb +1 -1
- metadata +10 -10
data/CHANGELOG
CHANGED
@@ -23,4 +23,9 @@
|
|
23
23
|
|
24
24
|
* Added skip functionality
|
25
25
|
* Added some docs
|
26
|
-
* Added "response_body" and "response_header" shortcuts
|
26
|
+
* Added "response_body" and "response_header" shortcuts
|
27
|
+
|
28
|
+
## v0.0.7
|
29
|
+
|
30
|
+
* Rescuing of the Retry exception has been moved to the Base class, so it is now triggered in the context of the main loop (and is therefore executed with the global sleep timeout)
|
31
|
+
* Added `headers` callback support
|
data/lib/pioneer/base.rb
CHANGED
@@ -16,7 +16,7 @@ module Pioneer
|
|
16
16
|
def initialize(opts = {})
|
17
17
|
raise UndefinedLocations, "you should specify `locations` method in your `self.class`" unless self.methods.include? :locations
|
18
18
|
raise UndefinedProcessing, "you should specify `processing` method in your `self.class`" unless self.methods.include? :processing
|
19
|
-
raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
|
19
|
+
# raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
|
20
20
|
@name = opts[:name] || "crawler"
|
21
21
|
@concurrency = opts[:concurrency] || 10
|
22
22
|
@sleep = opts[:sleep] || 0 # sleep is reversed RPS (1/RPS) - frequency of requests.
|
@@ -25,18 +25,24 @@ module Pioneer
|
|
25
25
|
@random_header = opts[:random_header] || false
|
26
26
|
@header = opts[:header] || nil
|
27
27
|
@redirects = opts[:redirects] || nil
|
28
|
+
@headers = opts[:headers] #|| nil
|
28
29
|
end
|
29
30
|
|
31
|
+
#
|
32
|
+
# Main method: starting crawling through locations
|
33
|
+
#
|
30
34
|
def start
|
31
|
-
raise LocationsNotEnumerable, "location should respond to `each`" unless locations.respond_to? :each
|
32
35
|
result = []
|
33
36
|
EM.synchrony do
|
34
|
-
# Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
|
35
|
-
# In case @sleep is 0 it behaves like standart FiberIterator
|
36
37
|
EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
|
37
|
-
|
38
|
+
counter = 0
|
38
39
|
begin
|
39
|
-
|
40
|
+
sleep
|
41
|
+
result << Request.new(url, self, counter).perform
|
42
|
+
rescue Pioneer::HttpRetryRequest => e
|
43
|
+
# return to our loop
|
44
|
+
counter += 1
|
45
|
+
retry
|
40
46
|
rescue Pioneer::HttpSkipRequest => e
|
41
47
|
nil # do nothing?
|
42
48
|
end
|
@@ -46,7 +52,9 @@ module Pioneer
|
|
46
52
|
result
|
47
53
|
end
|
48
54
|
|
55
|
+
#
|
49
56
|
# Sleep if the last request was made recently (less than the timeout period ago)
|
57
|
+
#
|
50
58
|
def sleep
|
51
59
|
@next_start ||= Time.now
|
52
60
|
if @sleep > 0
|
@@ -58,6 +66,9 @@ module Pioneer
|
|
58
66
|
end
|
59
67
|
end
|
60
68
|
|
69
|
+
#
|
70
|
+
# Default Pioneer logger
|
71
|
+
#
|
61
72
|
def logger
|
62
73
|
@logger ||= begin
|
63
74
|
logger = Logger.new(STDOUT)
|
@@ -66,6 +77,9 @@ module Pioneer
|
|
66
77
|
end
|
67
78
|
end
|
68
79
|
|
80
|
+
#
|
81
|
+
# Set headers, such as redirects, cookies etc
|
82
|
+
#
|
69
83
|
def http_opts
|
70
84
|
opts = {}
|
71
85
|
opts[:head] = random_header if @random_header
|
@@ -74,11 +88,23 @@ module Pioneer
|
|
74
88
|
opts
|
75
89
|
end
|
76
90
|
|
91
|
+
#
|
92
|
+
# Generate random header for request
|
93
|
+
#
|
77
94
|
def random_header
|
78
95
|
HttpHeader.random
|
79
96
|
end
|
80
97
|
|
98
|
+
#
|
99
|
+
# Headers callback
|
100
|
+
#
|
101
|
+
def headers
|
102
|
+
@headers
|
103
|
+
end
|
104
|
+
|
105
|
+
#
|
81
106
|
# we should override only our methods: locations, processing, if_XXX
|
107
|
+
#
|
82
108
|
def method_missing(method_name, *args, &block)
|
83
109
|
case method_name
|
84
110
|
when /locations.*=|processing.*=|if_.+=/
|
@@ -89,6 +115,9 @@ module Pioneer
|
|
89
115
|
end
|
90
116
|
end
|
91
117
|
|
118
|
+
#
|
119
|
+
# Overriding methods as singletons so they are available only for the current instance of the crawler
|
120
|
+
#
|
92
121
|
def override_method(method_name, arg)
|
93
122
|
if Proc === arg
|
94
123
|
self.define_singleton_method method_name do |req|
|
data/lib/pioneer/request.rb
CHANGED
@@ -3,10 +3,10 @@ module Pioneer
|
|
3
3
|
class Request
|
4
4
|
attr_reader :pioneer, :url, :result, :response, :error, :counter
|
5
5
|
|
6
|
-
def initialize(url, pioneer)
|
6
|
+
def initialize(url, pioneer, counter=0)
|
7
7
|
@pioneer = pioneer
|
8
8
|
@url = parse_url(url)
|
9
|
-
@counter =
|
9
|
+
@counter = counter
|
10
10
|
end
|
11
11
|
|
12
12
|
#
|
@@ -24,7 +24,13 @@ module Pioneer
|
|
24
24
|
#
|
25
25
|
def handle_request_error_or_return_result
|
26
26
|
begin
|
27
|
-
|
27
|
+
req = EM::HttpRequest.new(url).aget pioneer.http_opts
|
28
|
+
if pioneer.headers
|
29
|
+
req.headers{
|
30
|
+
pioneer.headers.call(req)
|
31
|
+
}
|
32
|
+
end
|
33
|
+
@response = EM::Synchrony.sync req
|
28
34
|
rescue => e
|
29
35
|
@error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
30
36
|
pioneer.logger.fatal(@error)
|
@@ -35,8 +41,6 @@ module Pioneer
|
|
35
41
|
end
|
36
42
|
end
|
37
43
|
handle_response_error_or_return_result
|
38
|
-
rescue Pioneer::HttpRetryRequest => e
|
39
|
-
retry
|
40
44
|
end
|
41
45
|
|
42
46
|
#
|
@@ -87,8 +91,7 @@ module Pioneer
|
|
87
91
|
#
|
88
92
|
def retry(count=nil)
|
89
93
|
if count
|
90
|
-
@counter
|
91
|
-
skip if @counter > count
|
94
|
+
skip if @counter >= count
|
92
95
|
end
|
93
96
|
raise Pioneer::HttpRetryRequest
|
94
97
|
end
|
@@ -105,7 +108,7 @@ module Pioneer
|
|
105
108
|
# We should parse the url before sending the request
|
106
109
|
# We use URI.escape for escaping
|
107
110
|
# IMPORTANT: We should replace ampersand (&) in params with "&" !!!
|
108
|
-
# Pluses (+)
|
111
|
+
# Pluses (+) will be replaced with "%2B"
|
109
112
|
#
|
110
113
|
def parse_url(url)
|
111
114
|
url = "http://" + url unless url =~ /http/
|
data/lib/pioneer/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &77891380 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *77891380
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: nokogiri
|
27
|
-
requirement: &
|
27
|
+
requirement: &77891160 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *77891160
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-synchrony
|
38
|
-
requirement: &
|
38
|
+
requirement: &77890920 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *77890920
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: em-http-request
|
49
|
-
requirement: &
|
49
|
+
requirement: &77890690 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *77890690
|
58
58
|
description: Simple async HTTP crawler based on em-synchrony
|
59
59
|
email:
|
60
60
|
- pedro.yanoviches@gmail.com
|