pioneer 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +11 -1
- data/lib/pioneer.rb +0 -1
- data/lib/pioneer/base.rb +28 -10
- data/lib/pioneer/http_header.rb +3 -2
- data/lib/pioneer/request.rb +79 -16
- data/lib/pioneer/version.rb +1 -1
- data/pioneer.gemspec +2 -0
- data/spec/pioneer/request_spec.rb +23 -1
- metadata +28 -6
data/CHANGELOG
CHANGED
@@ -13,4 +13,14 @@
|
|
13
13
|
## v0.0.4
|
14
14
|
|
15
15
|
* Fixed stackoverflow on retrying request after failing
|
16
|
-
* added `req.retry` method to retry http request in error handlers
|
16
|
+
* added `req.retry` method to retry http request in error handlers
|
17
|
+
|
18
|
+
## v0.0.5
|
19
|
+
|
20
|
+
* Added some minor fixes
|
21
|
+
|
22
|
+
## v0.0.6
|
23
|
+
|
24
|
+
* Added skip functionality
|
25
|
+
* Added some docs
|
26
|
+
* Added "response_body" and "response_header" shortcuts
|
data/lib/pioneer.rb
CHANGED
data/lib/pioneer/base.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
|
-
class UndefinedLocations <
|
4
|
-
class LocationsNotEnumerable <
|
5
|
-
class UndefinedProcessing <
|
6
|
-
class LocationsNotEnumerator <
|
7
|
-
class HttpRequestError <
|
8
|
-
class HttpResponseError <
|
9
|
-
class HttpStatusError <
|
10
|
-
class HttpRetryRequest <
|
3
|
+
class UndefinedLocations < StandardError; end
|
4
|
+
class LocationsNotEnumerable < StandardError; end
|
5
|
+
class UndefinedProcessing < StandardError; end
|
6
|
+
class LocationsNotEnumerator < StandardError; end
|
7
|
+
class HttpRequestError < StandardError; end
|
8
|
+
class HttpResponseError < StandardError; end
|
9
|
+
class HttpStatusError < StandardError; end
|
10
|
+
class HttpRetryRequest < StandardError; end
|
11
|
+
class HttpSkipRequest < StandardError; end
|
11
12
|
|
12
13
|
class Base
|
13
14
|
attr_reader :name, :concurrency, :sleep, :log_level, :redirect
|
@@ -32,14 +33,31 @@ module Pioneer
|
|
32
33
|
EM.synchrony do
|
33
34
|
# Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
|
34
35
|
# In case @sleep is 0 it behaves like the standard FiberIterator
|
35
|
-
EM::Synchrony::
|
36
|
-
|
36
|
+
EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
|
37
|
+
sleep
|
38
|
+
begin
|
39
|
+
result << Request.new(url, self).perform
|
40
|
+
rescue Pioneer::HttpSkipRequest => e
|
41
|
+
nil # do nothing?
|
42
|
+
end
|
37
43
|
end
|
38
44
|
EM.stop
|
39
45
|
end
|
40
46
|
result
|
41
47
|
end
|
42
48
|
|
49
|
+
# Sleep if the last request was made recently (less than the timeout period ago)
|
50
|
+
def sleep
|
51
|
+
@next_start ||= Time.now
|
52
|
+
if @sleep > 0
|
53
|
+
now = Time.now
|
54
|
+
sleep_time = @next_start - Time.now
|
55
|
+
sleep_time = 0 if sleep_time < 0
|
56
|
+
@next_start = Time.now + sleep_time + @sleep
|
57
|
+
EM::Synchrony.sleep(sleep_time) if sleep_time > 0
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
43
61
|
def logger
|
44
62
|
@logger ||= begin
|
45
63
|
logger = Logger.new(STDOUT)
|
data/lib/pioneer/http_header.rb
CHANGED
@@ -5,10 +5,11 @@ module Pioneer
|
|
5
5
|
def random
|
6
6
|
header = headers.sample
|
7
7
|
headers = {
|
8
|
-
'Referer' => 'http://www.google.
|
8
|
+
'Referer' => 'http://www.google.ru/#hl=ru&newwindow=1&sa=X&ei=6oPXTp3OB4Tl4QTdl-zyDQ&ved=0CCAQvwUoAQ&q=kinopoisk&spell=1&bav=on.2,or.r_gc.r_pw.r_cp.,cf.osb&fp=c8a5ca24098b41f5&biw=1127&bih=628',
|
9
9
|
'User-Agent' => header,
|
10
10
|
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
11
|
-
'Connection' => 'keep-alive'
|
11
|
+
'Connection' => 'keep-alive',
|
12
|
+
'Cache-Control' => 'no-cache'
|
12
13
|
}
|
13
14
|
end
|
14
15
|
|
data/lib/pioneer/request.rb
CHANGED
@@ -1,59 +1,76 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
3
|
class Request
|
4
|
-
attr_reader :pioneer, :url, :result, :response, :error
|
4
|
+
attr_reader :pioneer, :url, :result, :response, :error, :counter
|
5
|
+
|
5
6
|
def initialize(url, pioneer)
|
6
|
-
@
|
7
|
-
@url
|
8
|
-
|
9
|
-
URI.escape(url)
|
10
|
-
end
|
7
|
+
@pioneer = pioneer
|
8
|
+
@url = parse_url(url)
|
9
|
+
@counter = 0
|
11
10
|
end
|
12
11
|
|
12
|
+
#
|
13
|
+
# Request processing
|
14
|
+
#
|
13
15
|
def perform
|
14
16
|
pioneer.logger.info("going to #{url}")
|
15
17
|
@result = handle_request_error_or_return_result
|
16
18
|
end
|
17
19
|
|
20
|
+
#
|
18
21
|
# Handle base fatal request error
|
22
|
+
# If we have got connection error or whatever
|
23
|
+
# we will fire either Exception or call "if_request_error" if exists
|
24
|
+
#
|
19
25
|
def handle_request_error_or_return_result
|
20
26
|
begin
|
21
27
|
@response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
|
22
28
|
rescue => e
|
23
29
|
@error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
24
|
-
pioneer.logger.fatal(error)
|
30
|
+
pioneer.logger.fatal(@error)
|
25
31
|
if pioneer.respond_to? :if_request_error
|
26
32
|
return pioneer.if_request_error(self)
|
27
33
|
else
|
28
|
-
raise HttpRequestError, @error
|
34
|
+
raise Pioneer::HttpRequestError, @error
|
29
35
|
end
|
30
36
|
end
|
31
37
|
handle_response_error_or_return_result
|
32
|
-
rescue HttpRetryRequest => e
|
38
|
+
rescue Pioneer::HttpRetryRequest => e
|
33
39
|
retry
|
34
40
|
end
|
35
41
|
|
36
|
-
#
|
42
|
+
#
|
43
|
+
# Handle http error
|
44
|
+
# If we can't get a proper response we will either fire an Exception
|
45
|
+
# or call "if_response_error" if exists
|
46
|
+
#
|
37
47
|
def handle_response_error_or_return_result
|
38
48
|
if response.error
|
39
|
-
error = "Response for #{url} get an error: #{response.error}"
|
40
|
-
pioneer.logger.error(error)
|
49
|
+
@error = "Response for #{url} get an error: #{response.error}"
|
50
|
+
pioneer.logger.error(@error)
|
41
51
|
if pioneer.respond_to? :if_response_error
|
42
52
|
return pioneer.if_response_error(self)
|
43
53
|
else
|
44
|
-
raise HttpResponseError, error
|
54
|
+
raise Pioneer::HttpResponseError, error
|
45
55
|
end
|
46
56
|
end
|
47
57
|
handle_status_or_return_result
|
48
58
|
end
|
49
59
|
|
60
|
+
#
|
61
|
+
# Handle wrong status or run "processing"
|
62
|
+
# If status is not 200 we will either do nothing (?)
|
63
|
+
# or call "if_status_XXX" if exist
|
64
|
+
# or "if_status_not_200"
|
65
|
+
#
|
50
66
|
def handle_status_or_return_result
|
51
67
|
status = response.response_header.status
|
52
68
|
case status
|
53
69
|
when 200
|
54
70
|
pioneer.processing(self)
|
55
71
|
else
|
56
|
-
|
72
|
+
@error = "This #{url} returns this http status: #{status}"
|
73
|
+
pioneer.logger.error(@error)
|
57
74
|
if pioneer.respond_to? "if_status_#{status}".to_sym
|
58
75
|
pioneer.send("if_status_#{status}", self)
|
59
76
|
elsif pioneer.respond_to? :if_status_not_200
|
@@ -64,8 +81,54 @@ module Pioneer
|
|
64
81
|
end
|
65
82
|
end
|
66
83
|
|
67
|
-
|
68
|
-
|
84
|
+
#
|
85
|
+
# We can call retry from crawler like "req.retry"
|
86
|
+
# If count is set, it will retry no more than "count" times
|
87
|
+
#
|
88
|
+
def retry(count=nil)
|
89
|
+
if count
|
90
|
+
@counter += 1
|
91
|
+
skip if @counter > count
|
92
|
+
end
|
93
|
+
raise Pioneer::HttpRetryRequest
|
94
|
+
end
|
95
|
+
|
96
|
+
#
|
97
|
+
# We can skip request from crawler like "req.skip"
|
98
|
+
# I.E. if response_body is blank or 404 error
|
99
|
+
#
|
100
|
+
def skip
|
101
|
+
raise Pioneer::HttpSkipRequest
|
102
|
+
end
|
103
|
+
|
104
|
+
#
|
105
|
+
# We should parse the url before sending the request
|
106
|
+
# We use URI.escape for escaping
|
107
|
+
# IMPORTANT: We should replace ampersands (&) in params with "%26" !!!
|
108
|
+
# Pluses (+) will be replaced with "%2B"
|
109
|
+
#
|
110
|
+
def parse_url(url)
|
111
|
+
url = "http://" + url unless url =~ /http/
|
112
|
+
url = URI.escape(url)
|
113
|
+
# replace "&" ampersands :)
|
114
|
+
url = url.gsub("&", "%26")
|
115
|
+
# replace pluses
|
116
|
+
url = url.gsub("+", "%2B")
|
117
|
+
url
|
118
|
+
end
|
119
|
+
|
120
|
+
#
|
121
|
+
# Shortcut for response.response
|
122
|
+
#
|
123
|
+
def response_body
|
124
|
+
response.response
|
125
|
+
end
|
126
|
+
|
127
|
+
#
|
128
|
+
# Shortcut for response.response_header
|
129
|
+
#
|
130
|
+
def response_header
|
131
|
+
response.response_header
|
69
132
|
end
|
70
133
|
end
|
71
134
|
end
|
data/lib/pioneer/version.rb
CHANGED
data/pioneer.gemspec
CHANGED
@@ -20,5 +20,7 @@ Gem::Specification.new do |s|
|
|
20
20
|
|
21
21
|
# specify any dependencies here; for example:
|
22
22
|
s.add_development_dependency "yajl-ruby"
|
23
|
+
s.add_development_dependency "nokogiri"
|
23
24
|
s.add_runtime_dependency "em-synchrony"
|
25
|
+
s.add_runtime_dependency "em-http-request"
|
24
26
|
end
|
@@ -18,7 +18,7 @@ describe Pioneer::Request do
|
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should redefine methods" do
|
21
|
-
processing = proc{ |req| req.
|
21
|
+
processing = proc{ |req| req.response_header.status + 1 }
|
22
22
|
@pioneer2.processing = processing
|
23
23
|
@pioneer2.locations = ["www.apple.com", "www.amazon.com"]
|
24
24
|
@pioneer2.start.must_equal [201, 201]
|
@@ -63,4 +63,26 @@ describe Pioneer::Request do
|
|
63
63
|
# and this one will fire up
|
64
64
|
(@crawler3.start.first > 10000).must_equal true
|
65
65
|
end
|
66
|
+
|
67
|
+
it "should skip url" do
|
68
|
+
@result = []
|
69
|
+
crawler = Pioneer::Crawler.new(redirects: 1)
|
70
|
+
crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
|
71
|
+
crawler.processing = proc{ |req| @result << req.url }
|
72
|
+
crawler.if_response_error = proc{ |req| req.skip }
|
73
|
+
crawler.start
|
74
|
+
@result.must_equal ["http://amazon.com"]
|
75
|
+
end
|
76
|
+
|
77
|
+
it "should retry 2 times and skip" do
|
78
|
+
@result = []
|
79
|
+
@retries = nil
|
80
|
+
crawler = Pioneer::Crawler.new(redirects: 1)
|
81
|
+
crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
|
82
|
+
crawler.processing = proc{ |req| @result << req.url }
|
83
|
+
crawler.if_response_error = proc{ |req| @retries = req.counter; req.retry(2); }
|
84
|
+
crawler.start
|
85
|
+
@result.must_equal ["http://amazon.com"]
|
86
|
+
@retries.must_equal 2
|
87
|
+
end
|
66
88
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-13 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &76352180 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,32 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *76352180
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: nokogiri
|
27
|
+
requirement: &76351940 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *76351940
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: em-synchrony
|
27
|
-
requirement: &
|
38
|
+
requirement: &76351700 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *76351700
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: em-http-request
|
49
|
+
requirement: &76351430 !ruby/object:Gem::Requirement
|
28
50
|
none: false
|
29
51
|
requirements:
|
30
52
|
- - ! '>='
|
@@ -32,7 +54,7 @@ dependencies:
|
|
32
54
|
version: '0'
|
33
55
|
type: :runtime
|
34
56
|
prerelease: false
|
35
|
-
version_requirements: *
|
57
|
+
version_requirements: *76351430
|
36
58
|
description: Simple async HTTP crawler based on em-synchrony
|
37
59
|
email:
|
38
60
|
- pedro.yanoviches@gmail.com
|