pioneer 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +11 -1
- data/lib/pioneer.rb +0 -1
- data/lib/pioneer/base.rb +28 -10
- data/lib/pioneer/http_header.rb +3 -2
- data/lib/pioneer/request.rb +79 -16
- data/lib/pioneer/version.rb +1 -1
- data/pioneer.gemspec +2 -0
- data/spec/pioneer/request_spec.rb +23 -1
- metadata +28 -6
data/CHANGELOG
CHANGED
@@ -13,4 +13,14 @@
|
|
13
13
|
## v0.0.4
|
14
14
|
|
15
15
|
* Fixed stackoverflow on retrying request after failing
|
16
|
-
* added `req.retry` method to retry http request in error handlers
|
16
|
+
* added `req.retry` method to retry http request in error handlers
|
17
|
+
|
18
|
+
## v0.0.5
|
19
|
+
|
20
|
+
* Added some minor fixes
|
21
|
+
|
22
|
+
## v0.0.6
|
23
|
+
|
24
|
+
* Added skip functionality
|
25
|
+
* Added some docs
|
26
|
+
* Added "response_body" and "response_header" shortcuts
|
data/lib/pioneer.rb
CHANGED
data/lib/pioneer/base.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
|
-
class UndefinedLocations <
|
4
|
-
class LocationsNotEnumerable <
|
5
|
-
class UndefinedProcessing <
|
6
|
-
class LocationsNotEnumerator <
|
7
|
-
class HttpRequestError <
|
8
|
-
class HttpResponseError <
|
9
|
-
class HttpStatusError <
|
10
|
-
class HttpRetryRequest <
|
3
|
+
class UndefinedLocations < StandardError; end
|
4
|
+
class LocationsNotEnumerable < StandardError; end
|
5
|
+
class UndefinedProcessing < StandardError; end
|
6
|
+
class LocationsNotEnumerator < StandardError; end
|
7
|
+
class HttpRequestError < StandardError; end
|
8
|
+
class HttpResponseError < StandardError; end
|
9
|
+
class HttpStatusError < StandardError; end
|
10
|
+
class HttpRetryRequest < StandardError; end
|
11
|
+
class HttpSkipRequest < StandardError; end
|
11
12
|
|
12
13
|
class Base
|
13
14
|
attr_reader :name, :concurrency, :sleep, :log_level, :redirect
|
@@ -32,14 +33,31 @@ module Pioneer
|
|
32
33
|
EM.synchrony do
|
33
34
|
# Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
|
34
35
|
# In case @sleep is 0 it behaves like the standard FiberIterator
|
35
|
-
EM::Synchrony::
|
36
|
-
|
36
|
+
EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
|
37
|
+
sleep
|
38
|
+
begin
|
39
|
+
result << Request.new(url, self).perform
|
40
|
+
rescue Pioneer::HttpSkipRequest => e
|
41
|
+
nil # do nothing?
|
42
|
+
end
|
37
43
|
end
|
38
44
|
EM.stop
|
39
45
|
end
|
40
46
|
result
|
41
47
|
end
|
42
48
|
|
49
|
+
# Sleep if the last request was made recently (less than the timeout period ago)
|
50
|
+
def sleep
|
51
|
+
@next_start ||= Time.now
|
52
|
+
if @sleep > 0
|
53
|
+
now = Time.now
|
54
|
+
sleep_time = @next_start - Time.now
|
55
|
+
sleep_time = 0 if sleep_time < 0
|
56
|
+
@next_start = Time.now + sleep_time + @sleep
|
57
|
+
EM::Synchrony.sleep(sleep_time) if sleep_time > 0
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
43
61
|
def logger
|
44
62
|
@logger ||= begin
|
45
63
|
logger = Logger.new(STDOUT)
|
data/lib/pioneer/http_header.rb
CHANGED
@@ -5,10 +5,11 @@ module Pioneer
|
|
5
5
|
def random
|
6
6
|
header = headers.sample
|
7
7
|
headers = {
|
8
|
-
'Referer' => 'http://www.google.
|
8
|
+
'Referer' => 'http://www.google.ru/#hl=ru&newwindow=1&sa=X&ei=6oPXTp3OB4Tl4QTdl-zyDQ&ved=0CCAQvwUoAQ&q=kinopoisk&spell=1&bav=on.2,or.r_gc.r_pw.r_cp.,cf.osb&fp=c8a5ca24098b41f5&biw=1127&bih=628',
|
9
9
|
'User-Agent' => header,
|
10
10
|
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
11
|
-
'Connection' => 'keep-alive'
|
11
|
+
'Connection' => 'keep-alive',
|
12
|
+
'Cache-Control' => 'no-cache'
|
12
13
|
}
|
13
14
|
end
|
14
15
|
|
data/lib/pioneer/request.rb
CHANGED
@@ -1,59 +1,76 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
3
|
class Request
|
4
|
-
attr_reader :pioneer, :url, :result, :response, :error
|
4
|
+
attr_reader :pioneer, :url, :result, :response, :error, :counter
|
5
|
+
|
5
6
|
def initialize(url, pioneer)
|
6
|
-
@
|
7
|
-
@url
|
8
|
-
|
9
|
-
URI.escape(url)
|
10
|
-
end
|
7
|
+
@pioneer = pioneer
|
8
|
+
@url = parse_url(url)
|
9
|
+
@counter = 0
|
11
10
|
end
|
12
11
|
|
12
|
+
#
|
13
|
+
# Request processing
|
14
|
+
#
|
13
15
|
def perform
|
14
16
|
pioneer.logger.info("going to #{url}")
|
15
17
|
@result = handle_request_error_or_return_result
|
16
18
|
end
|
17
19
|
|
20
|
+
#
|
18
21
|
# Handle base fatal request error
|
22
|
+
# If we have got connection error or whatever
|
23
|
+
# we will fire either Exception or call "if_request_error" if exists
|
24
|
+
#
|
19
25
|
def handle_request_error_or_return_result
|
20
26
|
begin
|
21
27
|
@response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
|
22
28
|
rescue => e
|
23
29
|
@error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
24
|
-
pioneer.logger.fatal(error)
|
30
|
+
pioneer.logger.fatal(@error)
|
25
31
|
if pioneer.respond_to? :if_request_error
|
26
32
|
return pioneer.if_request_error(self)
|
27
33
|
else
|
28
|
-
raise HttpRequestError, @error
|
34
|
+
raise Pioneer::HttpRequestError, @error
|
29
35
|
end
|
30
36
|
end
|
31
37
|
handle_response_error_or_return_result
|
32
|
-
rescue HttpRetryRequest => e
|
38
|
+
rescue Pioneer::HttpRetryRequest => e
|
33
39
|
retry
|
34
40
|
end
|
35
41
|
|
36
|
-
#
|
42
|
+
#
|
43
|
+
# Handle http error
|
44
|
+
# If we can't get a proper response we will either raise an exception
|
45
|
+
# or call "if_response_error" if exists
|
46
|
+
#
|
37
47
|
def handle_response_error_or_return_result
|
38
48
|
if response.error
|
39
|
-
error = "Response for #{url} get an error: #{response.error}"
|
40
|
-
pioneer.logger.error(error)
|
49
|
+
@error = "Response for #{url} get an error: #{response.error}"
|
50
|
+
pioneer.logger.error(@error)
|
41
51
|
if pioneer.respond_to? :if_response_error
|
42
52
|
return pioneer.if_response_error(self)
|
43
53
|
else
|
44
|
-
raise HttpResponseError, error
|
54
|
+
raise Pioneer::HttpResponseError, error
|
45
55
|
end
|
46
56
|
end
|
47
57
|
handle_status_or_return_result
|
48
58
|
end
|
49
59
|
|
60
|
+
#
|
61
|
+
# Handle wrong status or run "processing"
|
62
|
+
# If status is not 200 we will either do nothing (?)
|
63
|
+
# or call "if_status_XXX" if exist
|
64
|
+
# or "if_status_not_200"
|
65
|
+
#
|
50
66
|
def handle_status_or_return_result
|
51
67
|
status = response.response_header.status
|
52
68
|
case status
|
53
69
|
when 200
|
54
70
|
pioneer.processing(self)
|
55
71
|
else
|
56
|
-
|
72
|
+
@error = "This #{url} returns this http status: #{status}"
|
73
|
+
pioneer.logger.error(@error)
|
57
74
|
if pioneer.respond_to? "if_status_#{status}".to_sym
|
58
75
|
pioneer.send("if_status_#{status}", self)
|
59
76
|
elsif pioneer.respond_to? :if_status_not_200
|
@@ -64,8 +81,54 @@ module Pioneer
|
|
64
81
|
end
|
65
82
|
end
|
66
83
|
|
67
|
-
|
68
|
-
|
84
|
+
#
|
85
|
+
# We can call retry from crawler like "req.retry"
|
86
|
+
# If count is set, it will retry no more than "count" times
|
87
|
+
#
|
88
|
+
def retry(count=nil)
|
89
|
+
if count
|
90
|
+
@counter += 1
|
91
|
+
skip if @counter > count
|
92
|
+
end
|
93
|
+
raise Pioneer::HttpRetryRequest
|
94
|
+
end
|
95
|
+
|
96
|
+
#
|
97
|
+
# We can skip request from crawler like "req.skip"
|
98
|
+
# I.E. if response_body is blank or 404 error
|
99
|
+
#
|
100
|
+
def skip
|
101
|
+
raise Pioneer::HttpSkipRequest
|
102
|
+
end
|
103
|
+
|
104
|
+
#
|
105
|
+
# We should parse the url before sending the request
|
106
|
+
# We use URI.escape for escaping
|
107
|
+
# IMPORTANT: We should replace ampersands (&) in params with "%26" !!!
|
108
|
+
# Pluses (+) will be replaced with "%2B"
|
109
|
+
#
|
110
|
+
def parse_url(url)
|
111
|
+
url = "http://" + url unless url =~ /http/
|
112
|
+
url = URI.escape(url)
|
113
|
+
# replace "&" ampersands :)
|
114
|
+
url = url.gsub("&", "%26")
|
115
|
+
# replace pluses
|
116
|
+
url = url.gsub("+", "%2B")
|
117
|
+
url
|
118
|
+
end
|
119
|
+
|
120
|
+
#
|
121
|
+
# Shortcut for response.response
|
122
|
+
#
|
123
|
+
def response_body
|
124
|
+
response.response
|
125
|
+
end
|
126
|
+
|
127
|
+
#
|
128
|
+
# Shortcut for response.response_header
|
129
|
+
#
|
130
|
+
def response_header
|
131
|
+
response.response_header
|
69
132
|
end
|
70
133
|
end
|
71
134
|
end
|
data/lib/pioneer/version.rb
CHANGED
data/pioneer.gemspec
CHANGED
@@ -20,5 +20,7 @@ Gem::Specification.new do |s|
|
|
20
20
|
|
21
21
|
# specify any dependencies here; for example:
|
22
22
|
s.add_development_dependency "yajl-ruby"
|
23
|
+
s.add_development_dependency "nokogiri"
|
23
24
|
s.add_runtime_dependency "em-synchrony"
|
25
|
+
s.add_runtime_dependency "em-http-request"
|
24
26
|
end
|
@@ -18,7 +18,7 @@ describe Pioneer::Request do
|
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should redefine methods" do
|
21
|
-
processing = proc{ |req| req.
|
21
|
+
processing = proc{ |req| req.response_header.status + 1 }
|
22
22
|
@pioneer2.processing = processing
|
23
23
|
@pioneer2.locations = ["www.apple.com", "www.amazon.com"]
|
24
24
|
@pioneer2.start.must_equal [201, 201]
|
@@ -63,4 +63,26 @@ describe Pioneer::Request do
|
|
63
63
|
# and this one will fire up
|
64
64
|
(@crawler3.start.first > 10000).must_equal true
|
65
65
|
end
|
66
|
+
|
67
|
+
it "should skip url" do
|
68
|
+
@result = []
|
69
|
+
crawler = Pioneer::Crawler.new(redirects: 1)
|
70
|
+
crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
|
71
|
+
crawler.processing = proc{ |req| @result << req.url }
|
72
|
+
crawler.if_response_error = proc{ |req| req.skip }
|
73
|
+
crawler.start
|
74
|
+
@result.must_equal ["http://amazon.com"]
|
75
|
+
end
|
76
|
+
|
77
|
+
it "should retry 2 times and skip" do
|
78
|
+
@result = []
|
79
|
+
@retries = nil
|
80
|
+
crawler = Pioneer::Crawler.new(redirects: 1)
|
81
|
+
crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
|
82
|
+
crawler.processing = proc{ |req| @result << req.url }
|
83
|
+
crawler.if_response_error = proc{ |req| @retries = req.counter; req.retry(2); }
|
84
|
+
crawler.start
|
85
|
+
@result.must_equal ["http://amazon.com"]
|
86
|
+
@retries.must_equal 2
|
87
|
+
end
|
66
88
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-13 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &76352180 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,32 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *76352180
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: nokogiri
|
27
|
+
requirement: &76351940 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *76351940
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: em-synchrony
|
27
|
-
requirement: &
|
38
|
+
requirement: &76351700 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *76351700
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: em-http-request
|
49
|
+
requirement: &76351430 !ruby/object:Gem::Requirement
|
28
50
|
none: false
|
29
51
|
requirements:
|
30
52
|
- - ! '>='
|
@@ -32,7 +54,7 @@ dependencies:
|
|
32
54
|
version: '0'
|
33
55
|
type: :runtime
|
34
56
|
prerelease: false
|
35
|
-
version_requirements: *
|
57
|
+
version_requirements: *76351430
|
36
58
|
description: Simple async HTTP crawler based on em-synchrony
|
37
59
|
email:
|
38
60
|
- pedro.yanoviches@gmail.com
|