pioneer 0.0.4 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -13,4 +13,14 @@
13
13
  ## v0.0.4
14
14
 
15
15
  * Fixed stackoverflow on retrying request after failing
16
- * added `req.retry` method to retry http request in error handlers
16
+ * added `req.retry` method to retry http request in error handlers
17
+
18
+ ## v0.0.5
19
+
20
+ * Added some minor fixes
21
+
22
+ ## v0.0.6
23
+
24
+ * Added skip functionality
25
+ * Added some docs
26
+ * Added "response_body" and "response_header" shortcuts
@@ -4,7 +4,6 @@ require "em-synchrony/em-http"
4
4
  require "em-synchrony/fiber_iterator"
5
5
  # patch - to remove! maybe pull to em-synchrony?
6
6
  require "patch/iterator"
7
- require "patch/fiber_periodic_timer_iterator"
8
7
  # other
9
8
  require "logger"
10
9
  require 'uri'
@@ -1,13 +1,14 @@
1
1
  # encoding: utf-8
2
2
  module Pioneer
3
- class UndefinedLocations < RuntimeError; end
4
- class LocationsNotEnumerable < RuntimeError; end
5
- class UndefinedProcessing < RuntimeError; end
6
- class LocationsNotEnumerator < RuntimeError; end
7
- class HttpRequestError < RuntimeError; end
8
- class HttpResponseError < RuntimeError; end
9
- class HttpStatusError < RuntimeError; end
10
- class HttpRetryRequest < RuntimeError; end
3
+ class UndefinedLocations < StandardError; end
4
+ class LocationsNotEnumerable < StandardError; end
5
+ class UndefinedProcessing < StandardError; end
6
+ class LocationsNotEnumerator < StandardError; end
7
+ class HttpRequestError < StandardError; end
8
+ class HttpResponseError < StandardError; end
9
+ class HttpStatusError < StandardError; end
10
+ class HttpRetryRequest < StandardError; end
11
+ class HttpSkipRequest < StandardError; end
11
12
 
12
13
  class Base
13
14
  attr_reader :name, :concurrency, :sleep, :log_level, :redirect
@@ -32,14 +33,31 @@ module Pioneer
32
33
  EM.synchrony do
33
34
  # Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
34
35
  # In case @sleep is 0 it behaves like standard FiberIterator
35
- EM::Synchrony::FiberPeriodicTimerIterator.new(locations, concurrency, sleep).map do |url|
36
- result << Request.new(url, self).perform
36
+ EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
37
+ sleep
38
+ begin
39
+ result << Request.new(url, self).perform
40
+ rescue Pioneer::HttpSkipRequest => e
41
+ nil # do nothing?
42
+ end
37
43
  end
38
44
  EM.stop
39
45
  end
40
46
  result
41
47
  end
42
48
 
49
+ # Sleep if the last request was made recently (less than the timeout period ago)
50
+ def sleep
51
+ @next_start ||= Time.now
52
+ if @sleep > 0
53
+ now = Time.now
54
+ sleep_time = @next_start - Time.now
55
+ sleep_time = 0 if sleep_time < 0
56
+ @next_start = Time.now + sleep_time + @sleep
57
+ EM::Synchrony.sleep(sleep_time) if sleep_time > 0
58
+ end
59
+ end
60
+
43
61
  def logger
44
62
  @logger ||= begin
45
63
  logger = Logger.new(STDOUT)
@@ -5,10 +5,11 @@ module Pioneer
5
5
  def random
6
6
  header = headers.sample
7
7
  headers = {
8
- 'Referer' => 'http://www.google.com/',
8
+ 'Referer' => 'http://www.google.ru/#hl=ru&newwindow=1&sa=X&ei=6oPXTp3OB4Tl4QTdl-zyDQ&ved=0CCAQvwUoAQ&q=kinopoisk&spell=1&bav=on.2,or.r_gc.r_pw.r_cp.,cf.osb&fp=c8a5ca24098b41f5&biw=1127&bih=628',
9
9
  'User-Agent' => header,
10
10
  'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11
- 'Connection' => 'keep-alive'
11
+ 'Connection' => 'keep-alive',
12
+ 'Cache-Control' => 'no-cache'
12
13
  }
13
14
  end
14
15
 
@@ -1,59 +1,76 @@
1
1
  # encoding: utf-8
2
2
  module Pioneer
3
3
  class Request
4
- attr_reader :pioneer, :url, :result, :response, :error
4
+ attr_reader :pioneer, :url, :result, :response, :error, :counter
5
+
5
6
  def initialize(url, pioneer)
6
- @url, @pioneer = url, pioneer
7
- @url = begin
8
- url = "http://" + url unless url =~ /http/
9
- URI.escape(url)
10
- end
7
+ @pioneer = pioneer
8
+ @url = parse_url(url)
9
+ @counter = 0
11
10
  end
12
11
 
12
+ #
13
+ # Request processing
14
+ #
13
15
  def perform
14
16
  pioneer.logger.info("going to #{url}")
15
17
  @result = handle_request_error_or_return_result
16
18
  end
17
19
 
20
+ #
18
21
  # Handle base fatal request error
22
+ # If we have got connection error or whatever
23
+ # we will fire either Exception or call "if_request_error" if exists
24
+ #
19
25
  def handle_request_error_or_return_result
20
26
  begin
21
27
  @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
22
28
  rescue => e
23
29
  @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
24
- pioneer.logger.fatal(error)
30
+ pioneer.logger.fatal(@error)
25
31
  if pioneer.respond_to? :if_request_error
26
32
  return pioneer.if_request_error(self)
27
33
  else
28
- raise HttpRequestError, @error
34
+ raise Pioneer::HttpRequestError, @error
29
35
  end
30
36
  end
31
37
  handle_response_error_or_return_result
32
- rescue HttpRetryRequest => e
38
+ rescue Pioneer::HttpRetryRequest => e
33
39
  retry
34
40
  end
35
41
 
36
- # handle http error
42
+ #
43
+ # Handle http error
44
+ # If we can't make proper response we will either fire Exception
45
+ # or call "if_response_error" if exists
46
+ #
37
47
  def handle_response_error_or_return_result
38
48
  if response.error
39
- error = "Response for #{url} get an error: #{response.error}"
40
- pioneer.logger.error(error)
49
+ @error = "Response for #{url} get an error: #{response.error}"
50
+ pioneer.logger.error(@error)
41
51
  if pioneer.respond_to? :if_response_error
42
52
  return pioneer.if_response_error(self)
43
53
  else
44
- raise HttpResponseError, error
54
+ raise Pioneer::HttpResponseError, error
45
55
  end
46
56
  end
47
57
  handle_status_or_return_result
48
58
  end
49
59
 
60
+ #
61
+ # Handle wrong status or run "processing"
62
+ # If status is not 200 we will either do nothing (?)
63
+ # or call "if_status_XXX" if exist
64
+ # or "if_status_not_200"
65
+ #
50
66
  def handle_status_or_return_result
51
67
  status = response.response_header.status
52
68
  case status
53
69
  when 200
54
70
  pioneer.processing(self)
55
71
  else
56
- pioneer.logger.error("This #{url} returns this http status: #{status}")
72
+ @error = "This #{url} returns this http status: #{status}"
73
+ pioneer.logger.error(@error)
57
74
  if pioneer.respond_to? "if_status_#{status}".to_sym
58
75
  pioneer.send("if_status_#{status}", self)
59
76
  elsif pioneer.respond_to? :if_status_not_200
@@ -64,8 +81,54 @@ module Pioneer
64
81
  end
65
82
  end
66
83
 
67
- def retry
68
- raise HttpRetryRequest
84
+ #
85
+ # We can call retry from crawler like "req.retry"
86
+ # If count is set, it will retry no more than "count" times
87
+ #
88
+ def retry(count=nil)
89
+ if count
90
+ @counter += 1
91
+ skip if @counter > count
92
+ end
93
+ raise Pioneer::HttpRetryRequest
94
+ end
95
+
96
+ #
97
+ # We can skip request from crawler like "req.skip"
98
+ # I.E. if response_body is blank or 404 error
99
+ #
100
+ def skip
101
+ raise Pioneer::HttpSkipRequest
102
+ end
103
+
104
+ #
105
+ # We should parse url before sending request
106
+ # We use URI.escape for escaping
107
+ # IMPORTANT: We should replace ampersand (&) in params with "&amp;" !!!
108
+ # Pluses (+) will be replaced with "%2B"
109
+ #
110
+ def parse_url(url)
111
+ url = "http://" + url unless url =~ /http/
112
+ url = URI.escape(url)
113
+ # replace "&" ampersands :)
114
+ url = url.gsub("&amp;", "%26")
115
+ # replace pluses
116
+ url = url.gsub("+", "%2B")
117
+ url
118
+ end
119
+
120
+ #
121
+ # Shortcut for response.response
122
+ #
123
+ def response_body
124
+ response.response
125
+ end
126
+
127
+ #
128
+ # Shortcut for response.response_header
129
+ #
130
+ def response_header
131
+ response.response_header
69
132
  end
70
133
  end
71
134
  end
@@ -1,3 +1,3 @@
1
1
  module Pioneer
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -20,5 +20,7 @@ Gem::Specification.new do |s|
20
20
 
21
21
  # specify any dependencies here; for example:
22
22
  s.add_development_dependency "yajl-ruby"
23
+ s.add_development_dependency "nokogiri"
23
24
  s.add_runtime_dependency "em-synchrony"
25
+ s.add_runtime_dependency "em-http-request"
24
26
  end
@@ -18,7 +18,7 @@ describe Pioneer::Request do
18
18
  end
19
19
 
20
20
  it "should redefine methods" do
21
- processing = proc{ |req| req.response.response_header.status + 1 }
21
+ processing = proc{ |req| req.response_header.status + 1 }
22
22
  @pioneer2.processing = processing
23
23
  @pioneer2.locations = ["www.apple.com", "www.amazon.com"]
24
24
  @pioneer2.start.must_equal [201, 201]
@@ -63,4 +63,26 @@ describe Pioneer::Request do
63
63
  # and this one will fire up
64
64
  (@crawler3.start.first > 10000).must_equal true
65
65
  end
66
+
67
+ it "should skip url" do
68
+ @result = []
69
+ crawler = Pioneer::Crawler.new(redirects: 1)
70
+ crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
71
+ crawler.processing = proc{ |req| @result << req.url }
72
+ crawler.if_response_error = proc{ |req| req.skip }
73
+ crawler.start
74
+ @result.must_equal ["http://amazon.com"]
75
+ end
76
+
77
+ it "should retry 2 times and skip" do
78
+ @result = []
79
+ @retries = nil
80
+ crawler = Pioneer::Crawler.new(redirects: 1)
81
+ crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
82
+ crawler.processing = proc{ |req| @result << req.url }
83
+ crawler.if_response_error = proc{ |req| @retries = req.counter; req.retry(2); }
84
+ crawler.start
85
+ @result.must_equal ["http://amazon.com"]
86
+ @retries.must_equal 2
87
+ end
66
88
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pioneer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-29 00:00:00.000000000Z
12
+ date: 2012-03-13 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yajl-ruby
16
- requirement: &73303650 !ruby/object:Gem::Requirement
16
+ requirement: &76352180 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,32 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *73303650
24
+ version_requirements: *76352180
25
+ - !ruby/object:Gem::Dependency
26
+ name: nokogiri
27
+ requirement: &76351940 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *76351940
25
36
  - !ruby/object:Gem::Dependency
26
37
  name: em-synchrony
27
- requirement: &73303440 !ruby/object:Gem::Requirement
38
+ requirement: &76351700 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *76351700
47
+ - !ruby/object:Gem::Dependency
48
+ name: em-http-request
49
+ requirement: &76351430 !ruby/object:Gem::Requirement
28
50
  none: false
29
51
  requirements:
30
52
  - - ! '>='
@@ -32,7 +54,7 @@ dependencies:
32
54
  version: '0'
33
55
  type: :runtime
34
56
  prerelease: false
35
- version_requirements: *73303440
57
+ version_requirements: *76351430
36
58
  description: Simple async HTTP crawler based on em-synchrony
37
59
  email:
38
60
  - pedro.yanoviches@gmail.com