pioneer 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -13,4 +13,14 @@
13
13
  ## v0.0.4
14
14
 
15
15
  * Fixed stackoverflow on retrying request after failing
16
- * added `req.retry` method to retry http request in error handlers
16
+ * added `req.retry` method to retry http request in error handlers
17
+
18
+ ## v0.0.5
19
+
20
+ * Added some minor fixes
21
+
22
+ ## v0.0.6
23
+
24
+ * Added skip functionality
25
+ * Added some docs
26
+ * Added "response_body" and "response_header" shortcuts
@@ -4,7 +4,6 @@ require "em-synchrony/em-http"
4
4
  require "em-synchrony/fiber_iterator"
5
5
  # patch - to remove! maybe pull to em-synchrony?
6
6
  require "patch/iterator"
7
- require "patch/fiber_periodic_timer_iterator"
8
7
  # other
9
8
  require "logger"
10
9
  require 'uri'
@@ -1,13 +1,14 @@
1
1
  # encoding: utf-8
2
2
  module Pioneer
3
- class UndefinedLocations < RuntimeError; end
4
- class LocationsNotEnumerable < RuntimeError; end
5
- class UndefinedProcessing < RuntimeError; end
6
- class LocationsNotEnumerator < RuntimeError; end
7
- class HttpRequestError < RuntimeError; end
8
- class HttpResponseError < RuntimeError; end
9
- class HttpStatusError < RuntimeError; end
10
- class HttpRetryRequest < RuntimeError; end
3
+ class UndefinedLocations < StandardError; end
4
+ class LocationsNotEnumerable < StandardError; end
5
+ class UndefinedProcessing < StandardError; end
6
+ class LocationsNotEnumerator < StandardError; end
7
+ class HttpRequestError < StandardError; end
8
+ class HttpResponseError < StandardError; end
9
+ class HttpStatusError < StandardError; end
10
+ class HttpRetryRequest < StandardError; end
11
+ class HttpSkipRequest < StandardError; end
11
12
 
12
13
  class Base
13
14
  attr_reader :name, :concurrency, :sleep, :log_level, :redirect
@@ -32,14 +33,31 @@ module Pioneer
32
33
  EM.synchrony do
33
34
  # Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
34
35
  # In case @sleep is 0 it behaves like standard FiberIterator
35
- EM::Synchrony::FiberPeriodicTimerIterator.new(locations, concurrency, sleep).map do |url|
36
- result << Request.new(url, self).perform
36
+ EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
37
+ sleep
38
+ begin
39
+ result << Request.new(url, self).perform
40
+ rescue Pioneer::HttpSkipRequest => e
41
+ nil # do nothing?
42
+ end
37
43
  end
38
44
  EM.stop
39
45
  end
40
46
  result
41
47
  end
42
48
 
49
+ # Sleep if the last request was made recently (less than the timeout period ago)
50
+ def sleep
51
+ @next_start ||= Time.now
52
+ if @sleep > 0
53
+ now = Time.now
54
+ sleep_time = @next_start - Time.now
55
+ sleep_time = 0 if sleep_time < 0
56
+ @next_start = Time.now + sleep_time + @sleep
57
+ EM::Synchrony.sleep(sleep_time) if sleep_time > 0
58
+ end
59
+ end
60
+
43
61
  def logger
44
62
  @logger ||= begin
45
63
  logger = Logger.new(STDOUT)
@@ -5,10 +5,11 @@ module Pioneer
5
5
  def random
6
6
  header = headers.sample
7
7
  headers = {
8
- 'Referer' => 'http://www.google.com/',
8
+ 'Referer' => 'http://www.google.ru/#hl=ru&newwindow=1&sa=X&ei=6oPXTp3OB4Tl4QTdl-zyDQ&ved=0CCAQvwUoAQ&q=kinopoisk&spell=1&bav=on.2,or.r_gc.r_pw.r_cp.,cf.osb&fp=c8a5ca24098b41f5&biw=1127&bih=628',
9
9
  'User-Agent' => header,
10
10
  'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
11
- 'Connection' => 'keep-alive'
11
+ 'Connection' => 'keep-alive',
12
+ 'Cache-Control' => 'no-cache'
12
13
  }
13
14
  end
14
15
 
@@ -1,59 +1,76 @@
1
1
  # encoding: utf-8
2
2
  module Pioneer
3
3
  class Request
4
- attr_reader :pioneer, :url, :result, :response, :error
4
+ attr_reader :pioneer, :url, :result, :response, :error, :counter
5
+
5
6
  def initialize(url, pioneer)
6
- @url, @pioneer = url, pioneer
7
- @url = begin
8
- url = "http://" + url unless url =~ /http/
9
- URI.escape(url)
10
- end
7
+ @pioneer = pioneer
8
+ @url = parse_url(url)
9
+ @counter = 0
11
10
  end
12
11
 
12
+ #
13
+ # Request processing
14
+ #
13
15
  def perform
14
16
  pioneer.logger.info("going to #{url}")
15
17
  @result = handle_request_error_or_return_result
16
18
  end
17
19
 
20
+ #
18
21
  # Handle base fatal request error
22
+ # If we have got connection error or whatever
23
+ # we will either raise an Exception or call "if_request_error" if it exists
24
+ #
19
25
  def handle_request_error_or_return_result
20
26
  begin
21
27
  @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
22
28
  rescue => e
23
29
  @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
24
- pioneer.logger.fatal(error)
30
+ pioneer.logger.fatal(@error)
25
31
  if pioneer.respond_to? :if_request_error
26
32
  return pioneer.if_request_error(self)
27
33
  else
28
- raise HttpRequestError, @error
34
+ raise Pioneer::HttpRequestError, @error
29
35
  end
30
36
  end
31
37
  handle_response_error_or_return_result
32
- rescue HttpRetryRequest => e
38
+ rescue Pioneer::HttpRetryRequest => e
33
39
  retry
34
40
  end
35
41
 
36
- # handle http error
42
+ #
43
+ # Handle http error
44
+ # If we can't get a proper response we will either raise an Exception
45
+ # or call "if_response_error" if exists
46
+ #
37
47
  def handle_response_error_or_return_result
38
48
  if response.error
39
- error = "Response for #{url} get an error: #{response.error}"
40
- pioneer.logger.error(error)
49
+ @error = "Response for #{url} get an error: #{response.error}"
50
+ pioneer.logger.error(@error)
41
51
  if pioneer.respond_to? :if_response_error
42
52
  return pioneer.if_response_error(self)
43
53
  else
44
- raise HttpResponseError, error
54
+ raise Pioneer::HttpResponseError, error
45
55
  end
46
56
  end
47
57
  handle_status_or_return_result
48
58
  end
49
59
 
60
+ #
61
+ # Handle wrong status or run "processing"
62
+ # If status is not 200 we will either do nothing (?)
63
+ # or call "if_status_XXX" if exist
64
+ # or "if_status_not_200"
65
+ #
50
66
  def handle_status_or_return_result
51
67
  status = response.response_header.status
52
68
  case status
53
69
  when 200
54
70
  pioneer.processing(self)
55
71
  else
56
- pioneer.logger.error("This #{url} returns this http status: #{status}")
72
+ @error = "This #{url} returns this http status: #{status}"
73
+ pioneer.logger.error(@error)
57
74
  if pioneer.respond_to? "if_status_#{status}".to_sym
58
75
  pioneer.send("if_status_#{status}", self)
59
76
  elsif pioneer.respond_to? :if_status_not_200
@@ -64,8 +81,54 @@ module Pioneer
64
81
  end
65
82
  end
66
83
 
67
- def retry
68
- raise HttpRetryRequest
84
+ #
85
+ # We can call retry from crawler like "req.retry"
86
+ # If count is set, it will retry no more than "count" times
87
+ #
88
+ def retry(count=nil)
89
+ if count
90
+ @counter += 1
91
+ skip if @counter > count
92
+ end
93
+ raise Pioneer::HttpRetryRequest
94
+ end
95
+
96
+ #
97
+ # We can skip request from crawler like "req.skip"
98
+ # I.E. if response_body is blank or 404 error
99
+ #
100
+ def skip
101
+ raise Pioneer::HttpSkipRequest
102
+ end
103
+
104
+ #
105
+ # We should parse the url before sending the request
106
+ # We use URI.escape for escaping
107
+ # IMPORTANT: We should replace ampersand (&) in params with "&amp;" !!!
108
+ # Pluses (+) will be replaced with "%2B"
109
+ #
110
+ def parse_url(url)
111
+ url = "http://" + url unless url =~ /http/
112
+ url = URI.escape(url)
113
+ # replace "&" ampersands :)
114
+ url = url.gsub("&amp;", "%26")
115
+ # replace pluses
116
+ url = url.gsub("+", "%2B")
117
+ url
118
+ end
119
+
120
+ #
121
+ # Shortcut for response.response
122
+ #
123
+ def response_body
124
+ response.response
125
+ end
126
+
127
+ #
128
+ # Shortcut for response.response_header
129
+ #
130
+ def response_header
131
+ response.response_header
69
132
  end
70
133
  end
71
134
  end
@@ -1,3 +1,3 @@
1
1
  module Pioneer
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -20,5 +20,7 @@ Gem::Specification.new do |s|
20
20
 
21
21
  # specify any dependencies here; for example:
22
22
  s.add_development_dependency "yajl-ruby"
23
+ s.add_development_dependency "nokogiri"
23
24
  s.add_runtime_dependency "em-synchrony"
25
+ s.add_runtime_dependency "em-http-request"
24
26
  end
@@ -18,7 +18,7 @@ describe Pioneer::Request do
18
18
  end
19
19
 
20
20
  it "should redefine methods" do
21
- processing = proc{ |req| req.response.response_header.status + 1 }
21
+ processing = proc{ |req| req.response_header.status + 1 }
22
22
  @pioneer2.processing = processing
23
23
  @pioneer2.locations = ["www.apple.com", "www.amazon.com"]
24
24
  @pioneer2.start.must_equal [201, 201]
@@ -63,4 +63,26 @@ describe Pioneer::Request do
63
63
  # and this one will fire up
64
64
  (@crawler3.start.first > 10000).must_equal true
65
65
  end
66
+
67
+ it "should skip url" do
68
+ @result = []
69
+ crawler = Pioneer::Crawler.new(redirects: 1)
70
+ crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
71
+ crawler.processing = proc{ |req| @result << req.url }
72
+ crawler.if_response_error = proc{ |req| req.skip }
73
+ crawler.start
74
+ @result.must_equal ["http://amazon.com"]
75
+ end
76
+
77
+ it "should retry 2 times and skip" do
78
+ @result = []
79
+ @retries = nil
80
+ crawler = Pioneer::Crawler.new(redirects: 1)
81
+ crawler.locations = ["http://not.exist.page.com", "http://amazon.com"]
82
+ crawler.processing = proc{ |req| @result << req.url }
83
+ crawler.if_response_error = proc{ |req| @retries = req.counter; req.retry(2); }
84
+ crawler.start
85
+ @result.must_equal ["http://amazon.com"]
86
+ @retries.must_equal 2
87
+ end
66
88
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pioneer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-29 00:00:00.000000000Z
12
+ date: 2012-03-13 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yajl-ruby
16
- requirement: &73303650 !ruby/object:Gem::Requirement
16
+ requirement: &76352180 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,32 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *73303650
24
+ version_requirements: *76352180
25
+ - !ruby/object:Gem::Dependency
26
+ name: nokogiri
27
+ requirement: &76351940 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *76351940
25
36
  - !ruby/object:Gem::Dependency
26
37
  name: em-synchrony
27
- requirement: &73303440 !ruby/object:Gem::Requirement
38
+ requirement: &76351700 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *76351700
47
+ - !ruby/object:Gem::Dependency
48
+ name: em-http-request
49
+ requirement: &76351430 !ruby/object:Gem::Requirement
28
50
  none: false
29
51
  requirements:
30
52
  - - ! '>='
@@ -32,7 +54,7 @@ dependencies:
32
54
  version: '0'
33
55
  type: :runtime
34
56
  prerelease: false
35
- version_requirements: *73303440
57
+ version_requirements: *76351430
36
58
  description: Simple async HTTP crawler based on em-synchrony
37
59
  email:
38
60
  - pedro.yanoviches@gmail.com