pioneer 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -23,4 +23,9 @@
23
23
 
24
24
  * Added skip functionality
25
25
  * Added some docs
26
- * Added "response_body" and "response_header" shortcuts
26
+ * Added "response_body" and "response_header" shortcuts
27
+
28
+ ## v0.0.7
29
+
30
+ * Rescuing of the Retry exception is moved to the Base class, so it is triggered in the context of the main loop (it will be executed with the global sleep timeout)
31
+ * Added `headers` callback support
@@ -16,7 +16,7 @@ module Pioneer
16
16
  def initialize(opts = {})
17
17
  raise UndefinedLocations, "you should specify `locations` method in your `self.class`" unless self.methods.include? :locations
18
18
  raise UndefinedProcessing, "you should specify `processing` method in your `self.class`" unless self.methods.include? :processing
19
- raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
19
+ # raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
20
20
  @name = opts[:name] || "crawler"
21
21
  @concurrency = opts[:concurrency] || 10
22
22
  @sleep = opts[:sleep] || 0 # sleep is reversed RPS (1/RPS) - frequency of requests.
@@ -25,18 +25,24 @@ module Pioneer
25
25
  @random_header = opts[:random_header] || false
26
26
  @header = opts[:header] || nil
27
27
  @redirects = opts[:redirects] || nil
28
+ @headers = opts[:headers] #|| nil
28
29
  end
29
30
 
31
+ #
32
+ # Main method: starting crawling through locations
33
+ #
30
34
  def start
31
- raise LocationsNotEnumerable, "location should respond to `each`" unless locations.respond_to? :each
32
35
  result = []
33
36
  EM.synchrony do
34
- # Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
35
- # In case @sleep is 0 it behaves like standart FiberIterator
36
37
  EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
37
- sleep
38
+ counter = 0
38
39
  begin
39
- result << Request.new(url, self).perform
40
+ sleep
41
+ result << Request.new(url, self, counter).perform
42
+ rescue Pioneer::HttpRetryRequest => e
43
+ # return to our loop
44
+ counter += 1
45
+ retry
40
46
  rescue Pioneer::HttpSkipRequest => e
41
47
  nil # do nothing?
42
48
  end
@@ -46,7 +52,9 @@ module Pioneer
46
52
  result
47
53
  end
48
54
 
55
+ #
49
56
  # Sleep if the last request was made recently (less than the timeout period ago)
57
+ #
50
58
  def sleep
51
59
  @next_start ||= Time.now
52
60
  if @sleep > 0
@@ -58,6 +66,9 @@ module Pioneer
58
66
  end
59
67
  end
60
68
 
69
+ #
70
+ # Default Pioneer logger
71
+ #
61
72
  def logger
62
73
  @logger ||= begin
63
74
  logger = Logger.new(STDOUT)
@@ -66,6 +77,9 @@ module Pioneer
66
77
  end
67
78
  end
68
79
 
80
+ #
81
+ # Set headers, such as redirects, cookies etc
82
+ #
69
83
  def http_opts
70
84
  opts = {}
71
85
  opts[:head] = random_header if @random_header
@@ -74,11 +88,23 @@ module Pioneer
74
88
  opts
75
89
  end
76
90
 
91
+ #
92
+ # Generate random header for request
93
+ #
77
94
  def random_header
78
95
  HttpHeader.random
79
96
  end
80
97
 
98
+ #
99
+ # Headers callback
100
+ #
101
+ def headers
102
+ @headers
103
+ end
104
+
105
+ #
81
106
  # we should override only our methods: locations, processing, if_XXX
107
+ #
82
108
  def method_missing(method_name, *args, &block)
83
109
  case method_name
84
110
  when /locations.*=|processing.*=|if_.+=/
@@ -89,6 +115,9 @@ module Pioneer
89
115
  end
90
116
  end
91
117
 
118
+ #
119
+ # Overriding methods as singletons so they are available only for the current instance of the crawler
120
+ #
92
121
  def override_method(method_name, arg)
93
122
  if Proc === arg
94
123
  self.define_singleton_method method_name do |req|
@@ -3,10 +3,10 @@ module Pioneer
3
3
  class Request
4
4
  attr_reader :pioneer, :url, :result, :response, :error, :counter
5
5
 
6
- def initialize(url, pioneer)
6
+ def initialize(url, pioneer, counter=0)
7
7
  @pioneer = pioneer
8
8
  @url = parse_url(url)
9
- @counter = 0
9
+ @counter = counter
10
10
  end
11
11
 
12
12
  #
@@ -24,7 +24,13 @@ module Pioneer
24
24
  #
25
25
  def handle_request_error_or_return_result
26
26
  begin
27
- @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
27
+ req = EM::HttpRequest.new(url).aget pioneer.http_opts
28
+ if pioneer.headers
29
+ req.headers{
30
+ pioneer.headers.call(req)
31
+ }
32
+ end
33
+ @response = EM::Synchrony.sync req
28
34
  rescue => e
29
35
  @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
30
36
  pioneer.logger.fatal(@error)
@@ -35,8 +41,6 @@ module Pioneer
35
41
  end
36
42
  end
37
43
  handle_response_error_or_return_result
38
- rescue Pioneer::HttpRetryRequest => e
39
- retry
40
44
  end
41
45
 
42
46
  #
@@ -87,8 +91,7 @@ module Pioneer
87
91
  #
88
92
  def retry(count=nil)
89
93
  if count
90
- @counter += 1
91
- skip if @counter > count
94
+ skip if @counter >= count
92
95
  end
93
96
  raise Pioneer::HttpRetryRequest
94
97
  end
@@ -105,7 +108,7 @@ module Pioneer
105
108
  # We should parse the url before sending the request
106
109
  # We use URI.escape for escaping
107
110
  # IMPORTANT: We should replace ampersand (&) in params with "&amp;" !!!
108
- # Pluses (+) weill be replaced with "%2B"
111
+ # Pluses (+) will be replaced with "%2B"
109
112
  #
110
113
  def parse_url(url)
111
114
  url = "http://" + url unless url =~ /http/
@@ -1,3 +1,3 @@
1
1
  module Pioneer
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pioneer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-13 00:00:00.000000000Z
12
+ date: 2012-03-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yajl-ruby
16
- requirement: &76352180 !ruby/object:Gem::Requirement
16
+ requirement: &77891380 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *76352180
24
+ version_requirements: *77891380
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: nokogiri
27
- requirement: &76351940 !ruby/object:Gem::Requirement
27
+ requirement: &77891160 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *76351940
35
+ version_requirements: *77891160
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: em-synchrony
38
- requirement: &76351700 !ruby/object:Gem::Requirement
38
+ requirement: &77890920 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *76351700
46
+ version_requirements: *77890920
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: em-http-request
49
- requirement: &76351430 !ruby/object:Gem::Requirement
49
+ requirement: &77890690 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *76351430
57
+ version_requirements: *77890690
58
58
  description: Simple async HTTP crawler based on em-synchrony
59
59
  email:
60
60
  - pedro.yanoviches@gmail.com