pioneer 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -23,4 +23,9 @@
23
23
 
24
24
  * Added skip functionality
25
25
  * Added some docs
26
- * Added "response_body" and "response_header" shortcuts
26
+ * Added "response_body" and "response_header" shortcuts
27
+
28
+ ## v0.0.7
29
+
30
+ * Rescuing of the Retry exception is moved to the Base class, so it is triggered in the context of the main loop (it will be executed with the global sleep timeout)
31
+ * Added `headers` callback support
@@ -16,7 +16,7 @@ module Pioneer
16
16
  def initialize(opts = {})
17
17
  raise UndefinedLocations, "you should specify `locations` method in your `self.class`" unless self.methods.include? :locations
18
18
  raise UndefinedProcessing, "you should specify `processing` method in your `self.class`" unless self.methods.include? :processing
19
- raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
19
+ # raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
20
20
  @name = opts[:name] || "crawler"
21
21
  @concurrency = opts[:concurrency] || 10
22
22
  @sleep = opts[:sleep] || 0 # sleep is reversed RPS (1/RPS) - frequency of requests.
@@ -25,18 +25,24 @@ module Pioneer
25
25
  @random_header = opts[:random_header] || false
26
26
  @header = opts[:header] || nil
27
27
  @redirects = opts[:redirects] || nil
28
+ @headers = opts[:headers] #|| nil
28
29
  end
29
30
 
31
+ #
32
+ # Main method: starting crawling through locations
33
+ #
30
34
  def start
31
- raise LocationsNotEnumerable, "location should respond to `each`" unless locations.respond_to? :each
32
35
  result = []
33
36
  EM.synchrony do
34
- # Using FiberPeriodicTimerIterator that implements RPS (request per second feature)
35
- # In case @sleep is 0 it behaves like standart FiberIterator
36
37
  EM::Synchrony::FiberIterator.new(locations, concurrency).map do |url|
37
- sleep
38
+ counter = 0
38
39
  begin
39
- result << Request.new(url, self).perform
40
+ sleep
41
+ result << Request.new(url, self, counter).perform
42
+ rescue Pioneer::HttpRetryRequest => e
43
+ # return to our loop
44
+ counter += 1
45
+ retry
40
46
  rescue Pioneer::HttpSkipRequest => e
41
47
  nil # do nothing?
42
48
  end
@@ -46,7 +52,9 @@ module Pioneer
46
52
  result
47
53
  end
48
54
 
55
+ #
49
56
  # Sleep if the last request was made recently (less than the timeout period ago)
57
+ #
50
58
  def sleep
51
59
  @next_start ||= Time.now
52
60
  if @sleep > 0
@@ -58,6 +66,9 @@ module Pioneer
58
66
  end
59
67
  end
60
68
 
69
+ #
70
+ # Default Pioneer logger
71
+ #
61
72
  def logger
62
73
  @logger ||= begin
63
74
  logger = Logger.new(STDOUT)
@@ -66,6 +77,9 @@ module Pioneer
66
77
  end
67
78
  end
68
79
 
80
+ #
81
+ # Set headers, such as redirects, cookies etc
82
+ #
69
83
  def http_opts
70
84
  opts = {}
71
85
  opts[:head] = random_header if @random_header
@@ -74,11 +88,23 @@ module Pioneer
74
88
  opts
75
89
  end
76
90
 
91
+ #
92
+ # Generate random header for request
93
+ #
77
94
  def random_header
78
95
  HttpHeader.random
79
96
  end
80
97
 
98
+ #
99
+ # Headers callback
100
+ #
101
+ def headers
102
+ @headers
103
+ end
104
+
105
+ #
81
106
  # we should override only our methods: locations, processing, if_XXX
107
+ #
82
108
  def method_missing(method_name, *args, &block)
83
109
  case method_name
84
110
  when /locations.*=|processing.*=|if_.+=/
@@ -89,6 +115,9 @@ module Pioneer
89
115
  end
90
116
  end
91
117
 
118
+ #
119
+ # Overriding methods as singletons so they are available only for the current instance of the crawler
120
+ #
92
121
  def override_method(method_name, arg)
93
122
  if Proc === arg
94
123
  self.define_singleton_method method_name do |req|
@@ -3,10 +3,10 @@ module Pioneer
3
3
  class Request
4
4
  attr_reader :pioneer, :url, :result, :response, :error, :counter
5
5
 
6
- def initialize(url, pioneer)
6
+ def initialize(url, pioneer, counter=0)
7
7
  @pioneer = pioneer
8
8
  @url = parse_url(url)
9
- @counter = 0
9
+ @counter = counter
10
10
  end
11
11
 
12
12
  #
@@ -24,7 +24,13 @@ module Pioneer
24
24
  #
25
25
  def handle_request_error_or_return_result
26
26
  begin
27
- @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
27
+ req = EM::HttpRequest.new(url).aget pioneer.http_opts
28
+ if pioneer.headers
29
+ req.headers{
30
+ pioneer.headers.call(req)
31
+ }
32
+ end
33
+ @response = EM::Synchrony.sync req
28
34
  rescue => e
29
35
  @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
30
36
  pioneer.logger.fatal(@error)
@@ -35,8 +41,6 @@ module Pioneer
35
41
  end
36
42
  end
37
43
  handle_response_error_or_return_result
38
- rescue Pioneer::HttpRetryRequest => e
39
- retry
40
44
  end
41
45
 
42
46
  #
@@ -87,8 +91,7 @@ module Pioneer
87
91
  #
88
92
  def retry(count=nil)
89
93
  if count
90
- @counter += 1
91
- skip if @counter > count
94
+ skip if @counter >= count
92
95
  end
93
96
  raise Pioneer::HttpRetryRequest
94
97
  end
@@ -105,7 +108,7 @@ module Pioneer
105
108
  # We should parse the url before sending the request
106
109
  # We use URI.escape for escaping
107
110
  # IMPORTANT: We should replace ampersand (&) in params with "&amp;" !!!
108
- # Pluses (+) weill be replaced with "%2B"
111
+ # Pluses (+) will be replaced with "%2B"
109
112
  #
110
113
  def parse_url(url)
111
114
  url = "http://" + url unless url =~ /http/
@@ -1,3 +1,3 @@
1
1
  module Pioneer
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pioneer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-13 00:00:00.000000000Z
12
+ date: 2012-03-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yajl-ruby
16
- requirement: &76352180 !ruby/object:Gem::Requirement
16
+ requirement: &77891380 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *76352180
24
+ version_requirements: *77891380
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: nokogiri
27
- requirement: &76351940 !ruby/object:Gem::Requirement
27
+ requirement: &77891160 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *76351940
35
+ version_requirements: *77891160
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: em-synchrony
38
- requirement: &76351700 !ruby/object:Gem::Requirement
38
+ requirement: &77890920 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *76351700
46
+ version_requirements: *77890920
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: em-http-request
49
- requirement: &76351430 !ruby/object:Gem::Requirement
49
+ requirement: &77890690 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *76351430
57
+ version_requirements: *77890690
58
58
  description: Simple async HTTP crawler based on em-synchrony
59
59
  email:
60
60
  - pedro.yanoviches@gmail.com