pioneer 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -4,3 +4,4 @@ Gemfile.lock
4
4
  pkg/*
5
5
  tmp/*
6
6
  *.log
7
+ test.rb
data/.travis.yml ADDED
@@ -0,0 +1,2 @@
1
+ rvm:
2
+ - 1.9.2
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
1
  ## v0.0.1
2
2
 
3
- * Initial release
3
+ * Initial release
4
+
5
+ ## v0.0.2
6
+
7
+ * added some options and logging
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source "http://rubygems.org"
2
2
 
3
+ gem "rake"
4
+
3
5
  # Specify your gem's dependencies in pioneer.gemspec
4
- gemspec
6
+ gemspec
data/README.md CHANGED
@@ -1,3 +1,115 @@
1
1
  # Pioneer
2
2
 
3
- Pioneer is asynchronous crawler over em-synchrony.
3
+ Pioneer is a simple async HTTP crawler based on em-synchrony
4
+
5
+ # Install
6
+
7
+ ```bash
8
+ gem install pioneer
9
+ ```
10
+
11
+ # Usage
12
+
13
+ To use `Pioneer` you should specify a class with two methods: `locations` and `processing(req)`.
14
+
15
+ The first one should return an enumerable object and the second will accept a request object.
16
+
17
+ ```ruby
18
+ class Crawler < Pioneer::Base
19
+ def locations
20
+ ["http://www.amazon.com", "http://www.apple.com"]
21
+ end
22
+
23
+ def processing(req)
24
+ File.open(req.url, "w+") do |f|
25
+ f << req.response.response
26
+ end
27
+ end
28
+ end
29
+
30
+ Crawler.new.start
31
+ ```
32
+
33
+ In this example we are saving two files with html of those two sites.
34
+
35
+ `start` method will start iterating over urls and return an Array of what `processing` method returns.
36
+
37
+ # Handling request, response errors and statuses
38
+
39
+ In case of request or response error `Pioneer` will raise an error. Or we can catch them this way:
40
+
41
+ ```ruby
42
+ class Crawler < Pioneer::Base
43
+ def locations
44
+ ["http://www.amazon.com", "http://www.apple.com"]
45
+ end
46
+
47
+ def processing(req)
48
+ File.open(req.url, "w+") do |f|
49
+ f << req.response.response
50
+ end
51
+ end
52
+
53
+ def if_request_error(req)
54
+ puts "Request error: #{req.error}"
55
+ end
56
+
57
+ def if_response_error(req)
58
+ puts "Response error: #{req.response.error}"
59
+ end
60
+
61
+ def if_status_203(req)
62
+ puts "He is trying to redirect me"
63
+ end
64
+ end
65
+ ```
66
+
67
+ You can also write `if_status_not_200` to handle all non-200 statuses, or `if_status_XXX` for any status you want.
68
+
69
+ # Overriding behavior
70
+
71
+ You can override all methods on the fly:
72
+
73
+ ```ruby
74
+ crawler = Pioneer::Crawler.new # base simple crawler
75
+ crawler.locations = [url1, url2]
76
+ crawler.processing = proc{ req.response.response_header.status }
77
+ crawler.if_status_404{ |req| "Oups" }
78
+ ```
79
+
80
+ Since `locations` should return an Enumerable, you can use nested crawlers to save a whole site
81
+
82
+ ```ruby
83
+ require 'pioneer'
84
+ require 'nokogiri'
85
+ class Links
86
+ include Enumerable
87
+ def initialize(link)
88
+ @links = [link]
89
+ end
90
+
91
+ def <<(link)
92
+ @links << link
93
+ end
94
+
95
+ def each
96
+ @links.each{ |url| url }
97
+ end
98
+ end
99
+
100
+ class LinksCrawler < Pioneer::Base
101
+ def locations
102
+ @links = Links.new("http://www.gazeta.ru")
103
+ end
104
+
105
+ def processing(req)
106
+ doc = Nokogiri::HTML.parse(req.response.response)
107
+ links = doc.css("a").map{|link| link["href"]} # + some logic to filter links to prevent duplications and another hosts etc
108
+ @links << links
109
+ File.new(req.url, "w+"){ |f| f << req.response.response }
110
+ end
111
+ end
112
+ LinksCrawler.new(concurrency: 20, redirects: 1, sleep: 0.5).start
113
+ ```
114
+
115
+ ... to be continued
@@ -1,7 +1,7 @@
1
1
  # encoding: utf-8
2
2
  module Pioneer
3
3
  class Request
4
- attr_reader :pioneer, :url, :result, :response
4
+ attr_reader :pioneer, :url, :result, :response, :error
5
5
  def initialize(url, pioneer)
6
6
  @url, @pioneer = url, pioneer
7
7
  @url = begin
@@ -20,12 +20,12 @@ module Pioneer
20
20
  begin
21
21
  @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
22
22
  rescue => e
23
- error = "Request totaly failed. Url: #{url}, error: #{e.message}"
23
+ @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
24
24
  pioneer.logger.fatal(error)
25
25
  if pioneer.respond_to? :if_request_error
26
26
  return pioneer.send(:if_request_error, self)
27
27
  else
28
- raise HttpRequestError, error
28
+ raise HttpRequestError, @error
29
29
  end
30
30
  end
31
31
  handle_response_error_or_return_result
@@ -1,3 +1,3 @@
1
1
  module Pioneer
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -51,4 +51,16 @@ describe Pioneer::Request do
51
51
  @lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
52
52
  @lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
53
53
  end
54
+
55
+ it "should use headers" do
56
+ @crawler1 = KinopoiskCrawler.new(random_header: false)
57
+ @crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
58
+ @crawler3 = KinopoiskCrawler.new(random_header: true)
59
+ # this one will redirect
60
+ @crawler1.start.must_equal [nil]
61
+ # this one will return some restrictions (it needs real headers)
62
+ (@crawler2.start.first < 10000).must_equal true
63
+ # and this one will fire up
64
+ (@crawler3.start.first > 10000).must_equal true
65
+ end
54
66
  end
data/spec/spec_helper.rb CHANGED
@@ -22,7 +22,6 @@ class LastfmEnum
22
22
 
23
23
  def each
24
24
  ARTISTS.each do |artist|
25
- p artist
26
25
  url = "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
27
26
  yield url
28
27
  end
@@ -38,4 +37,15 @@ class LastfmCrawler < Pioneer::Base
38
37
  json = Yajl::Parser.parse(req.response.response)
39
38
  json["similarartists"]["@attr"]["artist"]
40
39
  end
40
+ end
41
+
42
+ # Kinopoisk
43
+ class KinopoiskCrawler < Pioneer::Base
44
+ def locations
45
+ ["http://www.kinopoisk.ru/level/1/film/614667/"]
46
+ end
47
+
48
+ def processing(req)
49
+ req.response.response.size
50
+ end
41
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pioneer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-21 00:00:00.000000000Z
12
+ date: 2012-02-22 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yajl-ruby
16
- requirement: &73645260 !ruby/object:Gem::Requirement
16
+ requirement: &76399260 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *73645260
24
+ version_requirements: *76399260
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: em-synchrony
27
- requirement: &73644990 !ruby/object:Gem::Requirement
27
+ requirement: &76399050 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,7 +32,7 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *73644990
35
+ version_requirements: *76399050
36
36
  description: Simple async HTTP crawler based on em-synchrony
37
37
  email:
38
38
  - pedro.yanoviches@gmail.com
@@ -41,6 +41,7 @@ extensions: []
41
41
  extra_rdoc_files: []
42
42
  files:
43
43
  - .gitignore
44
+ - .travis.yml
44
45
  - CHANGELOG
45
46
  - Gemfile
46
47
  - LICENSE