pioneer 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -4,3 +4,4 @@ Gemfile.lock
4
4
  pkg/*
5
5
  tmp/*
6
6
  *.log
7
+ test.rb
data/.travis.yml ADDED
@@ -0,0 +1,2 @@
1
+ rvm:
2
+ - 1.9.2
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
1
  ## v0.0.1
2
2
 
3
- * Initial release
3
+ * Initial release
4
+
5
+ ## v0.0.2
6
+
7
+ * added some options and logging
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source "http://rubygems.org"
2
2
 
3
+ gem "rake"
4
+
3
5
  # Specify your gem's dependencies in pioneer.gemspec
4
- gemspec
6
+ gemspec
data/README.md CHANGED
@@ -1,3 +1,115 @@
1
1
  # Pioneer
2
2
 
3
- Pioneer is asynchronous crawler over em-synchrony.
3
+ Pioneer is a simple async HTTP crawler based on em-synchrony
4
+
5
+ # Install
6
+
7
+ ```bash
8
+ gem install pioneer
9
+ ```
10
+
11
+ # Usage
12
+
13
+ To use `Pioneer` you should specify a class with two methods: `locations` and `processing(req)`.
14
+
15
+ The first one should return an enumerable object and the second will accept a request object.
16
+
17
+ ```ruby
18
+ class Crawler < Pioneer::Base
19
+ def locations
20
+ ["http://www.amazon.com", "http://www.apple.com"]
21
+ end
22
+
23
+ def processing(req)
24
+ File.open(req.url, "w+") do |f|
25
+ f << req.response.response
26
+ end
27
+ end
28
+ end
29
+
30
+ Crawler.new.start
31
+ ```
32
+
33
+ In this example we are saving two files with html of those two sites.
34
+
35
+ `start` method will start iterating over urls and return an Array of what `processing` method returns.
36
+
37
+ # Handling request, response errors and statuses
38
+
39
+ In case of request or response error `Pioneer` will raise an error. Or we can catch them this way:
40
+
41
+ ```ruby
42
+ class Crawler < Pioneer::Base
43
+ def locations
44
+ ["http://www.amazon.com", "http://www.apple.com"]
45
+ end
46
+
47
+ def processing(req)
48
+ File.open(req.url, "w+") do |f|
49
+ f << req.response.response
50
+ end
51
+ end
52
+
53
+ def if_request_error(req)
54
+ puts "Request error: #{req.error}"
55
+ end
56
+
57
+ def if_response_error(req)
58
+ puts "Response error: #{req.response.error}"
59
+ end
60
+
61
+ def if_status_203(req)
62
+ puts "He is trying to redirect me"
63
+ end
64
+ end
65
+ ```
66
+
67
+ You can also write `if_status_not_200` to handle all statuses other than 200, or `if_status_XXX` for any specific status you want.
68
+
69
+ # Overriding behavior
70
+
71
+ You can override all methods on the fly:
72
+
73
+ ```ruby
74
+ crawler = Pioneer::Crawler.new # base simple crawler
75
+ crawler.locations = [url1, url2]
76
+ crawler.processing = proc{ req.response.response_header.status }
77
+ crawler.if_status_404{ |req| "Oups" }
78
+ ```
79
+
80
+ Since `locations` should return an Enumerable, you can use nested crawlers to save a whole site
81
+
82
+ ```ruby
83
+ require 'pioneer'
84
+ require 'nokogiri'
85
+ class Links
86
+ include Enumerable
87
+ def initialize(link)
88
+ @links = [link]
89
+ end
90
+
91
+ def <<(link)
92
+ @links << link
93
+ end
94
+
95
+ def each
96
+ @links.each{ |url| yield url }
97
+ end
98
+ end
99
+
100
+ class LinksCrawler < Pioneer::Base
101
+ def locations
102
+ @links = Links.new("http://www.gazeta.ru")
103
+ end
104
+
105
+ def processing(req)
106
+ doc = Nokogiri::HTML.parse(req.response.response)
107
+ links = doc.css("a").map{|link| link["href"]} # + some logic to filter links to prevent duplications and another hosts etc
108
+ @links << links
109
+ File.open(req.url, "w+"){ |f| f << req.response.response }
110
+ end
111
+ end
112
+ LinksCrawler.new(concurrency: 20, redirects: 1, sleep: 0.5).start
113
+ ```
114
+
115
+ ... to be continued
@@ -1,7 +1,7 @@
1
1
  # encoding: utf-8
2
2
  module Pioneer
3
3
  class Request
4
- attr_reader :pioneer, :url, :result, :response
4
+ attr_reader :pioneer, :url, :result, :response, :error
5
5
  def initialize(url, pioneer)
6
6
  @url, @pioneer = url, pioneer
7
7
  @url = begin
@@ -20,12 +20,12 @@ module Pioneer
20
20
  begin
21
21
  @response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
22
22
  rescue => e
23
- error = "Request totaly failed. Url: #{url}, error: #{e.message}"
23
+ @error = "Request totaly failed. Url: #{url}, error: #{e.message}"
24
24
  pioneer.logger.fatal(error)
25
25
  if pioneer.respond_to? :if_request_error
26
26
  return pioneer.send(:if_request_error, self)
27
27
  else
28
- raise HttpRequestError, error
28
+ raise HttpRequestError, @error
29
29
  end
30
30
  end
31
31
  handle_response_error_or_return_result
@@ -1,3 +1,3 @@
1
1
  module Pioneer
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -51,4 +51,16 @@ describe Pioneer::Request do
51
51
  @lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
52
52
  @lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
53
53
  end
54
+
55
+ it "should use headers" do
56
+ @crawler1 = KinopoiskCrawler.new(random_header: false)
57
+ @crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
58
+ @crawler3 = KinopoiskCrawler.new(random_header: true)
59
+ # this one will redirect
60
+ @crawler1.start.must_equal [nil]
61
+ # this one will return some restrictions (it needs real headers)
62
+ (@crawler2.start.first < 10000).must_equal true
63
+ # and this one will fire up
64
+ (@crawler3.start.first > 10000).must_equal true
65
+ end
54
66
  end
data/spec/spec_helper.rb CHANGED
@@ -22,7 +22,6 @@ class LastfmEnum
22
22
 
23
23
  def each
24
24
  ARTISTS.each do |artist|
25
- p artist
26
25
  url = "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
27
26
  yield url
28
27
  end
@@ -38,4 +37,15 @@ class LastfmCrawler < Pioneer::Base
38
37
  json = Yajl::Parser.parse(req.response.response)
39
38
  json["similarartists"]["@attr"]["artist"]
40
39
  end
40
+ end
41
+
42
+ # Kinopoisk
43
+ class KinopoiskCrawler < Pioneer::Base
44
+ def locations
45
+ ["http://www.kinopoisk.ru/level/1/film/614667/"]
46
+ end
47
+
48
+ def processing(req)
49
+ req.response.response.size
50
+ end
41
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pioneer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-21 00:00:00.000000000Z
12
+ date: 2012-02-22 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yajl-ruby
16
- requirement: &73645260 !ruby/object:Gem::Requirement
16
+ requirement: &76399260 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *73645260
24
+ version_requirements: *76399260
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: em-synchrony
27
- requirement: &73644990 !ruby/object:Gem::Requirement
27
+ requirement: &76399050 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,7 +32,7 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *73644990
35
+ version_requirements: *76399050
36
36
  description: Simple async HTTP crawler based on em-synchrony
37
37
  email:
38
38
  - pedro.yanoviches@gmail.com
@@ -41,6 +41,7 @@ extensions: []
41
41
  extra_rdoc_files: []
42
42
  files:
43
43
  - .gitignore
44
+ - .travis.yml
44
45
  - CHANGELOG
45
46
  - Gemfile
46
47
  - LICENSE