pioneer 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.travis.yml +2 -0
- data/CHANGELOG +5 -1
- data/Gemfile +3 -1
- data/README.md +113 -1
- data/lib/pioneer/request.rb +3 -3
- data/lib/pioneer/version.rb +1 -1
- data/spec/pioneer/request_spec.rb +12 -0
- data/spec/spec_helper.rb +11 -1
- metadata +7 -6
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,3 +1,115 @@
|
|
1
1
|
# Pioneer
|
2
2
|
|
3
|
-
Pioneer is
|
3
|
+
Pioneer is a simple async HTTP crawler based on em-synchrony
|
4
|
+
|
5
|
+
# Install
|
6
|
+
|
7
|
+
```bash
|
8
|
+
gem install pioneer
|
9
|
+
```
|
10
|
+
|
11
|
+
# Usage
|
12
|
+
|
13
|
+
To use `Pioneer` you should specify a class with two methods: `locations` and `processing(req)`.
|
14
|
+
|
15
|
+
The first one should return an enumerable object, and the second will accept a request object.
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
class Crawler < Pioneer::Base
|
19
|
+
def locations
|
20
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
21
|
+
end
|
22
|
+
|
23
|
+
def processing(req)
|
24
|
+
File.open(req.url, "w+") do |f|
|
25
|
+
f << req.response.response
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
Crawler.new.start
|
31
|
+
```
|
32
|
+
|
33
|
+
In this example we are saving two files with the HTML of those two sites.
|
34
|
+
|
35
|
+
`start` method will start iterating over urls and return an Array of what `processing` method returns.
|
36
|
+
|
37
|
+
# Handling request, response errors and statuses
|
38
|
+
|
39
|
+
In case of a request or response error, `Pioneer` will raise an exception. Alternatively, we can catch them this way:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
class Crawler < Pioneer::Base
|
43
|
+
def locations
|
44
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
45
|
+
end
|
46
|
+
|
47
|
+
def processing(req)
|
48
|
+
File.open(req.url, "w+") do |f|
|
49
|
+
f << req.response.response
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def if_request_error(req)
|
54
|
+
puts "Request error: #{req.error}"
|
55
|
+
end
|
56
|
+
|
57
|
+
def if_response_error(req)
|
58
|
+
puts "Response error: #{req.response.error}"
|
59
|
+
end
|
60
|
+
|
61
|
+
def if_status_203(req)
|
62
|
+
puts "He is trying to redirect me"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
```
|
66
|
+
|
67
|
+
Also, you can write `if_status_not_200` to handle all non-200 statuses, or `if_status_XXX` for any status you want.
|
68
|
+
|
69
|
+
# Overriding behavior
|
70
|
+
|
71
|
+
You can override all methods on the fly:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
crawler = Pioneer::Crawler.new # base simple crawler
|
75
|
+
crawler.locations = [url1, url2]
|
76
|
+
crawler.processing = proc{ req.response.response_header.status }
|
77
|
+
crawler.if_status_404{ |req| "Oups" }
|
78
|
+
```
|
79
|
+
|
80
|
+
Since `locations` should return an Enumerable, you can use nested crawlers to save a whole site
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
require 'pioneer'
|
84
|
+
require 'nokogiri'
|
85
|
+
class Links
|
86
|
+
include Enumerable
|
87
|
+
def initialize(link)
|
88
|
+
@links = [link]
|
89
|
+
end
|
90
|
+
|
91
|
+
def <<(link)
|
92
|
+
@links << link
|
93
|
+
end
|
94
|
+
|
95
|
+
def each
|
96
|
+
@links.each{ |url| url }
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
class LinksCrawler < Pioneer::Base
|
101
|
+
def locations
|
102
|
+
@links = Links.new("http://www.gazeta.ru")
|
103
|
+
end
|
104
|
+
|
105
|
+
def processing(req)
|
106
|
+
doc = Nokogiri::HTML.parse(req.response.response)
|
107
|
+
links = doc.css("a").map{|link| link["href"]} # + some logic to filter links to prevent duplications and another hosts etc
|
108
|
+
@links << links
|
109
|
+
File.new(req.url, "w+"){ |f| f << req.response.response }
|
110
|
+
end
|
111
|
+
end
|
112
|
+
LinksCrawler.new(concurrency: 20, redirects: 1, sleep: 0.5).start
|
113
|
+
```
|
114
|
+
|
115
|
+
... to be continued
|
data/lib/pioneer/request.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
3
|
class Request
|
4
|
-
attr_reader :pioneer, :url, :result, :response
|
4
|
+
attr_reader :pioneer, :url, :result, :response, :error
|
5
5
|
def initialize(url, pioneer)
|
6
6
|
@url, @pioneer = url, pioneer
|
7
7
|
@url = begin
|
@@ -20,12 +20,12 @@ module Pioneer
|
|
20
20
|
begin
|
21
21
|
@response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
|
22
22
|
rescue => e
|
23
|
-
error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
23
|
+
@error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
24
24
|
pioneer.logger.fatal(error)
|
25
25
|
if pioneer.respond_to? :if_request_error
|
26
26
|
return pioneer.send(:if_request_error, self)
|
27
27
|
else
|
28
|
-
raise HttpRequestError, error
|
28
|
+
raise HttpRequestError, @error
|
29
29
|
end
|
30
30
|
end
|
31
31
|
handle_response_error_or_return_result
|
data/lib/pioneer/version.rb
CHANGED
@@ -51,4 +51,16 @@ describe Pioneer::Request do
|
|
51
51
|
@lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
|
52
52
|
@lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
|
53
53
|
end
|
54
|
+
|
55
|
+
it "should use headers" do
|
56
|
+
@crawler1 = KinopoiskCrawler.new(random_header: false)
|
57
|
+
@crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
|
58
|
+
@crawler3 = KinopoiskCrawler.new(random_header: true)
|
59
|
+
# this one will redirect
|
60
|
+
@crawler1.start.must_equal [nil]
|
61
|
+
# this one will return some restrictions (it needs real headers)
|
62
|
+
(@crawler2.start.first < 10000).must_equal true
|
63
|
+
# and this one will fire up
|
64
|
+
(@crawler3.start.first > 10000).must_equal true
|
65
|
+
end
|
54
66
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -22,7 +22,6 @@ class LastfmEnum
|
|
22
22
|
|
23
23
|
def each
|
24
24
|
ARTISTS.each do |artist|
|
25
|
-
p artist
|
26
25
|
url = "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
|
27
26
|
yield url
|
28
27
|
end
|
@@ -38,4 +37,15 @@ class LastfmCrawler < Pioneer::Base
|
|
38
37
|
json = Yajl::Parser.parse(req.response.response)
|
39
38
|
json["similarartists"]["@attr"]["artist"]
|
40
39
|
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Kinopoisk
|
43
|
+
class KinopoiskCrawler < Pioneer::Base
|
44
|
+
def locations
|
45
|
+
["http://www.kinopoisk.ru/level/1/film/614667/"]
|
46
|
+
end
|
47
|
+
|
48
|
+
def processing(req)
|
49
|
+
req.response.response.size
|
50
|
+
end
|
41
51
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-22 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &76399260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *76399260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-synchrony
|
27
|
-
requirement: &
|
27
|
+
requirement: &76399050 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *76399050
|
36
36
|
description: Simple async HTTP crawler based on em-synchrony
|
37
37
|
email:
|
38
38
|
- pedro.yanoviches@gmail.com
|
@@ -41,6 +41,7 @@ extensions: []
|
|
41
41
|
extra_rdoc_files: []
|
42
42
|
files:
|
43
43
|
- .gitignore
|
44
|
+
- .travis.yml
|
44
45
|
- CHANGELOG
|
45
46
|
- Gemfile
|
46
47
|
- LICENSE
|