pioneer 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/.travis.yml +2 -0
- data/CHANGELOG +5 -1
- data/Gemfile +3 -1
- data/README.md +113 -1
- data/lib/pioneer/request.rb +3 -3
- data/lib/pioneer/version.rb +1 -1
- data/spec/pioneer/request_spec.rb +12 -0
- data/spec/spec_helper.rb +11 -1
- metadata +7 -6
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,3 +1,115 @@
|
|
1
1
|
# Pioneer
|
2
2
|
|
3
|
-
Pioneer is
|
3
|
+
Pioneer is a simple async HTTP crawler based on em-synchrony
|
4
|
+
|
5
|
+
# Install
|
6
|
+
|
7
|
+
```bash
|
8
|
+
gem install pioneer
|
9
|
+
```
|
10
|
+
|
11
|
+
# Usage
|
12
|
+
|
13
|
+
To use `Pioneer` you should specify a class with two methods: `locations` and `processing(req)`.
|
14
|
+
|
15
|
+
The first one should return an enumerable object, and the second will accept a request object.
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
class Crawler < Pioneer::Base
|
19
|
+
def locations
|
20
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
21
|
+
end
|
22
|
+
|
23
|
+
def processing(req)
|
24
|
+
File.open(req.url, "w+") do |f|
|
25
|
+
f << req.response.response
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
Crawler.new.start
|
31
|
+
```
|
32
|
+
|
33
|
+
In this example we are saving two files with html of those two sites.
|
34
|
+
|
35
|
+
`start` method will start iterating over urls and return an Array of what `processing` method returns.
|
36
|
+
|
37
|
+
# Handling request, response errors and statuses
|
38
|
+
|
39
|
+
In case of a request or response error, `Pioneer` will raise an exception. Alternatively, we can catch the errors like this:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
class Crawler < Pioneer::Base
|
43
|
+
def locations
|
44
|
+
["http://www.amazon.com", "http://www.apple.com"]
|
45
|
+
end
|
46
|
+
|
47
|
+
def processing(req)
|
48
|
+
File.open(req.url, "w+") do |f|
|
49
|
+
f << req.response.response
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def if_request_error(req)
|
54
|
+
puts "Request error: #{req.error}"
|
55
|
+
end
|
56
|
+
|
57
|
+
def if_response_error(req)
|
58
|
+
puts "Response error: #{req.response.error}"
|
59
|
+
end
|
60
|
+
|
61
|
+
def if_status_203(req)
|
62
|
+
puts "He is trying to redirect me"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
```
|
66
|
+
|
67
|
+
You can also write `if_status_not_200` to handle all non-200 statuses, or `if_status_XXX` for any status you want.
|
68
|
+
|
69
|
+
# Overriding behavior
|
70
|
+
|
71
|
+
You can override all methods on the fly:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
crawler = Pioneer::Crawler.new # base simple crawler
|
75
|
+
crawler.locations = [url1, url2]
|
76
|
+
crawler.processing = proc{ req.response.response_header.status }
|
77
|
+
crawler.if_status_404{ |req| "Oups" }
|
78
|
+
```
|
79
|
+
|
80
|
+
Since `locations` should return an Enumerable, you can use nested crawlers to save a whole site:
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
require 'pioneer'
|
84
|
+
require 'nokogiri'
|
85
|
+
class Links
|
86
|
+
include Enumerable
|
87
|
+
def initialize(link)
|
88
|
+
@links = [link]
|
89
|
+
end
|
90
|
+
|
91
|
+
def <<(link)
|
92
|
+
@links << link
|
93
|
+
end
|
94
|
+
|
95
|
+
def each
|
96
|
+
@links.each{ |url| url }
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
class LinksCrawler < Pioneer::Base
|
101
|
+
def locations
|
102
|
+
@links = Links.new("http://www.gazeta.ru")
|
103
|
+
end
|
104
|
+
|
105
|
+
def processing(req)
|
106
|
+
doc = Nokogiri::HTML.parse(req.response.response)
|
107
|
+
links = doc.css("a").map{|link| link["href"]} # + some logic to filter links to prevent duplications and another hosts etc
|
108
|
+
@links << links
|
109
|
+
File.new(req.url, "w+"){ |f| f << req.response.response }
|
110
|
+
end
|
111
|
+
end
|
112
|
+
LinksCrawler.new(concurrency: 20, redirects: 1, sleep: 0.5).start
|
113
|
+
```
|
114
|
+
|
115
|
+
... to be continued
|
data/lib/pioneer/request.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
3
|
class Request
|
4
|
-
attr_reader :pioneer, :url, :result, :response
|
4
|
+
attr_reader :pioneer, :url, :result, :response, :error
|
5
5
|
def initialize(url, pioneer)
|
6
6
|
@url, @pioneer = url, pioneer
|
7
7
|
@url = begin
|
@@ -20,12 +20,12 @@ module Pioneer
|
|
20
20
|
begin
|
21
21
|
@response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
|
22
22
|
rescue => e
|
23
|
-
error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
23
|
+
@error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
24
24
|
pioneer.logger.fatal(error)
|
25
25
|
if pioneer.respond_to? :if_request_error
|
26
26
|
return pioneer.send(:if_request_error, self)
|
27
27
|
else
|
28
|
-
raise HttpRequestError, error
|
28
|
+
raise HttpRequestError, @error
|
29
29
|
end
|
30
30
|
end
|
31
31
|
handle_response_error_or_return_result
|
data/lib/pioneer/version.rb
CHANGED
@@ -51,4 +51,16 @@ describe Pioneer::Request do
|
|
51
51
|
@lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
|
52
52
|
@lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
|
53
53
|
end
|
54
|
+
|
55
|
+
it "should use headers" do
|
56
|
+
@crawler1 = KinopoiskCrawler.new(random_header: false)
|
57
|
+
@crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
|
58
|
+
@crawler3 = KinopoiskCrawler.new(random_header: true)
|
59
|
+
# this one will redirect
|
60
|
+
@crawler1.start.must_equal [nil]
|
61
|
+
# this one will return some restrictions (it needs real headers)
|
62
|
+
(@crawler2.start.first < 10000).must_equal true
|
63
|
+
# and this one will fire up
|
64
|
+
(@crawler3.start.first > 10000).must_equal true
|
65
|
+
end
|
54
66
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -22,7 +22,6 @@ class LastfmEnum
|
|
22
22
|
|
23
23
|
def each
|
24
24
|
ARTISTS.each do |artist|
|
25
|
-
p artist
|
26
25
|
url = "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
|
27
26
|
yield url
|
28
27
|
end
|
@@ -38,4 +37,15 @@ class LastfmCrawler < Pioneer::Base
|
|
38
37
|
json = Yajl::Parser.parse(req.response.response)
|
39
38
|
json["similarartists"]["@attr"]["artist"]
|
40
39
|
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Kinopoisk
|
43
|
+
class KinopoiskCrawler < Pioneer::Base
|
44
|
+
def locations
|
45
|
+
["http://www.kinopoisk.ru/level/1/film/614667/"]
|
46
|
+
end
|
47
|
+
|
48
|
+
def processing(req)
|
49
|
+
req.response.response.size
|
50
|
+
end
|
41
51
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-22 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &76399260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *76399260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-synchrony
|
27
|
-
requirement: &
|
27
|
+
requirement: &76399050 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *76399050
|
36
36
|
description: Simple async HTTP crawler based on em-synchrony
|
37
37
|
email:
|
38
38
|
- pedro.yanoviches@gmail.com
|
@@ -41,6 +41,7 @@ extensions: []
|
|
41
41
|
extra_rdoc_files: []
|
42
42
|
files:
|
43
43
|
- .gitignore
|
44
|
+
- .travis.yml
|
44
45
|
- CHANGELOG
|
45
46
|
- Gemfile
|
46
47
|
- LICENSE
|