pioneer 0.0.1.alpha → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -1
- data/CHANGELOG +1 -9
- data/Gemfile +1 -3
- data/README.md +1 -81
- data/lib/pioneer/request.rb +3 -3
- data/lib/pioneer/version.rb +1 -1
- data/spec/pioneer/request_spec.rb +0 -12
- data/spec/spec_helper.rb +1 -11
- metadata +9 -10
- data/.travis.yml +0 -2
data/.gitignore
CHANGED
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,83 +1,3 @@
|
|
1
1
|
# Pioneer
|
2
2
|
|
3
|
-
Pioneer is
|
4
|
-
|
5
|
-
And it is very alpha right now.
|
6
|
-
|
7
|
-
# Install
|
8
|
-
|
9
|
-
```bash
|
10
|
-
gem install pioneer
|
11
|
-
```
|
12
|
-
|
13
|
-
# Usage
|
14
|
-
|
15
|
-
To use `Pioneer`, define a class with two methods: `locations` and `processing(req)`.
|
16
|
-
|
17
|
-
The first should return an enumerable object, and the second accepts a request object.
|
18
|
-
|
19
|
-
```ruby
|
20
|
-
class Crawler << Pioneer::Base
|
21
|
-
def locations
|
22
|
-
["http://www.amazon.com", "http://www.apple.com"]
|
23
|
-
end
|
24
|
-
|
25
|
-
def processing(req)
|
26
|
-
File.open(req.url, "w+") do |f|
|
27
|
-
f << req.response.response
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
Crawler.new.start
|
33
|
-
```
|
34
|
-
|
35
|
-
In this example we save the HTML of those two sites to two files.
|
36
|
-
|
37
|
-
The `start` method iterates over the URLs and returns an Array of the values returned by the `processing` method.
|
38
|
-
|
39
|
-
# Handling request, response errors and statuses
|
40
|
-
|
41
|
-
If a request or response error occurs, `Pioneer` will raise an error. Alternatively, you can handle these errors this way:
|
42
|
-
|
43
|
-
```ruby
|
44
|
-
class Crawler << Pioneer::Base
|
45
|
-
def locations
|
46
|
-
["http://www.amazon.com", "http://www.apple.com"]
|
47
|
-
end
|
48
|
-
|
49
|
-
def processing(req)
|
50
|
-
File.open(req.url, "w+") do |f|
|
51
|
-
f << req.response.response
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def if_request_error(req)
|
56
|
-
puts "Request error: #{req.error}"
|
57
|
-
end
|
58
|
-
|
59
|
-
def if_response_error(req)
|
60
|
-
puts "Response error: #{req.response.error}"
|
61
|
-
end
|
62
|
-
|
63
|
-
def if_status_203(req)
|
64
|
-
puts "He is trying to redirect me"
|
65
|
-
end
|
66
|
-
end
|
67
|
-
```
|
68
|
-
|
69
|
-
You can also write `if_status_not_200` to handle every status other than 200, or `if_status_XXX` for any specific status you want.
|
70
|
-
|
71
|
-
# Overriding behavior
|
72
|
-
|
73
|
-
You can override all methods on the fly:
|
74
|
-
|
75
|
-
```ruby
|
76
|
-
crawler = Pioneer::Crawler.new # base simple crawler
|
77
|
-
crawler.locations = [url1, url2]
|
78
|
-
crawler.processing = proc{ req.response.response_header.status }
|
79
|
-
crawler.if_status_404{ |req| "Oups" }
|
80
|
-
```
|
81
|
-
|
82
|
-
|
83
|
-
... to be continued
|
3
|
+
Pioneer is an asynchronous crawler built on em-synchrony.
|
data/lib/pioneer/request.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
3
|
class Request
|
4
|
-
attr_reader :pioneer, :url, :result, :response
|
4
|
+
attr_reader :pioneer, :url, :result, :response
|
5
5
|
def initialize(url, pioneer)
|
6
6
|
@url, @pioneer = url, pioneer
|
7
7
|
@url = begin
|
@@ -20,12 +20,12 @@ module Pioneer
|
|
20
20
|
begin
|
21
21
|
@response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
|
22
22
|
rescue => e
|
23
|
-
|
23
|
+
error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
24
24
|
pioneer.logger.fatal(error)
|
25
25
|
if pioneer.respond_to? :if_request_error
|
26
26
|
return pioneer.send(:if_request_error, self)
|
27
27
|
else
|
28
|
-
raise HttpRequestError,
|
28
|
+
raise HttpRequestError, error
|
29
29
|
end
|
30
30
|
end
|
31
31
|
handle_response_error_or_return_result
|
data/lib/pioneer/version.rb
CHANGED
@@ -51,16 +51,4 @@ describe Pioneer::Request do
|
|
51
51
|
@lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
|
52
52
|
@lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
|
53
53
|
end
|
54
|
-
|
55
|
-
it "should use headers" do
|
56
|
-
@crawler1 = KinopoiskCrawler.new(random_header: false)
|
57
|
-
@crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
|
58
|
-
@crawler3 = KinopoiskCrawler.new(random_header: true)
|
59
|
-
# this one will redirect
|
60
|
-
@crawler1.start.must_equal [nil]
|
61
|
-
# this one will return some restrictions (it needs real headers)
|
62
|
-
(@crawler2.start.first < 10000).must_equal true
|
63
|
-
# and this one will fire up
|
64
|
-
(@crawler3.start.first > 10000).must_equal true
|
65
|
-
end
|
66
54
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -22,6 +22,7 @@ class LastfmEnum
|
|
22
22
|
|
23
23
|
def each
|
24
24
|
ARTISTS.each do |artist|
|
25
|
+
p artist
|
25
26
|
url = "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
|
26
27
|
yield url
|
27
28
|
end
|
@@ -37,15 +38,4 @@ class LastfmCrawler < Pioneer::Base
|
|
37
38
|
json = Yajl::Parser.parse(req.response.response)
|
38
39
|
json["similarartists"]["@attr"]["artist"]
|
39
40
|
end
|
40
|
-
end
|
41
|
-
|
42
|
-
# Kinopoisk
|
43
|
-
class KinopoiskCrawler < Pioneer::Base
|
44
|
-
def locations
|
45
|
-
["http://www.kinopoisk.ru/level/1/film/614667/"]
|
46
|
-
end
|
47
|
-
|
48
|
-
def processing(req)
|
49
|
-
req.response.response.size
|
50
|
-
end
|
51
41
|
end
|
metadata
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Petr
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &73645260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *73645260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-synchrony
|
27
|
-
requirement: &
|
27
|
+
requirement: &73644990 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *73644990
|
36
36
|
description: Simple async HTTP crawler based on em-synchrony
|
37
37
|
email:
|
38
38
|
- pedro.yanoviches@gmail.com
|
@@ -41,7 +41,6 @@ extensions: []
|
|
41
41
|
extra_rdoc_files: []
|
42
42
|
files:
|
43
43
|
- .gitignore
|
44
|
-
- .travis.yml
|
45
44
|
- CHANGELOG
|
46
45
|
- Gemfile
|
47
46
|
- LICENSE
|
@@ -75,9 +74,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
74
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
75
|
none: false
|
77
76
|
requirements:
|
78
|
-
- - ! '
|
77
|
+
- - ! '>='
|
79
78
|
- !ruby/object:Gem::Version
|
80
|
-
version:
|
79
|
+
version: '0'
|
81
80
|
requirements: []
|
82
81
|
rubyforge_project: pioneer
|
83
82
|
rubygems_version: 1.8.15
|
data/.travis.yml
DELETED