pioneer 0.0.1.alpha → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +0 -1
- data/CHANGELOG +1 -9
- data/Gemfile +1 -3
- data/README.md +1 -81
- data/lib/pioneer/request.rb +3 -3
- data/lib/pioneer/version.rb +1 -1
- data/spec/pioneer/request_spec.rb +0 -12
- data/spec/spec_helper.rb +1 -11
- metadata +9 -10
- data/.travis.yml +0 -2
data/.gitignore
CHANGED
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,83 +1,3 @@
|
|
1
1
|
# Pioneer
|
2
2
|
|
3
|
-
Pioneer is
|
4
|
-
|
5
|
-
And it is very alpha right now.
|
6
|
-
|
7
|
-
# Install
|
8
|
-
|
9
|
-
```bash
|
10
|
-
gem install pioneer
|
11
|
-
```
|
12
|
-
|
13
|
-
# Usage
|
14
|
-
|
15
|
-
To use `Pioneer`, define a class with two methods: `locations` and `processing(req)`.
|
16
|
-
|
17
|
-
The first one should return an enumerable object, and the second one will accept a request object.
|
18
|
-
|
19
|
-
```ruby
|
20
|
-
class Crawler << Pioneer::Base
|
21
|
-
def locations
|
22
|
-
["http://www.amazon.com", "http://www.apple.com"]
|
23
|
-
end
|
24
|
-
|
25
|
-
def processing(req)
|
26
|
-
File.open(req.url, "w+") do |f|
|
27
|
-
f << req.response.response
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
Crawler.new.start
|
33
|
-
```
|
34
|
-
|
35
|
-
In this example we save the HTML of those two sites into two files.
|
36
|
-
|
37
|
-
The `start` method will start iterating over the URLs and return an Array of what the `processing` method returns.
|
38
|
-
|
39
|
-
# Handling request, response errors and statuses
|
40
|
-
|
41
|
-
In case of a request or response error, `Pioneer` will raise an error. Alternatively, we can handle such errors this way:
|
42
|
-
|
43
|
-
```ruby
|
44
|
-
class Crawler << Pioneer::Base
|
45
|
-
def locations
|
46
|
-
["http://www.amazon.com", "http://www.apple.com"]
|
47
|
-
end
|
48
|
-
|
49
|
-
def processing(req)
|
50
|
-
File.open(req.url, "w+") do |f|
|
51
|
-
f << req.response.response
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def if_request_error(req)
|
56
|
-
puts "Request error: #{req.error}"
|
57
|
-
end
|
58
|
-
|
59
|
-
def if_response_error(req)
|
60
|
-
puts "Response error: #{req.response.error}"
|
61
|
-
end
|
62
|
-
|
63
|
-
def if_status_203(req)
|
64
|
-
puts "He is trying to redirect me"
|
65
|
-
end
|
66
|
-
end
|
67
|
-
```
|
68
|
-
|
69
|
-
You can also write `if_status_not_200` to handle all statuses other than 200, or `if_status_XXX` for any status you want.
|
70
|
-
|
71
|
-
# Overriding behavior
|
72
|
-
|
73
|
-
You can override all methods on the fly:
|
74
|
-
|
75
|
-
```ruby
|
76
|
-
crawler = Pioneer::Crawler.new # base simple crawler
|
77
|
-
crawler.locations = [url1, url2]
|
78
|
-
crawler.processing = proc{ req.response.response_header.status }
|
79
|
-
crawler.if_status_404{ |req| "Oups" }
|
80
|
-
```
|
81
|
-
|
82
|
-
|
83
|
-
... to be continued
|
3
|
+
Pioneer is an asynchronous crawler built on em-synchrony.
|
data/lib/pioneer/request.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Pioneer
|
3
3
|
class Request
|
4
|
-
attr_reader :pioneer, :url, :result, :response
|
4
|
+
attr_reader :pioneer, :url, :result, :response
|
5
5
|
def initialize(url, pioneer)
|
6
6
|
@url, @pioneer = url, pioneer
|
7
7
|
@url = begin
|
@@ -20,12 +20,12 @@ module Pioneer
|
|
20
20
|
begin
|
21
21
|
@response = EventMachine::HttpRequest.new(url).get(pioneer.http_opts)
|
22
22
|
rescue => e
|
23
|
-
|
23
|
+
error = "Request totaly failed. Url: #{url}, error: #{e.message}"
|
24
24
|
pioneer.logger.fatal(error)
|
25
25
|
if pioneer.respond_to? :if_request_error
|
26
26
|
return pioneer.send(:if_request_error, self)
|
27
27
|
else
|
28
|
-
raise HttpRequestError,
|
28
|
+
raise HttpRequestError, error
|
29
29
|
end
|
30
30
|
end
|
31
31
|
handle_response_error_or_return_result
|
data/lib/pioneer/version.rb
CHANGED
@@ -51,16 +51,4 @@ describe Pioneer::Request do
|
|
51
51
|
@lastfm_pioneer = LastfmCrawler.new(sleep: 0.25)
|
52
52
|
@lastfm_pioneer.start.sort.must_equal LastfmEnum.const_get(:ARTISTS).sort
|
53
53
|
end
|
54
|
-
|
55
|
-
it "should use headers" do
|
56
|
-
@crawler1 = KinopoiskCrawler.new(random_header: false)
|
57
|
-
@crawler2 = KinopoiskCrawler.new(random_header: false, redirects: 1)
|
58
|
-
@crawler3 = KinopoiskCrawler.new(random_header: true)
|
59
|
-
# this one will redirect
|
60
|
-
@crawler1.start.must_equal [nil]
|
61
|
-
# this one will return some restrictions (it needs real headers)
|
62
|
-
(@crawler2.start.first < 10000).must_equal true
|
63
|
-
# and this one will fire up
|
64
|
-
(@crawler3.start.first > 10000).must_equal true
|
65
|
-
end
|
66
54
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -22,6 +22,7 @@ class LastfmEnum
|
|
22
22
|
|
23
23
|
def each
|
24
24
|
ARTISTS.each do |artist|
|
25
|
+
p artist
|
25
26
|
url = "http://ws.audioscrobbler.com/2.0/?method=artist.getsimilar&artist=#{artist}&api_key=b25b959554ed76058ac220b7b2e0a026&format=json"
|
26
27
|
yield url
|
27
28
|
end
|
@@ -37,15 +38,4 @@ class LastfmCrawler < Pioneer::Base
|
|
37
38
|
json = Yajl::Parser.parse(req.response.response)
|
38
39
|
json["similarartists"]["@attr"]["artist"]
|
39
40
|
end
|
40
|
-
end
|
41
|
-
|
42
|
-
# Kinopoisk
|
43
|
-
class KinopoiskCrawler < Pioneer::Base
|
44
|
-
def locations
|
45
|
-
["http://www.kinopoisk.ru/level/1/film/614667/"]
|
46
|
-
end
|
47
|
-
|
48
|
-
def processing(req)
|
49
|
-
req.response.response.size
|
50
|
-
end
|
51
41
|
end
|
metadata
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Petr
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &73645260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *73645260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-synchrony
|
27
|
-
requirement: &
|
27
|
+
requirement: &73644990 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *73644990
|
36
36
|
description: Simple async HTTP crawler based on em-synchrony
|
37
37
|
email:
|
38
38
|
- pedro.yanoviches@gmail.com
|
@@ -41,7 +41,6 @@ extensions: []
|
|
41
41
|
extra_rdoc_files: []
|
42
42
|
files:
|
43
43
|
- .gitignore
|
44
|
-
- .travis.yml
|
45
44
|
- CHANGELOG
|
46
45
|
- Gemfile
|
47
46
|
- LICENSE
|
@@ -75,9 +74,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
74
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
75
|
none: false
|
77
76
|
requirements:
|
78
|
-
- - ! '
|
77
|
+
- - ! '>='
|
79
78
|
- !ruby/object:Gem::Version
|
80
|
-
version:
|
79
|
+
version: '0'
|
81
80
|
requirements: []
|
82
81
|
rubyforge_project: pioneer
|
83
82
|
rubygems_version: 1.8.15
|
data/.travis.yml
DELETED