pioneer 0.0.2 → 0.0.3.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -1
- data/README.md +2 -34
- data/lib/pioneer/base.rb +1 -1
- data/lib/pioneer/version.rb +1 -1
- metadata +9 -9
data/CHANGELOG
CHANGED
data/README.md
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
Pioneer is a simple async HTTP crawler based on em-synchrony
|
4
4
|
|
5
|
+
And it is very alpha right now.
|
6
|
+
|
5
7
|
# Install
|
6
8
|
|
7
9
|
```bash
|
@@ -77,39 +79,5 @@ crawler.processing = proc{ req.response.response_header.status }
|
|
77
79
|
crawler.if_status_404{ |req| "Oups" }
|
78
80
|
```
|
79
81
|
|
80
|
-
Since `locations` only has to return an Enumerable, you can use nested crawlers to save a whole site
|
81
|
-
|
82
|
-
```ruby
|
83
|
-
require 'pioneer'
|
84
|
-
require 'nokogiri'
|
85
|
-
class Links
|
86
|
-
include Enumerable
|
87
|
-
def initialize(link)
|
88
|
-
@links = [link]
|
89
|
-
end
|
90
|
-
|
91
|
-
def <<(link)
|
92
|
-
@links << link
|
93
|
-
end
|
94
|
-
|
95
|
-
def each
|
96
|
-
@links.each{ |url| url }
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
class LinksCrawler < Pioneer::Base
|
101
|
-
def locations
|
102
|
-
@links = Links.new("http://www.gazeta.ru")
|
103
|
-
end
|
104
|
-
|
105
|
-
def processing(req)
|
106
|
-
doc = Nokogiri::HTML.parse(req.response.response)
|
107
|
-
links = doc.css("a").map{|link| link["href"]} # + some logic to filter links to prevent duplications and another hosts etc
|
108
|
-
@links << links
|
109
|
-
File.new(req.url, "w+"){ |f| f << req.response.response }
|
110
|
-
end
|
111
|
-
end
|
112
|
-
LinksCrawler.new(concurrency: 20, redirects: 1, sleep: 0.5).start
|
113
|
-
```
|
114
82
|
|
115
83
|
... to be continued
|
data/lib/pioneer/base.rb
CHANGED
data/lib/pioneer/version.rb
CHANGED
metadata
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pioneer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.3.alpha
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Petr
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-28 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &73021090 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *73021090
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-synchrony
|
27
|
-
requirement: &
|
27
|
+
requirement: &73020880 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *73020880
|
36
36
|
description: Simple async HTTP crawler based on em-synchrony
|
37
37
|
email:
|
38
38
|
- pedro.yanoviches@gmail.com
|
@@ -75,9 +75,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
75
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
76
|
none: false
|
77
77
|
requirements:
|
78
|
-
- - ! '
|
78
|
+
- - ! '>'
|
79
79
|
- !ruby/object:Gem::Version
|
80
|
-
version:
|
80
|
+
version: 1.3.1
|
81
81
|
requirements: []
|
82
82
|
rubyforge_project: pioneer
|
83
83
|
rubygems_version: 1.8.15
|