wombat 2.1.1 → 2.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +13 -11
- data/VERSION +1 -1
- data/examples/hashes.rb +43 -0
- data/lib/wombat.rb +12 -1
- data/lib/wombat/dsl/follower.rb +2 -2
- data/lib/wombat/processing/node_selector.rb +20 -9
- data/lib/wombat/processing/parser.rb +1 -4
- data/spec/integration/integration_spec.rb +47 -17
- data/spec/wombat_spec.rb +8 -1
- data/wombat.gemspec +2 -1
- metadata +2 -1
data/README.md
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# Wombat
|
2
2
|
|
3
|
-
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium] [![Code Climate](https://codeclimate.com/github/felipecsl/wombat.png)][codeclimate]
|
3
|
+
[[![Gem Version](https://badge.fury.io/rb/wombat.png)](http://badge.fury.io/rb/wombat)][rubygems] [![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)][travis] [![Dependency Status](https://gemnasium.com/felipecsl/wombat.png?travis)][gemnasium] [![Code Climate](https://codeclimate.com/github/felipecsl/wombat.png)][codeclimate]
|
4
4
|
|
5
|
+
[rubygems]: http://rubygems.org/gems/wombat
|
5
6
|
[travis]: http://travis-ci.org/felipecsl/wombat
|
6
7
|
[gemnasium]: https://gemnasium.com/felipecsl/wombat
|
7
8
|
[codeclimate]: https://codeclimate.com/github/felipecsl/wombat
|
@@ -25,19 +26,19 @@ Wombat.crawl do
|
|
25
26
|
base_url "http://www.github.com"
|
26
27
|
path "/"
|
27
28
|
|
28
|
-
headline "xpath=//h1"
|
29
|
-
subheading "css=p.subheading"
|
29
|
+
headline xpath: "//h1"
|
30
|
+
subheading css: "p.subheading"
|
30
31
|
|
31
|
-
what_is "css=.teaser h3", :list
|
32
|
+
what_is({ css: ".teaser h3" }, :list)
|
32
33
|
|
33
34
|
links do
|
34
|
-
explore 'xpath=//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
|
35
|
+
explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
|
35
36
|
e.gsub(/Explore/, "Love")
|
36
37
|
end
|
37
38
|
|
38
|
-
search 'css=.search'
|
39
|
-
features 'css=.features'
|
40
|
-
blog 'css=.blog'
|
39
|
+
search css: '.search'
|
40
|
+
features css: '.features'
|
41
|
+
blog css: '.blog'
|
41
42
|
end
|
42
43
|
end
|
43
44
|
```
|
@@ -62,10 +63,11 @@ end
|
|
62
63
|
}
|
63
64
|
```
|
64
65
|
|
65
|
-
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the
|
66
|
-
### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
|
67
|
-
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
|
66
|
+
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the links below:
|
68
67
|
|
68
|
+
### [Wiki](http://github.com/felipecsl/wombat/wiki).
|
69
|
+
### [API Documentation](http://rubydoc.info/gems/wombat/2.1.1/frames)
|
70
|
+
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
|
69
71
|
|
70
72
|
## Contributing to Wombat
|
71
73
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.1.1
|
1
|
+
2.1.2
|
data/examples/hashes.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
data = Wombat.crawl do
|
5
|
+
base_url "http://www.github.com"
|
6
|
+
path "/"
|
7
|
+
|
8
|
+
headline xpath: "//h1"
|
9
|
+
subheading css: "p.subheading"
|
10
|
+
|
11
|
+
what_is({ css: ".teaser h3" }, :list)
|
12
|
+
|
13
|
+
links do
|
14
|
+
explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
|
15
|
+
e.gsub(/Explore/, "Love")
|
16
|
+
end
|
17
|
+
|
18
|
+
search css: '.search'
|
19
|
+
features css: '.features'
|
20
|
+
blog css: '.blog'
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
pp data
|
25
|
+
|
26
|
+
=begin
|
27
|
+
pp data
|
28
|
+
{
|
29
|
+
"headline"=>"Build software better, together.",
|
30
|
+
"subheading"=>
|
31
|
+
"Powerful collaboration, review, and code management for open source and private development projects.",
|
32
|
+
"what_is"=>
|
33
|
+
["Great collaboration starts with communication.",
|
34
|
+
"Manage and contribute from all your devices.",
|
35
|
+
"The world’s largest open source community."],
|
36
|
+
"links"=>
|
37
|
+
{"explore"=>"Love GitHub",
|
38
|
+
"search"=>"Search",
|
39
|
+
"features"=>"Features",
|
40
|
+
"blog"=>"Blog"
|
41
|
+
}
|
42
|
+
}
|
43
|
+
=end
|
data/lib/wombat.rb
CHANGED
@@ -4,12 +4,23 @@ require 'wombat/crawler'
|
|
4
4
|
|
5
5
|
module Wombat
|
6
6
|
class << self
|
7
|
+
|
8
|
+
attr_reader :proxy_args
|
9
|
+
|
7
10
|
def crawl(&block)
|
8
11
|
klass = Class.new
|
9
12
|
klass.send(:include, Wombat::Crawler)
|
10
13
|
klass.new.crawl(&block)
|
11
14
|
end
|
12
|
-
|
15
|
+
|
16
|
+
def configure
|
17
|
+
yield self
|
18
|
+
end
|
19
|
+
|
20
|
+
def set_proxy(*args)
|
21
|
+
@proxy_args = args
|
22
|
+
end
|
23
|
+
|
13
24
|
alias_method :scrape, :crawl
|
14
25
|
end
|
15
26
|
end
|
data/lib/wombat/dsl/follower.rb
CHANGED
@@ -5,12 +5,12 @@ module Wombat
|
|
5
5
|
|
6
6
|
def initialize(name, selector)
|
7
7
|
@wombat_property_selector = selector
|
8
|
-
|
8
|
+
|
9
9
|
super(name)
|
10
10
|
end
|
11
11
|
|
12
12
|
# So that Property::Locators::Iterator can identify this class
|
13
|
-
# as
|
13
|
+
# as a follow property.
|
14
14
|
def wombat_property_format
|
15
15
|
:follow
|
16
16
|
end
|
data/lib/wombat/processing/node_selector.rb
CHANGED
@@ -1,12 +1,23 @@
|
|
1
1
|
module Wombat
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
2
|
+
module Processing
|
3
|
+
module NodeSelector
|
4
|
+
def select_nodes(selector, namespaces = nil)
|
5
|
+
return [selector.to_s] if selector.is_a? Symbol
|
6
|
+
|
7
|
+
if selector.is_a? String
|
8
|
+
return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
|
9
|
+
return @context.css selector[4..-1] if selector.start_with? "css="
|
10
|
+
end
|
11
|
+
|
12
|
+
if selector.is_a? Hash
|
13
|
+
method = selector.flatten.first
|
14
|
+
selector = selector.flatten.last
|
15
|
+
return @context.xpath selector, namespaces if method == :xpath
|
16
|
+
return @context.css selector if method == :css
|
17
|
+
end
|
18
|
+
|
19
|
+
[selector]
|
20
|
+
end
|
21
|
+
end
|
11
22
|
end
|
12
23
|
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -6,7 +6,7 @@ describe 'basic crawler setup' do
|
|
6
6
|
VCR.use_cassette('basic_crawler_page') do
|
7
7
|
crawler = Class.new
|
8
8
|
crawler.send(:include, Wombat::Crawler)
|
9
|
-
|
9
|
+
|
10
10
|
crawler.base_url "http://www.terra.com.br"
|
11
11
|
crawler.path '/portal'
|
12
12
|
|
@@ -32,6 +32,36 @@ describe 'basic crawler setup' do
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
+
it 'should support hash based selectors' do
|
36
|
+
VCR.use_cassette('basic_crawler_page') do
|
37
|
+
crawler = Class.new
|
38
|
+
crawler.send(:include, Wombat::Crawler)
|
39
|
+
|
40
|
+
crawler.base_url "http://www.terra.com.br"
|
41
|
+
crawler.path '/portal'
|
42
|
+
|
43
|
+
crawler.search css: ".btn-search"
|
44
|
+
crawler.social do
|
45
|
+
twitter css: ".ctn-bar li.last"
|
46
|
+
end
|
47
|
+
crawler.links({css: ".ctn-links"}, :iterator) do
|
48
|
+
menu css: "a"
|
49
|
+
end
|
50
|
+
crawler.subheader css: "h2.ttl-dynamic" do |h|
|
51
|
+
h.gsub("London", "Londres")
|
52
|
+
end
|
53
|
+
|
54
|
+
crawler_instance = crawler.new
|
55
|
+
|
56
|
+
results = crawler_instance.crawl
|
57
|
+
|
58
|
+
results["search"].should == "Buscar"
|
59
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
60
|
+
results["subheader"].should == "Londres 2012"
|
61
|
+
results["social"]["twitter"].should == "Verão"
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
35
65
|
it 'should clear iterators between multiple runs' do
|
36
66
|
crawler = Class.new
|
37
67
|
crawler.send(:include, Wombat::Crawler)
|
@@ -56,7 +86,7 @@ describe 'basic crawler setup' do
|
|
56
86
|
VCR.use_cassette('basic_crawler_page') do
|
57
87
|
results = crawler_instance.crawl
|
58
88
|
end
|
59
|
-
|
89
|
+
|
60
90
|
results["links"].should == result_hash
|
61
91
|
end
|
62
92
|
|
@@ -123,7 +153,7 @@ describe 'basic crawler setup' do
|
|
123
153
|
VCR.use_cassette('for_each_page') do
|
124
154
|
crawler = Class.new
|
125
155
|
crawler.send(:include, Wombat::Crawler)
|
126
|
-
|
156
|
+
|
127
157
|
crawler.base_url "https://www.github.com"
|
128
158
|
crawler.path "/explore"
|
129
159
|
|
@@ -151,13 +181,13 @@ describe 'basic crawler setup' do
|
|
151
181
|
VCR.use_cassette('xml_with_namespace') do
|
152
182
|
crawler = Class.new
|
153
183
|
crawler.send(:include, Wombat::Crawler)
|
154
|
-
|
184
|
+
|
155
185
|
crawler.document_format :xml
|
156
186
|
crawler.base_url "http://ws.audioscrobbler.com"
|
157
187
|
crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
|
158
188
|
|
159
189
|
crawler.artist "xpath=//title", :list
|
160
|
-
|
190
|
+
|
161
191
|
crawler.location 'xpath=//event', :iterator do
|
162
192
|
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
163
193
|
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
@@ -168,18 +198,18 @@ describe 'basic crawler setup' do
|
|
168
198
|
iterator = results['location']
|
169
199
|
|
170
200
|
iterator.should == [
|
171
|
-
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
172
|
-
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
173
|
-
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
174
|
-
{"latitude"=>"37.870873", "longitude"=>"-122.269313"},
|
175
|
-
{"latitude"=>"37.782348", "longitude"=>"-122.408059"},
|
176
|
-
{"latitude"=>"37.775529", "longitude"=>"-122.437757"},
|
177
|
-
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
178
|
-
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
179
|
-
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
201
|
+
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
202
|
+
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
203
|
+
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
204
|
+
{"latitude"=>"37.870873", "longitude"=>"-122.269313"},
|
205
|
+
{"latitude"=>"37.782348", "longitude"=>"-122.408059"},
|
206
|
+
{"latitude"=>"37.775529", "longitude"=>"-122.437757"},
|
207
|
+
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
208
|
+
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
209
|
+
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
180
210
|
{"latitude"=>"37.788978", "longitude"=>"-122.40664"}
|
181
211
|
]
|
182
|
-
|
212
|
+
|
183
213
|
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
184
214
|
end
|
185
215
|
end
|
@@ -188,7 +218,7 @@ describe 'basic crawler setup' do
|
|
188
218
|
VCR.use_cassette('follow_links') do
|
189
219
|
crawler = Class.new
|
190
220
|
crawler.send(:include, Wombat::Crawler)
|
191
|
-
|
221
|
+
|
192
222
|
crawler.base_url "https://www.github.com"
|
193
223
|
crawler.path "/"
|
194
224
|
|
@@ -199,7 +229,7 @@ describe 'basic crawler setup' do
|
|
199
229
|
crawler_instance = crawler.new
|
200
230
|
results = crawler_instance.crawl
|
201
231
|
|
202
|
-
results.should == {
|
232
|
+
results.should == {
|
203
233
|
"github" => [
|
204
234
|
{ "heading"=>"GitHub helps people build software together." },
|
205
235
|
{ "heading"=>nil },
|
data/spec/wombat_spec.rb
CHANGED
@@ -17,9 +17,16 @@ describe Wombat do
|
|
17
17
|
Wombat.scrape
|
18
18
|
end
|
19
19
|
|
20
|
+
it 'should provide configuration method with block' do
|
21
|
+
Wombat.configure do |config|
|
22
|
+
config.set_proxy "10.0.0.1", 8080
|
23
|
+
end
|
24
|
+
Wombat.proxy_args.should == ["10.0.0.1", 8080]
|
25
|
+
end
|
26
|
+
|
20
27
|
it 'should accept regular properties (non-selectors)' do
|
21
28
|
VCR.use_cassette('broken_selector') do
|
22
|
-
lambda {
|
29
|
+
lambda {
|
23
30
|
Wombat.crawl do
|
24
31
|
base_url "http://www.github.com"
|
25
32
|
path "/"
|
data/wombat.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "2.1.1"
|
8
|
+
s.version = "2.1.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
@@ -27,6 +27,7 @@ Gem::Specification.new do |s|
|
|
27
27
|
"README.md",
|
28
28
|
"Rakefile",
|
29
29
|
"VERSION",
|
30
|
+
"examples/hashes.rb",
|
30
31
|
"examples/iterator.rb",
|
31
32
|
"examples/list.rb",
|
32
33
|
"examples/no_class.rb",
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- README.md
|
190
190
|
- Rakefile
|
191
191
|
- VERSION
|
192
|
+
- examples/hashes.rb
|
192
193
|
- examples/iterator.rb
|
193
194
|
- examples/list.rb
|
194
195
|
- examples/no_class.rb
|