wombat 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +13 -11
- data/VERSION +1 -1
- data/examples/hashes.rb +43 -0
- data/lib/wombat.rb +12 -1
- data/lib/wombat/dsl/follower.rb +2 -2
- data/lib/wombat/processing/node_selector.rb +20 -9
- data/lib/wombat/processing/parser.rb +1 -4
- data/spec/integration/integration_spec.rb +47 -17
- data/spec/wombat_spec.rb +8 -1
- data/wombat.gemspec +2 -1
- metadata +2 -1
data/README.md
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# Wombat
|
2
2
|
|
3
|
-
[][travis] [][gemnasium] [](http://badge.fury.io/rb/wombat)][rubygems] [][travis] [][gemnasium] [][codeclimate]
|
4
4
|
|
5
|
+
[rubygems]: http://rubygems.org/gems/wombat
|
5
6
|
[travis]: http://travis-ci.org/felipecsl/wombat
|
6
7
|
[gemnasium]: https://gemnasium.com/felipecsl/wombat
|
7
8
|
[codeclimate]: https://codeclimate.com/github/felipecsl/wombat
|
@@ -25,19 +26,19 @@ Wombat.crawl do
|
|
25
26
|
base_url "http://www.github.com"
|
26
27
|
path "/"
|
27
28
|
|
28
|
-
headline "
|
29
|
-
subheading "
|
29
|
+
headline xpath: "//h1"
|
30
|
+
subheading css: "p.subheading"
|
30
31
|
|
31
|
-
what_is "
|
32
|
+
what_is({ css: ".teaser h3" }, :list)
|
32
33
|
|
33
34
|
links do
|
34
|
-
explore '
|
35
|
+
explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
|
35
36
|
e.gsub(/Explore/, "Love")
|
36
37
|
end
|
37
38
|
|
38
|
-
search '
|
39
|
-
features '
|
40
|
-
blog '
|
39
|
+
search css: '.search'
|
40
|
+
features css: '.features'
|
41
|
+
blog css: '.blog'
|
41
42
|
end
|
42
43
|
end
|
43
44
|
```
|
@@ -62,10 +63,11 @@ end
|
|
62
63
|
}
|
63
64
|
```
|
64
65
|
|
65
|
-
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the
|
66
|
-
### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
|
67
|
-
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
|
66
|
+
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the links below:
|
68
67
|
|
68
|
+
### [Wiki](http://github.com/felipecsl/wombat/wiki).
|
69
|
+
### [API Documentation](http://rubydoc.info/gems/wombat/2.1.1/frames)
|
70
|
+
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
|
69
71
|
|
70
72
|
## Contributing to Wombat
|
71
73
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.1.1
|
1
|
+
2.1.2
|
data/examples/hashes.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
data = Wombat.crawl do
|
5
|
+
base_url "http://www.github.com"
|
6
|
+
path "/"
|
7
|
+
|
8
|
+
headline xpath: "//h1"
|
9
|
+
subheading css: "p.subheading"
|
10
|
+
|
11
|
+
what_is({ css: ".teaser h3" }, :list)
|
12
|
+
|
13
|
+
links do
|
14
|
+
explore xpath: '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a' do |e|
|
15
|
+
e.gsub(/Explore/, "Love")
|
16
|
+
end
|
17
|
+
|
18
|
+
search css: '.search'
|
19
|
+
features css: '.features'
|
20
|
+
blog css: '.blog'
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
pp data
|
25
|
+
|
26
|
+
=begin
|
27
|
+
pp data
|
28
|
+
{
|
29
|
+
"headline"=>"Build software better, together.",
|
30
|
+
"subheading"=>
|
31
|
+
"Powerful collaboration, review, and code management for open source and private development projects.",
|
32
|
+
"what_is"=>
|
33
|
+
["Great collaboration starts with communication.",
|
34
|
+
"Manage and contribute from all your devices.",
|
35
|
+
"The world’s largest open source community."],
|
36
|
+
"links"=>
|
37
|
+
{"explore"=>"Love GitHub",
|
38
|
+
"search"=>"Search",
|
39
|
+
"features"=>"Features",
|
40
|
+
"blog"=>"Blog"
|
41
|
+
}
|
42
|
+
}
|
43
|
+
=end
|
data/lib/wombat.rb
CHANGED
@@ -4,12 +4,23 @@ require 'wombat/crawler'
|
|
4
4
|
|
5
5
|
module Wombat
|
6
6
|
class << self
|
7
|
+
|
8
|
+
attr_reader :proxy_args
|
9
|
+
|
7
10
|
def crawl(&block)
|
8
11
|
klass = Class.new
|
9
12
|
klass.send(:include, Wombat::Crawler)
|
10
13
|
klass.new.crawl(&block)
|
11
14
|
end
|
12
|
-
|
15
|
+
|
16
|
+
def configure
|
17
|
+
yield self
|
18
|
+
end
|
19
|
+
|
20
|
+
def set_proxy(*args)
|
21
|
+
@proxy_args = args
|
22
|
+
end
|
23
|
+
|
13
24
|
alias_method :scrape, :crawl
|
14
25
|
end
|
15
26
|
end
|
data/lib/wombat/dsl/follower.rb
CHANGED
@@ -5,12 +5,12 @@ module Wombat
|
|
5
5
|
|
6
6
|
def initialize(name, selector)
|
7
7
|
@wombat_property_selector = selector
|
8
|
-
|
8
|
+
|
9
9
|
super(name)
|
10
10
|
end
|
11
11
|
|
12
12
|
# So that Property::Locators::Iterator can identify this class
|
13
|
-
# as
|
13
|
+
# as a follow property.
|
14
14
|
def wombat_property_format
|
15
15
|
:follow
|
16
16
|
end
|
data/lib/wombat/processing/node_selector.rb
CHANGED
@@ -1,12 +1,23 @@
|
|
1
1
|
module Wombat
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
2
|
+
module Processing
|
3
|
+
module NodeSelector
|
4
|
+
def select_nodes(selector, namespaces = nil)
|
5
|
+
return [selector.to_s] if selector.is_a? Symbol
|
6
|
+
|
7
|
+
if selector.is_a? String
|
8
|
+
return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
|
9
|
+
return @context.css selector[4..-1] if selector.start_with? "css="
|
10
|
+
end
|
11
|
+
|
12
|
+
if selector.is_a? Hash
|
13
|
+
method = selector.flatten.first
|
14
|
+
selector = selector.flatten.last
|
15
|
+
return @context.xpath selector, namespaces if method == :xpath
|
16
|
+
return @context.css selector if method == :css
|
17
|
+
end
|
18
|
+
|
19
|
+
[selector]
|
20
|
+
end
|
21
|
+
end
|
11
22
|
end
|
12
23
|
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -6,7 +6,7 @@ describe 'basic crawler setup' do
|
|
6
6
|
VCR.use_cassette('basic_crawler_page') do
|
7
7
|
crawler = Class.new
|
8
8
|
crawler.send(:include, Wombat::Crawler)
|
9
|
-
|
9
|
+
|
10
10
|
crawler.base_url "http://www.terra.com.br"
|
11
11
|
crawler.path '/portal'
|
12
12
|
|
@@ -32,6 +32,36 @@ describe 'basic crawler setup' do
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
+
it 'should support hash based selectors' do
|
36
|
+
VCR.use_cassette('basic_crawler_page') do
|
37
|
+
crawler = Class.new
|
38
|
+
crawler.send(:include, Wombat::Crawler)
|
39
|
+
|
40
|
+
crawler.base_url "http://www.terra.com.br"
|
41
|
+
crawler.path '/portal'
|
42
|
+
|
43
|
+
crawler.search css: ".btn-search"
|
44
|
+
crawler.social do
|
45
|
+
twitter css: ".ctn-bar li.last"
|
46
|
+
end
|
47
|
+
crawler.links({css: ".ctn-links"}, :iterator) do
|
48
|
+
menu css: "a"
|
49
|
+
end
|
50
|
+
crawler.subheader css: "h2.ttl-dynamic" do |h|
|
51
|
+
h.gsub("London", "Londres")
|
52
|
+
end
|
53
|
+
|
54
|
+
crawler_instance = crawler.new
|
55
|
+
|
56
|
+
results = crawler_instance.crawl
|
57
|
+
|
58
|
+
results["search"].should == "Buscar"
|
59
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
60
|
+
results["subheader"].should == "Londres 2012"
|
61
|
+
results["social"]["twitter"].should == "Verão"
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
35
65
|
it 'should clear iterators between multiple runs' do
|
36
66
|
crawler = Class.new
|
37
67
|
crawler.send(:include, Wombat::Crawler)
|
@@ -56,7 +86,7 @@ describe 'basic crawler setup' do
|
|
56
86
|
VCR.use_cassette('basic_crawler_page') do
|
57
87
|
results = crawler_instance.crawl
|
58
88
|
end
|
59
|
-
|
89
|
+
|
60
90
|
results["links"].should == result_hash
|
61
91
|
end
|
62
92
|
|
@@ -123,7 +153,7 @@ describe 'basic crawler setup' do
|
|
123
153
|
VCR.use_cassette('for_each_page') do
|
124
154
|
crawler = Class.new
|
125
155
|
crawler.send(:include, Wombat::Crawler)
|
126
|
-
|
156
|
+
|
127
157
|
crawler.base_url "https://www.github.com"
|
128
158
|
crawler.path "/explore"
|
129
159
|
|
@@ -151,13 +181,13 @@ describe 'basic crawler setup' do
|
|
151
181
|
VCR.use_cassette('xml_with_namespace') do
|
152
182
|
crawler = Class.new
|
153
183
|
crawler.send(:include, Wombat::Crawler)
|
154
|
-
|
184
|
+
|
155
185
|
crawler.document_format :xml
|
156
186
|
crawler.base_url "http://ws.audioscrobbler.com"
|
157
187
|
crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
|
158
188
|
|
159
189
|
crawler.artist "xpath=//title", :list
|
160
|
-
|
190
|
+
|
161
191
|
crawler.location 'xpath=//event', :iterator do
|
162
192
|
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
163
193
|
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
@@ -168,18 +198,18 @@ describe 'basic crawler setup' do
|
|
168
198
|
iterator = results['location']
|
169
199
|
|
170
200
|
iterator.should == [
|
171
|
-
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
172
|
-
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
173
|
-
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
174
|
-
{"latitude"=>"37.870873", "longitude"=>"-122.269313"},
|
175
|
-
{"latitude"=>"37.782348", "longitude"=>"-122.408059"},
|
176
|
-
{"latitude"=>"37.775529", "longitude"=>"-122.437757"},
|
177
|
-
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
178
|
-
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
179
|
-
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
201
|
+
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
202
|
+
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
203
|
+
{"latitude"=>"37.869784", "longitude"=>"-122.267701"},
|
204
|
+
{"latitude"=>"37.870873", "longitude"=>"-122.269313"},
|
205
|
+
{"latitude"=>"37.782348", "longitude"=>"-122.408059"},
|
206
|
+
{"latitude"=>"37.775529", "longitude"=>"-122.437757"},
|
207
|
+
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
208
|
+
{"latitude"=>"37.771079", "longitude"=>"-122.412604"},
|
209
|
+
{"latitude"=>"37.784963", "longitude"=>"-122.418871"},
|
180
210
|
{"latitude"=>"37.788978", "longitude"=>"-122.40664"}
|
181
211
|
]
|
182
|
-
|
212
|
+
|
183
213
|
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
184
214
|
end
|
185
215
|
end
|
@@ -188,7 +218,7 @@ describe 'basic crawler setup' do
|
|
188
218
|
VCR.use_cassette('follow_links') do
|
189
219
|
crawler = Class.new
|
190
220
|
crawler.send(:include, Wombat::Crawler)
|
191
|
-
|
221
|
+
|
192
222
|
crawler.base_url "https://www.github.com"
|
193
223
|
crawler.path "/"
|
194
224
|
|
@@ -199,7 +229,7 @@ describe 'basic crawler setup' do
|
|
199
229
|
crawler_instance = crawler.new
|
200
230
|
results = crawler_instance.crawl
|
201
231
|
|
202
|
-
results.should == {
|
232
|
+
results.should == {
|
203
233
|
"github" => [
|
204
234
|
{ "heading"=>"GitHub helps people build software together." },
|
205
235
|
{ "heading"=>nil },
|
data/spec/wombat_spec.rb
CHANGED
@@ -17,9 +17,16 @@ describe Wombat do
|
|
17
17
|
Wombat.scrape
|
18
18
|
end
|
19
19
|
|
20
|
+
it 'should provide configuration method with block' do
|
21
|
+
Wombat.configure do |config|
|
22
|
+
config.set_proxy "10.0.0.1", 8080
|
23
|
+
end
|
24
|
+
Wombat.proxy_args.should == ["10.0.0.1", 8080]
|
25
|
+
end
|
26
|
+
|
20
27
|
it 'should accept regular properties (non-selectors)' do
|
21
28
|
VCR.use_cassette('broken_selector') do
|
22
|
-
lambda {
|
29
|
+
lambda {
|
23
30
|
Wombat.crawl do
|
24
31
|
base_url "http://www.github.com"
|
25
32
|
path "/"
|
data/wombat.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "2.1.1"
|
8
|
+
s.version = "2.1.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
@@ -27,6 +27,7 @@ Gem::Specification.new do |s|
|
|
27
27
|
"README.md",
|
28
28
|
"Rakefile",
|
29
29
|
"VERSION",
|
30
|
+
"examples/hashes.rb",
|
30
31
|
"examples/iterator.rb",
|
31
32
|
"examples/list.rb",
|
32
33
|
"examples/no_class.rb",
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- README.md
|
190
190
|
- Rakefile
|
191
191
|
- VERSION
|
192
|
+
- examples/hashes.rb
|
192
193
|
- examples/iterator.rb
|
193
194
|
- examples/list.rb
|
194
195
|
- examples/no_class.rb
|