wombat 1.0.0 → 2.0.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registries.
- data/README.md +13 -30
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
- data/lib/wombat/crawler.rb +7 -17
- data/lib/wombat/dsl/follower.rb +19 -0
- data/lib/wombat/dsl/iterator.rb +19 -0
- data/lib/wombat/dsl/metadata.rb +27 -0
- data/lib/wombat/dsl/property.rb +27 -0
- data/lib/wombat/dsl/property_group.rb +48 -0
- data/lib/wombat/processing/node_selector.rb +12 -0
- data/lib/wombat/processing/parser.rb +48 -0
- data/lib/wombat/property/locators/base.rb +33 -0
- data/lib/wombat/property/locators/factory.rb +39 -0
- data/lib/wombat/property/locators/follow.rb +25 -0
- data/lib/wombat/property/locators/html.rb +14 -0
- data/lib/wombat/property/locators/iterator.rb +23 -0
- data/lib/wombat/property/locators/list.rb +17 -0
- data/lib/wombat/property/locators/property_group.rb +20 -0
- data/lib/wombat/property/locators/text.rb +22 -0
- data/lib/wombat.rb +8 -4
- data/spec/crawler_spec.rb +38 -48
- data/spec/dsl/property_spec.rb +12 -0
- data/spec/helpers/sample_crawler.rb +2 -15
- data/spec/integration/integration_spec.rb +61 -33
- data/spec/processing/parser_spec.rb +32 -0
- data/spec/property/locators/factory_spec.rb +18 -0
- data/spec/property/locators/follow_spec.rb +4 -0
- data/spec/property/locators/html_spec.rb +15 -0
- data/spec/property/locators/iterator_spec.rb +4 -0
- data/spec/property/locators/list_spec.rb +13 -0
- data/spec/property/locators/text_spec.rb +49 -0
- data/spec/sample_crawler_spec.rb +7 -11
- data/spec/wombat_spec.rb +13 -1
- data/wombat.gemspec +27 -16
- metadata +27 -16
- data/lib/wombat/iterator.rb +0 -38
- data/lib/wombat/metadata.rb +0 -24
- data/lib/wombat/node_selector.rb +0 -10
- data/lib/wombat/parser.rb +0 -59
- data/lib/wombat/property.rb +0 -21
- data/lib/wombat/property_container.rb +0 -70
- data/lib/wombat/property_locator.rb +0 -20
- data/spec/iterator_spec.rb +0 -52
- data/spec/metadata_spec.rb +0 -20
- data/spec/parser_spec.rb +0 -125
- data/spec/property_container_spec.rb +0 -62
- data/spec/property_locator_spec.rb +0 -75
- data/spec/property_spec.rb +0 -16
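The user-visible changes in 2.0.0 are the reorganized source tree (lib/wombat/dsl, lib/wombat/processing and lib/wombat/property/locators) and the reworked DSL: iteration and link following are now expressed as property formats (:iterator, :follow) rather than dedicated keywords such as for_each, and a path property appears alongside base_url. As a minimal sketch pieced together from the updated specs below (the selectors, URLs and result shapes are the spec's own fixtures, not a recommended example), a 2.0.0 crawl looks roughly like this:

require 'wombat'

results = Wombat.crawl do
  base_url "http://www.terra.com.br"
  path "/portal"

  search "css=.btn-search"

  social do
    twitter "css=.ctn-bar li.last"
  end

  links "css=.ctn-links", :iterator do
    menu "css=a"
  end
end

# Per the integration spec, results is a plain hash: results["links"] is an array of
# {"menu" => ...} hashes and results["social"]["twitter"] is a single string.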

data/spec/dsl/property_spec.rb
ADDED
@@ -0,0 +1,12 @@
+require 'spec_helper'
+
+describe Wombat::DSL::Property do
+  it 'should store property data' do
+    property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
+
+    property.wombat_property_name.should == "title"
+    property.selector.should == "/some/selector"
+    property.format.should == :html
+    property.callback.should == lambda { false }
+  end
+end

data/spec/helpers/sample_crawler.rb
CHANGED
@@ -5,9 +5,9 @@ class SampleCrawler
  include Wombat::Crawler

  base_url "http://www.obaoba.com.br"
-
+  path "/porto-alegre/agenda"

-
+  event_group "css=div.title-agenda", :iterator do
    event do |e|
      e.title("xpath=.") { |t| t.split(" | ")[1].strip }
      e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
@@ -19,18 +19,5 @@ class SampleCrawler
    venue do |v|
      v.name("xpath=.") { |n| name.split(" | ")[2].strip }
    end
-
-    # follow_links "xpath=.//a[1]/@href" do
-    #   event { |e| e.description "css=#main-node-content", :html }
-    #   venue do |v|
-    #     v.phone "css=span.tel .value"
-    #     v.image "xpath=//div[@id='article-image']/div/img/@src"
-    #   end
-
-    #   location do |l|
-    #     l.city "css=span.locality"
-    #     l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
-    #   end
-    # end
  end
end

data/spec/integration/integration_spec.rb
CHANGED
@@ -8,17 +8,15 @@ describe 'basic crawler setup' do
    crawler.send(:include, Wombat::Crawler)

    crawler.base_url "http://www.terra.com.br"
-    crawler.
+    crawler.path '/portal'

    crawler.search "css=.btn-search"
-    crawler.social do
-
+    crawler.social do
+      twitter "css=.ctn-bar li.last"
    end
-
-    crawler.for_each "css=.ctn-links" do
+    crawler.links "css=.ctn-links", :iterator do
      menu "css=a"
    end
-
    crawler.subheader "css=h2.ttl-dynamic" do |h|
      h.gsub("London", "Londres")
    end
@@ -28,7 +26,7 @@
    results = crawler_instance.crawl

    results["search"].should == "Buscar"
-    results["
+    results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
    results["subheader"].should == "Londres 2012"
    results["social"]["twitter"].should == "Verão"
  end
@@ -39,9 +37,9 @@
    crawler.send(:include, Wombat::Crawler)

    crawler.base_url "http://www.terra.com.br"
-    crawler.
+    crawler.path '/portal'

-    crawler.
+    crawler.links "css=.ctn-links", :iterator do
      menu "css=a"
    end

@@ -53,13 +51,13 @@
      results = crawler_instance.crawl
    end

-    results["
+    results["links"].should == result_hash

    VCR.use_cassette('basic_crawler_page') do
      results = crawler_instance.crawl
    end

-    results["
+    results["links"].should == result_hash
  end

  it 'should crawl page through block to class instance crawl method' do
@@ -69,15 +67,15 @@
    crawler_instance = crawler.new
    results = crawler_instance.crawl do
      base_url "http://www.terra.com.br"
-
+      path '/portal'

      search "css=.btn-search"

-      social do
-
+      social do
+        twitter "css=.ctn-bar li.last"
      end

-
+      links "css=.ctn-links", :iterator do
        menu "css=a"
      end

@@ -87,7 +85,7 @@
    end

    results["search"].should == "Buscar"
-    results["
+    results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
    results["subheader"].should == "Londres 2012"
    results["social"]["twitter"].should == "Verão"
  end
@@ -97,15 +95,15 @@
    VCR.use_cassette('basic_crawler_page') do
      results = Wombat.crawl do
        base_url "http://www.terra.com.br"
-
+        path '/portal'

        search "css=.btn-search"

-        social do
-
+        social do
+          twitter "css=.ctn-bar li.last"
        end

-
+        links "css=.ctn-links", :iterator do
          menu "css=a"
        end

@@ -115,7 +113,7 @@
      end

      results["search"].should == "Buscar"
-      results["
+      results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
      results["subheader"].should == "Londres 2012"
      results["social"]["twitter"].should == "Verão"
    end
@@ -127,24 +125,24 @@
      crawler.send(:include, Wombat::Crawler)

      crawler.base_url "https://www.github.com"
-      crawler.
+      crawler.path "/explore"

-      crawler.
-      project do
-
-
+      crawler.repos "css=ol.ranked-repositories>li", :iterator do
+        project do
+          repo 'css=h3'
+          description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
        end
      end

-
-      results = crawler_instance.crawl
+      results = crawler.new.crawl

-      results.should == { "
+      results.should == { "repos" => [
        { "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
        { "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
        { "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
        { "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
-      { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
+        { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
+        { "project" => { "repo" => nil, "description" => nil}}
      ]}
    end
  end
@@ -156,18 +154,18 @@

      crawler.document_format :xml
      crawler.base_url "http://ws.audioscrobbler.com"
-      crawler.
+      crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"

      crawler.artist "xpath=//title", :list

-      crawler.
+      crawler.location 'xpath=//event', :iterator do
        latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
        longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
      end

      crawler_instance = crawler.new
      results = crawler_instance.crawl
-      iterator = results['
+      iterator = results['location']

      iterator.should == [
        {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
@@ -185,4 +183,34 @@
      results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
    end
  end
+
+  it 'should follow links' do
+    VCR.use_cassette('follow_links') do
+      crawler = Class.new
+      crawler.send(:include, Wombat::Crawler)
+
+      crawler.document_format :html
+      crawler.base_url "https://www.github.com"
+      crawler.path "/"
+
+      crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
+        heading 'css=h1'
+      end
+
+      crawler_instance = crawler.new
+      results = crawler_instance.crawl
+
+      results.should == {
+        "github" => [
+          { "heading"=>"GitHub helps people build software together."},
+          { "heading"=>nil},
+          { "heading"=>"Features"},
+          { "heading"=>"Contact GitHub"},
+          { "heading"=>"GitHub Training — Git Training from the Experts"},
+          { "heading"=>"GitHub on Your Servers"},
+          { "heading"=>"Loading..."}
+        ]
+      }
+    end
+  end
end

data/spec/processing/parser_spec.rb
ADDED
@@ -0,0 +1,32 @@
+require 'spec_helper'
+
+describe Wombat::Processing::Parser do
+  before(:each) do
+    crawler = Class.new
+    crawler.send(:include, Wombat::Processing::Parser)
+    @parser = crawler.new
+    @metadata = Wombat::DSL::Metadata.new
+  end
+
+  it 'should request page document with correct url' do
+    @metadata.base_url "http://www.google.com"
+    @metadata.path "/search"
+    fake_document = double :document
+    fake_parser = double :parser
+    fake_document.should_receive(:parser).and_return(fake_parser)
+    @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
+
+    @parser.parse @metadata
+  end
+
+  it 'should correctly parse xml documents' do
+    fake_document = double :xml
+    fake_parser = double :parser
+    @metadata.document_format :xml
+    @parser.mechanize.should_not_receive(:get)
+    RestClient.should_receive(:get).and_return fake_document
+    Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
+
+    @parser.parse @metadata
+  end
+end

data/spec/property/locators/factory_spec.rb
ADDED
@@ -0,0 +1,18 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Factory do
+  it 'should instantiate correct locator according to property type' do
+    Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :text)).should be_a(Wombat::Property::Locators::Text)
+    Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :html)).should be_a(Wombat::Property::Locators::Html)
+    Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :list)).should be_a(Wombat::Property::Locators::List)
+    Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :follow)).should be_a(Wombat::Property::Locators::Follow)
+    Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :iterator)).should be_a(Wombat::Property::Locators::Iterator)
+    Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :container)).should be_a(Wombat::Property::Locators::PropertyGroup)
+  end
+
+  it 'should raise correct exception if provided property is of unknown type' do
+    lambda {
+      Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :weird))
+    }.should raise_error(Wombat::Property::Locators::UnknownTypeException, "Unknown property format weird.")
+  end
+end

data/spec/property/locators/html_spec.rb
ADDED
@@ -0,0 +1,15 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Html do
+  it 'should locate html property' do
+    fake_elem = double :element
+    context = double :context
+    fake_elem.stub inner_html: "Something cool "
+    context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
+    property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
+
+    locator = Wombat::Property::Locators::Html.new(property)
+
+    locator.locate(context).should == { "data1" => "Something cool" }
+  end
+end

data/spec/property/locators/list_spec.rb
ADDED
@@ -0,0 +1,13 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::List do
+  it 'should locate a list of nodes' do
+    context = double :context
+    context.stub(:css).with(".selector").and_return %w(1 2 3 4 5)
+    property = Wombat::DSL::Property.new('data1', 'css=.selector', :list)
+
+    locator = Wombat::Property::Locators::List.new(property)
+
+    locator.locate(context).should == { "data1" => %w(1 2 3 4 5) }
+  end
+end

data/spec/property/locators/text_spec.rb
ADDED
@@ -0,0 +1,49 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Text do
+  it 'should locate text property with xpath selector and namespaces' do
+    fake_elem = double :element
+    context = double :context
+    fake_elem.stub inner_text: "Something cool "
+    context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
+    property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :text, 'boom')
+
+    locator = Wombat::Property::Locators::Text.new(property)
+
+    locator.locate(context).should == { "data1" => "Something cool" }
+  end
+
+  it 'should locate text property with css selector' do
+    fake_elem = double :element
+    context = double :context
+    fake_elem.stub inner_text: "My name"
+    context.stub(:css).with("/def").and_return [fake_elem]
+    property = Wombat::DSL::Property.new('data1', 'css=/def', :text)
+
+    locator = Wombat::Property::Locators::Text.new(property)
+
+    locator.locate(context).should == { "data1" => "My name" }
+  end
+
+  it 'should return plain symbols as strings' do
+    fake_elem = double :element
+    context = double :context
+    property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
+
+    locator = Wombat::Property::Locators::Text.new(property)
+
+    locator.locate(context).should == { "data_2" => "hardcoded_value" }
+  end
+
+  it 'should invoke property callback' do
+    fake_elem = double :element
+    context = double :context
+    fake_elem.stub inner_text: "My name"
+    context.stub(:css).with("/def").and_return [fake_elem]
+    property = Wombat::DSL::Property.new('data1', 'css=/def', :text) { |s| s.gsub(/name/, 'ass') }
+
+    locator = Wombat::Property::Locators::Text.new(property)
+
+    locator.locate(context).should == { "data1" => "My ass" }
+  end
+end
data/spec/sample_crawler_spec.rb
CHANGED
@@ -8,19 +8,15 @@ describe SampleCrawler

  it 'should correctly assign event metadata' do
    @sample_crawler.should_receive(:parse) do |args|
-
-
-
-
-      it
-      it.
-      it["event"]["title"].selector.should == "xpath=."
-      it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
-      it["event"]["type"].selector.should == "xpath=.type"
-      it["venue"]["name"].selector.should == "xpath=."
+      args['event_group'].wombat_property_selector.should == "css=div.title-agenda"
+      it = args['event_group']
+      it["event"]["title"].wombat_property_selector.should == "xpath=."
+      it["event"]["date"].wombat_property_selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
+      it["event"]["type"].wombat_property_selector.should == "xpath=.type"
+      it["venue"]["name"].wombat_property_selector.should == "xpath=."

      args[:base_url].should == 'http://www.obaoba.com.br'
-      args[:
+      args[:path].should == '/porto-alegre/agenda'
    end

    @sample_crawler.crawl
data/spec/wombat_spec.rb
CHANGED
@@ -5,12 +5,24 @@ describe Wombat
    Wombat.should respond_to(:crawl)
  end

+  it 'should provide syntactic sugar method Wombat.scrape' do
+    Wombat.should respond_to(:scrape)
+  end
+
+  it 'should redirect .scrape to .crawl' do
+    fake_class = double :fake
+    fake_class.stub :include
+    fake_class.should_receive(:new).and_return(stub(crawl: nil))
+    Class.stub :new => fake_class
+    Wombat.scrape
+  end
+
  it 'should accept regular properties (non-selectors)' do
    VCR.use_cassette('broken_selector') do
      lambda {
        Wombat.crawl do
          base_url "http://www.github.com"
-
+          path "/"

          source :obaoba
          description 'Oba Oba'
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@

Gem::Specification.new do |s|
  s.name = "wombat"
-  s.version = "
+  s.version = "2.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Felipe Lima"]
-  s.date = "2012-
+  s.date = "2012-07-31"
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
  s.email = "felipe.lima@gmail.com"
  s.extra_rdoc_files = [
@@ -30,26 +30,37 @@ Gem::Specification.new do |s|
    "fixtures/vcr_cassettes/basic_crawler_page.yml",
    "fixtures/vcr_cassettes/broken_selector.yml",
    "fixtures/vcr_cassettes/error_page.yml",
+    "fixtures/vcr_cassettes/follow_links.yml",
    "fixtures/vcr_cassettes/for_each_page.yml",
    "fixtures/vcr_cassettes/xml_with_namespace.yml",
    "lib/wombat.rb",
    "lib/wombat/crawler.rb",
-    "lib/wombat/
-    "lib/wombat/
-    "lib/wombat/
-    "lib/wombat/
-    "lib/wombat/
-    "lib/wombat/
-    "lib/wombat/
+    "lib/wombat/dsl/follower.rb",
+    "lib/wombat/dsl/iterator.rb",
+    "lib/wombat/dsl/metadata.rb",
+    "lib/wombat/dsl/property.rb",
+    "lib/wombat/dsl/property_group.rb",
+    "lib/wombat/processing/node_selector.rb",
+    "lib/wombat/processing/parser.rb",
+    "lib/wombat/property/locators/base.rb",
+    "lib/wombat/property/locators/factory.rb",
+    "lib/wombat/property/locators/follow.rb",
+    "lib/wombat/property/locators/html.rb",
+    "lib/wombat/property/locators/iterator.rb",
+    "lib/wombat/property/locators/list.rb",
+    "lib/wombat/property/locators/property_group.rb",
+    "lib/wombat/property/locators/text.rb",
    "spec/crawler_spec.rb",
+    "spec/dsl/property_spec.rb",
    "spec/helpers/sample_crawler.rb",
    "spec/integration/integration_spec.rb",
-    "spec/
-    "spec/
-    "spec/
-    "spec/
-    "spec/
-    "spec/
+    "spec/processing/parser_spec.rb",
+    "spec/property/locators/factory_spec.rb",
+    "spec/property/locators/follow_spec.rb",
+    "spec/property/locators/html_spec.rb",
+    "spec/property/locators/iterator_spec.rb",
+    "spec/property/locators/list_spec.rb",
+    "spec/property/locators/text_spec.rb",
    "spec/sample_crawler_spec.rb",
    "spec/spec_helper.rb",
    "spec/wombat_spec.rb",
@@ -60,7 +71,7 @@
  s.require_paths = ["lib"]
  s.required_ruby_version = Gem::Requirement.new(">= 1.9")
  s.rubygems_version = "1.8.24"
-  s.summary = "Ruby DSL to
+  s.summary = "Ruby DSL to scrape web pages"

  if s.respond_to? :specification_version then
    s.specification_version = 3
metadata
CHANGED
@@ -1,7 +1,7 @@
--- !ruby/object:Gem::Specification
name: wombat
version: !ruby/object:Gem::Version
-  version:
+  version: 2.0.0
prerelease:
platform: ruby
authors:
@@ -9,7 +9,7 @@ authors:
autorequire:
bindir: bin
cert_chain: []
- date: 2012-
+ date: 2012-07-31 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: mechanize
@@ -192,26 +192,37 @@ files:
- fixtures/vcr_cassettes/basic_crawler_page.yml
- fixtures/vcr_cassettes/broken_selector.yml
- fixtures/vcr_cassettes/error_page.yml
+- fixtures/vcr_cassettes/follow_links.yml
- fixtures/vcr_cassettes/for_each_page.yml
- fixtures/vcr_cassettes/xml_with_namespace.yml
- lib/wombat.rb
- lib/wombat/crawler.rb
-- lib/wombat/
-- lib/wombat/
-- lib/wombat/
-- lib/wombat/
-- lib/wombat/
-- lib/wombat/
-- lib/wombat/
+- lib/wombat/dsl/follower.rb
+- lib/wombat/dsl/iterator.rb
+- lib/wombat/dsl/metadata.rb
+- lib/wombat/dsl/property.rb
+- lib/wombat/dsl/property_group.rb
+- lib/wombat/processing/node_selector.rb
+- lib/wombat/processing/parser.rb
+- lib/wombat/property/locators/base.rb
+- lib/wombat/property/locators/factory.rb
+- lib/wombat/property/locators/follow.rb
+- lib/wombat/property/locators/html.rb
+- lib/wombat/property/locators/iterator.rb
+- lib/wombat/property/locators/list.rb
+- lib/wombat/property/locators/property_group.rb
+- lib/wombat/property/locators/text.rb
- spec/crawler_spec.rb
+- spec/dsl/property_spec.rb
- spec/helpers/sample_crawler.rb
- spec/integration/integration_spec.rb
-- spec/
-- spec/
-- spec/
-- spec/
-- spec/
-- spec/
+- spec/processing/parser_spec.rb
+- spec/property/locators/factory_spec.rb
+- spec/property/locators/follow_spec.rb
+- spec/property/locators/html_spec.rb
+- spec/property/locators/iterator_spec.rb
+- spec/property/locators/list_spec.rb
+- spec/property/locators/text_spec.rb
- spec/sample_crawler_spec.rb
- spec/spec_helper.rb
- spec/wombat_spec.rb
@@ -240,5 +251,5 @@ rubyforge_project:
rubygems_version: 1.8.24
signing_key:
specification_version: 3
- summary: Ruby DSL to
+ summary: Ruby DSL to scrape web pages
test_files: []
data/lib/wombat/iterator.rb
DELETED
@@ -1,38 +0,0 @@
-module Wombat
-  class Iterator < PropertyContainer
-    attr_accessor :selector
-
-    def initialize(selector)
-      @selector = selector
-      super()
-    end
-
-    def parse
-      raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
-
-      all_properties.each do |p|
-        p.result ||= []
-        result = yield p
-        if result
-          result = p.callback ? p.callback.call(result) : result
-          p.result << result
-        end
-      end
-    end
-
-    def reset
-      all_properties.each { |p| p.reset }
-    end
-
-    def flatten(depth = nil)
-      # determine the iterator length by the biggest property array that we have
-      length = all_properties.map(&:result).sort { |a| a.length }.last.size
-
-      Array.new.tap do |a|
-        length.times do |i|
-          a << super(i)
-        end
-      end
-    end
-  end
-end