wombat 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +13 -30
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
- data/lib/wombat/crawler.rb +7 -17
- data/lib/wombat/dsl/follower.rb +19 -0
- data/lib/wombat/dsl/iterator.rb +19 -0
- data/lib/wombat/dsl/metadata.rb +27 -0
- data/lib/wombat/dsl/property.rb +27 -0
- data/lib/wombat/dsl/property_group.rb +48 -0
- data/lib/wombat/processing/node_selector.rb +12 -0
- data/lib/wombat/processing/parser.rb +48 -0
- data/lib/wombat/property/locators/base.rb +33 -0
- data/lib/wombat/property/locators/factory.rb +39 -0
- data/lib/wombat/property/locators/follow.rb +25 -0
- data/lib/wombat/property/locators/html.rb +14 -0
- data/lib/wombat/property/locators/iterator.rb +23 -0
- data/lib/wombat/property/locators/list.rb +17 -0
- data/lib/wombat/property/locators/property_group.rb +20 -0
- data/lib/wombat/property/locators/text.rb +22 -0
- data/lib/wombat.rb +8 -4
- data/spec/crawler_spec.rb +38 -48
- data/spec/dsl/property_spec.rb +12 -0
- data/spec/helpers/sample_crawler.rb +2 -15
- data/spec/integration/integration_spec.rb +61 -33
- data/spec/processing/parser_spec.rb +32 -0
- data/spec/property/locators/factory_spec.rb +18 -0
- data/spec/property/locators/follow_spec.rb +4 -0
- data/spec/property/locators/html_spec.rb +15 -0
- data/spec/property/locators/iterator_spec.rb +4 -0
- data/spec/property/locators/list_spec.rb +13 -0
- data/spec/property/locators/text_spec.rb +49 -0
- data/spec/sample_crawler_spec.rb +7 -11
- data/spec/wombat_spec.rb +13 -1
- data/wombat.gemspec +27 -16
- metadata +27 -16
- data/lib/wombat/iterator.rb +0 -38
- data/lib/wombat/metadata.rb +0 -24
- data/lib/wombat/node_selector.rb +0 -10
- data/lib/wombat/parser.rb +0 -59
- data/lib/wombat/property.rb +0 -21
- data/lib/wombat/property_container.rb +0 -70
- data/lib/wombat/property_locator.rb +0 -20
- data/spec/iterator_spec.rb +0 -52
- data/spec/metadata_spec.rb +0 -20
- data/spec/parser_spec.rb +0 -125
- data/spec/property_container_spec.rb +0 -62
- data/spec/property_locator_spec.rb +0 -75
- data/spec/property_spec.rb +0 -16
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::DSL::Property do
|
4
|
+
it 'should store property data' do
|
5
|
+
property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
|
6
|
+
|
7
|
+
property.wombat_property_name.should == "title"
|
8
|
+
property.selector.should == "/some/selector"
|
9
|
+
property.format.should == :html
|
10
|
+
property.callback.should == lambda { false }
|
11
|
+
end
|
12
|
+
end
|
@@ -5,9 +5,9 @@ class SampleCrawler
|
|
5
5
|
include Wombat::Crawler
|
6
6
|
|
7
7
|
base_url "http://www.obaoba.com.br"
|
8
|
-
|
8
|
+
path "/porto-alegre/agenda"
|
9
9
|
|
10
|
-
|
10
|
+
event_group "css=div.title-agenda", :iterator do
|
11
11
|
event do |e|
|
12
12
|
e.title("xpath=.") { |t| t.split(" | ")[1].strip }
|
13
13
|
e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
|
@@ -19,18 +19,5 @@ class SampleCrawler
|
|
19
19
|
venue do |v|
|
20
20
|
v.name("xpath=.") { |n| name.split(" | ")[2].strip }
|
21
21
|
end
|
22
|
-
|
23
|
-
# follow_links "xpath=.//a[1]/@href" do
|
24
|
-
# event { |e| e.description "css=#main-node-content", :html }
|
25
|
-
# venue do |v|
|
26
|
-
# v.phone "css=span.tel .value"
|
27
|
-
# v.image "xpath=//div[@id='article-image']/div/img/@src"
|
28
|
-
# end
|
29
|
-
|
30
|
-
# location do |l|
|
31
|
-
# l.city "css=span.locality"
|
32
|
-
# l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
-
# end
|
34
|
-
# end
|
35
22
|
end
|
36
23
|
end
|
@@ -8,17 +8,15 @@ describe 'basic crawler setup' do
|
|
8
8
|
crawler.send(:include, Wombat::Crawler)
|
9
9
|
|
10
10
|
crawler.base_url "http://www.terra.com.br"
|
11
|
-
crawler.
|
11
|
+
crawler.path '/portal'
|
12
12
|
|
13
13
|
crawler.search "css=.btn-search"
|
14
|
-
crawler.social do
|
15
|
-
|
14
|
+
crawler.social do
|
15
|
+
twitter "css=.ctn-bar li.last"
|
16
16
|
end
|
17
|
-
|
18
|
-
crawler.for_each "css=.ctn-links" do
|
17
|
+
crawler.links "css=.ctn-links", :iterator do
|
19
18
|
menu "css=a"
|
20
19
|
end
|
21
|
-
|
22
20
|
crawler.subheader "css=h2.ttl-dynamic" do |h|
|
23
21
|
h.gsub("London", "Londres")
|
24
22
|
end
|
@@ -28,7 +26,7 @@ describe 'basic crawler setup' do
|
|
28
26
|
results = crawler_instance.crawl
|
29
27
|
|
30
28
|
results["search"].should == "Buscar"
|
31
|
-
results["
|
29
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
32
30
|
results["subheader"].should == "Londres 2012"
|
33
31
|
results["social"]["twitter"].should == "Verão"
|
34
32
|
end
|
@@ -39,9 +37,9 @@ describe 'basic crawler setup' do
|
|
39
37
|
crawler.send(:include, Wombat::Crawler)
|
40
38
|
|
41
39
|
crawler.base_url "http://www.terra.com.br"
|
42
|
-
crawler.
|
40
|
+
crawler.path '/portal'
|
43
41
|
|
44
|
-
crawler.
|
42
|
+
crawler.links "css=.ctn-links", :iterator do
|
45
43
|
menu "css=a"
|
46
44
|
end
|
47
45
|
|
@@ -53,13 +51,13 @@ describe 'basic crawler setup' do
|
|
53
51
|
results = crawler_instance.crawl
|
54
52
|
end
|
55
53
|
|
56
|
-
results["
|
54
|
+
results["links"].should == result_hash
|
57
55
|
|
58
56
|
VCR.use_cassette('basic_crawler_page') do
|
59
57
|
results = crawler_instance.crawl
|
60
58
|
end
|
61
59
|
|
62
|
-
results["
|
60
|
+
results["links"].should == result_hash
|
63
61
|
end
|
64
62
|
|
65
63
|
it 'should crawl page through block to class instance crawl method' do
|
@@ -69,15 +67,15 @@ describe 'basic crawler setup' do
|
|
69
67
|
crawler_instance = crawler.new
|
70
68
|
results = crawler_instance.crawl do
|
71
69
|
base_url "http://www.terra.com.br"
|
72
|
-
|
70
|
+
path '/portal'
|
73
71
|
|
74
72
|
search "css=.btn-search"
|
75
73
|
|
76
|
-
social do
|
77
|
-
|
74
|
+
social do
|
75
|
+
twitter "css=.ctn-bar li.last"
|
78
76
|
end
|
79
77
|
|
80
|
-
|
78
|
+
links "css=.ctn-links", :iterator do
|
81
79
|
menu "css=a"
|
82
80
|
end
|
83
81
|
|
@@ -87,7 +85,7 @@ describe 'basic crawler setup' do
|
|
87
85
|
end
|
88
86
|
|
89
87
|
results["search"].should == "Buscar"
|
90
|
-
results["
|
88
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
91
89
|
results["subheader"].should == "Londres 2012"
|
92
90
|
results["social"]["twitter"].should == "Verão"
|
93
91
|
end
|
@@ -97,15 +95,15 @@ describe 'basic crawler setup' do
|
|
97
95
|
VCR.use_cassette('basic_crawler_page') do
|
98
96
|
results = Wombat.crawl do
|
99
97
|
base_url "http://www.terra.com.br"
|
100
|
-
|
98
|
+
path '/portal'
|
101
99
|
|
102
100
|
search "css=.btn-search"
|
103
101
|
|
104
|
-
social do
|
105
|
-
|
102
|
+
social do
|
103
|
+
twitter "css=.ctn-bar li.last"
|
106
104
|
end
|
107
105
|
|
108
|
-
|
106
|
+
links "css=.ctn-links", :iterator do
|
109
107
|
menu "css=a"
|
110
108
|
end
|
111
109
|
|
@@ -115,7 +113,7 @@ describe 'basic crawler setup' do
|
|
115
113
|
end
|
116
114
|
|
117
115
|
results["search"].should == "Buscar"
|
118
|
-
results["
|
116
|
+
results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
|
119
117
|
results["subheader"].should == "Londres 2012"
|
120
118
|
results["social"]["twitter"].should == "Verão"
|
121
119
|
end
|
@@ -127,24 +125,24 @@ describe 'basic crawler setup' do
|
|
127
125
|
crawler.send(:include, Wombat::Crawler)
|
128
126
|
|
129
127
|
crawler.base_url "https://www.github.com"
|
130
|
-
crawler.
|
128
|
+
crawler.path "/explore"
|
131
129
|
|
132
|
-
crawler.
|
133
|
-
project do
|
134
|
-
|
135
|
-
|
130
|
+
crawler.repos "css=ol.ranked-repositories>li", :iterator do
|
131
|
+
project do
|
132
|
+
repo 'css=h3'
|
133
|
+
description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
|
136
134
|
end
|
137
135
|
end
|
138
136
|
|
139
|
-
|
140
|
-
results = crawler_instance.crawl
|
137
|
+
results = crawler.new.crawl
|
141
138
|
|
142
|
-
results.should == { "
|
139
|
+
results.should == { "repos" => [
|
143
140
|
{ "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
|
144
141
|
{ "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
|
145
142
|
{ "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
|
146
143
|
{ "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
|
147
|
-
{ "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
|
144
|
+
{ "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
|
145
|
+
{ "project" => { "repo" => nil, "description" => nil}}
|
148
146
|
]}
|
149
147
|
end
|
150
148
|
end
|
@@ -156,18 +154,18 @@ describe 'basic crawler setup' do
|
|
156
154
|
|
157
155
|
crawler.document_format :xml
|
158
156
|
crawler.base_url "http://ws.audioscrobbler.com"
|
159
|
-
crawler.
|
157
|
+
crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
|
160
158
|
|
161
159
|
crawler.artist "xpath=//title", :list
|
162
160
|
|
163
|
-
crawler.
|
161
|
+
crawler.location 'xpath=//event', :iterator do
|
164
162
|
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
165
163
|
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
166
164
|
end
|
167
165
|
|
168
166
|
crawler_instance = crawler.new
|
169
167
|
results = crawler_instance.crawl
|
170
|
-
iterator = results['
|
168
|
+
iterator = results['location']
|
171
169
|
|
172
170
|
iterator.should == [
|
173
171
|
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
|
@@ -185,4 +183,34 @@ describe 'basic crawler setup' do
|
|
185
183
|
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
186
184
|
end
|
187
185
|
end
|
186
|
+
|
187
|
+
it 'should follow links' do
|
188
|
+
VCR.use_cassette('follow_links') do
|
189
|
+
crawler = Class.new
|
190
|
+
crawler.send(:include, Wombat::Crawler)
|
191
|
+
|
192
|
+
crawler.document_format :html
|
193
|
+
crawler.base_url "https://www.github.com"
|
194
|
+
crawler.path "/"
|
195
|
+
|
196
|
+
crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
|
197
|
+
heading 'css=h1'
|
198
|
+
end
|
199
|
+
|
200
|
+
crawler_instance = crawler.new
|
201
|
+
results = crawler_instance.crawl
|
202
|
+
|
203
|
+
results.should == {
|
204
|
+
"github" => [
|
205
|
+
{ "heading"=>"GitHub helps people build software together."},
|
206
|
+
{ "heading"=>nil},
|
207
|
+
{ "heading"=>"Features"},
|
208
|
+
{ "heading"=>"Contact GitHub"},
|
209
|
+
{ "heading"=>"GitHub Training — Git Training from the Experts"},
|
210
|
+
{ "heading"=>"GitHub on Your Servers"},
|
211
|
+
{ "heading"=>"Loading..."}
|
212
|
+
]
|
213
|
+
}
|
214
|
+
end
|
215
|
+
end
|
188
216
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::Processing::Parser do
|
4
|
+
before(:each) do
|
5
|
+
crawler = Class.new
|
6
|
+
crawler.send(:include, Wombat::Processing::Parser)
|
7
|
+
@parser = crawler.new
|
8
|
+
@metadata = Wombat::DSL::Metadata.new
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should request page document with correct url' do
|
12
|
+
@metadata.base_url "http://www.google.com"
|
13
|
+
@metadata.path "/search"
|
14
|
+
fake_document = double :document
|
15
|
+
fake_parser = double :parser
|
16
|
+
fake_document.should_receive(:parser).and_return(fake_parser)
|
17
|
+
@parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
|
18
|
+
|
19
|
+
@parser.parse @metadata
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should correctly parse xml documents' do
|
23
|
+
fake_document = double :xml
|
24
|
+
fake_parser = double :parser
|
25
|
+
@metadata.document_format :xml
|
26
|
+
@parser.mechanize.should_not_receive(:get)
|
27
|
+
RestClient.should_receive(:get).and_return fake_document
|
28
|
+
Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
|
29
|
+
|
30
|
+
@parser.parse @metadata
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::Property::Locators::Factory do
|
4
|
+
it 'should instantiate correct locator according to property type' do
|
5
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :text)).should be_a(Wombat::Property::Locators::Text)
|
6
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :html)).should be_a(Wombat::Property::Locators::Html)
|
7
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :list)).should be_a(Wombat::Property::Locators::List)
|
8
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :follow)).should be_a(Wombat::Property::Locators::Follow)
|
9
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :iterator)).should be_a(Wombat::Property::Locators::Iterator)
|
10
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :container)).should be_a(Wombat::Property::Locators::PropertyGroup)
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should raise correct exception if provided property is of unknown type' do
|
14
|
+
lambda {
|
15
|
+
Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :weird))
|
16
|
+
}.should raise_error(Wombat::Property::Locators::UnknownTypeException, "Unknown property format weird.")
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::Property::Locators::Html do
|
4
|
+
it 'should locate html property' do
|
5
|
+
fake_elem = double :element
|
6
|
+
context = double :context
|
7
|
+
fake_elem.stub inner_html: "Something cool "
|
8
|
+
context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
|
9
|
+
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
10
|
+
|
11
|
+
locator = Wombat::Property::Locators::Html.new(property)
|
12
|
+
|
13
|
+
locator.locate(context).should == { "data1" => "Something cool" }
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::Property::Locators::List do
|
4
|
+
it 'should locate a list of nodes' do
|
5
|
+
context = double :context
|
6
|
+
context.stub(:css).with(".selector").and_return %w(1 2 3 4 5)
|
7
|
+
property = Wombat::DSL::Property.new('data1', 'css=.selector', :list)
|
8
|
+
|
9
|
+
locator = Wombat::Property::Locators::List.new(property)
|
10
|
+
|
11
|
+
locator.locate(context).should == { "data1" => %w(1 2 3 4 5) }
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wombat::Property::Locators::Text do
|
4
|
+
it 'should locate text property with xpath selector and namespaces' do
|
5
|
+
fake_elem = double :element
|
6
|
+
context = double :context
|
7
|
+
fake_elem.stub inner_text: "Something cool "
|
8
|
+
context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
|
9
|
+
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :text, 'boom')
|
10
|
+
|
11
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
12
|
+
|
13
|
+
locator.locate(context).should == { "data1" => "Something cool" }
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should locate text property with css selector' do
|
17
|
+
fake_elem = double :element
|
18
|
+
context = double :context
|
19
|
+
fake_elem.stub inner_text: "My name"
|
20
|
+
context.stub(:css).with("/def").and_return [fake_elem]
|
21
|
+
property = Wombat::DSL::Property.new('data1', 'css=/def', :text)
|
22
|
+
|
23
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
24
|
+
|
25
|
+
locator.locate(context).should == { "data1" => "My name" }
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should return plain symbols as strings' do
|
29
|
+
fake_elem = double :element
|
30
|
+
context = double :context
|
31
|
+
property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
|
32
|
+
|
33
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
34
|
+
|
35
|
+
locator.locate(context).should == { "data_2" => "hardcoded_value" }
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should invoke property callback' do
|
39
|
+
fake_elem = double :element
|
40
|
+
context = double :context
|
41
|
+
fake_elem.stub inner_text: "My name"
|
42
|
+
context.stub(:css).with("/def").and_return [fake_elem]
|
43
|
+
property = Wombat::DSL::Property.new('data1', 'css=/def', :text) { |s| s.gsub(/name/, 'ass') }
|
44
|
+
|
45
|
+
locator = Wombat::Property::Locators::Text.new(property)
|
46
|
+
|
47
|
+
locator.locate(context).should == { "data1" => "My ass" }
|
48
|
+
end
|
49
|
+
end
|
data/spec/sample_crawler_spec.rb
CHANGED
@@ -8,19 +8,15 @@ describe SampleCrawler do
|
|
8
8
|
|
9
9
|
it 'should correctly assign event metadata' do
|
10
10
|
@sample_crawler.should_receive(:parse) do |args|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
it
|
16
|
-
it.
|
17
|
-
it["event"]["title"].selector.should == "xpath=."
|
18
|
-
it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
|
19
|
-
it["event"]["type"].selector.should == "xpath=.type"
|
20
|
-
it["venue"]["name"].selector.should == "xpath=."
|
11
|
+
args['event_group'].wombat_property_selector.should == "css=div.title-agenda"
|
12
|
+
it = args['event_group']
|
13
|
+
it["event"]["title"].wombat_property_selector.should == "xpath=."
|
14
|
+
it["event"]["date"].wombat_property_selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
|
15
|
+
it["event"]["type"].wombat_property_selector.should == "xpath=.type"
|
16
|
+
it["venue"]["name"].wombat_property_selector.should == "xpath=."
|
21
17
|
|
22
18
|
args[:base_url].should == 'http://www.obaoba.com.br'
|
23
|
-
args[:
|
19
|
+
args[:path].should == '/porto-alegre/agenda'
|
24
20
|
end
|
25
21
|
|
26
22
|
@sample_crawler.crawl
|
data/spec/wombat_spec.rb
CHANGED
@@ -5,12 +5,24 @@ describe Wombat do
|
|
5
5
|
Wombat.should respond_to(:crawl)
|
6
6
|
end
|
7
7
|
|
8
|
+
it 'should provide syntactic sugar method Wombat.scrape' do
|
9
|
+
Wombat.should respond_to(:scrape)
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should redirect .scrape to .crawl' do
|
13
|
+
fake_class = double :fake
|
14
|
+
fake_class.stub :include
|
15
|
+
fake_class.should_receive(:new).and_return(stub(crawl: nil))
|
16
|
+
Class.stub :new => fake_class
|
17
|
+
Wombat.scrape
|
18
|
+
end
|
19
|
+
|
8
20
|
it 'should accept regular properties (non-selectors)' do
|
9
21
|
VCR.use_cassette('broken_selector') do
|
10
22
|
lambda {
|
11
23
|
Wombat.crawl do
|
12
24
|
base_url "http://www.github.com"
|
13
|
-
|
25
|
+
path "/"
|
14
26
|
|
15
27
|
source :obaoba
|
16
28
|
description 'Oba Oba'
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "
|
8
|
+
s.version = "2.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-07-31"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -30,26 +30,37 @@ Gem::Specification.new do |s|
|
|
30
30
|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
31
31
|
"fixtures/vcr_cassettes/broken_selector.yml",
|
32
32
|
"fixtures/vcr_cassettes/error_page.yml",
|
33
|
+
"fixtures/vcr_cassettes/follow_links.yml",
|
33
34
|
"fixtures/vcr_cassettes/for_each_page.yml",
|
34
35
|
"fixtures/vcr_cassettes/xml_with_namespace.yml",
|
35
36
|
"lib/wombat.rb",
|
36
37
|
"lib/wombat/crawler.rb",
|
37
|
-
"lib/wombat/
|
38
|
-
"lib/wombat/
|
39
|
-
"lib/wombat/
|
40
|
-
"lib/wombat/
|
41
|
-
"lib/wombat/
|
42
|
-
"lib/wombat/
|
43
|
-
"lib/wombat/
|
38
|
+
"lib/wombat/dsl/follower.rb",
|
39
|
+
"lib/wombat/dsl/iterator.rb",
|
40
|
+
"lib/wombat/dsl/metadata.rb",
|
41
|
+
"lib/wombat/dsl/property.rb",
|
42
|
+
"lib/wombat/dsl/property_group.rb",
|
43
|
+
"lib/wombat/processing/node_selector.rb",
|
44
|
+
"lib/wombat/processing/parser.rb",
|
45
|
+
"lib/wombat/property/locators/base.rb",
|
46
|
+
"lib/wombat/property/locators/factory.rb",
|
47
|
+
"lib/wombat/property/locators/follow.rb",
|
48
|
+
"lib/wombat/property/locators/html.rb",
|
49
|
+
"lib/wombat/property/locators/iterator.rb",
|
50
|
+
"lib/wombat/property/locators/list.rb",
|
51
|
+
"lib/wombat/property/locators/property_group.rb",
|
52
|
+
"lib/wombat/property/locators/text.rb",
|
44
53
|
"spec/crawler_spec.rb",
|
54
|
+
"spec/dsl/property_spec.rb",
|
45
55
|
"spec/helpers/sample_crawler.rb",
|
46
56
|
"spec/integration/integration_spec.rb",
|
47
|
-
"spec/
|
48
|
-
"spec/
|
49
|
-
"spec/
|
50
|
-
"spec/
|
51
|
-
"spec/
|
52
|
-
"spec/
|
57
|
+
"spec/processing/parser_spec.rb",
|
58
|
+
"spec/property/locators/factory_spec.rb",
|
59
|
+
"spec/property/locators/follow_spec.rb",
|
60
|
+
"spec/property/locators/html_spec.rb",
|
61
|
+
"spec/property/locators/iterator_spec.rb",
|
62
|
+
"spec/property/locators/list_spec.rb",
|
63
|
+
"spec/property/locators/text_spec.rb",
|
53
64
|
"spec/sample_crawler_spec.rb",
|
54
65
|
"spec/spec_helper.rb",
|
55
66
|
"spec/wombat_spec.rb",
|
@@ -60,7 +71,7 @@ Gem::Specification.new do |s|
|
|
60
71
|
s.require_paths = ["lib"]
|
61
72
|
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
62
73
|
s.rubygems_version = "1.8.24"
|
63
|
-
s.summary = "Ruby DSL to
|
74
|
+
s.summary = "Ruby DSL to scrape web pages"
|
64
75
|
|
65
76
|
if s.respond_to? :specification_version then
|
66
77
|
s.specification_version = 3
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-07-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -192,26 +192,37 @@ files:
|
|
192
192
|
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
193
193
|
- fixtures/vcr_cassettes/broken_selector.yml
|
194
194
|
- fixtures/vcr_cassettes/error_page.yml
|
195
|
+
- fixtures/vcr_cassettes/follow_links.yml
|
195
196
|
- fixtures/vcr_cassettes/for_each_page.yml
|
196
197
|
- fixtures/vcr_cassettes/xml_with_namespace.yml
|
197
198
|
- lib/wombat.rb
|
198
199
|
- lib/wombat/crawler.rb
|
199
|
-
- lib/wombat/
|
200
|
-
- lib/wombat/
|
201
|
-
- lib/wombat/
|
202
|
-
- lib/wombat/
|
203
|
-
- lib/wombat/
|
204
|
-
- lib/wombat/
|
205
|
-
- lib/wombat/
|
200
|
+
- lib/wombat/dsl/follower.rb
|
201
|
+
- lib/wombat/dsl/iterator.rb
|
202
|
+
- lib/wombat/dsl/metadata.rb
|
203
|
+
- lib/wombat/dsl/property.rb
|
204
|
+
- lib/wombat/dsl/property_group.rb
|
205
|
+
- lib/wombat/processing/node_selector.rb
|
206
|
+
- lib/wombat/processing/parser.rb
|
207
|
+
- lib/wombat/property/locators/base.rb
|
208
|
+
- lib/wombat/property/locators/factory.rb
|
209
|
+
- lib/wombat/property/locators/follow.rb
|
210
|
+
- lib/wombat/property/locators/html.rb
|
211
|
+
- lib/wombat/property/locators/iterator.rb
|
212
|
+
- lib/wombat/property/locators/list.rb
|
213
|
+
- lib/wombat/property/locators/property_group.rb
|
214
|
+
- lib/wombat/property/locators/text.rb
|
206
215
|
- spec/crawler_spec.rb
|
216
|
+
- spec/dsl/property_spec.rb
|
207
217
|
- spec/helpers/sample_crawler.rb
|
208
218
|
- spec/integration/integration_spec.rb
|
209
|
-
- spec/
|
210
|
-
- spec/
|
211
|
-
- spec/
|
212
|
-
- spec/
|
213
|
-
- spec/
|
214
|
-
- spec/
|
219
|
+
- spec/processing/parser_spec.rb
|
220
|
+
- spec/property/locators/factory_spec.rb
|
221
|
+
- spec/property/locators/follow_spec.rb
|
222
|
+
- spec/property/locators/html_spec.rb
|
223
|
+
- spec/property/locators/iterator_spec.rb
|
224
|
+
- spec/property/locators/list_spec.rb
|
225
|
+
- spec/property/locators/text_spec.rb
|
215
226
|
- spec/sample_crawler_spec.rb
|
216
227
|
- spec/spec_helper.rb
|
217
228
|
- spec/wombat_spec.rb
|
@@ -240,5 +251,5 @@ rubyforge_project:
|
|
240
251
|
rubygems_version: 1.8.24
|
241
252
|
signing_key:
|
242
253
|
specification_version: 3
|
243
|
-
summary: Ruby DSL to
|
254
|
+
summary: Ruby DSL to scrape web pages
|
244
255
|
test_files: []
|
data/lib/wombat/iterator.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
module Wombat
|
2
|
-
class Iterator < PropertyContainer
|
3
|
-
attr_accessor :selector
|
4
|
-
|
5
|
-
def initialize(selector)
|
6
|
-
@selector = selector
|
7
|
-
super()
|
8
|
-
end
|
9
|
-
|
10
|
-
def parse
|
11
|
-
raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
|
12
|
-
|
13
|
-
all_properties.each do |p|
|
14
|
-
p.result ||= []
|
15
|
-
result = yield p
|
16
|
-
if result
|
17
|
-
result = p.callback ? p.callback.call(result) : result
|
18
|
-
p.result << result
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def reset
|
24
|
-
all_properties.each { |p| p.reset }
|
25
|
-
end
|
26
|
-
|
27
|
-
def flatten(depth = nil)
|
28
|
-
# determine the iterator length by the biggest property array that we have
|
29
|
-
length = all_properties.map(&:result).sort { |a| a.length }.last.size
|
30
|
-
|
31
|
-
Array.new.tap do |a|
|
32
|
-
length.times do |i|
|
33
|
-
a << super(i)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|