wombat 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/README.md +13 -30
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
  5. data/lib/wombat/crawler.rb +7 -17
  6. data/lib/wombat/dsl/follower.rb +19 -0
  7. data/lib/wombat/dsl/iterator.rb +19 -0
  8. data/lib/wombat/dsl/metadata.rb +27 -0
  9. data/lib/wombat/dsl/property.rb +27 -0
  10. data/lib/wombat/dsl/property_group.rb +48 -0
  11. data/lib/wombat/processing/node_selector.rb +12 -0
  12. data/lib/wombat/processing/parser.rb +48 -0
  13. data/lib/wombat/property/locators/base.rb +33 -0
  14. data/lib/wombat/property/locators/factory.rb +39 -0
  15. data/lib/wombat/property/locators/follow.rb +25 -0
  16. data/lib/wombat/property/locators/html.rb +14 -0
  17. data/lib/wombat/property/locators/iterator.rb +23 -0
  18. data/lib/wombat/property/locators/list.rb +17 -0
  19. data/lib/wombat/property/locators/property_group.rb +20 -0
  20. data/lib/wombat/property/locators/text.rb +22 -0
  21. data/lib/wombat.rb +8 -4
  22. data/spec/crawler_spec.rb +38 -48
  23. data/spec/dsl/property_spec.rb +12 -0
  24. data/spec/helpers/sample_crawler.rb +2 -15
  25. data/spec/integration/integration_spec.rb +61 -33
  26. data/spec/processing/parser_spec.rb +32 -0
  27. data/spec/property/locators/factory_spec.rb +18 -0
  28. data/spec/property/locators/follow_spec.rb +4 -0
  29. data/spec/property/locators/html_spec.rb +15 -0
  30. data/spec/property/locators/iterator_spec.rb +4 -0
  31. data/spec/property/locators/list_spec.rb +13 -0
  32. data/spec/property/locators/text_spec.rb +49 -0
  33. data/spec/sample_crawler_spec.rb +7 -11
  34. data/spec/wombat_spec.rb +13 -1
  35. data/wombat.gemspec +27 -16
  36. metadata +27 -16
  37. data/lib/wombat/iterator.rb +0 -38
  38. data/lib/wombat/metadata.rb +0 -24
  39. data/lib/wombat/node_selector.rb +0 -10
  40. data/lib/wombat/parser.rb +0 -59
  41. data/lib/wombat/property.rb +0 -21
  42. data/lib/wombat/property_container.rb +0 -70
  43. data/lib/wombat/property_locator.rb +0 -20
  44. data/spec/iterator_spec.rb +0 -52
  45. data/spec/metadata_spec.rb +0 -20
  46. data/spec/parser_spec.rb +0 -125
  47. data/spec/property_container_spec.rb +0 -62
  48. data/spec/property_locator_spec.rb +0 -75
  49. data/spec/property_spec.rb +0 -16
@@ -0,0 +1,12 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::DSL::Property do
4
+ it 'should store property data' do
5
+ property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
6
+
7
+ property.wombat_property_name.should == "title"
8
+ property.selector.should == "/some/selector"
9
+ property.format.should == :html
10
+ property.callback.should == lambda { false }
11
+ end
12
+ end
@@ -5,9 +5,9 @@ class SampleCrawler
5
5
  include Wombat::Crawler
6
6
 
7
7
  base_url "http://www.obaoba.com.br"
8
- list_page "/porto-alegre/agenda"
8
+ path "/porto-alegre/agenda"
9
9
 
10
- for_each "css=div.title-agenda" do
10
+ event_group "css=div.title-agenda", :iterator do
11
11
  event do |e|
12
12
  e.title("xpath=.") { |t| t.split(" | ")[1].strip }
13
13
  e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
@@ -19,18 +19,5 @@ class SampleCrawler
19
19
  venue do |v|
20
20
  v.name("xpath=.") { |n| name.split(" | ")[2].strip }
21
21
  end
22
-
23
- # follow_links "xpath=.//a[1]/@href" do
24
- # event { |e| e.description "css=#main-node-content", :html }
25
- # venue do |v|
26
- # v.phone "css=span.tel .value"
27
- # v.image "xpath=//div[@id='article-image']/div/img/@src"
28
- # end
29
-
30
- # location do |l|
31
- # l.city "css=span.locality"
32
- # l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
33
- # end
34
- # end
35
22
  end
36
23
  end
@@ -8,17 +8,15 @@ describe 'basic crawler setup' do
8
8
  crawler.send(:include, Wombat::Crawler)
9
9
 
10
10
  crawler.base_url "http://www.terra.com.br"
11
- crawler.list_page '/portal'
11
+ crawler.path '/portal'
12
12
 
13
13
  crawler.search "css=.btn-search"
14
- crawler.social do |s|
15
- s.twitter "css=.ctn-bar li.last"
14
+ crawler.social do
15
+ twitter "css=.ctn-bar li.last"
16
16
  end
17
-
18
- crawler.for_each "css=.ctn-links" do
17
+ crawler.links "css=.ctn-links", :iterator do
19
18
  menu "css=a"
20
19
  end
21
-
22
20
  crawler.subheader "css=h2.ttl-dynamic" do |h|
23
21
  h.gsub("London", "Londres")
24
22
  end
@@ -28,7 +26,7 @@ describe 'basic crawler setup' do
28
26
  results = crawler_instance.crawl
29
27
 
30
28
  results["search"].should == "Buscar"
31
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
29
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
32
30
  results["subheader"].should == "Londres 2012"
33
31
  results["social"]["twitter"].should == "Verão"
34
32
  end
@@ -39,9 +37,9 @@ describe 'basic crawler setup' do
39
37
  crawler.send(:include, Wombat::Crawler)
40
38
 
41
39
  crawler.base_url "http://www.terra.com.br"
42
- crawler.list_page '/portal'
40
+ crawler.path '/portal'
43
41
 
44
- crawler.for_each "css=.ctn-links" do
42
+ crawler.links "css=.ctn-links", :iterator do
45
43
  menu "css=a"
46
44
  end
47
45
 
@@ -53,13 +51,13 @@ describe 'basic crawler setup' do
53
51
  results = crawler_instance.crawl
54
52
  end
55
53
 
56
- results["iterator0"].should == result_hash
54
+ results["links"].should == result_hash
57
55
 
58
56
  VCR.use_cassette('basic_crawler_page') do
59
57
  results = crawler_instance.crawl
60
58
  end
61
59
 
62
- results["iterator0"].should == result_hash
60
+ results["links"].should == result_hash
63
61
  end
64
62
 
65
63
  it 'should crawl page through block to class instance crawl method' do
@@ -69,15 +67,15 @@ describe 'basic crawler setup' do
69
67
  crawler_instance = crawler.new
70
68
  results = crawler_instance.crawl do
71
69
  base_url "http://www.terra.com.br"
72
- list_page '/portal'
70
+ path '/portal'
73
71
 
74
72
  search "css=.btn-search"
75
73
 
76
- social do |s|
77
- s.twitter "css=.ctn-bar li.last"
74
+ social do
75
+ twitter "css=.ctn-bar li.last"
78
76
  end
79
77
 
80
- for_each "css=.ctn-links" do
78
+ links "css=.ctn-links", :iterator do
81
79
  menu "css=a"
82
80
  end
83
81
 
@@ -87,7 +85,7 @@ describe 'basic crawler setup' do
87
85
  end
88
86
 
89
87
  results["search"].should == "Buscar"
90
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
88
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
91
89
  results["subheader"].should == "Londres 2012"
92
90
  results["social"]["twitter"].should == "Verão"
93
91
  end
@@ -97,15 +95,15 @@ describe 'basic crawler setup' do
97
95
  VCR.use_cassette('basic_crawler_page') do
98
96
  results = Wombat.crawl do
99
97
  base_url "http://www.terra.com.br"
100
- list_page '/portal'
98
+ path '/portal'
101
99
 
102
100
  search "css=.btn-search"
103
101
 
104
- social do |s|
105
- s.twitter "css=.ctn-bar li.last"
102
+ social do
103
+ twitter "css=.ctn-bar li.last"
106
104
  end
107
105
 
108
- for_each "css=.ctn-links" do
106
+ links "css=.ctn-links", :iterator do
109
107
  menu "css=a"
110
108
  end
111
109
 
@@ -115,7 +113,7 @@ describe 'basic crawler setup' do
115
113
  end
116
114
 
117
115
  results["search"].should == "Buscar"
118
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
116
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
119
117
  results["subheader"].should == "Londres 2012"
120
118
  results["social"]["twitter"].should == "Verão"
121
119
  end
@@ -127,24 +125,24 @@ describe 'basic crawler setup' do
127
125
  crawler.send(:include, Wombat::Crawler)
128
126
 
129
127
  crawler.base_url "https://www.github.com"
130
- crawler.list_page "/explore"
128
+ crawler.path "/explore"
131
129
 
132
- crawler.for_each "css=ol.ranked-repositories li" do
133
- project do |p|
134
- p.repo 'css=h3'
135
- p.description('css=p.description') { |d| d.gsub(/for/, '') }
130
+ crawler.repos "css=ol.ranked-repositories>li", :iterator do
131
+ project do
132
+ repo 'css=h3'
133
+ description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
136
134
  end
137
135
  end
138
136
 
139
- crawler_instance = crawler.new
140
- results = crawler_instance.crawl
137
+ results = crawler.new.crawl
141
138
 
142
- results.should == { "iterator0" => [
139
+ results.should == { "repos" => [
143
140
  { "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
144
141
  { "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
145
142
  { "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
146
143
  { "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
147
- { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
144
+ { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
145
+ { "project" => { "repo" => nil, "description" => nil}}
148
146
  ]}
149
147
  end
150
148
  end
@@ -156,18 +154,18 @@ describe 'basic crawler setup' do
156
154
 
157
155
  crawler.document_format :xml
158
156
  crawler.base_url "http://ws.audioscrobbler.com"
159
- crawler.list_page "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
157
+ crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
160
158
 
161
159
  crawler.artist "xpath=//title", :list
162
160
 
163
- crawler.for_each 'xpath=//event' do
161
+ crawler.location 'xpath=//event', :iterator do
164
162
  latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
165
163
  longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
166
164
  end
167
165
 
168
166
  crawler_instance = crawler.new
169
167
  results = crawler_instance.crawl
170
- iterator = results['iterator0']
168
+ iterator = results['location']
171
169
 
172
170
  iterator.should == [
173
171
  {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
@@ -185,4 +183,34 @@ describe 'basic crawler setup' do
185
183
  results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
186
184
  end
187
185
  end
186
+
187
+ it 'should follow links' do
188
+ VCR.use_cassette('follow_links') do
189
+ crawler = Class.new
190
+ crawler.send(:include, Wombat::Crawler)
191
+
192
+ crawler.document_format :html
193
+ crawler.base_url "https://www.github.com"
194
+ crawler.path "/"
195
+
196
+ crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
197
+ heading 'css=h1'
198
+ end
199
+
200
+ crawler_instance = crawler.new
201
+ results = crawler_instance.crawl
202
+
203
+ results.should == {
204
+ "github" => [
205
+ { "heading"=>"GitHub helps people build software together."},
206
+ { "heading"=>nil},
207
+ { "heading"=>"Features"},
208
+ { "heading"=>"Contact GitHub"},
209
+ { "heading"=>"GitHub Training — Git Training from the Experts"},
210
+ { "heading"=>"GitHub on Your Servers"},
211
+ { "heading"=>"Loading..."}
212
+ ]
213
+ }
214
+ end
215
+ end
188
216
  end
@@ -0,0 +1,32 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Processing::Parser do
4
+ before(:each) do
5
+ crawler = Class.new
6
+ crawler.send(:include, Wombat::Processing::Parser)
7
+ @parser = crawler.new
8
+ @metadata = Wombat::DSL::Metadata.new
9
+ end
10
+
11
+ it 'should request page document with correct url' do
12
+ @metadata.base_url "http://www.google.com"
13
+ @metadata.path "/search"
14
+ fake_document = double :document
15
+ fake_parser = double :parser
16
+ fake_document.should_receive(:parser).and_return(fake_parser)
17
+ @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
18
+
19
+ @parser.parse @metadata
20
+ end
21
+
22
+ it 'should correctly parse xml documents' do
23
+ fake_document = double :xml
24
+ fake_parser = double :parser
25
+ @metadata.document_format :xml
26
+ @parser.mechanize.should_not_receive(:get)
27
+ RestClient.should_receive(:get).and_return fake_document
28
+ Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
29
+
30
+ @parser.parse @metadata
31
+ end
32
+ end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Factory do
4
+ it 'should instantiate correct locator according to property type' do
5
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :text)).should be_a(Wombat::Property::Locators::Text)
6
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :html)).should be_a(Wombat::Property::Locators::Html)
7
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :list)).should be_a(Wombat::Property::Locators::List)
8
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :follow)).should be_a(Wombat::Property::Locators::Follow)
9
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :iterator)).should be_a(Wombat::Property::Locators::Iterator)
10
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :container)).should be_a(Wombat::Property::Locators::PropertyGroup)
11
+ end
12
+
13
+ it 'should raise correct exception if provided property is of unknown type' do
14
+ lambda {
15
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :weird))
16
+ }.should raise_error(Wombat::Property::Locators::UnknownTypeException, "Unknown property format weird.")
17
+ end
18
+ end
@@ -0,0 +1,4 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Follow do
4
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Html do
4
+ it 'should locate html property' do
5
+ fake_elem = double :element
6
+ context = double :context
7
+ fake_elem.stub inner_html: "Something cool "
8
+ context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
9
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
10
+
11
+ locator = Wombat::Property::Locators::Html.new(property)
12
+
13
+ locator.locate(context).should == { "data1" => "Something cool" }
14
+ end
15
+ end
@@ -0,0 +1,4 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Iterator do
4
+ end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::List do
4
+ it 'should locate a list of nodes' do
5
+ context = double :context
6
+ context.stub(:css).with(".selector").and_return %w(1 2 3 4 5)
7
+ property = Wombat::DSL::Property.new('data1', 'css=.selector', :list)
8
+
9
+ locator = Wombat::Property::Locators::List.new(property)
10
+
11
+ locator.locate(context).should == { "data1" => %w(1 2 3 4 5) }
12
+ end
13
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Text do
4
+ it 'should locate text property with xpath selector and namespaces' do
5
+ fake_elem = double :element
6
+ context = double :context
7
+ fake_elem.stub inner_text: "Something cool "
8
+ context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
9
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :text, 'boom')
10
+
11
+ locator = Wombat::Property::Locators::Text.new(property)
12
+
13
+ locator.locate(context).should == { "data1" => "Something cool" }
14
+ end
15
+
16
+ it 'should locate text property with css selector' do
17
+ fake_elem = double :element
18
+ context = double :context
19
+ fake_elem.stub inner_text: "My name"
20
+ context.stub(:css).with("/def").and_return [fake_elem]
21
+ property = Wombat::DSL::Property.new('data1', 'css=/def', :text)
22
+
23
+ locator = Wombat::Property::Locators::Text.new(property)
24
+
25
+ locator.locate(context).should == { "data1" => "My name" }
26
+ end
27
+
28
+ it 'should return plain symbols as strings' do
29
+ fake_elem = double :element
30
+ context = double :context
31
+ property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
32
+
33
+ locator = Wombat::Property::Locators::Text.new(property)
34
+
35
+ locator.locate(context).should == { "data_2" => "hardcoded_value" }
36
+ end
37
+
38
+ it 'should invoke property callback' do
39
+ fake_elem = double :element
40
+ context = double :context
41
+ fake_elem.stub inner_text: "My name"
42
+ context.stub(:css).with("/def").and_return [fake_elem]
43
+ property = Wombat::DSL::Property.new('data1', 'css=/def', :text) { |s| s.gsub(/name/, 'ass') }
44
+
45
+ locator = Wombat::Property::Locators::Text.new(property)
46
+
47
+ locator.locate(context).should == { "data1" => "My ass" }
48
+ end
49
+ end
@@ -8,19 +8,15 @@ describe SampleCrawler do
8
8
 
9
9
  it 'should correctly assign event metadata' do
10
10
  @sample_crawler.should_receive(:parse) do |args|
11
- # args["event"]["description"].selector.should == "css=#main-node-content"
12
-
13
- # args["venue"]["address"].selector.should == "324 Dom Pedro II Street"
14
-
15
- it = args.iterators.first
16
- it.selector.should == "css=div.title-agenda"
17
- it["event"]["title"].selector.should == "xpath=."
18
- it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
19
- it["event"]["type"].selector.should == "xpath=.type"
20
- it["venue"]["name"].selector.should == "xpath=."
11
+ args['event_group'].wombat_property_selector.should == "css=div.title-agenda"
12
+ it = args['event_group']
13
+ it["event"]["title"].wombat_property_selector.should == "xpath=."
14
+ it["event"]["date"].wombat_property_selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
15
+ it["event"]["type"].wombat_property_selector.should == "xpath=.type"
16
+ it["venue"]["name"].wombat_property_selector.should == "xpath=."
21
17
 
22
18
  args[:base_url].should == 'http://www.obaoba.com.br'
23
- args[:list_page].should == '/porto-alegre/agenda'
19
+ args[:path].should == '/porto-alegre/agenda'
24
20
  end
25
21
 
26
22
  @sample_crawler.crawl
data/spec/wombat_spec.rb CHANGED
@@ -5,12 +5,24 @@ describe Wombat do
5
5
  Wombat.should respond_to(:crawl)
6
6
  end
7
7
 
8
+ it 'should provide syntactic sugar method Wombat.scrape' do
9
+ Wombat.should respond_to(:scrape)
10
+ end
11
+
12
+ it 'should redirect .scrape to .crawl' do
13
+ fake_class = double :fake
14
+ fake_class.stub :include
15
+ fake_class.should_receive(:new).and_return(stub(crawl: nil))
16
+ Class.stub :new => fake_class
17
+ Wombat.scrape
18
+ end
19
+
8
20
  it 'should accept regular properties (non-selectors)' do
9
21
  VCR.use_cassette('broken_selector') do
10
22
  lambda {
11
23
  Wombat.crawl do
12
24
  base_url "http://www.github.com"
13
- list_page "/"
25
+ path "/"
14
26
 
15
27
  source :obaoba
16
28
  description 'Oba Oba'
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "1.0.0"
8
+ s.version = "2.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-06-25"
12
+ s.date = "2012-07-31"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -30,26 +30,37 @@ Gem::Specification.new do |s|
30
30
  "fixtures/vcr_cassettes/basic_crawler_page.yml",
31
31
  "fixtures/vcr_cassettes/broken_selector.yml",
32
32
  "fixtures/vcr_cassettes/error_page.yml",
33
+ "fixtures/vcr_cassettes/follow_links.yml",
33
34
  "fixtures/vcr_cassettes/for_each_page.yml",
34
35
  "fixtures/vcr_cassettes/xml_with_namespace.yml",
35
36
  "lib/wombat.rb",
36
37
  "lib/wombat/crawler.rb",
37
- "lib/wombat/iterator.rb",
38
- "lib/wombat/metadata.rb",
39
- "lib/wombat/node_selector.rb",
40
- "lib/wombat/parser.rb",
41
- "lib/wombat/property.rb",
42
- "lib/wombat/property_container.rb",
43
- "lib/wombat/property_locator.rb",
38
+ "lib/wombat/dsl/follower.rb",
39
+ "lib/wombat/dsl/iterator.rb",
40
+ "lib/wombat/dsl/metadata.rb",
41
+ "lib/wombat/dsl/property.rb",
42
+ "lib/wombat/dsl/property_group.rb",
43
+ "lib/wombat/processing/node_selector.rb",
44
+ "lib/wombat/processing/parser.rb",
45
+ "lib/wombat/property/locators/base.rb",
46
+ "lib/wombat/property/locators/factory.rb",
47
+ "lib/wombat/property/locators/follow.rb",
48
+ "lib/wombat/property/locators/html.rb",
49
+ "lib/wombat/property/locators/iterator.rb",
50
+ "lib/wombat/property/locators/list.rb",
51
+ "lib/wombat/property/locators/property_group.rb",
52
+ "lib/wombat/property/locators/text.rb",
44
53
  "spec/crawler_spec.rb",
54
+ "spec/dsl/property_spec.rb",
45
55
  "spec/helpers/sample_crawler.rb",
46
56
  "spec/integration/integration_spec.rb",
47
- "spec/iterator_spec.rb",
48
- "spec/metadata_spec.rb",
49
- "spec/parser_spec.rb",
50
- "spec/property_container_spec.rb",
51
- "spec/property_locator_spec.rb",
52
- "spec/property_spec.rb",
57
+ "spec/processing/parser_spec.rb",
58
+ "spec/property/locators/factory_spec.rb",
59
+ "spec/property/locators/follow_spec.rb",
60
+ "spec/property/locators/html_spec.rb",
61
+ "spec/property/locators/iterator_spec.rb",
62
+ "spec/property/locators/list_spec.rb",
63
+ "spec/property/locators/text_spec.rb",
53
64
  "spec/sample_crawler_spec.rb",
54
65
  "spec/spec_helper.rb",
55
66
  "spec/wombat_spec.rb",
@@ -60,7 +71,7 @@ Gem::Specification.new do |s|
60
71
  s.require_paths = ["lib"]
61
72
  s.required_ruby_version = Gem::Requirement.new(">= 1.9")
62
73
  s.rubygems_version = "1.8.24"
63
- s.summary = "Ruby DSL to crawl web pages"
74
+ s.summary = "Ruby DSL to scrape web pages"
64
75
 
65
76
  if s.respond_to? :specification_version then
66
77
  s.specification_version = 3
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-25 00:00:00.000000000 Z
12
+ date: 2012-07-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -192,26 +192,37 @@ files:
192
192
  - fixtures/vcr_cassettes/basic_crawler_page.yml
193
193
  - fixtures/vcr_cassettes/broken_selector.yml
194
194
  - fixtures/vcr_cassettes/error_page.yml
195
+ - fixtures/vcr_cassettes/follow_links.yml
195
196
  - fixtures/vcr_cassettes/for_each_page.yml
196
197
  - fixtures/vcr_cassettes/xml_with_namespace.yml
197
198
  - lib/wombat.rb
198
199
  - lib/wombat/crawler.rb
199
- - lib/wombat/iterator.rb
200
- - lib/wombat/metadata.rb
201
- - lib/wombat/node_selector.rb
202
- - lib/wombat/parser.rb
203
- - lib/wombat/property.rb
204
- - lib/wombat/property_container.rb
205
- - lib/wombat/property_locator.rb
200
+ - lib/wombat/dsl/follower.rb
201
+ - lib/wombat/dsl/iterator.rb
202
+ - lib/wombat/dsl/metadata.rb
203
+ - lib/wombat/dsl/property.rb
204
+ - lib/wombat/dsl/property_group.rb
205
+ - lib/wombat/processing/node_selector.rb
206
+ - lib/wombat/processing/parser.rb
207
+ - lib/wombat/property/locators/base.rb
208
+ - lib/wombat/property/locators/factory.rb
209
+ - lib/wombat/property/locators/follow.rb
210
+ - lib/wombat/property/locators/html.rb
211
+ - lib/wombat/property/locators/iterator.rb
212
+ - lib/wombat/property/locators/list.rb
213
+ - lib/wombat/property/locators/property_group.rb
214
+ - lib/wombat/property/locators/text.rb
206
215
  - spec/crawler_spec.rb
216
+ - spec/dsl/property_spec.rb
207
217
  - spec/helpers/sample_crawler.rb
208
218
  - spec/integration/integration_spec.rb
209
- - spec/iterator_spec.rb
210
- - spec/metadata_spec.rb
211
- - spec/parser_spec.rb
212
- - spec/property_container_spec.rb
213
- - spec/property_locator_spec.rb
214
- - spec/property_spec.rb
219
+ - spec/processing/parser_spec.rb
220
+ - spec/property/locators/factory_spec.rb
221
+ - spec/property/locators/follow_spec.rb
222
+ - spec/property/locators/html_spec.rb
223
+ - spec/property/locators/iterator_spec.rb
224
+ - spec/property/locators/list_spec.rb
225
+ - spec/property/locators/text_spec.rb
215
226
  - spec/sample_crawler_spec.rb
216
227
  - spec/spec_helper.rb
217
228
  - spec/wombat_spec.rb
@@ -240,5 +251,5 @@ rubyforge_project:
240
251
  rubygems_version: 1.8.24
241
252
  signing_key:
242
253
  specification_version: 3
243
- summary: Ruby DSL to crawl web pages
254
+ summary: Ruby DSL to scrape web pages
244
255
  test_files: []
@@ -1,38 +0,0 @@
1
- module Wombat
2
- class Iterator < PropertyContainer
3
- attr_accessor :selector
4
-
5
- def initialize(selector)
6
- @selector = selector
7
- super()
8
- end
9
-
10
- def parse
11
- raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
12
-
13
- all_properties.each do |p|
14
- p.result ||= []
15
- result = yield p
16
- if result
17
- result = p.callback ? p.callback.call(result) : result
18
- p.result << result
19
- end
20
- end
21
- end
22
-
23
- def reset
24
- all_properties.each { |p| p.reset }
25
- end
26
-
27
- def flatten(depth = nil)
28
- # determine the iterator length by the biggest property array that we have
29
- length = all_properties.map(&:result).sort { |a| a.length }.last.size
30
-
31
- Array.new.tap do |a|
32
- length.times do |i|
33
- a << super(i)
34
- end
35
- end
36
- end
37
- end
38
- end