wombat 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/README.md +13 -30
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
  5. data/lib/wombat/crawler.rb +7 -17
  6. data/lib/wombat/dsl/follower.rb +19 -0
  7. data/lib/wombat/dsl/iterator.rb +19 -0
  8. data/lib/wombat/dsl/metadata.rb +27 -0
  9. data/lib/wombat/dsl/property.rb +27 -0
  10. data/lib/wombat/dsl/property_group.rb +48 -0
  11. data/lib/wombat/processing/node_selector.rb +12 -0
  12. data/lib/wombat/processing/parser.rb +48 -0
  13. data/lib/wombat/property/locators/base.rb +33 -0
  14. data/lib/wombat/property/locators/factory.rb +39 -0
  15. data/lib/wombat/property/locators/follow.rb +25 -0
  16. data/lib/wombat/property/locators/html.rb +14 -0
  17. data/lib/wombat/property/locators/iterator.rb +23 -0
  18. data/lib/wombat/property/locators/list.rb +17 -0
  19. data/lib/wombat/property/locators/property_group.rb +20 -0
  20. data/lib/wombat/property/locators/text.rb +22 -0
  21. data/lib/wombat.rb +8 -4
  22. data/spec/crawler_spec.rb +38 -48
  23. data/spec/dsl/property_spec.rb +12 -0
  24. data/spec/helpers/sample_crawler.rb +2 -15
  25. data/spec/integration/integration_spec.rb +61 -33
  26. data/spec/processing/parser_spec.rb +32 -0
  27. data/spec/property/locators/factory_spec.rb +18 -0
  28. data/spec/property/locators/follow_spec.rb +4 -0
  29. data/spec/property/locators/html_spec.rb +15 -0
  30. data/spec/property/locators/iterator_spec.rb +4 -0
  31. data/spec/property/locators/list_spec.rb +13 -0
  32. data/spec/property/locators/text_spec.rb +49 -0
  33. data/spec/sample_crawler_spec.rb +7 -11
  34. data/spec/wombat_spec.rb +13 -1
  35. data/wombat.gemspec +27 -16
  36. metadata +27 -16
  37. data/lib/wombat/iterator.rb +0 -38
  38. data/lib/wombat/metadata.rb +0 -24
  39. data/lib/wombat/node_selector.rb +0 -10
  40. data/lib/wombat/parser.rb +0 -59
  41. data/lib/wombat/property.rb +0 -21
  42. data/lib/wombat/property_container.rb +0 -70
  43. data/lib/wombat/property_locator.rb +0 -20
  44. data/spec/iterator_spec.rb +0 -52
  45. data/spec/metadata_spec.rb +0 -20
  46. data/spec/parser_spec.rb +0 -125
  47. data/spec/property_container_spec.rb +0 -62
  48. data/spec/property_locator_spec.rb +0 -75
  49. data/spec/property_spec.rb +0 -16
@@ -0,0 +1,12 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::DSL::Property do
4
+ it 'should store property data' do
5
+ property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
6
+
7
+ property.wombat_property_name.should == "title"
8
+ property.selector.should == "/some/selector"
9
+ property.format.should == :html
10
+ property.callback.should == lambda { false }
11
+ end
12
+ end
@@ -5,9 +5,9 @@ class SampleCrawler
5
5
  include Wombat::Crawler
6
6
 
7
7
  base_url "http://www.obaoba.com.br"
8
- list_page "/porto-alegre/agenda"
8
+ path "/porto-alegre/agenda"
9
9
 
10
- for_each "css=div.title-agenda" do
10
+ event_group "css=div.title-agenda", :iterator do
11
11
  event do |e|
12
12
  e.title("xpath=.") { |t| t.split(" | ")[1].strip }
13
13
  e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
@@ -19,18 +19,5 @@ class SampleCrawler
19
19
  venue do |v|
20
20
  v.name("xpath=.") { |n| name.split(" | ")[2].strip }
21
21
  end
22
-
23
- # follow_links "xpath=.//a[1]/@href" do
24
- # event { |e| e.description "css=#main-node-content", :html }
25
- # venue do |v|
26
- # v.phone "css=span.tel .value"
27
- # v.image "xpath=//div[@id='article-image']/div/img/@src"
28
- # end
29
-
30
- # location do |l|
31
- # l.city "css=span.locality"
32
- # l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
33
- # end
34
- # end
35
22
  end
36
23
  end
@@ -8,17 +8,15 @@ describe 'basic crawler setup' do
8
8
  crawler.send(:include, Wombat::Crawler)
9
9
 
10
10
  crawler.base_url "http://www.terra.com.br"
11
- crawler.list_page '/portal'
11
+ crawler.path '/portal'
12
12
 
13
13
  crawler.search "css=.btn-search"
14
- crawler.social do |s|
15
- s.twitter "css=.ctn-bar li.last"
14
+ crawler.social do
15
+ twitter "css=.ctn-bar li.last"
16
16
  end
17
-
18
- crawler.for_each "css=.ctn-links" do
17
+ crawler.links "css=.ctn-links", :iterator do
19
18
  menu "css=a"
20
19
  end
21
-
22
20
  crawler.subheader "css=h2.ttl-dynamic" do |h|
23
21
  h.gsub("London", "Londres")
24
22
  end
@@ -28,7 +26,7 @@ describe 'basic crawler setup' do
28
26
  results = crawler_instance.crawl
29
27
 
30
28
  results["search"].should == "Buscar"
31
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
29
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
32
30
  results["subheader"].should == "Londres 2012"
33
31
  results["social"]["twitter"].should == "Verão"
34
32
  end
@@ -39,9 +37,9 @@ describe 'basic crawler setup' do
39
37
  crawler.send(:include, Wombat::Crawler)
40
38
 
41
39
  crawler.base_url "http://www.terra.com.br"
42
- crawler.list_page '/portal'
40
+ crawler.path '/portal'
43
41
 
44
- crawler.for_each "css=.ctn-links" do
42
+ crawler.links "css=.ctn-links", :iterator do
45
43
  menu "css=a"
46
44
  end
47
45
 
@@ -53,13 +51,13 @@ describe 'basic crawler setup' do
53
51
  results = crawler_instance.crawl
54
52
  end
55
53
 
56
- results["iterator0"].should == result_hash
54
+ results["links"].should == result_hash
57
55
 
58
56
  VCR.use_cassette('basic_crawler_page') do
59
57
  results = crawler_instance.crawl
60
58
  end
61
59
 
62
- results["iterator0"].should == result_hash
60
+ results["links"].should == result_hash
63
61
  end
64
62
 
65
63
  it 'should crawl page through block to class instance crawl method' do
@@ -69,15 +67,15 @@ describe 'basic crawler setup' do
69
67
  crawler_instance = crawler.new
70
68
  results = crawler_instance.crawl do
71
69
  base_url "http://www.terra.com.br"
72
- list_page '/portal'
70
+ path '/portal'
73
71
 
74
72
  search "css=.btn-search"
75
73
 
76
- social do |s|
77
- s.twitter "css=.ctn-bar li.last"
74
+ social do
75
+ twitter "css=.ctn-bar li.last"
78
76
  end
79
77
 
80
- for_each "css=.ctn-links" do
78
+ links "css=.ctn-links", :iterator do
81
79
  menu "css=a"
82
80
  end
83
81
 
@@ -87,7 +85,7 @@ describe 'basic crawler setup' do
87
85
  end
88
86
 
89
87
  results["search"].should == "Buscar"
90
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
88
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
91
89
  results["subheader"].should == "Londres 2012"
92
90
  results["social"]["twitter"].should == "Verão"
93
91
  end
@@ -97,15 +95,15 @@ describe 'basic crawler setup' do
97
95
  VCR.use_cassette('basic_crawler_page') do
98
96
  results = Wombat.crawl do
99
97
  base_url "http://www.terra.com.br"
100
- list_page '/portal'
98
+ path '/portal'
101
99
 
102
100
  search "css=.btn-search"
103
101
 
104
- social do |s|
105
- s.twitter "css=.ctn-bar li.last"
102
+ social do
103
+ twitter "css=.ctn-bar li.last"
106
104
  end
107
105
 
108
- for_each "css=.ctn-links" do
106
+ links "css=.ctn-links", :iterator do
109
107
  menu "css=a"
110
108
  end
111
109
 
@@ -115,7 +113,7 @@ describe 'basic crawler setup' do
115
113
  end
116
114
 
117
115
  results["search"].should == "Buscar"
118
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
116
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
119
117
  results["subheader"].should == "Londres 2012"
120
118
  results["social"]["twitter"].should == "Verão"
121
119
  end
@@ -127,24 +125,24 @@ describe 'basic crawler setup' do
127
125
  crawler.send(:include, Wombat::Crawler)
128
126
 
129
127
  crawler.base_url "https://www.github.com"
130
- crawler.list_page "/explore"
128
+ crawler.path "/explore"
131
129
 
132
- crawler.for_each "css=ol.ranked-repositories li" do
133
- project do |p|
134
- p.repo 'css=h3'
135
- p.description('css=p.description') { |d| d.gsub(/for/, '') }
130
+ crawler.repos "css=ol.ranked-repositories>li", :iterator do
131
+ project do
132
+ repo 'css=h3'
133
+ description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
136
134
  end
137
135
  end
138
136
 
139
- crawler_instance = crawler.new
140
- results = crawler_instance.crawl
137
+ results = crawler.new.crawl
141
138
 
142
- results.should == { "iterator0" => [
139
+ results.should == { "repos" => [
143
140
  { "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
144
141
  { "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
145
142
  { "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
146
143
  { "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
147
- { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
144
+ { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
145
+ { "project" => { "repo" => nil, "description" => nil}}
148
146
  ]}
149
147
  end
150
148
  end
@@ -156,18 +154,18 @@ describe 'basic crawler setup' do
156
154
 
157
155
  crawler.document_format :xml
158
156
  crawler.base_url "http://ws.audioscrobbler.com"
159
- crawler.list_page "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
157
+ crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
160
158
 
161
159
  crawler.artist "xpath=//title", :list
162
160
 
163
- crawler.for_each 'xpath=//event' do
161
+ crawler.location 'xpath=//event', :iterator do
164
162
  latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
165
163
  longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
166
164
  end
167
165
 
168
166
  crawler_instance = crawler.new
169
167
  results = crawler_instance.crawl
170
- iterator = results['iterator0']
168
+ iterator = results['location']
171
169
 
172
170
  iterator.should == [
173
171
  {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
@@ -185,4 +183,34 @@ describe 'basic crawler setup' do
185
183
  results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
186
184
  end
187
185
  end
186
+
187
+ it 'should follow links' do
188
+ VCR.use_cassette('follow_links') do
189
+ crawler = Class.new
190
+ crawler.send(:include, Wombat::Crawler)
191
+
192
+ crawler.document_format :html
193
+ crawler.base_url "https://www.github.com"
194
+ crawler.path "/"
195
+
196
+ crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
197
+ heading 'css=h1'
198
+ end
199
+
200
+ crawler_instance = crawler.new
201
+ results = crawler_instance.crawl
202
+
203
+ results.should == {
204
+ "github" => [
205
+ { "heading"=>"GitHub helps people build software together."},
206
+ { "heading"=>nil},
207
+ { "heading"=>"Features"},
208
+ { "heading"=>"Contact GitHub"},
209
+ { "heading"=>"GitHub Training — Git Training from the Experts"},
210
+ { "heading"=>"GitHub on Your Servers"},
211
+ { "heading"=>"Loading..."}
212
+ ]
213
+ }
214
+ end
215
+ end
188
216
  end
@@ -0,0 +1,32 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Processing::Parser do
4
+ before(:each) do
5
+ crawler = Class.new
6
+ crawler.send(:include, Wombat::Processing::Parser)
7
+ @parser = crawler.new
8
+ @metadata = Wombat::DSL::Metadata.new
9
+ end
10
+
11
+ it 'should request page document with correct url' do
12
+ @metadata.base_url "http://www.google.com"
13
+ @metadata.path "/search"
14
+ fake_document = double :document
15
+ fake_parser = double :parser
16
+ fake_document.should_receive(:parser).and_return(fake_parser)
17
+ @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
18
+
19
+ @parser.parse @metadata
20
+ end
21
+
22
+ it 'should correctly parse xml documents' do
23
+ fake_document = double :xml
24
+ fake_parser = double :parser
25
+ @metadata.document_format :xml
26
+ @parser.mechanize.should_not_receive(:get)
27
+ RestClient.should_receive(:get).and_return fake_document
28
+ Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
29
+
30
+ @parser.parse @metadata
31
+ end
32
+ end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Factory do
4
+ it 'should instantiate correct locator according to property type' do
5
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :text)).should be_a(Wombat::Property::Locators::Text)
6
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :html)).should be_a(Wombat::Property::Locators::Html)
7
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :list)).should be_a(Wombat::Property::Locators::List)
8
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :follow)).should be_a(Wombat::Property::Locators::Follow)
9
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :iterator)).should be_a(Wombat::Property::Locators::Iterator)
10
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :container)).should be_a(Wombat::Property::Locators::PropertyGroup)
11
+ end
12
+
13
+ it 'should raise correct exception if provided property is of unknown type' do
14
+ lambda {
15
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :weird))
16
+ }.should raise_error(Wombat::Property::Locators::UnknownTypeException, "Unknown property format weird.")
17
+ end
18
+ end
@@ -0,0 +1,4 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Follow do
4
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Html do
4
+ it 'should locate html property' do
5
+ fake_elem = double :element
6
+ context = double :context
7
+ fake_elem.stub inner_html: "Something cool "
8
+ context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
9
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
10
+
11
+ locator = Wombat::Property::Locators::Html.new(property)
12
+
13
+ locator.locate(context).should == { "data1" => "Something cool" }
14
+ end
15
+ end
@@ -0,0 +1,4 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Iterator do
4
+ end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::List do
4
+ it 'should locate a list of nodes' do
5
+ context = double :context
6
+ context.stub(:css).with(".selector").and_return %w(1 2 3 4 5)
7
+ property = Wombat::DSL::Property.new('data1', 'css=.selector', :list)
8
+
9
+ locator = Wombat::Property::Locators::List.new(property)
10
+
11
+ locator.locate(context).should == { "data1" => %w(1 2 3 4 5) }
12
+ end
13
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wombat::Property::Locators::Text do
4
+ it 'should locate text property with xpath selector and namespaces' do
5
+ fake_elem = double :element
6
+ context = double :context
7
+ fake_elem.stub inner_text: "Something cool "
8
+ context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
9
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :text, 'boom')
10
+
11
+ locator = Wombat::Property::Locators::Text.new(property)
12
+
13
+ locator.locate(context).should == { "data1" => "Something cool" }
14
+ end
15
+
16
+ it 'should locate text property with css selector' do
17
+ fake_elem = double :element
18
+ context = double :context
19
+ fake_elem.stub inner_text: "My name"
20
+ context.stub(:css).with("/def").and_return [fake_elem]
21
+ property = Wombat::DSL::Property.new('data1', 'css=/def', :text)
22
+
23
+ locator = Wombat::Property::Locators::Text.new(property)
24
+
25
+ locator.locate(context).should == { "data1" => "My name" }
26
+ end
27
+
28
+ it 'should return plain symbols as strings' do
29
+ fake_elem = double :element
30
+ context = double :context
31
+ property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
32
+
33
+ locator = Wombat::Property::Locators::Text.new(property)
34
+
35
+ locator.locate(context).should == { "data_2" => "hardcoded_value" }
36
+ end
37
+
38
+ it 'should invoke property callback' do
39
+ fake_elem = double :element
40
+ context = double :context
41
+ fake_elem.stub inner_text: "My name"
42
+ context.stub(:css).with("/def").and_return [fake_elem]
43
+ property = Wombat::DSL::Property.new('data1', 'css=/def', :text) { |s| s.gsub(/name/, 'ass') }
44
+
45
+ locator = Wombat::Property::Locators::Text.new(property)
46
+
47
+ locator.locate(context).should == { "data1" => "My ass" }
48
+ end
49
+ end
@@ -8,19 +8,15 @@ describe SampleCrawler do
8
8
 
9
9
  it 'should correctly assign event metadata' do
10
10
  @sample_crawler.should_receive(:parse) do |args|
11
- # args["event"]["description"].selector.should == "css=#main-node-content"
12
-
13
- # args["venue"]["address"].selector.should == "324 Dom Pedro II Street"
14
-
15
- it = args.iterators.first
16
- it.selector.should == "css=div.title-agenda"
17
- it["event"]["title"].selector.should == "xpath=."
18
- it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
19
- it["event"]["type"].selector.should == "xpath=.type"
20
- it["venue"]["name"].selector.should == "xpath=."
11
+ args['event_group'].wombat_property_selector.should == "css=div.title-agenda"
12
+ it = args['event_group']
13
+ it["event"]["title"].wombat_property_selector.should == "xpath=."
14
+ it["event"]["date"].wombat_property_selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
15
+ it["event"]["type"].wombat_property_selector.should == "xpath=.type"
16
+ it["venue"]["name"].wombat_property_selector.should == "xpath=."
21
17
 
22
18
  args[:base_url].should == 'http://www.obaoba.com.br'
23
- args[:list_page].should == '/porto-alegre/agenda'
19
+ args[:path].should == '/porto-alegre/agenda'
24
20
  end
25
21
 
26
22
  @sample_crawler.crawl
data/spec/wombat_spec.rb CHANGED
@@ -5,12 +5,24 @@ describe Wombat do
5
5
  Wombat.should respond_to(:crawl)
6
6
  end
7
7
 
8
+ it 'should provide syntactic sugar method Wombat.scrape' do
9
+ Wombat.should respond_to(:scrape)
10
+ end
11
+
12
+ it 'should redirect .scrape to .crawl' do
13
+ fake_class = double :fake
14
+ fake_class.stub :include
15
+ fake_class.should_receive(:new).and_return(stub(crawl: nil))
16
+ Class.stub :new => fake_class
17
+ Wombat.scrape
18
+ end
19
+
8
20
  it 'should accept regular properties (non-selectors)' do
9
21
  VCR.use_cassette('broken_selector') do
10
22
  lambda {
11
23
  Wombat.crawl do
12
24
  base_url "http://www.github.com"
13
- list_page "/"
25
+ path "/"
14
26
 
15
27
  source :obaoba
16
28
  description 'Oba Oba'
data/wombat.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "wombat"
8
- s.version = "1.0.0"
8
+ s.version = "2.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Felipe Lima"]
12
- s.date = "2012-06-25"
12
+ s.date = "2012-07-31"
13
13
  s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
14
14
  s.email = "felipe.lima@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -30,26 +30,37 @@ Gem::Specification.new do |s|
30
30
  "fixtures/vcr_cassettes/basic_crawler_page.yml",
31
31
  "fixtures/vcr_cassettes/broken_selector.yml",
32
32
  "fixtures/vcr_cassettes/error_page.yml",
33
+ "fixtures/vcr_cassettes/follow_links.yml",
33
34
  "fixtures/vcr_cassettes/for_each_page.yml",
34
35
  "fixtures/vcr_cassettes/xml_with_namespace.yml",
35
36
  "lib/wombat.rb",
36
37
  "lib/wombat/crawler.rb",
37
- "lib/wombat/iterator.rb",
38
- "lib/wombat/metadata.rb",
39
- "lib/wombat/node_selector.rb",
40
- "lib/wombat/parser.rb",
41
- "lib/wombat/property.rb",
42
- "lib/wombat/property_container.rb",
43
- "lib/wombat/property_locator.rb",
38
+ "lib/wombat/dsl/follower.rb",
39
+ "lib/wombat/dsl/iterator.rb",
40
+ "lib/wombat/dsl/metadata.rb",
41
+ "lib/wombat/dsl/property.rb",
42
+ "lib/wombat/dsl/property_group.rb",
43
+ "lib/wombat/processing/node_selector.rb",
44
+ "lib/wombat/processing/parser.rb",
45
+ "lib/wombat/property/locators/base.rb",
46
+ "lib/wombat/property/locators/factory.rb",
47
+ "lib/wombat/property/locators/follow.rb",
48
+ "lib/wombat/property/locators/html.rb",
49
+ "lib/wombat/property/locators/iterator.rb",
50
+ "lib/wombat/property/locators/list.rb",
51
+ "lib/wombat/property/locators/property_group.rb",
52
+ "lib/wombat/property/locators/text.rb",
44
53
  "spec/crawler_spec.rb",
54
+ "spec/dsl/property_spec.rb",
45
55
  "spec/helpers/sample_crawler.rb",
46
56
  "spec/integration/integration_spec.rb",
47
- "spec/iterator_spec.rb",
48
- "spec/metadata_spec.rb",
49
- "spec/parser_spec.rb",
50
- "spec/property_container_spec.rb",
51
- "spec/property_locator_spec.rb",
52
- "spec/property_spec.rb",
57
+ "spec/processing/parser_spec.rb",
58
+ "spec/property/locators/factory_spec.rb",
59
+ "spec/property/locators/follow_spec.rb",
60
+ "spec/property/locators/html_spec.rb",
61
+ "spec/property/locators/iterator_spec.rb",
62
+ "spec/property/locators/list_spec.rb",
63
+ "spec/property/locators/text_spec.rb",
53
64
  "spec/sample_crawler_spec.rb",
54
65
  "spec/spec_helper.rb",
55
66
  "spec/wombat_spec.rb",
@@ -60,7 +71,7 @@ Gem::Specification.new do |s|
60
71
  s.require_paths = ["lib"]
61
72
  s.required_ruby_version = Gem::Requirement.new(">= 1.9")
62
73
  s.rubygems_version = "1.8.24"
63
- s.summary = "Ruby DSL to crawl web pages"
74
+ s.summary = "Ruby DSL to scrape web pages"
64
75
 
65
76
  if s.respond_to? :specification_version then
66
77
  s.specification_version = 3
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wombat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-25 00:00:00.000000000 Z
12
+ date: 2012-07-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -192,26 +192,37 @@ files:
192
192
  - fixtures/vcr_cassettes/basic_crawler_page.yml
193
193
  - fixtures/vcr_cassettes/broken_selector.yml
194
194
  - fixtures/vcr_cassettes/error_page.yml
195
+ - fixtures/vcr_cassettes/follow_links.yml
195
196
  - fixtures/vcr_cassettes/for_each_page.yml
196
197
  - fixtures/vcr_cassettes/xml_with_namespace.yml
197
198
  - lib/wombat.rb
198
199
  - lib/wombat/crawler.rb
199
- - lib/wombat/iterator.rb
200
- - lib/wombat/metadata.rb
201
- - lib/wombat/node_selector.rb
202
- - lib/wombat/parser.rb
203
- - lib/wombat/property.rb
204
- - lib/wombat/property_container.rb
205
- - lib/wombat/property_locator.rb
200
+ - lib/wombat/dsl/follower.rb
201
+ - lib/wombat/dsl/iterator.rb
202
+ - lib/wombat/dsl/metadata.rb
203
+ - lib/wombat/dsl/property.rb
204
+ - lib/wombat/dsl/property_group.rb
205
+ - lib/wombat/processing/node_selector.rb
206
+ - lib/wombat/processing/parser.rb
207
+ - lib/wombat/property/locators/base.rb
208
+ - lib/wombat/property/locators/factory.rb
209
+ - lib/wombat/property/locators/follow.rb
210
+ - lib/wombat/property/locators/html.rb
211
+ - lib/wombat/property/locators/iterator.rb
212
+ - lib/wombat/property/locators/list.rb
213
+ - lib/wombat/property/locators/property_group.rb
214
+ - lib/wombat/property/locators/text.rb
206
215
  - spec/crawler_spec.rb
216
+ - spec/dsl/property_spec.rb
207
217
  - spec/helpers/sample_crawler.rb
208
218
  - spec/integration/integration_spec.rb
209
- - spec/iterator_spec.rb
210
- - spec/metadata_spec.rb
211
- - spec/parser_spec.rb
212
- - spec/property_container_spec.rb
213
- - spec/property_locator_spec.rb
214
- - spec/property_spec.rb
219
+ - spec/processing/parser_spec.rb
220
+ - spec/property/locators/factory_spec.rb
221
+ - spec/property/locators/follow_spec.rb
222
+ - spec/property/locators/html_spec.rb
223
+ - spec/property/locators/iterator_spec.rb
224
+ - spec/property/locators/list_spec.rb
225
+ - spec/property/locators/text_spec.rb
215
226
  - spec/sample_crawler_spec.rb
216
227
  - spec/spec_helper.rb
217
228
  - spec/wombat_spec.rb
@@ -240,5 +251,5 @@ rubyforge_project:
240
251
  rubygems_version: 1.8.24
241
252
  signing_key:
242
253
  specification_version: 3
243
- summary: Ruby DSL to crawl web pages
254
+ summary: Ruby DSL to scrape web pages
244
255
  test_files: []
@@ -1,38 +0,0 @@
1
- module Wombat
2
- class Iterator < PropertyContainer
3
- attr_accessor :selector
4
-
5
- def initialize(selector)
6
- @selector = selector
7
- super()
8
- end
9
-
10
- def parse
11
- raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
12
-
13
- all_properties.each do |p|
14
- p.result ||= []
15
- result = yield p
16
- if result
17
- result = p.callback ? p.callback.call(result) : result
18
- p.result << result
19
- end
20
- end
21
- end
22
-
23
- def reset
24
- all_properties.each { |p| p.reset }
25
- end
26
-
27
- def flatten(depth = nil)
28
- # determine the iterator length by the biggest property array that we have
29
- length = all_properties.map(&:result).sort { |a| a.length }.last.size
30
-
31
- Array.new.tap do |a|
32
- length.times do |i|
33
- a << super(i)
34
- end
35
- end
36
- end
37
- end
38
- end