wombat 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/Gemfile.lock +4 -0
- data/README.md +6 -1
- data/VERSION +1 -1
- data/lib/wombat/crawler.rb +1 -6
- data/lib/wombat/iterator.rb +1 -0
- data/lib/wombat/metadata.rb +0 -12
- data/lib/wombat/node_selector.rb +10 -0
- data/lib/wombat/parser.rb +15 -5
- data/lib/wombat/property.rb +4 -0
- data/lib/wombat/property_container.rb +31 -19
- data/lib/wombat/property_locator.rb +7 -14
- data/spec/crawler_spec.rb +14 -9
- data/spec/helpers/sample_crawler.rb +14 -14
- data/spec/integration/integration_spec.rb +5 -0
- data/spec/parser_spec.rb +33 -7
- data/spec/property_container_spec.rb +15 -3
- data/spec/property_locator_spec.rb +9 -9
- data/spec/sample_crawler_spec.rb +12 -8
- data/wombat.gemspec +6 -2
- metadata +38 -26
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -8,6 +8,8 @@ GEM
|
|
8
8
|
fakeweb (1.3.0)
|
9
9
|
ffi (1.0.11)
|
10
10
|
git (1.2.5)
|
11
|
+
growl_notify (0.0.3)
|
12
|
+
rb-appscript
|
11
13
|
guard (0.9.4)
|
12
14
|
ffi (>= 0.5.0)
|
13
15
|
thor (~> 0.14.6)
|
@@ -32,6 +34,7 @@ GEM
|
|
32
34
|
nokogiri (1.5.0)
|
33
35
|
ntlm-http (0.1.1)
|
34
36
|
rake (0.9.2.2)
|
37
|
+
rb-appscript (0.6.1)
|
35
38
|
rspec (2.7.0)
|
36
39
|
rspec-core (~> 2.7.0)
|
37
40
|
rspec-expectations (~> 2.7.0)
|
@@ -56,6 +59,7 @@ DEPENDENCIES
|
|
56
59
|
activesupport
|
57
60
|
bundler
|
58
61
|
fakeweb
|
62
|
+
growl_notify
|
59
63
|
guard
|
60
64
|
guard-bundler
|
61
65
|
guard-rspec
|
data/README.md
CHANGED
@@ -8,6 +8,8 @@ Generic Web crawler with a DSL that parses structured data from web pages.
|
|
8
8
|
|
9
9
|
``gem install wombat``
|
10
10
|
|
11
|
+
Creating a crawler:
|
12
|
+
|
11
13
|
```ruby
|
12
14
|
|
13
15
|
# => github_crawler.rb
|
@@ -37,8 +39,11 @@ class GithubCrawler
|
|
37
39
|
end
|
38
40
|
end
|
39
41
|
```
|
42
|
+
|
43
|
+
Running it:
|
44
|
+
|
40
45
|
```ruby
|
41
|
-
irb>
|
46
|
+
irb> GithubCrawler.new.crawl
|
42
47
|
=>
|
43
48
|
{
|
44
49
|
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.6
|
1
|
+
0.2.0
|
data/lib/wombat/crawler.rb
CHANGED
@@ -21,12 +21,7 @@ module Wombat
|
|
21
21
|
|
22
22
|
module ClassMethods
|
23
23
|
def method_missing method, *args, &block
|
24
|
-
|
25
|
-
metadata["#{method.to_s}"] = PropertyContainer.new unless metadata["#{method.to_s}"]
|
26
|
-
block.call(metadata["#{method.to_s}"])
|
27
|
-
else
|
28
|
-
metadata.send method, *args, &block
|
29
|
-
end
|
24
|
+
metadata.send method, *args, &block
|
30
25
|
end
|
31
26
|
|
32
27
|
def for_each selector, &block
|
data/lib/wombat/iterator.rb
CHANGED
data/lib/wombat/metadata.rb
CHANGED
@@ -4,12 +4,6 @@ require 'wombat/iterator'
|
|
4
4
|
|
5
5
|
module Wombat
|
6
6
|
class Metadata < PropertyContainer
|
7
|
-
attr_accessor :iterators
|
8
|
-
|
9
|
-
def initialize
|
10
|
-
@iterators = []
|
11
|
-
end
|
12
|
-
|
13
7
|
def base_url url
|
14
8
|
self[:base_url] = url
|
15
9
|
end
|
@@ -17,11 +11,5 @@ module Wombat
|
|
17
11
|
def list_page url
|
18
12
|
self[:list_page] = url
|
19
13
|
end
|
20
|
-
|
21
|
-
def for_each selector
|
22
|
-
Iterator.new(selector).tap do |i|
|
23
|
-
iterators << i
|
24
|
-
end
|
25
|
-
end
|
26
14
|
end
|
27
15
|
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Wombat
|
2
|
+
module NodeSelector
|
3
|
+
def select_nodes selector, namespaces = nil
|
4
|
+
return [selector.to_s] if selector.is_a? Symbol
|
5
|
+
return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
|
6
|
+
return context.css selector[4..-1] if selector.start_with? "css="
|
7
|
+
nil
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -12,14 +12,24 @@ module Wombat
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def parse metadata
|
15
|
-
|
15
|
+
self.context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
|
16
|
+
original_context = self.context
|
16
17
|
|
17
|
-
|
18
|
+
metadata.iterators.each do |it|
|
19
|
+
select_nodes(it.selector).each do |n|
|
20
|
+
self.context = n
|
21
|
+
it.all_properties.each do |p|
|
22
|
+
p.result ||= []
|
23
|
+
p.result << locate_first(p)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
18
27
|
|
19
|
-
|
28
|
+
self.context = original_context
|
20
29
|
|
21
|
-
|
22
|
-
|
30
|
+
metadata.all_properties.each do |p|
|
31
|
+
result = locate_first p
|
32
|
+
p.result = p.callback ? p.callback.call(result) : result
|
23
33
|
end
|
24
34
|
|
25
35
|
metadata.flatten
|
data/lib/wombat/property.rb
CHANGED
@@ -2,42 +2,54 @@
|
|
2
2
|
|
3
3
|
module Wombat
|
4
4
|
class PropertyContainer < Hash
|
5
|
-
|
6
|
-
self[property.name] = property
|
7
|
-
end
|
5
|
+
attr_accessor :iterators
|
8
6
|
|
9
|
-
def
|
10
|
-
|
7
|
+
def initialize
|
8
|
+
@iterators = []
|
11
9
|
end
|
12
10
|
|
13
11
|
def method_missing method, *args, &block
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
12
|
+
if args.empty? && block
|
13
|
+
self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
|
14
|
+
block.call(self["#{method.to_s}"])
|
15
|
+
else
|
16
|
+
self[method.to_s] = Property.new(
|
17
|
+
name: method.to_s,
|
18
|
+
selector: args.first,
|
19
|
+
format: args[1],
|
20
|
+
namespaces: args[2],
|
21
|
+
callback: block)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_ary
|
20
26
|
end
|
21
27
|
|
22
28
|
def all_properties
|
23
29
|
values.flat_map { |v|
|
24
|
-
v.kind_of?
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
30
|
+
if v.kind_of? PropertyContainer
|
31
|
+
v.all_properties
|
32
|
+
elsif v.kind_of? Property
|
33
|
+
v
|
34
|
+
else
|
35
|
+
nil
|
36
|
+
end
|
29
37
|
}.compact
|
30
38
|
end
|
31
39
|
|
32
40
|
def flatten
|
33
41
|
Hash.new.tap do |h|
|
34
42
|
keys.map do |k|
|
35
|
-
if self[k].kind_of?(PropertyContainer)
|
43
|
+
if self[k].kind_of?(PropertyContainer) || self[k].kind_of?(Property)
|
36
44
|
h[k] = self[k].flatten
|
37
|
-
elsif self[k].kind_of?(Property)
|
38
|
-
h[k] = self[k].result
|
39
45
|
end
|
40
46
|
end
|
47
|
+
end.merge(iterators.inject({}) {|memo, i| memo.merge(i.flatten) })
|
48
|
+
end
|
49
|
+
|
50
|
+
def for_each selector
|
51
|
+
Iterator.new(selector).tap do |i|
|
52
|
+
iterators << i
|
41
53
|
end
|
42
54
|
end
|
43
55
|
end
|
@@ -1,25 +1,18 @@
|
|
1
1
|
#coding: utf-8
|
2
|
+
require 'wombat/node_selector'
|
2
3
|
|
3
4
|
module Wombat
|
4
5
|
module PropertyLocator
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
include NodeSelector
|
7
|
+
|
8
|
+
def locate_first property
|
9
|
+
locate(property).first
|
9
10
|
end
|
10
11
|
|
11
|
-
|
12
|
-
|
13
|
-
result = locate_selector(property.selector, property.namespaces).to_a
|
12
|
+
def locate property
|
13
|
+
result = select_nodes(property.selector, property.namespaces).to_a
|
14
14
|
result.map! {|r| r.inner_html.strip } if property.format == :html
|
15
15
|
result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
|
16
16
|
end
|
17
|
-
|
18
|
-
def locate_selector selector, namespaces = nil
|
19
|
-
return [selector.to_s] if selector.is_a? Symbol
|
20
|
-
return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
|
21
|
-
return context.css selector[4..-1] if selector.start_with? "css="
|
22
|
-
nil
|
23
|
-
end
|
24
17
|
end
|
25
18
|
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -33,10 +33,10 @@ describe Wombat::Crawler do
|
|
33
33
|
@crawler.location { |v| v.latitude -50.2323 }
|
34
34
|
|
35
35
|
@crawler_instance.should_receive(:parse) do |arg|
|
36
|
-
arg["event"]
|
37
|
-
arg["event"]
|
38
|
-
arg["venue"]
|
39
|
-
arg["location"]
|
36
|
+
arg["event"]["title"].selector.should == "Fulltronic Dezembro"
|
37
|
+
arg["event"]["time"].selector.to_s.should == time.to_s
|
38
|
+
arg["venue"]["name"].selector.should == "Scooba"
|
39
|
+
arg["location"]["latitude"].selector.should == -50.2323
|
40
40
|
end
|
41
41
|
|
42
42
|
@crawler_instance.crawl
|
@@ -48,11 +48,11 @@ describe Wombat::Crawler do
|
|
48
48
|
another_crawler_instance = another_crawler.new
|
49
49
|
|
50
50
|
another_crawler.event { |e| e.title 'Ibiza' }
|
51
|
-
another_crawler_instance.should_receive(:parse) { |arg| arg["event"]
|
51
|
+
another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
|
52
52
|
another_crawler_instance.crawl
|
53
53
|
|
54
54
|
@crawler.event { |e| e.title 'Fulltronic Dezembro' }
|
55
|
-
@crawler_instance.should_receive(:parse) { |arg| arg["event"]
|
55
|
+
@crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
|
56
56
|
@crawler_instance.crawl
|
57
57
|
end
|
58
58
|
|
@@ -96,12 +96,17 @@ describe Wombat::Crawler do
|
|
96
96
|
@crawler.for_each "css=.element" do
|
97
97
|
title "css=.title"
|
98
98
|
body "css=.body"
|
99
|
+
event do |e|
|
100
|
+
e.all "yeah"
|
101
|
+
end
|
99
102
|
end
|
100
103
|
|
101
104
|
@crawler_instance.should_receive(:parse) do |arg|
|
102
|
-
arg.iterators.first
|
103
|
-
|
104
|
-
|
105
|
+
it = arg.iterators.first
|
106
|
+
it.selector.should == "css=.element"
|
107
|
+
it["title"].selector.should == "css=.title"
|
108
|
+
it["body"].selector.should == "css=.body"
|
109
|
+
it["event"]["all"].selector.should == "yeah"
|
105
110
|
end
|
106
111
|
|
107
112
|
@crawler_instance.crawl
|
@@ -13,24 +13,24 @@ class SampleCrawler
|
|
13
13
|
e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
|
14
14
|
DateTime.strptime(d, '%d/%m')
|
15
15
|
end
|
16
|
-
e.type("xpath=.") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
|
16
|
+
e.type("xpath=.type") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
|
17
17
|
end
|
18
18
|
|
19
19
|
venue do |v|
|
20
20
|
v.name("xpath=.") { |n| name.split(" | ")[2].strip }
|
21
21
|
end
|
22
|
+
|
23
|
+
# follow_links "xpath=.//a[1]/@href" do
|
24
|
+
# event { |e| e.description "css=#main-node-content", :html }
|
25
|
+
# venue do |v|
|
26
|
+
# v.phone "css=span.tel .value"
|
27
|
+
# v.image "xpath=//div[@id='article-image']/div/img/@src"
|
28
|
+
# end
|
22
29
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
location do |l|
|
31
|
-
l.city "css=span.locality"
|
32
|
-
l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
30
|
+
# location do |l|
|
31
|
+
# l.city "css=span.locality"
|
32
|
+
# l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
+
# end
|
34
|
+
# end
|
35
|
+
end
|
36
36
|
end
|
@@ -15,6 +15,10 @@ describe 'basic crawler setup' do
|
|
15
15
|
s.twitter "css=.ctn-bar li.last"
|
16
16
|
end
|
17
17
|
|
18
|
+
crawler.for_each "css=.ctn-links" do
|
19
|
+
menu "css=a"
|
20
|
+
end
|
21
|
+
|
18
22
|
crawler.subheader "css=h2.ttl-dynamic" do |h|
|
19
23
|
h.gsub("London", "Londres")
|
20
24
|
end
|
@@ -24,6 +28,7 @@ describe 'basic crawler setup' do
|
|
24
28
|
results = crawler_instance.crawl
|
25
29
|
|
26
30
|
results["search"].should == "Buscar"
|
31
|
+
results["menu"].should =~ ["Agenda", "Brasileiro", "Brasil", "Bolsas", "Cinema", "Galerias de Fotos", "Beleza", "Esportes", "Assine o RSS"]
|
27
32
|
results["subheader"].should == "Londres 2012"
|
28
33
|
results["social"]["twitter"].should == "Verão"
|
29
34
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -24,7 +24,7 @@ describe Wombat::Parser do
|
|
24
24
|
fake_parser = double :parser
|
25
25
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
26
26
|
@parser.mechanize.stub(:get).and_return fake_document
|
27
|
-
@parser.
|
27
|
+
@parser.should_not_receive :locate
|
28
28
|
@parser.parse @metadata
|
29
29
|
end
|
30
30
|
|
@@ -42,7 +42,7 @@ describe Wombat::Parser do
|
|
42
42
|
|
43
43
|
@parser.mechanize.stub(:get).and_return fake_document
|
44
44
|
@metadata.stub(:all_properties).and_return [property]
|
45
|
-
@parser.should_receive(:
|
45
|
+
@parser.should_receive(:locate_first).with(property)
|
46
46
|
|
47
47
|
@parser.parse @metadata
|
48
48
|
|
@@ -59,21 +59,20 @@ describe Wombat::Parser do
|
|
59
59
|
p.should == "blah"
|
60
60
|
}
|
61
61
|
|
62
|
-
property.should_receive(:result).and_return("blah")
|
63
62
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
64
63
|
property.should_receive(:callback).twice.and_return(block)
|
65
64
|
property.should_receive(:result=).with(true)
|
66
65
|
|
67
66
|
@parser.mechanize.stub(:get).and_return fake_document
|
68
67
|
@metadata.stub(:all_properties).and_return [property]
|
69
|
-
@parser.should_receive(:
|
68
|
+
@parser.should_receive(:locate_first).with(property).and_return("blah")
|
70
69
|
|
71
70
|
@parser.parse @metadata
|
72
71
|
|
73
72
|
block_called.should be_true
|
74
73
|
end
|
75
74
|
|
76
|
-
it 'should return
|
75
|
+
it 'should return hash with requested properties' do
|
77
76
|
hash = double :results
|
78
77
|
fake_parser = double :parser
|
79
78
|
fake_document = double :document
|
@@ -81,7 +80,34 @@ describe Wombat::Parser do
|
|
81
80
|
fake_document.should_receive(:parser).and_return fake_parser
|
82
81
|
@parser.mechanize.stub(:get).and_return fake_document
|
83
82
|
@metadata.should_receive(:flatten).and_return hash
|
84
|
-
|
83
|
+
|
85
84
|
@parser.parse(@metadata).should == hash
|
86
85
|
end
|
87
|
-
|
86
|
+
|
87
|
+
it 'should iterate in for_each properties' do
|
88
|
+
fake_parser = double :parser
|
89
|
+
fake_document = double :document
|
90
|
+
c1 = double :context
|
91
|
+
c2 = double :context
|
92
|
+
it = Wombat::Iterator.new "it_selector"
|
93
|
+
it.prop_1 "some_selector"
|
94
|
+
it.prop_2 "another_selector"
|
95
|
+
|
96
|
+
@parser.should_receive(:context=).ordered
|
97
|
+
@metadata.should_receive(:iterators).and_return [it]
|
98
|
+
@metadata.should_receive(:flatten)
|
99
|
+
fake_document.should_receive(:parser).and_return(fake_parser)
|
100
|
+
it['prop_1'].should_receive(:result).exactly(4).times.and_return([])
|
101
|
+
it['prop_2'].should_receive(:result).exactly(4).times.and_return([])
|
102
|
+
@parser.mechanize.stub(:get).and_return fake_document
|
103
|
+
@parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
|
104
|
+
@parser.should_receive(:context=).with(c1).ordered
|
105
|
+
@parser.should_receive(:context=).with(c2).ordered
|
106
|
+
@parser.should_receive(:context=).ordered
|
107
|
+
@parser.should_receive(:locate_first).with(it['prop_1']).twice
|
108
|
+
@parser.should_receive(:locate_first).with(it['prop_2']).twice
|
109
|
+
@parser.stub(:locate)
|
110
|
+
|
111
|
+
@parser.parse(@metadata)
|
112
|
+
end
|
113
|
+
end
|
@@ -5,17 +5,24 @@ describe Wombat::PropertyContainer do
|
|
5
5
|
@metadata = Wombat::PropertyContainer.new
|
6
6
|
end
|
7
7
|
|
8
|
-
it 'should return an array with all the metadata properties' do
|
8
|
+
it 'should return an array with all the metadata properties excluding iterators' do
|
9
9
|
@metadata["event"] = Wombat::PropertyContainer.new
|
10
10
|
@metadata["venue"] = Wombat::PropertyContainer.new
|
11
11
|
@metadata.another_property "/some/selector", :text
|
12
12
|
@metadata["event"]["something"] = Wombat::PropertyContainer.new
|
13
13
|
@metadata["event"]["something"].else "Wohooo"
|
14
14
|
@metadata["venue"].awesome "whooea"
|
15
|
+
it = Wombat::Iterator.new "it_selector"
|
16
|
+
it.felipe "lima"
|
17
|
+
@metadata.iterators << it
|
15
18
|
|
16
19
|
all_propes = @metadata.all_properties
|
17
20
|
|
18
|
-
all_propes.should =~ [
|
21
|
+
all_propes.should =~ [
|
22
|
+
@metadata["another_property"],
|
23
|
+
@metadata["event"]["something"]["else"],
|
24
|
+
@metadata["venue"]["awesome"]
|
25
|
+
]
|
19
26
|
end
|
20
27
|
|
21
28
|
it 'should be able to change properties via all_properties' do
|
@@ -24,7 +31,7 @@ describe Wombat::PropertyContainer do
|
|
24
31
|
@metadata["another_property"].selector.should == "abc"
|
25
32
|
end
|
26
33
|
|
27
|
-
it 'should return metadata in plain hash format' do
|
34
|
+
it 'should return metadata in plain hash format including iterators' do
|
28
35
|
@metadata.title "/some/selector"
|
29
36
|
@metadata["title"].result = "Gogobot Inc."
|
30
37
|
@metadata["holder"] = Wombat::PropertyContainer.new
|
@@ -33,6 +40,10 @@ describe Wombat::PropertyContainer do
|
|
33
40
|
@metadata["holder"]["subheader"] = Wombat::PropertyContainer.new
|
34
41
|
@metadata["holder"]["subheader"].section "/blah"
|
35
42
|
@metadata["holder"]["subheader"]["section"].result = "Lorem Ipsum"
|
43
|
+
it = Wombat::Iterator.new "it_selector"
|
44
|
+
it.felipe "lima"
|
45
|
+
it["felipe"].result = ["correa", "de souza", "lima"]
|
46
|
+
@metadata.iterators = [it]
|
36
47
|
@metadata.footer("another thing", :html) { |a| true }
|
37
48
|
@metadata["footer"].result = "bla bla bla"
|
38
49
|
|
@@ -44,6 +55,7 @@ describe Wombat::PropertyContainer do
|
|
44
55
|
"section" => "Lorem Ipsum"
|
45
56
|
}
|
46
57
|
},
|
58
|
+
"felipe" => ["correa", "de souza", "lima"],
|
47
59
|
"footer" => "bla bla bla"
|
48
60
|
}
|
49
61
|
end
|
@@ -28,12 +28,12 @@ describe Wombat::PropertyLocator do
|
|
28
28
|
|
29
29
|
@locator_instance.stub(:context).and_return context
|
30
30
|
|
31
|
-
@
|
31
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate_first p }
|
32
32
|
|
33
|
-
@metadata
|
34
|
-
@metadata["event"]
|
35
|
-
@metadata["venue"]
|
36
|
-
@metadata["location"]
|
33
|
+
@metadata["blah"].result.should == "abc"
|
34
|
+
@metadata["event"]["data1"].result.should == "Something cool"
|
35
|
+
@metadata["venue"]["data2"].result.should == "farms"
|
36
|
+
@metadata["location"]["data3"].result.should == "Another stuff"
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'should support properties with html format' do
|
@@ -47,9 +47,9 @@ describe Wombat::PropertyLocator do
|
|
47
47
|
|
48
48
|
@metadata["event"].another_info "xpath=/anotherData", :html
|
49
49
|
|
50
|
-
@
|
50
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate_first p }
|
51
51
|
|
52
|
-
@metadata["event"]
|
52
|
+
@metadata["event"]["another_info"].result.should == "some another info"
|
53
53
|
end
|
54
54
|
|
55
55
|
it 'should trim property contents and use namespaces if present' do
|
@@ -59,8 +59,8 @@ describe Wombat::PropertyLocator do
|
|
59
59
|
@locator_instance.stub(:context).and_return context
|
60
60
|
@metadata["event"].description "xpath=/event/some/description", :text, "blah"
|
61
61
|
|
62
|
-
@
|
62
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate_first p }
|
63
63
|
|
64
|
-
@metadata["event"]
|
64
|
+
@metadata["event"]["description"].result.should == "awesome event"
|
65
65
|
end
|
66
66
|
end
|
data/spec/sample_crawler_spec.rb
CHANGED
@@ -6,17 +6,21 @@ describe SampleCrawler do
|
|
6
6
|
@sample_crawler = SampleCrawler.new
|
7
7
|
end
|
8
8
|
|
9
|
-
|
9
|
+
it 'should correctly assign event metadata' do
|
10
10
|
@sample_crawler.should_receive(:parse) do |args|
|
11
|
-
args
|
12
|
-
args.event["description"].selector.should == "css=#main-node-content"
|
13
|
-
args.event["date"].selector.should == DateTime.now.to_date
|
11
|
+
# args["event"]["description"].selector.should == "css=#main-node-content"
|
14
12
|
|
15
|
-
args
|
16
|
-
args.venue["address"].selector.should == "324 Dom Pedro II Street"
|
13
|
+
# args["venue"]["address"].selector.should == "324 Dom Pedro II Street"
|
17
14
|
|
18
|
-
|
19
|
-
|
15
|
+
it = args.iterators.first
|
16
|
+
it.selector.should == "css=div.title-agenda"
|
17
|
+
it["event"]["title"].selector.should == "xpath=."
|
18
|
+
it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
|
19
|
+
it["event"]["type"].selector.should == "xpath=.type"
|
20
|
+
it["venue"]["name"].selector.should == "xpath=."
|
21
|
+
|
22
|
+
args[:base_url].should == 'http://www.obaoba.com.br'
|
23
|
+
args[:list_page].should == '/porto-alegre/agenda'
|
20
24
|
end
|
21
25
|
|
22
26
|
@sample_crawler.crawl
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.1.6"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-14"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/wombat/crawler.rb",
|
33
33
|
"lib/wombat/iterator.rb",
|
34
34
|
"lib/wombat/metadata.rb",
|
35
|
+
"lib/wombat/node_selector.rb",
|
35
36
|
"lib/wombat/parser.rb",
|
36
37
|
"lib/wombat/property.rb",
|
37
38
|
"lib/wombat/property_container.rb",
|
@@ -66,6 +67,7 @@ Gem::Specification.new do |s|
|
|
66
67
|
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
67
68
|
s.add_development_dependency(%q<rspec>, [">= 0"])
|
68
69
|
s.add_development_dependency(%q<guard>, [">= 0"])
|
70
|
+
s.add_development_dependency(%q<growl_notify>, [">= 0"])
|
69
71
|
s.add_development_dependency(%q<guard-rspec>, [">= 0"])
|
70
72
|
s.add_development_dependency(%q<guard-bundler>, [">= 0"])
|
71
73
|
s.add_development_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
@@ -79,6 +81,7 @@ Gem::Specification.new do |s|
|
|
79
81
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
80
82
|
s.add_dependency(%q<rspec>, [">= 0"])
|
81
83
|
s.add_dependency(%q<guard>, [">= 0"])
|
84
|
+
s.add_dependency(%q<growl_notify>, [">= 0"])
|
82
85
|
s.add_dependency(%q<guard-rspec>, [">= 0"])
|
83
86
|
s.add_dependency(%q<guard-bundler>, [">= 0"])
|
84
87
|
s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
@@ -93,6 +96,7 @@ Gem::Specification.new do |s|
|
|
93
96
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
94
97
|
s.add_dependency(%q<rspec>, [">= 0"])
|
95
98
|
s.add_dependency(%q<guard>, [">= 0"])
|
99
|
+
s.add_dependency(%q<growl_notify>, [">= 0"])
|
96
100
|
s.add_dependency(%q<guard-rspec>, [">= 0"])
|
97
101
|
s.add_dependency(%q<guard-bundler>, [">= 0"])
|
98
102
|
s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70107446346260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70107446346260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: activesupport
|
27
|
-
requirement: &
|
27
|
+
requirement: &70107446345760 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70107446345760
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: bundler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70107446345060 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70107446345060
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &70107446344320 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70107446344320
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: yard
|
60
|
-
requirement: &
|
60
|
+
requirement: &70107446359960 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70107446359960
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70107446359480 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70107446359480
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rspec
|
82
|
-
requirement: &
|
82
|
+
requirement: &70107446358960 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70107446358960
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: guard
|
93
|
-
requirement: &
|
93
|
+
requirement: &70107446358460 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,21 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70107446358460
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: growl_notify
|
104
|
+
requirement: &70107446357840 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :development
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *70107446357840
|
102
113
|
- !ruby/object:Gem::Dependency
|
103
114
|
name: guard-rspec
|
104
|
-
requirement: &
|
115
|
+
requirement: &70107446357240 !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
107
118
|
- - ! '>='
|
@@ -109,10 +120,10 @@ dependencies:
|
|
109
120
|
version: '0'
|
110
121
|
type: :development
|
111
122
|
prerelease: false
|
112
|
-
version_requirements: *
|
123
|
+
version_requirements: *70107446357240
|
113
124
|
- !ruby/object:Gem::Dependency
|
114
125
|
name: guard-bundler
|
115
|
-
requirement: &
|
126
|
+
requirement: &70107446356580 !ruby/object:Gem::Requirement
|
116
127
|
none: false
|
117
128
|
requirements:
|
118
129
|
- - ! '>='
|
@@ -120,10 +131,10 @@ dependencies:
|
|
120
131
|
version: '0'
|
121
132
|
type: :development
|
122
133
|
prerelease: false
|
123
|
-
version_requirements: *
|
134
|
+
version_requirements: *70107446356580
|
124
135
|
- !ruby/object:Gem::Dependency
|
125
136
|
name: vcr
|
126
|
-
requirement: &
|
137
|
+
requirement: &70107446355780 !ruby/object:Gem::Requirement
|
127
138
|
none: false
|
128
139
|
requirements:
|
129
140
|
- - =
|
@@ -131,10 +142,10 @@ dependencies:
|
|
131
142
|
version: 2.0.0.rc1
|
132
143
|
type: :development
|
133
144
|
prerelease: false
|
134
|
-
version_requirements: *
|
145
|
+
version_requirements: *70107446355780
|
135
146
|
- !ruby/object:Gem::Dependency
|
136
147
|
name: fakeweb
|
137
|
-
requirement: &
|
148
|
+
requirement: &70107446355300 !ruby/object:Gem::Requirement
|
138
149
|
none: false
|
139
150
|
requirements:
|
140
151
|
- - ! '>='
|
@@ -142,7 +153,7 @@ dependencies:
|
|
142
153
|
version: '0'
|
143
154
|
type: :development
|
144
155
|
prerelease: false
|
145
|
-
version_requirements: *
|
156
|
+
version_requirements: *70107446355300
|
146
157
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
147
158
|
email: felipe.lima@gmail.com
|
148
159
|
executables: []
|
@@ -166,6 +177,7 @@ files:
|
|
166
177
|
- lib/wombat/crawler.rb
|
167
178
|
- lib/wombat/iterator.rb
|
168
179
|
- lib/wombat/metadata.rb
|
180
|
+
- lib/wombat/node_selector.rb
|
169
181
|
- lib/wombat/parser.rb
|
170
182
|
- lib/wombat/property.rb
|
171
183
|
- lib/wombat/property_container.rb
|