wombat 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +4 -0
- data/README.md +6 -1
- data/VERSION +1 -1
- data/lib/wombat/crawler.rb +1 -6
- data/lib/wombat/iterator.rb +1 -0
- data/lib/wombat/metadata.rb +0 -12
- data/lib/wombat/node_selector.rb +10 -0
- data/lib/wombat/parser.rb +15 -5
- data/lib/wombat/property.rb +4 -0
- data/lib/wombat/property_container.rb +31 -19
- data/lib/wombat/property_locator.rb +7 -14
- data/spec/crawler_spec.rb +14 -9
- data/spec/helpers/sample_crawler.rb +14 -14
- data/spec/integration/integration_spec.rb +5 -0
- data/spec/parser_spec.rb +33 -7
- data/spec/property_container_spec.rb +15 -3
- data/spec/property_locator_spec.rb +9 -9
- data/spec/sample_crawler_spec.rb +12 -8
- data/wombat.gemspec +6 -2
- metadata +38 -26
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -8,6 +8,8 @@ GEM
|
|
8
8
|
fakeweb (1.3.0)
|
9
9
|
ffi (1.0.11)
|
10
10
|
git (1.2.5)
|
11
|
+
growl_notify (0.0.3)
|
12
|
+
rb-appscript
|
11
13
|
guard (0.9.4)
|
12
14
|
ffi (>= 0.5.0)
|
13
15
|
thor (~> 0.14.6)
|
@@ -32,6 +34,7 @@ GEM
|
|
32
34
|
nokogiri (1.5.0)
|
33
35
|
ntlm-http (0.1.1)
|
34
36
|
rake (0.9.2.2)
|
37
|
+
rb-appscript (0.6.1)
|
35
38
|
rspec (2.7.0)
|
36
39
|
rspec-core (~> 2.7.0)
|
37
40
|
rspec-expectations (~> 2.7.0)
|
@@ -56,6 +59,7 @@ DEPENDENCIES
|
|
56
59
|
activesupport
|
57
60
|
bundler
|
58
61
|
fakeweb
|
62
|
+
growl_notify
|
59
63
|
guard
|
60
64
|
guard-bundler
|
61
65
|
guard-rspec
|
data/README.md
CHANGED
@@ -8,6 +8,8 @@ Generic Web crawler with a DSL that parses structured data from web pages.
|
|
8
8
|
|
9
9
|
``gem install wombat``
|
10
10
|
|
11
|
+
Creating a crawler:
|
12
|
+
|
11
13
|
```ruby
|
12
14
|
|
13
15
|
# => github_crawler.rb
|
@@ -37,8 +39,11 @@ class GithubCrawler
|
|
37
39
|
end
|
38
40
|
end
|
39
41
|
```
|
42
|
+
|
43
|
+
Running it:
|
44
|
+
|
40
45
|
```ruby
|
41
|
-
irb>
|
46
|
+
irb> GithubCrawler.new.crawl
|
42
47
|
=>
|
43
48
|
{
|
44
49
|
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.6
|
1
|
+
0.2.0
|
data/lib/wombat/crawler.rb
CHANGED
@@ -21,12 +21,7 @@ module Wombat
|
|
21
21
|
|
22
22
|
module ClassMethods
|
23
23
|
def method_missing method, *args, &block
|
24
|
-
|
25
|
-
metadata["#{method.to_s}"] = PropertyContainer.new unless metadata["#{method.to_s}"]
|
26
|
-
block.call(metadata["#{method.to_s}"])
|
27
|
-
else
|
28
|
-
metadata.send method, *args, &block
|
29
|
-
end
|
24
|
+
metadata.send method, *args, &block
|
30
25
|
end
|
31
26
|
|
32
27
|
def for_each selector, &block
|
data/lib/wombat/iterator.rb
CHANGED
data/lib/wombat/metadata.rb
CHANGED
@@ -4,12 +4,6 @@ require 'wombat/iterator'
|
|
4
4
|
|
5
5
|
module Wombat
|
6
6
|
class Metadata < PropertyContainer
|
7
|
-
attr_accessor :iterators
|
8
|
-
|
9
|
-
def initialize
|
10
|
-
@iterators = []
|
11
|
-
end
|
12
|
-
|
13
7
|
def base_url url
|
14
8
|
self[:base_url] = url
|
15
9
|
end
|
@@ -17,11 +11,5 @@ module Wombat
|
|
17
11
|
def list_page url
|
18
12
|
self[:list_page] = url
|
19
13
|
end
|
20
|
-
|
21
|
-
def for_each selector
|
22
|
-
Iterator.new(selector).tap do |i|
|
23
|
-
iterators << i
|
24
|
-
end
|
25
|
-
end
|
26
14
|
end
|
27
15
|
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Wombat
|
2
|
+
module NodeSelector
|
3
|
+
def select_nodes selector, namespaces = nil
|
4
|
+
return [selector.to_s] if selector.is_a? Symbol
|
5
|
+
return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
|
6
|
+
return context.css selector[4..-1] if selector.start_with? "css="
|
7
|
+
nil
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -12,14 +12,24 @@ module Wombat
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def parse metadata
|
15
|
-
|
15
|
+
self.context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
|
16
|
+
original_context = self.context
|
16
17
|
|
17
|
-
|
18
|
+
metadata.iterators.each do |it|
|
19
|
+
select_nodes(it.selector).each do |n|
|
20
|
+
self.context = n
|
21
|
+
it.all_properties.each do |p|
|
22
|
+
p.result ||= []
|
23
|
+
p.result << locate_first(p)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
18
27
|
|
19
|
-
|
28
|
+
self.context = original_context
|
20
29
|
|
21
|
-
|
22
|
-
|
30
|
+
metadata.all_properties.each do |p|
|
31
|
+
result = locate_first p
|
32
|
+
p.result = p.callback ? p.callback.call(result) : result
|
23
33
|
end
|
24
34
|
|
25
35
|
metadata.flatten
|
data/lib/wombat/property.rb
CHANGED
@@ -2,42 +2,54 @@
|
|
2
2
|
|
3
3
|
module Wombat
|
4
4
|
class PropertyContainer < Hash
|
5
|
-
|
6
|
-
self[property.name] = property
|
7
|
-
end
|
5
|
+
attr_accessor :iterators
|
8
6
|
|
9
|
-
def
|
10
|
-
|
7
|
+
def initialize
|
8
|
+
@iterators = []
|
11
9
|
end
|
12
10
|
|
13
11
|
def method_missing method, *args, &block
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
12
|
+
if args.empty? && block
|
13
|
+
self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
|
14
|
+
block.call(self["#{method.to_s}"])
|
15
|
+
else
|
16
|
+
self[method.to_s] = Property.new(
|
17
|
+
name: method.to_s,
|
18
|
+
selector: args.first,
|
19
|
+
format: args[1],
|
20
|
+
namespaces: args[2],
|
21
|
+
callback: block)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_ary
|
20
26
|
end
|
21
27
|
|
22
28
|
def all_properties
|
23
29
|
values.flat_map { |v|
|
24
|
-
v.kind_of?
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
30
|
+
if v.kind_of? PropertyContainer
|
31
|
+
v.all_properties
|
32
|
+
elsif v.kind_of? Property
|
33
|
+
v
|
34
|
+
else
|
35
|
+
nil
|
36
|
+
end
|
29
37
|
}.compact
|
30
38
|
end
|
31
39
|
|
32
40
|
def flatten
|
33
41
|
Hash.new.tap do |h|
|
34
42
|
keys.map do |k|
|
35
|
-
if self[k].kind_of?(PropertyContainer)
|
43
|
+
if self[k].kind_of?(PropertyContainer) || self[k].kind_of?(Property)
|
36
44
|
h[k] = self[k].flatten
|
37
|
-
elsif self[k].kind_of?(Property)
|
38
|
-
h[k] = self[k].result
|
39
45
|
end
|
40
46
|
end
|
47
|
+
end.merge(iterators.inject({}) {|memo, i| memo.merge(i.flatten) })
|
48
|
+
end
|
49
|
+
|
50
|
+
def for_each selector
|
51
|
+
Iterator.new(selector).tap do |i|
|
52
|
+
iterators << i
|
41
53
|
end
|
42
54
|
end
|
43
55
|
end
|
@@ -1,25 +1,18 @@
|
|
1
1
|
#coding: utf-8
|
2
|
+
require 'wombat/node_selector'
|
2
3
|
|
3
4
|
module Wombat
|
4
5
|
module PropertyLocator
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
include NodeSelector
|
7
|
+
|
8
|
+
def locate_first property
|
9
|
+
locate(property).first
|
9
10
|
end
|
10
11
|
|
11
|
-
|
12
|
-
|
13
|
-
result = locate_selector(property.selector, property.namespaces).to_a
|
12
|
+
def locate property
|
13
|
+
result = select_nodes(property.selector, property.namespaces).to_a
|
14
14
|
result.map! {|r| r.inner_html.strip } if property.format == :html
|
15
15
|
result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
|
16
16
|
end
|
17
|
-
|
18
|
-
def locate_selector selector, namespaces = nil
|
19
|
-
return [selector.to_s] if selector.is_a? Symbol
|
20
|
-
return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
|
21
|
-
return context.css selector[4..-1] if selector.start_with? "css="
|
22
|
-
nil
|
23
|
-
end
|
24
17
|
end
|
25
18
|
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -33,10 +33,10 @@ describe Wombat::Crawler do
|
|
33
33
|
@crawler.location { |v| v.latitude -50.2323 }
|
34
34
|
|
35
35
|
@crawler_instance.should_receive(:parse) do |arg|
|
36
|
-
arg["event"]
|
37
|
-
arg["event"]
|
38
|
-
arg["venue"]
|
39
|
-
arg["location"]
|
36
|
+
arg["event"]["title"].selector.should == "Fulltronic Dezembro"
|
37
|
+
arg["event"]["time"].selector.to_s.should == time.to_s
|
38
|
+
arg["venue"]["name"].selector.should == "Scooba"
|
39
|
+
arg["location"]["latitude"].selector.should == -50.2323
|
40
40
|
end
|
41
41
|
|
42
42
|
@crawler_instance.crawl
|
@@ -48,11 +48,11 @@ describe Wombat::Crawler do
|
|
48
48
|
another_crawler_instance = another_crawler.new
|
49
49
|
|
50
50
|
another_crawler.event { |e| e.title 'Ibiza' }
|
51
|
-
another_crawler_instance.should_receive(:parse) { |arg| arg["event"]
|
51
|
+
another_crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Ibiza" }
|
52
52
|
another_crawler_instance.crawl
|
53
53
|
|
54
54
|
@crawler.event { |e| e.title 'Fulltronic Dezembro' }
|
55
|
-
@crawler_instance.should_receive(:parse) { |arg| arg["event"]
|
55
|
+
@crawler_instance.should_receive(:parse) { |arg| arg["event"]["title"].selector.should == "Fulltronic Dezembro" }
|
56
56
|
@crawler_instance.crawl
|
57
57
|
end
|
58
58
|
|
@@ -96,12 +96,17 @@ describe Wombat::Crawler do
|
|
96
96
|
@crawler.for_each "css=.element" do
|
97
97
|
title "css=.title"
|
98
98
|
body "css=.body"
|
99
|
+
event do |e|
|
100
|
+
e.all "yeah"
|
101
|
+
end
|
99
102
|
end
|
100
103
|
|
101
104
|
@crawler_instance.should_receive(:parse) do |arg|
|
102
|
-
arg.iterators.first
|
103
|
-
|
104
|
-
|
105
|
+
it = arg.iterators.first
|
106
|
+
it.selector.should == "css=.element"
|
107
|
+
it["title"].selector.should == "css=.title"
|
108
|
+
it["body"].selector.should == "css=.body"
|
109
|
+
it["event"]["all"].selector.should == "yeah"
|
105
110
|
end
|
106
111
|
|
107
112
|
@crawler_instance.crawl
|
@@ -13,24 +13,24 @@ class SampleCrawler
|
|
13
13
|
e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
|
14
14
|
DateTime.strptime(d, '%d/%m')
|
15
15
|
end
|
16
|
-
e.type("xpath=.") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
|
16
|
+
e.type("xpath=.type") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
|
17
17
|
end
|
18
18
|
|
19
19
|
venue do |v|
|
20
20
|
v.name("xpath=.") { |n| name.split(" | ")[2].strip }
|
21
21
|
end
|
22
|
+
|
23
|
+
# follow_links "xpath=.//a[1]/@href" do
|
24
|
+
# event { |e| e.description "css=#main-node-content", :html }
|
25
|
+
# venue do |v|
|
26
|
+
# v.phone "css=span.tel .value"
|
27
|
+
# v.image "xpath=//div[@id='article-image']/div/img/@src"
|
28
|
+
# end
|
22
29
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
location do |l|
|
31
|
-
l.city "css=span.locality"
|
32
|
-
l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
30
|
+
# location do |l|
|
31
|
+
# l.city "css=span.locality"
|
32
|
+
# l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
+
# end
|
34
|
+
# end
|
35
|
+
end
|
36
36
|
end
|
@@ -15,6 +15,10 @@ describe 'basic crawler setup' do
|
|
15
15
|
s.twitter "css=.ctn-bar li.last"
|
16
16
|
end
|
17
17
|
|
18
|
+
crawler.for_each "css=.ctn-links" do
|
19
|
+
menu "css=a"
|
20
|
+
end
|
21
|
+
|
18
22
|
crawler.subheader "css=h2.ttl-dynamic" do |h|
|
19
23
|
h.gsub("London", "Londres")
|
20
24
|
end
|
@@ -24,6 +28,7 @@ describe 'basic crawler setup' do
|
|
24
28
|
results = crawler_instance.crawl
|
25
29
|
|
26
30
|
results["search"].should == "Buscar"
|
31
|
+
results["menu"].should =~ ["Agenda", "Brasileiro", "Brasil", "Bolsas", "Cinema", "Galerias de Fotos", "Beleza", "Esportes", "Assine o RSS"]
|
27
32
|
results["subheader"].should == "Londres 2012"
|
28
33
|
results["social"]["twitter"].should == "Verão"
|
29
34
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -24,7 +24,7 @@ describe Wombat::Parser do
|
|
24
24
|
fake_parser = double :parser
|
25
25
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
26
26
|
@parser.mechanize.stub(:get).and_return fake_document
|
27
|
-
@parser.
|
27
|
+
@parser.should_not_receive :locate
|
28
28
|
@parser.parse @metadata
|
29
29
|
end
|
30
30
|
|
@@ -42,7 +42,7 @@ describe Wombat::Parser do
|
|
42
42
|
|
43
43
|
@parser.mechanize.stub(:get).and_return fake_document
|
44
44
|
@metadata.stub(:all_properties).and_return [property]
|
45
|
-
@parser.should_receive(:
|
45
|
+
@parser.should_receive(:locate_first).with(property)
|
46
46
|
|
47
47
|
@parser.parse @metadata
|
48
48
|
|
@@ -59,21 +59,20 @@ describe Wombat::Parser do
|
|
59
59
|
p.should == "blah"
|
60
60
|
}
|
61
61
|
|
62
|
-
property.should_receive(:result).and_return("blah")
|
63
62
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
64
63
|
property.should_receive(:callback).twice.and_return(block)
|
65
64
|
property.should_receive(:result=).with(true)
|
66
65
|
|
67
66
|
@parser.mechanize.stub(:get).and_return fake_document
|
68
67
|
@metadata.stub(:all_properties).and_return [property]
|
69
|
-
@parser.should_receive(:
|
68
|
+
@parser.should_receive(:locate_first).with(property).and_return("blah")
|
70
69
|
|
71
70
|
@parser.parse @metadata
|
72
71
|
|
73
72
|
block_called.should be_true
|
74
73
|
end
|
75
74
|
|
76
|
-
it 'should return
|
75
|
+
it 'should return hash with requested properties' do
|
77
76
|
hash = double :results
|
78
77
|
fake_parser = double :parser
|
79
78
|
fake_document = double :document
|
@@ -81,7 +80,34 @@ describe Wombat::Parser do
|
|
81
80
|
fake_document.should_receive(:parser).and_return fake_parser
|
82
81
|
@parser.mechanize.stub(:get).and_return fake_document
|
83
82
|
@metadata.should_receive(:flatten).and_return hash
|
84
|
-
|
83
|
+
|
85
84
|
@parser.parse(@metadata).should == hash
|
86
85
|
end
|
87
|
-
|
86
|
+
|
87
|
+
it 'should iterate in for_each properties' do
|
88
|
+
fake_parser = double :parser
|
89
|
+
fake_document = double :document
|
90
|
+
c1 = double :context
|
91
|
+
c2 = double :context
|
92
|
+
it = Wombat::Iterator.new "it_selector"
|
93
|
+
it.prop_1 "some_selector"
|
94
|
+
it.prop_2 "another_selector"
|
95
|
+
|
96
|
+
@parser.should_receive(:context=).ordered
|
97
|
+
@metadata.should_receive(:iterators).and_return [it]
|
98
|
+
@metadata.should_receive(:flatten)
|
99
|
+
fake_document.should_receive(:parser).and_return(fake_parser)
|
100
|
+
it['prop_1'].should_receive(:result).exactly(4).times.and_return([])
|
101
|
+
it['prop_2'].should_receive(:result).exactly(4).times.and_return([])
|
102
|
+
@parser.mechanize.stub(:get).and_return fake_document
|
103
|
+
@parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
|
104
|
+
@parser.should_receive(:context=).with(c1).ordered
|
105
|
+
@parser.should_receive(:context=).with(c2).ordered
|
106
|
+
@parser.should_receive(:context=).ordered
|
107
|
+
@parser.should_receive(:locate_first).with(it['prop_1']).twice
|
108
|
+
@parser.should_receive(:locate_first).with(it['prop_2']).twice
|
109
|
+
@parser.stub(:locate)
|
110
|
+
|
111
|
+
@parser.parse(@metadata)
|
112
|
+
end
|
113
|
+
end
|
@@ -5,17 +5,24 @@ describe Wombat::PropertyContainer do
|
|
5
5
|
@metadata = Wombat::PropertyContainer.new
|
6
6
|
end
|
7
7
|
|
8
|
-
it 'should return an array with all the metadata properties' do
|
8
|
+
it 'should return an array with all the metadata properties excluding iterators' do
|
9
9
|
@metadata["event"] = Wombat::PropertyContainer.new
|
10
10
|
@metadata["venue"] = Wombat::PropertyContainer.new
|
11
11
|
@metadata.another_property "/some/selector", :text
|
12
12
|
@metadata["event"]["something"] = Wombat::PropertyContainer.new
|
13
13
|
@metadata["event"]["something"].else "Wohooo"
|
14
14
|
@metadata["venue"].awesome "whooea"
|
15
|
+
it = Wombat::Iterator.new "it_selector"
|
16
|
+
it.felipe "lima"
|
17
|
+
@metadata.iterators << it
|
15
18
|
|
16
19
|
all_propes = @metadata.all_properties
|
17
20
|
|
18
|
-
all_propes.should =~ [
|
21
|
+
all_propes.should =~ [
|
22
|
+
@metadata["another_property"],
|
23
|
+
@metadata["event"]["something"]["else"],
|
24
|
+
@metadata["venue"]["awesome"]
|
25
|
+
]
|
19
26
|
end
|
20
27
|
|
21
28
|
it 'should be able to change properties via all_properties' do
|
@@ -24,7 +31,7 @@ describe Wombat::PropertyContainer do
|
|
24
31
|
@metadata["another_property"].selector.should == "abc"
|
25
32
|
end
|
26
33
|
|
27
|
-
it 'should return metadata in plain hash format' do
|
34
|
+
it 'should return metadata in plain hash format including iterators' do
|
28
35
|
@metadata.title "/some/selector"
|
29
36
|
@metadata["title"].result = "Gogobot Inc."
|
30
37
|
@metadata["holder"] = Wombat::PropertyContainer.new
|
@@ -33,6 +40,10 @@ describe Wombat::PropertyContainer do
|
|
33
40
|
@metadata["holder"]["subheader"] = Wombat::PropertyContainer.new
|
34
41
|
@metadata["holder"]["subheader"].section "/blah"
|
35
42
|
@metadata["holder"]["subheader"]["section"].result = "Lorem Ipsum"
|
43
|
+
it = Wombat::Iterator.new "it_selector"
|
44
|
+
it.felipe "lima"
|
45
|
+
it["felipe"].result = ["correa", "de souza", "lima"]
|
46
|
+
@metadata.iterators = [it]
|
36
47
|
@metadata.footer("another thing", :html) { |a| true }
|
37
48
|
@metadata["footer"].result = "bla bla bla"
|
38
49
|
|
@@ -44,6 +55,7 @@ describe Wombat::PropertyContainer do
|
|
44
55
|
"section" => "Lorem Ipsum"
|
45
56
|
}
|
46
57
|
},
|
58
|
+
"felipe" => ["correa", "de souza", "lima"],
|
47
59
|
"footer" => "bla bla bla"
|
48
60
|
}
|
49
61
|
end
|
@@ -28,12 +28,12 @@ describe Wombat::PropertyLocator do
|
|
28
28
|
|
29
29
|
@locator_instance.stub(:context).and_return context
|
30
30
|
|
31
|
-
@
|
31
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate_first p }
|
32
32
|
|
33
|
-
@metadata
|
34
|
-
@metadata["event"]
|
35
|
-
@metadata["venue"]
|
36
|
-
@metadata["location"]
|
33
|
+
@metadata["blah"].result.should == "abc"
|
34
|
+
@metadata["event"]["data1"].result.should == "Something cool"
|
35
|
+
@metadata["venue"]["data2"].result.should == "farms"
|
36
|
+
@metadata["location"]["data3"].result.should == "Another stuff"
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'should support properties with html format' do
|
@@ -47,9 +47,9 @@ describe Wombat::PropertyLocator do
|
|
47
47
|
|
48
48
|
@metadata["event"].another_info "xpath=/anotherData", :html
|
49
49
|
|
50
|
-
@
|
50
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate_first p }
|
51
51
|
|
52
|
-
@metadata["event"]
|
52
|
+
@metadata["event"]["another_info"].result.should == "some another info"
|
53
53
|
end
|
54
54
|
|
55
55
|
it 'should trim property contents and use namespaces if present' do
|
@@ -59,8 +59,8 @@ describe Wombat::PropertyLocator do
|
|
59
59
|
@locator_instance.stub(:context).and_return context
|
60
60
|
@metadata["event"].description "xpath=/event/some/description", :text, "blah"
|
61
61
|
|
62
|
-
@
|
62
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate_first p }
|
63
63
|
|
64
|
-
@metadata["event"]
|
64
|
+
@metadata["event"]["description"].result.should == "awesome event"
|
65
65
|
end
|
66
66
|
end
|
data/spec/sample_crawler_spec.rb
CHANGED
@@ -6,17 +6,21 @@ describe SampleCrawler do
|
|
6
6
|
@sample_crawler = SampleCrawler.new
|
7
7
|
end
|
8
8
|
|
9
|
-
|
9
|
+
it 'should correctly assign event metadata' do
|
10
10
|
@sample_crawler.should_receive(:parse) do |args|
|
11
|
-
args
|
12
|
-
args.event["description"].selector.should == "css=#main-node-content"
|
13
|
-
args.event["date"].selector.should == DateTime.now.to_date
|
11
|
+
# args["event"]["description"].selector.should == "css=#main-node-content"
|
14
12
|
|
15
|
-
args
|
16
|
-
args.venue["address"].selector.should == "324 Dom Pedro II Street"
|
13
|
+
# args["venue"]["address"].selector.should == "324 Dom Pedro II Street"
|
17
14
|
|
18
|
-
|
19
|
-
|
15
|
+
it = args.iterators.first
|
16
|
+
it.selector.should == "css=div.title-agenda"
|
17
|
+
it["event"]["title"].selector.should == "xpath=."
|
18
|
+
it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
|
19
|
+
it["event"]["type"].selector.should == "xpath=.type"
|
20
|
+
it["venue"]["name"].selector.should == "xpath=."
|
21
|
+
|
22
|
+
args[:base_url].should == 'http://www.obaoba.com.br'
|
23
|
+
args[:list_page].should == '/porto-alegre/agenda'
|
20
24
|
end
|
21
25
|
|
22
26
|
@sample_crawler.crawl
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.1.6"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-14"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/wombat/crawler.rb",
|
33
33
|
"lib/wombat/iterator.rb",
|
34
34
|
"lib/wombat/metadata.rb",
|
35
|
+
"lib/wombat/node_selector.rb",
|
35
36
|
"lib/wombat/parser.rb",
|
36
37
|
"lib/wombat/property.rb",
|
37
38
|
"lib/wombat/property_container.rb",
|
@@ -66,6 +67,7 @@ Gem::Specification.new do |s|
|
|
66
67
|
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
67
68
|
s.add_development_dependency(%q<rspec>, [">= 0"])
|
68
69
|
s.add_development_dependency(%q<guard>, [">= 0"])
|
70
|
+
s.add_development_dependency(%q<growl_notify>, [">= 0"])
|
69
71
|
s.add_development_dependency(%q<guard-rspec>, [">= 0"])
|
70
72
|
s.add_development_dependency(%q<guard-bundler>, [">= 0"])
|
71
73
|
s.add_development_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
@@ -79,6 +81,7 @@ Gem::Specification.new do |s|
|
|
79
81
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
80
82
|
s.add_dependency(%q<rspec>, [">= 0"])
|
81
83
|
s.add_dependency(%q<guard>, [">= 0"])
|
84
|
+
s.add_dependency(%q<growl_notify>, [">= 0"])
|
82
85
|
s.add_dependency(%q<guard-rspec>, [">= 0"])
|
83
86
|
s.add_dependency(%q<guard-bundler>, [">= 0"])
|
84
87
|
s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
@@ -93,6 +96,7 @@ Gem::Specification.new do |s|
|
|
93
96
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
94
97
|
s.add_dependency(%q<rspec>, [">= 0"])
|
95
98
|
s.add_dependency(%q<guard>, [">= 0"])
|
99
|
+
s.add_dependency(%q<growl_notify>, [">= 0"])
|
96
100
|
s.add_dependency(%q<guard-rspec>, [">= 0"])
|
97
101
|
s.add_dependency(%q<guard-bundler>, [">= 0"])
|
98
102
|
s.add_dependency(%q<vcr>, ["= 2.0.0.rc1"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70107446346260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70107446346260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: activesupport
|
27
|
-
requirement: &
|
27
|
+
requirement: &70107446345760 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70107446345760
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: bundler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70107446345060 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70107446345060
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &70107446344320 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70107446344320
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: yard
|
60
|
-
requirement: &
|
60
|
+
requirement: &70107446359960 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70107446359960
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70107446359480 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70107446359480
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rspec
|
82
|
-
requirement: &
|
82
|
+
requirement: &70107446358960 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70107446358960
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: guard
|
93
|
-
requirement: &
|
93
|
+
requirement: &70107446358460 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,21 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70107446358460
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: growl_notify
|
104
|
+
requirement: &70107446357840 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :development
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *70107446357840
|
102
113
|
- !ruby/object:Gem::Dependency
|
103
114
|
name: guard-rspec
|
104
|
-
requirement: &
|
115
|
+
requirement: &70107446357240 !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
107
118
|
- - ! '>='
|
@@ -109,10 +120,10 @@ dependencies:
|
|
109
120
|
version: '0'
|
110
121
|
type: :development
|
111
122
|
prerelease: false
|
112
|
-
version_requirements: *
|
123
|
+
version_requirements: *70107446357240
|
113
124
|
- !ruby/object:Gem::Dependency
|
114
125
|
name: guard-bundler
|
115
|
-
requirement: &
|
126
|
+
requirement: &70107446356580 !ruby/object:Gem::Requirement
|
116
127
|
none: false
|
117
128
|
requirements:
|
118
129
|
- - ! '>='
|
@@ -120,10 +131,10 @@ dependencies:
|
|
120
131
|
version: '0'
|
121
132
|
type: :development
|
122
133
|
prerelease: false
|
123
|
-
version_requirements: *
|
134
|
+
version_requirements: *70107446356580
|
124
135
|
- !ruby/object:Gem::Dependency
|
125
136
|
name: vcr
|
126
|
-
requirement: &
|
137
|
+
requirement: &70107446355780 !ruby/object:Gem::Requirement
|
127
138
|
none: false
|
128
139
|
requirements:
|
129
140
|
- - =
|
@@ -131,10 +142,10 @@ dependencies:
|
|
131
142
|
version: 2.0.0.rc1
|
132
143
|
type: :development
|
133
144
|
prerelease: false
|
134
|
-
version_requirements: *
|
145
|
+
version_requirements: *70107446355780
|
135
146
|
- !ruby/object:Gem::Dependency
|
136
147
|
name: fakeweb
|
137
|
-
requirement: &
|
148
|
+
requirement: &70107446355300 !ruby/object:Gem::Requirement
|
138
149
|
none: false
|
139
150
|
requirements:
|
140
151
|
- - ! '>='
|
@@ -142,7 +153,7 @@ dependencies:
|
|
142
153
|
version: '0'
|
143
154
|
type: :development
|
144
155
|
prerelease: false
|
145
|
-
version_requirements: *
|
156
|
+
version_requirements: *70107446355300
|
146
157
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
147
158
|
email: felipe.lima@gmail.com
|
148
159
|
executables: []
|
@@ -166,6 +177,7 @@ files:
|
|
166
177
|
- lib/wombat/crawler.rb
|
167
178
|
- lib/wombat/iterator.rb
|
168
179
|
- lib/wombat/metadata.rb
|
180
|
+
- lib/wombat/node_selector.rb
|
169
181
|
- lib/wombat/parser.rb
|
170
182
|
- lib/wombat/property.rb
|
171
183
|
- lib/wombat/property_container.rb
|