wombat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +1 -4
- data/Gemfile +4 -0
- data/Gemfile.lock +11 -0
- data/README.md +22 -0
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/basic_crawler_page.yml +600 -0
- data/lib/wombat/crawler.rb +20 -17
- data/lib/wombat/metadata.rb +7 -17
- data/lib/wombat/parser.rb +6 -4
- data/lib/wombat/property_container.rb +44 -0
- data/lib/wombat/property_locator.rb +3 -3
- data/spec/crawler_spec.rb +35 -13
- data/spec/helpers/sample_crawler.rb +27 -8
- data/spec/integration/integration_spec.rb +25 -0
- data/spec/metadata_spec.rb +7 -14
- data/spec/parser_spec.rb +20 -54
- data/spec/property_container_spec.rb +49 -0
- data/spec/property_locator_spec.rb +21 -11
- data/spec/sample_crawler_spec.rb +10 -8
- data/spec/spec_helper.rb +7 -1
- data/wombat.gemspec +20 -6
- metadata +68 -22
- data/README.rdoc +0 -20
- data/lib/wombat/properties.rb +0 -31
- data/spec/properties_spec.rb +0 -31
data/lib/wombat/crawler.rb
CHANGED
@@ -1,49 +1,52 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require 'wombat/properties'
|
3
2
|
require 'wombat/metadata'
|
3
|
+
require 'wombat/property'
|
4
4
|
require 'wombat/parser'
|
5
5
|
require 'active_support'
|
6
6
|
require 'date'
|
7
7
|
|
8
8
|
module Wombat
|
9
9
|
module Crawler
|
10
|
+
include Parser
|
10
11
|
extend ActiveSupport::Concern
|
11
12
|
|
12
13
|
module InstanceMethods
|
13
14
|
def crawl
|
14
|
-
|
15
|
+
parse self.class.send(:metadata)
|
15
16
|
end
|
16
17
|
|
17
18
|
def supports_city?
|
18
19
|
end
|
19
|
-
|
20
|
-
def parser
|
21
|
-
@parser ||= Parser.new
|
22
|
-
end
|
23
|
-
|
24
|
-
def parser= parser
|
25
|
-
@parser = parser
|
26
|
-
end
|
27
20
|
end
|
28
21
|
|
29
22
|
module ClassMethods
|
30
|
-
[:event, :venue, :location].each do |m|
|
31
|
-
define_method(m) do |&block|
|
32
|
-
block.call(metadata["#{m.to_s}_props".to_sym]) if block
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
23
|
def method_missing method, *args, &block
|
37
|
-
|
24
|
+
if args.empty? && block
|
25
|
+
metadata["#{method.to_s}"] = PropertyContainer.new unless metadata["#{method.to_s}"]
|
26
|
+
block.call(metadata["#{method.to_s}"])
|
27
|
+
else
|
28
|
+
metadata.send method, *args, &block
|
29
|
+
end
|
38
30
|
end
|
39
31
|
|
40
32
|
def with_details_page
|
41
33
|
yield metadata if block_given?
|
42
34
|
end
|
43
35
|
|
36
|
+
def for_each selector
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
def follow_links selector
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
44
|
def supported_cities
|
45
45
|
end
|
46
46
|
|
47
|
+
def to_ary
|
48
|
+
end
|
49
|
+
|
47
50
|
private
|
48
51
|
def metadata
|
49
52
|
@metadata ||= Metadata.new
|
data/lib/wombat/metadata.rb
CHANGED
@@ -1,24 +1,14 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
|
3
|
-
class Metadata < Hash
|
4
|
-
def initialize
|
5
|
-
self[:event_props] = Properties.new
|
6
|
-
self[:venue_props] = Properties.new
|
7
|
-
self[:location_props] = Properties.new
|
8
|
-
end
|
2
|
+
require 'wombat/property_container'
|
9
3
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
4
|
+
module Wombat
|
5
|
+
class Metadata < PropertyContainer
|
6
|
+
def base_url url
|
7
|
+
self[:base_url] = url
|
14
8
|
end
|
15
9
|
|
16
|
-
def
|
17
|
-
|
18
|
-
self[method] = args.first
|
19
|
-
else
|
20
|
-
self[method]
|
21
|
-
end
|
10
|
+
def list_page url
|
11
|
+
self[:list_page] = url
|
22
12
|
end
|
23
13
|
end
|
24
14
|
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
#coding: utf-8
|
1
|
+
#coding: utf-8
|
2
2
|
require 'wombat/property_locator'
|
3
3
|
require 'mechanize'
|
4
4
|
|
5
5
|
module Wombat
|
6
|
-
|
6
|
+
module Parser
|
7
7
|
include PropertyLocator
|
8
8
|
attr_accessor :mechanize, :context
|
9
9
|
|
@@ -12,13 +12,15 @@ module Wombat
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def parse metadata
|
15
|
-
@context = @mechanize.get("#{metadata
|
15
|
+
@context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
|
16
16
|
|
17
17
|
locate metadata
|
18
18
|
|
19
|
-
|
19
|
+
metadata.all_properties.each do |p|
|
20
20
|
p.callback.call(p.result) if p.callback
|
21
21
|
end
|
22
|
+
|
23
|
+
metadata.flatten
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
|
4
|
+
class PropertyContainer < Hash
|
5
|
+
def add_property property
|
6
|
+
self[property.name] = property
|
7
|
+
end
|
8
|
+
|
9
|
+
def get_property name
|
10
|
+
self[name]
|
11
|
+
end
|
12
|
+
|
13
|
+
def method_missing method, *args, &block
|
14
|
+
self[method.to_s] = Property.new(
|
15
|
+
name: method.to_s,
|
16
|
+
selector: args.first,
|
17
|
+
format: args[1],
|
18
|
+
namespaces: args[2],
|
19
|
+
callback: block)
|
20
|
+
end
|
21
|
+
|
22
|
+
def all_properties
|
23
|
+
values.flat_map { |v|
|
24
|
+
v.kind_of?(PropertyContainer) \
|
25
|
+
? v.values \
|
26
|
+
: v.kind_of?(Property) \
|
27
|
+
? v \
|
28
|
+
: nil
|
29
|
+
}.compact
|
30
|
+
end
|
31
|
+
|
32
|
+
def flatten
|
33
|
+
Hash.new.tap do |h|
|
34
|
+
keys.map do |k|
|
35
|
+
if self[k].kind_of?(PropertyContainer)
|
36
|
+
h[k] = self[k].flatten
|
37
|
+
elsif self[k].kind_of?(Property)
|
38
|
+
h[k] = self[k].result
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module Wombat
|
4
4
|
module PropertyLocator
|
5
5
|
def locate metadata
|
6
|
-
|
6
|
+
metadata.all_properties.each do |p|
|
7
7
|
p.result = locate_property(p).first
|
8
8
|
end
|
9
9
|
end
|
@@ -11,8 +11,8 @@ module Wombat
|
|
11
11
|
private
|
12
12
|
def locate_property property
|
13
13
|
result = locate_selector(property.selector, property.namespaces)
|
14
|
-
result.map! {|r| r.inner_html } if property.format == :html
|
15
|
-
result.map {|r| r.
|
14
|
+
result.map! {|r| r.inner_html.strip } if property.format == :html
|
15
|
+
result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
|
16
16
|
end
|
17
17
|
|
18
18
|
def locate_selector selector, namespaces = nil
|
data/spec/crawler_spec.rb
CHANGED
@@ -3,10 +3,8 @@ require 'spec_helper'
|
|
3
3
|
describe Wombat::Crawler do
|
4
4
|
before(:each) do
|
5
5
|
@crawler = Class.new
|
6
|
-
@parser = Wombat::Parser.new
|
7
6
|
@crawler.send(:include, Wombat::Crawler)
|
8
7
|
@crawler_instance = @crawler.new
|
9
|
-
@crawler_instance.parser = @parser
|
10
8
|
end
|
11
9
|
|
12
10
|
it 'should call the provided block' do
|
@@ -34,35 +32,59 @@ describe Wombat::Crawler do
|
|
34
32
|
@crawler.venue { |v| v.name "Scooba" }
|
35
33
|
@crawler.location { |v| v.latitude -50.2323 }
|
36
34
|
|
37
|
-
@
|
38
|
-
arg.
|
39
|
-
arg.
|
40
|
-
arg.
|
41
|
-
arg.
|
35
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
36
|
+
arg["event"].get_property("title").selector.should == "Fulltronic Dezembro"
|
37
|
+
arg["event"].get_property("time").selector.to_s.should == time.to_s
|
38
|
+
arg["venue"].get_property("name").selector.should == "Scooba"
|
39
|
+
arg["location"].get_property("latitude").selector.should == -50.2323
|
42
40
|
end
|
43
41
|
|
44
42
|
@crawler_instance.crawl
|
45
43
|
end
|
46
44
|
|
47
45
|
it 'should isolate metadata between different instances' do
|
48
|
-
another_parser = Wombat::Parser.new
|
49
46
|
another_crawler = Class.new
|
50
47
|
another_crawler.send(:include, Wombat::Crawler)
|
51
48
|
another_crawler_instance = another_crawler.new
|
52
|
-
another_crawler_instance.parser = another_parser
|
53
49
|
|
54
50
|
another_crawler.event { |e| e.title 'Ibiza' }
|
55
|
-
|
51
|
+
another_crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Ibiza" }
|
56
52
|
another_crawler_instance.crawl
|
57
53
|
|
58
54
|
@crawler.event { |e| e.title 'Fulltronic Dezembro' }
|
59
|
-
@
|
55
|
+
@crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Fulltronic Dezembro" }
|
60
56
|
@crawler_instance.crawl
|
61
57
|
end
|
62
58
|
|
63
59
|
it 'should be able to assign arbitrary plain text metadata' do
|
64
|
-
@crawler.some_data
|
65
|
-
|
60
|
+
@crawler.some_data("/event/list", :html, "geo") {|p| true }
|
61
|
+
|
62
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
63
|
+
prop = arg['some_data']
|
64
|
+
prop.name.should == "some_data"
|
65
|
+
prop.selector.should == "/event/list"
|
66
|
+
prop.format.should == :html
|
67
|
+
prop.namespaces.should == "geo"
|
68
|
+
prop.callback.should_not be_nil
|
69
|
+
end
|
70
|
+
|
71
|
+
@crawler_instance.crawl
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'should be able to specify arbitrary block structure more than once' do
|
75
|
+
@crawler.structure do |s|
|
76
|
+
s.data "xpath=/xyz"
|
77
|
+
end
|
78
|
+
|
79
|
+
@crawler.structure do |s|
|
80
|
+
s.another "css=.information"
|
81
|
+
end
|
82
|
+
|
83
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
84
|
+
arg["structure"]["data"].selector.should == "xpath=/xyz"
|
85
|
+
arg["structure"]["another"].selector.should == "css=.information"
|
86
|
+
end
|
87
|
+
|
66
88
|
@crawler_instance.crawl
|
67
89
|
end
|
68
90
|
|
@@ -4,14 +4,33 @@ require 'wombat'
|
|
4
4
|
class SampleCrawler
|
5
5
|
include Wombat::Crawler
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
base_url "http://www.obaoba.com.br"
|
8
|
+
list_page "/porto-alegre/agenda"
|
9
|
+
|
10
|
+
for_each "css=div.title-agenda" do
|
11
|
+
event do |e|
|
12
|
+
e.title("xpath=.") { |t| t.split(" | ")[1].strip }
|
13
|
+
e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
|
14
|
+
DateTime.strptime(d, '%d/%m')
|
15
|
+
end
|
16
|
+
e.type("xpath=.") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
|
17
|
+
end
|
18
|
+
|
19
|
+
venue do |v|
|
20
|
+
v.name("xpath=.") { |n| name.split(" | ")[2].strip }
|
21
|
+
end
|
22
|
+
|
23
|
+
follow_links "xpath=.//a[1]/@href" do
|
24
|
+
event { |e| e.description "css=#main-node-content", :html }
|
25
|
+
venue do |v|
|
26
|
+
v.phone "css=span.tel .value"
|
27
|
+
v.image "xpath=//div[@id='article-image']/div/img/@src"
|
28
|
+
end
|
12
29
|
|
13
|
-
|
14
|
-
|
15
|
-
|
30
|
+
location do |l|
|
31
|
+
l.city "css=span.locality"
|
32
|
+
l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
+
end
|
34
|
+
end
|
16
35
|
end
|
17
36
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'basic crawler setup' do
|
4
|
+
it 'should crawl page' do
|
5
|
+
VCR.use_cassette('basic_crawler_page') do
|
6
|
+
crawler = Class.new
|
7
|
+
crawler.send(:include, Wombat::Crawler)
|
8
|
+
|
9
|
+
crawler.base_url "http://www.terra.com.br"
|
10
|
+
crawler.list_page '/portal'
|
11
|
+
|
12
|
+
crawler.search "css=.btn-search"
|
13
|
+
crawler.social do |s|
|
14
|
+
s.twitter "css=.ctn-bar li.last"
|
15
|
+
end
|
16
|
+
|
17
|
+
crawler_instance = crawler.new
|
18
|
+
|
19
|
+
results = crawler_instance.crawl
|
20
|
+
|
21
|
+
results["search"].should == "Buscar"
|
22
|
+
results["social"]["twitter"].should == "Terra Magazine"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/spec/metadata_spec.rb
CHANGED
@@ -1,21 +1,14 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Wombat::Metadata do
|
4
|
-
|
5
|
-
metadata = Wombat::Metadata.new
|
6
|
-
|
7
|
-
metadata[:event_props].class.should == Wombat::Properties
|
8
|
-
metadata[:venue_props].class.should == Wombat::Properties
|
9
|
-
metadata[:location_props].class.should == Wombat::Properties
|
10
|
-
|
11
|
-
metadata.event_props.should == metadata[:event_props]
|
12
|
-
metadata.venue_props.should == metadata[:venue_props]
|
13
|
-
metadata.location_props.should == metadata[:location_props]
|
4
|
+
before(:each) do
|
5
|
+
@metadata = Wombat::Metadata.new
|
14
6
|
end
|
15
7
|
|
16
|
-
it 'should
|
17
|
-
|
18
|
-
|
19
|
-
|
8
|
+
it 'should not include non-properties in all properties list' do
|
9
|
+
@metadata.another_property "/some/selector", :text
|
10
|
+
@metadata.base_url "felipecsl.com"
|
11
|
+
@metadata.list_page "/yeah"
|
12
|
+
@metadata.all_properties.should == [@metadata['another_property']]
|
20
13
|
end
|
21
14
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -2,13 +2,15 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Wombat::Parser do
|
4
4
|
before(:each) do
|
5
|
-
|
5
|
+
crawler = Class.new
|
6
|
+
crawler.send(:include, Wombat::Parser)
|
7
|
+
@parser = crawler.new
|
6
8
|
@metadata = Wombat::Metadata.new
|
7
9
|
end
|
8
10
|
|
9
11
|
it 'should request page document with correct url' do
|
10
|
-
@metadata
|
11
|
-
@metadata
|
12
|
+
@metadata.base_url "http://www.google.com"
|
13
|
+
@metadata.list_page "/search"
|
12
14
|
fake_document = double :document
|
13
15
|
fake_parser = double :parser
|
14
16
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
@@ -26,66 +28,20 @@ describe Wombat::Parser do
|
|
26
28
|
@parser.parse @metadata
|
27
29
|
end
|
28
30
|
|
29
|
-
it 'should invoke
|
31
|
+
it 'should invoke metadata callbacks' do
|
30
32
|
fake_document = double :document
|
31
33
|
fake_parser = double :parser
|
32
34
|
property = double :property
|
33
|
-
properties = double :properties
|
34
35
|
block_called = false
|
35
36
|
block = lambda { |p| block_called = true }
|
36
37
|
|
37
38
|
property.stub(:result)
|
38
39
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
39
40
|
property.should_receive(:callback).twice.and_return(block)
|
40
|
-
properties.should_receive(:all_properties).and_return [property]
|
41
41
|
|
42
42
|
@parser.mechanize.stub(:get).and_return fake_document
|
43
43
|
@parser.should_receive(:locate).with(@metadata)
|
44
|
-
@metadata.should_receive(:
|
45
|
-
|
46
|
-
@parser.parse @metadata
|
47
|
-
|
48
|
-
block_called.should be_true
|
49
|
-
end
|
50
|
-
|
51
|
-
it 'should invoke venue callbacks' do
|
52
|
-
fake_document = double :document
|
53
|
-
fake_parser = double :parser
|
54
|
-
property = double :property
|
55
|
-
properties = double :properties
|
56
|
-
block_called = false
|
57
|
-
block = lambda { |p| block_called = true }
|
58
|
-
|
59
|
-
property.stub(:result)
|
60
|
-
fake_document.should_receive(:parser).and_return(fake_parser)
|
61
|
-
property.should_receive(:callback).twice.and_return(block)
|
62
|
-
properties.should_receive(:all_properties).and_return [property]
|
63
|
-
|
64
|
-
@parser.mechanize.stub(:get).and_return fake_document
|
65
|
-
@parser.should_receive(:locate).with(@metadata)
|
66
|
-
@metadata.should_receive(:venue_props).and_return properties
|
67
|
-
|
68
|
-
@parser.parse @metadata
|
69
|
-
|
70
|
-
block_called.should be_true
|
71
|
-
end
|
72
|
-
|
73
|
-
it 'should invoke location callbacks' do
|
74
|
-
fake_document = double :document
|
75
|
-
fake_parser = double :parser
|
76
|
-
property = double :property
|
77
|
-
properties = double :properties
|
78
|
-
block_called = false
|
79
|
-
block = lambda { |p| block_called = true }
|
80
|
-
|
81
|
-
property.stub(:result)
|
82
|
-
fake_document.should_receive(:parser).and_return(fake_parser)
|
83
|
-
property.should_receive(:callback).twice.and_return(block)
|
84
|
-
properties.should_receive(:all_properties).and_return [property]
|
85
|
-
|
86
|
-
@parser.mechanize.stub(:get).and_return fake_document
|
87
|
-
@parser.should_receive(:locate).with(@metadata)
|
88
|
-
@metadata.should_receive(:venue_props).and_return properties
|
44
|
+
@metadata.should_receive(:all_properties).and_return [property]
|
89
45
|
|
90
46
|
@parser.parse @metadata
|
91
47
|
|
@@ -96,7 +52,6 @@ describe Wombat::Parser do
|
|
96
52
|
fake_document = double :document
|
97
53
|
fake_parser = double :parser
|
98
54
|
property = double :property
|
99
|
-
properties = double :properties
|
100
55
|
block_called = false
|
101
56
|
block = lambda { |p|
|
102
57
|
block_called = true
|
@@ -106,14 +61,25 @@ describe Wombat::Parser do
|
|
106
61
|
property.should_receive(:result).and_return("blah")
|
107
62
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
108
63
|
property.should_receive(:callback).twice.and_return(block)
|
109
|
-
properties.should_receive(:all_properties).and_return [property]
|
110
64
|
|
111
65
|
@parser.mechanize.stub(:get).and_return fake_document
|
112
66
|
@parser.should_receive(:locate).with(@metadata)
|
113
|
-
@metadata.should_receive(:
|
67
|
+
@metadata.should_receive(:all_properties).and_return [property]
|
114
68
|
|
115
69
|
@parser.parse @metadata
|
116
70
|
|
117
71
|
block_called.should be_true
|
118
72
|
end
|
73
|
+
|
74
|
+
it 'should return array with requested properties' do
|
75
|
+
hash = double :results
|
76
|
+
fake_parser = double :parser
|
77
|
+
fake_document = double :document
|
78
|
+
|
79
|
+
fake_document.should_receive(:parser).and_return fake_parser
|
80
|
+
@parser.mechanize.stub(:get).and_return fake_document
|
81
|
+
@metadata.should_receive(:flatten).and_return hash
|
82
|
+
|
83
|
+
@parser.parse(@metadata).should == hash
|
84
|
+
end
|
119
85
|
end
|