wombat 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +1 -4
- data/Gemfile +4 -0
- data/Gemfile.lock +11 -0
- data/README.md +22 -0
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/basic_crawler_page.yml +600 -0
- data/lib/wombat/crawler.rb +20 -17
- data/lib/wombat/metadata.rb +7 -17
- data/lib/wombat/parser.rb +6 -4
- data/lib/wombat/property_container.rb +44 -0
- data/lib/wombat/property_locator.rb +3 -3
- data/spec/crawler_spec.rb +35 -13
- data/spec/helpers/sample_crawler.rb +27 -8
- data/spec/integration/integration_spec.rb +25 -0
- data/spec/metadata_spec.rb +7 -14
- data/spec/parser_spec.rb +20 -54
- data/spec/property_container_spec.rb +49 -0
- data/spec/property_locator_spec.rb +21 -11
- data/spec/sample_crawler_spec.rb +10 -8
- data/spec/spec_helper.rb +7 -1
- data/wombat.gemspec +20 -6
- metadata +68 -22
- data/README.rdoc +0 -20
- data/lib/wombat/properties.rb +0 -31
- data/spec/properties_spec.rb +0 -31
data/lib/wombat/crawler.rb
CHANGED
@@ -1,49 +1,52 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require 'wombat/properties'
|
3
2
|
require 'wombat/metadata'
|
3
|
+
require 'wombat/property'
|
4
4
|
require 'wombat/parser'
|
5
5
|
require 'active_support'
|
6
6
|
require 'date'
|
7
7
|
|
8
8
|
module Wombat
|
9
9
|
module Crawler
|
10
|
+
include Parser
|
10
11
|
extend ActiveSupport::Concern
|
11
12
|
|
12
13
|
module InstanceMethods
|
13
14
|
def crawl
|
14
|
-
|
15
|
+
parse self.class.send(:metadata)
|
15
16
|
end
|
16
17
|
|
17
18
|
def supports_city?
|
18
19
|
end
|
19
|
-
|
20
|
-
def parser
|
21
|
-
@parser ||= Parser.new
|
22
|
-
end
|
23
|
-
|
24
|
-
def parser= parser
|
25
|
-
@parser = parser
|
26
|
-
end
|
27
20
|
end
|
28
21
|
|
29
22
|
module ClassMethods
|
30
|
-
[:event, :venue, :location].each do |m|
|
31
|
-
define_method(m) do |&block|
|
32
|
-
block.call(metadata["#{m.to_s}_props".to_sym]) if block
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
23
|
def method_missing method, *args, &block
|
37
|
-
|
24
|
+
if args.empty? && block
|
25
|
+
metadata["#{method.to_s}"] = PropertyContainer.new unless metadata["#{method.to_s}"]
|
26
|
+
block.call(metadata["#{method.to_s}"])
|
27
|
+
else
|
28
|
+
metadata.send method, *args, &block
|
29
|
+
end
|
38
30
|
end
|
39
31
|
|
40
32
|
def with_details_page
|
41
33
|
yield metadata if block_given?
|
42
34
|
end
|
43
35
|
|
36
|
+
def for_each selector
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
def follow_links selector
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
44
|
def supported_cities
|
45
45
|
end
|
46
46
|
|
47
|
+
def to_ary
|
48
|
+
end
|
49
|
+
|
47
50
|
private
|
48
51
|
def metadata
|
49
52
|
@metadata ||= Metadata.new
|
data/lib/wombat/metadata.rb
CHANGED
@@ -1,24 +1,14 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
|
3
|
-
class Metadata < Hash
|
4
|
-
def initialize
|
5
|
-
self[:event_props] = Properties.new
|
6
|
-
self[:venue_props] = Properties.new
|
7
|
-
self[:location_props] = Properties.new
|
8
|
-
end
|
2
|
+
require 'wombat/property_container'
|
9
3
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
4
|
+
module Wombat
|
5
|
+
class Metadata < PropertyContainer
|
6
|
+
def base_url url
|
7
|
+
self[:base_url] = url
|
14
8
|
end
|
15
9
|
|
16
|
-
def
|
17
|
-
|
18
|
-
self[method] = args.first
|
19
|
-
else
|
20
|
-
self[method]
|
21
|
-
end
|
10
|
+
def list_page url
|
11
|
+
self[:list_page] = url
|
22
12
|
end
|
23
13
|
end
|
24
14
|
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
#coding: utf-8
|
1
|
+
#coding: utf-8
|
2
2
|
require 'wombat/property_locator'
|
3
3
|
require 'mechanize'
|
4
4
|
|
5
5
|
module Wombat
|
6
|
-
|
6
|
+
module Parser
|
7
7
|
include PropertyLocator
|
8
8
|
attr_accessor :mechanize, :context
|
9
9
|
|
@@ -12,13 +12,15 @@ module Wombat
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def parse metadata
|
15
|
-
@context = @mechanize.get("#{metadata
|
15
|
+
@context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
|
16
16
|
|
17
17
|
locate metadata
|
18
18
|
|
19
|
-
|
19
|
+
metadata.all_properties.each do |p|
|
20
20
|
p.callback.call(p.result) if p.callback
|
21
21
|
end
|
22
|
+
|
23
|
+
metadata.flatten
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
|
4
|
+
class PropertyContainer < Hash
|
5
|
+
def add_property property
|
6
|
+
self[property.name] = property
|
7
|
+
end
|
8
|
+
|
9
|
+
def get_property name
|
10
|
+
self[name]
|
11
|
+
end
|
12
|
+
|
13
|
+
def method_missing method, *args, &block
|
14
|
+
self[method.to_s] = Property.new(
|
15
|
+
name: method.to_s,
|
16
|
+
selector: args.first,
|
17
|
+
format: args[1],
|
18
|
+
namespaces: args[2],
|
19
|
+
callback: block)
|
20
|
+
end
|
21
|
+
|
22
|
+
def all_properties
|
23
|
+
values.flat_map { |v|
|
24
|
+
v.kind_of?(PropertyContainer) \
|
25
|
+
? v.values \
|
26
|
+
: v.kind_of?(Property) \
|
27
|
+
? v \
|
28
|
+
: nil
|
29
|
+
}.compact
|
30
|
+
end
|
31
|
+
|
32
|
+
def flatten
|
33
|
+
Hash.new.tap do |h|
|
34
|
+
keys.map do |k|
|
35
|
+
if self[k].kind_of?(PropertyContainer)
|
36
|
+
h[k] = self[k].flatten
|
37
|
+
elsif self[k].kind_of?(Property)
|
38
|
+
h[k] = self[k].result
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module Wombat
|
4
4
|
module PropertyLocator
|
5
5
|
def locate metadata
|
6
|
-
|
6
|
+
metadata.all_properties.each do |p|
|
7
7
|
p.result = locate_property(p).first
|
8
8
|
end
|
9
9
|
end
|
@@ -11,8 +11,8 @@ module Wombat
|
|
11
11
|
private
|
12
12
|
def locate_property property
|
13
13
|
result = locate_selector(property.selector, property.namespaces)
|
14
|
-
result.map! {|r| r.inner_html } if property.format == :html
|
15
|
-
result.map {|r| r.
|
14
|
+
result.map! {|r| r.inner_html.strip } if property.format == :html
|
15
|
+
result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
|
16
16
|
end
|
17
17
|
|
18
18
|
def locate_selector selector, namespaces = nil
|
data/spec/crawler_spec.rb
CHANGED
@@ -3,10 +3,8 @@ require 'spec_helper'
|
|
3
3
|
describe Wombat::Crawler do
|
4
4
|
before(:each) do
|
5
5
|
@crawler = Class.new
|
6
|
-
@parser = Wombat::Parser.new
|
7
6
|
@crawler.send(:include, Wombat::Crawler)
|
8
7
|
@crawler_instance = @crawler.new
|
9
|
-
@crawler_instance.parser = @parser
|
10
8
|
end
|
11
9
|
|
12
10
|
it 'should call the provided block' do
|
@@ -34,35 +32,59 @@ describe Wombat::Crawler do
|
|
34
32
|
@crawler.venue { |v| v.name "Scooba" }
|
35
33
|
@crawler.location { |v| v.latitude -50.2323 }
|
36
34
|
|
37
|
-
@
|
38
|
-
arg.
|
39
|
-
arg.
|
40
|
-
arg.
|
41
|
-
arg.
|
35
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
36
|
+
arg["event"].get_property("title").selector.should == "Fulltronic Dezembro"
|
37
|
+
arg["event"].get_property("time").selector.to_s.should == time.to_s
|
38
|
+
arg["venue"].get_property("name").selector.should == "Scooba"
|
39
|
+
arg["location"].get_property("latitude").selector.should == -50.2323
|
42
40
|
end
|
43
41
|
|
44
42
|
@crawler_instance.crawl
|
45
43
|
end
|
46
44
|
|
47
45
|
it 'should isolate metadata between different instances' do
|
48
|
-
another_parser = Wombat::Parser.new
|
49
46
|
another_crawler = Class.new
|
50
47
|
another_crawler.send(:include, Wombat::Crawler)
|
51
48
|
another_crawler_instance = another_crawler.new
|
52
|
-
another_crawler_instance.parser = another_parser
|
53
49
|
|
54
50
|
another_crawler.event { |e| e.title 'Ibiza' }
|
55
|
-
|
51
|
+
another_crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Ibiza" }
|
56
52
|
another_crawler_instance.crawl
|
57
53
|
|
58
54
|
@crawler.event { |e| e.title 'Fulltronic Dezembro' }
|
59
|
-
@
|
55
|
+
@crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Fulltronic Dezembro" }
|
60
56
|
@crawler_instance.crawl
|
61
57
|
end
|
62
58
|
|
63
59
|
it 'should be able to assign arbitrary plain text metadata' do
|
64
|
-
@crawler.some_data
|
65
|
-
|
60
|
+
@crawler.some_data("/event/list", :html, "geo") {|p| true }
|
61
|
+
|
62
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
63
|
+
prop = arg['some_data']
|
64
|
+
prop.name.should == "some_data"
|
65
|
+
prop.selector.should == "/event/list"
|
66
|
+
prop.format.should == :html
|
67
|
+
prop.namespaces.should == "geo"
|
68
|
+
prop.callback.should_not be_nil
|
69
|
+
end
|
70
|
+
|
71
|
+
@crawler_instance.crawl
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'should be able to specify arbitrary block structure more than once' do
|
75
|
+
@crawler.structure do |s|
|
76
|
+
s.data "xpath=/xyz"
|
77
|
+
end
|
78
|
+
|
79
|
+
@crawler.structure do |s|
|
80
|
+
s.another "css=.information"
|
81
|
+
end
|
82
|
+
|
83
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
84
|
+
arg["structure"]["data"].selector.should == "xpath=/xyz"
|
85
|
+
arg["structure"]["another"].selector.should == "css=.information"
|
86
|
+
end
|
87
|
+
|
66
88
|
@crawler_instance.crawl
|
67
89
|
end
|
68
90
|
|
@@ -4,14 +4,33 @@ require 'wombat'
|
|
4
4
|
class SampleCrawler
|
5
5
|
include Wombat::Crawler
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
base_url "http://www.obaoba.com.br"
|
8
|
+
list_page "/porto-alegre/agenda"
|
9
|
+
|
10
|
+
for_each "css=div.title-agenda" do
|
11
|
+
event do |e|
|
12
|
+
e.title("xpath=.") { |t| t.split(" | ")[1].strip }
|
13
|
+
e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
|
14
|
+
DateTime.strptime(d, '%d/%m')
|
15
|
+
end
|
16
|
+
e.type("xpath=.") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
|
17
|
+
end
|
18
|
+
|
19
|
+
venue do |v|
|
20
|
+
v.name("xpath=.") { |n| name.split(" | ")[2].strip }
|
21
|
+
end
|
22
|
+
|
23
|
+
follow_links "xpath=.//a[1]/@href" do
|
24
|
+
event { |e| e.description "css=#main-node-content", :html }
|
25
|
+
venue do |v|
|
26
|
+
v.phone "css=span.tel .value"
|
27
|
+
v.image "xpath=//div[@id='article-image']/div/img/@src"
|
28
|
+
end
|
12
29
|
|
13
|
-
|
14
|
-
|
15
|
-
|
30
|
+
location do |l|
|
31
|
+
l.city "css=span.locality"
|
32
|
+
l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
|
33
|
+
end
|
34
|
+
end
|
16
35
|
end
|
17
36
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'basic crawler setup' do
|
4
|
+
it 'should crawl page' do
|
5
|
+
VCR.use_cassette('basic_crawler_page') do
|
6
|
+
crawler = Class.new
|
7
|
+
crawler.send(:include, Wombat::Crawler)
|
8
|
+
|
9
|
+
crawler.base_url "http://www.terra.com.br"
|
10
|
+
crawler.list_page '/portal'
|
11
|
+
|
12
|
+
crawler.search "css=.btn-search"
|
13
|
+
crawler.social do |s|
|
14
|
+
s.twitter "css=.ctn-bar li.last"
|
15
|
+
end
|
16
|
+
|
17
|
+
crawler_instance = crawler.new
|
18
|
+
|
19
|
+
results = crawler_instance.crawl
|
20
|
+
|
21
|
+
results["search"].should == "Buscar"
|
22
|
+
results["social"]["twitter"].should == "Terra Magazine"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/spec/metadata_spec.rb
CHANGED
@@ -1,21 +1,14 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Wombat::Metadata do
|
4
|
-
|
5
|
-
metadata = Wombat::Metadata.new
|
6
|
-
|
7
|
-
metadata[:event_props].class.should == Wombat::Properties
|
8
|
-
metadata[:venue_props].class.should == Wombat::Properties
|
9
|
-
metadata[:location_props].class.should == Wombat::Properties
|
10
|
-
|
11
|
-
metadata.event_props.should == metadata[:event_props]
|
12
|
-
metadata.venue_props.should == metadata[:venue_props]
|
13
|
-
metadata.location_props.should == metadata[:location_props]
|
4
|
+
before(:each) do
|
5
|
+
@metadata = Wombat::Metadata.new
|
14
6
|
end
|
15
7
|
|
16
|
-
it 'should
|
17
|
-
|
18
|
-
|
19
|
-
|
8
|
+
it 'should not include non-properties in all properties list' do
|
9
|
+
@metadata.another_property "/some/selector", :text
|
10
|
+
@metadata.base_url "felipecsl.com"
|
11
|
+
@metadata.list_page "/yeah"
|
12
|
+
@metadata.all_properties.should == [@metadata['another_property']]
|
20
13
|
end
|
21
14
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -2,13 +2,15 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Wombat::Parser do
|
4
4
|
before(:each) do
|
5
|
-
|
5
|
+
crawler = Class.new
|
6
|
+
crawler.send(:include, Wombat::Parser)
|
7
|
+
@parser = crawler.new
|
6
8
|
@metadata = Wombat::Metadata.new
|
7
9
|
end
|
8
10
|
|
9
11
|
it 'should request page document with correct url' do
|
10
|
-
@metadata
|
11
|
-
@metadata
|
12
|
+
@metadata.base_url "http://www.google.com"
|
13
|
+
@metadata.list_page "/search"
|
12
14
|
fake_document = double :document
|
13
15
|
fake_parser = double :parser
|
14
16
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
@@ -26,66 +28,20 @@ describe Wombat::Parser do
|
|
26
28
|
@parser.parse @metadata
|
27
29
|
end
|
28
30
|
|
29
|
-
it 'should invoke
|
31
|
+
it 'should invoke metadata callbacks' do
|
30
32
|
fake_document = double :document
|
31
33
|
fake_parser = double :parser
|
32
34
|
property = double :property
|
33
|
-
properties = double :properties
|
34
35
|
block_called = false
|
35
36
|
block = lambda { |p| block_called = true }
|
36
37
|
|
37
38
|
property.stub(:result)
|
38
39
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
39
40
|
property.should_receive(:callback).twice.and_return(block)
|
40
|
-
properties.should_receive(:all_properties).and_return [property]
|
41
41
|
|
42
42
|
@parser.mechanize.stub(:get).and_return fake_document
|
43
43
|
@parser.should_receive(:locate).with(@metadata)
|
44
|
-
@metadata.should_receive(:
|
45
|
-
|
46
|
-
@parser.parse @metadata
|
47
|
-
|
48
|
-
block_called.should be_true
|
49
|
-
end
|
50
|
-
|
51
|
-
it 'should invoke venue callbacks' do
|
52
|
-
fake_document = double :document
|
53
|
-
fake_parser = double :parser
|
54
|
-
property = double :property
|
55
|
-
properties = double :properties
|
56
|
-
block_called = false
|
57
|
-
block = lambda { |p| block_called = true }
|
58
|
-
|
59
|
-
property.stub(:result)
|
60
|
-
fake_document.should_receive(:parser).and_return(fake_parser)
|
61
|
-
property.should_receive(:callback).twice.and_return(block)
|
62
|
-
properties.should_receive(:all_properties).and_return [property]
|
63
|
-
|
64
|
-
@parser.mechanize.stub(:get).and_return fake_document
|
65
|
-
@parser.should_receive(:locate).with(@metadata)
|
66
|
-
@metadata.should_receive(:venue_props).and_return properties
|
67
|
-
|
68
|
-
@parser.parse @metadata
|
69
|
-
|
70
|
-
block_called.should be_true
|
71
|
-
end
|
72
|
-
|
73
|
-
it 'should invoke location callbacks' do
|
74
|
-
fake_document = double :document
|
75
|
-
fake_parser = double :parser
|
76
|
-
property = double :property
|
77
|
-
properties = double :properties
|
78
|
-
block_called = false
|
79
|
-
block = lambda { |p| block_called = true }
|
80
|
-
|
81
|
-
property.stub(:result)
|
82
|
-
fake_document.should_receive(:parser).and_return(fake_parser)
|
83
|
-
property.should_receive(:callback).twice.and_return(block)
|
84
|
-
properties.should_receive(:all_properties).and_return [property]
|
85
|
-
|
86
|
-
@parser.mechanize.stub(:get).and_return fake_document
|
87
|
-
@parser.should_receive(:locate).with(@metadata)
|
88
|
-
@metadata.should_receive(:venue_props).and_return properties
|
44
|
+
@metadata.should_receive(:all_properties).and_return [property]
|
89
45
|
|
90
46
|
@parser.parse @metadata
|
91
47
|
|
@@ -96,7 +52,6 @@ describe Wombat::Parser do
|
|
96
52
|
fake_document = double :document
|
97
53
|
fake_parser = double :parser
|
98
54
|
property = double :property
|
99
|
-
properties = double :properties
|
100
55
|
block_called = false
|
101
56
|
block = lambda { |p|
|
102
57
|
block_called = true
|
@@ -106,14 +61,25 @@ describe Wombat::Parser do
|
|
106
61
|
property.should_receive(:result).and_return("blah")
|
107
62
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
108
63
|
property.should_receive(:callback).twice.and_return(block)
|
109
|
-
properties.should_receive(:all_properties).and_return [property]
|
110
64
|
|
111
65
|
@parser.mechanize.stub(:get).and_return fake_document
|
112
66
|
@parser.should_receive(:locate).with(@metadata)
|
113
|
-
@metadata.should_receive(:
|
67
|
+
@metadata.should_receive(:all_properties).and_return [property]
|
114
68
|
|
115
69
|
@parser.parse @metadata
|
116
70
|
|
117
71
|
block_called.should be_true
|
118
72
|
end
|
73
|
+
|
74
|
+
it 'should return array with requested properties' do
|
75
|
+
hash = double :results
|
76
|
+
fake_parser = double :parser
|
77
|
+
fake_document = double :document
|
78
|
+
|
79
|
+
fake_document.should_receive(:parser).and_return fake_parser
|
80
|
+
@parser.mechanize.stub(:get).and_return fake_document
|
81
|
+
@metadata.should_receive(:flatten).and_return hash
|
82
|
+
|
83
|
+
@parser.parse(@metadata).should == hash
|
84
|
+
end
|
119
85
|
end
|