wombat 0.1.2 → 0.1.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -1,49 +1,52 @@
1
1
  #coding: utf-8
2
- require 'wombat/properties'
3
2
  require 'wombat/metadata'
3
+ require 'wombat/property'
4
4
  require 'wombat/parser'
5
5
  require 'active_support'
6
6
  require 'date'
7
7
 
8
8
  module Wombat
9
9
  module Crawler
10
+ include Parser
10
11
  extend ActiveSupport::Concern
11
12
 
12
13
  module InstanceMethods
13
14
  def crawl
14
- parser.parse self.class.send(:metadata)
15
+ parse self.class.send(:metadata)
15
16
  end
16
17
 
17
18
  def supports_city?
18
19
  end
19
-
20
- def parser
21
- @parser ||= Parser.new
22
- end
23
-
24
- def parser= parser
25
- @parser = parser
26
- end
27
20
  end
28
21
 
29
22
  module ClassMethods
30
- [:event, :venue, :location].each do |m|
31
- define_method(m) do |&block|
32
- block.call(metadata["#{m.to_s}_props".to_sym]) if block
33
- end
34
- end
35
-
36
23
  def method_missing method, *args, &block
37
- metadata[method] = args.first
24
+ if args.empty? && block
25
+ metadata["#{method.to_s}"] = PropertyContainer.new unless metadata["#{method.to_s}"]
26
+ block.call(metadata["#{method.to_s}"])
27
+ else
28
+ metadata.send method, *args, &block
29
+ end
38
30
  end
39
31
 
40
32
  def with_details_page
41
33
  yield metadata if block_given?
42
34
  end
43
35
 
36
+ def for_each selector
37
+
38
+ end
39
+
40
+ def follow_links selector
41
+
42
+ end
43
+
44
44
  def supported_cities
45
45
  end
46
46
 
47
+ def to_ary
48
+ end
49
+
47
50
  private
48
51
  def metadata
49
52
  @metadata ||= Metadata.new
@@ -1,24 +1,14 @@
1
1
  #coding: utf-8
2
- module Wombat
3
- class Metadata < Hash
4
- def initialize
5
- self[:event_props] = Properties.new
6
- self[:venue_props] = Properties.new
7
- self[:location_props] = Properties.new
8
- end
2
+ require 'wombat/property_container'
9
3
 
10
- [:event, :venue, :location].each do |m|
11
- define_method(m) do
12
- self["#{m.to_s}_props".to_sym]
13
- end
4
+ module Wombat
5
+ class Metadata < PropertyContainer
6
+ def base_url url
7
+ self[:base_url] = url
14
8
  end
15
9
 
16
- def method_missing method, *args, &block
17
- if method.to_s.end_with? '='
18
- self[method] = args.first
19
- else
20
- self[method]
21
- end
10
+ def list_page url
11
+ self[:list_page] = url
22
12
  end
23
13
  end
24
14
  end
data/lib/wombat/parser.rb CHANGED
@@ -1,9 +1,9 @@
1
- #coding: utf-8
1
+ #coding: utf-8
2
2
  require 'wombat/property_locator'
3
3
  require 'mechanize'
4
4
 
5
5
  module Wombat
6
- class Parser
6
+ module Parser
7
7
  include PropertyLocator
8
8
  attr_accessor :mechanize, :context
9
9
 
@@ -12,13 +12,15 @@ module Wombat
12
12
  end
13
13
 
14
14
  def parse metadata
15
- @context = @mechanize.get("#{metadata.base_url}#{metadata.event_list_page}").parser
15
+ @context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
16
16
 
17
17
  locate metadata
18
18
 
19
- [metadata.event_props, metadata.venue_props, metadata.location_props].flat_map { |p| p.all_properties }.each do |p|
19
+ metadata.all_properties.each do |p|
20
20
  p.callback.call(p.result) if p.callback
21
21
  end
22
+
23
+ metadata.flatten
22
24
  end
23
25
  end
24
26
  end
@@ -0,0 +1,44 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ class PropertyContainer < Hash
5
+ def add_property property
6
+ self[property.name] = property
7
+ end
8
+
9
+ def get_property name
10
+ self[name]
11
+ end
12
+
13
+ def method_missing method, *args, &block
14
+ self[method.to_s] = Property.new(
15
+ name: method.to_s,
16
+ selector: args.first,
17
+ format: args[1],
18
+ namespaces: args[2],
19
+ callback: block)
20
+ end
21
+
22
+ def all_properties
23
+ values.flat_map { |v|
24
+ v.kind_of?(PropertyContainer) \
25
+ ? v.values \
26
+ : v.kind_of?(Property) \
27
+ ? v \
28
+ : nil
29
+ }.compact
30
+ end
31
+
32
+ def flatten
33
+ Hash.new.tap do |h|
34
+ keys.map do |k|
35
+ if self[k].kind_of?(PropertyContainer)
36
+ h[k] = self[k].flatten
37
+ elsif self[k].kind_of?(Property)
38
+ h[k] = self[k].result
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -3,7 +3,7 @@
3
3
  module Wombat
4
4
  module PropertyLocator
5
5
  def locate metadata
6
- [metadata.event_props, metadata.venue_props, metadata.location_props].flat_map { |p| p.all_properties }.each do |p|
6
+ metadata.all_properties.each do |p|
7
7
  p.result = locate_property(p).first
8
8
  end
9
9
  end
@@ -11,8 +11,8 @@ module Wombat
11
11
  private
12
12
  def locate_property property
13
13
  result = locate_selector(property.selector, property.namespaces)
14
- result.map! {|r| r.inner_html } if property.format == :html
15
- result.map {|r| r.strip }
14
+ result.map! {|r| r.inner_html.strip } if property.format == :html
15
+ result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
16
16
  end
17
17
 
18
18
  def locate_selector selector, namespaces = nil
data/spec/crawler_spec.rb CHANGED
@@ -3,10 +3,8 @@ require 'spec_helper'
3
3
  describe Wombat::Crawler do
4
4
  before(:each) do
5
5
  @crawler = Class.new
6
- @parser = Wombat::Parser.new
7
6
  @crawler.send(:include, Wombat::Crawler)
8
7
  @crawler_instance = @crawler.new
9
- @crawler_instance.parser = @parser
10
8
  end
11
9
 
12
10
  it 'should call the provided block' do
@@ -34,35 +32,59 @@ describe Wombat::Crawler do
34
32
  @crawler.venue { |v| v.name "Scooba" }
35
33
  @crawler.location { |v| v.latitude -50.2323 }
36
34
 
37
- @parser.should_receive(:parse) do |arg|
38
- arg.event_props.get_property("title").selector.should == "Fulltronic Dezembro"
39
- arg.event_props.get_property("time").selector.to_s.should == time.to_s
40
- arg.venue_props.get_property("name").selector.should == "Scooba"
41
- arg.location_props.get_property("latitude").selector.should == -50.2323
35
+ @crawler_instance.should_receive(:parse) do |arg|
36
+ arg["event"].get_property("title").selector.should == "Fulltronic Dezembro"
37
+ arg["event"].get_property("time").selector.to_s.should == time.to_s
38
+ arg["venue"].get_property("name").selector.should == "Scooba"
39
+ arg["location"].get_property("latitude").selector.should == -50.2323
42
40
  end
43
41
 
44
42
  @crawler_instance.crawl
45
43
  end
46
44
 
47
45
  it 'should isolate metadata between different instances' do
48
- another_parser = Wombat::Parser.new
49
46
  another_crawler = Class.new
50
47
  another_crawler.send(:include, Wombat::Crawler)
51
48
  another_crawler_instance = another_crawler.new
52
- another_crawler_instance.parser = another_parser
53
49
 
54
50
  another_crawler.event { |e| e.title 'Ibiza' }
55
- another_parser.should_receive(:parse) { |arg| arg.event_props.get_property("title").selector.should == "Ibiza" }
51
+ another_crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Ibiza" }
56
52
  another_crawler_instance.crawl
57
53
 
58
54
  @crawler.event { |e| e.title 'Fulltronic Dezembro' }
59
- @parser.should_receive(:parse) { |arg| arg.event_props.get_property("title").selector.should == "Fulltronic Dezembro" }
55
+ @crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Fulltronic Dezembro" }
60
56
  @crawler_instance.crawl
61
57
  end
62
58
 
63
59
  it 'should be able to assign arbitrary plain text metadata' do
64
- @crawler.some_data "/event/list"
65
- @parser.should_receive(:parse) { |arg| arg.some_data.should == "/event/list" }
60
+ @crawler.some_data("/event/list", :html, "geo") {|p| true }
61
+
62
+ @crawler_instance.should_receive(:parse) do |arg|
63
+ prop = arg['some_data']
64
+ prop.name.should == "some_data"
65
+ prop.selector.should == "/event/list"
66
+ prop.format.should == :html
67
+ prop.namespaces.should == "geo"
68
+ prop.callback.should_not be_nil
69
+ end
70
+
71
+ @crawler_instance.crawl
72
+ end
73
+
74
+ it 'should be able to specify arbitrary block structure more than once' do
75
+ @crawler.structure do |s|
76
+ s.data "xpath=/xyz"
77
+ end
78
+
79
+ @crawler.structure do |s|
80
+ s.another "css=.information"
81
+ end
82
+
83
+ @crawler_instance.should_receive(:parse) do |arg|
84
+ arg["structure"]["data"].selector.should == "xpath=/xyz"
85
+ arg["structure"]["another"].selector.should == "css=.information"
86
+ end
87
+
66
88
  @crawler_instance.crawl
67
89
  end
68
90
 
@@ -4,14 +4,33 @@ require 'wombat'
4
4
  class SampleCrawler
5
5
  include Wombat::Crawler
6
6
 
7
- event do |e|
8
- e.title "Sample Event"
9
- e.description "This event's description"
10
- e.date DateTime.now.to_date
11
- end
7
+ base_url "http://www.obaoba.com.br"
8
+ list_page "/porto-alegre/agenda"
9
+
10
+ for_each "css=div.title-agenda" do
11
+ event do |e|
12
+ e.title("xpath=.") { |t| t.split(" | ")[1].strip }
13
+ e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
14
+ DateTime.strptime(d, '%d/%m')
15
+ end
16
+ e.type("xpath=.") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
17
+ end
18
+
19
+ venue do |v|
20
+ v.name("xpath=.") { |n| name.split(" | ")[2].strip }
21
+ end
22
+
23
+ follow_links "xpath=.//a[1]/@href" do
24
+ event { |e| e.description "css=#main-node-content", :html }
25
+ venue do |v|
26
+ v.phone "css=span.tel .value"
27
+ v.image "xpath=//div[@id='article-image']/div/img/@src"
28
+ end
12
29
 
13
- venue do |v|
14
- v.name "Cafe de La Musique"
15
- v.address "324 Dom Pedro II Street"
30
+ location do |l|
31
+ l.city "css=span.locality"
32
+ l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
33
+ end
34
+ end
16
35
  end
17
36
  end
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'basic crawler setup' do
4
+ it 'should crawl page' do
5
+ VCR.use_cassette('basic_crawler_page') do
6
+ crawler = Class.new
7
+ crawler.send(:include, Wombat::Crawler)
8
+
9
+ crawler.base_url "http://www.terra.com.br"
10
+ crawler.list_page '/portal'
11
+
12
+ crawler.search "css=.btn-search"
13
+ crawler.social do |s|
14
+ s.twitter "css=.ctn-bar li.last"
15
+ end
16
+
17
+ crawler_instance = crawler.new
18
+
19
+ results = crawler_instance.crawl
20
+
21
+ results["search"].should == "Buscar"
22
+ results["social"]["twitter"].should == "Terra Magazine"
23
+ end
24
+ end
25
+ end
@@ -1,21 +1,14 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Wombat::Metadata do
4
- it 'should have basic structure' do
5
- metadata = Wombat::Metadata.new
6
-
7
- metadata[:event_props].class.should == Wombat::Properties
8
- metadata[:venue_props].class.should == Wombat::Properties
9
- metadata[:location_props].class.should == Wombat::Properties
10
-
11
- metadata.event_props.should == metadata[:event_props]
12
- metadata.venue_props.should == metadata[:venue_props]
13
- metadata.location_props.should == metadata[:location_props]
4
+ before(:each) do
5
+ @metadata = Wombat::Metadata.new
14
6
  end
15
7
 
16
- it 'should be able to get hash key like a method' do
17
- m = Wombat::Metadata.new
18
- m[:some_data] = "yeah"
19
- m.some_data.should == "yeah"
8
+ it 'should not include non-properties in all properties list' do
9
+ @metadata.another_property "/some/selector", :text
10
+ @metadata.base_url "felipecsl.com"
11
+ @metadata.list_page "/yeah"
12
+ @metadata.all_properties.should == [@metadata['another_property']]
20
13
  end
21
14
  end
data/spec/parser_spec.rb CHANGED
@@ -2,13 +2,15 @@ require 'spec_helper'
2
2
 
3
3
  describe Wombat::Parser do
4
4
  before(:each) do
5
- @parser = Wombat::Parser.new
5
+ crawler = Class.new
6
+ crawler.send(:include, Wombat::Parser)
7
+ @parser = crawler.new
6
8
  @metadata = Wombat::Metadata.new
7
9
  end
8
10
 
9
11
  it 'should request page document with correct url' do
10
- @metadata[:base_url] = "http://www.google.com"
11
- @metadata[:event_list_page] = "/search"
12
+ @metadata.base_url "http://www.google.com"
13
+ @metadata.list_page "/search"
12
14
  fake_document = double :document
13
15
  fake_parser = double :parser
14
16
  fake_document.should_receive(:parser).and_return(fake_parser)
@@ -26,66 +28,20 @@ describe Wombat::Parser do
26
28
  @parser.parse @metadata
27
29
  end
28
30
 
29
- it 'should invoke event callbacks' do
31
+ it 'should invoke metadata callbacks' do
30
32
  fake_document = double :document
31
33
  fake_parser = double :parser
32
34
  property = double :property
33
- properties = double :properties
34
35
  block_called = false
35
36
  block = lambda { |p| block_called = true }
36
37
 
37
38
  property.stub(:result)
38
39
  fake_document.should_receive(:parser).and_return(fake_parser)
39
40
  property.should_receive(:callback).twice.and_return(block)
40
- properties.should_receive(:all_properties).and_return [property]
41
41
 
42
42
  @parser.mechanize.stub(:get).and_return fake_document
43
43
  @parser.should_receive(:locate).with(@metadata)
44
- @metadata.should_receive(:event_props).and_return properties
45
-
46
- @parser.parse @metadata
47
-
48
- block_called.should be_true
49
- end
50
-
51
- it 'should invoke venue callbacks' do
52
- fake_document = double :document
53
- fake_parser = double :parser
54
- property = double :property
55
- properties = double :properties
56
- block_called = false
57
- block = lambda { |p| block_called = true }
58
-
59
- property.stub(:result)
60
- fake_document.should_receive(:parser).and_return(fake_parser)
61
- property.should_receive(:callback).twice.and_return(block)
62
- properties.should_receive(:all_properties).and_return [property]
63
-
64
- @parser.mechanize.stub(:get).and_return fake_document
65
- @parser.should_receive(:locate).with(@metadata)
66
- @metadata.should_receive(:venue_props).and_return properties
67
-
68
- @parser.parse @metadata
69
-
70
- block_called.should be_true
71
- end
72
-
73
- it 'should invoke location callbacks' do
74
- fake_document = double :document
75
- fake_parser = double :parser
76
- property = double :property
77
- properties = double :properties
78
- block_called = false
79
- block = lambda { |p| block_called = true }
80
-
81
- property.stub(:result)
82
- fake_document.should_receive(:parser).and_return(fake_parser)
83
- property.should_receive(:callback).twice.and_return(block)
84
- properties.should_receive(:all_properties).and_return [property]
85
-
86
- @parser.mechanize.stub(:get).and_return fake_document
87
- @parser.should_receive(:locate).with(@metadata)
88
- @metadata.should_receive(:venue_props).and_return properties
44
+ @metadata.should_receive(:all_properties).and_return [property]
89
45
 
90
46
  @parser.parse @metadata
91
47
 
@@ -96,7 +52,6 @@ describe Wombat::Parser do
96
52
  fake_document = double :document
97
53
  fake_parser = double :parser
98
54
  property = double :property
99
- properties = double :properties
100
55
  block_called = false
101
56
  block = lambda { |p|
102
57
  block_called = true
@@ -106,14 +61,25 @@ describe Wombat::Parser do
106
61
  property.should_receive(:result).and_return("blah")
107
62
  fake_document.should_receive(:parser).and_return(fake_parser)
108
63
  property.should_receive(:callback).twice.and_return(block)
109
- properties.should_receive(:all_properties).and_return [property]
110
64
 
111
65
  @parser.mechanize.stub(:get).and_return fake_document
112
66
  @parser.should_receive(:locate).with(@metadata)
113
- @metadata.should_receive(:event_props).and_return properties
67
+ @metadata.should_receive(:all_properties).and_return [property]
114
68
 
115
69
  @parser.parse @metadata
116
70
 
117
71
  block_called.should be_true
118
72
  end
73
+
74
+ it 'should return array with requested properties' do
75
+ hash = double :results
76
+ fake_parser = double :parser
77
+ fake_document = double :document
78
+
79
+ fake_document.should_receive(:parser).and_return fake_parser
80
+ @parser.mechanize.stub(:get).and_return fake_document
81
+ @metadata.should_receive(:flatten).and_return hash
82
+
83
+ @parser.parse(@metadata).should == hash
84
+ end
119
85
  end