wombat 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,49 +1,52 @@
1
1
  #coding: utf-8
2
- require 'wombat/properties'
3
2
  require 'wombat/metadata'
3
+ require 'wombat/property'
4
4
  require 'wombat/parser'
5
5
  require 'active_support'
6
6
  require 'date'
7
7
 
8
8
  module Wombat
9
9
  module Crawler
10
+ include Parser
10
11
  extend ActiveSupport::Concern
11
12
 
12
13
  module InstanceMethods
13
14
  def crawl
14
- parser.parse self.class.send(:metadata)
15
+ parse self.class.send(:metadata)
15
16
  end
16
17
 
17
18
  def supports_city?
18
19
  end
19
-
20
- def parser
21
- @parser ||= Parser.new
22
- end
23
-
24
- def parser= parser
25
- @parser = parser
26
- end
27
20
  end
28
21
 
29
22
  module ClassMethods
30
- [:event, :venue, :location].each do |m|
31
- define_method(m) do |&block|
32
- block.call(metadata["#{m.to_s}_props".to_sym]) if block
33
- end
34
- end
35
-
36
23
  def method_missing method, *args, &block
37
- metadata[method] = args.first
24
+ if args.empty? && block
25
+ metadata["#{method.to_s}"] = PropertyContainer.new unless metadata["#{method.to_s}"]
26
+ block.call(metadata["#{method.to_s}"])
27
+ else
28
+ metadata.send method, *args, &block
29
+ end
38
30
  end
39
31
 
40
32
  def with_details_page
41
33
  yield metadata if block_given?
42
34
  end
43
35
 
36
+ def for_each selector
37
+
38
+ end
39
+
40
+ def follow_links selector
41
+
42
+ end
43
+
44
44
  def supported_cities
45
45
  end
46
46
 
47
+ def to_ary
48
+ end
49
+
47
50
  private
48
51
  def metadata
49
52
  @metadata ||= Metadata.new
@@ -1,24 +1,14 @@
1
1
  #coding: utf-8
2
- module Wombat
3
- class Metadata < Hash
4
- def initialize
5
- self[:event_props] = Properties.new
6
- self[:venue_props] = Properties.new
7
- self[:location_props] = Properties.new
8
- end
2
+ require 'wombat/property_container'
9
3
 
10
- [:event, :venue, :location].each do |m|
11
- define_method(m) do
12
- self["#{m.to_s}_props".to_sym]
13
- end
4
+ module Wombat
5
+ class Metadata < PropertyContainer
6
+ def base_url url
7
+ self[:base_url] = url
14
8
  end
15
9
 
16
- def method_missing method, *args, &block
17
- if method.to_s.end_with? '='
18
- self[method] = args.first
19
- else
20
- self[method]
21
- end
10
+ def list_page url
11
+ self[:list_page] = url
22
12
  end
23
13
  end
24
14
  end
data/lib/wombat/parser.rb CHANGED
@@ -1,9 +1,9 @@
1
- #coding: utf-8
1
+ #coding: utf-8
2
2
  require 'wombat/property_locator'
3
3
  require 'mechanize'
4
4
 
5
5
  module Wombat
6
- class Parser
6
+ module Parser
7
7
  include PropertyLocator
8
8
  attr_accessor :mechanize, :context
9
9
 
@@ -12,13 +12,15 @@ module Wombat
12
12
  end
13
13
 
14
14
  def parse metadata
15
- @context = @mechanize.get("#{metadata.base_url}#{metadata.event_list_page}").parser
15
+ @context = @mechanize.get("#{metadata[:base_url]}#{metadata[:list_page]}").parser
16
16
 
17
17
  locate metadata
18
18
 
19
- [metadata.event_props, metadata.venue_props, metadata.location_props].flat_map { |p| p.all_properties }.each do |p|
19
+ metadata.all_properties.each do |p|
20
20
  p.callback.call(p.result) if p.callback
21
21
  end
22
+
23
+ metadata.flatten
22
24
  end
23
25
  end
24
26
  end
@@ -0,0 +1,44 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ class PropertyContainer < Hash
5
+ def add_property property
6
+ self[property.name] = property
7
+ end
8
+
9
+ def get_property name
10
+ self[name]
11
+ end
12
+
13
+ def method_missing method, *args, &block
14
+ self[method.to_s] = Property.new(
15
+ name: method.to_s,
16
+ selector: args.first,
17
+ format: args[1],
18
+ namespaces: args[2],
19
+ callback: block)
20
+ end
21
+
22
+ def all_properties
23
+ values.flat_map { |v|
24
+ v.kind_of?(PropertyContainer) \
25
+ ? v.values \
26
+ : v.kind_of?(Property) \
27
+ ? v \
28
+ : nil
29
+ }.compact
30
+ end
31
+
32
+ def flatten
33
+ Hash.new.tap do |h|
34
+ keys.map do |k|
35
+ if self[k].kind_of?(PropertyContainer)
36
+ h[k] = self[k].flatten
37
+ elsif self[k].kind_of?(Property)
38
+ h[k] = self[k].result
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -3,7 +3,7 @@
3
3
  module Wombat
4
4
  module PropertyLocator
5
5
  def locate metadata
6
- [metadata.event_props, metadata.venue_props, metadata.location_props].flat_map { |p| p.all_properties }.each do |p|
6
+ metadata.all_properties.each do |p|
7
7
  p.result = locate_property(p).first
8
8
  end
9
9
  end
@@ -11,8 +11,8 @@ module Wombat
11
11
  private
12
12
  def locate_property property
13
13
  result = locate_selector(property.selector, property.namespaces)
14
- result.map! {|r| r.inner_html } if property.format == :html
15
- result.map {|r| r.strip }
14
+ result.map! {|r| r.inner_html.strip } if property.format == :html
15
+ result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
16
16
  end
17
17
 
18
18
  def locate_selector selector, namespaces = nil
data/spec/crawler_spec.rb CHANGED
@@ -3,10 +3,8 @@ require 'spec_helper'
3
3
  describe Wombat::Crawler do
4
4
  before(:each) do
5
5
  @crawler = Class.new
6
- @parser = Wombat::Parser.new
7
6
  @crawler.send(:include, Wombat::Crawler)
8
7
  @crawler_instance = @crawler.new
9
- @crawler_instance.parser = @parser
10
8
  end
11
9
 
12
10
  it 'should call the provided block' do
@@ -34,35 +32,59 @@ describe Wombat::Crawler do
34
32
  @crawler.venue { |v| v.name "Scooba" }
35
33
  @crawler.location { |v| v.latitude -50.2323 }
36
34
 
37
- @parser.should_receive(:parse) do |arg|
38
- arg.event_props.get_property("title").selector.should == "Fulltronic Dezembro"
39
- arg.event_props.get_property("time").selector.to_s.should == time.to_s
40
- arg.venue_props.get_property("name").selector.should == "Scooba"
41
- arg.location_props.get_property("latitude").selector.should == -50.2323
35
+ @crawler_instance.should_receive(:parse) do |arg|
36
+ arg["event"].get_property("title").selector.should == "Fulltronic Dezembro"
37
+ arg["event"].get_property("time").selector.to_s.should == time.to_s
38
+ arg["venue"].get_property("name").selector.should == "Scooba"
39
+ arg["location"].get_property("latitude").selector.should == -50.2323
42
40
  end
43
41
 
44
42
  @crawler_instance.crawl
45
43
  end
46
44
 
47
45
  it 'should isolate metadata between different instances' do
48
- another_parser = Wombat::Parser.new
49
46
  another_crawler = Class.new
50
47
  another_crawler.send(:include, Wombat::Crawler)
51
48
  another_crawler_instance = another_crawler.new
52
- another_crawler_instance.parser = another_parser
53
49
 
54
50
  another_crawler.event { |e| e.title 'Ibiza' }
55
- another_parser.should_receive(:parse) { |arg| arg.event_props.get_property("title").selector.should == "Ibiza" }
51
+ another_crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Ibiza" }
56
52
  another_crawler_instance.crawl
57
53
 
58
54
  @crawler.event { |e| e.title 'Fulltronic Dezembro' }
59
- @parser.should_receive(:parse) { |arg| arg.event_props.get_property("title").selector.should == "Fulltronic Dezembro" }
55
+ @crawler_instance.should_receive(:parse) { |arg| arg["event"].get_property("title").selector.should == "Fulltronic Dezembro" }
60
56
  @crawler_instance.crawl
61
57
  end
62
58
 
63
59
  it 'should be able to assign arbitrary plain text metadata' do
64
- @crawler.some_data "/event/list"
65
- @parser.should_receive(:parse) { |arg| arg.some_data.should == "/event/list" }
60
+ @crawler.some_data("/event/list", :html, "geo") {|p| true }
61
+
62
+ @crawler_instance.should_receive(:parse) do |arg|
63
+ prop = arg['some_data']
64
+ prop.name.should == "some_data"
65
+ prop.selector.should == "/event/list"
66
+ prop.format.should == :html
67
+ prop.namespaces.should == "geo"
68
+ prop.callback.should_not be_nil
69
+ end
70
+
71
+ @crawler_instance.crawl
72
+ end
73
+
74
+ it 'should be able to specify arbitrary block structure more than once' do
75
+ @crawler.structure do |s|
76
+ s.data "xpath=/xyz"
77
+ end
78
+
79
+ @crawler.structure do |s|
80
+ s.another "css=.information"
81
+ end
82
+
83
+ @crawler_instance.should_receive(:parse) do |arg|
84
+ arg["structure"]["data"].selector.should == "xpath=/xyz"
85
+ arg["structure"]["another"].selector.should == "css=.information"
86
+ end
87
+
66
88
  @crawler_instance.crawl
67
89
  end
68
90
 
@@ -4,14 +4,33 @@ require 'wombat'
4
4
  class SampleCrawler
5
5
  include Wombat::Crawler
6
6
 
7
- event do |e|
8
- e.title "Sample Event"
9
- e.description "This event's description"
10
- e.date DateTime.now.to_date
11
- end
7
+ base_url "http://www.obaoba.com.br"
8
+ list_page "/porto-alegre/agenda"
9
+
10
+ for_each "css=div.title-agenda" do
11
+ event do |e|
12
+ e.title("xpath=.") { |t| t.split(" | ")[1].strip }
13
+ e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
14
+ DateTime.strptime(d, '%d/%m')
15
+ end
16
+ e.type("xpath=.") { |t| t.split(" | ").first.strip.casecmp('SHOW') == 0 ? :show : :party }
17
+ end
18
+
19
+ venue do |v|
20
+ v.name("xpath=.") { |n| name.split(" | ")[2].strip }
21
+ end
22
+
23
+ follow_links "xpath=.//a[1]/@href" do
24
+ event { |e| e.description "css=#main-node-content", :html }
25
+ venue do |v|
26
+ v.phone "css=span.tel .value"
27
+ v.image "xpath=//div[@id='article-image']/div/img/@src"
28
+ end
12
29
 
13
- venue do |v|
14
- v.name "Cafe de La Musique"
15
- v.address "324 Dom Pedro II Street"
30
+ location do |l|
31
+ l.city "css=span.locality"
32
+ l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
33
+ end
34
+ end
16
35
  end
17
36
  end
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'basic crawler setup' do
4
+ it 'should crawl page' do
5
+ VCR.use_cassette('basic_crawler_page') do
6
+ crawler = Class.new
7
+ crawler.send(:include, Wombat::Crawler)
8
+
9
+ crawler.base_url "http://www.terra.com.br"
10
+ crawler.list_page '/portal'
11
+
12
+ crawler.search "css=.btn-search"
13
+ crawler.social do |s|
14
+ s.twitter "css=.ctn-bar li.last"
15
+ end
16
+
17
+ crawler_instance = crawler.new
18
+
19
+ results = crawler_instance.crawl
20
+
21
+ results["search"].should == "Buscar"
22
+ results["social"]["twitter"].should == "Terra Magazine"
23
+ end
24
+ end
25
+ end
@@ -1,21 +1,14 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Wombat::Metadata do
4
- it 'should have basic structure' do
5
- metadata = Wombat::Metadata.new
6
-
7
- metadata[:event_props].class.should == Wombat::Properties
8
- metadata[:venue_props].class.should == Wombat::Properties
9
- metadata[:location_props].class.should == Wombat::Properties
10
-
11
- metadata.event_props.should == metadata[:event_props]
12
- metadata.venue_props.should == metadata[:venue_props]
13
- metadata.location_props.should == metadata[:location_props]
4
+ before(:each) do
5
+ @metadata = Wombat::Metadata.new
14
6
  end
15
7
 
16
- it 'should be able to get hash key like a method' do
17
- m = Wombat::Metadata.new
18
- m[:some_data] = "yeah"
19
- m.some_data.should == "yeah"
8
+ it 'should not include non-properties in all properties list' do
9
+ @metadata.another_property "/some/selector", :text
10
+ @metadata.base_url "felipecsl.com"
11
+ @metadata.list_page "/yeah"
12
+ @metadata.all_properties.should == [@metadata['another_property']]
20
13
  end
21
14
  end
data/spec/parser_spec.rb CHANGED
@@ -2,13 +2,15 @@ require 'spec_helper'
2
2
 
3
3
  describe Wombat::Parser do
4
4
  before(:each) do
5
- @parser = Wombat::Parser.new
5
+ crawler = Class.new
6
+ crawler.send(:include, Wombat::Parser)
7
+ @parser = crawler.new
6
8
  @metadata = Wombat::Metadata.new
7
9
  end
8
10
 
9
11
  it 'should request page document with correct url' do
10
- @metadata[:base_url] = "http://www.google.com"
11
- @metadata[:event_list_page] = "/search"
12
+ @metadata.base_url "http://www.google.com"
13
+ @metadata.list_page "/search"
12
14
  fake_document = double :document
13
15
  fake_parser = double :parser
14
16
  fake_document.should_receive(:parser).and_return(fake_parser)
@@ -26,66 +28,20 @@ describe Wombat::Parser do
26
28
  @parser.parse @metadata
27
29
  end
28
30
 
29
- it 'should invoke event callbacks' do
31
+ it 'should invoke metadata callbacks' do
30
32
  fake_document = double :document
31
33
  fake_parser = double :parser
32
34
  property = double :property
33
- properties = double :properties
34
35
  block_called = false
35
36
  block = lambda { |p| block_called = true }
36
37
 
37
38
  property.stub(:result)
38
39
  fake_document.should_receive(:parser).and_return(fake_parser)
39
40
  property.should_receive(:callback).twice.and_return(block)
40
- properties.should_receive(:all_properties).and_return [property]
41
41
 
42
42
  @parser.mechanize.stub(:get).and_return fake_document
43
43
  @parser.should_receive(:locate).with(@metadata)
44
- @metadata.should_receive(:event_props).and_return properties
45
-
46
- @parser.parse @metadata
47
-
48
- block_called.should be_true
49
- end
50
-
51
- it 'should invoke venue callbacks' do
52
- fake_document = double :document
53
- fake_parser = double :parser
54
- property = double :property
55
- properties = double :properties
56
- block_called = false
57
- block = lambda { |p| block_called = true }
58
-
59
- property.stub(:result)
60
- fake_document.should_receive(:parser).and_return(fake_parser)
61
- property.should_receive(:callback).twice.and_return(block)
62
- properties.should_receive(:all_properties).and_return [property]
63
-
64
- @parser.mechanize.stub(:get).and_return fake_document
65
- @parser.should_receive(:locate).with(@metadata)
66
- @metadata.should_receive(:venue_props).and_return properties
67
-
68
- @parser.parse @metadata
69
-
70
- block_called.should be_true
71
- end
72
-
73
- it 'should invoke location callbacks' do
74
- fake_document = double :document
75
- fake_parser = double :parser
76
- property = double :property
77
- properties = double :properties
78
- block_called = false
79
- block = lambda { |p| block_called = true }
80
-
81
- property.stub(:result)
82
- fake_document.should_receive(:parser).and_return(fake_parser)
83
- property.should_receive(:callback).twice.and_return(block)
84
- properties.should_receive(:all_properties).and_return [property]
85
-
86
- @parser.mechanize.stub(:get).and_return fake_document
87
- @parser.should_receive(:locate).with(@metadata)
88
- @metadata.should_receive(:venue_props).and_return properties
44
+ @metadata.should_receive(:all_properties).and_return [property]
89
45
 
90
46
  @parser.parse @metadata
91
47
 
@@ -96,7 +52,6 @@ describe Wombat::Parser do
96
52
  fake_document = double :document
97
53
  fake_parser = double :parser
98
54
  property = double :property
99
- properties = double :properties
100
55
  block_called = false
101
56
  block = lambda { |p|
102
57
  block_called = true
@@ -106,14 +61,25 @@ describe Wombat::Parser do
106
61
  property.should_receive(:result).and_return("blah")
107
62
  fake_document.should_receive(:parser).and_return(fake_parser)
108
63
  property.should_receive(:callback).twice.and_return(block)
109
- properties.should_receive(:all_properties).and_return [property]
110
64
 
111
65
  @parser.mechanize.stub(:get).and_return fake_document
112
66
  @parser.should_receive(:locate).with(@metadata)
113
- @metadata.should_receive(:event_props).and_return properties
67
+ @metadata.should_receive(:all_properties).and_return [property]
114
68
 
115
69
  @parser.parse @metadata
116
70
 
117
71
  block_called.should be_true
118
72
  end
73
+
74
+ it 'should return array with requested properties' do
75
+ hash = double :results
76
+ fake_parser = double :parser
77
+ fake_document = double :document
78
+
79
+ fake_document.should_receive(:parser).and_return fake_parser
80
+ @parser.mechanize.stub(:get).and_return fake_document
81
+ @metadata.should_receive(:flatten).and_return hash
82
+
83
+ @parser.parse(@metadata).should == hash
84
+ end
119
85
  end