wombat 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/README.md +13 -30
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
  5. data/lib/wombat/crawler.rb +7 -17
  6. data/lib/wombat/dsl/follower.rb +19 -0
  7. data/lib/wombat/dsl/iterator.rb +19 -0
  8. data/lib/wombat/dsl/metadata.rb +27 -0
  9. data/lib/wombat/dsl/property.rb +27 -0
  10. data/lib/wombat/dsl/property_group.rb +48 -0
  11. data/lib/wombat/processing/node_selector.rb +12 -0
  12. data/lib/wombat/processing/parser.rb +48 -0
  13. data/lib/wombat/property/locators/base.rb +33 -0
  14. data/lib/wombat/property/locators/factory.rb +39 -0
  15. data/lib/wombat/property/locators/follow.rb +25 -0
  16. data/lib/wombat/property/locators/html.rb +14 -0
  17. data/lib/wombat/property/locators/iterator.rb +23 -0
  18. data/lib/wombat/property/locators/list.rb +17 -0
  19. data/lib/wombat/property/locators/property_group.rb +20 -0
  20. data/lib/wombat/property/locators/text.rb +22 -0
  21. data/lib/wombat.rb +8 -4
  22. data/spec/crawler_spec.rb +38 -48
  23. data/spec/dsl/property_spec.rb +12 -0
  24. data/spec/helpers/sample_crawler.rb +2 -15
  25. data/spec/integration/integration_spec.rb +61 -33
  26. data/spec/processing/parser_spec.rb +32 -0
  27. data/spec/property/locators/factory_spec.rb +18 -0
  28. data/spec/property/locators/follow_spec.rb +4 -0
  29. data/spec/property/locators/html_spec.rb +15 -0
  30. data/spec/property/locators/iterator_spec.rb +4 -0
  31. data/spec/property/locators/list_spec.rb +13 -0
  32. data/spec/property/locators/text_spec.rb +49 -0
  33. data/spec/sample_crawler_spec.rb +7 -11
  34. data/spec/wombat_spec.rb +13 -1
  35. data/wombat.gemspec +27 -16
  36. metadata +27 -16
  37. data/lib/wombat/iterator.rb +0 -38
  38. data/lib/wombat/metadata.rb +0 -24
  39. data/lib/wombat/node_selector.rb +0 -10
  40. data/lib/wombat/parser.rb +0 -59
  41. data/lib/wombat/property.rb +0 -21
  42. data/lib/wombat/property_container.rb +0 -70
  43. data/lib/wombat/property_locator.rb +0 -20
  44. data/spec/iterator_spec.rb +0 -52
  45. data/spec/metadata_spec.rb +0 -20
  46. data/spec/parser_spec.rb +0 -125
  47. data/spec/property_container_spec.rb +0 -62
  48. data/spec/property_locator_spec.rb +0 -75
  49. data/spec/property_spec.rb +0 -16
data/lib/wombat/metadata.rb DELETED
@@ -1,24 +0,0 @@
1
- #coding: utf-8
2
- require 'wombat/property_container'
3
- require 'wombat/iterator'
4
-
5
- module Wombat
6
- class Metadata < PropertyContainer
7
- def initialize
8
- self[:document_format] = :html
9
- super
10
- end
11
-
12
- def base_url(url)
13
- self[:base_url] = url
14
- end
15
-
16
- def list_page(url)
17
- self[:list_page] = url
18
- end
19
-
20
- def document_format(format)
21
- self[:document_format] = format
22
- end
23
- end
24
- end
data/lib/wombat/node_selector.rb DELETED
@@ -1,10 +0,0 @@
1
- module Wombat
2
- module NodeSelector
3
- def select_nodes(selector, namespaces = nil)
4
- return [selector.to_s] if selector.is_a? Symbol
5
- return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
6
- return context.css selector[4..-1] if selector.start_with? "css="
7
- [selector]
8
- end
9
- end
10
- end
data/lib/wombat/parser.rb DELETED
@@ -1,59 +0,0 @@
1
- #coding: utf-8
2
- require 'wombat/property_locator'
3
- require 'mechanize'
4
- require 'restclient'
5
-
6
- module Wombat
7
- module Parser
8
- include PropertyLocator
9
- attr_accessor :mechanize, :context, :response_code
10
-
11
- def initialize
12
- @mechanize = Mechanize.new
13
- end
14
-
15
- def parse(metadata)
16
- self.context = parser_for metadata
17
- original_context = self.context
18
-
19
- metadata.iterators.each do |it|
20
- it.reset # Clean up iterator results before starting
21
- select_nodes(it.selector).each do |node|
22
- self.context = node
23
- it.parse { |p| locate p }
24
- end
25
- end
26
-
27
- self.context = original_context
28
-
29
- metadata.parse { |p| locate p }
30
-
31
- metadata.flatten
32
- end
33
-
34
- private
35
- def parser_for(metadata)
36
- url = "#{metadata[:base_url]}#{metadata[:list_page]}"
37
- page = nil
38
- parser = nil
39
- begin
40
- if metadata[:document_format] == :html
41
- page = @mechanize.get(url)
42
- parser = page.parser
43
- else
44
- page = RestClient.get(url)
45
- parser = Nokogiri::XML page
46
- end
47
- self.response_code = page.code.to_i if page.respond_to? :code
48
- parser
49
- rescue
50
- if $!.respond_to? :http_code
51
- self.response_code = $!.http_code.to_i
52
- elsif $!.respond_to? :response_code
53
- self.response_code = $!.response_code.to_i
54
- end
55
- raise $!
56
- end
57
- end
58
- end
59
- end
data/lib/wombat/property.rb DELETED
@@ -1,21 +0,0 @@
1
- module Wombat
2
- class Property
3
- attr_accessor :name, :selector, :format, :namespaces, :callback, :result
4
-
5
- def initialize(options)
6
- @name = options[:name]
7
- @selector = options[:selector]
8
- @format = options[:format]
9
- @namespaces = options[:namespaces]
10
- @callback = options[:callback]
11
- end
12
-
13
- def flatten(depth = nil)
14
- depth ? result[depth] : result
15
- end
16
-
17
- def reset
18
- self.result = nil
19
- end
20
- end
21
- end
data/lib/wombat/property_container.rb DELETED
@@ -1,70 +0,0 @@
1
- #coding: utf-8
2
-
3
- module Wombat
4
- class PropertyContainer < Hash
5
- attr_accessor :iterators
6
-
7
- def initialize
8
- @iterators = []
9
- end
10
-
11
- def method_missing(method, *args, &block)
12
- if args.empty? && block
13
- self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
14
- block.call(self["#{method.to_s}"])
15
- else
16
- self[method.to_s] = Property.new(
17
- name: method.to_s,
18
- selector: args.first,
19
- format: args[1],
20
- namespaces: args[2],
21
- callback: block)
22
- end
23
- end
24
-
25
- def to_ary
26
- end
27
-
28
- def all_properties
29
- values.flat_map { |v|
30
- if v.kind_of? PropertyContainer
31
- v.all_properties
32
- elsif v.kind_of? Property
33
- v
34
- else
35
- nil
36
- end
37
- }.compact
38
- end
39
-
40
- def parse
41
- all_properties.each do |p|
42
- result = yield p if block_given?
43
- p.result = p.callback ? p.callback.call(result) : result
44
- end
45
- end
46
-
47
- def flatten(depth = nil)
48
- properties = Hash.new.tap do |h|
49
- keys.map do |k|
50
- val = self[k]
51
- if val.is_a?(PropertyContainer) || val.is_a?(Property)
52
- h[k] = val.flatten depth
53
- end
54
- end
55
- end
56
-
57
- iters = iterators.reduce({}) do |memo, i|
58
- memo.merge("iterator#{iterators.index(i)}" => i.flatten)
59
- end
60
-
61
- properties.merge iters
62
- end
63
-
64
- def for_each(selector)
65
- Iterator.new(selector).tap do |i|
66
- iterators << i
67
- end
68
- end
69
- end
70
- end
data/lib/wombat/property_locator.rb DELETED
@@ -1,20 +0,0 @@
1
- #coding: utf-8
2
- require 'wombat/node_selector'
3
-
4
- module Wombat
5
- module PropertyLocator
6
- include NodeSelector
7
-
8
- def locate(property)
9
- props = _locate property
10
- property.format != :list ? props.first : props
11
- end
12
-
13
- private
14
- def _locate(property)
15
- result = select_nodes(property.selector, property.namespaces).to_a
16
- result.map! {|r| r.inner_html.strip } if property.format == :html
17
- result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
18
- end
19
- end
20
- end
data/spec/iterator_spec.rb DELETED
@@ -1,52 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Iterator do
4
- let(:it) { Wombat::Iterator.new "it_selector" }
5
-
6
- context 'parse' do
7
- it 'should iterate in for_each properties' do
8
- it.prop_1 "some_selector"
9
- it.prop_2 "another_selector"
10
-
11
- it['prop_1'].should_receive(:result).twice.and_return([])
12
- it['prop_2'].should_receive(:result).twice.and_return([])
13
-
14
- parser = double :parser
15
- parser.should_receive(:locate).with(it['prop_1']).twice
16
- parser.should_receive(:locate).with(it['prop_2']).twice
17
-
18
- it.parse { |p| parser.locate p }
19
- it.parse { |p| parser.locate p }
20
- end
21
-
22
- it 'should raise if no block given' do
23
- expect{
24
- it.parse
25
- }.to raise_error(ArgumentError)
26
- end
27
- end
28
-
29
- context 'reset' do
30
- it 'should clean up properties results' do
31
- it.prop_1 'some_selector'
32
- it['prop_1'].result = [1, 2]
33
- it.reset
34
- it['prop_1'].result.should be_nil
35
- end
36
- end
37
-
38
- it 'should flatten properties to plain hash format' do
39
- it.prop_1 "some_selector"
40
- it.prop_2 "another_selector"
41
-
42
- it.parse {|p| }
43
- it.parse {|p| }
44
- it['prop_1'].result = ['result 1', 'result 2']
45
- it['prop_2'].result = ['result 3', 'result 4']
46
-
47
- it.flatten.should == [
48
- { "prop_1" => "result 1", "prop_2" => "result 3" },
49
- { "prop_1" => "result 2", "prop_2" => "result 4" }
50
- ]
51
- end
52
- end
data/spec/metadata_spec.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Metadata do
4
- before(:each) do
5
- @metadata = Wombat::Metadata.new
6
- end
7
-
8
- it 'should not include non-properties in all properties list' do
9
- @metadata.another_property "/some/selector", :text
10
- @metadata.base_url "felipecsl.com"
11
- @metadata.list_page "/yeah"
12
- @metadata.all_properties.should == [@metadata['another_property']]
13
- end
14
-
15
- it 'should store iterators' do
16
- @metadata.for_each("some_selector").kind_of?(Wombat::Iterator).should be_true
17
- @metadata.iterators.size.should == 1
18
- @metadata.iterators.first.selector.should == "some_selector"
19
- end
20
- end
data/spec/parser_spec.rb DELETED
@@ -1,125 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Parser do
4
- before(:each) do
5
- crawler = Class.new
6
- crawler.send(:include, Wombat::Parser)
7
- @parser = crawler.new
8
- @metadata = Wombat::Metadata.new
9
- end
10
-
11
- it 'should request page document with correct url' do
12
- @metadata.base_url "http://www.google.com"
13
- @metadata.list_page "/search"
14
- fake_document = double :document
15
- fake_parser = double :parser
16
- fake_document.should_receive(:parser).and_return(fake_parser)
17
- @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
18
-
19
- @parser.parse @metadata
20
- end
21
-
22
- it 'should send correct data to locate method' do
23
- fake_document = double :document
24
- fake_parser = double :parser
25
- fake_document.should_receive(:parser).and_return(fake_parser)
26
- @parser.mechanize.stub(:get).and_return fake_document
27
- @parser.should_not_receive :locate
28
- @parser.parse @metadata
29
- end
30
-
31
- it 'should invoke metadata callbacks' do
32
- fake_document = double :document
33
- fake_parser = double :parser
34
- property = double :property
35
- block_called = false
36
- block = lambda { |p| block_called = true }
37
-
38
- property.stub(:result)
39
- fake_document.should_receive(:parser).and_return(fake_parser)
40
- property.should_receive(:callback).twice.and_return(block)
41
- property.should_receive(:result=).with(true)
42
-
43
- @parser.mechanize.stub(:get).and_return fake_document
44
- @metadata.stub(:all_properties).and_return [property]
45
- @parser.should_receive(:locate).with(property)
46
-
47
- @parser.parse @metadata
48
-
49
- block_called.should be_true
50
- end
51
-
52
- it 'should invoke callback with parsed data' do
53
- fake_document = double :document
54
- fake_parser = double :parser
55
- property = double :property
56
- block_called = false
57
- block = lambda { |p|
58
- block_called = true
59
- p.should == "blah"
60
- }
61
-
62
- fake_document.should_receive(:parser).and_return(fake_parser)
63
- property.should_receive(:callback).twice.and_return(block)
64
- property.should_receive(:result=).with(true)
65
-
66
- @parser.mechanize.stub(:get).and_return fake_document
67
- @metadata.stub(:all_properties).and_return [property]
68
- @parser.should_receive(:locate).with(property).and_return("blah")
69
-
70
- @parser.parse @metadata
71
-
72
- block_called.should be_true
73
- end
74
-
75
- it 'should return hash with requested properties' do
76
- hash = double :results
77
- fake_parser = double :parser
78
- fake_document = double :document
79
-
80
- fake_document.should_receive(:parser).and_return fake_parser
81
- @parser.mechanize.stub(:get).and_return fake_document
82
- @metadata.should_receive(:flatten).and_return hash
83
-
84
- @parser.parse(@metadata).should == hash
85
- end
86
-
87
- it 'should not include null results in iterated block' do
88
- fake_parser = double :parser
89
- fake_document = double :document
90
- c1 = double :context
91
- c2 = double :context
92
- it = Wombat::Iterator.new "it_selector"
93
- it.prop_1 "some_selector"
94
-
95
- @parser.should_receive(:context=).ordered
96
- @metadata.should_receive(:iterators).and_return [it]
97
- @metadata.should_receive(:flatten)
98
- fake_document.should_receive(:parser).and_return(fake_parser)
99
- @parser.mechanize.stub(:get).and_return fake_document
100
- @parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
101
- @parser.should_receive(:context=).with(c1).ordered
102
- @parser.should_receive(:context=).with(c2).ordered
103
- @parser.should_receive(:context=).ordered
104
- @parser.should_receive(:locate).with(it['prop_1']).and_return(12)
105
- @parser.should_receive(:locate).with(it['prop_1']).and_return(nil)
106
- @parser.stub(:locate)
107
-
108
- @parser.parse(@metadata)
109
-
110
- it["prop_1"].result.should == [12]
111
- end
112
-
113
- it 'should correctly parse xml documents' do
114
- fake_document = double :xml
115
- fake_parser = double :parser
116
- @metadata.document_format :xml
117
- @parser.mechanize.should_not_receive(:get)
118
- RestClient.should_receive(:get).and_return fake_document
119
- Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
120
- @parser.should_receive(:context=).with(fake_parser)
121
- @parser.should_receive(:context=)
122
-
123
- @parser.parse @metadata
124
- end
125
- end
data/spec/property_container_spec.rb DELETED
@@ -1,62 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::PropertyContainer do
4
- before(:each) do
5
- @metadata = Wombat::PropertyContainer.new
6
- end
7
-
8
- it 'should return an array with all the metadata properties excluding iterators' do
9
- @metadata["event"] = Wombat::PropertyContainer.new
10
- @metadata["venue"] = Wombat::PropertyContainer.new
11
- @metadata.another_property "/some/selector", :text
12
- @metadata["event"]["something"] = Wombat::PropertyContainer.new
13
- @metadata["event"]["something"].else "Wohooo"
14
- @metadata["venue"].awesome "whooea"
15
- it = Wombat::Iterator.new "it_selector"
16
- it.felipe "lima"
17
- @metadata.iterators << it
18
-
19
- all_propes = @metadata.all_properties
20
-
21
- all_propes.should =~ [
22
- @metadata["another_property"],
23
- @metadata["event"]["something"]["else"],
24
- @metadata["venue"]["awesome"]
25
- ]
26
- end
27
-
28
- it 'should be able to change properties via all_properties' do
29
- @metadata.another_property "/some/selector", :text
30
- @metadata.all_properties.first.selector = "abc"
31
- @metadata["another_property"].selector.should == "abc"
32
- end
33
-
34
- it 'should return metadata in plain hash format including iterators' do
35
- @metadata.title "/some/selector"
36
- @metadata["title"].result = "Gogobot Inc."
37
- @metadata["holder"] = Wombat::PropertyContainer.new
38
- @metadata["holder"].heading "css=.heading"
39
- @metadata["holder"]["heading"].result = 123456
40
- @metadata["holder"]["subheader"] = Wombat::PropertyContainer.new
41
- @metadata["holder"]["subheader"].section "/blah"
42
- @metadata["holder"]["subheader"]["section"].result = "Lorem Ipsum"
43
- it = Wombat::Iterator.new "it_selector"
44
- it.felipe "lima"
45
- it["felipe"].result = ["correa", "de souza", "lima"]
46
- @metadata.iterators = [it]
47
- @metadata.footer("another thing", :html) { |a| true }
48
- @metadata["footer"].result = "bla bla bla"
49
-
50
- @metadata.flatten.should == {
51
- "title" => "Gogobot Inc.",
52
- "holder" => {
53
- "heading" => 123456,
54
- "subheader" => {
55
- "section" => "Lorem Ipsum"
56
- }
57
- },
58
- "iterator0"=>[{"felipe"=>"correa"}, {"felipe"=>"de souza"}, {"felipe"=>"lima"}],
59
- "footer" => "bla bla bla"
60
- }
61
- end
62
- end
data/spec/property_locator_spec.rb DELETED
@@ -1,75 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::PropertyLocator do
4
- before(:each) do
5
- @locator = Class.new
6
- @locator.send(:include, Wombat::PropertyLocator)
7
- @locator_instance = @locator.new
8
- @metadata = Wombat::Metadata.new
9
- @metadata["event"] = Wombat::PropertyContainer.new
10
- @metadata["venue"] = Wombat::PropertyContainer.new
11
- @metadata["location"] = Wombat::PropertyContainer.new
12
- end
13
-
14
- it 'should locate metadata properties' do
15
- context = double :context
16
- abc = double :abc
17
-
18
- abc.stub(:inner_text).and_return("Something cool")
19
-
20
- context.stub(:xpath).with("/abc", nil).and_return([abc])
21
- context.stub(:xpath).with("/bah", nil).and_return(["abc"])
22
- context.stub(:css).with("/ghi").and_return(["Another stuff"])
23
-
24
- @metadata["event"].data1 "xpath=/abc"
25
- @metadata["venue"].data2 :farms
26
- @metadata["location"].data3 "css=/ghi"
27
- @metadata.blah "xpath=/bah"
28
-
29
- @locator_instance.stub(:context).and_return context
30
-
31
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
32
-
33
- @metadata["blah"].result.should == "abc"
34
- @metadata["event"]["data1"].result.should == "Something cool"
35
- @metadata["venue"]["data2"].result.should == "farms"
36
- @metadata["location"]["data3"].result.should == "Another stuff"
37
- end
38
-
39
- it 'should support properties with html format' do
40
- context = double :context
41
- html_info = double :html_info
42
-
43
- html_info.should_receive(:inner_html).and_return("some another info ")
44
- context.should_receive(:xpath).with("/anotherData", nil).and_return([html_info])
45
-
46
- @locator_instance.stub(:context).and_return context
47
-
48
- @metadata["event"].another_info "xpath=/anotherData", :html
49
-
50
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
51
-
52
- @metadata["event"]["another_info"].result.should == "some another info"
53
- end
54
-
55
- it 'should trim property contents and use namespaces if present' do
56
- context = double :context
57
- context.should_receive(:xpath).with("/event/some/description", "blah").and_return([" awesome event "])
58
-
59
- @locator_instance.stub(:context).and_return context
60
- @metadata["event"].description "xpath=/event/some/description", :text, "blah"
61
-
62
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
63
-
64
- @metadata["event"]["description"].result.should == "awesome event"
65
- end
66
-
67
- it 'should return array of matching nodes for list properties' do
68
- context = double :context
69
- @metadata.list_prop "css=.selector", :list
70
- @locator_instance.stub(:context).and_return context
71
- @locator_instance.should_receive(:select_nodes).with("css=.selector", nil).and_return %w(1 2 3 4 5)
72
-
73
- @locator_instance.locate(@metadata["list_prop"]).should == %w(1 2 3 4 5)
74
- end
75
- end
data/spec/property_spec.rb DELETED
@@ -1,16 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Property do
4
- it 'should store property data' do
5
- property = Wombat::Property.new(
6
- name: "title",
7
- selector: "/some/selector",
8
- format: :html,
9
- callback: lambda {})
10
-
11
- property.name.should == "title"
12
- property.selector.should == "/some/selector"
13
- property.format.should == :html
14
- property.callback.should == lambda {}
15
- end
16
- end