wombat 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/README.md +13 -30
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
  5. data/lib/wombat/crawler.rb +7 -17
  6. data/lib/wombat/dsl/follower.rb +19 -0
  7. data/lib/wombat/dsl/iterator.rb +19 -0
  8. data/lib/wombat/dsl/metadata.rb +27 -0
  9. data/lib/wombat/dsl/property.rb +27 -0
  10. data/lib/wombat/dsl/property_group.rb +48 -0
  11. data/lib/wombat/processing/node_selector.rb +12 -0
  12. data/lib/wombat/processing/parser.rb +48 -0
  13. data/lib/wombat/property/locators/base.rb +33 -0
  14. data/lib/wombat/property/locators/factory.rb +39 -0
  15. data/lib/wombat/property/locators/follow.rb +25 -0
  16. data/lib/wombat/property/locators/html.rb +14 -0
  17. data/lib/wombat/property/locators/iterator.rb +23 -0
  18. data/lib/wombat/property/locators/list.rb +17 -0
  19. data/lib/wombat/property/locators/property_group.rb +20 -0
  20. data/lib/wombat/property/locators/text.rb +22 -0
  21. data/lib/wombat.rb +8 -4
  22. data/spec/crawler_spec.rb +38 -48
  23. data/spec/dsl/property_spec.rb +12 -0
  24. data/spec/helpers/sample_crawler.rb +2 -15
  25. data/spec/integration/integration_spec.rb +61 -33
  26. data/spec/processing/parser_spec.rb +32 -0
  27. data/spec/property/locators/factory_spec.rb +18 -0
  28. data/spec/property/locators/follow_spec.rb +4 -0
  29. data/spec/property/locators/html_spec.rb +15 -0
  30. data/spec/property/locators/iterator_spec.rb +4 -0
  31. data/spec/property/locators/list_spec.rb +13 -0
  32. data/spec/property/locators/text_spec.rb +49 -0
  33. data/spec/sample_crawler_spec.rb +7 -11
  34. data/spec/wombat_spec.rb +13 -1
  35. data/wombat.gemspec +27 -16
  36. metadata +27 -16
  37. data/lib/wombat/iterator.rb +0 -38
  38. data/lib/wombat/metadata.rb +0 -24
  39. data/lib/wombat/node_selector.rb +0 -10
  40. data/lib/wombat/parser.rb +0 -59
  41. data/lib/wombat/property.rb +0 -21
  42. data/lib/wombat/property_container.rb +0 -70
  43. data/lib/wombat/property_locator.rb +0 -20
  44. data/spec/iterator_spec.rb +0 -52
  45. data/spec/metadata_spec.rb +0 -20
  46. data/spec/parser_spec.rb +0 -125
  47. data/spec/property_container_spec.rb +0 -62
  48. data/spec/property_locator_spec.rb +0 -75
  49. data/spec/property_spec.rb +0 -16
@@ -1,24 +0,0 @@
1
- #coding: utf-8
2
- require 'wombat/property_container'
3
- require 'wombat/iterator'
4
-
5
- module Wombat
6
- class Metadata < PropertyContainer
7
- def initialize
8
- self[:document_format] = :html
9
- super
10
- end
11
-
12
- def base_url(url)
13
- self[:base_url] = url
14
- end
15
-
16
- def list_page(url)
17
- self[:list_page] = url
18
- end
19
-
20
- def document_format(format)
21
- self[:document_format] = format
22
- end
23
- end
24
- end
@@ -1,10 +0,0 @@
1
- module Wombat
2
- module NodeSelector
3
- def select_nodes(selector, namespaces = nil)
4
- return [selector.to_s] if selector.is_a? Symbol
5
- return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
6
- return context.css selector[4..-1] if selector.start_with? "css="
7
- [selector]
8
- end
9
- end
10
- end
data/lib/wombat/parser.rb DELETED
@@ -1,59 +0,0 @@
1
- #coding: utf-8
2
- require 'wombat/property_locator'
3
- require 'mechanize'
4
- require 'restclient'
5
-
6
- module Wombat
7
- module Parser
8
- include PropertyLocator
9
- attr_accessor :mechanize, :context, :response_code
10
-
11
- def initialize
12
- @mechanize = Mechanize.new
13
- end
14
-
15
- def parse(metadata)
16
- self.context = parser_for metadata
17
- original_context = self.context
18
-
19
- metadata.iterators.each do |it|
20
- it.reset # Clean up iterator results before starting
21
- select_nodes(it.selector).each do |node|
22
- self.context = node
23
- it.parse { |p| locate p }
24
- end
25
- end
26
-
27
- self.context = original_context
28
-
29
- metadata.parse { |p| locate p }
30
-
31
- metadata.flatten
32
- end
33
-
34
- private
35
- def parser_for(metadata)
36
- url = "#{metadata[:base_url]}#{metadata[:list_page]}"
37
- page = nil
38
- parser = nil
39
- begin
40
- if metadata[:document_format] == :html
41
- page = @mechanize.get(url)
42
- parser = page.parser
43
- else
44
- page = RestClient.get(url)
45
- parser = Nokogiri::XML page
46
- end
47
- self.response_code = page.code.to_i if page.respond_to? :code
48
- parser
49
- rescue
50
- if $!.respond_to? :http_code
51
- self.response_code = $!.http_code.to_i
52
- elsif $!.respond_to? :response_code
53
- self.response_code = $!.response_code.to_i
54
- end
55
- raise $!
56
- end
57
- end
58
- end
59
- end
@@ -1,21 +0,0 @@
1
- module Wombat
2
- class Property
3
- attr_accessor :name, :selector, :format, :namespaces, :callback, :result
4
-
5
- def initialize(options)
6
- @name = options[:name]
7
- @selector = options[:selector]
8
- @format = options[:format]
9
- @namespaces = options[:namespaces]
10
- @callback = options[:callback]
11
- end
12
-
13
- def flatten(depth = nil)
14
- depth ? result[depth] : result
15
- end
16
-
17
- def reset
18
- self.result = nil
19
- end
20
- end
21
- end
@@ -1,70 +0,0 @@
1
- #coding: utf-8
2
-
3
- module Wombat
4
- class PropertyContainer < Hash
5
- attr_accessor :iterators
6
-
7
- def initialize
8
- @iterators = []
9
- end
10
-
11
- def method_missing(method, *args, &block)
12
- if args.empty? && block
13
- self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
14
- block.call(self["#{method.to_s}"])
15
- else
16
- self[method.to_s] = Property.new(
17
- name: method.to_s,
18
- selector: args.first,
19
- format: args[1],
20
- namespaces: args[2],
21
- callback: block)
22
- end
23
- end
24
-
25
- def to_ary
26
- end
27
-
28
- def all_properties
29
- values.flat_map { |v|
30
- if v.kind_of? PropertyContainer
31
- v.all_properties
32
- elsif v.kind_of? Property
33
- v
34
- else
35
- nil
36
- end
37
- }.compact
38
- end
39
-
40
- def parse
41
- all_properties.each do |p|
42
- result = yield p if block_given?
43
- p.result = p.callback ? p.callback.call(result) : result
44
- end
45
- end
46
-
47
- def flatten(depth = nil)
48
- properties = Hash.new.tap do |h|
49
- keys.map do |k|
50
- val = self[k]
51
- if val.is_a?(PropertyContainer) || val.is_a?(Property)
52
- h[k] = val.flatten depth
53
- end
54
- end
55
- end
56
-
57
- iters = iterators.reduce({}) do |memo, i|
58
- memo.merge("iterator#{iterators.index(i)}" => i.flatten)
59
- end
60
-
61
- properties.merge iters
62
- end
63
-
64
- def for_each(selector)
65
- Iterator.new(selector).tap do |i|
66
- iterators << i
67
- end
68
- end
69
- end
70
- end
@@ -1,20 +0,0 @@
1
- #coding: utf-8
2
- require 'wombat/node_selector'
3
-
4
- module Wombat
5
- module PropertyLocator
6
- include NodeSelector
7
-
8
- def locate(property)
9
- props = _locate property
10
- property.format != :list ? props.first : props
11
- end
12
-
13
- private
14
- def _locate(property)
15
- result = select_nodes(property.selector, property.namespaces).to_a
16
- result.map! {|r| r.inner_html.strip } if property.format == :html
17
- result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
18
- end
19
- end
20
- end
@@ -1,52 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Iterator do
4
- let(:it) { Wombat::Iterator.new "it_selector" }
5
-
6
- context 'parse' do
7
- it 'should iterate in for_each properties' do
8
- it.prop_1 "some_selector"
9
- it.prop_2 "another_selector"
10
-
11
- it['prop_1'].should_receive(:result).twice.and_return([])
12
- it['prop_2'].should_receive(:result).twice.and_return([])
13
-
14
- parser = double :parser
15
- parser.should_receive(:locate).with(it['prop_1']).twice
16
- parser.should_receive(:locate).with(it['prop_2']).twice
17
-
18
- it.parse { |p| parser.locate p }
19
- it.parse { |p| parser.locate p }
20
- end
21
-
22
- it 'should raise if no block given' do
23
- expect{
24
- it.parse
25
- }.to raise_error(ArgumentError)
26
- end
27
- end
28
-
29
- context 'reset' do
30
- it 'should clean up properties results' do
31
- it.prop_1 'some_selector'
32
- it['prop_1'].result = [1, 2]
33
- it.reset
34
- it['prop_1'].result.should be_nil
35
- end
36
- end
37
-
38
- it 'should flatten properties to plain hash format' do
39
- it.prop_1 "some_selector"
40
- it.prop_2 "another_selector"
41
-
42
- it.parse {|p| }
43
- it.parse {|p| }
44
- it['prop_1'].result = ['result 1', 'result 2']
45
- it['prop_2'].result = ['result 3', 'result 4']
46
-
47
- it.flatten.should == [
48
- { "prop_1" => "result 1", "prop_2" => "result 3" },
49
- { "prop_1" => "result 2", "prop_2" => "result 4" }
50
- ]
51
- end
52
- end
@@ -1,20 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Metadata do
4
- before(:each) do
5
- @metadata = Wombat::Metadata.new
6
- end
7
-
8
- it 'should not include non-properties in all properties list' do
9
- @metadata.another_property "/some/selector", :text
10
- @metadata.base_url "felipecsl.com"
11
- @metadata.list_page "/yeah"
12
- @metadata.all_properties.should == [@metadata['another_property']]
13
- end
14
-
15
- it 'should store iterators' do
16
- @metadata.for_each("some_selector").kind_of?(Wombat::Iterator).should be_true
17
- @metadata.iterators.size.should == 1
18
- @metadata.iterators.first.selector.should == "some_selector"
19
- end
20
- end
data/spec/parser_spec.rb DELETED
@@ -1,125 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Parser do
4
- before(:each) do
5
- crawler = Class.new
6
- crawler.send(:include, Wombat::Parser)
7
- @parser = crawler.new
8
- @metadata = Wombat::Metadata.new
9
- end
10
-
11
- it 'should request page document with correct url' do
12
- @metadata.base_url "http://www.google.com"
13
- @metadata.list_page "/search"
14
- fake_document = double :document
15
- fake_parser = double :parser
16
- fake_document.should_receive(:parser).and_return(fake_parser)
17
- @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
18
-
19
- @parser.parse @metadata
20
- end
21
-
22
- it 'should send correct data to locate method' do
23
- fake_document = double :document
24
- fake_parser = double :parser
25
- fake_document.should_receive(:parser).and_return(fake_parser)
26
- @parser.mechanize.stub(:get).and_return fake_document
27
- @parser.should_not_receive :locate
28
- @parser.parse @metadata
29
- end
30
-
31
- it 'should invoke metadata callbacks' do
32
- fake_document = double :document
33
- fake_parser = double :parser
34
- property = double :property
35
- block_called = false
36
- block = lambda { |p| block_called = true }
37
-
38
- property.stub(:result)
39
- fake_document.should_receive(:parser).and_return(fake_parser)
40
- property.should_receive(:callback).twice.and_return(block)
41
- property.should_receive(:result=).with(true)
42
-
43
- @parser.mechanize.stub(:get).and_return fake_document
44
- @metadata.stub(:all_properties).and_return [property]
45
- @parser.should_receive(:locate).with(property)
46
-
47
- @parser.parse @metadata
48
-
49
- block_called.should be_true
50
- end
51
-
52
- it 'should invoke callback with parsed data' do
53
- fake_document = double :document
54
- fake_parser = double :parser
55
- property = double :property
56
- block_called = false
57
- block = lambda { |p|
58
- block_called = true
59
- p.should == "blah"
60
- }
61
-
62
- fake_document.should_receive(:parser).and_return(fake_parser)
63
- property.should_receive(:callback).twice.and_return(block)
64
- property.should_receive(:result=).with(true)
65
-
66
- @parser.mechanize.stub(:get).and_return fake_document
67
- @metadata.stub(:all_properties).and_return [property]
68
- @parser.should_receive(:locate).with(property).and_return("blah")
69
-
70
- @parser.parse @metadata
71
-
72
- block_called.should be_true
73
- end
74
-
75
- it 'should return hash with requested properties' do
76
- hash = double :results
77
- fake_parser = double :parser
78
- fake_document = double :document
79
-
80
- fake_document.should_receive(:parser).and_return fake_parser
81
- @parser.mechanize.stub(:get).and_return fake_document
82
- @metadata.should_receive(:flatten).and_return hash
83
-
84
- @parser.parse(@metadata).should == hash
85
- end
86
-
87
- it 'should not include null results in iterated block' do
88
- fake_parser = double :parser
89
- fake_document = double :document
90
- c1 = double :context
91
- c2 = double :context
92
- it = Wombat::Iterator.new "it_selector"
93
- it.prop_1 "some_selector"
94
-
95
- @parser.should_receive(:context=).ordered
96
- @metadata.should_receive(:iterators).and_return [it]
97
- @metadata.should_receive(:flatten)
98
- fake_document.should_receive(:parser).and_return(fake_parser)
99
- @parser.mechanize.stub(:get).and_return fake_document
100
- @parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
101
- @parser.should_receive(:context=).with(c1).ordered
102
- @parser.should_receive(:context=).with(c2).ordered
103
- @parser.should_receive(:context=).ordered
104
- @parser.should_receive(:locate).with(it['prop_1']).and_return(12)
105
- @parser.should_receive(:locate).with(it['prop_1']).and_return(nil)
106
- @parser.stub(:locate)
107
-
108
- @parser.parse(@metadata)
109
-
110
- it["prop_1"].result.should == [12]
111
- end
112
-
113
- it 'should correctly parse xml documents' do
114
- fake_document = double :xml
115
- fake_parser = double :parser
116
- @metadata.document_format :xml
117
- @parser.mechanize.should_not_receive(:get)
118
- RestClient.should_receive(:get).and_return fake_document
119
- Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
120
- @parser.should_receive(:context=).with(fake_parser)
121
- @parser.should_receive(:context=)
122
-
123
- @parser.parse @metadata
124
- end
125
- end
@@ -1,62 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::PropertyContainer do
4
- before(:each) do
5
- @metadata = Wombat::PropertyContainer.new
6
- end
7
-
8
- it 'should return an array with all the metadata properties excluding iterators' do
9
- @metadata["event"] = Wombat::PropertyContainer.new
10
- @metadata["venue"] = Wombat::PropertyContainer.new
11
- @metadata.another_property "/some/selector", :text
12
- @metadata["event"]["something"] = Wombat::PropertyContainer.new
13
- @metadata["event"]["something"].else "Wohooo"
14
- @metadata["venue"].awesome "whooea"
15
- it = Wombat::Iterator.new "it_selector"
16
- it.felipe "lima"
17
- @metadata.iterators << it
18
-
19
- all_propes = @metadata.all_properties
20
-
21
- all_propes.should =~ [
22
- @metadata["another_property"],
23
- @metadata["event"]["something"]["else"],
24
- @metadata["venue"]["awesome"]
25
- ]
26
- end
27
-
28
- it 'should be able to change properties via all_properties' do
29
- @metadata.another_property "/some/selector", :text
30
- @metadata.all_properties.first.selector = "abc"
31
- @metadata["another_property"].selector.should == "abc"
32
- end
33
-
34
- it 'should return metadata in plain hash format including iterators' do
35
- @metadata.title "/some/selector"
36
- @metadata["title"].result = "Gogobot Inc."
37
- @metadata["holder"] = Wombat::PropertyContainer.new
38
- @metadata["holder"].heading "css=.heading"
39
- @metadata["holder"]["heading"].result = 123456
40
- @metadata["holder"]["subheader"] = Wombat::PropertyContainer.new
41
- @metadata["holder"]["subheader"].section "/blah"
42
- @metadata["holder"]["subheader"]["section"].result = "Lorem Ipsum"
43
- it = Wombat::Iterator.new "it_selector"
44
- it.felipe "lima"
45
- it["felipe"].result = ["correa", "de souza", "lima"]
46
- @metadata.iterators = [it]
47
- @metadata.footer("another thing", :html) { |a| true }
48
- @metadata["footer"].result = "bla bla bla"
49
-
50
- @metadata.flatten.should == {
51
- "title" => "Gogobot Inc.",
52
- "holder" => {
53
- "heading" => 123456,
54
- "subheader" => {
55
- "section" => "Lorem Ipsum"
56
- }
57
- },
58
- "iterator0"=>[{"felipe"=>"correa"}, {"felipe"=>"de souza"}, {"felipe"=>"lima"}],
59
- "footer" => "bla bla bla"
60
- }
61
- end
62
- end
@@ -1,75 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::PropertyLocator do
4
- before(:each) do
5
- @locator = Class.new
6
- @locator.send(:include, Wombat::PropertyLocator)
7
- @locator_instance = @locator.new
8
- @metadata = Wombat::Metadata.new
9
- @metadata["event"] = Wombat::PropertyContainer.new
10
- @metadata["venue"] = Wombat::PropertyContainer.new
11
- @metadata["location"] = Wombat::PropertyContainer.new
12
- end
13
-
14
- it 'should locate metadata properties' do
15
- context = double :context
16
- abc = double :abc
17
-
18
- abc.stub(:inner_text).and_return("Something cool")
19
-
20
- context.stub(:xpath).with("/abc", nil).and_return([abc])
21
- context.stub(:xpath).with("/bah", nil).and_return(["abc"])
22
- context.stub(:css).with("/ghi").and_return(["Another stuff"])
23
-
24
- @metadata["event"].data1 "xpath=/abc"
25
- @metadata["venue"].data2 :farms
26
- @metadata["location"].data3 "css=/ghi"
27
- @metadata.blah "xpath=/bah"
28
-
29
- @locator_instance.stub(:context).and_return context
30
-
31
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
32
-
33
- @metadata["blah"].result.should == "abc"
34
- @metadata["event"]["data1"].result.should == "Something cool"
35
- @metadata["venue"]["data2"].result.should == "farms"
36
- @metadata["location"]["data3"].result.should == "Another stuff"
37
- end
38
-
39
- it 'should support properties with html format' do
40
- context = double :context
41
- html_info = double :html_info
42
-
43
- html_info.should_receive(:inner_html).and_return("some another info ")
44
- context.should_receive(:xpath).with("/anotherData", nil).and_return([html_info])
45
-
46
- @locator_instance.stub(:context).and_return context
47
-
48
- @metadata["event"].another_info "xpath=/anotherData", :html
49
-
50
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
51
-
52
- @metadata["event"]["another_info"].result.should == "some another info"
53
- end
54
-
55
- it 'should trim property contents and use namespaces if present' do
56
- context = double :context
57
- context.should_receive(:xpath).with("/event/some/description", "blah").and_return([" awesome event "])
58
-
59
- @locator_instance.stub(:context).and_return context
60
- @metadata["event"].description "xpath=/event/some/description", :text, "blah"
61
-
62
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
63
-
64
- @metadata["event"]["description"].result.should == "awesome event"
65
- end
66
-
67
- it 'should return array of matching nodes for list properties' do
68
- context = double :context
69
- @metadata.list_prop "css=.selector", :list
70
- @locator_instance.stub(:context).and_return context
71
- @locator_instance.should_receive(:select_nodes).with("css=.selector", nil).and_return %w(1 2 3 4 5)
72
-
73
- @locator_instance.locate(@metadata["list_prop"]).should == %w(1 2 3 4 5)
74
- end
75
- end
@@ -1,16 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Wombat::Property do
4
- it 'should store property data' do
5
- property = Wombat::Property.new(
6
- name: "title",
7
- selector: "/some/selector",
8
- format: :html,
9
- callback: lambda {})
10
-
11
- property.name.should == "title"
12
- property.selector.should == "/some/selector"
13
- property.format.should == :html
14
- property.callback.should == lambda {}
15
- end
16
- end