wombat 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/README.md +13 -30
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
  5. data/lib/wombat/crawler.rb +7 -17
  6. data/lib/wombat/dsl/follower.rb +19 -0
  7. data/lib/wombat/dsl/iterator.rb +19 -0
  8. data/lib/wombat/dsl/metadata.rb +27 -0
  9. data/lib/wombat/dsl/property.rb +27 -0
  10. data/lib/wombat/dsl/property_group.rb +48 -0
  11. data/lib/wombat/processing/node_selector.rb +12 -0
  12. data/lib/wombat/processing/parser.rb +48 -0
  13. data/lib/wombat/property/locators/base.rb +33 -0
  14. data/lib/wombat/property/locators/factory.rb +39 -0
  15. data/lib/wombat/property/locators/follow.rb +25 -0
  16. data/lib/wombat/property/locators/html.rb +14 -0
  17. data/lib/wombat/property/locators/iterator.rb +23 -0
  18. data/lib/wombat/property/locators/list.rb +17 -0
  19. data/lib/wombat/property/locators/property_group.rb +20 -0
  20. data/lib/wombat/property/locators/text.rb +22 -0
  21. data/lib/wombat.rb +8 -4
  22. data/spec/crawler_spec.rb +38 -48
  23. data/spec/dsl/property_spec.rb +12 -0
  24. data/spec/helpers/sample_crawler.rb +2 -15
  25. data/spec/integration/integration_spec.rb +61 -33
  26. data/spec/processing/parser_spec.rb +32 -0
  27. data/spec/property/locators/factory_spec.rb +18 -0
  28. data/spec/property/locators/follow_spec.rb +4 -0
  29. data/spec/property/locators/html_spec.rb +15 -0
  30. data/spec/property/locators/iterator_spec.rb +4 -0
  31. data/spec/property/locators/list_spec.rb +13 -0
  32. data/spec/property/locators/text_spec.rb +49 -0
  33. data/spec/sample_crawler_spec.rb +7 -11
  34. data/spec/wombat_spec.rb +13 -1
  35. data/wombat.gemspec +27 -16
  36. metadata +27 -16
  37. data/lib/wombat/iterator.rb +0 -38
  38. data/lib/wombat/metadata.rb +0 -24
  39. data/lib/wombat/node_selector.rb +0 -10
  40. data/lib/wombat/parser.rb +0 -59
  41. data/lib/wombat/property.rb +0 -21
  42. data/lib/wombat/property_container.rb +0 -70
  43. data/lib/wombat/property_locator.rb +0 -20
  44. data/spec/iterator_spec.rb +0 -52
  45. data/spec/metadata_spec.rb +0 -20
  46. data/spec/parser_spec.rb +0 -125
  47. data/spec/property_container_spec.rb +0 -62
  48. data/spec/property_locator_spec.rb +0 -75
  49. data/spec/property_spec.rb +0 -16
@@ -1,13 +1,13 @@
1
1
  #coding: utf-8
2
- require 'wombat/metadata'
3
- require 'wombat/property'
4
- require 'wombat/parser'
2
+ require 'wombat/dsl/metadata'
3
+ require 'wombat/dsl/property'
4
+ require 'wombat/processing/parser'
5
5
  require 'active_support'
6
6
  require 'date'
7
7
 
8
8
  module Wombat
9
9
  module Crawler
10
- include Parser
10
+ include Processing::Parser
11
11
  extend ActiveSupport::Concern
12
12
 
13
13
  def crawl(&block)
@@ -31,33 +31,23 @@ module Wombat
31
31
  end
32
32
  end
33
33
 
34
+ alias_method :scrape, :crawl
35
+
34
36
  def method_missing(method, *args, &block)
35
37
  self.class.send method, *args, &block
36
38
  end
37
39
 
38
- def for_each(selector, &block)
39
- self.class.for_each selector, &block
40
- end
41
-
42
40
  module ClassMethods
43
41
  def method_missing(method, *args, &block)
44
42
  metadata.send method, *args, &block
45
43
  end
46
44
 
47
- def for_each(selector, &block)
48
- metadata.for_each(selector).instance_eval(&block) if block
49
- end
50
-
51
- def follow_links(selector)
52
-
53
- end
54
-
55
45
  def to_ary
56
46
  end
57
47
 
58
48
  private
59
49
  def metadata
60
- @metadata ||= Metadata.new
50
+ @metadata ||= DSL::Metadata.new
61
51
  end
62
52
  end
63
53
  end
@@ -0,0 +1,19 @@
1
+ module Wombat
2
+ module DSL
3
+ class Follower < PropertyGroup
4
+ attr_accessor :wombat_property_selector
5
+
6
+ def initialize(name, selector)
7
+ @wombat_property_selector = selector
8
+
9
+ super(name)
10
+ end
11
+
12
+ # So that Property::Locators::Iterator can identify this class
13
+ # as an iterator property.
14
+ def wombat_property_format
15
+ :follow
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,19 @@
1
+ module Wombat
2
+ module DSL
3
+ class Iterator < PropertyGroup
4
+ attr_accessor :wombat_property_selector
5
+
6
+ def initialize(name, selector)
7
+ @wombat_property_selector = selector
8
+
9
+ super(name)
10
+ end
11
+
12
+ # So that Property::Locators::Iterator can identify this class
13
+ # as an iterator property.
14
+ def wombat_property_format
15
+ :iterator
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,27 @@
1
+ #coding: utf-8
2
+ require 'wombat/dsl/property_group'
3
+ require 'wombat/dsl/iterator'
4
+ require 'wombat/dsl/follower'
5
+
6
+ module Wombat
7
+ module DSL
8
+ class Metadata < PropertyGroup
9
+ def initialize
10
+ self[:document_format] = :html
11
+ super
12
+ end
13
+
14
+ def base_url(url)
15
+ self[:base_url] = url
16
+ end
17
+
18
+ def path(url)
19
+ self[:path] = url
20
+ end
21
+
22
+ def document_format(format)
23
+ self[:document_format] = format
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,27 @@
1
+ module Wombat
2
+ module DSL
3
+ class Property
4
+ attr_accessor :wombat_property_name, :wombat_property_selector, :wombat_property_format, :wombat_property_namespaces, :callback
5
+
6
+ def initialize(name, *args, &block)
7
+ @wombat_property_name = name
8
+ @wombat_property_selector = args[0]
9
+ @wombat_property_format = args[1] || :text
10
+ @wombat_property_namespaces = args[2]
11
+ @callback = block
12
+ end
13
+
14
+ def selector
15
+ @wombat_property_selector
16
+ end
17
+
18
+ def namespaces
19
+ @wombat_property_namespaces
20
+ end
21
+
22
+ def format
23
+ @wombat_property_format
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,48 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ module DSL
5
+ class PropertyGroup < Hash
6
+ attr_accessor :wombat_property_name
7
+
8
+ def initialize(name = nil)
9
+ @wombat_property_name = name
10
+ end
11
+
12
+ def method_missing(method, *args, &block)
13
+ property_name = method.to_s
14
+
15
+ if args.empty? && block
16
+ # TODO: Verify if another property with same name already exists
17
+ # before overwriting
18
+ property_group = self[property_name] || PropertyGroup.new(property_name)
19
+ self[property_name] = property_group
20
+ property_group.instance_eval(&block)
21
+ else
22
+ if args[1] == :iterator
23
+ it = Iterator.new(property_name, args.first)
24
+ self[property_name] = it
25
+ it.instance_eval(&block) if block
26
+ elsif args[1] == :follow
27
+ it = Follower.new(property_name, args.first)
28
+ self[property_name] = it
29
+ it.instance_eval(&block) if block
30
+ else
31
+ self[property_name] = Property.new(property_name, *args, &block)
32
+ end
33
+ end
34
+ end
35
+
36
+ def to_ary
37
+ end
38
+
39
+ def wombat_property_format
40
+ :container
41
+ end
42
+
43
+ def wombat_property_namespaces
44
+ nil
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,12 @@
1
+ module Wombat
2
+ module Processing
3
+ module NodeSelector
4
+ def select_nodes(selector, namespaces = nil)
5
+ return [selector.to_s] if selector.is_a? Symbol
6
+ return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
7
+ return @context.css selector[4..-1] if selector.start_with? "css="
8
+ [selector]
9
+ end
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,48 @@
1
+ #coding: utf-8
2
+ require 'wombat/property/locators/factory'
3
+ require 'wombat/processing/node_selector'
4
+ require 'mechanize'
5
+ require 'restclient'
6
+
7
+ module Wombat
8
+ module Processing
9
+ module Parser
10
+ attr_accessor :mechanize, :context, :response_code, :page
11
+
12
+ def initialize
13
+ @mechanize = Mechanize.new
14
+ end
15
+
16
+ def parse(metadata)
17
+ @context = parser_for metadata
18
+
19
+ Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
20
+ end
21
+
22
+ private
23
+ def parser_for(metadata)
24
+ url = "#{metadata[:base_url]}#{metadata[:path]}"
25
+ page = nil
26
+ parser = nil
27
+ begin
28
+ if metadata[:document_format] == :html
29
+ @page = @mechanize.get(url)
30
+ parser = @page.parser
31
+ else
32
+ @page = RestClient.get(url)
33
+ parser = Nokogiri::XML @page
34
+ end
35
+ @response_code = @page.code.to_i if @page.respond_to? :code
36
+ parser
37
+ rescue
38
+ if $!.respond_to? :http_code
39
+ @response_code = $!.http_code.to_i
40
+ elsif $!.respond_to? :response_code
41
+ @response_code = $!.response_code.to_i
42
+ end
43
+ raise $!
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,33 @@
1
+ #coding: utf-8
2
+ require 'wombat/processing/node_selector'
3
+
4
+ module Wombat
5
+ module Property
6
+ module Locators
7
+ # Abstract base class
8
+ class Base
9
+ include Wombat::Processing::NodeSelector
10
+
11
+ def initialize(property)
12
+ @property = property
13
+ end
14
+
15
+ def locate(context, page = nil)
16
+ @context = context
17
+
18
+ raw_data = yield if block_given?
19
+ data = @property.respond_to?(:callback) && @property.callback ? @property.callback.call(raw_data) : raw_data
20
+
21
+ @property.wombat_property_name ? { @property.wombat_property_name => data } : data
22
+ end
23
+
24
+ protected
25
+ def locate_nodes(context)
26
+ @context = context
27
+
28
+ select_nodes @property.wombat_property_selector, @property.wombat_property_namespaces
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,39 @@
1
+ #coding: utf-8
2
+ require 'wombat/property/locators/base'
3
+ require 'wombat/property/locators/follow'
4
+ require 'wombat/property/locators/html'
5
+ require 'wombat/property/locators/iterator'
6
+ require 'wombat/property/locators/property_group'
7
+ require 'wombat/property/locators/list'
8
+ require 'wombat/property/locators/text'
9
+
10
+ class Wombat::Property::Locators::UnknownTypeException < Exception; end;
11
+
12
+ module Wombat
13
+ module Property
14
+ module Locators
15
+ module Factory
16
+ def self.locator_for(property)
17
+ klass = case(property.wombat_property_format)
18
+ when :text
19
+ Text
20
+ when :list
21
+ List
22
+ when :html
23
+ Html
24
+ when :iterator
25
+ Iterator
26
+ when :container
27
+ PropertyGroup
28
+ when :follow
29
+ Follow
30
+ else
31
+ raise Wombat::Property::Locators::UnknownTypeException.new("Unknown property format #{property.format}.")
32
+ end
33
+
34
+ klass.new(property)
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,25 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ module Property
5
+ module Locators
6
+ class Follow < Base
7
+ def locate(context, page = nil)
8
+ super do
9
+ locate_nodes(context).flat_map do |node|
10
+ target_page = page.click node
11
+ context = target_page.parser
12
+
13
+ Hash.new.tap do |h|
14
+ @property.values
15
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyGroup) }
16
+ .map { |p| Factory.locator_for(p).locate(context, page) }
17
+ .map { |p| h.merge! p }
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,14 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ module Property
5
+ module Locators
6
+ class Html < Base
7
+ def locate(context, page = nil)
8
+ node = locate_nodes(context).first
9
+ super { node.inner_html.strip }
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,23 @@
1
+ #coding: utf-8
2
+ require 'wombat/property/locators/property_group'
3
+
4
+ module Wombat
5
+ module Property
6
+ module Locators
7
+ class Iterator < Base
8
+ def locate(contex, page = nil)
9
+ super do
10
+ locate_nodes(contex).flat_map do |node|
11
+ Hash.new.tap do |h|
12
+ @property.values
13
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyGroup) }
14
+ .map { |p| Factory.locator_for(p).locate(node, page) }
15
+ .map { |p| h.merge! p }
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,17 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ module Property
5
+ module Locators
6
+ class List < Base
7
+ def locate(context, page = nil)
8
+ super do
9
+ locate_nodes(context).map do |n|
10
+ n.is_a?(String) ? n.strip : n.inner_text.strip
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,20 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ module Property
5
+ module Locators
6
+ class PropertyGroup < Base
7
+ def locate(context, page = nil)
8
+ super do
9
+ Hash.new.tap do |h|
10
+ @property.values
11
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyGroup) }
12
+ .map { |p| Factory.locator_for(p).locate(context, page) }
13
+ .map { |p| h.merge! p }
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,22 @@
1
+ #coding: utf-8
2
+
3
+ module Wombat
4
+ module Property
5
+ module Locators
6
+ class Text < Base
7
+ def locate(context, page = nil)
8
+ node = locate_nodes(context).first
9
+
10
+ value =
11
+ unless node
12
+ nil
13
+ else
14
+ node.is_a?(String) ? node.strip : node.inner_text.strip
15
+ end
16
+
17
+ super { value }
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
data/lib/wombat.rb CHANGED
@@ -3,9 +3,13 @@
3
3
  require 'wombat/crawler'
4
4
 
5
5
  module Wombat
6
- def self.crawl(&block)
7
- klass = Class.new
8
- klass.send(:include, Wombat::Crawler)
9
- klass.new.crawl(&block)
6
+ class << self
7
+ def crawl(&block)
8
+ klass = Class.new
9
+ klass.send(:include, Wombat::Crawler)
10
+ klass.new.crawl(&block)
11
+ end
12
+
13
+ alias_method :scrape, :crawl
10
14
  end
11
15
  end
data/spec/crawler_spec.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Wombat::Crawler do
4
- describe '#crawl' do
5
- before(:each) do
6
- @crawler = Class.new
7
- @crawler.send(:include, Wombat::Crawler)
8
- @crawler_instance = @crawler.new
9
- end
4
+ before(:each) do
5
+ @crawler = Class.new
6
+ @crawler.send(:include, Wombat::Crawler)
7
+ @crawler_instance = @crawler.new
8
+ end
10
9
 
10
+ describe '#crawl' do
11
11
  it 'should call the provided block' do
12
12
  event_called = false
13
13
 
@@ -17,8 +17,8 @@ describe Wombat::Crawler do
17
17
  end
18
18
 
19
19
  it 'should provide metadata to yielded block' do
20
- @crawler.event do |e|
21
- e.should_not be_nil
20
+ @crawler.event do
21
+ self.class.should == Wombat::DSL::PropertyGroup
22
22
  end
23
23
  end
24
24
 
@@ -30,7 +30,10 @@ describe Wombat::Crawler do
30
30
  e.time Time.now
31
31
  end
32
32
 
33
- @crawler.venue { |v| v.name "Scooba" }
33
+ @crawler.venue do |v|
34
+ v.name "Scooba"
35
+ end
36
+
34
37
  @crawler.location { |v| v.latitude -50.2323 }
35
38
 
36
39
  @crawler_instance.should_receive(:parse) do |arg|
@@ -62,7 +65,7 @@ describe Wombat::Crawler do
62
65
 
63
66
  @crawler_instance.should_receive(:parse) do |arg|
64
67
  prop = arg['some_data']
65
- prop.name.should == "some_data"
68
+ prop.wombat_property_name.should == "some_data"
66
69
  prop.selector.should == "/event/list"
67
70
  prop.format.should == :html
68
71
  prop.namespaces.should == "geo"
@@ -73,12 +76,12 @@ describe Wombat::Crawler do
73
76
  end
74
77
 
75
78
  it 'should be able to specify arbitrary block structure more than once' do
76
- @crawler.structure do |s|
77
- s.data "xpath=/xyz"
79
+ @crawler.structure do
80
+ data "xpath=/xyz"
78
81
  end
79
82
 
80
- @crawler.structure do |s|
81
- s.another "css=.information"
83
+ @crawler.structure do
84
+ another "css=.information"
82
85
  end
83
86
 
84
87
  @crawler_instance.should_receive(:parse) do |arg|
@@ -93,26 +96,6 @@ describe Wombat::Crawler do
93
96
  @crawler.event
94
97
  end
95
98
 
96
- it 'should iterate on elements inside for_each block' do
97
- @crawler.for_each "css=.element" do
98
- title "css=.title"
99
- body "css=.body"
100
- event do |e|
101
- e.all "yeah"
102
- end
103
- end
104
-
105
- @crawler_instance.should_receive(:parse) do |arg|
106
- it = arg.iterators.first
107
- it.selector.should == "css=.element"
108
- it["title"].selector.should == "css=.title"
109
- it["body"].selector.should == "css=.body"
110
- it["event"]["all"].selector.should == "yeah"
111
- end
112
-
113
- @crawler_instance.crawl
114
- end
115
-
116
99
  it 'should assign metadata format' do
117
100
  @crawler_instance.should_receive(:parse) do |arg|
118
101
  arg[:document_format].should == :xml
@@ -123,22 +106,22 @@ describe Wombat::Crawler do
123
106
 
124
107
  it 'should crawl with block' do
125
108
  @crawler.base_url "danielnc.com"
126
- @crawler.list_page "/itens"
109
+ @crawler.path "/itens"
127
110
 
128
111
  @crawler_instance.should_receive(:parse) do |arg|
129
112
  arg[:base_url].should == "danielnc.com"
130
- arg[:list_page].should == "/itens/1"
113
+ arg[:path].should == "/itens/1"
131
114
  end
132
115
 
133
116
  @crawler_instance.crawl do
134
- list_page "/itens/1"
117
+ path "/itens/1"
135
118
  end
136
119
 
137
120
  another_instance = @crawler.new
138
121
 
139
122
  another_instance.should_receive(:parse) do |arg|
140
123
  arg[:base_url].should == "danielnc.com"
141
- arg[:list_page].should == "/itens"
124
+ arg[:path].should == "/itens"
142
125
  end
143
126
 
144
127
  another_instance.crawl
@@ -146,15 +129,15 @@ describe Wombat::Crawler do
146
129
 
147
130
  it 'should remove created method missing' do
148
131
  @crawler.base_url "danielnc.com"
149
- @crawler.list_page "/itens"
132
+ @crawler.path "/itens"
150
133
 
151
134
  @crawler_instance.should_receive(:parse) do |arg|
152
135
  arg[:base_url].should == "danielnc.com"
153
- arg[:list_page].should == "/itens/1"
136
+ arg[:path].should == "/itens/1"
154
137
  end
155
138
 
156
139
  @crawler_instance.crawl do
157
- list_page "/itens/1"
140
+ path "/itens/1"
158
141
  end
159
142
 
160
143
  lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
@@ -162,15 +145,15 @@ describe Wombat::Crawler do
162
145
 
163
146
  it 'should remove created instance variable' do
164
147
  @crawler.base_url "danielnc.com"
165
- @crawler.list_page "/itens"
148
+ @crawler.path "/itens"
166
149
 
167
150
  @crawler_instance.should_receive(:parse) do |arg|
168
151
  arg[:base_url].should == "danielnc.com"
169
- arg[:list_page].should == "/itens/1"
152
+ arg[:path].should == "/itens/1"
170
153
  end
171
154
 
172
155
  @crawler_instance.crawl do
173
- list_page "/itens/1"
156
+ path "/itens/1"
174
157
  end
175
158
 
176
159
  @crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
@@ -181,7 +164,7 @@ describe Wombat::Crawler do
181
164
  VCR.use_cassette('basic_crawler_page') do
182
165
 
183
166
  @crawler.base_url "http://www.terra.com.br"
184
- @crawler.list_page '/portal'
167
+ @crawler.path '/portal'
185
168
 
186
169
  @crawler.search "css=.btn-search"
187
170
 
@@ -194,7 +177,7 @@ describe Wombat::Crawler do
194
177
  VCR.use_cassette('basic_crawler_page') do
195
178
 
196
179
  @crawler.base_url "http://www.terra.com.br"
197
- @crawler.list_page '/portal'
180
+ @crawler.path '/portal'
198
181
 
199
182
  @crawler.search "css=.btn-search"
200
183
  @crawler.document_format :xml
@@ -208,7 +191,7 @@ describe Wombat::Crawler do
208
191
  VCR.use_cassette('error_page') do
209
192
 
210
193
  @crawler.base_url "http://www.terra.com.br"
211
- @crawler.list_page '/portal'
194
+ @crawler.path '/portal'
212
195
 
213
196
  @crawler.search "css=.btn-search"
214
197
 
@@ -221,7 +204,7 @@ describe Wombat::Crawler do
221
204
  VCR.use_cassette('error_page') do
222
205
 
223
206
  @crawler.base_url "http://www.terra.com.br"
224
- @crawler.list_page '/portal'
207
+ @crawler.path '/portal'
225
208
 
226
209
  @crawler.search "css=.btn-search"
227
210
  @crawler.document_format :xml
@@ -231,4 +214,11 @@ describe Wombat::Crawler do
231
214
  end
232
215
  end
233
216
  end
217
+
218
+ describe '#scrape' do
219
+ it 'should alias to crawl' do
220
+ @crawler_instance.should_receive :parse
221
+ @crawler_instance.scrape
222
+ end
223
+ end
234
224
  end