wombat 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/README.md +13 -30
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
  5. data/lib/wombat/crawler.rb +7 -17
  6. data/lib/wombat/dsl/follower.rb +19 -0
  7. data/lib/wombat/dsl/iterator.rb +19 -0
  8. data/lib/wombat/dsl/metadata.rb +27 -0
  9. data/lib/wombat/dsl/property.rb +27 -0
  10. data/lib/wombat/dsl/property_group.rb +48 -0
  11. data/lib/wombat/processing/node_selector.rb +12 -0
  12. data/lib/wombat/processing/parser.rb +48 -0
  13. data/lib/wombat/property/locators/base.rb +33 -0
  14. data/lib/wombat/property/locators/factory.rb +39 -0
  15. data/lib/wombat/property/locators/follow.rb +25 -0
  16. data/lib/wombat/property/locators/html.rb +14 -0
  17. data/lib/wombat/property/locators/iterator.rb +23 -0
  18. data/lib/wombat/property/locators/list.rb +17 -0
  19. data/lib/wombat/property/locators/property_group.rb +20 -0
  20. data/lib/wombat/property/locators/text.rb +22 -0
  21. data/lib/wombat.rb +8 -4
  22. data/spec/crawler_spec.rb +38 -48
  23. data/spec/dsl/property_spec.rb +12 -0
  24. data/spec/helpers/sample_crawler.rb +2 -15
  25. data/spec/integration/integration_spec.rb +61 -33
  26. data/spec/processing/parser_spec.rb +32 -0
  27. data/spec/property/locators/factory_spec.rb +18 -0
  28. data/spec/property/locators/follow_spec.rb +4 -0
  29. data/spec/property/locators/html_spec.rb +15 -0
  30. data/spec/property/locators/iterator_spec.rb +4 -0
  31. data/spec/property/locators/list_spec.rb +13 -0
  32. data/spec/property/locators/text_spec.rb +49 -0
  33. data/spec/sample_crawler_spec.rb +7 -11
  34. data/spec/wombat_spec.rb +13 -1
  35. data/wombat.gemspec +27 -16
  36. metadata +27 -16
  37. data/lib/wombat/iterator.rb +0 -38
  38. data/lib/wombat/metadata.rb +0 -24
  39. data/lib/wombat/node_selector.rb +0 -10
  40. data/lib/wombat/parser.rb +0 -59
  41. data/lib/wombat/property.rb +0 -21
  42. data/lib/wombat/property_container.rb +0 -70
  43. data/lib/wombat/property_locator.rb +0 -20
  44. data/spec/iterator_spec.rb +0 -52
  45. data/spec/metadata_spec.rb +0 -20
  46. data/spec/parser_spec.rb +0 -125
  47. data/spec/property_container_spec.rb +0 -62
  48. data/spec/property_locator_spec.rb +0 -75
  49. data/spec/property_spec.rb +0 -16
@@ -1,13 +1,13 @@
1
1
  #coding: utf-8
2
- require 'wombat/metadata'
3
- require 'wombat/property'
4
- require 'wombat/parser'
2
+ require 'wombat/dsl/metadata'
3
+ require 'wombat/dsl/property'
4
+ require 'wombat/processing/parser'
5
5
  require 'active_support'
6
6
  require 'date'
7
7
 
8
8
  module Wombat
9
9
  module Crawler
10
- include Parser
10
+ include Processing::Parser
11
11
  extend ActiveSupport::Concern
12
12
 
13
13
  def crawl(&block)
@@ -31,33 +31,23 @@ module Wombat
31
31
  end
32
32
  end
33
33
 
34
+ alias_method :scrape, :crawl
35
+
34
36
  def method_missing(method, *args, &block)
35
37
  self.class.send method, *args, &block
36
38
  end
37
39
 
38
- def for_each(selector, &block)
39
- self.class.for_each selector, &block
40
- end
41
-
42
40
  module ClassMethods
43
41
  def method_missing(method, *args, &block)
44
42
  metadata.send method, *args, &block
45
43
  end
46
44
 
47
- def for_each(selector, &block)
48
- metadata.for_each(selector).instance_eval(&block) if block
49
- end
50
-
51
- def follow_links(selector)
52
-
53
- end
54
-
55
45
  def to_ary
56
46
  end
57
47
 
58
48
  private
59
49
  def metadata
60
- @metadata ||= Metadata.new
50
+ @metadata ||= DSL::Metadata.new
61
51
  end
62
52
  end
63
53
  end
@@ -0,0 +1,19 @@
module Wombat
  module DSL
    # Property group created by PropertyGroup#method_missing when a property
    # is declared with the :follow format. Besides the nested properties it
    # collects, it remembers the selector it was declared with.
    class Follower < PropertyGroup
      attr_accessor :wombat_property_selector

      # name     - handed on to PropertyGroup#initialize.
      # selector - kept verbatim in wombat_property_selector.
      def initialize(name, selector)
        super(name)

        @wombat_property_selector = selector
      end

      # Always :follow, the format Property::Locators::Factory maps to the
      # Follow locator.
      def wombat_property_format
        :follow
      end
    end
  end
end
@@ -0,0 +1,19 @@
module Wombat
  module DSL
    # Property group created by PropertyGroup#method_missing when a property
    # is declared with the :iterator format. Besides the nested properties it
    # collects, it remembers the selector it was declared with.
    class Iterator < PropertyGroup
      attr_accessor :wombat_property_selector

      # name     - handed on to PropertyGroup#initialize.
      # selector - kept verbatim in wombat_property_selector.
      def initialize(name, selector)
        super(name)

        @wombat_property_selector = selector
      end

      # Always :iterator, the format Property::Locators::Factory maps to the
      # Iterator locator.
      def wombat_property_format
        :iterator
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ #coding: utf-8
2
+ require 'wombat/dsl/property_group'
3
+ require 'wombat/dsl/iterator'
4
+ require 'wombat/dsl/follower'
5
+
module Wombat
  module DSL
    # Top-level property group. In addition to declared properties it keeps
    # the crawl settings under the symbol keys :base_url, :path and
    # :document_format.
    class Metadata < PropertyGroup
      # Starts out with :document_format set to :html and no group name.
      def initialize
        store(:document_format, :html)
        super
      end

      # Records the base URL; returns the value stored.
      def base_url(url)
        store(:base_url, url)
      end

      # Records the path; returns the value stored.
      def path(url)
        store(:path, url)
      end

      # Overrides the document format; returns the value stored.
      def document_format(format)
        store(:document_format, format)
      end
    end
  end
end
@@ -0,0 +1,27 @@
module Wombat
  module DSL
    # A single declared property: its name, selector, format, namespaces and
    # an optional callback block.
    class Property
      attr_accessor :wombat_property_name,
                    :wombat_property_selector,
                    :wombat_property_format,
                    :wombat_property_namespaces,
                    :callback

      # name  - property name.
      # args  - positional [selector, format, namespaces]; format falls back
      #         to :text when missing or nil.
      # block - stored as the callback (nil when no block is given).
      def initialize(name, *args, &block)
        chosen_selector, chosen_format, chosen_namespaces = args

        @wombat_property_name = name
        @wombat_property_selector = chosen_selector
        @wombat_property_format = chosen_format || :text
        @wombat_property_namespaces = chosen_namespaces
        @callback = block
      end

      # Short reader names for the wombat_property_* attributes.
      alias_method :selector, :wombat_property_selector
      alias_method :namespaces, :wombat_property_namespaces
      alias_method :format, :wombat_property_format
    end
  end
end
@@ -0,0 +1,48 @@
1
+ #coding: utf-8
2
+
module Wombat
  module DSL
    # Hash of declared properties keyed by property name (as a String).
    # Unknown method calls are treated as property declarations.
    #
    # NOTE(review): no respond_to_missing? is defined. Locators::Base checks
    # respond_to?(:callback) before calling callback; answering true there
    # would send `callback` through method_missing and register it as a
    # property.
    class PropertyGroup < Hash
      attr_accessor :wombat_property_name

      def initialize(name = nil)
        @wombat_property_name = name
      end

      # Declares a property named after the missing method:
      #
      # * block and no arguments - opens (or reopens) a nested PropertyGroup
      #   and evaluates the block inside it;
      # * second argument :iterator / :follow - stores an Iterator / Follower
      #   built from the first argument and evaluates the block inside it,
      #   if one was given;
      # * anything else - stores a Property built from the arguments and block.
      def method_missing(method, *args, &block)
        key = method.to_s

        if args.empty? && block
          # TODO: Verify if another property with same name already exists
          # before overwriting
          group = (self[key] ||= PropertyGroup.new(key))
          return group.instance_eval(&block)
        end

        kind = args[1]
        case kind
        when :iterator, :follow
          nested = (kind == :iterator ? Iterator : Follower).new(key, args.first)
          self[key] = nested
          nested.instance_eval(&block) if block
        else
          self[key] = Property.new(key, *args, &block)
        end
      end

      # Defined explicitly (returning nil) so the call never reaches
      # method_missing.
      def to_ary
      end

      # Always :container, the format Property::Locators::Factory maps to the
      # PropertyGroup locator.
      def wombat_property_format
        :container
      end

      # Groups carry no namespaces.
      def wombat_property_namespaces
        nil
      end
    end
  end
end
@@ -0,0 +1,12 @@
module Wombat
  module Processing
    # Mixin that resolves a selector against @context, which the including
    # object must set beforehand.
    module NodeSelector
      # selector   - a Symbol (returned as a one-element array holding its
      #              string form), a String starting with "xpath=" or "css="
      #              (prefix stripped, remainder passed to @context.xpath /
      #              @context.css), or any other String (returned as-is in a
      #              one-element array).
      # namespaces - forwarded to @context.xpath only.
      def select_nodes(selector, namespaces = nil)
        if selector.is_a?(Symbol)
          [selector.to_s]
        elsif selector.start_with?("xpath=")
          @context.xpath(selector[6..-1], namespaces)
        elsif selector.start_with?("css=")
          @context.css(selector[4..-1])
        else
          [selector]
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ #coding: utf-8
2
+ require 'wombat/property/locators/factory'
3
+ require 'wombat/processing/node_selector'
4
+ require 'mechanize'
5
+ require 'restclient'
6
+
module Wombat
  module Processing
    # Mixin that fetches the document described by a metadata object and runs
    # the property locators over it.
    module Parser
      attr_accessor :mechanize, :context, :response_code, :page

      def initialize
        @mechanize = Mechanize.new
      end

      # Fetches the document for +metadata+, keeps its parsed form in
      # @context, and returns whatever the locator built for +metadata+
      # produces. @mechanize is handed to the locator as its second argument.
      def parse(metadata)
        @context = parser_for metadata

        Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
      end

      private

      # Requests "#{base_url}#{path}" and returns the parsed document.
      #
      # For the :html document format the page is fetched through Mechanize
      # and its own parser is returned; any other format is fetched with
      # RestClient and parsed with Nokogiri::XML. The fetched object is kept
      # in @page and, when it responds to +code+, @response_code is set from it.
      #
      # On failure @response_code is taken from the error's +http_code+ or
      # +response_code+ (when it has either) and the error is re-raised.
      def parser_for(metadata)
        url = "#{metadata[:base_url]}#{metadata[:path]}"

        begin
          parser =
            if metadata[:document_format] == :html
              @page = @mechanize.get(url)
              @page.parser
            else
              @page = RestClient.get(url)
              Nokogiri::XML @page
            end

          @response_code = @page.code.to_i if @page.respond_to? :code
          parser
        rescue => error
          if error.respond_to? :http_code
            @response_code = error.http_code.to_i
          elsif error.respond_to? :response_code
            @response_code = error.response_code.to_i
          end

          # Bare raise re-raises the exception currently being handled.
          raise
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ #coding: utf-8
2
+ require 'wombat/processing/node_selector'
3
+
module Wombat
  module Property
    module Locators
      # Abstract base class for locators. Subclasses call +locate+ with a
      # block that produces the raw value for their property.
      class Base
        include Wombat::Processing::NodeSelector

        def initialize(property)
          @property = property
        end

        # Remembers +context+, obtains the raw value from the block (nil when
        # no block is given), and runs it through the property's callback if
        # the property exposes one and it is set. The result is wrapped as
        # { name => value } when the property has a name, otherwise returned
        # bare.
        def locate(context, page = nil)
          @context = context

          raw = block_given? ? yield : nil
          # respond_to? is checked first: not every property object defines
          # callback.
          hook = @property.respond_to?(:callback) ? @property.callback : nil
          value = hook ? hook.call(raw) : raw

          label = @property.wombat_property_name
          label ? { label => value } : value
        end

        protected

        # Resolves the property's selector (and namespaces) against +context+.
        def locate_nodes(context)
          @context = context

          select_nodes(@property.wombat_property_selector, @property.wombat_property_namespaces)
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #coding: utf-8
2
+ require 'wombat/property/locators/base'
3
+ require 'wombat/property/locators/follow'
4
+ require 'wombat/property/locators/html'
5
+ require 'wombat/property/locators/iterator'
6
+ require 'wombat/property/locators/property_group'
7
+ require 'wombat/property/locators/list'
8
+ require 'wombat/property/locators/text'
9
+
module Wombat
  module Property
    module Locators
      # Raised by Factory.locator_for when a property reports a format that
      # has no locator. Inherits from StandardError so that a plain `rescue`
      # handles it.
      class UnknownTypeException < StandardError; end

      # Maps a property's wombat_property_format to the locator class that
      # knows how to extract it.
      module Factory
        # property - any object responding to wombat_property_format.
        #
        # Returns a new locator instance wrapping +property+.
        # Raises UnknownTypeException for an unrecognised format.
        def self.locator_for(property)
          format = property.wombat_property_format

          klass =
            case format
            when :text      then Text
            when :list      then List
            when :html      then Html
            when :iterator  then Iterator
            when :container then PropertyGroup
            when :follow    then Follow
            else
              # Report the value that was dispatched on. Only DSL::Property
              # defines `format`; on a property group that call would fall
              # into method_missing.
              raise UnknownTypeException, "Unknown property format #{format}."
            end

          klass.new(property)
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ #coding: utf-8
2
+
module Wombat
  module Property
    module Locators
      # Locator for :follow groups. For every node matched by the group's
      # selector it clicks the node through +page+ and extracts the nested
      # properties from the document that comes back.
      class Follow < Base
        # context - document in which the nodes to follow are looked up.
        # page    - object responding to click(node); the clicked result must
        #           respond to parser. The same +page+ is passed on to the
        #           nested locators.
        #
        # Yields to Base#locate one Hash per followed node, holding the
        # merged results of the nested properties.
        def locate(context, page = nil)
          super do
            locate_nodes(context).flat_map do |link|
              followed = page.click(link).parser

              nested = @property.values.select do |v|
                [Wombat::DSL::Property, Wombat::DSL::PropertyGroup].any? { |k| v.is_a?(k) }
              end

              nested.each_with_object({}) do |child, merged|
                merged.merge!(Factory.locator_for(child).locate(followed, page))
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ #coding: utf-8
2
+
module Wombat
  module Property
    module Locators
      # Locator for :html properties: the stripped inner HTML of the first
      # node matched by the property's selector.
      class Html < Base
        # Returns (through Base#locate) nil when nothing matches, the stripped
        # string itself when the selector resolved to a literal String, and
        # the stripped inner HTML of the node otherwise. The first two cases
        # mirror the Text locator; previously both raised NoMethodError.
        def locate(context, page = nil)
          node = locate_nodes(context).first

          value =
            if node.nil?
              nil
            elsif node.is_a?(String)
              node.strip
            else
              node.inner_html.strip
            end

          super { value }
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ #coding: utf-8
2
+ require 'wombat/property/locators/property_group'
3
+
module Wombat
  module Property
    module Locators
      # Locator for :iterator groups. Each node matched by the group's
      # selector becomes the context in which the nested properties are
      # located.
      class Iterator < Base
        # Yields to Base#locate one Hash per matched node, holding the merged
        # results of the nested properties. +page+ is passed through to the
        # nested locators unchanged.
        def locate(contex, page = nil)
          super do
            locate_nodes(contex).flat_map do |node|
              nested = @property.values.select do |v|
                [Wombat::DSL::Property, Wombat::DSL::PropertyGroup].any? { |k| v.is_a?(k) }
              end

              nested.each_with_object({}) do |child, row|
                row.merge!(Factory.locator_for(child).locate(node, page))
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ #coding: utf-8
2
+
module Wombat
  module Property
    module Locators
      # Locator for :list properties: the stripped text of every node matched
      # by the property's selector.
      class List < Base
        # String entries are stripped directly; anything else contributes its
        # stripped inner_text.
        def locate(context, page = nil)
          super do
            locate_nodes(context).map do |entry|
              text = entry.is_a?(String) ? entry : entry.inner_text
              text.strip
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ #coding: utf-8
2
+
module Wombat
  module Property
    module Locators
      # Locator for :container groups: locates every nested property or group
      # in the same context and merges the results into one Hash.
      class PropertyGroup < Base
        # Values of the group that are neither a DSL::Property nor a
        # DSL::PropertyGroup are skipped. +context+ and +page+ are passed to
        # the nested locators unchanged.
        def locate(context, page = nil)
          super do
            nested = @property.values.select do |v|
              [Wombat::DSL::Property, Wombat::DSL::PropertyGroup].any? { |k| v.is_a?(k) }
            end

            nested.each_with_object({}) do |child, merged|
              merged.merge!(Factory.locator_for(child).locate(context, page))
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ #coding: utf-8
2
+
module Wombat
  module Property
    module Locators
      # Locator for :text properties: the stripped text of the first node
      # matched by the property's selector.
      class Text < Base
        # Returns (through Base#locate) nil when nothing matches, the stripped
        # string when the first match is a String, and the stripped
        # inner_text of the node otherwise.
        def locate(context, page = nil)
          found = locate_nodes(context).first

          text =
            case found
            when nil, false then nil
            when String     then found.strip
            else found.inner_text.strip
            end

          super { text }
        end
      end
    end
  end
end
data/lib/wombat.rb CHANGED
@@ -3,9 +3,13 @@
3
3
  require 'wombat/crawler'
4
4
 
5
5
  module Wombat
6
- def self.crawl(&block)
7
- klass = Class.new
8
- klass.send(:include, Wombat::Crawler)
9
- klass.new.crawl(&block)
6
+ class << self
7
+ def crawl(&block)
8
+ klass = Class.new
9
+ klass.send(:include, Wombat::Crawler)
10
+ klass.new.crawl(&block)
11
+ end
12
+
13
+ alias_method :scrape, :crawl
10
14
  end
11
15
  end
data/spec/crawler_spec.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Wombat::Crawler do
4
- describe '#crawl' do
5
- before(:each) do
6
- @crawler = Class.new
7
- @crawler.send(:include, Wombat::Crawler)
8
- @crawler_instance = @crawler.new
9
- end
4
+ before(:each) do
5
+ @crawler = Class.new
6
+ @crawler.send(:include, Wombat::Crawler)
7
+ @crawler_instance = @crawler.new
8
+ end
10
9
 
10
+ describe '#crawl' do
11
11
  it 'should call the provided block' do
12
12
  event_called = false
13
13
 
@@ -17,8 +17,8 @@ describe Wombat::Crawler do
17
17
  end
18
18
 
19
19
  it 'should provide metadata to yielded block' do
20
- @crawler.event do |e|
21
- e.should_not be_nil
20
+ @crawler.event do
21
+ self.class.should == Wombat::DSL::PropertyGroup
22
22
  end
23
23
  end
24
24
 
@@ -30,7 +30,10 @@ describe Wombat::Crawler do
30
30
  e.time Time.now
31
31
  end
32
32
 
33
- @crawler.venue { |v| v.name "Scooba" }
33
+ @crawler.venue do |v|
34
+ v.name "Scooba"
35
+ end
36
+
34
37
  @crawler.location { |v| v.latitude -50.2323 }
35
38
 
36
39
  @crawler_instance.should_receive(:parse) do |arg|
@@ -62,7 +65,7 @@ describe Wombat::Crawler do
62
65
 
63
66
  @crawler_instance.should_receive(:parse) do |arg|
64
67
  prop = arg['some_data']
65
- prop.name.should == "some_data"
68
+ prop.wombat_property_name.should == "some_data"
66
69
  prop.selector.should == "/event/list"
67
70
  prop.format.should == :html
68
71
  prop.namespaces.should == "geo"
@@ -73,12 +76,12 @@ describe Wombat::Crawler do
73
76
  end
74
77
 
75
78
  it 'should be able to specify arbitrary block structure more than once' do
76
- @crawler.structure do |s|
77
- s.data "xpath=/xyz"
79
+ @crawler.structure do
80
+ data "xpath=/xyz"
78
81
  end
79
82
 
80
- @crawler.structure do |s|
81
- s.another "css=.information"
83
+ @crawler.structure do
84
+ another "css=.information"
82
85
  end
83
86
 
84
87
  @crawler_instance.should_receive(:parse) do |arg|
@@ -93,26 +96,6 @@ describe Wombat::Crawler do
93
96
  @crawler.event
94
97
  end
95
98
 
96
- it 'should iterate on elements inside for_each block' do
97
- @crawler.for_each "css=.element" do
98
- title "css=.title"
99
- body "css=.body"
100
- event do |e|
101
- e.all "yeah"
102
- end
103
- end
104
-
105
- @crawler_instance.should_receive(:parse) do |arg|
106
- it = arg.iterators.first
107
- it.selector.should == "css=.element"
108
- it["title"].selector.should == "css=.title"
109
- it["body"].selector.should == "css=.body"
110
- it["event"]["all"].selector.should == "yeah"
111
- end
112
-
113
- @crawler_instance.crawl
114
- end
115
-
116
99
  it 'should assign metadata format' do
117
100
  @crawler_instance.should_receive(:parse) do |arg|
118
101
  arg[:document_format].should == :xml
@@ -123,22 +106,22 @@ describe Wombat::Crawler do
123
106
 
124
107
  it 'should crawl with block' do
125
108
  @crawler.base_url "danielnc.com"
126
- @crawler.list_page "/itens"
109
+ @crawler.path "/itens"
127
110
 
128
111
  @crawler_instance.should_receive(:parse) do |arg|
129
112
  arg[:base_url].should == "danielnc.com"
130
- arg[:list_page].should == "/itens/1"
113
+ arg[:path].should == "/itens/1"
131
114
  end
132
115
 
133
116
  @crawler_instance.crawl do
134
- list_page "/itens/1"
117
+ path "/itens/1"
135
118
  end
136
119
 
137
120
  another_instance = @crawler.new
138
121
 
139
122
  another_instance.should_receive(:parse) do |arg|
140
123
  arg[:base_url].should == "danielnc.com"
141
- arg[:list_page].should == "/itens"
124
+ arg[:path].should == "/itens"
142
125
  end
143
126
 
144
127
  another_instance.crawl
@@ -146,15 +129,15 @@ describe Wombat::Crawler do
146
129
 
147
130
  it 'should remove created method missing' do
148
131
  @crawler.base_url "danielnc.com"
149
- @crawler.list_page "/itens"
132
+ @crawler.path "/itens"
150
133
 
151
134
  @crawler_instance.should_receive(:parse) do |arg|
152
135
  arg[:base_url].should == "danielnc.com"
153
- arg[:list_page].should == "/itens/1"
136
+ arg[:path].should == "/itens/1"
154
137
  end
155
138
 
156
139
  @crawler_instance.crawl do
157
- list_page "/itens/1"
140
+ path "/itens/1"
158
141
  end
159
142
 
160
143
  lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
@@ -162,15 +145,15 @@ describe Wombat::Crawler do
162
145
 
163
146
  it 'should remove created instance variable' do
164
147
  @crawler.base_url "danielnc.com"
165
- @crawler.list_page "/itens"
148
+ @crawler.path "/itens"
166
149
 
167
150
  @crawler_instance.should_receive(:parse) do |arg|
168
151
  arg[:base_url].should == "danielnc.com"
169
- arg[:list_page].should == "/itens/1"
152
+ arg[:path].should == "/itens/1"
170
153
  end
171
154
 
172
155
  @crawler_instance.crawl do
173
- list_page "/itens/1"
156
+ path "/itens/1"
174
157
  end
175
158
 
176
159
  @crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
@@ -181,7 +164,7 @@ describe Wombat::Crawler do
181
164
  VCR.use_cassette('basic_crawler_page') do
182
165
 
183
166
  @crawler.base_url "http://www.terra.com.br"
184
- @crawler.list_page '/portal'
167
+ @crawler.path '/portal'
185
168
 
186
169
  @crawler.search "css=.btn-search"
187
170
 
@@ -194,7 +177,7 @@ describe Wombat::Crawler do
194
177
  VCR.use_cassette('basic_crawler_page') do
195
178
 
196
179
  @crawler.base_url "http://www.terra.com.br"
197
- @crawler.list_page '/portal'
180
+ @crawler.path '/portal'
198
181
 
199
182
  @crawler.search "css=.btn-search"
200
183
  @crawler.document_format :xml
@@ -208,7 +191,7 @@ describe Wombat::Crawler do
208
191
  VCR.use_cassette('error_page') do
209
192
 
210
193
  @crawler.base_url "http://www.terra.com.br"
211
- @crawler.list_page '/portal'
194
+ @crawler.path '/portal'
212
195
 
213
196
  @crawler.search "css=.btn-search"
214
197
 
@@ -221,7 +204,7 @@ describe Wombat::Crawler do
221
204
  VCR.use_cassette('error_page') do
222
205
 
223
206
  @crawler.base_url "http://www.terra.com.br"
224
- @crawler.list_page '/portal'
207
+ @crawler.path '/portal'
225
208
 
226
209
  @crawler.search "css=.btn-search"
227
210
  @crawler.document_format :xml
@@ -231,4 +214,11 @@ describe Wombat::Crawler do
231
214
  end
232
215
  end
233
216
  end
217
+
218
+ describe '#scrape' do
219
+ it 'should alias to crawl' do
220
+ @crawler_instance.should_receive :parse
221
+ @crawler_instance.scrape
222
+ end
223
+ end
234
224
  end