wombat 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +13 -30
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
- data/lib/wombat/crawler.rb +7 -17
- data/lib/wombat/dsl/follower.rb +19 -0
- data/lib/wombat/dsl/iterator.rb +19 -0
- data/lib/wombat/dsl/metadata.rb +27 -0
- data/lib/wombat/dsl/property.rb +27 -0
- data/lib/wombat/dsl/property_group.rb +48 -0
- data/lib/wombat/processing/node_selector.rb +12 -0
- data/lib/wombat/processing/parser.rb +48 -0
- data/lib/wombat/property/locators/base.rb +33 -0
- data/lib/wombat/property/locators/factory.rb +39 -0
- data/lib/wombat/property/locators/follow.rb +25 -0
- data/lib/wombat/property/locators/html.rb +14 -0
- data/lib/wombat/property/locators/iterator.rb +23 -0
- data/lib/wombat/property/locators/list.rb +17 -0
- data/lib/wombat/property/locators/property_group.rb +20 -0
- data/lib/wombat/property/locators/text.rb +22 -0
- data/lib/wombat.rb +8 -4
- data/spec/crawler_spec.rb +38 -48
- data/spec/dsl/property_spec.rb +12 -0
- data/spec/helpers/sample_crawler.rb +2 -15
- data/spec/integration/integration_spec.rb +61 -33
- data/spec/processing/parser_spec.rb +32 -0
- data/spec/property/locators/factory_spec.rb +18 -0
- data/spec/property/locators/follow_spec.rb +4 -0
- data/spec/property/locators/html_spec.rb +15 -0
- data/spec/property/locators/iterator_spec.rb +4 -0
- data/spec/property/locators/list_spec.rb +13 -0
- data/spec/property/locators/text_spec.rb +49 -0
- data/spec/sample_crawler_spec.rb +7 -11
- data/spec/wombat_spec.rb +13 -1
- data/wombat.gemspec +27 -16
- metadata +27 -16
- data/lib/wombat/iterator.rb +0 -38
- data/lib/wombat/metadata.rb +0 -24
- data/lib/wombat/node_selector.rb +0 -10
- data/lib/wombat/parser.rb +0 -59
- data/lib/wombat/property.rb +0 -21
- data/lib/wombat/property_container.rb +0 -70
- data/lib/wombat/property_locator.rb +0 -20
- data/spec/iterator_spec.rb +0 -52
- data/spec/metadata_spec.rb +0 -20
- data/spec/parser_spec.rb +0 -125
- data/spec/property_container_spec.rb +0 -62
- data/spec/property_locator_spec.rb +0 -75
- data/spec/property_spec.rb +0 -16
data/lib/wombat/crawler.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require 'wombat/metadata'
|
3
|
-
require 'wombat/property'
|
4
|
-
require 'wombat/parser'
|
2
|
+
require 'wombat/dsl/metadata'
|
3
|
+
require 'wombat/dsl/property'
|
4
|
+
require 'wombat/processing/parser'
|
5
5
|
require 'active_support'
|
6
6
|
require 'date'
|
7
7
|
|
8
8
|
module Wombat
|
9
9
|
module Crawler
|
10
|
-
include Parser
|
10
|
+
include Processing::Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
13
|
def crawl(&block)
|
@@ -31,33 +31,23 @@ module Wombat
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
+
alias_method :scrape, :crawl
|
35
|
+
|
34
36
|
def method_missing(method, *args, &block)
|
35
37
|
self.class.send method, *args, &block
|
36
38
|
end
|
37
39
|
|
38
|
-
def for_each(selector, &block)
|
39
|
-
self.class.for_each selector, &block
|
40
|
-
end
|
41
|
-
|
42
40
|
module ClassMethods
|
43
41
|
def method_missing(method, *args, &block)
|
44
42
|
metadata.send method, *args, &block
|
45
43
|
end
|
46
44
|
|
47
|
-
def for_each(selector, &block)
|
48
|
-
metadata.for_each(selector).instance_eval(&block) if block
|
49
|
-
end
|
50
|
-
|
51
|
-
def follow_links(selector)
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
45
|
def to_ary
|
56
46
|
end
|
57
47
|
|
58
48
|
private
|
59
49
|
def metadata
|
60
|
-
@metadata ||= Metadata.new
|
50
|
+
@metadata ||= DSL::Metadata.new
|
61
51
|
end
|
62
52
|
end
|
63
53
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Wombat
  module DSL
    # Property group describing links that should be followed. It keeps the
    # selector for the link nodes and, being a PropertyGroup, the nested
    # properties declared inside its block.
    class Follower < PropertyGroup
      attr_accessor :wombat_property_selector

      # name     - name under which this group is registered.
      # selector - selector for the nodes to be followed.
      def initialize(name, selector)
        @wombat_property_selector = selector

        super(name)
      end

      # Reports the :follow format so that Property::Locators::Factory
      # picks Property::Locators::Follow for this group.
      def wombat_property_format
        :follow
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Wombat
  module DSL
    # Property group that is evaluated once per node matched by its selector.
    # It keeps the selector and, being a PropertyGroup, the nested properties
    # declared inside its block.
    class Iterator < PropertyGroup
      attr_accessor :wombat_property_selector

      # name     - name under which this group is registered.
      # selector - selector for the nodes to iterate over.
      def initialize(name, selector)
        @wombat_property_selector = selector

        super(name)
      end

      # Reports the :iterator format so that Property::Locators::Factory
      # picks Property::Locators::Iterator for this group.
      def wombat_property_format
        :iterator
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/dsl/property_group'
|
3
|
+
require 'wombat/dsl/iterator'
|
4
|
+
require 'wombat/dsl/follower'
|
5
|
+
|
6
|
+
module Wombat
  module DSL
    # Top-level property group of a crawler. Besides the declared properties
    # it stores the crawl settings under the symbol keys :base_url, :path and
    # :document_format.
    class Metadata < PropertyGroup
      # Builds an unnamed group whose document format defaults to :html.
      def initialize
        super()
        store(:document_format, :html)
      end

      # Records the base URL of the site to crawl.
      def base_url(url)
        store(:base_url, url)
      end

      # Records the path that is appended to the base URL.
      def path(url)
        store(:path, url)
      end

      # Overrides the document format (:html by default).
      def document_format(format)
        store(:document_format, format)
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Wombat
  module DSL
    # A single named property: what to select, how to format the result and
    # an optional callback applied to the extracted value.
    class Property
      attr_accessor :wombat_property_name, :wombat_property_selector, :wombat_property_format, :wombat_property_namespaces, :callback

      # name  - property name.
      # args  - up to three positional values, in order: selector, format
      #         (defaults to :text when missing or nil) and namespaces.
      # block - optional callback, stored as given.
      def initialize(name, *args, &block)
        selector, fmt, namespaces = args

        @wombat_property_name = name
        @wombat_property_selector = selector
        @wombat_property_format = fmt || :text
        @wombat_property_namespaces = namespaces
        @callback = block
      end

      # Short reader names for the wombat_property_* attributes.
      alias_method :selector, :wombat_property_selector
      alias_method :namespaces, :wombat_property_namespaces
      alias_method :format, :wombat_property_format
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module DSL
    # Hash of named properties built through a method_missing based DSL:
    # every unknown method call registers an entry keyed by the method name.
    class PropertyGroup < Hash
      attr_accessor :wombat_property_name

      # name - name of this group inside its parent (nil when unnamed).
      def initialize(name = nil)
        @wombat_property_name = name
      end

      # Registers an entry named after the called method:
      #
      # * block and no arguments - nested PropertyGroup; an existing entry
      #   with that name is reused and the block is instance_eval'd on it.
      # * second argument :iterator or :follow - Iterator / Follower built
      #   from the first argument; the block, if any, is instance_eval'd on it.
      # * anything else - Property built from the arguments and block.
      #
      # NOTE: respond_to_missing? is deliberately not defined. Callers that
      # probe with respond_to? would otherwise be told every method exists,
      # and calling it would register a bogus property.
      def method_missing(method, *args, &block)
        property_name = method.to_s

        if args.empty? && block
          # TODO: Verify if another property with same name already exists
          # before overwriting
          property_group = self[property_name] || PropertyGroup.new(property_name)
          self[property_name] = property_group
          return property_group.instance_eval(&block)
        end

        # Constants are resolved lazily, only for the branch that needs them.
        group_class =
          case args[1]
          when :iterator then Iterator
          when :follow then Follower
          end

        if group_class
          group = group_class.new(property_name, args.first)
          self[property_name] = group
          group.instance_eval(&block) if block
        else
          self[property_name] = Property.new(property_name, *args, &block)
        end
      end

      # Defined explicitly (returning nil) so that a to_ary call is answered
      # here instead of reaching method_missing and registering a property
      # called "to_ary".
      def to_ary
      end

      # Format reported for plain groups.
      def wombat_property_format
        :container
      end

      # Plain groups carry no namespaces.
      def wombat_property_namespaces
        nil
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Wombat
  module Processing
    # Mixin that resolves a selector against @context, which the including
    # class is expected to have set.
    module NodeSelector
      XPATH_PREFIX = "xpath="
      CSS_PREFIX = "css="

      # selector   - a Symbol, or a String optionally prefixed with "xpath="
      #              or "css=".
      # namespaces - passed through to @context.xpath (XPath selectors only).
      #
      # Returns:
      # * Symbol            - one-element array holding the symbol as a string;
      # * "xpath=..."       - result of @context.xpath on the rest of the string;
      # * "css=..."         - result of @context.css on the rest of the string;
      # * any other String  - one-element array holding the selector itself.
      def select_nodes(selector, namespaces = nil)
        return [selector.to_s] if selector.is_a? Symbol

        if selector.start_with? XPATH_PREFIX
          @context.xpath selector[XPATH_PREFIX.length..-1], namespaces
        elsif selector.start_with? CSS_PREFIX
          @context.css selector[CSS_PREFIX.length..-1]
        else
          [selector]
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/property/locators/factory'
|
3
|
+
require 'wombat/processing/node_selector'
|
4
|
+
require 'mechanize'
|
5
|
+
require 'restclient'
|
6
|
+
|
7
|
+
module Wombat
  module Processing
    # Mixin that fetches the document described by a metadata hash and runs
    # the property locators against it.
    module Parser
      attr_accessor :mechanize, :context, :response_code, :page

      def initialize
        @mechanize = Mechanize.new
      end

      # Fetches the document for +metadata+, keeps its parser in @context and
      # returns whatever the locator built for +metadata+ extracts from it.
      # Errors raised while fetching propagate to the caller.
      def parse(metadata)
        @context = parser_for metadata

        Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
      end

      private

      # Fetches "#{base_url}#{path}" and returns a parser for it: through
      # Mechanize when the document format is :html, otherwise through
      # RestClient with the body parsed by Nokogiri::XML.
      #
      # Side effects: sets @page, and sets @response_code when the page
      # responds to #code. On failure @response_code is taken from the
      # exception's #http_code or #response_code (when it has one) and the
      # exception is re-raised.
      def parser_for(metadata)
        url = "#{metadata[:base_url]}#{metadata[:path]}"

        begin
          parser =
            if metadata[:document_format] == :html
              @page = @mechanize.get(url)
              @page.parser
            else
              @page = RestClient.get(url)
              Nokogiri::XML @page
            end

          @response_code = @page.code.to_i if @page.respond_to? :code
          parser
        rescue => error
          if error.respond_to? :http_code
            @response_code = error.http_code.to_i
          elsif error.respond_to? :response_code
            @response_code = error.response_code.to_i
          end

          raise
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/processing/node_selector'
|
3
|
+
|
4
|
+
module Wombat
  module Property
    module Locators
      # Abstract base class for locators. Subclasses override #locate and
      # hand the raw extracted value to it through a block.
      class Base
        include Wombat::Processing::NodeSelector

        def initialize(property)
          @property = property
        end

        # Remembers +context+, takes the raw value from the given block (nil
        # without one) and passes it through the property's callback when the
        # property has one.
        #
        # Returns { name => value } when the property has a name, otherwise
        # the bare value. +page+ is not used here.
        def locate(context, page = nil)
          @context = context

          data = block_given? ? yield : nil
          if @property.respond_to?(:callback) && @property.callback
            data = @property.callback.call(data)
          end

          name = @property.wombat_property_name
          name ? { name => data } : data
        end

        protected

        # Selects the nodes in +context+ that match the property's selector
        # and namespaces.
        def locate_nodes(context)
          @context = context

          select_nodes(@property.wombat_property_selector, @property.wombat_property_namespaces)
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/property/locators/base'
|
3
|
+
require 'wombat/property/locators/follow'
|
4
|
+
require 'wombat/property/locators/html'
|
5
|
+
require 'wombat/property/locators/iterator'
|
6
|
+
require 'wombat/property/locators/property_group'
|
7
|
+
require 'wombat/property/locators/list'
|
8
|
+
require 'wombat/property/locators/text'
|
9
|
+
|
10
|
+
module Wombat
  module Property
    module Locators
      # Raised when a property reports a format no locator is registered for.
      # NOTE(review): inherits from Exception, so a bare `rescue` will not
      # catch it; kept as-is so existing rescue clauses behave the same.
      class UnknownTypeException < Exception; end

      # Maps a property to the locator that knows how to extract it.
      module Factory
        # property - any object responding to wombat_property_format.
        #
        # Returns a new locator instance wrapping +property+.
        # Raises UnknownTypeException for an unrecognised format.
        def self.locator_for(property)
          klass = case(property.wombat_property_format)
          when :text
            Text
          when :list
            List
          when :html
            Html
          when :iterator
            Iterator
          when :container
            PropertyGroup
          when :follow
            Follow
          else
            # Report the same attribute the dispatch is based on. Property
            # groups have no public #format, so calling it here would fail
            # with an unrelated error instead of this one.
            raise Wombat::Property::Locators::UnknownTypeException, "Unknown property format #{property.wombat_property_format}."
          end

          klass.new(property)
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :follow groups: clicks every matched node and extracts
      # the group's nested properties from the page that comes back.
      class Follow < Base
        # context - document in which the nodes to follow are selected.
        # page    - object whose #click is called with each matched node.
        #
        # Yields to Base#locate one hash per followed node, each merging the
        # results of the nested properties located in the target page.
        def locate(context, page = nil)
          super do
            locate_nodes(context).flat_map do |node|
              target_context = page.click(node).parser

              # Only nested properties and groups are located; any other
              # value stored in the group is skipped.
              nested = @property.values.select do |value|
                value.is_a?(Wombat::DSL::Property) || value.is_a?(Wombat::DSL::PropertyGroup)
              end

              nested.each_with_object({}) do |child, result|
                result.merge! Factory.locator_for(child).locate(target_context, page)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/property/locators/property_group'
|
3
|
+
|
4
|
+
module Wombat
  module Property
    module Locators
      # Locator for :iterator groups: extracts the group's nested properties
      # once for every node matched by the group's selector.
      class Iterator < Base
        # context - document in which the nodes to iterate are selected.
        # page    - passed through unchanged to the nested locators.
        #
        # Yields to Base#locate one hash per matched node, each merging the
        # results of the nested properties located relative to that node.
        def locate(context, page = nil)
          super do
            locate_nodes(context).flat_map do |node|
              # Only nested properties and groups are located; any other
              # value stored in the group is skipped.
              nested = @property.values.select do |value|
                value.is_a?(Wombat::DSL::Property) || value.is_a?(Wombat::DSL::PropertyGroup)
              end

              nested.each_with_object({}) do |child, result|
                result.merge! Factory.locator_for(child).locate(node, page)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :list properties: returns the stripped text of every
      # node matched by the property's selector.
      class List < Base
        # Yields to Base#locate an array with one stripped string per match.
        # Matches that are already strings are stripped directly; for any
        # other match the node's inner_text is used.
        def locate(context, page = nil)
          super do
            locate_nodes(context).map do |node|
              text = node.is_a?(String) ? node : node.inner_text
              text.strip
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :container groups: extracts every nested property from
      # the same context and merges the results into one hash.
      class PropertyGroup < Base
        # Yields to Base#locate a hash merging the results of all nested
        # properties, each located with the same +context+ and +page+.
        def locate(context, page = nil)
          super do
            # Only nested properties and groups are located; any other value
            # stored in the group is skipped.
            nested = @property.values.select do |value|
              value.is_a?(Wombat::DSL::Property) || value.is_a?(Wombat::DSL::PropertyGroup)
            end

            nested.each_with_object({}) do |child, result|
              result.merge! Factory.locator_for(child).locate(context, page)
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :text properties: returns the stripped text of the first
      # node matched by the property's selector.
      class Text < Base
        # Yields to Base#locate the stripped text of the first match, or nil
        # when nothing matched. A match that is already a string is stripped
        # directly; otherwise the node's inner_text is used.
        def locate(context, page = nil)
          node = locate_nodes(context).first

          value = if node
            node.is_a?(String) ? node.strip : node.inner_text.strip
          end

          super { value }
        end
      end
    end
  end
end
|
data/lib/wombat.rb
CHANGED
@@ -3,9 +3,13 @@
|
|
3
3
|
require 'wombat/crawler'
|
4
4
|
|
5
5
|
module Wombat
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
class << self
|
7
|
+
def crawl(&block)
|
8
|
+
klass = Class.new
|
9
|
+
klass.send(:include, Wombat::Crawler)
|
10
|
+
klass.new.crawl(&block)
|
11
|
+
end
|
12
|
+
|
13
|
+
alias_method :scrape, :crawl
|
10
14
|
end
|
11
15
|
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Wombat::Crawler do
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
4
|
+
before(:each) do
|
5
|
+
@crawler = Class.new
|
6
|
+
@crawler.send(:include, Wombat::Crawler)
|
7
|
+
@crawler_instance = @crawler.new
|
8
|
+
end
|
10
9
|
|
10
|
+
describe '#crawl' do
|
11
11
|
it 'should call the provided block' do
|
12
12
|
event_called = false
|
13
13
|
|
@@ -17,8 +17,8 @@ describe Wombat::Crawler do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'should provide metadata to yielded block' do
|
20
|
-
@crawler.event do
|
21
|
-
|
20
|
+
@crawler.event do
|
21
|
+
self.class.should == Wombat::DSL::PropertyGroup
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
@@ -30,7 +30,10 @@ describe Wombat::Crawler do
|
|
30
30
|
e.time Time.now
|
31
31
|
end
|
32
32
|
|
33
|
-
@crawler.venue
|
33
|
+
@crawler.venue do |v|
|
34
|
+
v.name "Scooba"
|
35
|
+
end
|
36
|
+
|
34
37
|
@crawler.location { |v| v.latitude -50.2323 }
|
35
38
|
|
36
39
|
@crawler_instance.should_receive(:parse) do |arg|
|
@@ -62,7 +65,7 @@ describe Wombat::Crawler do
|
|
62
65
|
|
63
66
|
@crawler_instance.should_receive(:parse) do |arg|
|
64
67
|
prop = arg['some_data']
|
65
|
-
prop.
|
68
|
+
prop.wombat_property_name.should == "some_data"
|
66
69
|
prop.selector.should == "/event/list"
|
67
70
|
prop.format.should == :html
|
68
71
|
prop.namespaces.should == "geo"
|
@@ -73,12 +76,12 @@ describe Wombat::Crawler do
|
|
73
76
|
end
|
74
77
|
|
75
78
|
it 'should be able to specify arbitrary block structure more than once' do
|
76
|
-
@crawler.structure do
|
77
|
-
|
79
|
+
@crawler.structure do
|
80
|
+
data "xpath=/xyz"
|
78
81
|
end
|
79
82
|
|
80
|
-
@crawler.structure do
|
81
|
-
|
83
|
+
@crawler.structure do
|
84
|
+
another "css=.information"
|
82
85
|
end
|
83
86
|
|
84
87
|
@crawler_instance.should_receive(:parse) do |arg|
|
@@ -93,26 +96,6 @@ describe Wombat::Crawler do
|
|
93
96
|
@crawler.event
|
94
97
|
end
|
95
98
|
|
96
|
-
it 'should iterate on elements inside for_each block' do
|
97
|
-
@crawler.for_each "css=.element" do
|
98
|
-
title "css=.title"
|
99
|
-
body "css=.body"
|
100
|
-
event do |e|
|
101
|
-
e.all "yeah"
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
@crawler_instance.should_receive(:parse) do |arg|
|
106
|
-
it = arg.iterators.first
|
107
|
-
it.selector.should == "css=.element"
|
108
|
-
it["title"].selector.should == "css=.title"
|
109
|
-
it["body"].selector.should == "css=.body"
|
110
|
-
it["event"]["all"].selector.should == "yeah"
|
111
|
-
end
|
112
|
-
|
113
|
-
@crawler_instance.crawl
|
114
|
-
end
|
115
|
-
|
116
99
|
it 'should assign metadata format' do
|
117
100
|
@crawler_instance.should_receive(:parse) do |arg|
|
118
101
|
arg[:document_format].should == :xml
|
@@ -123,22 +106,22 @@ describe Wombat::Crawler do
|
|
123
106
|
|
124
107
|
it 'should crawl with block' do
|
125
108
|
@crawler.base_url "danielnc.com"
|
126
|
-
@crawler.
|
109
|
+
@crawler.path "/itens"
|
127
110
|
|
128
111
|
@crawler_instance.should_receive(:parse) do |arg|
|
129
112
|
arg[:base_url].should == "danielnc.com"
|
130
|
-
arg[:
|
113
|
+
arg[:path].should == "/itens/1"
|
131
114
|
end
|
132
115
|
|
133
116
|
@crawler_instance.crawl do
|
134
|
-
|
117
|
+
path "/itens/1"
|
135
118
|
end
|
136
119
|
|
137
120
|
another_instance = @crawler.new
|
138
121
|
|
139
122
|
another_instance.should_receive(:parse) do |arg|
|
140
123
|
arg[:base_url].should == "danielnc.com"
|
141
|
-
arg[:
|
124
|
+
arg[:path].should == "/itens"
|
142
125
|
end
|
143
126
|
|
144
127
|
another_instance.crawl
|
@@ -146,15 +129,15 @@ describe Wombat::Crawler do
|
|
146
129
|
|
147
130
|
it 'should remove created method missing' do
|
148
131
|
@crawler.base_url "danielnc.com"
|
149
|
-
@crawler.
|
132
|
+
@crawler.path "/itens"
|
150
133
|
|
151
134
|
@crawler_instance.should_receive(:parse) do |arg|
|
152
135
|
arg[:base_url].should == "danielnc.com"
|
153
|
-
arg[:
|
136
|
+
arg[:path].should == "/itens/1"
|
154
137
|
end
|
155
138
|
|
156
139
|
@crawler_instance.crawl do
|
157
|
-
|
140
|
+
path "/itens/1"
|
158
141
|
end
|
159
142
|
|
160
143
|
lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
|
@@ -162,15 +145,15 @@ describe Wombat::Crawler do
|
|
162
145
|
|
163
146
|
it 'should remove created instance variable' do
|
164
147
|
@crawler.base_url "danielnc.com"
|
165
|
-
@crawler.
|
148
|
+
@crawler.path "/itens"
|
166
149
|
|
167
150
|
@crawler_instance.should_receive(:parse) do |arg|
|
168
151
|
arg[:base_url].should == "danielnc.com"
|
169
|
-
arg[:
|
152
|
+
arg[:path].should == "/itens/1"
|
170
153
|
end
|
171
154
|
|
172
155
|
@crawler_instance.crawl do
|
173
|
-
|
156
|
+
path "/itens/1"
|
174
157
|
end
|
175
158
|
|
176
159
|
@crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
|
@@ -181,7 +164,7 @@ describe Wombat::Crawler do
|
|
181
164
|
VCR.use_cassette('basic_crawler_page') do
|
182
165
|
|
183
166
|
@crawler.base_url "http://www.terra.com.br"
|
184
|
-
@crawler.
|
167
|
+
@crawler.path '/portal'
|
185
168
|
|
186
169
|
@crawler.search "css=.btn-search"
|
187
170
|
|
@@ -194,7 +177,7 @@ describe Wombat::Crawler do
|
|
194
177
|
VCR.use_cassette('basic_crawler_page') do
|
195
178
|
|
196
179
|
@crawler.base_url "http://www.terra.com.br"
|
197
|
-
@crawler.
|
180
|
+
@crawler.path '/portal'
|
198
181
|
|
199
182
|
@crawler.search "css=.btn-search"
|
200
183
|
@crawler.document_format :xml
|
@@ -208,7 +191,7 @@ describe Wombat::Crawler do
|
|
208
191
|
VCR.use_cassette('error_page') do
|
209
192
|
|
210
193
|
@crawler.base_url "http://www.terra.com.br"
|
211
|
-
@crawler.
|
194
|
+
@crawler.path '/portal'
|
212
195
|
|
213
196
|
@crawler.search "css=.btn-search"
|
214
197
|
|
@@ -221,7 +204,7 @@ describe Wombat::Crawler do
|
|
221
204
|
VCR.use_cassette('error_page') do
|
222
205
|
|
223
206
|
@crawler.base_url "http://www.terra.com.br"
|
224
|
-
@crawler.
|
207
|
+
@crawler.path '/portal'
|
225
208
|
|
226
209
|
@crawler.search "css=.btn-search"
|
227
210
|
@crawler.document_format :xml
|
@@ -231,4 +214,11 @@ describe Wombat::Crawler do
|
|
231
214
|
end
|
232
215
|
end
|
233
216
|
end
|
217
|
+
|
218
|
+
describe '#scrape' do
|
219
|
+
it 'should alias to crawl' do
|
220
|
+
@crawler_instance.should_receive :parse
|
221
|
+
@crawler_instance.scrape
|
222
|
+
end
|
223
|
+
end
|
234
224
|
end
|