wombat 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +13 -30
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/follow_links.yml +2143 -0
- data/lib/wombat/crawler.rb +7 -17
- data/lib/wombat/dsl/follower.rb +19 -0
- data/lib/wombat/dsl/iterator.rb +19 -0
- data/lib/wombat/dsl/metadata.rb +27 -0
- data/lib/wombat/dsl/property.rb +27 -0
- data/lib/wombat/dsl/property_group.rb +48 -0
- data/lib/wombat/processing/node_selector.rb +12 -0
- data/lib/wombat/processing/parser.rb +48 -0
- data/lib/wombat/property/locators/base.rb +33 -0
- data/lib/wombat/property/locators/factory.rb +39 -0
- data/lib/wombat/property/locators/follow.rb +25 -0
- data/lib/wombat/property/locators/html.rb +14 -0
- data/lib/wombat/property/locators/iterator.rb +23 -0
- data/lib/wombat/property/locators/list.rb +17 -0
- data/lib/wombat/property/locators/property_group.rb +20 -0
- data/lib/wombat/property/locators/text.rb +22 -0
- data/lib/wombat.rb +8 -4
- data/spec/crawler_spec.rb +38 -48
- data/spec/dsl/property_spec.rb +12 -0
- data/spec/helpers/sample_crawler.rb +2 -15
- data/spec/integration/integration_spec.rb +61 -33
- data/spec/processing/parser_spec.rb +32 -0
- data/spec/property/locators/factory_spec.rb +18 -0
- data/spec/property/locators/follow_spec.rb +4 -0
- data/spec/property/locators/html_spec.rb +15 -0
- data/spec/property/locators/iterator_spec.rb +4 -0
- data/spec/property/locators/list_spec.rb +13 -0
- data/spec/property/locators/text_spec.rb +49 -0
- data/spec/sample_crawler_spec.rb +7 -11
- data/spec/wombat_spec.rb +13 -1
- data/wombat.gemspec +27 -16
- metadata +27 -16
- data/lib/wombat/iterator.rb +0 -38
- data/lib/wombat/metadata.rb +0 -24
- data/lib/wombat/node_selector.rb +0 -10
- data/lib/wombat/parser.rb +0 -59
- data/lib/wombat/property.rb +0 -21
- data/lib/wombat/property_container.rb +0 -70
- data/lib/wombat/property_locator.rb +0 -20
- data/spec/iterator_spec.rb +0 -52
- data/spec/metadata_spec.rb +0 -20
- data/spec/parser_spec.rb +0 -125
- data/spec/property_container_spec.rb +0 -62
- data/spec/property_locator_spec.rb +0 -75
- data/spec/property_spec.rb +0 -16
data/lib/wombat/crawler.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require 'wombat/metadata'
|
3
|
-
require 'wombat/property'
|
4
|
-
require 'wombat/parser'
|
2
|
+
require 'wombat/dsl/metadata'
|
3
|
+
require 'wombat/dsl/property'
|
4
|
+
require 'wombat/processing/parser'
|
5
5
|
require 'active_support'
|
6
6
|
require 'date'
|
7
7
|
|
8
8
|
module Wombat
|
9
9
|
module Crawler
|
10
|
-
include Parser
|
10
|
+
include Processing::Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
13
|
def crawl(&block)
|
@@ -31,33 +31,23 @@ module Wombat
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
+
alias_method :scrape, :crawl
|
35
|
+
|
34
36
|
def method_missing(method, *args, &block)
|
35
37
|
self.class.send method, *args, &block
|
36
38
|
end
|
37
39
|
|
38
|
-
def for_each(selector, &block)
|
39
|
-
self.class.for_each selector, &block
|
40
|
-
end
|
41
|
-
|
42
40
|
module ClassMethods
|
43
41
|
def method_missing(method, *args, &block)
|
44
42
|
metadata.send method, *args, &block
|
45
43
|
end
|
46
44
|
|
47
|
-
def for_each(selector, &block)
|
48
|
-
metadata.for_each(selector).instance_eval(&block) if block
|
49
|
-
end
|
50
|
-
|
51
|
-
def follow_links(selector)
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
45
|
def to_ary
|
56
46
|
end
|
57
47
|
|
58
48
|
private
|
59
49
|
def metadata
|
60
|
-
@metadata ||= Metadata.new
|
50
|
+
@metadata ||= DSL::Metadata.new
|
61
51
|
end
|
62
52
|
end
|
63
53
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Wombat
  module DSL
    # Property group tagged with the :follow format. It carries the selector
    # that Property::Locators::Follow resolves to nodes and clicks through
    # before locating the nested properties on each target page.
    class Follower < PropertyGroup
      attr_accessor :wombat_property_selector

      # name     - key under which the located data is returned.
      # selector - selector for the nodes to click through.
      def initialize(name, selector)
        super(name)

        @wombat_property_selector = selector
      end

      # Format tag that Property::Locators::Factory maps to the Follow
      # locator.
      def wombat_property_format
        :follow
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Wombat
  module DSL
    # Property group tagged with the :iterator format. It carries the
    # selector whose matched nodes Property::Locators::Iterator walks,
    # locating the nested properties once per node.
    class Iterator < PropertyGroup
      attr_accessor :wombat_property_selector

      # name     - key under which the located data is returned.
      # selector - selector for the nodes to iterate over.
      def initialize(name, selector)
        super(name)

        @wombat_property_selector = selector
      end

      # Format tag that Property::Locators::Factory maps to the Iterator
      # locator.
      def wombat_property_format
        :iterator
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/dsl/property_group'
|
3
|
+
require 'wombat/dsl/iterator'
|
4
|
+
require 'wombat/dsl/follower'
|
5
|
+
|
6
|
+
module Wombat
  module DSL
    # Root property group of a crawler. Next to the declared properties it
    # keeps the crawl settings under the :base_url, :path and
    # :document_format keys; :document_format starts out as :html.
    class Metadata < PropertyGroup
      def initialize
        store(:document_format, :html)
        super()
      end

      # Records +url+ under :base_url and returns it.
      def base_url(url)
        store(:base_url, url)
      end

      # Records +url+ under :path and returns it.
      def path(url)
        store(:path, url)
      end

      # Records +format+ under :document_format and returns it.
      def document_format(format)
        store(:document_format, format)
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Wombat
  module DSL
    # A single declared property: its name, selector, format, optional
    # namespaces and an optional callback block.
    class Property
      attr_accessor :wombat_property_name,
                    :wombat_property_selector,
                    :wombat_property_format,
                    :wombat_property_namespaces,
                    :callback

      # name  - property name.
      # args  - positional [selector, format, namespaces]; format falls back
      #         to :text when missing or nil.
      # block - optional callback kept in #callback.
      def initialize(name, *args, &block)
        selector, format, namespaces = args

        @wombat_property_name       = name
        @wombat_property_selector   = selector
        @wombat_property_format     = format || :text
        @wombat_property_namespaces = namespaces
        @callback                   = block
      end

      # Short read-only names for the wombat_property_* readers.
      alias_method :selector,   :wombat_property_selector
      alias_method :namespaces, :wombat_property_namespaces
      alias_method :format,     :wombat_property_format
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module DSL
    # Hash of declared properties keyed by property name. Properties are
    # declared by calling otherwise undefined methods on the group; see
    # #method_missing.
    class PropertyGroup < Hash
      attr_accessor :wombat_property_name

      def initialize(name = nil)
        @wombat_property_name = name
      end

      # Turns an unknown method call into a declaration stored under the
      # method name (as a String):
      #
      # * block and no arguments - reuses or creates a nested PropertyGroup
      #   and evaluates the block in it;
      # * second argument :iterator or :follow - stores an Iterator or
      #   Follower built from the first argument and evaluates the block in
      #   it when one is given;
      # * anything else - stores a Property built from the arguments and
      #   the block.
      def method_missing(method, *args, &block)
        key = method.to_s

        if args.empty? && block
          # TODO: Verify if another property with same name already exists
          # before overwriting
          group = self[key] || PropertyGroup.new(key)
          self[key] = group
          return group.instance_eval(&block)
        end

        nested_class =
          case args[1]
          when :iterator then Iterator
          when :follow   then Follower
          end

        if nested_class
          nested = nested_class.new(key, args.first)
          self[key] = nested
          nested.instance_eval(&block) if block
        else
          self[key] = Property.new(key, *args, &block)
        end
      end

      # Defined with an empty body so this call is not routed through
      # #method_missing; it returns nil.
      def to_ary
      end

      # Format tag that Property::Locators::Factory maps to the
      # PropertyGroup locator.
      def wombat_property_format
        :container
      end

      # A group has no namespaces.
      def wombat_property_namespaces
        nil
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Wombat
  module Processing
    # Mixin that resolves a selector against the including object's
    # @context.
    module NodeSelector
      # selector   - a Symbol, or a String optionally prefixed with
      #              "xpath=" or "css=".
      # namespaces - forwarded to @context.xpath for "xpath=" selectors.
      #
      # Returns the result of @context.xpath / @context.css for prefixed
      # selectors (prefix removed); otherwise a one-element Array holding
      # the selector itself (Symbols are converted to String).
      def select_nodes(selector, namespaces = nil)
        if selector.is_a?(Symbol)
          [selector.to_s]
        elsif selector.start_with?("xpath=")
          @context.xpath(selector[6..-1], namespaces)
        elsif selector.start_with?("css=")
          @context.css(selector[4..-1])
        else
          [selector]
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/property/locators/factory'
|
3
|
+
require 'wombat/processing/node_selector'
|
4
|
+
require 'mechanize'
|
5
|
+
require 'restclient'
|
6
|
+
|
7
|
+
module Wombat
  module Processing
    # Mixin that downloads the document described by a metadata object and
    # runs the property locators over it.
    module Parser
      attr_accessor :mechanize, :context, :response_code, :page

      def initialize
        @mechanize = Mechanize.new
      end

      # Fetches the document for +metadata+, keeps its parser in @context and
      # returns whatever the locator chosen by
      # Wombat::Property::Locators::Factory locates in it. The Mechanize
      # agent is handed to the locator as its second argument.
      def parse(metadata)
        @context = parser_for metadata

        Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
      end

      private

      # Downloads "#{base_url}#{path}" and returns a parser for it.
      #
      # With metadata[:document_format] == :html the page is fetched through
      # the Mechanize agent and its own parser is returned; any other format
      # is fetched with RestClient and wrapped in Nokogiri::XML. The fetched
      # page is kept in @page and, when it responds to #code, its status in
      # @response_code.
      #
      # Any StandardError is re-raised unchanged after copying its
      # #http_code or #response_code (whichever it responds to first) into
      # @response_code.
      def parser_for(metadata)
        url = "#{metadata[:base_url]}#{metadata[:path]}"

        parser =
          if metadata[:document_format] == :html
            @page = @mechanize.get(url)
            @page.parser
          else
            @page = RestClient.get(url)
            Nokogiri::XML @page
          end

        @response_code = @page.code.to_i if @page.respond_to? :code
        parser
      rescue => e
        if e.respond_to? :http_code
          @response_code = e.http_code.to_i
        elsif e.respond_to? :response_code
          @response_code = e.response_code.to_i
        end

        # Bare raise re-raises the rescued exception as-is.
        raise
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/processing/node_selector'
|
3
|
+
|
4
|
+
module Wombat
  module Property
    module Locators
      # Abstract base class for the locators. Subclasses call #locate with a
      # block that produces the raw data for the wrapped property.
      class Base
        include Wombat::Processing::NodeSelector

        # property - the DSL object this locator extracts data for.
        def initialize(property)
          @property = property
        end

        # Remembers +context+, obtains the raw data from the given block
        # (nil without a block) and passes it through the property's
        # callback when the property has one.
        #
        # Returns { name => data } when the property has a name, otherwise
        # the data itself.
        def locate(context, page = nil)
          @context = context

          extracted = block_given? ? yield : nil

          handler = @property.callback if @property.respond_to?(:callback)
          data = handler ? handler.call(extracted) : extracted

          label = @property.wombat_property_name
          label ? { label => data } : data
        end

        protected

        # Remembers +context+ and resolves the property's selector (with its
        # namespaces) against it.
        def locate_nodes(context)
          @context = context

          select_nodes(@property.wombat_property_selector,
                       @property.wombat_property_namespaces)
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/property/locators/base'
|
3
|
+
require 'wombat/property/locators/follow'
|
4
|
+
require 'wombat/property/locators/html'
|
5
|
+
require 'wombat/property/locators/iterator'
|
6
|
+
require 'wombat/property/locators/property_group'
|
7
|
+
require 'wombat/property/locators/list'
|
8
|
+
require 'wombat/property/locators/text'
|
9
|
+
|
10
|
+
module Wombat
  module Property
    module Locators
      # Raised by Factory.locator_for for a property format it has no
      # locator for. Subclasses StandardError so that a plain `rescue`
      # catches it.
      class UnknownTypeException < StandardError; end

      # Chooses the locator class for a property from its
      # wombat_property_format.
      module Factory
        # property - any object responding to #wombat_property_format.
        #
        # Returns a new locator wrapping +property+.
        # Raises UnknownTypeException for an unsupported format.
        def self.locator_for(property)
          klass =
            case property.wombat_property_format
            when :text      then Text
            when :list      then List
            when :html      then Html
            when :iterator  then Iterator
            when :container then PropertyGroup
            when :follow    then Follow
            else
              # Report the same value the case dispatches on. Property groups
              # are Hashes with method_missing, so calling #format on them
              # would register a bogus "format" property.
              raise UnknownTypeException,
                    "Unknown property format #{property.wombat_property_format}."
            end

          klass.new(property)
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :follow properties.
      class Follow < Base
        # Resolves the property's selector in +context+, passes every
        # resulting node to page.click and, for each page obtained that way,
        # locates the nested properties against that page's parser.
        #
        # The block handed to Base#locate yields one Hash of merged nested
        # results per clicked node.
        def locate(context, page = nil)
          super do
            locate_nodes(context).flat_map do |link|
              followed = page.click(link).parser

              children = @property.values.select do |value|
                value.is_a?(Wombat::DSL::Property) || value.is_a?(Wombat::DSL::PropertyGroup)
              end

              children.each_with_object({}) do |child, merged|
                merged.merge! Factory.locator_for(child).locate(followed, page)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat/property/locators/property_group'
|
3
|
+
|
4
|
+
module Wombat
  module Property
    module Locators
      # Locator for :iterator properties.
      class Iterator < Base
        # Resolves the property's selector in +context+ and locates the
        # nested properties once per resulting node, using that node as
        # their context.
        #
        # The block handed to Base#locate yields one Hash of merged nested
        # results per node.
        def locate(context, page = nil)
          super do
            locate_nodes(context).flat_map do |element|
              children = @property.values.select do |value|
                value.is_a?(Wombat::DSL::Property) || value.is_a?(Wombat::DSL::PropertyGroup)
              end

              children.each_with_object({}) do |child, merged|
                merged.merge! Factory.locator_for(child).locate(element, page)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :list properties.
      class List < Base
        # Resolves the property's selector in +context+ and yields to
        # Base#locate the stripped text of every result: Strings are used as
        # they are, anything else is asked for its #inner_text.
        def locate(context, page = nil)
          super do
            locate_nodes(context).map do |item|
              text = item.is_a?(String) ? item : item.inner_text
              text.strip
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :container properties (plain property groups).
      class PropertyGroup < Base
        # Locates every nested Property / PropertyGroup of the group against
        # the same +context+ and yields the merged results to Base#locate as
        # a single Hash.
        def locate(context, page = nil)
          super do
            children = @property.values.select do |value|
              value.is_a?(Wombat::DSL::Property) || value.is_a?(Wombat::DSL::PropertyGroup)
            end

            children.each_with_object({}) do |child, merged|
              merged.merge! Factory.locator_for(child).locate(context, page)
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
module Wombat
  module Property
    module Locators
      # Locator for :text properties.
      class Text < Base
        # Takes the first result of the property's selector in +context+ and
        # yields its stripped text to Base#locate: a String is used as it
        # is, anything else is asked for its #inner_text. Yields nil when
        # the selector produced nothing.
        def locate(context, page = nil)
          first_match = locate_nodes(context).first

          value =
            if first_match
              text = first_match.is_a?(String) ? first_match : first_match.inner_text
              text.strip
            end

          super { value }
        end
      end
    end
  end
end
|
data/lib/wombat.rb
CHANGED
@@ -3,9 +3,13 @@
|
|
3
3
|
require 'wombat/crawler'
|
4
4
|
|
5
5
|
module Wombat
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
class << self
|
7
|
+
def crawl(&block)
|
8
|
+
klass = Class.new
|
9
|
+
klass.send(:include, Wombat::Crawler)
|
10
|
+
klass.new.crawl(&block)
|
11
|
+
end
|
12
|
+
|
13
|
+
alias_method :scrape, :crawl
|
10
14
|
end
|
11
15
|
end
|
data/spec/crawler_spec.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Wombat::Crawler do
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
4
|
+
before(:each) do
|
5
|
+
@crawler = Class.new
|
6
|
+
@crawler.send(:include, Wombat::Crawler)
|
7
|
+
@crawler_instance = @crawler.new
|
8
|
+
end
|
10
9
|
|
10
|
+
describe '#crawl' do
|
11
11
|
it 'should call the provided block' do
|
12
12
|
event_called = false
|
13
13
|
|
@@ -17,8 +17,8 @@ describe Wombat::Crawler do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'should provide metadata to yielded block' do
|
20
|
-
@crawler.event do
|
21
|
-
|
20
|
+
@crawler.event do
|
21
|
+
self.class.should == Wombat::DSL::PropertyGroup
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
@@ -30,7 +30,10 @@ describe Wombat::Crawler do
|
|
30
30
|
e.time Time.now
|
31
31
|
end
|
32
32
|
|
33
|
-
@crawler.venue
|
33
|
+
@crawler.venue do |v|
|
34
|
+
v.name "Scooba"
|
35
|
+
end
|
36
|
+
|
34
37
|
@crawler.location { |v| v.latitude -50.2323 }
|
35
38
|
|
36
39
|
@crawler_instance.should_receive(:parse) do |arg|
|
@@ -62,7 +65,7 @@ describe Wombat::Crawler do
|
|
62
65
|
|
63
66
|
@crawler_instance.should_receive(:parse) do |arg|
|
64
67
|
prop = arg['some_data']
|
65
|
-
prop.
|
68
|
+
prop.wombat_property_name.should == "some_data"
|
66
69
|
prop.selector.should == "/event/list"
|
67
70
|
prop.format.should == :html
|
68
71
|
prop.namespaces.should == "geo"
|
@@ -73,12 +76,12 @@ describe Wombat::Crawler do
|
|
73
76
|
end
|
74
77
|
|
75
78
|
it 'should be able to specify arbitrary block structure more than once' do
|
76
|
-
@crawler.structure do
|
77
|
-
|
79
|
+
@crawler.structure do
|
80
|
+
data "xpath=/xyz"
|
78
81
|
end
|
79
82
|
|
80
|
-
@crawler.structure do
|
81
|
-
|
83
|
+
@crawler.structure do
|
84
|
+
another "css=.information"
|
82
85
|
end
|
83
86
|
|
84
87
|
@crawler_instance.should_receive(:parse) do |arg|
|
@@ -93,26 +96,6 @@ describe Wombat::Crawler do
|
|
93
96
|
@crawler.event
|
94
97
|
end
|
95
98
|
|
96
|
-
it 'should iterate on elements inside for_each block' do
|
97
|
-
@crawler.for_each "css=.element" do
|
98
|
-
title "css=.title"
|
99
|
-
body "css=.body"
|
100
|
-
event do |e|
|
101
|
-
e.all "yeah"
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
@crawler_instance.should_receive(:parse) do |arg|
|
106
|
-
it = arg.iterators.first
|
107
|
-
it.selector.should == "css=.element"
|
108
|
-
it["title"].selector.should == "css=.title"
|
109
|
-
it["body"].selector.should == "css=.body"
|
110
|
-
it["event"]["all"].selector.should == "yeah"
|
111
|
-
end
|
112
|
-
|
113
|
-
@crawler_instance.crawl
|
114
|
-
end
|
115
|
-
|
116
99
|
it 'should assign metadata format' do
|
117
100
|
@crawler_instance.should_receive(:parse) do |arg|
|
118
101
|
arg[:document_format].should == :xml
|
@@ -123,22 +106,22 @@ describe Wombat::Crawler do
|
|
123
106
|
|
124
107
|
it 'should crawl with block' do
|
125
108
|
@crawler.base_url "danielnc.com"
|
126
|
-
@crawler.
|
109
|
+
@crawler.path "/itens"
|
127
110
|
|
128
111
|
@crawler_instance.should_receive(:parse) do |arg|
|
129
112
|
arg[:base_url].should == "danielnc.com"
|
130
|
-
arg[:
|
113
|
+
arg[:path].should == "/itens/1"
|
131
114
|
end
|
132
115
|
|
133
116
|
@crawler_instance.crawl do
|
134
|
-
|
117
|
+
path "/itens/1"
|
135
118
|
end
|
136
119
|
|
137
120
|
another_instance = @crawler.new
|
138
121
|
|
139
122
|
another_instance.should_receive(:parse) do |arg|
|
140
123
|
arg[:base_url].should == "danielnc.com"
|
141
|
-
arg[:
|
124
|
+
arg[:path].should == "/itens"
|
142
125
|
end
|
143
126
|
|
144
127
|
another_instance.crawl
|
@@ -146,15 +129,15 @@ describe Wombat::Crawler do
|
|
146
129
|
|
147
130
|
it 'should remove created method missing' do
|
148
131
|
@crawler.base_url "danielnc.com"
|
149
|
-
@crawler.
|
132
|
+
@crawler.path "/itens"
|
150
133
|
|
151
134
|
@crawler_instance.should_receive(:parse) do |arg|
|
152
135
|
arg[:base_url].should == "danielnc.com"
|
153
|
-
arg[:
|
136
|
+
arg[:path].should == "/itens/1"
|
154
137
|
end
|
155
138
|
|
156
139
|
@crawler_instance.crawl do
|
157
|
-
|
140
|
+
path "/itens/1"
|
158
141
|
end
|
159
142
|
|
160
143
|
lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
|
@@ -162,15 +145,15 @@ describe Wombat::Crawler do
|
|
162
145
|
|
163
146
|
it 'should remove created instance variable' do
|
164
147
|
@crawler.base_url "danielnc.com"
|
165
|
-
@crawler.
|
148
|
+
@crawler.path "/itens"
|
166
149
|
|
167
150
|
@crawler_instance.should_receive(:parse) do |arg|
|
168
151
|
arg[:base_url].should == "danielnc.com"
|
169
|
-
arg[:
|
152
|
+
arg[:path].should == "/itens/1"
|
170
153
|
end
|
171
154
|
|
172
155
|
@crawler_instance.crawl do
|
173
|
-
|
156
|
+
path "/itens/1"
|
174
157
|
end
|
175
158
|
|
176
159
|
@crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
|
@@ -181,7 +164,7 @@ describe Wombat::Crawler do
|
|
181
164
|
VCR.use_cassette('basic_crawler_page') do
|
182
165
|
|
183
166
|
@crawler.base_url "http://www.terra.com.br"
|
184
|
-
@crawler.
|
167
|
+
@crawler.path '/portal'
|
185
168
|
|
186
169
|
@crawler.search "css=.btn-search"
|
187
170
|
|
@@ -194,7 +177,7 @@ describe Wombat::Crawler do
|
|
194
177
|
VCR.use_cassette('basic_crawler_page') do
|
195
178
|
|
196
179
|
@crawler.base_url "http://www.terra.com.br"
|
197
|
-
@crawler.
|
180
|
+
@crawler.path '/portal'
|
198
181
|
|
199
182
|
@crawler.search "css=.btn-search"
|
200
183
|
@crawler.document_format :xml
|
@@ -208,7 +191,7 @@ describe Wombat::Crawler do
|
|
208
191
|
VCR.use_cassette('error_page') do
|
209
192
|
|
210
193
|
@crawler.base_url "http://www.terra.com.br"
|
211
|
-
@crawler.
|
194
|
+
@crawler.path '/portal'
|
212
195
|
|
213
196
|
@crawler.search "css=.btn-search"
|
214
197
|
|
@@ -221,7 +204,7 @@ describe Wombat::Crawler do
|
|
221
204
|
VCR.use_cassette('error_page') do
|
222
205
|
|
223
206
|
@crawler.base_url "http://www.terra.com.br"
|
224
|
-
@crawler.
|
207
|
+
@crawler.path '/portal'
|
225
208
|
|
226
209
|
@crawler.search "css=.btn-search"
|
227
210
|
@crawler.document_format :xml
|
@@ -231,4 +214,11 @@ describe Wombat::Crawler do
|
|
231
214
|
end
|
232
215
|
end
|
233
216
|
end
|
217
|
+
|
218
|
+
describe '#scrape' do
|
219
|
+
it 'should alias to crawl' do
|
220
|
+
@crawler_instance.should_receive :parse
|
221
|
+
@crawler_instance.scrape
|
222
|
+
end
|
223
|
+
end
|
234
224
|
end
|