extraloop 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +2 -0
- data/README.md +135 -0
- data/examples/google_news_scraper.rb +22 -0
- data/examples/wikipedia_categories.rb +49 -0
- data/lib/extraloop/dom_extractor.rb +45 -0
- data/lib/extraloop/extraction_environment.rb +20 -0
- data/lib/extraloop/extraction_loop.rb +46 -0
- data/lib/extraloop/extractor_base.rb +40 -0
- data/lib/extraloop/hookable.rb +26 -0
- data/lib/extraloop/iterative_scraper.rb +291 -0
- data/lib/extraloop/json_extractor.rb +36 -0
- data/lib/extraloop/loggable.rb +64 -0
- data/lib/extraloop/scraper_base.rb +166 -0
- data/lib/extraloop/utils.rb +75 -0
- data/lib/extraloop.rb +43 -0
- data/spec/dom_extractor_spec.rb +165 -0
- data/spec/extraction_loop_spec.rb +76 -0
- data/spec/fixtures/doc.html +1324 -0
- data/spec/fixtures/doc.json +1 -0
- data/spec/helpers/scraper_helper.rb +46 -0
- data/spec/helpers/spec_helper.rb +12 -0
- data/spec/iterative_scraper_spec.rb +175 -0
- data/spec/json_extractor_spec.rb +146 -0
- data/spec/loggable_spec.rb +25 -0
- data/spec/scraper_base_spec.rb +178 -0
- data/spec/utils_spec.rb +44 -0
- metadata +140 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
# Extracts fields and node lists from parsed JSON documents.
#
# NOTE(review): removed the stray `require 'pry'` that headed this file --
# a debugging-console dependency that must not be required by library code.
class JsonExtractor < ExtractorBase

  # Public: Initializes the extractor.
  #
  # args - The arguments accepted by ExtractorBase#initialize. When the third
  #        argument is an Array, it is treated as a key path (see
  #        Utils::DeepFetchable#get_in) used to drill into the parsed document.
  def initialize(*args)
    @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
    super(*args)
  end

  # Public: Extracts a single value from a JSON node.
  #
  # node   - A parsed JSON fragment, or a raw JSON string (parsed on the fly).
  # record - The record currently being populated (optional; forwarded to the
  #          extractor's callback).
  #
  # Returns the extracted value.
  def extract_field(node, record=nil)
    output = node = node.is_a?(String) ? parse(node) : node
    output = node.get_in(@path) if @path
    output = node[@attribute.to_s] if @attribute
    output = @environment.run(output, record, &@callback) if @callback

    # when no attribute and no callback is provided, try fetching by field name
    if !@attribute && !@callback
      output = node[@field_name.to_s] if node[@field_name.to_s]
    end
    output
  end

  # Public: Extracts the list of nodes over which the extraction loop iterates.
  #
  # input - A parsed JSON fragment, or a raw JSON string (parsed on the fly).
  #
  # Returns the node list (the callback's return value wrapped in an Array
  # when a callback is set; the input itself otherwise).
  def extract_list(input)
    #TODO: implement more clever stuff here after looking
    # into possible hash traversal techniques

    input = input.is_a?(String) ? parse(input) : input
    input = input.get_in(@path) if @path

    @callback && Array(@environment.run(input, &@callback)) || input
  end

  # Public: Parses a JSON string and stores the resulting structure --
  # extended with Utils::DeepFetchable -- as the environment's document.
  #
  # Returns the parsed document.
  def parse(input)
    super(input)
    @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
autoload :Logging, "logging"

# Decorates a class with an instance of Logging.logger and a convenient
# helper method to log messages.

module Loggable
  protected

  #
  # Initializes the incorporated logger object.
  #
  # Reads @options[:log]; does nothing when logging is disabled (falsy).
  #
  # Returns nothing.
  #
  def init_log!
    return unless @options[:log]

    # NOTE(review): the original code read `@options[:log] ||= {...}` right
    # after the guard above. Since the guard already returned when
    # @options[:log] was falsy, the ||= was dead code: the defaults were never
    # applied, and passing `:log => true` crashed below when the value was
    # indexed as a hash. Merge the user's options over the defaults instead.
    user_options = @options[:log].is_a?(Hash) ? @options[:log] : {}
    @options[:log] = {
      :appenders => [ Logging.appenders.stderr ],
      :log_level => :info
    }.merge(user_options)

    if @options[:log][:appenders] && @options[:log][:appenders].any?
      @log = Logging.logger["#{self}"]
      @log.add_appenders(@options[:log][:appenders])
      @log.level = @options[:log][:log_level] || :info
    end
  end

  #
  # Convenience method for logging messages.
  #
  # message   - the message content
  # log_level - the message's log level (can be either :info, :debug, :error, :warning; defaults to :info)
  #
  # Returns nothing.
  #
  def log(message, log_level = :info)
    @log.send(log_level, message) if @log
  end
end


#
# Monkey patches ScraperBase.
#
class ScraperBase
  include Loggable
  alias_method :base_initialize, :initialize

  #
  # Wraps the ScraperBase#initialize method so that the Loggable logger is
  # set up right after the scraper's own initialisation.
  #
  # args - The arguments to be passed over to the ScraperBase#initialize method.
  #
  # Returns itself.
  #
  def initialize(*args)
    base_initialize(*args)
    init_log!
    self
  end
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
class ScraperBase
  include Hookable
  include Utils::Support

  attr_reader :results, :options

  #
  # Public: Initalizes a web scraper.
  #
  # urls - One or several urls.
  # options - Hash of scraper options
  #   async : Whether the scraper should issue HTTP requests in series or in parallel.
  #   log : logging options (set to false to suppress logging completely).
  #     appenders : specifies where the log messages should be appended to (defaults to standard error).
  #     log_level : specifies the log level (defaults to info).
  #   hydra : options forwarded to Typhoeus::Hydra (:max_concurrency, defaults to 10).
  # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
  #
  # Returns itself.
  #
  def initialize(urls, options = {}, arguments = {})
    @urls = Array(urls)
    @loop_extractor_args = nil
    @extractor_args = []
    @loop = nil

    @request_arguments = arguments

    @options = {
      :async => false
    }.merge(options)

    @response_count = 0
    @queued_count = 0

    @hooks = {}
    @failed_requests = []

    # NOTE(review): the original expression
    #   @options[:hydra] && @options[:hydra][:max_concurrency] || {:max_concurrency => 10}
    # evaluated to the bare *integer* whenever a value was supplied, so
    # Typhoeus::Hydra.new received a number instead of an options hash.
    max_concurrency = @options[:hydra] && @options[:hydra][:max_concurrency] || 10
    @hydra = Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
    self
  end


  # Public: Sets the scraper extraction loop.
  #
  # Delegates to Extractor, will raise an exception if neither a selector, a block, or an attribute name is provided.
  #
  # selector - The CSS3 selector identifying the node list over which iterate (optional).
  # callback - A block of code (optional).
  # attribute - An attribute name (optional).
  #
  # Returns itself.
  #
  def loop_on(*args)
    @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
    self
  end

  # Public: Registers a new extractor to be added to the loop.
  #
  # Delegates to Extractor, will raise an exception if neither a selector, a block, or an attribute name is provided.
  #
  # selector - The CSS3 selector identifying the node list over which iterate (optional).
  # callback - A block of code (optional).
  # attribute - An attribute name (optional).
  #
  # Returns itself.
  #
  def extract(*args)
    @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
    self
  end

  #
  # Public: Runs the main scraping loop.
  #
  # Returns nothing
  #
  def run
    @urls.each_with_index do |url, index|
      issue_request(url)

      # If the scraper is asynchronous, start processing the Hydra HTTP queue
      # only after the last url has been appended to it (see #issue_request).
      # NOTE(review): comparing indexes rather than `url == @urls.last` keeps
      # this correct when the url list contains duplicates.
      @hydra.run if !@options[:async] || index == @urls.size - 1
    end
    self
  end

  protected

  # Internal: Builds a Typhoeus request for the url, registers the response
  # handler, and appends the request to the Hydra queue.
  def issue_request(url)

    @request_arguments[:params] = merge_request_parameters(url)
    url_without_params = url.gsub(/\?.*/,"")

    arguments = {
      'headers' => [
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
      ].join("\n")
    }

    arguments.merge!(@request_arguments)
    request = Typhoeus::Request.new(*[url_without_params, arguments])

    request.on_complete do |response|
      handle_response(response)
    end

    log("queueing url: #{url}, params #{arguments[:params]}", :debug)
    @queued_count += 1
    @hydra.queue(request)
  end

  # Internal: Merges the query-string parameters embedded in the url with the
  # :params hash passed through the request arguments.
  #
  # Returns the merged parameter hash.
  def merge_request_parameters(url)
    url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
    return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)

    params = symbolize_keys(@request_arguments[:params] ||= {})
    url_params.merge(params)
  end

  # Internal: Runs the extraction loop against a completed response and fires
  # the :data hook with the extracted records.
  def handle_response(response)
    @response_count += 1
    @loop = prepare_loop(response)
    log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
    @loop.run

    @environment = @loop.environment
    run_hook(:data, [@loop.records, response])
  end

  # Internal: Instantiates the extraction loop for a response, picking the
  # extractor class (JSON or DOM) from the detected document format.
  def prepare_loop(response)
    format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
    extractor_class = format == :json ? JsonExtractor : DomExtractor
    loop_extractor = extractor_class.new(*@loop_extractor_args)
    extractors = @extractor_args.map { |args| extractor_class.new(*args) }
    ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
  end

  # Internal: Infers the document format from the response content type.
  #
  # Returns :json for JSON content types, :html otherwise.
  def detect_format(content_type)
    #TODO: add support for xml/rdf documents
    if content_type && content_type =~ /json$/
      :json
    else
      :html
    end
  end

end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Utils
  module ScrapingHelpers
    #
    # Generates a proc that iterates over a list of anchors
    # and collects the value of the specified parameter.
    #
    # param - the name of the query-string parameter to collect.
    #
    # Returns a lambda mapping a node list to the unique parameter values.
    #
    def values_for_param(param)
      lambda { |nodeList|
        nodeList.collect {|node|
          query = URI::parse(node.attr(:href)).query
          query.split("&").collect { |token| token.split("=") }.
            detect{ |chunks| chunks.first == param.to_s }.last
        }.uniq
      }
    end
  end

  module URIAddition
    #
    # Public
    #
    # Generates a hash representation of a uri's query string.
    #
    # Returns a hash mapping the URL query parameters (as symbols) to their
    # respective values, or nil when the uri has no query string.
    #
    # NOTE: this is intended as a decorator method for instances of URI::HTTP.
    #
    # examples:
    #
    #    URI::parse(url).extend(URIAddition).query_hash
    #

    def query_hash
      return unless self.query
      self.query.split("&").reduce({}) do |memo, item|
        param, value = *item.split("=")
        memo.merge(param.to_sym => value)
      end
    end
  end

  module DeepFetchable
    # Public: Fetches a nested value by walking a key path.
    #
    # path - a single key, or an Array of keys to follow.
    #
    # Returns the value at the end of the path, or nil as soon as any
    # intermediate key is missing.
    #
    # NOTE(review): removed the unused `next_key` local present in the
    # original loop.
    def get_in(path)
      node = self
      Array(path).each do |key|
        node = node[key]
        break unless node
      end
      node
    end
  end

  module Support
    # Public: Returns a copy of +hash+ with its keys converted to symbols
    # (keys that cannot be symbolized are kept as-is).
    #
    # NOTE(review): the original implementation merged `k => v` verbatim and
    # therefore never symbolized anything; callers such as
    # ScraperBase#merge_request_parameters rely on symbol keys.
    def symbolize_keys(hash)
      hash.reduce({}) do |memo, (k, v)|
        memo.merge((k.respond_to?(:to_sym) ? k.to_sym : k) => v)
      end
    end
    #
    # Creates instance variables from a hash.
    #
    # hash - An hash representing of instance variables to be created.
    # defaults - An hash representing the attributes' default values (optional).
    #
    # NOTE(review): removed the unused `allowed` local from the original.
    protected
    def set_attributes(hash, defaults={})
      hash.each { |key, value| self.instance_variable_set("@#{key}", value)}
      defaults.each do |key, value|
        self.instance_variable_set("@#{key}", value) unless self.instance_variable_get("@#{key}")
      end
    end
  end
end
|
data/lib/extraloop.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop" )

# Standard library
autoload :OpenStruct, "ostruct"

# Rubygems

gem "yajl-ruby"
gem "nokogiri"
gem "typhoeus"
gem "logging"


autoload :Nokogiri, "nokogiri"
autoload :Yajl, "yajl"
autoload :Typhoeus, "typhoeus"


# Extraloop components: register each constant for lazy loading
# from the lib/extraloop/ directory.

{
  :Utils                 => "utils",
  :ExtractionEnvironment => "extraction_environment",
  :ExtractorBase         => "extractor_base",
  :DomExtractor          => "dom_extractor",
  :JsonExtractor         => "json_extractor",
  :ExtractionLoop        => "extraction_loop",
  :ScraperBase           => "scraper_base",
  :Loggable              => "loggable",
  :Hookable              => "hookable",
  :IterativeScraper      => "iterative_scraper"
}.each do |constant, basename|
  autoload constant, "#{base_path}/#{basename}"
end


# monkey patch scraperbase with the Loggable module.
#
# Referencing the constants triggers the autoloads registered above; this is
# the equivalent of adding extra_loop/ to the path and requiring both
# ScraperBase and Loggable.
#
ScraperBase
Loggable


class ExtraLoop
  VERSION = '0.0.1'
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'helpers/spec_helper'

describe DomExtractor do
  before(:each) do
    stub(scraper = Object.new).options
    stub(scraper).results
    @env = ExtractionEnvironment.new(scraper)

    @html ||= <<-EOF
      <div class="entry">
       <p><a href="http://example.com">my dummy link</a></p>
      </div>
      <div class="entry exclude" />
      <div class="entry" />
    EOF

    @xml ||= <<-EOF
      <?xml version="1.0"?>
      <StandardDataObject xmlns="myns">
        <InteractionElements>
          <TargetCenter>92f4-MPA</TargetCenter>
          <Trace>7.19879</Trace>
        </InteractionElements>
      </StandardDataObject>
    EOF
  end

  describe "#new" do
    subject { described_class.new(:my_field, @env, "p a", :href) }
    it { subject.field_name.should eql(:my_field) }
  end

  context "when no attribute is provided" do
    before do
      @extractor = described_class.new(:anchor, @env, "p a")
      @document = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@document) }
      it { should eql("my dummy link") }
    end
  end


  context "when an attribute is provided" do
    before do
      @extractor = described_class.new(:anchor, @env, "p a", :href)
      @document = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@document) }
      it { should eql("http://example.com") }
    end
  end

  context "when a selector and a block is provided" do
    before do
      @extractor = described_class.new(:anchor, @env, "p a", proc { |node|
        node.text.gsub("dummy", "fancy")
      })
      @document = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@document) }
      it { should match(/my fancy/) }
    end
  end

  context "when only a block is provided" do
    before do
      @extractor = described_class.new(:anchor, @env, proc { |document|
        document.at_css("p a").text.gsub(/dummy/,'fancy')
      })
      @document = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@document) }
      it { should match(/my fancy/) }
    end
  end

  context "when only an attribute is provided" do
    before do
      @extractor = described_class.new(:url, @env, :href)
      @document = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
    end
    describe "#extract_field" do
      subject { @extractor.extract_field(@document) }
      it { should eql("hello-world") }
    end
  end

  context "when nothing but a field name is provided" do
    before do
      @extractor = described_class.new(:url, @env)
      @document = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
    end
    describe "#extract_field" do
      subject { @extractor.extract_field(@document) }
      it { should eql("Hello") }
    end
  end

  describe "extract_list" do
    context "no block provided" do
      before do
        @extractor = described_class.new(nil, @env, "div.entry")
        @document = @extractor.parse(@html)
      end

      subject { @extractor.extract_list(@document) }
      it { subject.should have(3).items }
    end

    context "block provided" do
      before do
        @extractor = described_class.new(nil, @env, "div.entry", lambda { |nodeList|
          nodeList.reject {|node| node.attr(:class).split(" ").include?('exclude') }
        })
      end

      subject { @extractor.extract_list(@html) }
      it { subject.should have(2).items }
    end
  end

  context "xml input" do
    describe "#parse" do
      before { @extractor = described_class.new(nil, @env, "entry") }

      subject { @extractor.parse(@xml) }
      it { should be_an_instance_of(Nokogiri::XML::Document)}
    end
  end


  context "html input" do
    describe "#parse" do
      before { @extractor = described_class.new(nil, @env, "entry") }

      subject { @extractor.parse(@html) }
      it { should be_an_instance_of(Nokogiri::HTML::Document)}
    end
  end

  context "non-string input" do
    describe "#parse" do
      before { @extractor = described_class.new(nil, @env, "entry") }

      it "Should raise an exception" do
        expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
      end

    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'helpers/spec_helper'

describe ExtractionLoop do

  before(:each) do
    @fake_scraper = stub!.options
    stub(@fake_scraper).results
  end

  describe "#new" do
    before do
      @loop_extractor_stub = Object.new
      stub(@loop_extractor_stub).parse {}
    end

    subject { ExtractionLoop.new(@loop_extractor_stub) }

    it "should allow read/write access to public attributes" do

      {
        :extractors => [:fake, :fake],
        :document   => nil,
        :hooks      => { }
      }.each do |attribute, value|
        subject.send("#{attribute}=", value)
        subject.send(attribute).should eql(value)
      end
    end
  end

  describe "run" do
    before(:each) do

      @extractors = [:a, :b].map do |field_name|
        extractor_stub = Object.new
        stub(extractor_stub).extract_field { |node, record| node[field_name] }
        stub(extractor_stub).field_name { field_name }
        extractor_stub
      end

      @loop_extractor = Object.new

      stub(@loop_extractor).parse { |input| Nokogiri::HTML("<html><body>Hello test!</body></html>") }

      stub(@loop_extractor).extract_list { |document|
        #list of fake dom elements
        (0..9).to_a.map { |n| {:a => n, :b => n*n } }
      }


      before_hook, before_extract_hook, after_extract_hook, after_hook = *(1..4).to_a.map { proc {} }
      hooks = {
        :before         => [before_hook],
        :before_extract => [before_extract_hook],
        :after_extract  => [after_extract_hook],
        :after          => [after_hook]
      }

      any_instance_of(ExtractionEnvironment) do |env|
        mock(env).run.with_any_args.times(20 + 2)
      end

      @extraction_loop = ExtractionLoop.new(@loop_extractor, @extractors, "fake document", hooks, @fake_scraper).run
    end

    subject { @extraction_loop.run }

    it "should produce 10 records" do
      @extraction_loop.records.size.should eql(10)
    end

    it "should run extractors" do
      @extraction_loop.records.all? { |record| record.a && record.b && record.b == record.a ** 2 }
    end

    it "should convert extracted records into OpenStruct instances" do
      @extraction_loop.records.all? { |record| record.is_a?(OpenStruct) }
    end

  end
end
|