extraloop 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +2 -0
- data/README.md +135 -0
- data/examples/google_news_scraper.rb +22 -0
- data/examples/wikipedia_categories.rb +49 -0
- data/lib/extraloop/dom_extractor.rb +45 -0
- data/lib/extraloop/extraction_environment.rb +20 -0
- data/lib/extraloop/extraction_loop.rb +46 -0
- data/lib/extraloop/extractor_base.rb +40 -0
- data/lib/extraloop/hookable.rb +26 -0
- data/lib/extraloop/iterative_scraper.rb +291 -0
- data/lib/extraloop/json_extractor.rb +36 -0
- data/lib/extraloop/loggable.rb +64 -0
- data/lib/extraloop/scraper_base.rb +166 -0
- data/lib/extraloop/utils.rb +75 -0
- data/lib/extraloop.rb +43 -0
- data/spec/dom_extractor_spec.rb +165 -0
- data/spec/extraction_loop_spec.rb +76 -0
- data/spec/fixtures/doc.html +1324 -0
- data/spec/fixtures/doc.json +1 -0
- data/spec/helpers/scraper_helper.rb +46 -0
- data/spec/helpers/spec_helper.rb +12 -0
- data/spec/iterative_scraper_spec.rb +175 -0
- data/spec/json_extractor_spec.rb +146 -0
- data/spec/loggable_spec.rb +25 -0
- data/spec/scraper_base_spec.rb +178 -0
- data/spec/utils_spec.rb +44 -0
- metadata +140 -0
data/lib/extraloop/json_extractor.rb
ADDED
@@ -0,0 +1,36 @@
require 'pry'
class JsonExtractor < ExtractorBase

  def initialize(*args)
    @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
    super(*args)
  end

  def extract_field(node, record=nil)
    output = node = node.is_a?(String) ? parse(node) : node
    output = node.get_in(@path) if @path
    output = node[@attribute.to_s] if @attribute
    output = @environment.run(output, record, &@callback) if @callback

    # when no attribute and no callback is provided, try fetching by field name
    if !@attribute && !@callback
      output = node[@field_name.to_s] if node[@field_name.to_s]
    end
    output
  end

  def extract_list(input)
    # TODO: implement more clever stuff here after looking
    # into possible hash traversal techniques

    input = input.is_a?(String) ? parse(input) : input
    input = input.get_in(@path) if @path

    @callback && Array(@environment.run(input, &@callback)) || input
  end

  def parse(input)
    super(input)
    @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
  end
end
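A minimal, self-contained sketch of the lookup precedence that JsonExtractor#extract_field applies above (path, then attribute, then callback, then the field name used as a plain key). It uses the standard-library JSON parser instead of Yajl, and the document, path, and field names are invented purely for illustration:

require 'json'

# Hypothetical document and extractor settings (not part of the gem).
node      = JSON.parse('{"story": {"title": "Hello", "url": "http://example.com"}}')
path      = ["story"]   # plays the role of @path
attribute = nil         # plays the role of @attribute
callback  = nil         # plays the role of @callback
field     = "title"     # plays the role of @field_name

output = node
output = path.reduce(output) { |memo, key| memo && memo[key] } if path  # like get_in
output = output[attribute.to_s] if attribute
output = output[field] if !attribute && !callback && output[field]

puts output # => "Hello"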
data/lib/extraloop/loggable.rb
ADDED
@@ -0,0 +1,64 @@
autoload :Logging, "logging"

# Decorates a class with an instance of Logging.logger and a convenient
# helper method to log messages.

module Loggable
  protected

  #
  # Initializes the incorporated logger object.
  #
  # Returns nothing.
  #

  def init_log!
    return unless @options[:log]

    @options[:log] ||= {
      :appenders => [ Logging.appenders.stderr ],
      :log_level => :info
    }

    if @options[:log] && @options[:log][:appenders] && @options[:log][:appenders].any?
      @log = Logging.logger["#{self}"]
      @log.add_appenders(@options[:log][:appenders])
      @log.level = @options[:log] && @options[:log][:log_level] || :info
    end
  end

  #
  # Convenience method for logging messages.
  #
  # message   - the message content
  # log_level - the message's log level (can be either :info, :debug, :error, :warning; defaults to :info)
  #
  # Returns nothing.
  #

  def log(message, log_level = :info)
    @log.send(log_level, message) if @log
  end
end


#
# Monkey patches ScraperBase.
#
class ScraperBase
  include Loggable
  alias_method :base_initialize, :initialize

  #
  # Wraps ScraperBase#initialize so that the logger is set up right after the
  # base initializer runs.
  #
  # args - The arguments to be passed over to the ScraperBase#initialize method.
  #
  # Returns itself.
  #
  def initialize(*args)
    base_initialize(*args)
    init_log!
    self
  end
end
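Assuming the gem and its logging dependency are installed, the logger wired in by this monkey patch could be switched on as in the rough sketch below; the URL is a placeholder and the options shown simply mirror the defaults in init_log!:

require 'extraloop'

# Hypothetical usage sketch: init_log! runs inside ScraperBase#initialize,
# so passing a :log hash is all that is needed to get log output on stderr.
scraper = ScraperBase.new(
  "http://example.com/",
  :log => {
    :appenders => [Logging.appenders.stderr],
    :log_level => :debug
  }
)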
data/lib/extraloop/scraper_base.rb
ADDED
@@ -0,0 +1,166 @@
class ScraperBase
  include Hookable
  include Utils::Support

  attr_reader :results, :options

  #
  # Public: Initializes a web scraper.
  #
  # urls      - One or several urls.
  # options   - Hash of scraper options
  #   async     : Whether the scraper should issue HTTP requests in series or in parallel.
  #   log       : logging options (set to false to suppress logging completely; defaults to standard error).
  #     appenders : specifies where the log messages should be appended to (defaults to standard error).
  #     log_level : specifies the log level (defaults to info).
  # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
  #
  #
  #
  # Returns itself.
  #

  def initialize(urls, options = {}, arguments = {})
    @urls = Array(urls)
    @loop_extractor_args = nil
    @extractor_args = []
    @loop = nil

    @request_arguments = arguments

    @options = {
      :async => false
    }.merge(options)


    @response_count = 0
    @queued_count = 0

    @hooks = {}
    @failed_requests = []

    hydra_options = @options[:hydra] && @options[:hydra][:max_concurrency] || {:max_concurrency => 10}
    @hydra = Typhoeus::Hydra.new hydra_options
    self
  end


  # Public: Sets the scraper extraction loop.
  #
  # Delegates to Extractor; raises an exception if neither a selector, a block, nor an attribute name is provided.
  #
  #
  # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
  # callback  - A block of code (optional).
  # attribute - An attribute name (optional).
  #
  # Returns itself.
  #

  def loop_on(*args)
    @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
    self
  end

  # Public: Registers a new extractor to be added to the loop.
  #
  # Delegates to Extractor; raises an exception if neither a selector, a block, nor an attribute name is provided.
  #
  # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
  # callback  - A block of code (optional).
  # attribute - An attribute name (optional).
  #
  # Returns itself.
  #
  #

  def extract(*args)
    @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
    self
  end

  #
  # Public: Runs the main scraping loop.
  #
  # Returns nothing
  #
  def run
    @urls.each do |url|
      issue_request(url)

      # if the scraper is asynchronous, start processing the Hydra HTTP queue
      # only after the last url has been appended to the queue (see #issue_request).
      #
      if @options[:async]
        if url == @urls.last
          @hydra.run
        end
      else
        @hydra.run
      end
    end
    self
  end

  protected

  def issue_request(url)

    @request_arguments[:params] = merge_request_parameters(url)
    url_without_params = url.gsub(/\?.*/,"")

    arguments = {
      'headers' => [
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
      ].join("\n")
    }

    arguments.merge!(@request_arguments)
    request = Typhoeus::Request.new(*[url_without_params, arguments])

    request.on_complete do |response|
      handle_response(response)
    end

    log("queueing url: #{url}, params #{arguments[:params]}", :debug)
    @queued_count += 1
    @hydra.queue(request)
  end

  def merge_request_parameters(url)
    url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
    return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)

    params = symbolize_keys(@request_arguments[:params] ||= {})
    url_params.merge(params)
  end

  def handle_response(response)
    @response_count += 1
    @loop = prepare_loop(response)
    log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
    @loop.run

    @environment = @loop.environment
    run_hook(:data, [@loop.records, response])
  end

  def prepare_loop(response)
    format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
    extractor_class = format == :json ? JsonExtractor : DomExtractor
    loop_extractor = extractor_class.new(*@loop_extractor_args)
    extractors = @extractor_args.map { |args| extractor_class.new(*args) }
    ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
  end

  def detect_format(content_type)
    # TODO: add support for xml/rdf documents
    if content_type && content_type =~ /json$/
      :json
    else
      :html
    end
  end

end
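Taken together, the public methods above suggest a call sequence like the following rough sketch. The URL and CSS selectors are invented, and delivery of the extracted records happens through the :data hook fired in handle_response (the hook-registration API lives in Hookable, which is not shown in this section):

require 'extraloop'

# Hypothetical usage sketch based only on the methods defined above.
scraper = ScraperBase.new("http://example.com/news", :async => false)
scraper.
  loop_on("div.entry").          # selector for the node list to iterate over
  extract(:title, "h2 a").       # one extractor per record field
  extract(:url, "h2 a", :href).
  run                            # issues the HTTP requests and runs the extraction loop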
data/lib/extraloop/utils.rb
ADDED
@@ -0,0 +1,75 @@
module Utils
  module ScrapingHelpers
    #
    # Generates a proc that iterates over a list of anchors
    # and collects the value of the specified parameter
    #
    def values_for_param(param)
      lambda { |nodeList|
        nodeList.collect {|node|
          query = URI::parse(node.attr(:href)).query
          query.split("&").collect { |token| token.split("=") }.
            detect{ |chunks| chunks.first == param.to_s }.last
        }.uniq
      }
    end
  end

  module URIAddition
    #
    # Public
    #
    # Generates a hash representation of a uri's query string.
    #
    # Returns a hash mapping the URL query parameters to their respective values
    #
    # NOTE: this is intended as a decorator method for instances of URI::HTTP.
    #
    # examples:
    #
    #   URI::parse(url).extend(URIAddition).query_hash
    #

    def query_hash
      return unless self.query
      self.query.split("&").reduce({}) do |memo, item|
        param, value = *item.split("=")
        memo.merge(param.to_sym => value)
      end
    end
  end

  module DeepFetchable
    def get_in(path)
      keys, node = Array(path), self

      keys.each_with_index do |key, index|
        node = node[key]
        next_key = keys[index + 1]
        break unless node
      end

      node
    end
  end

  module Support
    def symbolize_keys(hash)
      hash.reduce({}) { |memo, (k,v)| memo.merge(k => v) }
    end
    #
    # Creates instance variables from a hash.
    #
    # hash     - A hash of the instance variables to be created.
    # defaults - A hash of the attributes' default values (optional).
    #
    protected
    def set_attributes(hash, defaults={})
      allowed = defaults.keys
      hash.each { |key, value| self.instance_variable_set("@#{key}", value)}
      defaults.each do |key, value|
        self.instance_variable_set("@#{key}", value) unless self.instance_variable_get("@#{key}")
      end
    end
  end
end
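The two decorator modules above are easiest to see on hand-made inputs. Assuming the gem is on the load path, the sketch below exercises URIAddition#query_hash and DeepFetchable#get_in with invented data:

require 'extraloop'
require 'uri'

# Hypothetical illustration; the URL and hash are made up.
uri = URI::parse("http://example.com/search?q=ruby&page=2").extend(Utils::URIAddition)
uri.query_hash                       # => {:q=>"ruby", :page=>"2"}

doc = { "response" => { "items" => [1, 2, 3] } }.extend(Utils::DeepFetchable)
doc.get_in(["response", "items"])    # => [1, 2, 3]
doc.get_in("response")               # => {"items"=>[1, 2, 3]}  (Array() wraps a single key)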
data/lib/extraloop.rb
ADDED
@@ -0,0 +1,43 @@
base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop" )

# Standard library
autoload :OpenStruct, "ostruct"

# Rubygems

gem "yajl-ruby"
gem "nokogiri"
gem "typhoeus"
gem "logging"


autoload :Nokogiri, "nokogiri"
autoload :Yajl, "yajl"
autoload :Typhoeus, "typhoeus"


# Extraloop components

autoload :Utils                 , "#{base_path}/utils"
autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
autoload :ExtractorBase         , "#{base_path}/extractor_base"
autoload :DomExtractor          , "#{base_path}/dom_extractor"
autoload :JsonExtractor         , "#{base_path}/json_extractor"
autoload :ExtractionLoop        , "#{base_path}/extraction_loop"
autoload :ScraperBase           , "#{base_path}/scraper_base"
autoload :Loggable              , "#{base_path}/loggable"
autoload :Hookable              , "#{base_path}/hookable"
autoload :IterativeScraper      , "#{base_path}/iterative_scraper"


# Monkey patch ScraperBase with the Loggable module.
#
# This is the equivalent of adding extraloop/ to the load path and requiring both ScraperBase and Loggable.
#
ScraperBase
Loggable


class ExtraLoop
  VERSION = '0.0.1'
end
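The bare ScraperBase / Loggable constant references above work because referencing an autoloaded constant forces its file to load immediately. A self-contained illustration of that mechanism (the Greeter class and temporary file are hypothetical, not part of the gem):

require 'tmpdir'

Dir.mktmpdir do |dir|
  # Write a throwaway file, register it with autoload, then reference the constant.
  File.write(File.join(dir, "greeter.rb"), "class Greeter; def hi; 'hi'; end; end")
  autoload :Greeter, File.join(dir, "greeter")

  Greeter              # bare constant reference, like ScraperBase / Loggable above
  puts Greeter.new.hi  # => "hi" -- the file was required as a side effect
end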
data/spec/dom_extractor_spec.rb
ADDED
@@ -0,0 +1,165 @@
require 'helpers/spec_helper'

describe DomExtractor do
  before(:each) do
    stub(scraper = Object.new).options
    stub(scraper).results
    @env = ExtractionEnvironment.new(scraper)
    @html ||= <<-EOF
      <div class="entry">
        <p><a href="http://example.com">my dummy link</a></p>
      </div>
      <div class="entry exclude" />
      <div class="entry" />
    EOF

    @xml ||= <<-EOF
      <?xml version="1.0"?>
      <StandardDataObject xmlns="myns">
        <InteractionElements>
          <TargetCenter>92f4-MPA</TargetCenter>
          <Trace>7.19879</Trace>
        </InteractionElements>
      </StandardDataObject>
    EOF
  end

  describe "#new" do
    subject { DomExtractor.new(:my_field, @env, "p a", :href) }
    it { subject.field_name.should eql(:my_field) }
  end

  context "when no attribute is provided" do
    before do
      @extractor = DomExtractor.new(:anchor, @env, "p a")
      @node = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@node) }
      it { should eql("my dummy link") }
    end
  end


  context "when an attribute is provided" do
    before do
      @extractor = DomExtractor.new(:anchor, @env, "p a", :href)
      @node = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@node) }
      it { should eql("http://example.com") }
    end
  end

  context "when a selector and a block is provided" do
    before do
      @extractor = DomExtractor.new(:anchor, @env, "p a", proc { |node|
        node.text.gsub("dummy", "fancy")
      })
      @node = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@node) }
      it { should match(/my fancy/) }
    end
  end

  context "when only a block is provided" do
    before do
      @extractor = DomExtractor.new(:anchor, @env, proc { |document|
        document.at_css("p a").text.gsub(/dummy/,'fancy')
      })
      @node = @extractor.parse(@html)
    end

    describe "#extract_field" do
      subject { @extractor.extract_field(@node) }
      it { should match(/my fancy/) }
    end
  end

  context "when only an attribute is provided" do
    before do
      @extractor = DomExtractor.new(:url, @env, :href)
      @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
    end
    describe "#extract_field" do
      subject { @extractor.extract_field(@node) }
      it { should eql("hello-world") }
    end
  end

  context "when nothing but a field name is provided" do
    before do
      @extractor = DomExtractor.new(:url, @env)
      @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
    end
    describe "#extract_field" do
      subject { @extractor.extract_field(@node) }
      it { should eql("Hello") }
    end
  end

  describe "extract_list" do
    context "no block provided" do
      before do
        @extractor = DomExtractor.new(nil, @env, "div.entry")
        @node = @extractor.parse(@html)
      end

      subject { @extractor.extract_list(@node) }
      it { subject.should have(3).items }
    end

    context "block provided" do
      before do
        @extractor = DomExtractor.new(nil, @env, "div.entry", lambda { |nodeList|
          nodeList.reject {|node| node.attr(:class).split(" ").include?('exclude') }
        })
      end

      subject { @extractor.extract_list(@html) }
      it { subject.should have(2).items }
    end
  end

  context "xml input" do
    describe "#parse" do
      before do
        @extractor = DomExtractor.new(nil, @env, "entry")
      end

      subject { @extractor.parse(@xml) }
      it { should be_an_instance_of(Nokogiri::XML::Document) }
    end
  end


  context "html input" do
    describe "#parse" do
      before do
        @extractor = DomExtractor.new(nil, @env, "entry")
      end

      subject { @extractor.parse(@html) }
      it { should be_an_instance_of(Nokogiri::HTML::Document) }
    end
  end

  context "non-string input" do
    describe "#parse" do
      before do
        @extractor = DomExtractor.new(nil, @env, "entry")
      end

      it "Should raise an exception" do
        expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
      end

    end
  end
end
data/spec/extraction_loop_spec.rb
ADDED
@@ -0,0 +1,76 @@
require 'helpers/spec_helper'

describe ExtractionLoop do

  before(:each) do
    @fake_scraper = stub!.options
    stub(@fake_scraper).results
  end

  describe "#new" do
    before do
      @mock_loop = Object.new
      stub(@mock_loop).parse {}

    end

    subject { ExtractionLoop.new(@mock_loop) }

    it "should allow read/write access to public attributes" do

      {:extractors => [:fake, :fake],
       :document => nil,
       :hooks => { }
      }.each do |k, v|
        subject.send("#{k}=", v)
        subject.send(k).should eql(v)
      end
    end
  end

  describe "run" do
    before(:each) do

      @extractors = [:a, :b].map do |field_name|
        object = Object.new
        stub(object).extract_field { |node, record| node[field_name] }
        stub(object).field_name { field_name }
        object
      end

      @loop_extractor = Object.new

      stub(@loop_extractor).parse { |input| Nokogiri::HTML("<html><body>Hello test!</body></html>") }

      stub(@loop_extractor).extract_list { |document|
        # list of fake dom elements
        (0..9).to_a.map { |n| {:a => n, :b => n*n } }
      }


      before, before_extract, after_extract, after = *(1..4).to_a.map { proc {} }
      hooks = {before: [before], before_extract: [before_extract], after_extract: [after_extract], after: [after]}

      any_instance_of(ExtractionEnvironment) do |env|
        mock(env).run.with_any_args.times(20 + 2)
      end

      @extraction_loop = ExtractionLoop.new(@loop_extractor, @extractors, "fake document", hooks, @fake_scraper).run
    end

    subject { @extraction_loop.run }

    it "should produce 10 records" do
      @extraction_loop.records.size.should eql(10)
    end

    it "should run extractors" do
      @extraction_loop.records.all? { |record| record.a && record.b && record.b == record.a ** 2 }
    end

    it "should convert extracted records into OpenStruct instances" do
      @extraction_loop.records.all? { |record| record.is_a?(OpenStruct) }
    end

  end
end