extraloop 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -1
- data/README.md +2 -2
- data/examples/google_news_scraper.rb +1 -1
- data/examples/wikipedia_categories.rb +1 -1
- data/lib/extraloop.rb +18 -16
- data/lib/extraloop/dom_extractor.rb +38 -36
- data/lib/extraloop/extraction_environment.rb +16 -14
- data/lib/extraloop/extraction_loop.rb +37 -37
- data/lib/extraloop/extractor_base.rb +34 -33
- data/lib/extraloop/hookable.rb +18 -18
- data/lib/extraloop/iterative_scraper.rb +249 -250
- data/lib/extraloop/json_extractor.rb +27 -26
- data/lib/extraloop/loggable.rb +50 -49
- data/lib/extraloop/scraper_base.rb +144 -141
- data/lib/extraloop/utils.rb +64 -61
- data/spec/helpers/spec_helper.rb +2 -1
- metadata +24 -13
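The change that touches nearly every file is the new namespace: 0.0.2 defined ScraperBase, IterativeScraper, and the other components at the top level, while 0.0.3 nests them all inside the ExtraLoop module (and moves the VERSION constant there). Client code migrates by prefixing the constants, as in this sketch based on the README and example diffs below:

    # 0.0.2
    ScraperBase.new("http://www.alexa.com/topsites")
    IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt")

    # 0.0.3
    ExtraLoop::ScraperBase.new("http://www.alexa.com/topsites")
    ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt")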
data/History.txt
CHANGED
data/README.md
CHANGED
@@ -14,7 +14,7 @@ A basic scraper that fetches the top 25 websites from Alexa's daily top 100 list

 results = nil

-ScraperBase.
+ExtraLoop::ScraperBase.
   new("http://www.alexa.com/topsites").
   loop_on("li.site-listing").
     extract(:site_name, "h2").
@@ -29,7 +29,7 @@ An Iterative Scraper that fetches URL, title, and publisher from some 110 Google

 results = []

-IterativeScraper.
+ExtraLoop::IterativeScraper.
   new("https://www.google.com/search?tbm=nws&q=Egypt").
   set_iteration(:start, (1..101).step(10)).
   loop_on("h3", proc { |nodes| nodes.map(&:parent) }).
data/examples/google_news_scraper.rb
CHANGED
@@ -2,7 +2,7 @@ require '../lib/extraloop'

 results = []

-IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
+ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
   :log_level => :debug,
   :appenders => [Logging.appenders.stderr ]
 }).set_iteration(:start, (1..101).step(10)).
data/examples/wikipedia_categories.rb
CHANGED
@@ -33,7 +33,7 @@ request_arguments = { :params => params }

 # (used in the Google News example).
 #

-IterativeScraper.new(api_url, options, request_arguments).
+ExtraLoop::IterativeScraper.new(api_url, options, request_arguments).
   loop_on(['query', 'categorymembers']).
     extract(:title).
     extract(:ns).
data/lib/extraloop.rb
CHANGED
@@ -1,5 +1,10 @@
 base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop" )

+module ExtraLoop
+  VERSION = '0.0.3'
+end
+
+
 # Standard library
 autoload :OpenStruct, "ostruct"

@@ -16,28 +21,25 @@ autoload :Yajl, "yajl"
 autoload :Typhoeus, "typhoeus"


+
 # Extraloop components

-autoload :Utils                 , "#{base_path}/utils"
-autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
-autoload :ExtractorBase         , "#{base_path}/extractor_base"
-autoload :DomExtractor          , "#{base_path}/dom_extractor"
-autoload :JsonExtractor         , "#{base_path}/json_extractor"
-autoload :ExtractionLoop        , "#{base_path}/extraction_loop"
-autoload :ScraperBase           , "#{base_path}/scraper_base"
-autoload :Loggable              , "#{base_path}/loggable"
-autoload :Hookable
-autoload :IterativeScraper      , "#{base_path}/iterative_scraper"
+ExtraLoop.autoload :Utils                 , "#{base_path}/utils"
+ExtraLoop.autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
+ExtraLoop.autoload :ExtractorBase         , "#{base_path}/extractor_base"
+ExtraLoop.autoload :DomExtractor          , "#{base_path}/dom_extractor"
+ExtraLoop.autoload :JsonExtractor         , "#{base_path}/json_extractor"
+ExtraLoop.autoload :ExtractionLoop        , "#{base_path}/extraction_loop"
+ExtraLoop.autoload :ScraperBase           , "#{base_path}/scraper_base"
+ExtraLoop.autoload :Loggable              , "#{base_path}/loggable"
+ExtraLoop.autoload :Hookable              , "#{base_path}/hookable"
+ExtraLoop.autoload :IterativeScraper      , "#{base_path}/iterative_scraper"


 # monkey patch scraperbase with the Loggable module.
 #
 # This is the equivalent adding extra_loop/ to the path and requiring both ScraperBase and Loggable
 #
-ScraperBase
-Loggable
-
+ExtraLoop::ScraperBase
+ExtraLoop::Loggable

-class ExtraLoop
-  VERSION = '0.0.1'
-end
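With the components nested, the autoload registrations move from the top level onto the ExtraLoop module, so each constant is registered as ExtraLoop::Whatever and its file is loaded on first reference. The two bare constant references at the bottom of the file then force ScraperBase and Loggable to load immediately, which is what the "monkey patch" comment refers to. A minimal sketch of the mechanism (standard Module#autoload behavior, not code from the gem; the path is illustrative):

    module ExtraLoop
    end

    # Registers the constant without loading the file yet...
    ExtraLoop.autoload :ScraperBase, "./extraloop/scraper_base"

    # ...and the first reference to it triggers the require.
    ExtraLoop::ScraperBase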
data/lib/extraloop/dom_extractor.rb
CHANGED
@@ -1,45 +1,47 @@
-class DomExtractor < ExtractorBase
+module ExtraLoop
+  class DomExtractor < ExtractorBase

-  # Public: Runs the extractor against a document fragment (dom node or object).
-  #
-  # node - The document fragment
-  # record - The extracted record
-  #
-  # Returns the text content of the element, or the output of the extractor's callback.
-  #
+    # Public: Runs the extractor against a document fragment (dom node or object).
+    #
+    # node - The document fragment
+    # record - The extracted record
+    #
+    # Returns the text content of the element, or the output of the extractor's callback.
+    #

-  def extract_field(node, record=nil)
-    target = node = node.respond_to?(:document) ? node : parse(node)
-    target = node.at_css(@selector) if @selector
-    target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
-    target = @environment.run(target, record, &@callback) if @callback
+    def extract_field(node, record=nil)
+      target = node = node.respond_to?(:document) ? node : parse(node)
+      target = node.at_css(@selector) if @selector
+      target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
+      target = @environment.run(target, record, &@callback) if @callback

-    #if target is still a DOM node, return its text content
-    target = target.text if target.respond_to?(:text)
-    target
-  end
+      #if target is still a DOM node, return its text content
+      target = target.text if target.respond_to?(:text)
+      target
+    end

-  #
-  # Public: Extracts a list of document fragments matching the provided selector/callback
-  #
-  # input - a document (either as a string or as a parsed Nokogiri document)
-  #
-  # Returns an array of elements matching the specified selector or function
-  #
-  #
+    #
+    # Public: Extracts a list of document fragments matching the provided selector/callback
+    #
+    # input - a document (either as a string or as a parsed Nokogiri document)
+    #
+    # Returns an array of elements matching the specified selector or function
+    #
+    #

-  def extract_list(input)
-    nodes = input.respond_to?(:document) ? input : parse(input)
-    nodes = nodes.search(@selector) if @selector
-    @callback && Array(@environment.run(nodes, &@callback)) || nodes
-  end
+    def extract_list(input)
+      nodes = input.respond_to?(:document) ? input : parse(input)
+      nodes = nodes.search(@selector) if @selector
+      @callback && Array(@environment.run(nodes, &@callback)) || nodes
+    end

-  def parse(input)
-    super(input)
-    @environment.document = is_xml(input) ? Nokogiri::XML(input) : Nokogiri::HTML(input)
-  end
+    def parse(input)
+      super(input)
+      @environment.document = is_xml(input) ? Nokogiri::XML(input) : Nokogiri::HTML(input)
+    end

-  def is_xml(input)
-    input =~ /^\s*\<\?xml version=\"\d\.\d\"\?\>/
+    def is_xml(input)
+      input =~ /^\s*\<\?xml version=\"\d\.\d\"\?\>/
+    end
   end
 end
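For context, a hypothetical use of the class above (the HTML string and field name are invented; the constructor signature comes from ExtractorBase, shown later in this diff): extract_field parses a string input into a Nokogiri document, narrows it with the CSS selector, and returns the node's text when no attribute or callback is given.

    env = ExtraLoop::ExtractionEnvironment.new
    extractor = ExtraLoop::DomExtractor.new(:title, env, "h1")
    extractor.extract_field("<html><body><h1>Hello</h1></body></html>")
    # => "Hello"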
data/lib/extraloop/extraction_environment.rb
CHANGED
@@ -1,20 +1,22 @@
-
-#
+module ExtraLoop
+  # This class acts as a virtual environment within
+  # which Hook handlers and extractors run (through #run)

-class ExtractionEnvironment
-  attr_accessor :document
+  class ExtractionEnvironment
+    attr_accessor :document

-  def initialize(scraper=nil, document=nil, records=nil)
-    if scraper
-      @options = scraper.options
-      @results = scraper.results
-      @scraper = scraper
+    def initialize(scraper=nil, document=nil, records=nil)
+      if scraper
+        @options = scraper.options
+        @results = scraper.results
+        @scraper = scraper
+      end
+      @document = document
+      @records = records
     end
-    @document = document
-    @records = records
-  end

-  def run(*arguments, &block)
-    self.instance_exec(*arguments, &block)
+    def run(*arguments, &block)
+      self.instance_exec(*arguments, &block)
+    end
   end
 end
data/lib/extraloop/extraction_loop.rb
CHANGED
@@ -1,46 +1,46 @@
-
-
+module ExtraLoop
+  class ExtractionLoop
+    include Hookable

-
-
-
-
-  attr_reader :records, :environment
-  attr_accessor :extractors, :document, :hooks, :children, :parent, :scraper
-
-  def initialize(loop_extractor, extractors=[], document=nil, hooks = {}, scraper = nil)
-    @loop_extractor = loop_extractor
-    @extractors = extractors
-    @document = @loop_extractor.parse(document)
-    @records = []
-    @hooks = hooks
-    @environment = ExtractionEnvironment.new(@scraper, @document, @records)
-    self
-  end
-
-  def run
-    run_hook(:before, @document)
+    module Exceptions
+      class UnsupportedFormat < StandardError; end
+    end

-    get_nodelist.each do |node|
-      run_hook(:before_extract, [node])
-      @records << run_extractors(node)
-      run_hook(:after_extract, [node, records.last])
+    attr_reader :records, :environment
+    attr_accessor :extractors, :document, :hooks, :children, :parent, :scraper
+
+    def initialize(loop_extractor, extractors=[], document=nil, hooks = {}, scraper = nil)
+      @loop_extractor = loop_extractor
+      @extractors = extractors
+      @document = @loop_extractor.parse(document)
+      @records = []
+      @hooks = hooks
+      @environment = ExtractionEnvironment.new(@scraper, @document, @records)
+      self
     end

-    run_hook(:after, @records)
-    self
-  end
+    def run
+      run_hook(:before, @document)

+      get_nodelist.each do |node|
+        run_hook(:before_extract, [node])
+        @records << run_extractors(node)
+        run_hook(:after_extract, [node, records.last])
+      end

-  private
-  def get_nodelist
-    @loop_extractor.extract_list(@document)
-  end
+      run_hook(:after, @records)
+      self
+    end

-  def run_extractors(node)
-    record = OpenStruct.new(:extracted_at => Time.now.to_i)
-    @extractors.each { |extractor| record.send("#{extractor.field_name.to_s}=", extractor.extract_field(node, record)) }
-    record
-  end
+    private
+    def get_nodelist
+      @loop_extractor.extract_list(@document)
+    end

+    def run_extractors(node)
+      record = OpenStruct.new(:extracted_at => Time.now.to_i)
+      @extractors.each { |extractor| record.send("#{extractor.field_name.to_s}=", extractor.extract_field(node, record)) }
+      record
+    end
+  end
 end
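Each record produced by run_extractors is an OpenStruct stamped with extracted_at plus one attribute per extractor, and the four hooks fire around the loop. A hypothetical wiring (loop_extractor, extractors, and html stand in for real objects that a scraper would normally build):

    loop = ExtraLoop::ExtractionLoop.new(loop_extractor, extractors, html)
    loop.on(:after_extract) { |node, record| puts record.extracted_at }
    loop.run
    loop.records  # => [#<OpenStruct extracted_at=..., ...>, ...]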
data/lib/extraloop/extractor_base.rb
CHANGED
@@ -1,40 +1,41 @@
-
-#
-#
-#
-class ExtractorBase
-  module Exceptions
-    class WrongArgumentError < StandardError; end
-    class ExtractorParseError < StandardError; end
-  end
-
-  attr_reader :field_name
-  #
-  # Public: Initializes a Data extractor.
-  #
-  # Parameters:
-  # field_name  - The machine readable field name
-  # environment - The object within which the extractor callback will be run (using run).
-  # selector:   - The css3 selector to be used to match a specific portion of a document (optional).
-  # callback    - A block of code to which the extracted node/attribute will be passed (optional).
-  # attribute:  - A node attribute. If provided, the attribute value will be returned (optional).
-  #
-  # Returns itself
+module ExtraLoop
+  # Pseudo Abstract class.
+  # This should not be called directly
   #
+  class ExtractorBase
+    module Exceptions
+      class WrongArgumentError < StandardError; end
+      class ExtractorParseError < StandardError; end
+    end

-
-
-
+    attr_reader :field_name
+    #
+    # Public: Initializes a Data extractor.
+    #
+    # Parameters:
+    # field_name  - The machine readable field name
+    # environment - The object within which the extractor callback will be run (using run).
+    # selector:   - The css3 selector to be used to match a specific portion of a document (optional).
+    # callback    - A block of code to which the extracted node/attribute will be passed (optional).
+    # attribute:  - A node attribute. If provided, the attribute value will be returned (optional).
+    #
+    # Returns itself
+    #

-
-
-
-
-
-
+    def initialize(field_name, environment, *args)
+      @field_name = field_name
+      @environment = environment
+
+      @selector = args.find { |arg| arg.is_a?(String)}
+      args.delete(@selector) if @selector
+      @attribute = args.find { |arg| arg.is_a?(String) || arg.is_a?(Symbol) }
+      @callback = args.find { |arg| arg.respond_to?(:call) }
+      self
+    end


-  def parse(input)
-    raise Exceptions::ExtractorParseError.new "input parameter must be a string" unless input.is_a?(String)
+    def parse(input)
+      raise Exceptions::ExtractorParseError.new "input parameter must be a string" unless input.is_a?(String)
+    end
   end
 end
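The variadic initializer sniffs its optional arguments by type rather than position: the first String becomes the selector, a remaining String or Symbol the attribute, and anything callable the callback. Illustrative calls (the field names are invented; note the callback must be passed as a proc argument here, since the signature takes no block):

    env = ExtraLoop::ExtractionEnvironment.new
    # selector plus attribute
    ExtraLoop::DomExtractor.new(:link, env, "a", :href)
    # selector plus callback
    ExtraLoop::DomExtractor.new(:link, env, "a", proc { |node, record| node.text.strip })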
data/lib/extraloop/hookable.rb
CHANGED
@@ -1,26 +1,26 @@
-module Hookable
+module ExtraLoop
+  module Hookable

-  module Exceptions
-    class HookArgumentError < StandardError
+    module Exceptions
+      class HookArgumentError < StandardError
+      end
     end
-  end

-  def set_hook(hookname, handler)
-    @hooks ||= {}
-    raise Exceptions::HookArgumentError.new "handler must be a callable proc" unless handler.respond_to?(:call)
-    @hooks[hookname.to_sym] ? @hooks[hookname.to_sym].push(handler) : @hooks[hookname.to_sym] = [handler]
-    self
-  end
+    def set_hook(hookname, handler)
+      @hooks ||= {}
+      raise Exceptions::HookArgumentError.new "handler must be a callable proc" unless handler.respond_to?(:call)
+      @hooks[hookname.to_sym] ? @hooks[hookname.to_sym].push(handler) : @hooks[hookname.to_sym] = [handler]
+      self
+    end

-  def run_hook(hook, arguments)
-    return unless @hooks.has_key?(hook)
+    def run_hook(hook, arguments)
+      return unless @hooks.has_key?(hook)

-    @hooks[hook].each do |handler|
-      (@environment || ExtractionEnvironment.new ).run(*arguments, &handler)
+      @hooks[hook].each do |handler|
+        (@environment || ExtractionEnvironment.new ).run(*arguments, &handler)
+      end
     end
-  end

-  alias_method :on, :set_hook
+    alias_method :on, :set_hook
+  end
 end
-
-
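set_hook (aliased as on) accumulates handlers per hook name, and run_hook replays them inside an ExtractionEnvironment, so any class that includes the module gets a tiny event system. A self-contained sketch, assuming only the gem is loaded:

    class Pipeline
      include ExtraLoop::Hookable
    end

    pipeline = Pipeline.new
    pipeline.on(:after) { |records| puts records.size }
    pipeline.run_hook(:after, [[1, 2, 3]])  # prints 3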
data/lib/extraloop/iterative_scraper.rb
CHANGED
@@ -1,291 +1,290 @@
-class IterativeScraper < ScraperBase
+module ExtraLoop
+  class IterativeScraper < ScraperBase
+    module Exceptions
+      class NonGetAsyncRequestNotYetImplemented < StandardError; end
+    end

-  module Exceptions
-    class NonGetAsyncRequestNotYetImplemented < StandardError; end
-  end
+    #
+    # Public
+    #
+    # Initializes an iterative scraper (i.e. a scraper which can extract data from a list of several web pages).
+    #
+    # urls - One or an array of several urls.
+    # options - A hash of scraper options (optional).
+    #   async : Wether or not the scraper should issue HTTP requests synchronously or asynchronously (defaults to false).
+    #   log : Logging options (set to false to completely suppress logging).
+    #   hydra : A list of arguments to be passed in when initializing the HTTP queue (see Typheous#Hydra).
+    # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+    #
+    #
+    # Examples:
+    #
+    #  # Iterates over the first 10 pages of Google News search result for the query 'Egypt'.
+    #
+    #  IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
+    #     :appenders => [ 'example.log', :stderr],
+    #     :log_level => :debug
+    #
+    #  }).set_iteration(:start, (1..101).step(10))
+    #
+    #  # Iterates over the first 10 pages of Google News search results for the query 'Egypt' first, and then
+    #  # for the query 'Syria', issuing HTTP requests asynchronously, and ignoring ssl certificate verification.
+    #
+    #  IterativeScraper.new([
+    #     https://www.google.com/search?tbm=nws&q=Egypt",
+    #     https://www.google.com/search?tbm=nws&q=Syria"
+    #  ], {:async => true, }, {:disable_ssl_peer_verification => true
+    #
+    #  }).set_iteration(:start, (1..101).step(10))
+    #
+    # Returns itself.
+    #
+
+    def initialize(urls, options = {}, arguments = {})
+      super([], options, arguments)
+
+      @base_urls = Array(urls)
+      @iteration_set = []
+      @iteration_extractor = nil
+      @iteration_extractor_args = nil
+      @iteration_count = 0
+      @iteration_param = nil
+      @iteration_param_value = nil
+      @continue_clause_args = nil
+      self
+    end

-  #
-  # Public
-  #
-  # Initializes an iterative scraper (i.e. a scraper which can extract data from a list of several web pages).
-  #
-  # urls - One or an array of several urls.
-  # options - A hash of scraper options (optional).
-  #   async : Wether or not the scraper should issue HTTP requests synchronously or asynchronously (defaults to false).
-  #   log : Logging options (set to false to completely suppress logging).
-  #   hydra : A list of arguments to be passed in when initializing the HTTP queue (see Typheous#Hydra).
-  # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
-  #
-  #
-  # Examples:
-  #
-  #  # Iterates over the first 10 pages of Google News search result for the query 'Egypt'.
-  #
-  #  IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
-  #     :appenders => [ 'example.log', :stderr],
-  #     :log_level => :debug
-  #
-  #  }).set_iteration(:start, (1..101).step(10))
-  #
-  #  # Iterates over the first 10 pages of Google News search results for the query 'Egypt' first, and then
-  #  # for the query 'Syria', issuing HTTP requests asynchronously, and ignoring ssl certificate verification.
-  #
-  #  IterativeScraper.new([
-  #     https://www.google.com/search?tbm=nws&q=Egypt",
-  #     https://www.google.com/search?tbm=nws&q=Syria"
-  #  ], {:async => true, }, {:disable_ssl_peer_verification => true
-  #
-  #  }).set_iteration(:start, (1..101).step(10))
-  #
-  # Returns itself.
-  #
-
-  def initialize(urls, options = {}, arguments = {})
-    super([], options, arguments)
-
-    @base_urls = Array(urls)
-    @iteration_set = []
-    @iteration_extractor = nil
-    @iteration_extractor_args = nil
-    @iteration_count = 0
-    @iteration_param = nil
-    @iteration_param_value = nil
-    @continue_clause_args = nil
-    self
-  end

+    # Public
+    #
+    # Specifies the collection of values over which the scraper should iterate.
+    # At each iteration, the current value in the iteration set will be included as part of the request parameters.
+    #
+    # param - the name of the iteration parameter.
+    # args - Either an array of values, or a set the arguments to initialize an Extractor object.
+    #
+    # Examples:
+    #
+    # # Explicitly specify the iteration set (can be either a range or an array).
+    #
+    # IterativeScraper.new("http://my-site.com/events").
+    #  set_iteration(:p, 1..10).
+    #
+    # # Pass in a code block to dynamically extract the iteration set from the document.
+    # # The code block will be passed to generate an Extractor that will be run at the first
+    # # iteration. The iteration will not continue if the proc will return return a non empty
+    # # set of values.
+    #
+    # fetch_page_numbers = proc { |elements|
+    #   elements.map { |a|
+    #     a.attr(:href).match(/p=(\d+)/)
+    #     $1
+    #   }.reject { |p| p == 1 }
+    # }
+    #
+    # IterativeScraper.new("http://my-site.com/events").
+    #   set_iteration(:p, "div#pagination a", fetch_page_numbers)
+    #
+    #
+    # Returns itself.
+    #
+
+    def set_iteration(param, *args)
+      #TODO: allow passing ranges as well as arrays
+      if args.first.respond_to?(:map)
+        @iteration_set = Array(args.first).map &:to_s
+      else
+        @iteration_extractor_args = [:pagination, *args]
+      end
+      set_iteration_param(param)
+      self
+    end

-  # Public
-  #
-  # Specifies the collection of values over which the scraper should iterate.
-  # At each iteration, the current value in the iteration set will be included as part of the request parameters.
-  #
-  # param - the name of the iteration parameter.
-  # args - Either an array of values, or a set the arguments to initialize an Extractor object.
-  #
-  # Examples:
-  #
-  # # Explicitly specify the iteration set (can be either a range or an array).
-  #
-  # IterativeScraper.new("http://my-site.com/events").
-  #  set_iteration(:p, 1..10).
-  #
-  # # Pass in a code block to dynamically extract the iteration set from the document.
-  # # The code block will be passed to generate an Extractor that will be run at the first
-  # # iteration. The iteration will not continue if the proc will return return a non empty
-  # # set of values.
-  #
-  # fetch_page_numbers = proc { |elements|
-  #   elements.map { |a|
-  #     a.attr(:href).match(/p=(\d+)/)
-  #     $1
-  #   }.reject { |p| p == 1 }
-  # }
-  #
-  # IterativeScraper.new("http://my-site.com/events").
-  #   set_iteration(:p, "div#pagination a", fetch_page_numbers)
-  #
-  #
-  # Returns itself.
-  #
-
-  def set_iteration(param, *args)
-    #TODO: allow passing ranges as well as arrays
-    if args.first.respond_to?(:map)
-      @iteration_set = Array(args.first).map &:to_s
-    else
-      @iteration_extractor_args = [:pagination, *args]
+    # Public
+    #
+    # Builds an extractor and uses it to set the value of the next iteration's offset parameter.
+    # If the extractor returns nil, the iteration stops.
+    #
+    # param - A symbol identifying the itertion parameter name.
+    # extractor_args - Arguments to be passed to the extractor which will be used to evaluate the continue value
+    #
+    # Returns itself.
+
+    def continue_with(param, *extractor_args)
+      raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
+
+      @continue_clause_args = extractor_args
+      set_iteration_param(param)
+      self
     end
-    set_iteration_param(param)
-    self
-  end

-  # Public
-  #
-  # Builds an extractor and uses it to set the value of the next iteration's offset parameter.
-  # If the extractor returns nil, the iteration stops.
-  #
-  # param - A symbol identifying the itertion parameter name.
-  # extractor_args - Arguments to be passed to the extractor which will be used to evaluate the continue value
-  #
-  # Returns itself.
-
-  def continue_with(param, *extractor_args)
-    raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
-
-    @continue_clause_args = extractor_args
-    set_iteration_param(param)
-    self
-  end
+    def run
+      @base_urls.each do |base_url|

-
-
+        # run an extra iteration to determine the value of the next offset parameter (if #continue_with is used)
+        # or the entire iteration set (if #set_iteration is used).
+        (run_iteration(base_url); @iteration_count += 1 ) if @iteration_extractor_args || @continue_clause_args

-
-
-
+        while @iteration_set.at(@iteration_count)
+          method = @options[:async] ? :run_iteration_async : :run_iteration
+          send(method, base_url)
+          @iteration_count += 1
+        end

-
-
-
-      @iteration_count
+        #reset all counts
+        @queued_count = 0
+        @response_count = 0
+        @iteration_count = 0
       end
-
-      #reset all counts
-      @queued_count = 0
-      @response_count = 0
-      @iteration_count = 0
+      self
     end
-    self
-  end

-  protected
-
-  #
-  # Set the name (and optionally the default value) of the iteration parameter.
-  #
-  # param - a symbol or a hash containing the parameter name (as the key) and its default value.
-  #
-  # Returns nothing.
-  #
-  #
-  def set_iteration_param(param)
-    if param.respond_to?(:keys)
-      @iteration_param = param.keys.first
-      @iteration_param_value = param.values.first
-    else
-      @iteration_param = param
+    protected
+
+    #
+    # Set the name (and optionally the default value) of the iteration parameter.
+    #
+    # param - a symbol or a hash containing the parameter name (as the key) and its default value.
+    #
+    # Returns nothing.
+    #
+    #
+    def set_iteration_param(param)
+      if param.respond_to?(:keys)
+        @iteration_param = param.keys.first
+        @iteration_param_value = param.values.first
+      else
+        @iteration_param = param
+      end
     end
-  end

-  def default_offset
-    @iteration_param_value or "1"
-  end
+    def default_offset
+      @iteration_param_value or "1"
+    end

-  #
-  # Runs an iteration performing blocking, synchronous HTTP request per time (
-  # calls ScraperBase#run at each request)
-  #
-  # url - the current iteration's url.
-  #
-  # Returns nothing
-  #
-
-  def run_iteration(url)
-    @urls = Array(url)
-    update_request_params!
-    run_super(:run)
-  end
+    #
+    # Runs an iteration performing blocking, synchronous HTTP request per time (
+    # calls ScraperBase#run at each request)
+    #
+    # url - the current iteration's url.
+    #
+    # Returns nothing
+    #
+
+    def run_iteration(url)
+      @urls = Array(url)
+      update_request_params!
+      run_super(:run)
+    end

-  #
-  # Runs an iteration performing parallel, non-blocking HTTP requests
-  #
-  # url - The current iteration's url.
-  #
-  # Returns nothing.
-  #
-  #
-  def run_iteration_async(url)
-    error_message = "When then option 'async' is set, the IterativeScraper class currently supports only HTTP method 'get'." +
-      "If you have to use a HTTP method other than GET, you will have to set the 'async' option to false."
+    #
+    # Runs an iteration performing parallel, non-blocking HTTP requests
+    #
+    # url - The current iteration's url.
+    #
+    # Returns nothing.
+    #
+    #
+    def run_iteration_async(url)
+      error_message = "When then option 'async' is set, the IterativeScraper class currently supports only HTTP method 'get'." +
+        "If you have to use a HTTP method other than GET, you will have to set the 'async' option to false."

-    raise NonGetAsyncRequestNotYetImplemented error_message unless @request_arguments[:method].nil? || @request_arguments[:method].downcase.to_sym == :get
+      raise NonGetAsyncRequestNotYetImplemented error_message unless @request_arguments[:method].nil? || @request_arguments[:method].downcase.to_sym == :get

-    @urls << add_iteration_param(url)
+      @urls << add_iteration_param(url)

-    if @iteration_set[@iteration_count] == @iteration_set.last
-      run_super(:run)
+      if @iteration_set[@iteration_count] == @iteration_set.last
+        run_super(:run)
+      end
     end
-  end


-  #
-  # Dynamically updates the request parameter hash with the
-  # current iteration parameter value.
-  #
-  # Returns nothing.
-  #
+    #
+    # Dynamically updates the request parameter hash with the
+    # current iteration parameter value.
+    #
+    # Returns nothing.
+    #

-  def update_request_params!
-    offset = @iteration_set.at(@iteration_count) || default_offset
-    @request_arguments[:params] ||= {}
-    @request_arguments[:params][@iteration_param.to_sym] = offset
-  end
+    def update_request_params!
+      offset = @iteration_set.at(@iteration_count) || default_offset
+      @request_arguments[:params] ||= {}
+      @request_arguments[:params][@iteration_param.to_sym] = offset
+    end


-  #
-  # Ads the current iteration offset to a url as a GET parameter.
-  #
-  # url - the url to be update
-  #
-  # Returns a url with the current iteration value represented as a get parameter.
-  #
-  def add_iteration_param(url)
-    offset = @iteration_set.at(@iteration_count) || default_offset
-    param = "#{@iteration_param}=#{offset}"
-    parsed_url = URI::parse(url)
-
-    if parsed_url.query
-      parsed_url.query += param
-    else
-      parsed_url.query = param
+    #
+    # Ads the current iteration offset to a url as a GET parameter.
+    #
+    # url - the url to be update
+    #
+    # Returns a url with the current iteration value represented as a get parameter.
+    #
+    def add_iteration_param(url)
+      offset = @iteration_set.at(@iteration_count) || default_offset
+      param = "#{@iteration_param}=#{offset}"
+      parsed_url = URI::parse(url)
+
+      if parsed_url.query
+        parsed_url.query += param
+      else
+        parsed_url.query = param
+      end
+      parsed_url.to_s
     end
-    parsed_url.to_s
-  end

-  #
-  # Utility function for calling a superclass instance method.
-  #
-  # (currently used to call ScraperBase#run).
-  #
+    #
+    # Utility function for calling a superclass instance method.
+    #
+    # (currently used to call ScraperBase#run).
+    #

-  def run_super(method, args=[])
-    self.class.superclass.instance_method(method).bind(self).call(*args)
-  end
+    def run_super(method, args=[])
+      self.class.superclass.instance_method(method).bind(self).call(*args)
+    end


-  def issue_request(url)
-    # remove continue argument if this is the first iteration
-    @request_arguments[:params].delete(@iteration_param.to_sym) if @continue_clause_args && @iteration_count == 0
-    super(url)
-    # clear previous value of iteration parameter
-    @request_arguments[:params].delete(@iteration_param.to_sym) if @request_arguments[:params] && @request_arguments[:params].any?
-  end
+    def issue_request(url)
+      # remove continue argument if this is the first iteration
+      @request_arguments[:params].delete(@iteration_param.to_sym) if @continue_clause_args && @iteration_count == 0
+      super(url)
+      # clear previous value of iteration parameter
+      @request_arguments[:params].delete(@iteration_param.to_sym) if @request_arguments[:params] && @request_arguments[:params].any?
+    end


-  #
-  # Overrides ScraperBase#handle_response in order to apply the proc used to dynamically extract the iteration set.
-  #
-  # TODO: update doc
-  #
-  # returns nothing.
-  #
+    #
+    # Overrides ScraperBase#handle_response in order to apply the proc used to dynamically extract the iteration set.
+    #
+    # TODO: update doc
+    #
+    # returns nothing.
+    #

-  def handle_response(response)
-    format = @options[:format] || run_super(:detect_format, response.headers_hash['Content-Type'])
-    extractor_class = format == :json ? JsonExtractor : DomExtractor
+    def handle_response(response)
+      format = @options[:format] || run_super(:detect_format, response.headers_hash['Content-Type'])
+      extractor_class = format == :json ? JsonExtractor : DomExtractor

-    run_iteration_extractor(response.body, extractor_class) if @response_count == 0 && @iteration_extractor_args
-    run_continue_clause(response.body, extractor_class) if @continue_clause_args
+      run_iteration_extractor(response.body, extractor_class) if @response_count == 0 && @iteration_extractor_args
+      run_continue_clause(response.body, extractor_class) if @continue_clause_args

-    super(response)
-  end
+      super(response)
+    end


-  def run_continue_clause(response_body, extractor_class)
-    extractor = extractor_class.new(:continue, ExtractionEnvironment.new(self), *@continue_clause_args)
-    continue_value = extractor.extract_field(response_body)
-    #TODO: check if continue_value is valid
+    def run_continue_clause(response_body, extractor_class)
+      extractor = extractor_class.new(:continue, ExtractionEnvironment.new(self), *@continue_clause_args)
+      continue_value = extractor.extract_field(response_body)
+      #TODO: check if continue_value is valid

-    @iteration_set << "" if @iteration_count == 0 #horrible hack: please refactor
-    @iteration_set << continue_value.to_s if continue_value
-  end
+      @iteration_set << "" if @iteration_count == 0 #horrible hack: please refactor
+      @iteration_set << continue_value.to_s if continue_value
+    end

-  def run_iteration_extractor(response_body, extractor_class)
-    @iteration_extractor = extractor_class.new(*@iteration_extractor_args.insert(1, ExtractionEnvironment.new(self)))
-    #NOTE: does this default_offset make any sense?
-    @iteration_set = Array(default_offset) + @iteration_extractor.extract_list(response_body).map(&:to_s) if @iteration_extractor
+    def run_iteration_extractor(response_body, extractor_class)
+      @iteration_extractor = extractor_class.new(*@iteration_extractor_args.insert(1, ExtractionEnvironment.new(self)))
+      #NOTE: does this default_offset make any sense?
+      @iteration_set = Array(default_offset) + @iteration_extractor.extract_list(response_body).map(&:to_s) if @iteration_extractor
+    end
   end
-
-
 end