extraloop 0.0.2 → 0.0.3

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
data/History.txt CHANGED
@@ -1,2 +1,8 @@
- == 0.0.0 / 2011-01-01
+ == 0.0.3 / 2011-01-01
+ * namespaced all classes into the ExtraLoop module
+
+ == 0.0.2 / 2011-01-01
+ * changed repository URL
+
+ == 0.0.1 / 2011-01-01
  * Project Birthday!
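
The headline change in 0.0.3 is the namespacing noted above. A minimal migration sketch (class names taken from this diff; the top-level `include` is plain Ruby, not an extraloop API):

    # 0.0.2: constants were defined at the top level
    # ScraperBase.new("http://www.alexa.com/topsites")

    # 0.0.3: everything now lives under the ExtraLoop module
    ExtraLoop::ScraperBase.new("http://www.alexa.com/topsites")

    # or, to keep old call sites working, pull the constants back in:
    include ExtraLoop
    ScraperBase.new("http://www.alexa.com/topsites")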
data/README.md CHANGED
@@ -14,7 +14,7 @@ A basic scraper that fetches the top 25 websites from Alexa's daily top 100 list

  results = nil

- Scraper.
+ ExtraLoop::ScraperBase.
  new("http://www.alexa.com/topsites").
  loop_on("li.site-listing").
  extract(:site_name, "h2").
@@ -29,7 +29,7 @@ An Iterative Scraper that fetches URL, title, and publisher from some 110 Google

  results = []

- IterativeScraper.
+ ExtraLoop::IterativeScraper.
  new("https://www.google.com/search?tbm=nws&q=Egypt").
  set_iteration(:start, (1..101).step(10)).
  loop_on("h3", proc { |nodes| nodes.map(&:parent) }).
@@ -2,7 +2,7 @@ require '../lib/extraloop'

  results = []

- IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
+ ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
  :log_level => :debug,
  :appenders => [Logging.appenders.stderr ]
  }).set_iteration(:start, (1..101).step(10)).
@@ -33,7 +33,7 @@ request_arguments = { :params => params }
  # (used in the Google News example).
  #

- IterativeScraper.new(api_url, options, request_arguments).
+ ExtraLoop::IterativeScraper.new(api_url, options, request_arguments).
  loop_on(['query', 'categorymembers']).
  extract(:title).
  extract(:ns).
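
Pieced together, the updated Alexa example reads roughly as below; the `on(:data, ...)` hook and the final `run` are assumptions based on the library's usual usage pattern, since neither appears in the fragments shown in this diff:

    results = nil

    ExtraLoop::ScraperBase.
      new("http://www.alexa.com/topsites").
      loop_on("li.site-listing").
      extract(:site_name, "h2").
      on(:data, proc { |records| results = records }).  # hook name assumed
      run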
data/lib/extraloop.rb CHANGED
@@ -1,5 +1,10 @@
  base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop" )

+ module ExtraLoop
+   VERSION = '0.0.3'
+ end
+
+
  # Standard library
  autoload :OpenStruct, "ostruct"

@@ -16,28 +21,25 @@ autoload :Yajl, "yajl"
  autoload :Typhoeus, "typhoeus"


+
  # Extraloop components

- autoload :Utils , "#{base_path}/utils"
- autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
- autoload :ExtractorBase , "#{base_path}/extractor_base"
- autoload :DomExtractor , "#{base_path}/dom_extractor"
- autoload :JsonExtractor , "#{base_path}/json_extractor"
- autoload :ExtractionLoop , "#{base_path}/extraction_loop"
- autoload :ScraperBase , "#{base_path}/scraper_base"
- autoload :Loggable , "#{base_path}/loggable"
- autoload :Hookable , "#{base_path}/hookable"
- autoload :IterativeScraper , "#{base_path}/iterative_scraper"
+ ExtraLoop.autoload :Utils , "#{base_path}/utils"
+ ExtraLoop.autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
+ ExtraLoop.autoload :ExtractorBase , "#{base_path}/extractor_base"
+ ExtraLoop.autoload :DomExtractor , "#{base_path}/dom_extractor"
+ ExtraLoop.autoload :JsonExtractor , "#{base_path}/json_extractor"
+ ExtraLoop.autoload :ExtractionLoop , "#{base_path}/extraction_loop"
+ ExtraLoop.autoload :ScraperBase , "#{base_path}/scraper_base"
+ ExtraLoop.autoload :Loggable , "#{base_path}/loggable"
+ ExtraLoop.autoload :Hookable , "#{base_path}/hookable"
+ ExtraLoop.autoload :IterativeScraper , "#{base_path}/iterative_scraper"


  # monkey patch scraperbase with the Loggable module.
  #
  # This is the equivalent adding extra_loop/ to the path and requiring both ScraperBase and Loggable
  #
- ScraperBase
- Loggable
-
+ ExtraLoop::ScraperBase
+ ExtraLoop::Loggable

- class ExtraLoop
-   VERSION = '0.0.1'
- end
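
The switch from `autoload` to `ExtraLoop.autoload` is what actually moves the lazy-loaded constants into the new namespace: `Kernel#autoload` registers a top-level constant, while `Module#autoload` registers the constant under the receiver, so the file is only required on the first reference to `ExtraLoop::<Name>`. The bare `ExtraLoop::ScraperBase` and `ExtraLoop::Loggable` references then force those two files to load immediately, which is what applies the Loggable monkey patch. A minimal sketch of the mechanism:

    base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop")

    module ExtraLoop; end

    # the constant is registered under ExtraLoop; nothing is loaded yet
    ExtraLoop.autoload :ScraperBase, "#{base_path}/scraper_base"

    ExtraLoop::ScraperBase  # first reference: requires scraper_base.rb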
data/lib/extraloop/dom_extractor.rb CHANGED
@@ -1,45 +1,47 @@
- class DomExtractor < ExtractorBase
+ module ExtraLoop
+   class DomExtractor < ExtractorBase

-   # Public: Runs the extractor against a document fragment (dom node or object).
-   #
-   # node - The document fragment
-   # record - The extracted record
-   #
-   # Returns the text content of the element, or the output of the extractor's callback.
-   #
+     # Public: Runs the extractor against a document fragment (dom node or object).
+     #
+     # node - The document fragment
+     # record - The extracted record
+     #
+     # Returns the text content of the element, or the output of the extractor's callback.
+     #

-   def extract_field(node, record=nil)
-     target = node = node.respond_to?(:document) ? node : parse(node)
-     target = node.at_css(@selector) if @selector
-     target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
-     target = @environment.run(target, record, &@callback) if @callback
+     def extract_field(node, record=nil)
+       target = node = node.respond_to?(:document) ? node : parse(node)
+       target = node.at_css(@selector) if @selector
+       target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
+       target = @environment.run(target, record, &@callback) if @callback

-     #if target is still a DOM node, return its text content
-     target = target.text if target.respond_to?(:text)
-     target
-   end
+       #if target is still a DOM node, return its text content
+       target = target.text if target.respond_to?(:text)
+       target
+     end

-   #
-   # Public: Extracts a list of document fragments matching the provided selector/callback
-   #
-   # input - a document (either as a string or as a parsed Nokogiri document)
-   #
-   # Returns an array of elements matching the specified selector or function
-   #
-   #
+     #
+     # Public: Extracts a list of document fragments matching the provided selector/callback
+     #
+     # input - a document (either as a string or as a parsed Nokogiri document)
+     #
+     # Returns an array of elements matching the specified selector or function
+     #
+     #

-   def extract_list(input)
-     nodes = input.respond_to?(:document) ? input : parse(input)
-     nodes = nodes.search(@selector) if @selector
-     @callback && Array(@environment.run(nodes, &@callback)) || nodes
-   end
+     def extract_list(input)
+       nodes = input.respond_to?(:document) ? input : parse(input)
+       nodes = nodes.search(@selector) if @selector
+       @callback && Array(@environment.run(nodes, &@callback)) || nodes
+     end

-   def parse(input)
-     super(input)
-     @environment.document = is_xml(input) ? Nokogiri::XML(input) : Nokogiri::HTML(input)
-   end
+     def parse(input)
+       super(input)
+       @environment.document = is_xml(input) ? Nokogiri::XML(input) : Nokogiri::HTML(input)
+     end

-   def is_xml(input)
-     input =~ /^\s*\<\?xml version=\"\d\.\d\"\?\>/
+     def is_xml(input)
+       input =~ /^\s*\<\?xml version=\"\d\.\d\"\?\>/
+     end
    end
  end
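
A small sketch of how the two public methods behave, relying on the argument sniffing defined in ExtractorBase further down this diff (the HTML string is illustrative):

    env  = ExtraLoop::ExtractionEnvironment.new
    html = %{<li class="site-listing"><h2><a href="http://example.org">Example</a></h2></li>}

    # selector plus attribute: returns the attribute value
    ExtraLoop::DomExtractor.new(:url, env, "h2 a", :href).extract_field(html)
    # => "http://example.org"

    # selector only: falls through to the node's text content
    ExtraLoop::DomExtractor.new(:site_name, env, "h2").extract_field(html)
    # => "Example"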
data/lib/extraloop/extraction_environment.rb CHANGED
@@ -1,20 +1,22 @@
- # This class is simply used as a virtual environment within
- # which Hook handlers and extractors run (through #run)
+ module ExtraLoop
+   # This class acts as a virtual environment within
+   # which Hook handlers and extractors run (through #run)

- class ExtractionEnvironment
-   attr_accessor :document
+   class ExtractionEnvironment
+     attr_accessor :document

-   def initialize(scraper=nil, document=nil, records=nil)
-     if scraper
-       @options = scraper.options
-       @results = scraper.results
-       @scraper = scraper
+     def initialize(scraper=nil, document=nil, records=nil)
+       if scraper
+         @options = scraper.options
+         @results = scraper.results
+         @scraper = scraper
+       end
+       @document = document
+       @records = records
      end
-     @document = document
-     @records = records
-   end

-   def run(*arguments, &block)
-     self.instance_exec(*arguments, &block)
+     def run(*arguments, &block)
+       self.instance_exec(*arguments, &block)
+     end
    end
  end
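
The `instance_exec` in #run is what gives hook handlers and extractor callbacks their implicit context: the block's `self` becomes the environment, so instance variables such as `@document` and `@records` resolve against the environment's own state. A minimal sketch:

    env = ExtraLoop::ExtractionEnvironment.new(nil, "<html></html>", [])

    env.run { @document }           # => "<html></html>"  (block runs with env as self)
    env.run(1, 2) { |a, b| a + b }  # positional arguments are passed through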
data/lib/extraloop/extraction_loop.rb CHANGED
@@ -1,46 +1,46 @@
- class ExtractionLoop
-   include Hookable
+ module ExtraLoop
+   class ExtractionLoop
+     include Hookable

-   module Exceptions
-     class UnsupportedFormat < StandardError; end
-   end
-
-   attr_reader :records, :environment
-   attr_accessor :extractors, :document, :hooks, :children, :parent, :scraper
-
-   def initialize(loop_extractor, extractors=[], document=nil, hooks = {}, scraper = nil)
-     @loop_extractor = loop_extractor
-     @extractors = extractors
-     @document = @loop_extractor.parse(document)
-     @records = []
-     @hooks = hooks
-     @environment = ExtractionEnvironment.new(@scraper, @document, @records)
-     self
-   end
-
-   def run
-     run_hook(:before, @document)
+     module Exceptions
+       class UnsupportedFormat < StandardError; end
+     end

-     get_nodelist.each do |node|
-       run_hook(:before_extract, [node])
-       @records << run_extractors(node)
-       run_hook(:after_extract, [node, records.last])
+     attr_reader :records, :environment
+     attr_accessor :extractors, :document, :hooks, :children, :parent, :scraper
+
+     def initialize(loop_extractor, extractors=[], document=nil, hooks = {}, scraper = nil)
+       @loop_extractor = loop_extractor
+       @extractors = extractors
+       @document = @loop_extractor.parse(document)
+       @records = []
+       @hooks = hooks
+       @environment = ExtractionEnvironment.new(@scraper, @document, @records)
+       self
      end

-     run_hook(:after, @records)
-     self
-   end
+     def run
+       run_hook(:before, @document)

+       get_nodelist.each do |node|
+         run_hook(:before_extract, [node])
+         @records << run_extractors(node)
+         run_hook(:after_extract, [node, records.last])
+       end

-   private
-   def get_nodelist
-     @loop_extractor.extract_list(@document)
-   end
+       run_hook(:after, @records)
+       self
+     end

-   def run_extractors(node)
-     record = OpenStruct.new(:extracted_at => Time.now.to_i)
-     @extractors.each { |extractor| record.send("#{extractor.field_name.to_s}=", extractor.extract_field(node, record)) }
-     record
-   end
+     private
+     def get_nodelist
+       @loop_extractor.extract_list(@document)
+     end

+     def run_extractors(node)
+       record = OpenStruct.new(:extracted_at => Time.now.to_i)
+       @extractors.each { |extractor| record.send("#{extractor.field_name.to_s}=", extractor.extract_field(node, record)) }
+       record
+     end
+   end
  end
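
ExtractionLoop is normally wired up by ScraperBase, but driving it by hand shows the hook order encoded in #run (:before, then :before_extract/:after_extract per node, then :after). A sketch using only classes from this diff; note that run_hook splats its arguments, so the :after handler takes *records:

    env  = ExtraLoop::ExtractionEnvironment.new
    list = ExtraLoop::DomExtractor.new(nil, env, "li")    # loop extractor
    name = ExtraLoop::DomExtractor.new(:name, env, "a")   # field extractor

    extraction_loop = ExtraLoop::ExtractionLoop.new(list, [name], "<ul><li><a>x</a></li></ul>")
    extraction_loop.on(:after, proc { |*records| puts records.map(&:name) })
    extraction_loop.run  # prints: x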
data/lib/extraloop/extractor_base.rb CHANGED
@@ -1,40 +1,41 @@
- # Abstract class.
- # This should not be called directly
- #
- #
- class ExtractorBase
-   module Exceptions
-     class WrongArgumentError < StandardError; end
-     class ExtractorParseError < StandardError; end
-   end
-
-   attr_reader :field_name
-   #
-   # Public: Initializes a Data extractor.
-   #
-   # Parameters:
-   # field_name - The machine readable field name
-   # environment - The object within which the extractor callback will be run (using run).
-   # selector: - The css3 selector to be used to match a specific portion of a document (optional).
-   # callback - A block of code to which the extracted node/attribute will be passed (optional).
-   # attribute: - A node attribute. If provided, the attribute value will be returned (optional).
-   #
-   # Returns itself
+ module ExtraLoop
+   # Pseudo Abstract class.
+   # This should not be called directly
    #
+   class ExtractorBase
+     module Exceptions
+       class WrongArgumentError < StandardError; end
+       class ExtractorParseError < StandardError; end
+     end

-   def initialize(field_name, environment, *args)
-     @field_name = field_name
-     @environment = environment
+     attr_reader :field_name
+     #
+     # Public: Initializes a Data extractor.
+     #
+     # Parameters:
+     # field_name - The machine readable field name
+     # environment - The object within which the extractor callback will be run (using run).
+     # selector: - The css3 selector to be used to match a specific portion of a document (optional).
+     # callback - A block of code to which the extracted node/attribute will be passed (optional).
+     # attribute: - A node attribute. If provided, the attribute value will be returned (optional).
+     #
+     # Returns itself
+     #

-     @selector = args.find { |arg| arg.is_a?(String)}
-     args.delete(@selector) if @selector
-     @attribute = args.find { |arg| arg.is_a?(String) || arg.is_a?(Symbol) }
-     @callback = args.find { |arg| arg.respond_to?(:call) }
-     self
-   end
+     def initialize(field_name, environment, *args)
+       @field_name = field_name
+       @environment = environment
+
+       @selector = args.find { |arg| arg.is_a?(String)}
+       args.delete(@selector) if @selector
+       @attribute = args.find { |arg| arg.is_a?(String) || arg.is_a?(Symbol) }
+       @callback = args.find { |arg| arg.respond_to?(:call) }
+       self
+     end


-   def parse(input)
-     raise Exceptions::ExtractorParseError.new "input parameter must be a string" unless input.is_a?(String)
+     def parse(input)
+       raise Exceptions::ExtractorParseError.new "input parameter must be a string" unless input.is_a?(String)
+     end
    end
  end
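
Note how #initialize sniffs its trailing arguments by type rather than position: the first String becomes the selector, a remaining String or Symbol the attribute, and anything callable the callback. Two hypothetical ways to target an href (env as in the earlier sketches):

    env = ExtraLoop::ExtractionEnvironment.new

    # selector "h2 a" plus attribute :href
    ExtraLoop::DomExtractor.new(:url, env, "h2 a", :href)

    # selector "h2 a" plus a callback; the callback receives the matched node
    ExtraLoop::DomExtractor.new(:url, env, "h2 a", proc { |node| node.attr("href") })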
data/lib/extraloop/hookable.rb CHANGED
@@ -1,26 +1,26 @@
- module Hookable
+ module ExtraLoop
+   module Hookable

-   module Exceptions
-     class HookArgumentError < StandardError
+     module Exceptions
+       class HookArgumentError < StandardError
+       end
      end
-   end

-   def set_hook(hookname, handler)
-     @hooks ||= {}
-     raise Exceptions::HookArgumentError.new "handler must be a callable proc" unless handler.respond_to?(:call)
-     @hooks[hookname.to_sym] ? @hooks[hookname.to_sym].push(handler) : @hooks[hookname.to_sym] = [handler]
-     self
-   end
+     def set_hook(hookname, handler)
+       @hooks ||= {}
+       raise Exceptions::HookArgumentError.new "handler must be a callable proc" unless handler.respond_to?(:call)
+       @hooks[hookname.to_sym] ? @hooks[hookname.to_sym].push(handler) : @hooks[hookname.to_sym] = [handler]
+       self
+     end

-   def run_hook(hook, arguments)
-     return unless @hooks.has_key?(hook)
+     def run_hook(hook, arguments)
+       return unless @hooks.has_key?(hook)

-     @hooks[hook].each do |handler|
-       (@environment || ExtractionEnvironment.new ).run(*arguments, &handler)
+       @hooks[hook].each do |handler|
+         (@environment || ExtractionEnvironment.new ).run(*arguments, &handler)
+       end
      end
-   end

-   alias_method :on, :set_hook
+     alias_method :on, :set_hook
+   end
  end
-
-
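
Hookable is a plain mix-in (ExtractionLoop includes it above, and extraloop.rb patches it into ScraperBase), so its contract is easy to demonstrate in isolation; a minimal sketch with a hypothetical host class:

    class Pipeline
      include ExtraLoop::Hookable
    end

    Pipeline.new.
      on(:before, proc { |doc| puts "got #{doc}" }).  # #on is an alias of #set_hook
      run_hook(:before, ["<html></html>"])            # prints: got <html></html>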
data/lib/extraloop/iterative_scraper.rb CHANGED
@@ -1,291 +1,290 @@
- class IterativeScraper < ScraperBase
+ module ExtraLoop
+   class IterativeScraper < ScraperBase
+     module Exceptions
+       class NonGetAsyncRequestNotYetImplemented < StandardError; end
+     end

-   module Exceptions
-     class NonGetAsyncRequestNotYetImplemented < StandardError; end
-   end
+     #
+     # Public
+     #
+     # Initializes an iterative scraper (i.e. a scraper which can extract data from a list of several web pages).
+     #
+     # urls - One or an array of several urls.
+     # options - A hash of scraper options (optional).
+     # async : Wether or not the scraper should issue HTTP requests synchronously or asynchronously (defaults to false).
+     # log : Logging options (set to false to completely suppress logging).
+     # hydra : A list of arguments to be passed in when initializing the HTTP queue (see Typheous#Hydra).
+     # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+     #
+     #
+     # Examples:
+     #
+     # # Iterates over the first 10 pages of Google News search result for the query 'Egypt'.
+     #
+     # IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
+     # :appenders => [ 'example.log', :stderr],
+     # :log_level => :debug
+     #
+     # }).set_iteration(:start, (1..101).step(10))
+     #
+     # # Iterates over the first 10 pages of Google News search results for the query 'Egypt' first, and then
+     # # for the query 'Syria', issuing HTTP requests asynchronously, and ignoring ssl certificate verification.
+     #
+     # IterativeScraper.new([
+     # https://www.google.com/search?tbm=nws&q=Egypt",
+     # https://www.google.com/search?tbm=nws&q=Syria"
+     # ], {:async => true, }, {:disable_ssl_peer_verification => true
+     #
+     # }).set_iteration(:start, (1..101).step(10))
+     #
+     # Returns itself.
+     #
+
+     def initialize(urls, options = {}, arguments = {})
+       super([], options, arguments)
+
+       @base_urls = Array(urls)
+       @iteration_set = []
+       @iteration_extractor = nil
+       @iteration_extractor_args = nil
+       @iteration_count = 0
+       @iteration_param = nil
+       @iteration_param_value = nil
+       @continue_clause_args = nil
+       self
+     end

-   #
-   # Public
-   #
-   # Initializes an iterative scraper (i.e. a scraper which can extract data from a list of several web pages).
-   #
-   # urls - One or an array of several urls.
-   # options - A hash of scraper options (optional).
-   # async : Wether or not the scraper should issue HTTP requests synchronously or asynchronously (defaults to false).
-   # log : Logging options (set to false to completely suppress logging).
-   # hydra : A list of arguments to be passed in when initializing the HTTP queue (see Typheous#Hydra).
-   # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
-   #
-   #
-   # Examples:
-   #
-   # # Iterates over the first 10 pages of Google News search result for the query 'Egypt'.
-   #
-   # IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
-   # :appenders => [ 'example.log', :stderr],
-   # :log_level => :debug
-   #
-   # }).set_iteration(:start, (1..101).step(10))
-   #
-   # # Iterates over the first 10 pages of Google News search results for the query 'Egypt' first, and then
-   # # for the query 'Syria', issuing HTTP requests asynchronously, and ignoring ssl certificate verification.
-   #
-   # IterativeScraper.new([
-   # https://www.google.com/search?tbm=nws&q=Egypt",
-   # https://www.google.com/search?tbm=nws&q=Syria"
-   # ], {:async => true, }, {:disable_ssl_peer_verification => true
-   #
-   # }).set_iteration(:start, (1..101).step(10))
-   #
-   # Returns itself.
-   #
-
-   def initialize(urls, options = {}, arguments = {})
-     super([], options, arguments)
-
-     @base_urls = Array(urls)
-     @iteration_set = []
-     @iteration_extractor = nil
-     @iteration_extractor_args = nil
-     @iteration_count = 0
-     @iteration_param = nil
-     @iteration_param_value = nil
-     @continue_clause_args = nil
-     self
-   end

+     # Public
+     #
+     # Specifies the collection of values over which the scraper should iterate.
+     # At each iteration, the current value in the iteration set will be included as part of the request parameters.
+     #
+     # param - the name of the iteration parameter.
+     # args - Either an array of values, or a set the arguments to initialize an Extractor object.
+     #
+     # Examples:
+     #
+     # # Explicitly specify the iteration set (can be either a range or an array).
+     #
+     # IterativeScraper.new("http://my-site.com/events").
+     # set_iteration(:p, 1..10).
+     #
+     # # Pass in a code block to dynamically extract the iteration set from the document.
+     # # The code block will be passed to generate an Extractor that will be run at the first
+     # # iteration. The iteration will not continue if the proc will return return a non empty
+     # # set of values.
+     #
+     # fetch_page_numbers = proc { |elements|
+     # elements.map { |a|
+     # a.attr(:href).match(/p=(\d+)/)
+     # $1
+     # }.reject { |p| p == 1 }
+     # }
+     #
+     # IterativeScraper.new("http://my-site.com/events").
+     # set_iteration(:p, "div#pagination a", fetch_page_numbers)
+     #
+     #
+     # Returns itself.
+     #
+
+     def set_iteration(param, *args)
+       #TODO: allow passing ranges as well as arrays
+       if args.first.respond_to?(:map)
+         @iteration_set = Array(args.first).map &:to_s
+       else
+         @iteration_extractor_args = [:pagination, *args]
+       end
+       set_iteration_param(param)
+       self
+     end

-   # Public
-   #
-   # Specifies the collection of values over which the scraper should iterate.
-   # At each iteration, the current value in the iteration set will be included as part of the request parameters.
-   #
-   # param - the name of the iteration parameter.
-   # args - Either an array of values, or a set the arguments to initialize an Extractor object.
-   #
-   # Examples:
-   #
-   # # Explicitly specify the iteration set (can be either a range or an array).
-   #
-   # IterativeScraper.new("http://my-site.com/events").
-   # set_iteration(:p, 1..10).
-   #
-   # # Pass in a code block to dynamically extract the iteration set from the document.
-   # # The code block will be passed to generate an Extractor that will be run at the first
-   # # iteration. The iteration will not continue if the proc will return return a non empty
-   # # set of values.
-   #
-   # fetch_page_numbers = proc { |elements|
-   # elements.map { |a|
-   # a.attr(:href).match(/p=(\d+)/)
-   # $1
-   # }.reject { |p| p == 1 }
-   # }
-   #
-   # IterativeScraper.new("http://my-site.com/events").
-   # set_iteration(:p, "div#pagination a", fetch_page_numbers)
-   #
-   #
-   # Returns itself.
-   #
-
-   def set_iteration(param, *args)
-     #TODO: allow passing ranges as well as arrays
-     if args.first.respond_to?(:map)
-       @iteration_set = Array(args.first).map &:to_s
-     else
-       @iteration_extractor_args = [:pagination, *args]
+     # Public
+     #
+     # Builds an extractor and uses it to set the value of the next iteration's offset parameter.
+     # If the extractor returns nil, the iteration stops.
+     #
+     # param - A symbol identifying the itertion parameter name.
+     # extractor_args - Arguments to be passed to the extractor which will be used to evaluate the continue value
+     #
+     # Returns itself.
+
+     def continue_with(param, *extractor_args)
+       raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
+
+       @continue_clause_args = extractor_args
+       set_iteration_param(param)
+       self
      end
-     set_iteration_param(param)
-     self
-   end

-   # Public
-   #
-   # Builds an extractor and uses it to set the value of the next iteration's offset parameter.
-   # If the extractor returns nil, the iteration stops.
-   #
-   # param - A symbol identifying the itertion parameter name.
-   # extractor_args - Arguments to be passed to the extractor which will be used to evaluate the continue value
-   #
-   # Returns itself.
-
-   def continue_with(param, *extractor_args)
-     raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
-
-     @continue_clause_args = extractor_args
-     set_iteration_param(param)
-     self
-   end
+     def run
+       @base_urls.each do |base_url|

-   def run
-     @base_urls.each do |base_url|
+         # run an extra iteration to determine the value of the next offset parameter (if #continue_with is used)
+         # or the entire iteration set (if #set_iteration is used).
+         (run_iteration(base_url); @iteration_count += 1 ) if @iteration_extractor_args || @continue_clause_args

-       # run an extra iteration to determine the value of the next offset parameter (if #continue_with is used)
-       # or the entire iteration set (if #set_iteration is used).
-       (run_iteration(base_url); @iteration_count += 1 ) if @iteration_extractor_args || @continue_clause_args
+         while @iteration_set.at(@iteration_count)
+           method = @options[:async] ? :run_iteration_async : :run_iteration
+           send(method, base_url)
+           @iteration_count += 1
+         end

-       while @iteration_set.at(@iteration_count)
-         method = @options[:async] ? :run_iteration_async : :run_iteration
-         send(method, base_url)
-         @iteration_count += 1
+         #reset all counts
+         @queued_count = 0
+         @response_count = 0
+         @iteration_count = 0
        end
-
-       #reset all counts
-       @queued_count = 0
-       @response_count = 0
-       @iteration_count = 0
+       self
      end
-     self
-   end

-   protected
-
-   #
-   # Set the name (and optionally the default value) of the iteration parameter.
-   #
-   # param - a symbol or a hash containing the parameter name (as the key) and its default value.
-   #
-   # Returns nothing.
-   #
-   #
-   def set_iteration_param(param)
-     if param.respond_to?(:keys)
-       @iteration_param = param.keys.first
-       @iteration_param_value = param.values.first
-     else
-       @iteration_param = param
+     protected
+
+     #
+     # Set the name (and optionally the default value) of the iteration parameter.
+     #
+     # param - a symbol or a hash containing the parameter name (as the key) and its default value.
+     #
+     # Returns nothing.
+     #
+     #
+     def set_iteration_param(param)
+       if param.respond_to?(:keys)
+         @iteration_param = param.keys.first
+         @iteration_param_value = param.values.first
+       else
+         @iteration_param = param
+       end
      end
-   end

-   def default_offset
-     @iteration_param_value or "1"
-   end
+     def default_offset
+       @iteration_param_value or "1"
+     end

-   #
-   # Runs an iteration performing blocking, synchronous HTTP request per time (
-   # calls ScraperBase#run at each request)
-   #
-   # url - the current iteration's url.
-   #
-   # Returns nothing
-   #
-
-   def run_iteration(url)
-     @urls = Array(url)
-     update_request_params!
-     run_super(:run)
-   end
+     #
+     # Runs an iteration performing blocking, synchronous HTTP request per time (
+     # calls ScraperBase#run at each request)
+     #
+     # url - the current iteration's url.
+     #
+     # Returns nothing
+     #
+
+     def run_iteration(url)
+       @urls = Array(url)
+       update_request_params!
+       run_super(:run)
+     end

-   #
-   # Runs an iteration performing parallel, non-blocking HTTP requests
-   #
-   # url - The current iteration's url.
-   #
-   # Returns nothing.
-   #
-   #
-   def run_iteration_async(url)
-     error_message = "When then option 'async' is set, the IterativeScraper class currently supports only HTTP method 'get'." +
-       "If you have to use a HTTP method other than GET, you will have to set the 'async' option to false."
+     #
+     # Runs an iteration performing parallel, non-blocking HTTP requests
+     #
+     # url - The current iteration's url.
+     #
+     # Returns nothing.
+     #
+     #
+     def run_iteration_async(url)
+       error_message = "When then option 'async' is set, the IterativeScraper class currently supports only HTTP method 'get'." +
+         "If you have to use a HTTP method other than GET, you will have to set the 'async' option to false."

-     raise NonGetAsyncRequestNotYetImplemented error_message unless @request_arguments[:method].nil? || @request_arguments[:method].downcase.to_sym == :get
+       raise NonGetAsyncRequestNotYetImplemented error_message unless @request_arguments[:method].nil? || @request_arguments[:method].downcase.to_sym == :get

-     @urls << add_iteration_param(url)
+       @urls << add_iteration_param(url)

-     if @iteration_set[@iteration_count] == @iteration_set.last
-       run_super(:run)
+       if @iteration_set[@iteration_count] == @iteration_set.last
+         run_super(:run)
+       end
      end
-   end


-   #
-   # Dynamically updates the request parameter hash with the
-   # current iteration parameter value.
-   #
-   # Returns nothing.
-   #
+     #
+     # Dynamically updates the request parameter hash with the
+     # current iteration parameter value.
+     #
+     # Returns nothing.
+     #

-   def update_request_params!
-     offset = @iteration_set.at(@iteration_count) || default_offset
-     @request_arguments[:params] ||= {}
-     @request_arguments[:params][@iteration_param.to_sym] = offset
-   end
+     def update_request_params!
+       offset = @iteration_set.at(@iteration_count) || default_offset
+       @request_arguments[:params] ||= {}
+       @request_arguments[:params][@iteration_param.to_sym] = offset
+     end


-   #
-   # Ads the current iteration offset to a url as a GET parameter.
-   #
-   # url - the url to be update
-   #
-   # Returns a url with the current iteration value represented as a get parameter.
-   #
-   def add_iteration_param(url)
-     offset = @iteration_set.at(@iteration_count) || default_offset
-     param = "#{@iteration_param}=#{offset}"
-     parsed_url = URI::parse(url)
-
-     if parsed_url.query
-       parsed_url.query += param
-     else
-       parsed_url.query = param
+     #
+     # Ads the current iteration offset to a url as a GET parameter.
+     #
+     # url - the url to be update
+     #
+     # Returns a url with the current iteration value represented as a get parameter.
+     #
+     def add_iteration_param(url)
+       offset = @iteration_set.at(@iteration_count) || default_offset
+       param = "#{@iteration_param}=#{offset}"
+       parsed_url = URI::parse(url)
+
+       if parsed_url.query
+         parsed_url.query += param
+       else
+         parsed_url.query = param
+       end
+       parsed_url.to_s
      end
-     parsed_url.to_s
-   end

-   #
-   # Utility function for calling a superclass instance method.
-   #
-   # (currently used to call ScraperBase#run).
-   #
+     #
+     # Utility function for calling a superclass instance method.
+     #
+     # (currently used to call ScraperBase#run).
+     #

-   def run_super(method, args=[])
-     self.class.superclass.instance_method(method).bind(self).call(*args)
-   end
+     def run_super(method, args=[])
+       self.class.superclass.instance_method(method).bind(self).call(*args)
+     end


-   def issue_request(url)
-     # remove continue argument if this is the first iteration
-     @request_arguments[:params].delete(@iteration_param.to_sym) if @continue_clause_args && @iteration_count == 0
-     super(url)
-     # clear previous value of iteration parameter
-     @request_arguments[:params].delete(@iteration_param.to_sym) if @request_arguments[:params] && @request_arguments[:params].any?
-   end
+     def issue_request(url)
+       # remove continue argument if this is the first iteration
+       @request_arguments[:params].delete(@iteration_param.to_sym) if @continue_clause_args && @iteration_count == 0
+       super(url)
+       # clear previous value of iteration parameter
+       @request_arguments[:params].delete(@iteration_param.to_sym) if @request_arguments[:params] && @request_arguments[:params].any?
+     end


-   #
-   # Overrides ScraperBase#handle_response in order to apply the proc used to dynamically extract the iteration set.
-   #
-   # TODO: update doc
-   #
-   # returns nothing.
-   #
+     #
+     # Overrides ScraperBase#handle_response in order to apply the proc used to dynamically extract the iteration set.
+     #
+     # TODO: update doc
+     #
+     # returns nothing.
+     #

-   def handle_response(response)
-     format = @options[:format] || run_super(:detect_format, response.headers_hash['Content-Type'])
-     extractor_class = format == :json ? JsonExtractor : DomExtractor
+     def handle_response(response)
+       format = @options[:format] || run_super(:detect_format, response.headers_hash['Content-Type'])
+       extractor_class = format == :json ? JsonExtractor : DomExtractor

-     run_iteration_extractor(response.body, extractor_class) if @response_count == 0 && @iteration_extractor_args
-     run_continue_clause(response.body, extractor_class) if @continue_clause_args
+       run_iteration_extractor(response.body, extractor_class) if @response_count == 0 && @iteration_extractor_args
+       run_continue_clause(response.body, extractor_class) if @continue_clause_args

-     super(response)
-   end
+       super(response)
+     end


-   def run_continue_clause(response_body, extractor_class)
-     extractor = extractor_class.new(:continue, ExtractionEnvironment.new(self), *@continue_clause_args)
-     continue_value = extractor.extract_field(response_body)
-     #TODO: check if continue_value is valid
+     def run_continue_clause(response_body, extractor_class)
+       extractor = extractor_class.new(:continue, ExtractionEnvironment.new(self), *@continue_clause_args)
+       continue_value = extractor.extract_field(response_body)
+       #TODO: check if continue_value is valid

-     @iteration_set << "" if @iteration_count == 0 #horrible hack: please refactor
-     @iteration_set << continue_value.to_s if continue_value
-   end
+       @iteration_set << "" if @iteration_count == 0 #horrible hack: please refactor
+       @iteration_set << continue_value.to_s if continue_value
+     end

-   def run_iteration_extractor(response_body, extractor_class)
-     @iteration_extractor = extractor_class.new(*@iteration_extractor_args.insert(1, ExtractionEnvironment.new(self)))
-     #NOTE: does this default_offset make any sense?
-     @iteration_set = Array(default_offset) + @iteration_extractor.extract_list(response_body).map(&:to_s) if @iteration_extractor
+     def run_iteration_extractor(response_body, extractor_class)
+       @iteration_extractor = extractor_class.new(*@iteration_extractor_args.insert(1, ExtractionEnvironment.new(self)))
+       #NOTE: does this default_offset make any sense?
+       @iteration_set = Array(default_offset) + @iteration_extractor.extract_list(response_body).map(&:to_s) if @iteration_extractor
+     end
    end
-
-
  end
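
For reference, an end-to-end sketch of the 0.0.3 API assembled from the README fragments and doc comments in this diff; the `extract` selectors, the `on(:data, ...)` hook, and the final `run` reflect typical usage rather than anything shown verbatim above, so treat them as assumptions:

    require 'extraloop'

    results = []

    ExtraLoop::IterativeScraper.
      new("https://www.google.com/search?tbm=nws&q=Egypt").
      set_iteration(:start, (1..101).step(10)).
      loop_on("h3", proc { |nodes| nodes.map(&:parent) }).
      extract(:title, "a").
      extract(:url, "a", :href).
      on(:data, proc { |records| results += records }).  # hook name assumed
      run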