extraloop 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -1
- data/README.md +2 -2
- data/examples/google_news_scraper.rb +1 -1
- data/examples/wikipedia_categories.rb +1 -1
- data/lib/extraloop.rb +18 -16
- data/lib/extraloop/dom_extractor.rb +38 -36
- data/lib/extraloop/extraction_environment.rb +16 -14
- data/lib/extraloop/extraction_loop.rb +37 -37
- data/lib/extraloop/extractor_base.rb +34 -33
- data/lib/extraloop/hookable.rb +18 -18
- data/lib/extraloop/iterative_scraper.rb +249 -250
- data/lib/extraloop/json_extractor.rb +27 -26
- data/lib/extraloop/loggable.rb +50 -49
- data/lib/extraloop/scraper_base.rb +144 -141
- data/lib/extraloop/utils.rb +64 -61
- data/spec/helpers/spec_helper.rb +2 -1
- metadata +24 -13
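The change that touches nearly every file is the new namespace: 0.0.2 defined ScraperBase, IterativeScraper, and the other components at the top level, while 0.0.3 nests them all inside the ExtraLoop module (and moves the VERSION constant there). Client code migrates by prefixing the constants, as in this sketch based on the README and example diffs below:

    # 0.0.2
    ScraperBase.new("http://www.alexa.com/topsites")
    IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt")

    # 0.0.3
    ExtraLoop::ScraperBase.new("http://www.alexa.com/topsites")
    ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt")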
data/History.txt
CHANGED
data/README.md
CHANGED
@@ -14,7 +14,7 @@ A basic scraper that fetches the top 25 websites from Alexa's daily top 100 list

 results = nil

-ScraperBase.
+ExtraLoop::ScraperBase.
   new("http://www.alexa.com/topsites").
   loop_on("li.site-listing").
     extract(:site_name, "h2").
@@ -29,7 +29,7 @@ An Iterative Scraper that fetches URL, title, and publisher from some 110 Google

 results = []

-IterativeScraper.
+ExtraLoop::IterativeScraper.
   new("https://www.google.com/search?tbm=nws&q=Egypt").
   set_iteration(:start, (1..101).step(10)).
   loop_on("h3", proc { |nodes| nodes.map(&:parent) }).
data/examples/google_news_scraper.rb
CHANGED
@@ -2,7 +2,7 @@ require '../lib/extraloop'

 results = []

-IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
+ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
   :log_level => :debug,
   :appenders => [Logging.appenders.stderr ]
 }).set_iteration(:start, (1..101).step(10)).
data/examples/wikipedia_categories.rb
CHANGED
@@ -33,7 +33,7 @@ request_arguments = { :params => params }

 # (used in the Google News example).
 #

-IterativeScraper.new(api_url, options, request_arguments).
+ExtraLoop::IterativeScraper.new(api_url, options, request_arguments).
   loop_on(['query', 'categorymembers']).
     extract(:title).
     extract(:ns).
data/lib/extraloop.rb
CHANGED
@@ -1,5 +1,10 @@
 base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop" )

+module ExtraLoop
+  VERSION = '0.0.3'
+end
+
+
 # Standard library
 autoload :OpenStruct, "ostruct"

@@ -16,28 +21,25 @@ autoload :Yajl, "yajl"
 autoload :Typhoeus, "typhoeus"


+
 # Extraloop components

-autoload :Utils                 , "#{base_path}/utils"
-autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
-autoload :ExtractorBase         , "#{base_path}/extractor_base"
-autoload :DomExtractor          , "#{base_path}/dom_extractor"
-autoload :JsonExtractor         , "#{base_path}/json_extractor"
-autoload :ExtractionLoop        , "#{base_path}/extraction_loop"
-autoload :ScraperBase           , "#{base_path}/scraper_base"
-autoload :Loggable              , "#{base_path}/loggable"
-autoload :Hookable
-autoload :IterativeScraper      , "#{base_path}/iterative_scraper"
+ExtraLoop.autoload :Utils                 , "#{base_path}/utils"
+ExtraLoop.autoload :ExtractionEnvironment , "#{base_path}/extraction_environment"
+ExtraLoop.autoload :ExtractorBase         , "#{base_path}/extractor_base"
+ExtraLoop.autoload :DomExtractor          , "#{base_path}/dom_extractor"
+ExtraLoop.autoload :JsonExtractor         , "#{base_path}/json_extractor"
+ExtraLoop.autoload :ExtractionLoop        , "#{base_path}/extraction_loop"
+ExtraLoop.autoload :ScraperBase           , "#{base_path}/scraper_base"
+ExtraLoop.autoload :Loggable              , "#{base_path}/loggable"
+ExtraLoop.autoload :Hookable              , "#{base_path}/hookable"
+ExtraLoop.autoload :IterativeScraper      , "#{base_path}/iterative_scraper"


 # monkey patch scraperbase with the Loggable module.
 #
 # This is the equivalent adding extra_loop/ to the path and requiring both ScraperBase and Loggable
 #
-ScraperBase
-Loggable
-
+ExtraLoop::ScraperBase
+ExtraLoop::Loggable

-class ExtraLoop
-  VERSION = '0.0.1'
-end
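With the components nested, the autoload registrations move from the top level onto the ExtraLoop module, so each constant is registered as ExtraLoop::Whatever and its file is loaded on first reference. The two bare constant references at the bottom of the file then force ScraperBase and Loggable to load immediately, which is what the "monkey patch" comment refers to. A minimal sketch of the mechanism (standard Module#autoload behavior, not code from the gem; the path is illustrative):

    module ExtraLoop
    end

    # Registers the constant without loading the file yet...
    ExtraLoop.autoload :ScraperBase, "./extraloop/scraper_base"

    # ...and the first reference to it triggers the require.
    ExtraLoop::ScraperBase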
data/lib/extraloop/dom_extractor.rb
CHANGED
@@ -1,45 +1,47 @@
-class DomExtractor < ExtractorBase
+module ExtraLoop
+  class DomExtractor < ExtractorBase

-  # Public: Runs the extractor against a document fragment (dom node or object).
-  #
-  # node - The document fragment
-  # record - The extracted record
-  #
-  # Returns the text content of the element, or the output of the extractor's callback.
-  #
+    # Public: Runs the extractor against a document fragment (dom node or object).
+    #
+    # node - The document fragment
+    # record - The extracted record
+    #
+    # Returns the text content of the element, or the output of the extractor's callback.
+    #

-  def extract_field(node, record=nil)
-    target = node = node.respond_to?(:document) ? node : parse(node)
-    target = node.at_css(@selector) if @selector
-    target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
-    target = @environment.run(target, record, &@callback) if @callback
+    def extract_field(node, record=nil)
+      target = node = node.respond_to?(:document) ? node : parse(node)
+      target = node.at_css(@selector) if @selector
+      target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
+      target = @environment.run(target, record, &@callback) if @callback

-    #if target is still a DOM node, return its text content
-    target = target.text if target.respond_to?(:text)
-    target
-  end
+      #if target is still a DOM node, return its text content
+      target = target.text if target.respond_to?(:text)
+      target
+    end

-  #
-  # Public: Extracts a list of document fragments matching the provided selector/callback
-  #
-  # input - a document (either as a string or as a parsed Nokogiri document)
-  #
-  # Returns an array of elements matching the specified selector or function
-  #
-  #
+    #
+    # Public: Extracts a list of document fragments matching the provided selector/callback
+    #
+    # input - a document (either as a string or as a parsed Nokogiri document)
+    #
+    # Returns an array of elements matching the specified selector or function
+    #
+    #

-  def extract_list(input)
-    nodes = input.respond_to?(:document) ? input : parse(input)
-    nodes = nodes.search(@selector) if @selector
-    @callback && Array(@environment.run(nodes, &@callback)) || nodes
-  end
+    def extract_list(input)
+      nodes = input.respond_to?(:document) ? input : parse(input)
+      nodes = nodes.search(@selector) if @selector
+      @callback && Array(@environment.run(nodes, &@callback)) || nodes
+    end

-  def parse(input)
-    super(input)
-    @environment.document = is_xml(input) ? Nokogiri::XML(input) : Nokogiri::HTML(input)
-  end
+    def parse(input)
+      super(input)
+      @environment.document = is_xml(input) ? Nokogiri::XML(input) : Nokogiri::HTML(input)
+    end

-  def is_xml(input)
-    input =~ /^\s*\<\?xml version=\"\d\.\d\"\?\>/
+    def is_xml(input)
+      input =~ /^\s*\<\?xml version=\"\d\.\d\"\?\>/
+    end
   end
 end
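For context, a hypothetical use of the class above (the HTML string and field name are invented; the constructor signature comes from ExtractorBase, shown later in this diff): extract_field parses a string input into a Nokogiri document, narrows it with the CSS selector, and returns the node's text when no attribute or callback is given.

    env = ExtraLoop::ExtractionEnvironment.new
    extractor = ExtraLoop::DomExtractor.new(:title, env, "h1")
    extractor.extract_field("<html><body><h1>Hello</h1></body></html>")
    # => "Hello"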
data/lib/extraloop/extraction_environment.rb
CHANGED
@@ -1,20 +1,22 @@
-
-#
+module ExtraLoop
+  # This class acts as a virtual environment within
+  # which Hook handlers and extractors run (through #run)

-class ExtractionEnvironment
-  attr_accessor :document
+  class ExtractionEnvironment
+    attr_accessor :document

-  def initialize(scraper=nil, document=nil, records=nil)
-    if scraper
-      @options = scraper.options
-      @results = scraper.results
-      @scraper = scraper
+    def initialize(scraper=nil, document=nil, records=nil)
+      if scraper
+        @options = scraper.options
+        @results = scraper.results
+        @scraper = scraper
+      end
+      @document = document
+      @records = records
     end
-    @document = document
-    @records = records
-  end

-  def run(*arguments, &block)
-    self.instance_exec(*arguments, &block)
+    def run(*arguments, &block)
+      self.instance_exec(*arguments, &block)
+    end
   end
 end
data/lib/extraloop/extraction_loop.rb
CHANGED
@@ -1,46 +1,46 @@
-
-
+module ExtraLoop
+  class ExtractionLoop
+    include Hookable

-
-
-
-
-  attr_reader :records, :environment
-  attr_accessor :extractors, :document, :hooks, :children, :parent, :scraper
-
-  def initialize(loop_extractor, extractors=[], document=nil, hooks = {}, scraper = nil)
-    @loop_extractor = loop_extractor
-    @extractors = extractors
-    @document = @loop_extractor.parse(document)
-    @records = []
-    @hooks = hooks
-    @environment = ExtractionEnvironment.new(@scraper, @document, @records)
-    self
-  end
-
-  def run
-    run_hook(:before, @document)
+    module Exceptions
+      class UnsupportedFormat < StandardError; end
+    end

-    get_nodelist.each do |node|
-      run_hook(:before_extract, [node])
-      @records << run_extractors(node)
-      run_hook(:after_extract, [node, records.last])
+    attr_reader :records, :environment
+    attr_accessor :extractors, :document, :hooks, :children, :parent, :scraper
+
+    def initialize(loop_extractor, extractors=[], document=nil, hooks = {}, scraper = nil)
+      @loop_extractor = loop_extractor
+      @extractors = extractors
+      @document = @loop_extractor.parse(document)
+      @records = []
+      @hooks = hooks
+      @environment = ExtractionEnvironment.new(@scraper, @document, @records)
+      self
     end

-    run_hook(:after, @records)
-    self
-  end
+    def run
+      run_hook(:before, @document)

+      get_nodelist.each do |node|
+        run_hook(:before_extract, [node])
+        @records << run_extractors(node)
+        run_hook(:after_extract, [node, records.last])
+      end

-  private
-  def get_nodelist
-    @loop_extractor.extract_list(@document)
-  end
+      run_hook(:after, @records)
+      self
+    end

-  def run_extractors(node)
-    record = OpenStruct.new(:extracted_at => Time.now.to_i)
-    @extractors.each { |extractor| record.send("#{extractor.field_name.to_s}=", extractor.extract_field(node, record)) }
-    record
-  end
+    private
+    def get_nodelist
+      @loop_extractor.extract_list(@document)
+    end

+    def run_extractors(node)
+      record = OpenStruct.new(:extracted_at => Time.now.to_i)
+      @extractors.each { |extractor| record.send("#{extractor.field_name.to_s}=", extractor.extract_field(node, record)) }
+      record
+    end
+  end
 end
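Each record produced by run_extractors is an OpenStruct stamped with extracted_at plus one attribute per extractor, and the four hooks fire around the loop. A hypothetical wiring (loop_extractor, extractors, and html stand in for real objects that a scraper would normally build):

    loop = ExtraLoop::ExtractionLoop.new(loop_extractor, extractors, html)
    loop.on(:after_extract) { |node, record| puts record.extracted_at }
    loop.run
    loop.records  # => [#<OpenStruct extracted_at=..., ...>, ...]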
data/lib/extraloop/extractor_base.rb
CHANGED
@@ -1,40 +1,41 @@
-
-#
-#
-#
-class ExtractorBase
-  module Exceptions
-    class WrongArgumentError < StandardError; end
-    class ExtractorParseError < StandardError; end
-  end
-
-  attr_reader :field_name
-  #
-  # Public: Initializes a Data extractor.
-  #
-  # Parameters:
-  # field_name  - The machine readable field name
-  # environment - The object within which the extractor callback will be run (using run).
-  # selector:   - The css3 selector to be used to match a specific portion of a document (optional).
-  # callback    - A block of code to which the extracted node/attribute will be passed (optional).
-  # attribute:  - A node attribute. If provided, the attribute value will be returned (optional).
-  #
-  # Returns itself
+module ExtraLoop
+  # Pseudo Abstract class.
+  # This should not be called directly
   #
+  class ExtractorBase
+    module Exceptions
+      class WrongArgumentError < StandardError; end
+      class ExtractorParseError < StandardError; end
+    end

-
-
-
+    attr_reader :field_name
+    #
+    # Public: Initializes a Data extractor.
+    #
+    # Parameters:
+    # field_name  - The machine readable field name
+    # environment - The object within which the extractor callback will be run (using run).
+    # selector:   - The css3 selector to be used to match a specific portion of a document (optional).
+    # callback    - A block of code to which the extracted node/attribute will be passed (optional).
+    # attribute:  - A node attribute. If provided, the attribute value will be returned (optional).
+    #
+    # Returns itself
+    #

-
-
-
-
-
-
+    def initialize(field_name, environment, *args)
+      @field_name = field_name
+      @environment = environment
+
+      @selector = args.find { |arg| arg.is_a?(String)}
+      args.delete(@selector) if @selector
+      @attribute = args.find { |arg| arg.is_a?(String) || arg.is_a?(Symbol) }
+      @callback = args.find { |arg| arg.respond_to?(:call) }
+      self
+    end


-  def parse(input)
-    raise Exceptions::ExtractorParseError.new "input parameter must be a string" unless input.is_a?(String)
+    def parse(input)
+      raise Exceptions::ExtractorParseError.new "input parameter must be a string" unless input.is_a?(String)
+    end
   end
 end
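The variadic initializer sniffs its optional arguments by type rather than position: the first String becomes the selector, a remaining String or Symbol the attribute, and anything callable the callback. Illustrative calls (the field names are invented; note the callback must be passed as a proc argument here, since the signature takes no block):

    env = ExtraLoop::ExtractionEnvironment.new
    # selector plus attribute
    ExtraLoop::DomExtractor.new(:link, env, "a", :href)
    # selector plus callback
    ExtraLoop::DomExtractor.new(:link, env, "a", proc { |node, record| node.text.strip })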
data/lib/extraloop/hookable.rb
CHANGED
@@ -1,26 +1,26 @@
-module Hookable
+module ExtraLoop
+  module Hookable

-  module Exceptions
-    class HookArgumentError < StandardError
+    module Exceptions
+      class HookArgumentError < StandardError
+      end
     end
-  end

-  def set_hook(hookname, handler)
-    @hooks ||= {}
-    raise Exceptions::HookArgumentError.new "handler must be a callable proc" unless handler.respond_to?(:call)
-    @hooks[hookname.to_sym] ? @hooks[hookname.to_sym].push(handler) : @hooks[hookname.to_sym] = [handler]
-    self
-  end
+    def set_hook(hookname, handler)
+      @hooks ||= {}
+      raise Exceptions::HookArgumentError.new "handler must be a callable proc" unless handler.respond_to?(:call)
+      @hooks[hookname.to_sym] ? @hooks[hookname.to_sym].push(handler) : @hooks[hookname.to_sym] = [handler]
+      self
+    end

-  def run_hook(hook, arguments)
-    return unless @hooks.has_key?(hook)
+    def run_hook(hook, arguments)
+      return unless @hooks.has_key?(hook)

-    @hooks[hook].each do |handler|
-      (@environment || ExtractionEnvironment.new ).run(*arguments, &handler)
+      @hooks[hook].each do |handler|
+        (@environment || ExtractionEnvironment.new ).run(*arguments, &handler)
+      end
     end
-  end

-  alias_method :on, :set_hook
+    alias_method :on, :set_hook
+  end
 end
-
-
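set_hook (aliased as on) accumulates handlers per hook name, and run_hook replays them inside an ExtractionEnvironment, so any class that includes the module gets a tiny event system. A self-contained sketch, assuming only the gem is loaded:

    class Pipeline
      include ExtraLoop::Hookable
    end

    pipeline = Pipeline.new
    pipeline.on(:after) { |records| puts records.size }
    pipeline.run_hook(:after, [[1, 2, 3]])  # prints 3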
data/lib/extraloop/iterative_scraper.rb
CHANGED
@@ -1,291 +1,290 @@
-class IterativeScraper < ScraperBase
+module ExtraLoop
+  class IterativeScraper < ScraperBase
+    module Exceptions
+      class NonGetAsyncRequestNotYetImplemented < StandardError; end
+    end

-  module Exceptions
-    class NonGetAsyncRequestNotYetImplemented < StandardError; end
-  end
+    #
+    # Public
+    #
+    # Initializes an iterative scraper (i.e. a scraper which can extract data from a list of several web pages).
+    #
+    # urls - One or an array of several urls.
+    # options - A hash of scraper options (optional).
+    #   async : Wether or not the scraper should issue HTTP requests synchronously or asynchronously (defaults to false).
+    #   log : Logging options (set to false to completely suppress logging).
+    #   hydra : A list of arguments to be passed in when initializing the HTTP queue (see Typheous#Hydra).
+    # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+    #
+    #
+    # Examples:
+    #
+    #  # Iterates over the first 10 pages of Google News search result for the query 'Egypt'.
+    #
+    #  IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
+    #     :appenders => [ 'example.log', :stderr],
+    #     :log_level => :debug
+    #
+    #  }).set_iteration(:start, (1..101).step(10))
+    #
+    #  # Iterates over the first 10 pages of Google News search results for the query 'Egypt' first, and then
+    #  # for the query 'Syria', issuing HTTP requests asynchronously, and ignoring ssl certificate verification.
+    #
+    #  IterativeScraper.new([
+    #     https://www.google.com/search?tbm=nws&q=Egypt",
+    #     https://www.google.com/search?tbm=nws&q=Syria"
+    #  ], {:async => true, }, {:disable_ssl_peer_verification => true
+    #
+    #  }).set_iteration(:start, (1..101).step(10))
+    #
+    # Returns itself.
+    #
+
+    def initialize(urls, options = {}, arguments = {})
+      super([], options, arguments)
+
+      @base_urls = Array(urls)
+      @iteration_set = []
+      @iteration_extractor = nil
+      @iteration_extractor_args = nil
+      @iteration_count = 0
+      @iteration_param = nil
+      @iteration_param_value = nil
+      @continue_clause_args = nil
+      self
+    end

-  #
-  # Public
-  #
-  # Initializes an iterative scraper (i.e. a scraper which can extract data from a list of several web pages).
-  #
-  # urls - One or an array of several urls.
-  # options - A hash of scraper options (optional).
-  #   async : Wether or not the scraper should issue HTTP requests synchronously or asynchronously (defaults to false).
-  #   log : Logging options (set to false to completely suppress logging).
-  #   hydra : A list of arguments to be passed in when initializing the HTTP queue (see Typheous#Hydra).
-  # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
-  #
-  #
-  # Examples:
-  #
-  #  # Iterates over the first 10 pages of Google News search result for the query 'Egypt'.
-  #
-  #  IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
-  #     :appenders => [ 'example.log', :stderr],
-  #     :log_level => :debug
-  #
-  #  }).set_iteration(:start, (1..101).step(10))
-  #
-  #  # Iterates over the first 10 pages of Google News search results for the query 'Egypt' first, and then
-  #  # for the query 'Syria', issuing HTTP requests asynchronously, and ignoring ssl certificate verification.
-  #
-  #  IterativeScraper.new([
-  #     https://www.google.com/search?tbm=nws&q=Egypt",
-  #     https://www.google.com/search?tbm=nws&q=Syria"
-  #  ], {:async => true, }, {:disable_ssl_peer_verification => true
-  #
-  #  }).set_iteration(:start, (1..101).step(10))
-  #
-  # Returns itself.
-  #
-
-  def initialize(urls, options = {}, arguments = {})
-    super([], options, arguments)
-
-    @base_urls = Array(urls)
-    @iteration_set = []
-    @iteration_extractor = nil
-    @iteration_extractor_args = nil
-    @iteration_count = 0
-    @iteration_param = nil
-    @iteration_param_value = nil
-    @continue_clause_args = nil
-    self
-  end

+    # Public
+    #
+    # Specifies the collection of values over which the scraper should iterate.
+    # At each iteration, the current value in the iteration set will be included as part of the request parameters.
+    #
+    # param - the name of the iteration parameter.
+    # args - Either an array of values, or a set the arguments to initialize an Extractor object.
+    #
+    # Examples:
+    #
+    # # Explicitly specify the iteration set (can be either a range or an array).
+    #
+    # IterativeScraper.new("http://my-site.com/events").
+    #  set_iteration(:p, 1..10).
+    #
+    # # Pass in a code block to dynamically extract the iteration set from the document.
+    # # The code block will be passed to generate an Extractor that will be run at the first
+    # # iteration. The iteration will not continue if the proc will return return a non empty
+    # # set of values.
+    #
+    # fetch_page_numbers = proc { |elements|
+    #   elements.map { |a|
+    #     a.attr(:href).match(/p=(\d+)/)
+    #     $1
+    #   }.reject { |p| p == 1 }
+    # }
+    #
+    # IterativeScraper.new("http://my-site.com/events").
+    #   set_iteration(:p, "div#pagination a", fetch_page_numbers)
+    #
+    #
+    # Returns itself.
+    #
+
+    def set_iteration(param, *args)
+      #TODO: allow passing ranges as well as arrays
+      if args.first.respond_to?(:map)
+        @iteration_set = Array(args.first).map &:to_s
+      else
+        @iteration_extractor_args = [:pagination, *args]
+      end
+      set_iteration_param(param)
+      self
+    end

-  # Public
-  #
-  # Specifies the collection of values over which the scraper should iterate.
-  # At each iteration, the current value in the iteration set will be included as part of the request parameters.
-  #
-  # param - the name of the iteration parameter.
-  # args - Either an array of values, or a set the arguments to initialize an Extractor object.
-  #
-  # Examples:
-  #
-  # # Explicitly specify the iteration set (can be either a range or an array).
-  #
-  # IterativeScraper.new("http://my-site.com/events").
-  #  set_iteration(:p, 1..10).
-  #
-  # # Pass in a code block to dynamically extract the iteration set from the document.
-  # # The code block will be passed to generate an Extractor that will be run at the first
-  # # iteration. The iteration will not continue if the proc will return return a non empty
-  # # set of values.
-  #
-  # fetch_page_numbers = proc { |elements|
-  #   elements.map { |a|
-  #     a.attr(:href).match(/p=(\d+)/)
-  #     $1
-  #   }.reject { |p| p == 1 }
-  # }
-  #
-  # IterativeScraper.new("http://my-site.com/events").
-  #   set_iteration(:p, "div#pagination a", fetch_page_numbers)
-  #
-  #
-  # Returns itself.
-  #
-
-  def set_iteration(param, *args)
-    #TODO: allow passing ranges as well as arrays
-    if args.first.respond_to?(:map)
-      @iteration_set = Array(args.first).map &:to_s
-    else
-      @iteration_extractor_args = [:pagination, *args]
+    # Public
+    #
+    # Builds an extractor and uses it to set the value of the next iteration's offset parameter.
+    # If the extractor returns nil, the iteration stops.
+    #
+    # param - A symbol identifying the itertion parameter name.
+    # extractor_args - Arguments to be passed to the extractor which will be used to evaluate the continue value
+    #
+    # Returns itself.
+
+    def continue_with(param, *extractor_args)
+      raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
+
+      @continue_clause_args = extractor_args
+      set_iteration_param(param)
+      self
     end
-    set_iteration_param(param)
-    self
-  end

-  # Public
-  #
-  # Builds an extractor and uses it to set the value of the next iteration's offset parameter.
-  # If the extractor returns nil, the iteration stops.
-  #
-  # param - A symbol identifying the itertion parameter name.
-  # extractor_args - Arguments to be passed to the extractor which will be used to evaluate the continue value
-  #
-  # Returns itself.
-
-  def continue_with(param, *extractor_args)
-    raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
-
-    @continue_clause_args = extractor_args
-    set_iteration_param(param)
-    self
-  end
+    def run
+      @base_urls.each do |base_url|

-
-
+        # run an extra iteration to determine the value of the next offset parameter (if #continue_with is used)
+        # or the entire iteration set (if #set_iteration is used).
+        (run_iteration(base_url); @iteration_count += 1 ) if @iteration_extractor_args || @continue_clause_args

-
-
-
+        while @iteration_set.at(@iteration_count)
+          method = @options[:async] ? :run_iteration_async : :run_iteration
+          send(method, base_url)
+          @iteration_count += 1
+        end

-
-
-
-      @iteration_count
+        #reset all counts
+        @queued_count = 0
+        @response_count = 0
+        @iteration_count = 0
       end
-
-      #reset all counts
-      @queued_count = 0
-      @response_count = 0
-      @iteration_count = 0
+      self
     end
-    self
-  end

-  protected
-
-  #
-  # Set the name (and optionally the default value) of the iteration parameter.
-  #
-  # param - a symbol or a hash containing the parameter name (as the key) and its default value.
-  #
-  # Returns nothing.
-  #
-  #
-  def set_iteration_param(param)
-    if param.respond_to?(:keys)
-      @iteration_param = param.keys.first
-      @iteration_param_value = param.values.first
-    else
-      @iteration_param = param
+    protected
+
+    #
+    # Set the name (and optionally the default value) of the iteration parameter.
+    #
+    # param - a symbol or a hash containing the parameter name (as the key) and its default value.
+    #
+    # Returns nothing.
+    #
+    #
+    def set_iteration_param(param)
+      if param.respond_to?(:keys)
+        @iteration_param = param.keys.first
+        @iteration_param_value = param.values.first
+      else
+        @iteration_param = param
+      end
     end
-  end

-  def default_offset
-    @iteration_param_value or "1"
-  end
+    def default_offset
+      @iteration_param_value or "1"
+    end

-  #
-  # Runs an iteration performing blocking, synchronous HTTP request per time (
-  # calls ScraperBase#run at each request)
-  #
-  # url - the current iteration's url.
-  #
-  # Returns nothing
-  #
-
-  def run_iteration(url)
-    @urls = Array(url)
-    update_request_params!
-    run_super(:run)
-  end
+    #
+    # Runs an iteration performing blocking, synchronous HTTP request per time (
+    # calls ScraperBase#run at each request)
+    #
+    # url - the current iteration's url.
+    #
+    # Returns nothing
+    #
+
+    def run_iteration(url)
+      @urls = Array(url)
+      update_request_params!
+      run_super(:run)
+    end

-  #
-  # Runs an iteration performing parallel, non-blocking HTTP requests
-  #
-  # url - The current iteration's url.
-  #
-  # Returns nothing.
-  #
-  #
-  def run_iteration_async(url)
-    error_message = "When then option 'async' is set, the IterativeScraper class currently supports only HTTP method 'get'." +
-      "If you have to use a HTTP method other than GET, you will have to set the 'async' option to false."
+    #
+    # Runs an iteration performing parallel, non-blocking HTTP requests
+    #
+    # url - The current iteration's url.
+    #
+    # Returns nothing.
+    #
+    #
+    def run_iteration_async(url)
+      error_message = "When then option 'async' is set, the IterativeScraper class currently supports only HTTP method 'get'." +
+        "If you have to use a HTTP method other than GET, you will have to set the 'async' option to false."

-    raise NonGetAsyncRequestNotYetImplemented error_message unless @request_arguments[:method].nil? || @request_arguments[:method].downcase.to_sym == :get
+      raise NonGetAsyncRequestNotYetImplemented error_message unless @request_arguments[:method].nil? || @request_arguments[:method].downcase.to_sym == :get

-    @urls << add_iteration_param(url)
+      @urls << add_iteration_param(url)

-    if @iteration_set[@iteration_count] == @iteration_set.last
-      run_super(:run)
+      if @iteration_set[@iteration_count] == @iteration_set.last
+        run_super(:run)
+      end
     end
-  end


-  #
-  # Dynamically updates the request parameter hash with the
-  # current iteration parameter value.
-  #
-  # Returns nothing.
-  #
+    #
+    # Dynamically updates the request parameter hash with the
+    # current iteration parameter value.
+    #
+    # Returns nothing.
+    #

-  def update_request_params!
-    offset = @iteration_set.at(@iteration_count) || default_offset
-    @request_arguments[:params] ||= {}
-    @request_arguments[:params][@iteration_param.to_sym] = offset
-  end
+    def update_request_params!
+      offset = @iteration_set.at(@iteration_count) || default_offset
+      @request_arguments[:params] ||= {}
+      @request_arguments[:params][@iteration_param.to_sym] = offset
+    end


-  #
-  # Ads the current iteration offset to a url as a GET parameter.
-  #
-  # url - the url to be update
-  #
-  # Returns a url with the current iteration value represented as a get parameter.
-  #
-  def add_iteration_param(url)
-    offset = @iteration_set.at(@iteration_count) || default_offset
-    param = "#{@iteration_param}=#{offset}"
-    parsed_url = URI::parse(url)
-
-    if parsed_url.query
-      parsed_url.query += param
-    else
-      parsed_url.query = param
+    #
+    # Ads the current iteration offset to a url as a GET parameter.
+    #
+    # url - the url to be update
+    #
+    # Returns a url with the current iteration value represented as a get parameter.
+    #
+    def add_iteration_param(url)
+      offset = @iteration_set.at(@iteration_count) || default_offset
+      param = "#{@iteration_param}=#{offset}"
+      parsed_url = URI::parse(url)
+
+      if parsed_url.query
+        parsed_url.query += param
+      else
+        parsed_url.query = param
+      end
+      parsed_url.to_s
     end
-    parsed_url.to_s
-  end

-  #
-  # Utility function for calling a superclass instance method.
-  #
-  # (currently used to call ScraperBase#run).
-  #
+    #
+    # Utility function for calling a superclass instance method.
+    #
+    # (currently used to call ScraperBase#run).
+    #

-  def run_super(method, args=[])
-    self.class.superclass.instance_method(method).bind(self).call(*args)
-  end
+    def run_super(method, args=[])
+      self.class.superclass.instance_method(method).bind(self).call(*args)
+    end


-  def issue_request(url)
-    # remove continue argument if this is the first iteration
-    @request_arguments[:params].delete(@iteration_param.to_sym) if @continue_clause_args && @iteration_count == 0
-    super(url)
-    # clear previous value of iteration parameter
-    @request_arguments[:params].delete(@iteration_param.to_sym) if @request_arguments[:params] && @request_arguments[:params].any?
-  end
+    def issue_request(url)
+      # remove continue argument if this is the first iteration
+      @request_arguments[:params].delete(@iteration_param.to_sym) if @continue_clause_args && @iteration_count == 0
+      super(url)
+      # clear previous value of iteration parameter
+      @request_arguments[:params].delete(@iteration_param.to_sym) if @request_arguments[:params] && @request_arguments[:params].any?
+    end


-  #
-  # Overrides ScraperBase#handle_response in order to apply the proc used to dynamically extract the iteration set.
-  #
-  # TODO: update doc
-  #
-  # returns nothing.
-  #
+    #
+    # Overrides ScraperBase#handle_response in order to apply the proc used to dynamically extract the iteration set.
+    #
+    # TODO: update doc
+    #
+    # returns nothing.
+    #

-  def handle_response(response)
-    format = @options[:format] || run_super(:detect_format, response.headers_hash['Content-Type'])
-    extractor_class = format == :json ? JsonExtractor : DomExtractor
+    def handle_response(response)
+      format = @options[:format] || run_super(:detect_format, response.headers_hash['Content-Type'])
+      extractor_class = format == :json ? JsonExtractor : DomExtractor

-    run_iteration_extractor(response.body, extractor_class) if @response_count == 0 && @iteration_extractor_args
-    run_continue_clause(response.body, extractor_class) if @continue_clause_args
+      run_iteration_extractor(response.body, extractor_class) if @response_count == 0 && @iteration_extractor_args
+      run_continue_clause(response.body, extractor_class) if @continue_clause_args

-    super(response)
-  end
+      super(response)
+    end


-  def run_continue_clause(response_body, extractor_class)
-    extractor = extractor_class.new(:continue, ExtractionEnvironment.new(self), *@continue_clause_args)
-    continue_value = extractor.extract_field(response_body)
-    #TODO: check if continue_value is valid
+    def run_continue_clause(response_body, extractor_class)
+      extractor = extractor_class.new(:continue, ExtractionEnvironment.new(self), *@continue_clause_args)
+      continue_value = extractor.extract_field(response_body)
+      #TODO: check if continue_value is valid

-    @iteration_set << "" if @iteration_count == 0 #horrible hack: please refactor
-    @iteration_set << continue_value.to_s if continue_value
-  end
+      @iteration_set << "" if @iteration_count == 0 #horrible hack: please refactor
+      @iteration_set << continue_value.to_s if continue_value
+    end

-  def run_iteration_extractor(response_body, extractor_class)
-    @iteration_extractor = extractor_class.new(*@iteration_extractor_args.insert(1, ExtractionEnvironment.new(self)))
-    #NOTE: does this default_offset make any sense?
-    @iteration_set = Array(default_offset) + @iteration_extractor.extract_list(response_body).map(&:to_s) if @iteration_extractor
+    def run_iteration_extractor(response_body, extractor_class)
+      @iteration_extractor = extractor_class.new(*@iteration_extractor_args.insert(1, ExtractionEnvironment.new(self)))
+      #NOTE: does this default_offset make any sense?
+      @iteration_set = Array(default_offset) + @iteration_extractor.extract_list(response_body).map(&:to_s) if @iteration_extractor
+    end
   end
-
-
 end