RubyGems - scrubber-scrubyt - Versions diffs - 0.4.11 - Mend

scrubber-scrubyt 0.4.11

Files changed (45) hide show

data/CHANGELOG +343 -0
data/COPYING +340 -0
data/README +99 -0
data/Rakefile +101 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +167 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +140 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/lib/scrubyt.rb +43 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +115 -0

data/lib/scrubyt/core/scraping/pattern.rb ADDED Viewed

@@ -0,0 +1,359 @@
+require 'rubygems'
+require 'hpricot'
+module Scrubyt
+  ##
+  #=<tt>Group more filters into one</tt>
+  #
+  #Serves as an umbrella for filters which conceptually extract
+  #the same thing - for example a price or a title.
+  #
+  #Sometimes the same piece of information can not be extracted with a single
+  #filter across all result instances (for example a price has one XPath in
+  #record n, but since record n+1 has a discount price as well, the real price
+  #is pushed to a different XPath there) - in this case all the filters which
+  #extract the same thing are held in the same pattern.
+  class Pattern
+    #Type of the pattern;
+    # TODO: Update documentation
+    #    # a root pattern represents a (surprise!) root pattern
+    #    PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
+    #    # a tree pattern represents a HTML region
+    #    PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
+    #    # represents an attribute of the node extracted by the parent pattern
+    #    PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
+    #    # represents a pattern which filters its output with a regexp
+    #    PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
+    #    # represents a pattern which crawls to the detail page and extracts information from there
+    #    PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
+    #    # represents a download pattern
+    #    PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
+    #    # write out the HTML subtree beginning at the matched element
+    #    PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
+    VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
+    # :determine - default value, represent that type of example need determine
+    # :string    - represent node with example type EXAMPLE_TYPE_STRING
+    VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]
+    #The pattern can be either a model pattern (in this case it is
+    #written to the output) or a temp pattern (in this case it is skipped)
+    #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
+    #is considered to be a model pattern
+    #Model pattern are shown in the output
+    #    OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
+    #    #Temp patterns are skipped in the output (their ancestors are appended to the parent
+    #    #of the pattern which was skipped)
+    #    OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
+    VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
+    #These options can be set upon wrapper creation
+    PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
+    VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
+    attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
+                  :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
+    attr_reader(:next_page_url, :result_indexer)
+    option_reader(:type => :tree, :output_type => :model, :generalize => false,
+                  :write_text => lambda { @children.size == 0 }, :limit => nil,
+                  :default => nil, :resolve => :full, :except => [], :example_type => :determine)
+    #Create a pattern from one declaration of the extractor DSL.
+    #
+    #name::      name of the pattern
+    #args::      leading examples (Strings, Regexps, compound example Hashes or a Proc),
+    #            optionally followed by an options Hash as the last element
+    #extractor:: the extractor this pattern belongs to
+    #parent::    the enclosing pattern; nil for a root pattern
+    #block::     definitions of the child patterns; for a pattern whose name matches
+    #            /.+_detail/ the block is stored as the referenced extractor instead
+    #
+    #The order of the steps below matters: the examples and options have to be
+    #known before the filters are created, and the filters of this pattern have
+    #to own their XPaths before the children can make theirs relative.
+    def initialize(name, args=[], extractor=nil, parent=nil, &block)
+      #init attributes
+      @name = name
+      @extractor = extractor
+      @parent = parent
+      @options = {}
+      @children = []
+      @filters = []
+      @constraints = []
+      @modifier_calls = []
+      #grab any examples that are defined
+      examples = look_for_examples(args)
+      #parse the options hash if provided (it is always the last argument)
+      parse_options_hash(args[-1]) if args[-1].is_a? Hash
+      #perform checks for special cases: name based shortcuts are consulted
+      #only if the user did not give any example
+      examples = check_if_shortcut_pattern() if examples == nil
+      check_if_detail_page(block)
+      @options[:output_type] = :page_list if name == 'page_list'
+      #create filters: one per example, or a single default filter without example
+      if examples == nil
+        @filters << Scrubyt::BaseFilter.create(self) #create a default filter
+      else
+        examples.each do |example|
+          @filters << Scrubyt::BaseFilter.create(self,example) #create a filter
+        end
+      end
+      #by default, generalize the root pattern, but only in the case if
+      #@generalize was not set up explicitly
+      if @options[:generalize].nil?
+        @options[:generalize] = true if parent.nil?
+        #a String example ending in a bracketed part which starts with a letter
+        #switches generalization off again
+        @options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
+      end
+      #parse child patterns if available (the block of a detail page pattern
+      #is not a list of children, see check_if_detail_page)
+      parse_child_patterns(&block) if ( !block.nil? && type != :detail_page )
+      #tree pattern only (TODO: subclass?)
+      if type == :tree
+        #generate xpaths and regexps
+        @filters.each do |filter|
+          #the XPath of the 'next_page' pattern is not generated here
+          filter.generate_XPath_for_example(false) unless @name == 'next_page'
+          filter.generate_regexp_for_example
+        end
+        #when the xpaths of this pattern have been created, its children can make their xpaths relative
+        xpaths = @filters.collect { |filter| filter.xpath }
+        @children.each do |child|
+          child.generate_relative_XPaths xpaths
+        end
+      end
+    end
+    def generate_relative_XPaths(parent_xpaths)
+      return if type != :tree
+      raise ArgumentError.new if parent_xpaths.size != 1 && parent_xpaths.size != @filters.size #TODO: should be checked earlier with proper error message
+      @filters.each_index do |index|
+        @filters[index].generate_relative_XPath parent_xpaths[parent_xpaths.size == 1 ? 0 : index]
+      end
+    end
+    #Shortcut patterns, as their name says, are a shortcut for creating patterns
+    #from predefined rules; for example:
+    #
+    #  detail_url
+    #
+    #  is equivalent to
+    #
+    #  detail_url 'href', type => :attribute
+    #
+    #i.e. the system figures out on it's own that because of the postfix, the
+    #example should be looked up (but it should never override the user input!)
+    #another example (will be available later):
+    #
+    # every_img
+    #
+    # is equivivalent to
+    #
+    # every_img '//img'
+    #
+    def check_if_shortcut_pattern()
+      if @name =~ /.+_url/
+        @options[:type] = :attribute
+        ['href']
+      end
+    end
+    #Check whether the currently created pattern is a detail pattern (i.e. it refrences
+    #a subextractor). Also check if the currently created pattern is
+    #an ancestor of a detail pattern , and store this in a hash if yes (to be able to
+    #traverse the pattern structure on detail pages as well).
+    def check_if_detail_page(block)
+      if @name =~ /.+_detail/
+        @options[:type] = :detail_page
+        @referenced_extractor = block
+      end
+    end
+    def parent_of_leaf
+      @children.inject(false) { |is_parent_of_leaf, child| is_parent_of_leaf || child.children.empty? }
+    end
+    def filter_count
+      @filters.size
+    end
+    def parse_child_patterns(&block)
+      context = Object.new
+      context.instance_eval do
+        def current=(value)
+          @current = value
+        end
+        def method_missing(method_name, *args, &block)
+          if method_name.to_s[0..0] == '_'
+            #add hash option
+            key = method_name.to_s[1..-1].to_sym
+            check_option(key)
+            args.each do |arg|
+              current_value = @current.options[key]
+              if current_value.nil?
+                @current.options[key] = arg
+              else
+                @current.options[key] = [current_value] if !current_value.is_a Array
+                @current.options[key] << arg
+              end
+            end
+          else
+            #create child pattern
+            child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
+            @current.children << child
+            child
+          end
+        end
+      end
+      context.current = self
+      context.instance_eval(&block)
+    end
+    #Dispatcher function; The class was already too big so I have decided to factor
+    #out some methods based on their functionality (like output, adding constraints)
+    #to utility classes.
+    #
+    #The second function besides dispatching is to lookup the results in an evaluated
+    #wrapper, for example
+    #
+    # camera_data.item[1].item_name[0]
+    def method_missing(method_name, *args, &block)
+      if @extractor.evaluating_extractor_definition
+        @modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
+      end
+      case method_name.to_s
+      when 'select_indices'
+        @result_indexer = Scrubyt::ResultIndexer.new(*args)
+        return self
+      when /^ensure_/
+        @constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
+        return self #To make chaining possible
+      else
+        @children.each { |child| return child if child.name == method_name.to_s }
+      end
+      raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
+    end
+    #Evaluate this pattern (and recursively its children) on the given source.
+    #
+    #source::         the object handed to the filters (the document for a root
+    #                 pattern, a result of the parent pattern for a child)
+    #filter_indices:: indices of the filters to apply; nil means all filters
+    #
+    #Returns the list of result nodes for the :model output type and an empty
+    #list for :page_list (the nodes are handed to the extractor instead). The
+    #final case statement has no branch for :temp, so nil is returned then.
+    #A detail page pattern returns whatever its first filter returns.
+    def evaluate(source, filter_indices)
+      if type == :detail_page # DIRTY!
+        return @filters[0].evaluate(source)
+      end
+      #we apply all filters if filter_indices is nil
+      indices_to_evaluate = filter_indices.nil? ? 0...@filters.size : filter_indices
+      #stores the results of all filters
+      all_filter_results = []
+      #remembers which filters have returned a certain result
+      indices_mapping = {}
+      #evaluate filters and collect filter results
+      indices_to_evaluate.each do |filter_index|
+        filter = @filters[filter_index]
+        filter_results = filter.evaluate(source)
+        filter_results.each do |result|
+          #add result to list if not already there
+          all_filter_results << result if all_filter_results.index(result).nil?
+          #add the current filter's index to the mapping
+           (indices_mapping[result] ||= []) << filter_index
+        end
+      end
+      #apply constraints: a result is kept only if every constraint accepts it
+      if @constraints.size > 0
+        all_filter_results = all_filter_results.select do |result|
+          @constraints.inject(true) { |accepted, constraint| accepted && constraint.check(result) }
+        end
+      end
+      #apply indexer
+      all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results) if !@result_indexer.nil?
+      #create result nodes and evaluate children
+      result_nodes = []
+      all_filter_results.each do |result|
+        #create result node
+        node = ResultNode.new(@name, result, @options)
+        node.generated_by_leaf = (@children.size == 0)
+        #evaluate children
+        @children.each do |child|
+          #a child must have as many filters as its parent, unless the parent has just one
+          raise if self.filter_count != 1 && child.filter_count != self.filter_count
+          if self.filter_count == 1
+            #evaluate all child filters
+            node.push(*child.evaluate(result, nil))
+          else
+            #evaluate appropriate child filters: only those with the same index
+            #as the filters of this pattern which have produced the result
+            node.push(*child.evaluate(result, indices_mapping[result]))
+          end
+        end
+        #apply child constraints (ensure_presence_of_pattern)
+        required_child_names = @constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN }.map {|c| c.target}
+        unless required_child_names.empty?
+          #walk the subtree of the node and tick off every required name found
+          check = lambda { |node_to_check|
+            required_child_names.delete node_to_check.name
+            node_to_check.each { |child| check.call child }
+          }
+          check.call node
+        end
+        #names left over are missing from the subtree - drop this result
+        next unless required_child_names.empty?
+        #add the current result node to the list
+        result_nodes << node
+      end
+      #fall back to the :default option if nothing was extracted
+      if result_nodes.empty?
+        result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
+      end
+      case output_type
+        when :model
+          return result_nodes
+        when :page_list
+          #results of a page list are not part of the output; they are
+          #handed over to the extractor as pages to be visited
+          result_nodes.each do |result_node|
+            @extractor.add_to_next_page_list result_node
+          end
+          return []
+      end
+    end
+    private
+    def parse_options_hash(hash)
+      #merge provided hash
+      @options.merge!(hash)
+      #check if valid
+      hash.each { |key, value| check_option(key.to_sym) }
+      raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
+      raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
+      raise "Invalid example type: #{example_type.to_s}" if VALID_PATTERN_EXAMPLE_TYPES.index(example_type.to_sym).nil?
+    end
+    def check_option(option)
+      raise "Unknown pattern option: #{option.to_s}" if VALID_OPTIONS.index(option).nil?
+    end
+    def look_for_examples(args)
+      if (args[0].is_a? String)
+        examples = args.select {|e| e.is_a? String}
+        #Check if all the String parameters are really the first
+        #parameters
+        args[0..examples.size-1].each do |example|
+          if !example.is_a? String
+            puts 'FATAL: Problem with example specification'
+          end
+        end
+      elsif (args[0].is_a? Regexp)
+        examples = args.select {|e| e.is_a? Regexp}
+        #Check if all the String parameters are really the first
+        #parameters
+        args[0..examples.size].each do |example|
+          if !example.is_a? Regexp
+            puts 'FATAL: Problem with example specification'
+          end
+        end
+        @options[:type] = :regexp
+      elsif (args[0].is_a? Hash)
+        examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
+        examples = nil if examples == []
+      elsif (args[0].is_a? Proc)
+        examples = [args[0]]
+      end
+      @has_examples = !examples.nil?
+      examples
+    end
+  end #end of class Pattern
+end #end of module Scrubyt

data/lib/scrubyt/core/scraping/pre_filter_document.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Scrubyt
+  ##
+  #=<tt>Apply different functions on the input document</tt>
+  #Before the document is passed to Hpricot for parsing, we may need
+  #to do different stuff with it which are clumsy/not appropriate/impossible
+  #to do once the document is loaded.
+  class PreFilterDocument
+     #Replace <br/> tags with newlines
+     def self.br_to_newline(doc)
+       doc.gsub(/<br[ \/]*>/i, "\r\n")
+       doc = doc.tr("\240"," ")
+     end #end of function  br_to_newline
+  end #end of class PreFilterDocument
+end #end of module Scrubyt

data/lib/scrubyt/core/scraping/result_indexer.rb ADDED Viewed

@@ -0,0 +1,90 @@
+module Scrubyt
+  ##
+  #=<tt>Selecting results based on indices</tt>
+  #
+  #If the results is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
+  #probably with a variable count of results (like tags, authors etc.), you may need just
+  #specific elements - like the last one, every third one, or at specific indices.
+  #In this case you should use the select_indices syntax.
+  class ResultIndexer
+    attr_reader :indices_to_extract
+    def initialize(*args)
+      select_indices(*args)
+    end
+    ##
+    #Perform selection of the desires result instances, based on their indices
+    def select_indices_to_extract(ary)
+      return ary if @indices_to_extract == nil
+      to_keep = []
+      @indices_to_extract.each {|e|
+        if e.is_a? Symbol
+          case e
+          when :first
+            to_keep << 0
+          when :last
+            to_keep << ary.size-1
+          when :all_but_last
+           (0..ary.size-2).each {|i| to_keep << i}
+          when :all_but_first
+           (1..ary.size-1).each {|i| to_keep << i}
+          when :every_even
+           (0..ary.size).each {|i| to_keep << i if (i % 2 == 1)}
+          when :every_odd
+           (0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
+          when :every_second
+           (0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
+          when :every_third
+           (0..ary.size).each {|i| to_keep << i if (i % 3 == 0)}
+          when :every_fourth
+           (0..ary.size).each {|i| to_keep << i if (i % 4 == 0)}
+          end
+        end
+      }
+      @indices_to_extract.each {|i| to_keep << i if !i.is_a? Symbol}
+      to_keep.sort!
+      ary.reject! {|e| !to_keep.include? ary.index(e)}
+      ary
+    end
+    private
+    ##
+    #Do not return the whole result set, just specified indices - like
+    #first,last, every odd index, indices from [1..3] etc.
+    #
+    #This method can accept:
+    #- a range, like (2..3)
+    #- an array of indices, like [1,2,3]
+    #- specified set of keywords:
+    #  - :first
+    #  - :last
+    #  - :every_even
+    #  - :every_odd
+    #  (there can be more of these keywords in one select_indices call)
+    def select_indices(*args)
+      indices_to_grab = args[0]
+      case indices_to_grab.class.to_s
+      when "Range"
+        @indices_to_extract = indices_to_grab.to_a
+      when "Array"
+        nested_arrays = []
+        indices_to_grab.each {|e|
+          if e.is_a? Array
+            nested_arrays << e
+          elsif e.is_a? Range
+            nested_arrays << e.to_a
+          end
+        }
+        @indices_to_extract = indices_to_grab
+        nested_arrays.each {|a| a.each {|e| @indices_to_extract << e if !@indices_to_extract.include? e }}
+        @indices_to_extract.reject! {|e| ((e.is_a? Range) || (e.is_a? Array)) }
+      when "Symbol"
+        #parse this when  we already have the results
+        @indices_to_extract = [indices_to_grab]
+      else
+        puts "Invalid index specification"
+      end
+    end #end of function select_indices
+  end #end of class ResultIndexer
+end #end of module Scrubyt

data/lib/scrubyt/core/shared/extractor.rb ADDED Viewed

@@ -0,0 +1,167 @@
+module Scrubyt
+  ##
+  #=<tt>Driving the whole extraction process</tt>
+  #
+  #Extractor is a performer class - it gets an extractor definition and carries
+  #out the actions and evaluates the wrappers sequentially.
+  #
+  #Originally also the navigation actions were here, but since the class got too
+  #big, they were factored out to an own class, NavigationAction.
+  class Extractor
+    include FetchAction
+    attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
+    #The definition of the extractor is passed through this method
+    def self.define(mode=nil, &extractor_definition)
+      if mode.is_a?(Hash)
+        if mode[:agent]==:firefox
+          FetchAction.class_eval do
+            include Navigation::Firewatir
+          end
+        else
+          FetchAction.class_eval do
+            include Navigation::Mechanize
+          end
+        end
+      else
+        FetchAction.class_eval do
+          include Navigation::Mechanize
+        end
+      end
+      extractor = self.new(mode, extractor_definition)
+      extractor.result
+    end
+    #Run the extractor whose definition is stored in the given file.
+    #The content of the file is evaluated as Ruby code and its value is passed
+    #to define as the extractor definition block.
+    #
+    #NOTE(review): the file is executed with eval - load extractor
+    #definitions from trusted sources only.
+    def self.load(filename)
+      define(&eval(IO.read(filename)))
+    end
+    #Evaluate the extractor definition and run the extraction right away; the
+    #outcome is available through the result accessor afterwards.
+    #
+    #mode::                 stored in @mode; :production is logged as
+    #                       'Production', anything else as 'Learning'
+    #extractor_definition:: the block written in the extractor DSL
+    #
+    #Exits the process if the definition does not create any root pattern.
+    def initialize(mode, extractor_definition)
+      @mode = mode
+      @root_patterns = []
+      @next_page_pattern = nil
+      #      @hpricot_doc = nil
+      #      @hpricot_doc_url = nil
+      @evaluating_extractor_definition = false
+      @next_page_list = []
+      @processed_pages = []
+      #the file name is taken from the second entry of the backtrace
+      #NOTE(review): presumably the file which defines the extractor - this
+      #depends on the call depth, verify against SharedUtils.get_backtrace
+      backtrace = SharedUtils.get_backtrace
+      parts = backtrace[1].split(':')
+      source_file = parts[0]
+      Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
+      #patterns record the modifier calls they receive while this flag is set
+      @evaluating_extractor_definition = true
+      #the definition is evaluated in the scope of a throw-away object which
+      #offers the navigation actions and turns every unknown method call
+      #into a root pattern
+      context = Object.new
+      context.extend NavigationActions
+      context.instance_eval do
+        def extractor=(value)
+          @extractor = value
+        end
+        def next_page(*args)
+          @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
+        end
+        def method_missing(method_name, *args, &block)
+          root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
+          @extractor.root_patterns << root_pattern
+          root_pattern
+        end
+      end
+      context.extractor = self
+      context.instance_eval(&extractor_definition)
+      @evaluating_extractor_definition = false
+      if @root_patterns.empty?
+        # TODO: this should be an exception
+        Scrubyt.log :ERROR, 'No extractor defined, exiting...'
+        exit
+      end
+      #Once all is set up, evaluate the extractor from the root pattern!
+      root_results = evaluate_extractor
+      @result = ScrubytResult.new('root')
+      @result.push(*root_results)
+      @result.root_patterns = @root_patterns
+      @result.source_file = source_file
+      @result.source_proc = extractor_definition
+      #Return the root pattern
+      Scrubyt.log :INFO, 'Extraction finished succesfully!'
+    end
+    #The current document; delegates to FetchAction.
+    def get_hpricot_doc
+      FetchAction.get_hpricot_doc
+    end
+    #The URL of the current document; delegates to FetchAction.
+    def get_current_doc_url
+      FetchAction.get_current_doc_url
+    end
+    #Value of @detail_pattern_relations.
+    #NOTE(review): not assigned anywhere in this class - nil unless set elsewhere
+    def get_detail_pattern_relations
+      @detail_pattern_relations
+    end
+    #The mode the extractor was created with.
+    def get_mode
+      @mode
+    end
+    #Value of @original_host_name.
+    #NOTE(review): not assigned anywhere in this class - nil unless set elsewhere
+    def get_original_host_name
+      @original_host_name
+    end
+    def add_to_next_page_list(result_node)
+      if result_node.result.is_a? Hpricot::Elem
+        node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
+        return if node == nil || node.attributes['href'] == nil
+        href = node.attributes['href'].gsub('&amp;') {'&'}
+      elsif result_node.result.is_a? String
+        href = result_node.result
+      end
+      url = href #TODO need absolute address here 1/4
+      @next_page_list << url
+    end
+    def evaluate_extractor
+      root_results = []
+      current_page_count = 1
+      catch :quit_next_page_loop do
+        loop do
+          url = get_current_doc_url #TODO need absolute address here 2/4
+          @processed_pages << url
+          @root_patterns.each do |root_pattern|
+            root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+          end
+          while @processed_pages.include? url #TODO need absolute address here 3/4
+            if !@next_page_pattern.nil?
+              throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
+              throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
+              xpath = @next_page_pattern.filters[0].xpath
+              node = (get_hpricot_doc/xpath).map.last
+              node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
+              throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
+              href = node.attributes['href'].gsub('&amp;') {'&'}
+              throw :quit_next_page_loop if href == nil
+              url = href #TODO need absolute address here 4/4
+            else
+              throw :quit_next_page_loop if @next_page_list.empty?
+              url = @next_page_list.pop
+            end
+          end
+          restore_host_name
+          FetchAction.fetch(url)
+          current_page_count += 1
+        end
+      end
+      root_results
+    end
+  end
+end