RubyGems - scrubyt - Versions diffs - 0.3.0 → 0.3.4 - Mend

scrubyt 0.3.0 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

data/CHANGELOG +13 -6
data/Rakefile +22 -10
data/lib/scrubyt.rb +9 -4
data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
data/lib/scrubyt/core/scraping/pattern.rb +25 -18
data/lib/scrubyt/core/shared/extractor.rb +109 -128
data/lib/scrubyt/logging.rb +146 -8
data/lib/scrubyt/output/export.rb +60 -44
data/lib/scrubyt/output/result_node.rb +34 -3
data/lib/scrubyt/output/scrubyt_result.rb +18 -9
data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
data/lib/scrubyt/utils/shared_utils.rb +1 -1
data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
metadata +52 -6
data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67

data/lib/scrubyt/core/scraping/filters/download_filter.rb CHANGED

@@ -14,17 +14,25 @@ module Scrubyt
 private
     def download_file(source)
-      host_name = @parent_pattern.evaluation_context.extractor.get_host_name
+      return '' if source.size < 4
+      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
       outfile = nil
+      host_name += "/" if host_name[-1..-1] != "/"
       base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
-      return '' if source.size < 4
       file_name = source.scan(/.+\/(.*)/)[0][0]
+      return nil if @parent_pattern.except.include? file_name
       Net::HTTP.start(base_url) { |http|
-        puts "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
-        resp = http.get(source.scan(/\s*(.+)/)[0][0])
-        outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
-        FileUtils.mkdir_p @example
-        open(outfile, 'wb') {|f| f.write(resp.body) }
+        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
+        begin
+          ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
+          path = host_name.scan(/http:\/\/#{base_url}(.+)\//)
+          resp = http.get(path, {'User-Agent'=> ua})
+          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
+          FileUtils.mkdir_p @example
+          open(outfile, 'wb') {|f| f.write(resp.body) }
+        rescue Timeout::Error
+          outfile = "[FAILED]#{file_name}"
+        end
        }
        outfile.scan(/.+\/(.*)/)[0][0]
     end
@@ -34,11 +42,21 @@ private
       loop do
         if File.exists? file_name
           if already_found
-            last_no = file_name.scan(/_(\d+)\./)[0][0]
-            file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
+            if file_name.include?('.')
+              last_no = file_name.scan(/_(\d+)\./)[0][0]
+              file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
+            else
+              last_no = file_name.scan(/_(\d+)$/)[0][0]
+              file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
+            end
           else
-            file_name.sub!(/\./) {"_1\."}
-            already_found = true
+            if file_name.include?('.')
+              file_name.sub!(/\./) {"_1\."}
+              already_found = true
+            else
+              file_name << '_1'
+              already_found = true
+            end
           end
         else
           break

data/lib/scrubyt/core/scraping/filters/regexp_filter.rb CHANGED

@@ -1,17 +1,17 @@
 module Scrubyt
   class RegexpFilter < BaseFilter
     def evaluate(source)
       if source.is_a? String
         source.scan(@example).flatten
       else
-        source.inner_text.scan(@example).flatten
-      end
+        source.inner_html.gsub(/<.*?>/, '').scan(@example).flatten
+      end
     end
     def to_sexp
       [:lit, @example]
     end
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/script_filter.rb ADDED

@@ -0,0 +1,14 @@
+module Scrubyt
+  class ScriptFilter < BaseFilter
+    def evaluate(source)
+      param = source
+      param = source.inner_html.gsub(/<.*?>/, "") unless source.is_a? String
+      @example.call param
+    end
+    def to_sexp
+      [:str, "FIXME!!! Can't dump Proc"]
+    end #end of method to_sexp
+  end #End of class ConstantFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/text_filter.rb ADDED

@@ -0,0 +1,38 @@
+module Scrubyt
+  class TextFilter < BaseFilter
+    def evaluate(source)
+        return find_string(source) if @example =~ /^find\(/
+        final_element_name = @example.scan(/^(.+?)\[/)[0][0]
+        text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])
+        index = @example.scan(/\]:(.+)/).flatten
+        index = 0 if index.empty?
+        index = index[0].to_i unless index[0] == "all"
+        result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
+        return "" unless result
+        if index[0] == "all"
+          result.inject([]) {|a,r| a << XPathUtils.traverse_up_until_name(r,final_element_name); a}
+        else
+          [XPathUtils.traverse_up_until_name(result,final_element_name)]
+        end
+    end
+    def find_string(source)
+      str = @example.scan(/find\((.+)\)/).flatten[0]
+      strings_to_find = str.include? ('|') ? str.split('|') : [str]
+      strings_to_find.each do |s|
+        result = SharedUtils.traverse_for_match(source,/#{s}/i)
+        return [s] unless result.empty?
+      end
+      return []
+    end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
+  end #End of class TextFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/tree_filter.rb CHANGED

@@ -38,7 +38,7 @@ module Scrubyt
       return if @temp_sink.is_a? String
       return if @example =~ /.+\[.+\]$/
-      text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_text)
+      text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
       match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
       return if match_range == (0..text.length)
@@ -64,7 +64,7 @@ module Scrubyt
       when EXAMPLE_TYPE_XPATH
         @xpath = @example
       when EXAMPLE_TYPE_STRING
-        @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
+        @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
                                                              @example,
                                                              next_page_example)
         return if @temp_sink == nil
@@ -116,10 +116,10 @@ module Scrubyt
           current_example_index += 1
         end
       when EXAMPLE_TYPE_IMAGE
-        @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.extractor.get_hpricot_doc, @example)
+        @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
         @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
       when EXAMPLE_TYPE_COMPOUND
-        @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
+        @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
                                                                           @example,
                                                                           next_page_example)
         @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
@@ -128,7 +128,7 @@ module Scrubyt
     end
     def generate_relative_XPath(parent_xpath)
-      parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
+      parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
                                               parent_xpath,
                                               @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
       @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node

data/lib/scrubyt/core/scraping/pattern.rb CHANGED

@@ -33,7 +33,7 @@ module Scrubyt
     #    # write out the HTML subtree beginning at the matched element
     #    PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
-    VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
+    VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
     #The pattern can be either a model pattern (in this case it is
     #written to the output) or a temp pattern (in this case it is skipped)
@@ -46,27 +46,25 @@ module Scrubyt
     #    #of the pattrern which was skipped
     #    OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
-    VALID_OUTPUT_TYPES = [:model, :temp]
+    VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
     #These options can be set upon wrapper creation
-    PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
+    PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
     VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
-    attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
-                  :last_result, :evaluation_context,
-                  :indices_to_extract, :referenced_extractor, :referenced_pattern,
-                  :source_file, :source_proc, :modifier_calls)
+    attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
+                  :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
     attr_reader(:next_page_url, :result_indexer)
     option_reader(:type => :tree, :output_type => :model, :generalize => false,
                   :write_text => lambda { @children.size == 0 }, :limit => nil,
-    :default => nil, :resolve => :full)
+                  :default => nil, :resolve => :full, :except => nil, :example_type => nil)
-    def initialize(name, args=[], evaluation_context=nil, parent=nil, &block)
+    def initialize(name, args=[], extractor=nil, parent=nil, &block)
       #init attributes
       @name = name
-      @evaluation_context = evaluation_context
+      @extractor = extractor
       @parent = parent
       @options = {}
       @children = []
@@ -83,6 +81,7 @@ module Scrubyt
       #perform checks for special cases
       examples = check_if_shortcut_pattern() if examples == nil
       check_if_detail_page(block)
+      @options[:output_type] = :page_list if name == 'page_list'
       #create filters
       if examples == nil
@@ -97,7 +96,7 @@ module Scrubyt
       #@generalize was not set up explicitly
       if @options[:generalize].nil?
         @options[:generalize] = true if parent.nil?
-        @options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
+        @options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
       end
       #parse child patterns if available
@@ -160,7 +159,6 @@ module Scrubyt
       if @name =~ /.+_detail/
         @options[:type] = :detail_page
         @referenced_extractor = block
-        Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
       end
     end
@@ -194,7 +192,7 @@ module Scrubyt
             end
           else
             #create child pattern
-            child = Scrubyt::Pattern.new(method_name.to_s, args, @current.evaluation_context, @current, &block)
+            child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
             @current.children << child
             child
           end
@@ -213,7 +211,7 @@ module Scrubyt
     #
     # camera_data.item[1].item_name[0]
     def method_missing(method_name, *args, &block)
-      if @evaluation_context.evaluating_extractor_definition
+      if @extractor.evaluating_extractor_definition
         @modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
       end
@@ -294,9 +292,17 @@ module Scrubyt
         result_nodes << node
       end
       if result_nodes.empty?
-        result_nodes << ResultNode.new(@name,@options[:default],@options) if @options[:default]
+        result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
+      end
+      case output_type
+        when :model
+          return result_nodes
+        when :page_list
+          result_nodes.each do |result_node|
+            @extractor.add_to_next_page_list result_node
+          end
+          return []
       end
-      result_nodes
     end
     def to_sexp
@@ -310,8 +316,7 @@ module Scrubyt
       if type == :detail_page
         #add detail page extractor
-        detail_root = @evaluation_context.extractor.get_detail_extractor(self)
-        sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
+        sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
       else
         #add child block if the pattern has children
         sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
@@ -363,6 +368,8 @@ module Scrubyt
       elsif (args[0].is_a? Hash)
         examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
         examples = nil if examples == []
+      elsif (args[0].is_a? Proc)
+        examples = [args[0]]
       end
       @has_examples = !examples.nil?

data/lib/scrubyt/core/shared/extractor.rb CHANGED

@@ -7,166 +7,147 @@ module Scrubyt
   #
   #Originally also the navigation actions were here, but since the class got too
   #big, they were factored out to an own class, NavigationAction.
-  class Extractor
+  class Extractor
+    include FetchAction
+    attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
     #The definition of the extractor is passed through this method
     def self.define(mode=nil, &extractor_definition)
+      extractor = self.new(mode, extractor_definition)
+      extractor.result
+    end
+    def self.load(filename)
+      define(&eval(IO.read(filename)))
+    end
+    def initialize(mode, extractor_definition)
+      @mode = mode
+      @root_patterns = []
+      @next_page_pattern = nil
+      #      @hpricot_doc = nil
+      #      @hpricot_doc_url = nil
+      @evaluating_extractor_definition = false
+      @next_page_list = []
+      @processed_pages = []
       backtrace = SharedUtils.get_backtrace
       parts = backtrace[1].split(':')
       source_file = parts[0]
-      @@mode = mode
-      #We are keeping the relations between the detail patterns and their root patterns
-      @@detail_extractor_to_pattern_name = {}
-      @@detail_pattern_relations = {}
-      #root pattern -> URIBuilder mapping
-      @@next_patterns = {}
-      mode_name = (mode == :production ? 'Production' : 'Learning')
+      Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
-      Scrubyt.log :MODE, mode_name
-      @@evaluation_context = EvaluationContext.new
-      #Hack up an artificial root pattern (i.e. do not return the pattern which
-      #is the root one in the user's definition, but rather the real (invisible)
-      #root pattern
-      @@evaluation_context.evaluating_extractor_definition = true
-      class_eval(&extractor_definition)
-      @@evaluation_context.evaluating_extractor_definition = false
-      root_pattern = @@evaluation_context.root_pattern
-      if root_pattern.nil?
+      @evaluating_extractor_definition = true
+      context = Object.new
+      context.extend NavigationActions
+      context.instance_eval do
+        def extractor=(value)
+          @extractor = value
+        end
+        def next_page(*args)
+          @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
+        end
+        def method_missing(method_name, *args, &block)
+          root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
+          @extractor.root_patterns << root_pattern
+          root_pattern
+        end
+      end
+      context.extractor = self
+      context.instance_eval(&extractor_definition)
+      @evaluating_extractor_definition = false
+      if @root_patterns.empty?
         # TODO: this should be an exception
         Scrubyt.log :ERROR, 'No extractor defined, exiting...'
         exit
       end
-      root_pattern.source_file = source_file
-      root_pattern.source_proc = extractor_definition
       #Once all is set up, evaluate the extractor from the root pattern!
-      root_results = evaluate_extractor(root_pattern)
-      scrubyt_result = ScrubytResult.new('root')
-      scrubyt_result.push(*root_results)
-      scrubyt_result.root_pattern = root_pattern
+      root_results = evaluate_extractor
+      @result = ScrubytResult.new('root')
+      @result.push(*root_results)
+      @result.root_patterns = @root_patterns
+      @result.source_file = source_file
+      @result.source_proc = extractor_definition
       #Return the root pattern
       Scrubyt.log :INFO, 'Extraction finished succesfully!'
-      scrubyt_result
-    end
-    #Evaluate a subexttractor (i.e. an extractor on a detail page).
-    #The url passed to this function is automatically loaded.
-    #The definition of the subextractor is passed as a block
-    #
-    #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
-    def self.evaluate_subextractor(url, parent_pattern, resolve)
-      if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
-        detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
-        detail_root.last_result = nil
-        FetchAction.store_page
-        @@original_evaluation_context.push @@evaluation_context
-        @@host_stack.push FetchAction.get_host_name
-        @@evaluation_context = EvaluationContext.new
-        FetchAction.restore_host_name
-        fetch url, :resolve => resolve
-        @@evaluation_context.extractor = self
-        @@evaluation_context.root_pattern = detail_root
-        root_results = evaluate_extractor detail_root
-        @@evaluation_context = @@original_evaluation_context.pop
-        FetchAction.restore_page
-        FetchAction.store_host_name(@@host_stack.pop)
-        root_results
-      else
-        @@original_evaluation_context ||= []
-        @@host_stack ||= []
-        FetchAction.store_page
-        @@original_evaluation_context.push @@evaluation_context
-        @@host_stack.push FetchAction.get_host_name
-        @@evaluation_context = EvaluationContext.new
-        FetchAction.restore_host_name
-        fetch url, :resolve => resolve
-        class_eval(&parent_pattern.referenced_extractor)
-        root_pattern = @@evaluation_context.root_pattern
-        @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
-        root_results = evaluate_extractor(root_pattern)
-        @@evaluation_context = @@original_evaluation_context.pop
-        FetchAction.restore_page
-        FetchAction.store_host_name(@@host_stack.pop)
-        root_results
-      end
-    end
-    #build the current wrapper
-    def self.method_missing(method_name, *args, &block)
-      if NavigationActions::KEYWORDS.include? method_name.to_s
-        NavigationActions.send(method_name, *args)
-        return
-      end
-      if method_name.to_s == 'next_page'
-        pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
-        pattern.evaluation_context = @@evaluation_context
-        @@evaluation_context.setup_uri_builder(pattern, args)
-        @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
-      else
-        raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
-        #Create a root pattern
-        @@evaluation_context.extractor = self
-        root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
-        @@last_root_pattern = root_pattern
-        @@evaluation_context.root_pattern = root_pattern
-        root_pattern
-      end
     end
-    def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
-      @@detail_extractor_to_pattern_name[referenced_extractor] ||= [] << pattern
-    end
-    def self.get_detail_extractor(parent_pattern)
-      @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
-    end
-    def self.get_hpricot_doc
-      NavigationActions.get_hpricot_doc
+    def get_hpricot_doc
+      FetchAction.get_hpricot_doc
     end
-    def self.get_current_doc_url
-      NavigationActions.get_current_doc_url
+    def get_current_doc_url
+      FetchAction.get_current_doc_url
     end
-    def self.get_detail_pattern_relations
-      @@detail_pattern_relations
+    def get_detail_pattern_relations
+      @detail_pattern_relations
     end
-    def self.get_host_name
-      NavigationActions.get_host_name
+    def get_mode
+      @mode
     end
-    def self.get_mode
-      @@mode
+    def get_original_host_name
+      @original_host_name
     end
-    def self.get_original_host_name
-      @@original_host_name
+    def add_to_next_page_list(result_node)
+      if result_node.result.is_a? Hpricot::Elem
+        node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
+        return if node == nil || node.attributes['href'] == nil
+        href = node.attributes['href'].gsub('&amp;') {'&'}
+      elsif result_node.result.is_a? String
+        href = result_node.result
+      end
+      url = href #TODO need absolute address here 1/4
+      @next_page_list << url
     end
-    private
-    def self.evaluate_extractor(root_pattern)
+    def evaluate_extractor
       root_results = []
-      if @@next_patterns[root_pattern]
-        current_page_count = 1
+      current_page_count = 1
+      catch :quit_next_page_loop do
         loop do
-          root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
-          break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(@@next_patterns[root_pattern]))
-          current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
+          url = get_current_doc_url #TODO need absolute address here 2/4
+          puts url
+          @processed_pages << url
+          @root_patterns.each do |root_pattern|
+            root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+          end
+          while @processed_pages.include? url #TODO need absolute address here 3/4
+            if !@next_page_pattern.nil?
+              throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
+              throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
+              xpath = @next_page_pattern.filters[0].xpath
+              node = (get_hpricot_doc/xpath).map.last
+              node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
+              throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
+              href = node.attributes['href'].gsub('&amp;') {'&'}
+              throw :quit_next_page_loop if href == nil
+              url = href #TODO need absolute address here 4/4
+            else
+              throw :quit_next_page_loop if @next_page_list.empty?
+              url = @next_page_list.pop
+            end
+          end
+          restore_host_name
+          FetchAction.fetch(url)
+          current_page_count += 1
         end
-      else
-        root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
       end
       root_results
     end
-  end #end of class Extractor
-end #end of module Scrubyt
+  end
+end