sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. data/CHANGELOG +350 -0
  2. data/COPYING +340 -0
  3. data/README +121 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +45 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +168 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +140 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +117 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb
@@ -0,0 +1,14 @@
+ module Scrubyt
+   ##
+   #=<tt>Apply different functions on the input document</tt>
+   #Before the document is passed to Hpricot for parsing, we may need
+   #to apply transformations that are clumsy, inappropriate or impossible
+   #to carry out once the document is loaded.
+   class PreFilterDocument
+     #Replace <br/> tags with newlines and turn non-breaking spaces (\240) into plain spaces
+     def self.br_to_newline(doc)
+       doc = doc.gsub(/<br[ \/]*>/i, "\r\n")
+       doc = doc.tr("\240", " ")
+     end #end of function br_to_newline
+   end #end of class PreFilterDocument
+ end #end of module Scrubyt
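A quick illustration of how a pre-filter like the one above is meant to be used before parsing; the sample markup and the Hpricot call are assumptions for illustration, not part of this package:

    require 'hpricot'
    require 'scrubyt'

    html = "First line<br>Second line\240with a non-breaking space"

    # br_to_newline replaces <br> variants with "\r\n" and turns \240 bytes into plain spaces
    cleaned = Scrubyt::PreFilterDocument.br_to_newline(html)
    doc = Hpricot(cleaned)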
data/lib/scrubyt/core/scraping/result_indexer.rb
@@ -0,0 +1,90 @@
+ module Scrubyt
+   ##
+   #=<tt>Selecting results based on indices</tt>
+   #
+   #If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
+   #probably with a variable count of results (like tags, authors etc.), you may need just
+   #specific elements - like the last one, every third one, or the ones at specific indices.
+   #In this case you should use the select_indices syntax.
+   class ResultIndexer
+     attr_reader :indices_to_extract
+
+     def initialize(*args)
+       select_indices(*args)
+     end
+
+     ##
+     #Perform selection of the desired result instances, based on their indices
+     def select_indices_to_extract(ary)
+       return ary if @indices_to_extract == nil
+       to_keep = []
+       @indices_to_extract.each {|e|
+         if e.is_a? Symbol
+           case e
+           when :first
+             to_keep << 0
+           when :last
+             to_keep << ary.size-1
+           when :all_but_last
+             (0..ary.size-2).each {|i| to_keep << i}
+           when :all_but_first
+             (1..ary.size-1).each {|i| to_keep << i}
+           when :every_even
+             (0..ary.size).each {|i| to_keep << i if (i % 2 == 1)}
+           when :every_odd
+             (0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
+           when :every_second
+             (0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
+           when :every_third
+             (0..ary.size).each {|i| to_keep << i if (i % 3 == 0)}
+           when :every_fourth
+             (0..ary.size).each {|i| to_keep << i if (i % 4 == 0)}
+           end
+         end
+       }
+       @indices_to_extract.each {|i| to_keep << i if !i.is_a? Symbol}
+       to_keep.sort!
+       ary.reject! {|e| !to_keep.include? ary.index(e)}
+       ary
+     end
+
+     private
+     ##
+     #Do not return the whole result set, just the specified indices - like
+     #first, last, every odd index, indices from [1..3] etc.
+     #
+     #This method can accept:
+     #- a range, like (2..3)
+     #- an array of indices, like [1,2,3]
+     #- a specified set of keywords:
+     #  - :first
+     #  - :last
+     #  - :every_even
+     #  - :every_odd
+     #  (there can be more of these keywords in one select_indices call)
+     def select_indices(*args)
+       indices_to_grab = args[0]
+       case indices_to_grab.class.to_s
+       when "Range"
+         @indices_to_extract = indices_to_grab.to_a
+       when "Array"
+         nested_arrays = []
+         indices_to_grab.each {|e|
+           if e.is_a? Array
+             nested_arrays << e
+           elsif e.is_a? Range
+             nested_arrays << e.to_a
+           end
+         }
+         @indices_to_extract = indices_to_grab
+         nested_arrays.each {|a| a.each {|e| @indices_to_extract << e if !@indices_to_extract.include? e }}
+         @indices_to_extract.reject! {|e| ((e.is_a? Range) || (e.is_a? Array)) }
+       when "Symbol"
+         #parse this when we already have the results
+         @indices_to_extract = [indices_to_grab]
+       else
+         puts "Invalid index specification"
+       end
+     end #end of function select_indices
+   end #end of class ResultIndexer
+ end #end of module Scrubyt
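Based on the class above, a small standalone sketch of the accepted index specifications; the sample array is a placeholder for an extracted result list:

    require 'scrubyt'

    items = %w[a b c d e]

    # Keyword form: keep only the first and last instances.
    Scrubyt::ResultIndexer.new([:first, :last]).select_indices_to_extract(items.dup)
    #=> ["a", "e"]

    # Range form: keep the instances at indices 1 through 3.
    Scrubyt::ResultIndexer.new(1..3).select_indices_to_extract(items.dup)
    #=> ["b", "c", "d"]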
data/lib/scrubyt/core/shared/extractor.rb
@@ -0,0 +1,168 @@
+ module Scrubyt
+   ##
+   #=<tt>Driving the whole extraction process</tt>
+   #
+   #Extractor is a performer class - it gets an extractor definition and carries
+   #out the actions and evaluates the wrappers sequentially.
+   #
+   #Originally the navigation actions were here as well, but since the class got too
+   #big, they were factored out into their own class, NavigationActions.
+   class Extractor
+     include FetchAction
+
+     attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern #, :hpricot_doc, :current_doc_url
+
+     #The definition of the extractor is passed through this method
+     def self.define(mode=nil, &extractor_definition)
+       if mode.is_a?(Hash)
+         if mode[:agent] == :firefox
+           FetchAction.class_eval do
+             include Navigation::Firewatir
+           end
+         else
+           FetchAction.class_eval do
+             include Navigation::Mechanize
+           end
+         end
+       else
+         FetchAction.class_eval do
+           include Navigation::Mechanize
+         end
+       end
+       extractor = self.new(mode, extractor_definition)
+       extractor.result
+     end
+
+     def self.load(filename)
+       define(&eval(IO.read(filename)))
+     end
+
+     def initialize(mode, extractor_definition)
+       @mode = mode
+       @root_patterns = []
+       @next_page_pattern = nil
+       # @hpricot_doc = nil
+       # @hpricot_doc_url = nil
+       @evaluating_extractor_definition = false
+       @next_page_list = []
+       @processed_pages = []
+
+       backtrace = SharedUtils.get_backtrace
+       parts = backtrace[1].split(':')
+       source_file = parts[0]
+
+       Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
+
+       @evaluating_extractor_definition = true
+       context = Object.new
+       context.extend NavigationActions
+       context.instance_eval do
+         def extractor=(value)
+           @extractor = value
+         end
+
+         def next_page(*args)
+           @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
+         end
+
+         def method_missing(method_name, *args, &block)
+           root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
+           @extractor.root_patterns << root_pattern
+           root_pattern
+         end
+       end
+       context.extractor = self
+       context.instance_eval(&extractor_definition)
+       @evaluating_extractor_definition = false
+
+       if @root_patterns.empty?
+         # TODO: this should be an exception
+         Scrubyt.log :ERROR, 'No extractor defined, exiting...'
+         exit
+       end
+
+       #Once all is set up, evaluate the extractor from the root pattern!
+       root_results = evaluate_extractor
+       FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
+
+       @result = ScrubytResult.new('root')
+       @result.push(*root_results)
+       @result.root_patterns = @root_patterns
+       @result.source_file = source_file
+       @result.source_proc = extractor_definition
+
+       #Return the root pattern
+       Scrubyt.log :INFO, 'Extraction finished successfully!'
+     end
+
+     def get_hpricot_doc
+       FetchAction.get_hpricot_doc
+     end
+
+     def get_current_doc_url
+       FetchAction.get_current_doc_url
+     end
+
+     def get_detail_pattern_relations
+       @detail_pattern_relations
+     end
+
+     def get_mode
+       @mode
+     end
+
+     def get_original_host_name
+       @original_host_name
+     end
+
+     def add_to_next_page_list(result_node)
+       if result_node.result.is_a? Hpricot::Elem
+         node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
+         return if node == nil || node.attributes['href'] == nil
+         href = node.attributes['href'].gsub('&amp;') {'&'}
+       elsif result_node.result.is_a? String
+         href = result_node.result
+       end
+       url = href #TODO need absolute address here 1/4
+       @next_page_list << url
+     end
+
+     def evaluate_extractor
+       root_results = []
+       current_page_count = 1
+       catch :quit_next_page_loop do
+         loop do
+           url = get_current_doc_url #TODO need absolute address here 2/4
+           @processed_pages << url
+           @root_patterns.each do |root_pattern|
+             root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+           end
+
+           while @processed_pages.include? url #TODO need absolute address here 3/4
+             if !@next_page_pattern.nil?
+               throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
+               throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
+               xpath = @next_page_pattern.filters[0].xpath
+               node = (get_hpricot_doc/xpath).map.last
+               node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
+               throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
+               href = node.attributes['href'].gsub('&amp;') {'&'}
+               throw :quit_next_page_loop if href == nil
+               url = href #TODO need absolute address here 4/4
+             else
+               throw :quit_next_page_loop if @next_page_list.empty?
+               url = @next_page_list.pop
+             end
+           end
+
+           restore_host_name
+           FetchAction.fetch(url)
+
+           current_page_count += 1
+         end
+       end
+       root_results
+     end
+
+   end
+ end
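For context, a hedged sketch of how an extractor definition is typically handed to Extractor.define; the URL, pattern names and example strings are placeholders, and the to_xml call assumes the XML dumper from scrubyt/output is loaded:

    require 'rubygems'
    require 'scrubyt'

    # Passing :agent => :firefox in the mode hash would mix the FireWatir
    # navigation module into FetchAction; by default Mechanize is used.
    data = Scrubyt::Extractor.define do
      fetch 'http://www.example.com/books'

      # Root pattern created via method_missing; child fields are given by example strings.
      book do
        title 'An example book title'
        price '$9.99'
      end

      # Stop after 3 pages (compared against current_page_count in evaluate_extractor).
      next_page 'Next page', :limit => 3
    end

    puts data.to_xml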
data/lib/scrubyt/logging.rb
@@ -0,0 +1,154 @@
+ #
+ # TODO: if multiline messages aren't needed, then remove them.
+ #
+ # TODO: switch to the conventional Ruby logger interface,
+ # or create an adapter to it. If the former, then decide what to
+ # do with the unit tests.
+ #
+
+ module Scrubyt
+   # Logging is disabled by default. It can be enabled as follows:
+   #
+   # Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
+   #
+   def self.logger=(logger)
+     @logger = logger
+   end
+
+   # Simple logger implementation, based on Scrubyt's original logging style.
+   # Messages will be sent to STDERR. Logging can be limited to certain message
+   # levels by specifying them on initialization, e.g.
+   #
+   # Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
+   #
+   class Logger
+     class Message
+       def initialize(level, text)
+         @level, @text = level.to_s, text.to_s
+       end
+
+       def to_s
+         prefix + @text
+       end
+
+       protected
+
+       def prefix
+         @prefix ||= "[#{@level}] "
+       end
+     end
+
+     class MultiLineMessage < Message
+       def initialize(level, lines)
+         super level, lines.shift
+
+         @lines = lines
+       end
+
+       def to_s
+         [ super, indented_lines ] * "\n"
+       end
+
+       private
+
+       def indented_lines
+         @lines.inject([]) { |lines, line| lines << indented(line) } * "\n"
+       end
+
+       def indented(line)
+         ' ' * prefix.length + line
+       end
+     end
+
+     def initialize(*levels)
+       @levels = levels
+     end
+
+     def log(level, message)
+       return unless logging?(level)
+
+       message_class = message.is_a?(Array) ? MultiLineMessage : Message
+
+       output_stream.puts message_class.new(level, message)
+     end
+
+     def output_stream
+       @output_stream || STDERR
+     end
+
+     attr_writer :output_stream
+
+     private
+
+     def logging?(level)
+       @levels.empty? || @levels.include?(level)
+     end
+   end
+
+   def self.log(level, message)
+     return if logger.nil?
+
+     logger.log(level, message)
+   end
+
+   private
+
+   def self.logger
+     @logger
+   end
+ end
+
+
+ if __FILE__ == $0 then
+
+   require 'test/unit'
+
+   class ScrubytLoggingTestCase < Test::Unit::TestCase
+     class FauxOutputStream < Array
+       def puts(object)
+         self << object.to_s
+       end
+     end
+
+     def setup_logger_with_faux_output_stream!(*logger_args)
+       @stream = FauxOutputStream.new
+       logger = Scrubyt::Logger.new(*logger_args)
+       logger.output_stream = @stream
+       Scrubyt.logger = logger
+     end
+
+     def test_that_logging_works_with_nil_logger
+       Scrubyt.logger = nil
+       assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
+     end
+
+     def test_simple_messages_are_output_correctly
+       setup_logger_with_faux_output_stream!
+
+       Scrubyt.log :ACTION, 'i just did something'
+
+       assert_equal 1, @stream.size
+       assert_equal '[ACTION] i just did something', @stream.first
+     end
+
+     def test_that_multiline_messages_are_output_correctly
+       setup_logger_with_faux_output_stream!
+
+       Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']
+
+       assert_equal 1, @stream.size
+       assert_equal "[ERROR] something bad happened\n        dear oh dear", @stream.first
+     end
+
+     def test_that_loggers_can_be_limited_to_specified_message_levels
+       setup_logger_with_faux_output_stream! :ERROR
+
+       Scrubyt.log :ACTION, 'i just did something'
+       Scrubyt.log :ERROR, 'something bad happened'
+
+       assert_equal 1, @stream.size
+       assert_equal '[ERROR] something bad happened', @stream.first
+     end
+   end
+
+ end
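Following the comments above, a minimal usage sketch of the logger; the message strings are placeholders:

    require 'scrubyt'

    # Logging is off by default; install a logger and limit it to error messages.
    Scrubyt.logger = Scrubyt::Logger.new(:ERROR)

    Scrubyt.log :ERROR,  'something went wrong'            # printed to STDERR as "[ERROR] ..."
    Scrubyt.log :ACTION, 'fetched the next page'           # filtered out by the level list
    Scrubyt.log :ERROR,  ['first line', 'second line']     # rendered as an indented multi-line message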
data/lib/scrubyt/output/post_processor.rb
@@ -0,0 +1,139 @@
+ module Scrubyt
+
+   ########################################## NOT USED ANY MORE ##########################################
+   require 'set'
+   ##
+   #=<tt>Post processing results after the extraction</tt>
+   #Some things cannot be carried out during evaluation - for example
+   #the ensure_presence_of_pattern constraint (since the evaluation is top
+   #to bottom, at a given point we don't know yet whether the currently
+   #evaluated pattern will have a child pattern or not) or removing unneeded
+   #results caused by evaluating multiple filters.
+   #
+   #The sole purpose of this class is to execute these post-processing tasks.
+   class PostProcessor
+     ##
+     #This is just a convenience method to call all the postprocessing
+     #functionality and checks
+     def self.apply_post_processing(root_pattern)
+       ensure_presence_of_pattern_full(root_pattern)
+       remove_multiple_filter_duplicates(root_pattern) if root_pattern.children[0].filters.size > 1
+       report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
+     end
+
+     ##
+     #Apply the ensure_presence_of_pattern constraint on
+     #the full extractor
+     def self.ensure_presence_of_pattern_full(pattern)
+       ensure_presence_of_pattern(pattern)
+       pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
+     end
+
+     ##
+     #Remove unneeded results of a pattern (caused by evaluating multiple filters)
+     #See for example the B&N scenario - the book titles are extracted twice
+     #for every pattern (since both examples generate the same XPath for them),
+     #but since only one of the results ever has a price, the other is discarded
+     def self.remove_multiple_filter_duplicates(pattern)
+       remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
+       pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
+     end
+
+     ##
+     #Issue an error report if the document did not extract anything.
+     #Probably this is because the structure of the page changed or
+     #because of some rather nasty bug - in any case, something wrong
+     #is going on, and we need to inform the user about this!
+     def self.report_if_no_results(root_pattern)
+       results_found = false
+       root_pattern.children.each {|child| return if (child.result.childmap.size > 0)}
+
+       Scrubyt.log :WARNING, [
+         "The extractor did not find any result instances. Most probably this is wrong.",
+         "Check your extractor and if you are sure it should work, report a bug!"
+       ]
+     end
+
+     private
+     def self.ensure_presence_of_pattern(pattern)
+       #holds the names of those child patterns which have to be present as children of the input parameter
+       epop_names = pattern.constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
+       return if epop_names.empty?
+       #all_parent_values holds instances extracted by pattern
+       all_parent_values = []
+       pattern.result.childmap.each { |h| all_parent_values << h.values }
+       all_parent_values.flatten!
+       #indices of result instances (of pattern) we are going to remove
+       results_to_remove = Set.new
+       pattern.children.each do |child_pattern|
+         #all_child_values holds instances extracted by child_pattern
+         all_child_values = []
+         child_pattern.result.childmap.each { |h| all_child_values << h.values }
+         all_child_values.flatten!
+
+         #populate results_to_remove
+         i = 0
+         all_parent_values.each do |parent_value|
+           #Not just the direct children but all the descendants are checked
+           @found_ancestor = false
+           check_ancestors(parent_value, all_child_values)
+
+           results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
+           i += 1
+         end
+       end
+       #based on results_to_remove, populate the array 'rejected' which holds the actual instances
+       #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
+       #results_to_remove indices to their actual instances
+       rejected = []
+       i = -1
+       pattern.result.childmap.each do |h|
+         h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
+       end
+
+       #Finally, do the actual delete!
+       pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
+     end
+
+     def self.check_ancestors(parent_value, all_child_values)
+       parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child } if
+         parent_value.is_a? Hpricot::Elem
+       parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem } if parent_value.is_a? Hpricot::Elem
+     end
+
+     def self.remove_multiple_filter_duplicates_intern(pattern)
+       possible_duplicates = {}
+       longest_result = 0
+       pattern.result.childmap.each { |r|
+         r.each do |k,v|
+           v.each do |x|
+             all_child_results = []
+             pattern.children.each { |child|
+               temp_res = child.result.lookup(x)
+               all_child_results << temp_res if temp_res != nil
+             }
+             next if all_child_results.size <= 1
+             longest_result = all_child_results.map {|e| e.size}.max
+             all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
+             possible_duplicates[x] = all_child_results.transpose
+           end
+         end
+       }
+       #Determine the 'real' duplicates
+       real_duplicates = {}
+       possible_duplicates.each { |k,v|
+         next if v.size == 1
+         v.each { |r| real_duplicates[k] = r }
+       }
+
+       #Finally, remove them!
+       pattern.children.each { |child|
+         child.result.childmap.each { |r|
+           r.each { |k,v|
+             real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
+           }
+         }
+       }
+     end #end of function
+   end #end of class PostProcessor
+ end #end of module Scrubyt
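Although the banner above marks this class as no longer used, its single entry point is apply_post_processing; a minimal invocation sketch, where root_pattern stands for a fully evaluated root Scrubyt::Pattern (hypothetical here):

    # Runs the ensure_presence_of_pattern constraint over the whole pattern tree,
    # prunes duplicate results produced by multiple filters, and logs a warning
    # if nothing was extracted (outside :production mode).
    Scrubyt::PostProcessor.apply_post_processing(root_pattern)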