RubyGems - andyverprauskus-scrubyt - Versions diffs - 0.5.1 - Mend

andyverprauskus-scrubyt 0.5.1

Files changed (45) hide show

data/CHANGELOG +355 -0
data/COPYING +340 -0
data/README.rdoc +121 -0
data/Rakefile +101 -0
data/lib/scrubyt.rb +53 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +318 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +312 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +63 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +107 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +183 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +145 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +120 -0

data/lib/scrubyt/core/scraping/pre_filter_document.rb ADDED Viewed

@@ -0,0 +1,14 @@
module Scrubyt
  ##
  #=<tt>Apply different functions on the input document</tt>
  #Before the document is passed to Hpricot for parsing, we may need
  #to do different stuff with it which are clumsy/not appropriate/impossible
  #to do once the document is loaded.
  class PreFilterDocument
    ##
    #Replace <br>, <br/>, <br /> (any case) tags with "\r\n" and turn
    #non-breaking spaces into plain spaces.
    #
    #+doc+ is the raw document as a String; it is not modified.
    #Returns a new String with both substitutions applied.
    def self.br_to_newline(doc)
      #gsub returns a new string - its result has to be kept, otherwise the
      #<br> replacement is silently lost
      converted = doc.gsub(/<br[ \/]*>/i, "\r\n")
      nbsp = non_breaking_space_for(converted.encoding)
      nbsp ? converted.tr(nbsp, ' ') : converted
    end

    ##
    #Returns the non-breaking space as a String valid in +encoding+, or nil
    #if the byte 0xA0 is not a valid character on its own in that encoding.
    #
    #UTF-8 documents get U+00A0; every other encoding gets the single byte
    #0xA0 (what the original "\240" literal meant), provided it is valid
    #there. A pattern that is invalid in the string's encoding would make
    #String#tr raise.
    def self.non_breaking_space_for(encoding)
      return "\u00A0" if encoding == Encoding::UTF_8
      raw = [0xA0].pack('C').force_encoding(encoding)
      raw.valid_encoding? ? raw : nil
    end
    private_class_method :non_breaking_space_for
  end #end of class PreFilterDocument
end #end of module Scrubyt

data/lib/scrubyt/core/scraping/result_indexer.rb ADDED Viewed

@@ -0,0 +1,90 @@
module Scrubyt
  ##
  #=<tt>Selecting results based on indices</tt>
  #
  #If the results is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
  #probably with a variable count of results (like tags, authors etc.), you may need just
  #specific elements - like the last one, every third one, or at specific indices.
  #In this case you should use the select_indices syntax.
  class ResultIndexer
    #The normalized index specification: a flat array of integers and/or
    #keyword symbols, or nil if the specification was invalid.
    attr_reader :indices_to_extract

    ##
    #Accepts the same arguments as select_indices (only the first one is used).
    def initialize(*args)
      select_indices(*args)
    end

    ##
    #Perform selection of the desired result instances, based on their indices.
    #
    #+ary+ is filtered *in place* (callers may rely on that) and returned.
    #If no valid index specification was given, +ary+ is returned untouched.
    #Indices outside of +ary+ and unknown keywords are ignored.
    def select_indices_to_extract(ary)
      return ary if @indices_to_extract.nil?
      wanted = []
      @indices_to_extract.each do |spec|
        if spec.is_a?(Symbol)
          wanted.concat(indices_for_keyword(spec, ary.size))
        else
          wanted << spec
        end
      end
      #Select by position. Looking an element up with ary.index(e) picks the
      #first *equal* element, which keeps/drops the wrong instance as soon as
      #the results contain duplicates.
      kept = []
      ary.each_with_index { |element, index| kept << element if wanted.include?(index) }
      ary.replace(kept)
    end

    private

    ##
    #Translate a keyword into the 0-based indices it stands for in a result
    #list of +size+ elements. The every_* keywords count positions from 1:
    #:every_even is the 2nd, 4th, ... element; :every_odd and :every_second
    #are the 1st, 3rd, ...; :every_third the 1st, 4th, ... and so on.
    #Unknown keywords yield no indices.
    def indices_for_keyword(keyword, size)
      positions = (0...size).to_a
      case keyword
      when :first         then positions.first(1)
      when :last          then positions.last(1)
      when :all_but_last  then (0...size - 1).to_a
      when :all_but_first then (1...size).to_a
      when :every_even    then positions.select { |i| i % 2 == 1 }
      when :every_odd, :every_second
        positions.select { |i| i % 2 == 0 }
      when :every_third   then positions.select { |i| i % 3 == 0 }
      when :every_fourth  then positions.select { |i| i % 4 == 0 }
      else []
      end
    end

    ##
    #Do not return the whole result set, just specified indices - like
    #first,last, every odd index, indices from [1..3] etc.
    #
    #This method can accept:
    #- a range, like (2..3)
    #- an array of indices, like [1,2,3]; the array may also hold keywords,
    #  ranges and (one level of) nested arrays
    #- specified set of keywords:
    #  - :first
    #  - :last
    #  - :all_but_first
    #  - :all_but_last
    #  - :every_even
    #  - :every_odd
    #  - :every_second
    #  - :every_third
    #  - :every_fourth
    #
    #Anything else prints "Invalid index specification" and leaves the
    #indexer without a specification (i.e. every result is kept).
    def select_indices(*args)
      indices_to_grab = args[0]
      case indices_to_grab
      when Range
        @indices_to_extract = indices_to_grab.to_a
      when Array
        @indices_to_extract = flatten_index_list(indices_to_grab)
      when Symbol
        #parse this when  we already have the results
        @indices_to_extract = [indices_to_grab]
      else
        puts "Invalid index specification"
      end
    end #end of function select_indices

    ##
    #Build a flat copy of +list+: plain entries first (in their original
    #order), followed by the members of nested arrays/ranges that are not
    #present yet. The caller's array is left untouched.
    def flatten_index_list(list)
      nested, flat = list.partition { |e| e.is_a?(Array) || e.is_a?(Range) }
      nested.each do |group|
        group.each do |e|
          next if e.is_a?(Array) || e.is_a?(Range)
          flat << e unless flat.include?(e)
        end
      end
      flat
    end
  end #end of class ResultIndexer
end #end of module Scrubyt

data/lib/scrubyt/core/shared/extractor.rb ADDED Viewed

@@ -0,0 +1,183 @@
module Scrubyt
  ##
  #=<tt>Driving the whole extraction process</tt>
  #
  #Extractor is a performer class - it gets an extractor definition and carries
  #out the actions and evaluates the wrappers sequentially.
  #
  #Originally also the navigation actions were here, but since the class got too
  #big, they were factored out to an own class, NavigationAction.
  class Extractor
    include FetchAction

    attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern

    ##
    #The definition of the extractor is passed through this method.
    #
    #+mode+ may be a Hash; <tt>:agent => :firefox</tt> mixes the Firewatir
    #navigation into FetchAction, anything else (including no Hash at all)
    #mixes in Mechanize. The block is the extractor definition.
    #
    #Returns the result of the extraction (see #result).
    def self.define(mode=nil, &extractor_definition)
      agent = if mode.is_a?(Hash) && mode[:agent] == :firefox
        Navigation::Firewatir
      else
        Navigation::Mechanize
      end
      FetchAction.class_eval { include agent }
      extractor = self.new(mode, extractor_definition)
      extractor.result
    end

    ##
    #Read +filename+, evaluate its content and use the resulting object as the
    #extractor definition block.
    #
    #NOTE(review): the file content is passed to eval unchecked, i.e. it is
    #executed as Ruby code - only load files from a trusted source.
    def self.load(filename)
      define(&eval(IO.read(filename)))
    end

    ##
    #Evaluate +extractor_definition+ (a Proc) in a fresh context object, run
    #the extraction and store the outcome in #result.
    #
    #Inside the definition, navigation actions come from NavigationActions,
    #+next_page+ registers the next page pattern and every other method call
    #creates a root pattern named after the called method.
    #
    #If the definition registers no root pattern, an error is logged and the
    #process exits.
    def initialize(mode, extractor_definition)
      @mode = mode
      @root_patterns = []
      @next_page_pattern = nil
      @evaluating_extractor_definition = false
      @next_page_list = []
      @processed_pages = []
      #first ':'-separated part of the second backtrace entry; stored on the
      #result as its source file
      source_file = SharedUtils.get_backtrace[1].split(':')[0]
      Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
      @evaluating_extractor_definition = true
      context = Object.new
      context.extend NavigationActions
      context.instance_eval do
        def extractor=(value)
          @extractor = value
        end
        def next_page(*args)
          @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
        end
        def method_missing(method_name, *args, &block)
          root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
          @extractor.root_patterns << root_pattern
          root_pattern
        end
      end
      #both have to know the extractor before the definition is evaluated
      FetchAction.extractor = self
      context.extractor = self
      context.instance_eval(&extractor_definition)
      @evaluating_extractor_definition = false
      if @root_patterns.empty?
        # TODO: this should be an exception
        Scrubyt.log :ERROR, 'No extractor defined, exiting...'
        exit
      end
      #Once all is set up, evaluate the extractor from the root pattern!
      #(the results are collected in @root_results)
      evaluate_extractor
      FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
      @result = ScrubytResult.new('root')
      @result.push(*@root_results)
      #NOTE(review): evaluate_extractor resets @root_patterns to [] before
      #returning, so an empty list is stored here - confirm this is intended
      @result.root_patterns = @root_patterns
      @result.source_file = source_file
      @result.source_proc = extractor_definition
      Scrubyt.log :INFO, 'Extraction finished succesfully!'
    end

    #The document currently held by FetchAction
    def get_hpricot_doc
      FetchAction.get_hpricot_doc
    end

    #The URL of the document currently held by FetchAction
    def get_current_doc_url
      FetchAction.get_current_doc_url
    end

    #NOTE(review): @detail_pattern_relations is never assigned in this class
    def get_detail_pattern_relations
      @detail_pattern_relations
    end

    #The mode the extractor was created with
    def get_mode
      @mode
    end

    #NOTE(review): @original_host_name is never assigned in this class
    def get_original_host_name
      @original_host_name
    end

    ##
    #Remember the URL behind +result_node+ as a page to visit later.
    #
    #For an Hpricot::Elem result the href of the nearest node having one is
    #used (with '&amp;' unescaped), for a String result the string itself.
    #Nothing is added if no URL can be determined.
    def add_to_next_page_list(result_node)
      href = nil
      if result_node.result.is_a? Hpricot::Elem
        node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
        return if node == nil || node.attributes['href'] == nil
        href = node.attributes['href'].gsub('&amp;') {'&'}
      elsif result_node.result.is_a? String
        href = result_node.result
      end
      #a result of any other type yields no URL - do not queue nil
      return if href.nil?
      url = href #TODO need absolute address here 1/4
      @next_page_list << url
    end

    ##
    #Evaluate every root pattern on the current document, then move on to the
    #next page and repeat until there is no next page.
    #
    #The next page is found through the next page pattern if one is defined
    #(honoring its :limit option), otherwise it is taken from the list filled
    #by add_to_next_page_list. A next URL of "#" is followed by clicking the
    #matched node instead of fetching.
    #
    #Returns the collected root results; @root_patterns is emptied.
    def evaluate_extractor
      @root_results ||= []
      current_page_count = 1
      xpath = nil
      catch :quit_next_page_loop do
        loop do
          url = get_current_doc_url #TODO need absolute address here 2/4
          @processed_pages << url
          @root_patterns.each do |root_pattern|
            @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
          end
          while @processed_pages.include? url #TODO need absolute address here 3/4
            if !@next_page_pattern.nil?
              throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
              throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
              xpath = @next_page_pattern.filters[0].xpath
              node = (get_hpricot_doc/xpath).last
              node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
              throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
              url = node.attributes['href'].gsub('&amp;') {'&'} #TODO need absolute address here 4/4
              #The document does not change inside this loop, so looking up
              #the link again would presumably give the same, already visited
              #URL forever - stop paging instead.
              throw :quit_next_page_loop if @processed_pages.include? url
            else
              throw :quit_next_page_loop if @next_page_list.empty?
              url = @next_page_list.pop
            end
          end
          #not defined in this class; presumably provided by FetchAction - TODO confirm
          restore_host_name
          if url == "#"
            FetchAction.click_by_xpath_without_evaluate(xpath)
          else
            FetchAction.fetch(url)
          end
          current_page_count += 1
        end
      end
      @root_patterns = []
      @root_results
    end
  end
end

data/lib/scrubyt/logging.rb ADDED Viewed

@@ -0,0 +1,154 @@
+#
+# TODO: if multiline messages aren't needed, then remove them.
+#
+# TODO: switch to the conventional Ruby logger interface,
+# or create an adapter to it. If the former, then decided what to
+# do with the unit tests.
+#
module Scrubyt
  # Logging is disabled by default. It can be enabled as follows:
  #
  #   Scrubyt.logger = Scrubyt::Logger.new  # logs *all* messages to STDERR
  #
  # Assigning nil disables logging again.
  def self.logger=(logger)
    @logger = logger
  end

  # Simple logger implementation, based on Scrubyt's original logging style.
  # Messages will be sent to STDERR. Logging can be limited to certain message
  # levels by specifying them on initialization, e.g.
  #
  #   Scrubyt::Logger.new(:ACTION, :ERROR)  # will only log action/error messages
  #
  class Logger
    # A single-line message, rendered as "[LEVEL] text".
    class Message
      def initialize(level, text)
        @level, @text = level.to_s, text.to_s
      end

      def to_s
        prefix + @text
      end

      protected

      # The "[LEVEL] " part; also determines how far continuation lines of a
      # MultiLineMessage are indented.
      def prefix
        @prefix ||= "[#{@level}] "
      end
    end

    # A message made of several lines: the first one is rendered like a
    # Message, the following ones are indented to line up with its text.
    class MultiLineMessage < Message
      # +lines+ is an array of lines; it is not modified.
      def initialize(level, lines)
        # destructure instead of shift, so the caller's array stays intact
        first_line, *remaining = lines
        super level, first_line
        @lines = remaining
      end

      # All lines joined with "\n"; no trailing newline is produced when
      # there is only one line.
      def to_s
        [ super(), *indented_lines ].join("\n")
      end

      private

      def indented_lines
        @lines.map { |line| indented(line) }
      end

      def indented(line)
        ' ' * prefix.length + line.to_s
      end
    end

    # +levels+ lists the levels to log; with no level given everything is logged.
    def initialize(*levels)
      @levels = levels
    end

    # Write +message+ (a String, or an Array of lines) to the output stream,
    # unless +level+ is filtered out.
    def log(level, message)
      return unless logging?(level)
      message_class = message.is_a?(Array) ? MultiLineMessage : Message
      output_stream.puts message_class.new(level, message)
    end

    # The stream messages are written to; STDERR unless another one was set.
    def output_stream
      @output_stream || STDERR
    end

    attr_writer :output_stream

    private

    def logging?(level)
      @levels.empty? || @levels.include?(level)
    end
  end

  # Log +message+ with the given +level+ through the configured logger.
  # Does nothing while no logger is set.
  def self.log(level, message)
    return if logger.nil?
    logger.log(level, message)
  end

  # The currently configured logger (nil if logging is disabled).
  #
  # The original source had a bare +private+ in front of this method, which
  # does not apply to methods defined with +def self.+; the method has always
  # been publicly callable and is kept that way.
  def self.logger
    @logger
  end
end
# Self-test of the logging facility; only runs when this file is executed
# directly (not when it is required).
if __FILE__ == $PROGRAM_NAME
  require 'test/unit'

  class ScrubytLoggingTestCase < Test::Unit::TestCase
    # Stand-in for an IO object: collects everything passed to puts,
    # converted to a String, as array elements.
    class FauxOutputStream < Array
      def puts(object)
        push(object.to_s)
      end
    end

    # Install a logger (created with +logger_args+) which writes into a fresh
    # FauxOutputStream, kept in @stream for the assertions.
    def setup_logger_with_faux_output_stream!(*logger_args)
      @stream = FauxOutputStream.new
      Scrubyt.logger = Scrubyt::Logger.new(*logger_args).tap do |faux_logger|
        faux_logger.output_stream = @stream
      end
    end

    def test_that_logging_works_with_nil_logger
      Scrubyt.logger = nil
      assert_nothing_raised do
        Scrubyt.log(:ERROR, 'message')
      end
    end

    def test_simple_messages_are_output_correctly
      setup_logger_with_faux_output_stream!
      Scrubyt.log(:ACTION, 'i just did something')
      assert_equal(1, @stream.size)
      assert_equal('[ACTION] i just did something', @stream.first)
    end

    def test_that_multiline_messages_are_output_correctly
      setup_logger_with_faux_output_stream!
      Scrubyt.log(:ERROR, ['something bad happened', 'dear oh dear'])
      assert_equal(1, @stream.size)
      assert_equal("[ERROR] something bad happened\n        dear oh dear", @stream.first)
    end

    def test_that_loggers_can_be_limited_to_specfied_message_levels
      setup_logger_with_faux_output_stream!(:ERROR)
      Scrubyt.log(:ACTION, 'i just did something')
      Scrubyt.log(:ERROR, 'something bad happened')
      assert_equal(1, @stream.size)
      assert_equal('[ERROR] something bad happened', @stream.first)
    end
  end
end

data/lib/scrubyt/output/post_processor.rb ADDED Viewed

@@ -0,0 +1,139 @@
module Scrubyt
########################################## NOT USED ANY MORE ##########################################
require 'set'
##
#=<tt>Post processing results after the extraction</tt>
#Some things can not be carried out during evaluation - for example
#the ensure_presence_of_pattern constraint (since the evaluation is top
#to bottom, at a given point we don't know yet whether the currently
#evaluated pattern will have a child pattern or not) or removing unneeded
#results caused by evaluating multiple filters.
#
#The sole purpose of this class is to execute these post-processing tasks.
#
#All methods are class methods. The helpers in the lower part were preceded
#by a bare +private+ in the original source, which has no effect on methods
#defined with +def self.+ - they have always been publicly callable and are
#kept that way.
  class PostProcessor
    ##
    #This is just a convenience method to call all the postprocessing
    #functionality and checks
    def self.apply_post_processing(root_pattern)
      ensure_presence_of_pattern_full(root_pattern)
      #a root pattern without children has nothing to de-duplicate
      first_child = root_pattern.children[0]
      remove_multiple_filter_duplicates(root_pattern) if first_child && first_child.filters.size > 1
      report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
    end

    ##
    #Apply the ensure_presence_of_pattern constraint on
    #the full extractor (+pattern+ and, recursively, all its descendants)
    def self.ensure_presence_of_pattern_full(pattern)
      ensure_presence_of_pattern(pattern)
      pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
    end

    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    #
    #The removal is done for every pattern in the tree which is a parent of leaf.
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #Nothing is reported as soon as one child of +root_pattern+ has a
    #non-empty childmap.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? {|child| child.result.childmap.size > 0}
      Scrubyt.log :WARNING, [
        "The extractor did not find any result instances. Most probably this is wrong.",
        "Check your extractor and if you are sure it should work, report a bug!"
      ]
    end

    ##
    #Remove those result instances of +pattern+ which do not contain (at any
    #depth) an instance of every child pattern named by an
    #ensure_presence_of_pattern constraint of +pattern+.
    def self.ensure_presence_of_pattern(pattern)
      #holds the name of those child patterns which have to be present as children of the input parameter
      epop_names = pattern.constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
      return if epop_names.empty?
      #all_parent_values holds instances extracted by pattern
      all_parent_values = []
      pattern.result.childmap.each { |h| all_parent_values << h.values }
      all_parent_values.flatten!
      #indices of result instances (of pattern) we are going to remove
      results_to_remove = Set.new
      pattern.children.each do |child_pattern|
        #only the child patterns named by a constraint can cause a removal
        next unless epop_names.include? child_pattern.name
        #all_child_values holds instances extracted by child_pattern
        all_child_values = []
        child_pattern.result.childmap.each { |h| all_child_values << h.values }
        all_child_values.flatten!
        #populate results_to_remove
        all_parent_values.each_with_index do |parent_value, i|
          #Hey! Not just the direct children but all the descendants
          @found_ancestor = false
          check_ancestors(parent_value, all_child_values)
          results_to_remove << i unless @found_ancestor
        end
      end
      #based on results_to_remove, populate the array 'rejected' which holds the actual instances
      #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
      #results_to_remove indices to their actual instances.
      #The instances of *every* childmap entry have to be collected: the index
      #runs over all of them, so keeping only the last entry's instances would
      #leave the others in the result.
      rejected = []
      i = -1
      pattern.result.childmap.each do |h|
        h.each do |_name, instances|
          instances.each do |instance|
            i += 1
            rejected << instance if results_to_remove.include? i
          end
        end
      end
      #Finally, do the actual delete!
      pattern.result.childmap.each { |h| h.each { |_name, instances| rejected.each { |r| instances.delete(r) } } }
    end

    ##
    #Set @found_ancestor to true if any element of +all_child_values+ is found
    #among the children of +parent_value+, or among the children of any
    #Hpricot::Elem below it. Does nothing unless +parent_value+ is an
    #Hpricot::Elem. The flag is never reset here - the caller has to do that.
    def self.check_ancestors(parent_value, all_child_values)
      return unless parent_value.is_a? Hpricot::Elem
      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
    end

    ##
    #For every result instance of +pattern+ collect what each child pattern
    #extracted for it; if more than one child has results for the instance,
    #the rows of those results are candidates for duplicates. From instances
    #having more than one row, the members of the last row are deleted from
    #the child results stored under that instance.
    #
    #NOTE(review): shorter child results are padded with nil in place (on the
    #arrays returned by lookup) so that they can be transposed - this side
    #effect of the original code is kept as it is.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each do |entry|
        entry.each do |_name, instances|
          instances.each do |instance|
            all_child_results = []
            pattern.children.each do |child|
              found = child.result.lookup(instance)
              all_child_results << found if found != nil
            end
            next if all_child_results.size <= 1
            longest_result = all_child_results.map {|e| e.size}.max
            all_child_results.each { |row| row << nil while row.size < longest_result }
            possible_duplicates[instance] = all_child_results.transpose
          end
        end
      end
      #Determine the 'real' duplicates
      real_duplicates = {}
      possible_duplicates.each do |instance, rows|
        next if rows.size == 1 || rows.empty?
        real_duplicates[instance] = rows.last
      end
      #Finally, remove them!
      pattern.children.each do |child|
        child.result.childmap.each do |entry|
          entry.each do |key, values|
            real_duplicates[key].each {|e| values.delete e} if real_duplicates.key? key
          end
        end
      end
    end #end of function
  end #end of class PostProcessor
end #end of module Scrubyt