RubyGems - scrubber-scrubyt - Versions diffs - 0.4.11 - Mend

scrubber-scrubyt 0.4.11

Files changed (45) hide show

data/CHANGELOG +343 -0
data/COPYING +340 -0
data/README +99 -0
data/Rakefile +101 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +167 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +140 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/lib/scrubyt.rb +43 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +115 -0

data/lib/scrubyt/utils/compound_example_lookup.rb ADDED Viewed

@@ -0,0 +1,50 @@
+module Scrubyt
+  #=<tt>Lookup of compund examples</tt>
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example.
+  #
+  #This class is responsible for finding elements matched by compound examples.
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
+  #text but also contains a specific attribute etc.)
+  class CompoundExampleLookup
+    def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
+      @partial_results = []
+      self.lookup_compound_example(doc, compound_example, index)
+    end
+private
+    #Lookup the first element which is matched by this compund example
+    #
+    #A compound example is specified with :contains, :begins_with and
+    #:ends_with descriptors - which can be both regexps or strings
+    #
+    #Example:
+    #
+    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
+    def self.lookup_compound_example(doc, compound_example, index)
+      compound_example.each do |k,v|
+        v = Regexp.escape(v) if v.is_a? String
+        case k
+          when :contains
+            v = /#{v}/
+          when :begins_with
+            v = /^\s*#{v}/
+          when :ends_with
+            v = /#{v}\s*$/
+        end
+        if (@partial_results.empty?)
+          @partial_results = SharedUtils.traverse_for_match(doc, v)
+        else
+          refine_partial_results(v)
+        end
+      end
+      @partial_results[index]
+    end
+    def self.refine_partial_results(regexp)
+      @partial_results = @partial_results.select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
+    end
+  end #End of class CompoundExampleLookup
+end #End of module Scrubyt

data/lib/scrubyt/utils/ruby_extensions.rb ADDED Viewed

@@ -0,0 +1,85 @@
+class Module # core extension: macros that define accessors backed by an @options hash
+  def option_reader(key_default_hash) # key => default pairs; defines one reader method per key
+    key_default_hash.each do |key, default|
+      define_method(key) {
+        if @options[key].nil? # nothing stored under this key: fall back to the default (@options itself is not set up here)
+          if default.is_a? Proc
+            instance_eval(&default) # a Proc default is evaluated in the context of the instance
+          else
+            default
+          end
+        else
+          @options[key]
+        end
+      }
+    end
+  end
+  def option_writer(*keys) # defines a "<key>=" method per key which stores the value in @options[key]
+    keys.each do |key|
+      define_method("#{key.to_s}=".to_sym) { |value|
+        @options[key] = value
+      }
+    end
+  end
+  def option(key, default=nil, writable=false) # always defines the reader; the writer only if writable
+    option_reader(key => default)
+    option_writer(key) if writable
+  end
+  def option_accessor(key_default_hash) # reader and writer for every key => default pair
+    key_default_hash.each do |key, default|
+      option(key, default, true)
+    end
+  end
+end
+class Range # core extension: ordering and shifting of ranges
+  def <=>(other) # ranges are ordered by their first element only; the end is ignored
+    self.begin <=> other.begin
+  end
+  def +(amount) # shifts both endpoints up by amount; the result is always built with '..'
+   (self.begin + amount)..(self.end + amount)
+  end
+  def -(amount) # shifts both endpoints down by amount; the result is always built with '..'
+   (self.begin - amount)..(self.end - amount)
+  end
+end
+module Math # adds two-argument min/max helpers to the Math module
+  def self.min(a, b) # the smaller of a and b (b when neither is smaller)
+    a < b ? a : b
+  end
+  def self.max(a, b) # the larger of a and b (b when neither is larger)
+    a > b ? a : b
+  end
+end
+#dec 16: Dropped - causes some errors w/ Rails
+#just some hack here to allow current examples' syntax:
+#table_data.to_xml.write(open('result.xml', 'w'), 1)
+#class String
+#  def write(stringio, add_indent=0)
+#    stringio.write((self.split("\n").collect { |line| ('  ' * add_indent) + line }).join("\n"))
+#  end
+#end
+#Hack to simulate the ancestor::tag selector of XPath
+module Hpricot
+  class Elem
+    def ancestors(tag = nil) # with tag: the nearest element named tag (self included); otherwise the Elements walked so far
+      element=self
+      path=Hpricot::Elements.new
+      while element.class != Hpricot::Doc do # climb until the document node is reached
+        return element if (tag && (tag ==element.name)) # note: a single element here, not an Elements collection
+        path.push element
+        element = element.parent
+      end
+      path # self and all its ancestors below the Doc, nearest first (also returned when no element is named tag)
+    end
+  end
+end

data/lib/scrubyt/utils/shared_utils.rb ADDED Viewed

@@ -0,0 +1,58 @@
+module Scrubyt
+  ##
+  #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
+  #
+  class SharedUtils
+    #Entities to replace - need to make this more complete, or install htmlentities or similar package
+    ENTITIES = {
+        'quot'      => '"',
+        'apos'      => "'",
+        'amp'       => '&',
+        'lt'        => '<',
+        'gt'        => '>',
+        'nbsp'      => ' '}
+    def self.prepare_text_for_comparison(text) # unescapes entities and strips surrounding whitespace IN PLACE; returns the same string
+      unescape_entities text
+      text.strip!
+      text
+    end
+    #Unescape the entities in the HTML! Works in place (gsub!) and returns the same string
+    def self.unescape_entities(text)
+      ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} } # NOTE(review): if 'amp' is replaced before 'lt'/'gt', "&amp;lt;" ends up as "<" - confirm this is acceptable
+      text
+    end
+    #Entry point for finding the elements specified by examples: returns the nodes at or below node whose tag-stripped text matches regexp
+    def self.traverse_for_match(node, regexp)
+      results = []
+      traverse_for_match_inner = lambda { |node, regexp|
+        ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, '')) # text content: inner HTML with every tag removed
+        if ft =~ regexp
+          node.instance_eval do # remember the MatchData on the node and expose it through a match_data method
+            @match_data = $~
+            def match_data
+              @match_data
+            end
+          end
+          results << node
+          results.delete node.parent if node.is_a? Hpricot::Elem # a matching child replaces its parent in the results, so the deeper match wins
+        end
+        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } # only element children are descended into
+      }
+      traverse_for_match_inner.call(node,regexp)
+      results
+    end
+    def self.get_backtrace # returns a backtrace with its first entry removed
+      begin
+        raise # NOTE(review): a bare raise re-raises $! when an exception is being handled, so this may capture that exception's backtrace - confirm; Kernel#caller may be simpler
+      rescue Exception => ex
+        backtrace = ex.backtrace
+      end
+      backtrace.slice!(0) # drop the first frame (this method itself when the raise above created a fresh exception)
+      backtrace
+    end
+  end #end of class SharedUtils
+end #end of module Scrubyt

data/lib/scrubyt/utils/simple_example_lookup.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Scrubyt
+  #=<tt>Lookup of simple examples</tt>
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example.
+  #
+  #This class is responsible for finding elements matched by simple examples.
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
+  #text but also contains a specific attribute etc.)
+  class SimpleExampleLookup
+    #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
+    #The text can be also a mixed content text, e.g.
+    #
+    # <a>Bon <b>nuit</b>, monsieur!</a>
+    #
+    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
+    def self.find_node_from_text(doc, text, next_link=false, index = 0)
+      text.gsub!('»', '&#187;')
+      #Process immediate attribute extraction (like "go to google.com/@href")
+      if text =~ /.+\/@.+$/
+        text = text.scan(/^(.+?)\/@.+$/)[0][0]
+      elsif text =~ /.+\[\d+\]$/
+        res = text.scan(/(.+)\[(\d+)\]$/)
+        text = res[0][0]
+        index = res[0][1].to_i
+      elsif text =~ /.+\[.+\]$/
+        final_element_name = text.scan(/^(.+?)\[/)[0][0]
+        text = text.scan(/\[(.+?)\]/)[0][0]
+      end
+      if final_element_name
+        text = Regexp.escape(text) if text.is_a? String
+        result = SharedUtils.traverse_for_match(doc,/#{text}/)[index]
+        result = XPathUtils.traverse_up_until_name(result,final_element_name)
+      else
+        text = Regexp.escape(text) if text.is_a? String
+        result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
+      end
+    end
+  end #End of class SimpleExampleLookup
+end #End of module Scrubyt

data/lib/scrubyt/utils/xpathutils.rb ADDED Viewed

@@ -0,0 +1,202 @@
+require 'rubygems'
+require 'hpricot'
+module Scrubyt
+  ##
+  #=<tt>Various XPath utility functions</tt>
+  class XPathUtils
+    #Find the LCA (Lowest Common Ancestor) of two nodes
+    def self.lowest_common_ancestor(node1, node2)
+      path1 = traverse_up(node1)
+      path2 = traverse_up(node2)
+      return node1.parent if path1 == path2
+      closure = nil
+      while (!path1.empty? && !path2.empty?)
+	    closure = path1.pop
+	    return closure.parent if (closure != path2.pop)
+      end
+      path1.size > path2.size ? path1.last.parent : path2.last.parent
+    end
+    ##
+    #Generate XPath for the given node
+    #
+    #*parameters*
+    #
+    #_node_ - The node we are looking up the XPath for
+    #
+    #_stopnode_ - The Xpath generation is stopped and the XPath that
+    #was generated so far is returned if this node is reached.
+    #
+    #_write_indices_ - whether the index inside the parent shuold be
+    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
+    def self.generate_XPath(node, stopnode=nil, write_indices=false)
+      path = []
+      indices = []
+      found = false
+      while !node.nil? && node.class != Hpricot::Doc do
+        if node == stopnode
+          found = true
+          break
+        end
+        path.push node.name
+        indices.push find_index(node) if write_indices
+        node = node.parent
+      end
+      #This condition ensures that if there is a stopnode, and we did not found it along the way,
+      #we return nil (since the stopnode is not contained in the path at all)
+      return nil if stopnode != nil && !found
+      result = ""
+      if write_indices
+        path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
+      else
+        path.reverse.each{ |node| result += "#{node}/" }
+      end
+      "/" + result.chop
+    end
+    #Generate an XPath of the node with indices, relatively to the given
+    #relative_root.
+    #
+    #For example if the elem's absolute XPath is /a/b/c,
+    #and the relative root's Xpath is a/b, the result of the function will
+    #be /c.
+    def self.generate_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, true)
+    end
+    #Generate a generalized XPath (i.e. without indices) of the node,
+    #relatively to the given relative_root.
+    #
+    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
+    #and the relative root's Xpath is a[1]/b[3], the result of the function will
+    #be /c.
+    def self.generate_generalized_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, false)
+    end
+    #Find an image based on the src of the example
+    #
+    #*parameters*
+    #
+    #_doc_ - The containing document
+    #
+    #_example_ - The value of the src attribute of the img tag
+    #This is convenient, since if the users rigth-clicks an image and
+    #copies image location, this string will be copied to the clipboard
+    #and thus can be easily pasted as an examle
+    #
+    #_index_ - there might be more images with the same src on the page -
+    #most typically the user will need the 0th - but if this is not the
+    #case, there is the possibility to override this
+    def self.find_image(doc, example, index=0)
+      if example =~ /\.(jpg|png|gif|jpeg)(\[\d+\])$/
+        res = example.scan(/(.+)\[(\d+)\]$/)
+        example = res[0][0]
+        index = res[0][1].to_i
+      end
+      (doc/"//img[@src='#{example}']")[index]
+    end
+    ##
+    #Used to find the parent of a node with the given name - for example
+    #find the <form> node which is the parent of the <input> node
+    def self.traverse_up_until_name(node, name)
+      while node.class != Hpricot::Doc do
+        #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
+        return nil unless node
+        break if node.name == name
+        node = node.parent
+      end
+      node
+    end
+    ##
+    #Used when automatically looking up href attributes (for detail or next links)
+    #If the detail pattern did not extract a link, we first look up it's
+    #children - and if we don't find a link, traverse up
+    def self.find_nearest_node_with_attribute(node, attribute)
+      @node = nil
+      return node if node.is_a? Hpricot::Elem and node[attribute]
+      first_child_node_with_attribute(node, attribute)
+      first_parent_node_with_attribute(node, attribute) if !@node
+      @node
+    end
+    ##
+    #Generalre relative XPath from two XPaths: a parent one, (which points higher in the tree),
+    #and a child one. The result of the method is the relative XPath of the node pointed to
+    #by the second XPath to the node pointed to by the firs XPath.
+    def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
+      original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
+      pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
+      i = 0
+      pairs.each_with_index do |pair,index|
+        i = index
+        break if pair[0] != pair[1]
+      end
+      "/" + original_child_xpath_parts[i..-1].join('/')
+    end
+    def self.to_full_XPath(doc, xpath, generalize)
+      elem = doc/xpath
+      elem = elem.map[0] if elem.is_a? Hpricot::Elements
+      XPathUtils.generate_XPath(elem, nil, generalize)
+    end
+private
+    #Find the index of the child inside the parent
+    #For example:
+    #
+    #         tr
+    #      /  |   \
+    #    td   td   td
+    #    0    1    2
+    #
+    #The last row contains the indices of the td's from the
+    #tow above.
+    #
+    #Note that in classic XPath, the indices start with 1 (rather
+    #than 0).
+    def self.find_index(node)
+     c = 0
+     node.parent.children.each do |child|
+       if child.class == Hpricot::Elem
+         c += 1 if (child.name == node.name)
+         break if (node == child)
+       end
+     end
+     c
+    end
+    def self.traverse_up(node, stopnode=nil)
+      path = []
+      while node.class != Hpricot::Doc do
+        break if node == stopnode
+        path.push node
+        node = node.parent
+      end
+    path
+    end
+    def self.first_child_node_with_attribute(node, attribute)
+      return if !node.instance_of? Hpricot::Elem || @node
+      @node = node if node.attributes[attribute]
+      node.children.each  { |child| first_child_node_with_attribute(child, attribute) }
+    end
+    def self.first_parent_node_with_attribute(node, attribute)
+      return if !node.instance_of? Hpricot::Elem || @node
+      @node = node if node.attributes[attribute]
+      first_parent_node_with_attribute(node.parent, attribute)
+    end
+    def self.to_general_XPath(xpath)
+      xpath.gsub(/\[.+?\]/) {""}
+    end #End of method to_general_XPath
+  end #End of class XPathUtils
+end #End of module Scrubyt

data/lib/scrubyt.rb ADDED Viewed

@@ -0,0 +1,43 @@
+$KCODE = "u"
+require "jcode"
+#ruby core
+require "open-uri"
+require "erb"
+#gems
+require "rexml/text"
+require "rubygems"
+require "mechanize"
+require "hpricot"
+#scrubyt
+require "#{File.dirname(__FILE__)}/scrubyt/logging"
+require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
+require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"

data/test/blackbox_test.rb ADDED Viewed

@@ -0,0 +1,60 @@
+$lib_path = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+$:.unshift $lib_path
+require 'scrubyt'
+require 'test/unit'
+def perform_test(test_path, detailed = false) # runs one extractor file and asserts that its XML output equals the expected XML
+  out = $stdout
+  $stdout = StringIO.new unless detailed # swallow everything printed during the run unless detailed (NOTE(review): no require 'stringio' is visible here - presumably loaded by another library)
+  cwd = Dir.getwd
+  Dir.chdir(File.dirname(test_path)) # the file names below are relative to the test's directory
+  out.puts "Test: #{test_path}" if detailed
+  out.puts "========== Print Output ==========" if detailed
+  begin
+    expected_xml = File.read(File.basename(test_path)[0..-4] + ".expected.xml") # drops the last three characters (".rb") and appends ".expected.xml"
+    scrubyt_result_native = Scrubyt::Extractor.load(File.basename(test_path))
+    exported_code = scrubyt_result_native.export({:template => 'lambda'})
+    scrubyt_result_exported = Scrubyt::Extractor.define(&eval(exported_code)) # the exported code is eval'ed and run as a second extractor
+  ensure
+    if detailed # dump all intermediate artefacts, also when one of the steps above raised
+      out.puts "========== Native Extractor =========="
+      out.puts IO.read(File.basename(test_path))
+      out.puts "========== Exported Extractor =========="
+      out.puts exported_code
+      out.puts "========== Expected =========="
+      out.puts expected_xml
+      out.puts "========== Result (native) =========="
+      out.puts scrubyt_result_native.to_xml
+      out.puts "========== Result (exported) =========="
+      out.puts scrubyt_result_exported.to_xml
+    end
+  end
+  assert_equal expected_xml, scrubyt_result_native.to_xml
+  assert_equal expected_xml, scrubyt_result_exported.to_xml
+ensure # always restore the working directory and the real $stdout
+  Dir.chdir(cwd)
+  $stdout = out
+end
+if $0 == __FILE__ && ARGV[0] # run directly with an argument: execute just that one test in detailed mode
+  include Test::Unit::Assertions # perform_test calls assert_equal at top level here
+  perform_test(ARGV[0], true)
+  exit # leave before the BlackboxTest class below gets defined
+end
+class BlackboxTest < Test::Unit::TestCase # one generated test method per .rb file found below blackbox_tests/
+  tests = Dir.glob(File.join(File.dirname(__FILE__), 'blackbox_tests', '**', '*.rb'))
+  tests = tests.sort
+  tests.each do |test_path|
+    define_method("test_#{test_path.gsub('/', '_')}") do # method name: "test_" plus the path with '/' replaced by '_'
+      perform_test(test_path)
+    end
+  end
+end

data/test/blackbox_tests/basic/multi_root.rb ADDED Viewed

@@ -0,0 +1,6 @@
+lambda do # blackbox fixture: the extractor definition is the body of this lambda
+  fetch(File.join(File.dirname(__FILE__), "three_divs.html")) # page located in the same directory as this file
+  entry '1' # presumably a pattern named 'entry' with the example text '1' - confirm against the scRUBYt! DSL
+  data '4' # a second statement at the same level (presumably a second root pattern, as the file name suggests)
+end

data/test/blackbox_tests/basic/simple.rb ADDED Viewed

@@ -0,0 +1,5 @@
+lambda do # blackbox fixture: the extractor definition is the body of this lambda
+  fetch(File.join(File.dirname(__FILE__), "three_divs.html")) # page located in the same directory as this file
+  entry '1' # presumably a pattern named 'entry' with the example text '1' - confirm against the scRUBYt! DSL
+end

data/test/blackbox_tests/detail_page/one_detail_page.rb ADDED Viewed

@@ -0,0 +1,9 @@
+lambda do # blackbox fixture: the extractor definition is the body of this lambda
+  fetch(File.join(File.dirname(__FILE__), "main_page_1.html")) # page located in the same directory as this file
+  main 'Main 1' do # presumably a pattern named 'main' with the example text 'Main 1' - confirm against the scRUBYt! DSL
+    xyz_detail do # presumably a detail-page pattern (name ending in _detail) - confirm against the scRUBYt! DSL
+      detail 'Detail 1'
+    end
+  end
+end

data/test/blackbox_tests/detail_page/two_detail_pages.rb ADDED Viewed

@@ -0,0 +1,9 @@
+lambda do # blackbox fixture: the extractor definition is the body of this lambda
+  fetch(File.join(File.dirname(__FILE__), "main_page_2.html")) # page located in the same directory as this file
+  main 'Main 1' do # presumably a pattern named 'main' with the example text 'Main 1' - confirm against the scRUBYt! DSL
+    xyz_detail do # presumably a detail-page pattern (name ending in _detail) - confirm against the scRUBYt! DSL
+      detail 'Detail 1'
+    end
+  end
+end

data/test/blackbox_tests/next_page/next_page_link.rb ADDED Viewed

@@ -0,0 +1,7 @@
+lambda do # blackbox fixture: the extractor definition is the body of this lambda
+  fetch(File.join(File.dirname(__FILE__), "page_1.html")) # page located in the same directory as this file
+  entry '1' # presumably a pattern named 'entry' with the example text '1' - confirm against the scRUBYt! DSL
+  next_page 'Next' # presumably follows the link whose text is 'Next' to further pages - confirm against the scRUBYt! DSL
+end

data/test/blackbox_tests/next_page/page_list_links.rb ADDED Viewed

@@ -0,0 +1,7 @@
+lambda do # blackbox fixture: the extractor definition is the body of this lambda
+  fetch(File.join(File.dirname(__FILE__), "page_1.html")) # page located in the same directory as this file
+  entry '1' # presumably a pattern named 'entry' with the example text '1' - confirm against the scRUBYt! DSL
+  page_list 'Page 2' # presumably identifies the list of page links by the example link text 'Page 2' - confirm against the scRUBYt! DSL
+end