RubyGems - scrubyt - Versions diffs - 0.2.6 → 0.2.8 - Mend

scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

data/CHANGELOG +59 -12
data/Rakefile +2 -2
data/lib/scrubyt.rb +24 -6
data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
data/lib/scrubyt/core/scraping/constraint.rb +53 -57
data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
data/lib/scrubyt/core/scraping/pattern.rb +292 -157
data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
data/lib/scrubyt/core/shared/extractor.rb +122 -163
data/lib/scrubyt/output/export.rb +59 -174
data/lib/scrubyt/output/post_processor.rb +4 -3
data/lib/scrubyt/output/result.rb +8 -9
data/lib/scrubyt/output/result_dumper.rb +81 -42
data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
data/lib/scrubyt/utils/shared_utils.rb +39 -26
data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
data/lib/scrubyt/utils/xpathutils.rb +31 -30
data/test/unittests/constraint_test.rb +11 -7
data/test/unittests/extractor_test.rb +6 -6
data/test/unittests/filter_test.rb +66 -66
metadata +22 -15
data/lib/scrubyt/core/scraping/filter.rb +0 -201

data/lib/scrubyt/utils/compound_example_lookup.rb CHANGED

@@ -4,27 +4,27 @@ module Scrubyt
   #the simple example and the compound example.
   #
   #This class is responsible for finding elements matched by compound examples.
-  #In the futre probably more sophisticated matching algorithms will be added
-  #(e.g. match the n-th which matches the text, or element that matches the
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
   #text but also contains a specific attribute etc.)
   class CompoundExampleLookup
-    def self.find_node_from_compund_example(doc, compound_example, next_link)
+    def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
       @partial_results = []
-      self.lookup_compound_example(doc, compound_example)
+      self.lookup_compound_example(doc, compound_example, index)
     end
 private
     #Lookup the first element which is matched by this compund example
     #
-    #A compound example is specified with :contains, :begins_with and
+    #A compound example is specified with :contains, :begins_with and
     #:ends_with descriptors - which can be both regexps or strings
     #
     #Example:
     #
     #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
-    def self.lookup_compound_example(doc, compound_example)
+    def self.lookup_compound_example(doc, compound_example, index)
       compound_example.each do |k,v|
-        v = Regexp.escape(v) if v.is_a? String
+        v = Regexp.escape(v) if v.is_a? String
         case k
           when :contains
             v = /#{v}/
@@ -39,12 +39,12 @@ private
           refine_partial_results(v)
         end
       end
-      @partial_results.first
+      @partial_results[index]
     end
     def self.refine_partial_results(regexp)
       @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
     end
   end #End of class CompoundExampleLookup
 end #End of module Scrubyt

data/lib/scrubyt/utils/ruby_extensions.rb ADDED

@@ -0,0 +1,113 @@
+class Module
+  def option_reader(key_default_hash)
+    key_default_hash.each do |key, default|
+      define_method(key) {
+        if @options[key].nil?
+          if default.is_a? Proc
+            instance_eval(&default)
+          else
+            default
+          end
+        else
+          @options[key]
+        end
+      }
+    end
+  end
+  def option_writer(*keys)
+    keys.each do |key|
+      define_method("#{key.to_s}=".to_sym) { |value|
+        @options[key] = value
+      }
+    end
+  end
+  def option(key, default=nil, writable=false)
+    option_reader(key => default)
+    option_writer(key) if writable
+  end
+  def option_accessor(key_default_hash)
+    key_default_hash.each do |key, default|
+      option(key, default, true)
+    end
+  end
+end
+class Range
+  def <=>(other)
+    self.begin <=> other.begin
+  end
+  def +(amount)
+   (self.begin + amount)..(self.end + amount)
+  end
+  def -(amount)
+   (self.begin - amount)..(self.end - amount)
+  end
+end
+module Math
+  def self.min(a, b)
+    a < b ? a : b
+  end
+  def self.max(a, b)
+    a > b ? a : b
+  end
+end
+class Array
+  def to_sexp
+    [:array, *to_sexp_array]
+  end
+  def to_sexp_array
+    collect { |element| element.to_sexp }
+  end
+end
+class Hash
+  def to_sexp
+    [:hash, *to_sexp_array]
+  end
+  def to_sexp_array
+    sexp = []
+    each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
+    sexp
+  end
+end
+class Symbol
+  def to_sexp
+    [:lit, self]
+  end
+end
+class String
+  def to_sexp
+    [:str, self]
+  end
+end
+class TrueClass
+  def to_sexp
+    [:true]
+  end
+end
+class FalseClass
+  def to_sexp
+    [:false]
+  end
+end
+class Proc
+  alias_method :parse_tree_to_sexp, :to_sexp
+  def to_sexp
+    [:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
+  end
+end

data/lib/scrubyt/utils/shared_utils.rb CHANGED

@@ -1,11 +1,8 @@
 module Scrubyt
   ##
-  #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
-  #
+  #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
+  #
   class SharedUtils
-    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
-    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
     #Entities to replace - need to make this more complete, or install htmlentities or similar package
     ENTITIES = {
         'quot'      => '"',
@@ -14,32 +11,48 @@ module Scrubyt
         'lt'        => '<',
         'gt'        => '>',
         'nbsp'      => ' '}
+    def self.prepare_text_for_comparison(text)
+      unescape_entities text
+      text.strip!
+      text
+    end
     #Unescape the entities in the HTML!
     def self.unescape_entities(text)
-        ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
-        text
-    end
+      ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
+      text
+    end
     #Entry point for finding the elements specified by examples
     def self.traverse_for_match(node, regexp)
-      @results = []
-      traverse_for_match_inner(node,regexp)
-      @results
+      results = []
+      traverse_for_match_inner = lambda { |node, regexp|
+        ft = prepare_text_for_comparison(node.inner_text)
+        if ft =~ regexp
+          node.instance_eval do
+            @match_data = $~
+            def match_data
+              @match_data
+            end
+          end
+          results << node
+          results.delete node.parent if node.is_a? Hpricot::Elem
+        end
+        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
+      }
+      traverse_for_match_inner.call(node,regexp)
+      results
     end
-private
-    def self.traverse_for_match_inner(node, regexp)
-      ft = unescape_entities(node.inner_text).strip
-      if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
-        @results << node
-        @results.delete node.parent
+    def self.get_backtrace
+      begin
+        raise
+      rescue Exception => ex
+        backtrace = ex.backtrace
       end
-      node.children.each do |child|
-        if child.instance_of? Hpricot::Elem
-          traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
-        end
-      end
-    end #end of method traverse_for_match
+      backtrace.slice!(0)
+      backtrace
+    end
   end #end of class SharedUtils
 end #end of module Scrubyt

data/lib/scrubyt/utils/simple_example_lookup.rb CHANGED

@@ -4,8 +4,8 @@ module Scrubyt
   #the simple example and the compound example.
   #
   #This class is responsible for finding elements matched by simple examples.
-  #In the futre probably more sophisticated matching algorithms will be added
-  #(e.g. match the n-th which matches the text, or element that matches the
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
   #text but also contains a specific attribute etc.)
   class SimpleExampleLookup
     #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
@@ -14,10 +14,10 @@ module Scrubyt
     # <a>Bon <b>nuit</b>, monsieur!</a>
     #
     #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
-    def self.find_node_from_text(doc, text, next_link)
+    def self.find_node_from_text(doc, text, next_link=false, index = 0)
       text.gsub!('»', '&#187;')
-      text = Regexp.escape(text) if text.is_a? String
-      SharedUtils.traverse_for_match(doc,/#{text}/).first
-    end
+      text = Regexp.escape(text) if text.is_a? String
+      SharedUtils.traverse_for_match(doc,/#{text}/)[index]
+    end
   end #End of class SimpleExampleLookup
 end #End of module Scrubyt

data/lib/scrubyt/utils/xpathutils.rb CHANGED

@@ -4,9 +4,9 @@ require 'hpricot'
 module Scrubyt
   ##
   #=<tt>Various XPath utility functions</tt>
-  class XPathUtils
-    #Find the LCA (Lowest Common Ancestor) of two nodes
+  class XPathUtils
+    #Find the LCA (Lowest Common Ancestor) of two nodes
     def self.lowest_common_ancestor(node1, node2)
       path1 = traverse_up(node1)
       path2 = traverse_up(node2)
@@ -19,7 +19,7 @@ module Scrubyt
       end
       path1.size > path2.size ? path1.last.parent : path2.last.parent
     end
     ##
     #Generate XPath for the given node
     #
@@ -28,7 +28,7 @@ module Scrubyt
     #_node_ - The node we are looking up the XPath for
     #
     #_stopnode_ - The Xpath generation is stopped and the XPath that
-    #was generated so far is returned if this node is reached.
+    #was generated so far is returned if this node is reached.
     #
     #_write_indices_ - whether the index inside the parent shuold be
     #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
@@ -36,7 +36,7 @@ module Scrubyt
       path = []
       indices = []
       found = false
-      while node.class != Hpricot::Doc do
+      while !node.nil? && node.class != Hpricot::Doc do
         if node == stopnode
           found = true
           break
@@ -53,32 +53,32 @@ module Scrubyt
         path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
       else
         path.reverse.each{ |node| result += "#{node}/" }
-      end
+      end
       "/" + result.chop
     end
-    #Generate an XPath of the node with indices, relatively to the given
+    #Generate an XPath of the node with indices, relatively to the given
     #relative_root.
     #
-    #For example if the elem's absolute XPath is /a/b/c,
+    #For example if the elem's absolute XPath is /a/b/c,
     #and the relative root's Xpath is a/b, the result of the function will
     #be /c.
     def self.generate_relative_XPath( elem,relative_root )
       return nil if (elem == relative_root)
       generate_XPath(elem, relative_root, true)
     end
-    #Generate a generalized XPath (i.e. without indices) of the node,
+    #Generate a generalized XPath (i.e. without indices) of the node,
     #relatively to the given relative_root.
     #
-    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
+    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
     #and the relative root's Xpath is a[1]/b[3], the result of the function will
-    #be /c.
+    #be /c.
     def self.generate_generalized_relative_XPath( elem,relative_root )
       return nil if (elem == relative_root)
       generate_XPath(elem, relative_root, false)
     end
     #Find an image based on the src of the example
     #
     #*parameters*
@@ -91,7 +91,7 @@ module Scrubyt
     #and thus can be easily pasted as an examle
     #
     #_index_ - there might be more images with the same src on the page -
-    #most typically the user will need the 0th - but if this is not the
+    #most typically the user will need the 0th - but if this is not the
     #case, there is the possibility to override this
     def self.find_image(doc, example, index=0)
       (doc/"//img[@src='#{example}']")[index]
@@ -99,19 +99,20 @@ module Scrubyt
     ##
     #Used to find the parent of a node with the given name - for example
-    #find the <form> node which is the parent of the <input> node
+    #find the <form> node which is the parent of the <input> node
     def self.traverse_up_until_name(node, name)
       while node.class != Hpricot::Doc do
+        raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
         break if node.name == name
         node = node.parent
       end
       node
     end
     ##
     #Used when automatically looking up href attributes (for detail or next links)
-    #If the detail pattern did not extract a link, we first look up it's
-    #children - and if we don't find a link, traverse up
+    #If the detail pattern did not extract a link, we first look up it's
+    #children - and if we don't find a link, traverse up
     def self.find_nearest_node_with_attribute(node, attribute)
       @node = nil
       return node if node.is_a? Hpricot::Elem and node[attribute]
@@ -119,13 +120,13 @@ module Scrubyt
       first_parent_node_with_attribute(node, attribute) if !@node
       @node
     end
     ##
     #Generalre relative XPath from two XPaths: a parent one, (which points higher in the tree),
     #and a child one. The result of the method is the relative XPath of the node pointed to
     #by the second XPath to the node pointed to by the firs XPath.
     def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
-      original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
+      original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
       pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
       i = 0
       pairs.each_with_index do |pair,index|
@@ -134,7 +135,7 @@ module Scrubyt
       end
       "/" + original_child_xpath_parts[i..-1].join('/')
     end
 private
     #Find the index of the child inside the parent
     #For example:
@@ -142,7 +143,7 @@ private
     #         tr
     #      /  |   \
     #    td   td   td
-    #    0    1    2
+    #    0    1    2
     #
     #The last row contains the indices of the td's from the
     #tow above.
@@ -154,7 +155,7 @@ private
      node.parent.children.each do |child|
        if child.class == Hpricot::Elem
          c += 1 if (child.name == node.name)
-         break if (node == child)
+         break if (node == child)
        end
      end
      c
@@ -169,21 +170,21 @@ private
       end
     path
     end
     def self.first_child_node_with_attribute(node, attribute)
       return if !node.instance_of? Hpricot::Elem || @node
       @node = node if node.attributes[attribute]
       node.children.each  { |child| first_child_node_with_attribute(child, attribute) }
     end
     def self.first_parent_node_with_attribute(node, attribute)
       return if !node.instance_of? Hpricot::Elem || @node
       @node = node if node.attributes[attribute]
       first_parent_node_with_attribute(node.parent, attribute)
-    end
+    end
     def self.to_general_XPath(xpath)
       xpath.gsub(/\[.+?\]/) {""}
-    end #End of method to_general_XPath
+    end #End of method to_general_XPath
   end #End of class XPathUtils
 end #End of module Scrubyt