RubyGems - scrubyt - Versions diffs - 0.2.3 → 0.2.6 - Mend

scrubyt 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/CHANGELOG +30 -0
data/Rakefile +2 -2
data/lib/scrubyt.rb +5 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/filter.rb +35 -11
data/lib/scrubyt/core/scraping/pattern.rb +29 -22
data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
data/lib/scrubyt/core/shared/extractor.rb +111 -15
data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
data/lib/scrubyt/output/export.rb +69 -22
data/lib/scrubyt/output/result.rb +1 -0
data/lib/scrubyt/output/result_dumper.rb +26 -7
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/shared_utils.rb +45 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
data/lib/scrubyt/utils/xpathutils.rb +43 -92
data/test/unittests/simple_example_lookup_test.rb +68 -0
data/test/unittests/xpathutils_test.rb +0 -13
metadata +9 -3

data/lib/scrubyt/utils/compound_example_lookup.rb ADDED

@@ -0,0 +1,50 @@
+module Scrubyt
+  #=<tt>Lookup of compund examples</tt>
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example.
+  #
+  #This class is responsible for finding elements matched by compound examples.
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
+  #text but also contains a specific attribute etc.)
+  class CompoundExampleLookup
+    def self.find_node_from_compund_example(doc, compound_example, next_link)
+      @partial_results = []
+      self.lookup_compound_example(doc, compound_example)
+    end
+private
+    #Lookup the first element which is matched by this compund example
+    #
+    #A compound example is specified with :contains, :begins_with and
+    #:ends_with descriptors - which can be both regexps or strings
+    #
+    #Example:
+    #
+    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
+    def self.lookup_compound_example(doc, compound_example)
+      compound_example.each do |k,v|
+        v = Regexp.escape(v) if v.is_a? String
+        case k
+          when :contains
+            v = /#{v}/
+          when :begins_with
+            v = /^\s*#{v}/
+          when :ends_with
+            v = /#{v}\s*$/
+        end
+        if (@partial_results.empty?)
+          @partial_results = SharedUtils.traverse_for_match(doc, v)
+        else
+          refine_partial_results(v)
+        end
+      end
+      @partial_results.first
+    end
+    def self.refine_partial_results(regexp)
+      @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
+    end
+  end #End of class CompoundExampleLookup
+end #End of module Scrubyt

data/lib/scrubyt/utils/shared_utils.rb ADDED

@@ -0,0 +1,45 @@
+module Scrubyt
+  ##
+  #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
+  #
+  class SharedUtils
+    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
+    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
+    #Entities to replace - need to make this more complete, or install htmlentities or similar package
+    ENTITIES = {
+        'quot'      => '"',
+        'apos'      => "'",
+        'amp'       => '&',
+        'lt'        => '<',
+        'gt'        => '>',
+        'nbsp'      => ' '}
+    #Unescape the entities in the HTML!
+    def self.unescape_entities(text)
+        ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
+        text
+    end
+    #Entry point for finding the elements specified by examples
+    def self.traverse_for_match(node, regexp)
+      @results = []
+      traverse_for_match_inner(node,regexp)
+      @results
+    end
+private
+    def self.traverse_for_match_inner(node, regexp)
+      ft = unescape_entities(node.inner_text).strip
+      if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
+        @results << node
+        @results.delete node.parent
+      end
+      node.children.each do |child|
+        if child.instance_of? Hpricot::Elem
+          traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
+        end
+      end
+    end #end of method traverse_for_match
+  end #end of class SharedUtils
+end #end of module Scrubyt

data/lib/scrubyt/utils/simple_example_lookup.rb ADDED

@@ -0,0 +1,23 @@
+module Scrubyt
+  #=<tt>Lookup of simple examples</tt>
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example.
+  #
+  #This class is responsible for finding elements matched by simple examples.
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
+  #text but also contains a specific attribute etc.)
+  class SimpleExampleLookup
+    #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
+    #The text can be also a mixed content text, e.g.
+    #
+    # <a>Bon <b>nuit</b>, monsieur!</a>
+    #
+    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
+    def self.find_node_from_text(doc, text, next_link)
+      text.gsub!('»', '&#187;')
+      text = Regexp.escape(text) if text.is_a? String
+      SharedUtils.traverse_for_match(doc,/#{text}/).first
+    end
+  end #End of class SimpleExampleLookup
+end #End of module Scrubyt

data/lib/scrubyt/utils/xpathutils.rb CHANGED

@@ -4,54 +4,8 @@ require 'hpricot'
 module Scrubyt
   ##
   #=<tt>Various XPath utility functions</tt>
-  class XPathUtils
-    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
-    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
-    ENTITIES = {
-        'quot'      => '"',
-        'apos'      => "'",
-        'amp'       => '&',
-        'lt'        => '<',
-        'gt'        => '>',
-        'nbsp'      => ' '}
-    #From the example text defined by the user, find the lowest possible node with the text 'text'.
-    #The text can be also a mixed content text, e.g.
-    #
-    # <a>Bon <b>nuit</b>, monsieur!</a>
-    #
-    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
-    def self.find_node_from_text(doc, text, next_link)
-      @node = nil
-      @found = false
-      #digg next page hack
-      text.gsub!('»', '&#187;')
-      self.traverse_for_full_text(doc,text)
-      self.lowest_possible_node_with_text(@node, text) if @node != nil
-      if (@found == false)
-        #Fallback to per node text lookup
-        self.traverse_for_node_text(doc,text)
-        if (@found == false)
-          return nil if next_link
-          puts "!" * 65
-          puts "!!!!!! FATAL: Node for example #{text} Not found! !!!!!!"
-          puts "!!!!!! Please make sure you specified the example properly !!!!!!"
-          puts "!" * 65
-          exit
-        end
-      end
-      @node
-    end
-    #Full text of the node; this is equivalent to Hpricot's inner_text
-    #(? be sure to check). Will be
-    #replaced if Hpricot 0.5 will be released
-    def self.full_text(node)
-      result = ""
-      node.traverse_text { |t| result += t.to_s }
-      result
-    end
+  class XPathUtils
     #Find the LCA (Lowest Common Ancestor) of two nodes
     def self.lowest_common_ancestor(node1, node2)
       path1 = traverse_up(node1)
@@ -71,7 +25,7 @@ module Scrubyt
     #
     #*parameters*
     #
-    #_node_ - The node we are looking the XPath for
+    #_node_ - The node we are looking up the XPath for
     #
     #_stopnode_ - The Xpath generation is stopped and the XPath that
     #was generated so far is returned if this node is reached.
@@ -154,7 +108,33 @@ module Scrubyt
       node
     end
+    ##
+    #Used when automatically looking up href attributes (for detail or next links)
+    #If the detail pattern did not extract a link, we first look up it's
+    #children - and if we don't find a link, traverse up
+    def self.find_nearest_node_with_attribute(node, attribute)
+      @node = nil
+      return node if node.is_a? Hpricot::Elem and node[attribute]
+      first_child_node_with_attribute(node, attribute)
+      first_parent_node_with_attribute(node, attribute) if !@node
+      @node
+    end
+    ##
+    #Generalre relative XPath from two XPaths: a parent one, (which points higher in the tree),
+    #and a child one. The result of the method is the relative XPath of the node pointed to
+    #by the second XPath to the node pointed to by the firs XPath.
+    def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
+      original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
+      pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
+      i = 0
+      pairs.each_with_index do |pair,index|
+        i = index
+        break if pair[0] != pair[1]
+      end
+      "/" + original_child_xpath_parts[i..-1].join('/')
+    end
 private
     #Find the index of the child inside the parent
     #For example:
@@ -189,50 +169,21 @@ private
       end
     path
     end
-    def self.traverse_for_node_text(node, text)
-      return if @found
-      if (node.instance_of? Hpricot::Elem)
-        node.traverse_text do |t|
-          if (t.to_s == text)
-            @found = true
-            @node = t.parent
-          end
-       end
-      end
-      node.children.each do |child|
-        if child.instance_of? Hpricot::Elem
-          traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
-        end
-      end
-    end
-    def self.traverse_for_full_text(node, text)
-      return if @found
-      if (node.instance_of? Hpricot::Elem)
-        ft = unescape_entities(full_text(node)).strip
-        #puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
-        if (ft == text)
-          @found = true
-          @node = node
-        end
-      end
-      node.children.each do |child|
-        if child.instance_of? Hpricot::Elem
-          traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
-        end
-      end
+    #Depth-first search of _node_ and its descendants; stores the first
+    #element that has _attribute_ in @node.
+    #
+    #The guard is parenthesized on purpose: without the parentheses
+    #"|| @node" becomes part of the instance_of? argument and the search
+    #never stops at the first hit.
+    def self.first_child_node_with_attribute(node, attribute)
+      return if @node || !node.instance_of?(Hpricot::Elem)
+      @node = node if node.attributes[attribute]
+      node.children.each  { |child| first_child_node_with_attribute(child, attribute) }
     end
-    def self.unescape_entities(text)
-        ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
-        text
-    end
-    def self.lowest_possible_node_with_text(node, text)
-      return if node.instance_of? Hpricot::Text
-      @node = node if full_text(node) == text
-      node.children.each  { |child| lowest_possible_node_with_text(child, text) }
-    end #End of method lowest_possible_node_with_text
+    def self.first_parent_node_with_attribute(node, attribute)
+      return if !node.instance_of? Hpricot::Elem || @node
+      @node = node if node.attributes[attribute]
+      first_parent_node_with_attribute(node.parent, attribute)
+    end
+    def self.to_general_XPath(xpath)
+      xpath.gsub(/\[.+?\]/) {""}
+    end #End of method to_general_XPath
   end #End of class XPathUtils
 end #End of module Scrubyt

data/test/unittests/simple_example_lookup_test.rb ADDED

@@ -0,0 +1,68 @@
+require 'scrubyt'
+require 'test/unit'
+class SimpleExampleLookupTest
+  def setup
+    doc1 = <<-DOC
+    <a>
+        <b>
+                <c/>
+                <d>dddd</d>
+                <e>
+                    <f>fff</f>
+                    <k>kk</k>
+                    <j/>
+                    <l>lll</l>
+                    <m/>
+                    <n>nnn</n>
+                    <n>nnnnnn</n>
+                    <n>
+                        nnnnnnnnn
+                        <q/>
+                        <r>rrr</r>
+                    </n>
+                    <o>ooo</o>
+                    <n>nnnnnnnnnnnn</n>
+                    <p>ppp</p>
+                </e>
+        </b>
+        <g>ggg</g>
+    </a>
+    DOC
+    @doc1 = Hpricot(doc1)
+    @a = @doc1.children[1]
+    @b = @a.children[1]
+    @c = @b.children[1]
+    @d = @b.children[3]
+    @e = @b.children[5]
+    @f = @e.children[1]
+    @g = @a.children[@a.children.size-2]
+    @k = @e.children[3]
+    @j = @e.children[5]
+    @l = @e.children[7]
+    @m = @e.children[9]
+    @n_1 = @e.children[11]
+    @n_2 = @e.children[13]
+    @n_3 = @e.children[15]
+    @o = @e.children[17]
+    @n_4 = @e.children[19]
+    @p = @e.children[21]
+    @q = @n_3.children[1]
+    @r = @n_3.children[3]
+    #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
+  end
+  def test_find_node_from_text
+    elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
+    assert_instance_of(Hpricot::Elem, elem)
+    assert_equal(elem, @f)
+    elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
+    assert_equal(elem, @d)
+    elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
+    assert_equal(elem, @r)
+  end
+end

data/test/unittests/xpathutils_test.rb CHANGED

@@ -53,20 +53,7 @@ class XPathUtilsTest < Test::Unit::TestCase
     @r = @n_3.children[3]
     #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
   end
-  def test_find_node_from_text
-    elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
-    assert_instance_of(Hpricot::Elem, elem)
-    assert_equal(elem, @f)
-    elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
-    assert_equal(elem, @d)
-    elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
-    assert_equal(elem, @r)
-  end
   def test_lowest_common_ancestor
     lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
     lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: scrubyt
 version: !ruby/object:Gem::Version
-  version: 0.2.3
-date: 2007-02-20 00:00:00 +01:00
+  version: 0.2.6
+date: 2007-03-25 00:00:00 +01:00
 summary: A powerful Web-scraping framework
 require_paths:
 - lib
@@ -34,19 +34,24 @@ files:
 - CHANGELOG
 - Rakefile
 - lib/scrubyt.rb
+- lib/scrubyt/utils/shared_utils.rb
 - lib/scrubyt/utils/xpathutils.rb
+- lib/scrubyt/utils/simple_example_lookup.rb
+- lib/scrubyt/utils/compound_example_lookup.rb
 - lib/scrubyt/output/result_dumper.rb
 - lib/scrubyt/output/export.rb
 - lib/scrubyt/output/post_processor.rb
 - lib/scrubyt/output/result.rb
-- lib/scrubyt/core/navigation/fetch_action.rb
 - lib/scrubyt/core/navigation/navigation_actions.rb
+- lib/scrubyt/core/navigation/fetch_action.rb
 - lib/scrubyt/core/scraping/result_indexer.rb
 - lib/scrubyt/core/scraping/constraint_adder.rb
 - lib/scrubyt/core/scraping/constraint.rb
 - lib/scrubyt/core/scraping/filter.rb
 - lib/scrubyt/core/scraping/pattern.rb
 - lib/scrubyt/core/scraping/pre_filter_document.rb
+- lib/scrubyt/core/scraping/compound_example.rb
+- lib/scrubyt/core/shared/u_r_i_builder.rb
 - lib/scrubyt/core/shared/evaluation_context.rb
 - lib/scrubyt/core/shared/extractor.rb
 test_files:
@@ -56,6 +61,7 @@ test_files:
 - test/unittests/extractor_test.rb
 - test/unittests/xpathutils_test.rb
 - test/unittests/constraint_test.rb
+- test/unittests/simple_example_lookup_test.rb
 - test/unittests/input/constraint_test.html
 - test/unittests/input/test.html
 rdoc_options: []