RubyGems - scrubyt - Versions diffs - 0.1.0 - Mend

scrubyt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/README +41 -0
data/Rakefile +55 -0
data/lib/scrubyt.rb +9 -0
data/lib/scrubyt/constraint.rb +185 -0
data/lib/scrubyt/constraint_adder.rb +86 -0
data/lib/scrubyt/export.rb +187 -0
data/lib/scrubyt/extractor.rb +187 -0
data/lib/scrubyt/filter.rb +144 -0
data/lib/scrubyt/pattern.rb +263 -0
data/lib/scrubyt/result.rb +43 -0
data/lib/scrubyt/result_dumper.rb +84 -0
data/lib/scrubyt/xpathutils.rb +196 -0
data/test/unittests/constraint_test.rb +106 -0
data/test/unittests/extractor_test.rb +93 -0
data/test/unittests/filter_test.rb +71 -0
data/test/unittests/input/constraint_test.html +55 -0
data/test/unittests/input/test.html +39 -0
data/test/unittests/xpathutils_test.rb +165 -0
metadata +63 -0

data/lib/scrubyt/result.rb ADDED

@@ -0,0 +1,43 @@
+module Scrubyt
+  ##
+  #=<tt>Represents the results of a pattern</tt>
+  class Result
+    attr_reader :childmap, :instances
+    def initialize
+      @childmap ||= []
+    end
+    def add_result(source, result)
+      @childmap.each do |hash|
+        if hash.keys[0] == source
+          hash[source] << result
+          return
+        end
+      end
+      @childmap << {source => [result]}
+    end
+    def lookup(last_result)
+      @childmap.each do |hashes|
+        hashes.each { |key, value| return value if (key == last_result) }
+      end
+      nil
+    end#end of method lookup
+  end#end of class Result
+end#end of module Scrubyt
+  #It roughly works like this:
+  #
+  # root
+  # source:         nil
+  # childmap:       [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
+  #table
+  #  source:         doc1
+  #  childmap:       [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
+  #row
+  #  source:         table1s1, table2s1, table3s1
+  #  childmap:       [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
+  #                    {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]

data/lib/scrubyt/result_dumper.rb ADDED

@@ -0,0 +1,84 @@
+require 'rexml/document'
+module Scrubyt
+  ##
+  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
+  class ResultDumper
+    ##
+    #Output the results as XML
+    def self.to_xml(pattern)
+      doc = REXML::Document.new
+      root = REXML::Element.new('root')
+      doc.add_element(root)
+      all_extracted_docs = pattern.last_result
+      all_extracted_docs.each do |lr|
+        pattern.last_result = lr
+        to_xml_recursive(pattern, root)
+      end
+      doc
+    end
+    ##
+    #Output the text of the pattern; If this pattern is a tree, collect the text from its
+    #result instance node; otherwise rely on the last_result
+    def self.to_text(pattern)
+       last_result = pattern.last_result
+       result = ""
+       if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_TREE
+         last_result.traverse_text { |t| result += t.to_s }
+       else
+         result = last_result
+       end
+       result
+    end
+    ##
+    #Print some simple statistics on the extracted results, like the count of extracted
+    #instances by each pattern
+    def self.print_statistics(pattern)
+      puts "\n" * 2
+      print_statistics_recursive(pattern,0)
+      puts
+    end
+private
+    def self.to_xml_recursive(pattern, element)
+      pattern.children.each do |child|
+        childresults = child.result.lookup(child.parent.last_result)
+        #Output text for leaf nodes only; Maybe add possibility to customize this later
+        if (childresults == nil)
+          res = ""
+          child.parent.last_result.traverse_text { |t| res += t.to_s }
+          if (child.parent.size == 0)
+            element.text = (res.gsub('&nbsp;'){' '}).strip unless element.parent.is_a? REXML::Document
+          end
+          next
+        end
+        generate_children(child, childresults, element)
+      end
+    end
+    def self.generate_children(child, childresults, element)
+        childresults.size.times do |num|
+          child.last_result = childresults[num]
+          res = ""
+          if child.last_result.instance_of? String
+            res = child.last_result
+          else
+            child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
+          end
+          child_node = REXML::Element.new(child.name)
+          child_node.text = (res.gsub('&nbsp;'){' '}).strip if (child.children.size == 0)
+          element.add_element(child_node)
+          to_xml_recursive(child, child_node)
+        end
+    end
+    def self.print_statistics_recursive(pattern, depth)
+      puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.") if pattern.name != 'root'
+      pattern.children.each do |child|
+        print_statistics_recursive(child, depth + 4)
+      end
+    end#end of method print_statistics_recursive
+  end #end of class ResultDumper
+end #end of module Scrubyt

data/lib/scrubyt/xpathutils.rb ADDED

@@ -0,0 +1,196 @@
+require 'rubygems'
+require 'hpricot'
+module Scrubyt
+  ##
+  #=<tt>Various XPath utility functions</tt>
+  class XPathUtils
+    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
+    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
+    #From the example text defined by the user, find the lowest possible node with the text 'text'.
+    #The text can be also a mixed content text, e.g.
+    #
+    # <a>Bon <b>nuit</b>, monsieur!</a>
+    #
+    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
+    def self.find_node_from_text(doc, text)
+      @node = nil
+      @found = false
+      self.traverse_for_text(doc,text)
+      self.lowest_possible_node_with_text(@node, text)
+      #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
+      puts "Node for example #{text} Not found!" if (@found == false)
+      @node
+    end
+    #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
+    #replaced if Hpricot 0.5 will be released
+    def self.full_text(node)
+      result = ""
+      node.traverse_text { |t| result += t.to_s }
+      result
+    end
+    #Find the LCA (Lowest Common Ancestor) of two nodes
+    def self.lowest_common_ancestor(node1, node2)
+      path1 = traverse_up(node1)
+      path2 = traverse_up(node2)
+      return node1.parent if path1 == path2
+      closure = nil
+      while (!path1.empty? && !path2.empty?)
+	    closure = path1.pop
+	    return closure.parent if (closure != path2.pop)
+      end
+      path1.size > path2.size ? path1.last.parent : path2.last.parent
+    end
+    ##
+    #Generate XPath for the given node
+    #
+    #*parameters*
+    #
+    #_node_ - The node we are looking the XPath for
+    #
+    #_stopnode_ - The Xpath generation is stopped and the XPath that
+    #was generated so far is returned if this node is reached.
+    #
+    #_write_indices_ - whether the index inside the parent shuold be
+    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
+    def self.generate_XPath(node, stopnode=nil, write_indices=false)
+      path = []
+      indices = []
+      found = false
+      while node.class != Hpricot::Doc do
+        if node == stopnode
+          found = true
+          break
+        end
+        path.push node.name
+        indices.push find_index(node) if write_indices
+        node = node.parent
+      end
+      #This condition ensures that if there is a stopnode, and we did not found it along the way,
+      #we return nil (since the stopnode is not contained in the path at all)
+      return nil if stopnode != nil && !found
+      result = ""
+      if write_indices
+        path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
+      else
+        path.reverse.each{ |node| result += "#{node}/" }
+      end
+      "/" + result.chop
+    end
+    #Generate an XPath of the node with indices, relatively to the given
+    #relative_root.
+    #
+    #For example if the elem's absolute XPath is /a/b/c,
+    #and the relative root's Xpath is a/b, the result of the function will
+    #be /c.
+    def self.generate_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, true)
+    end
+    #Generate a generalized XPath (i.e. without indices) of the node,
+    #relatively to the given relative_root.
+    #
+    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
+    #and the relative root's Xpath is a[1]/b[3], the result of the function will
+    #be /c.
+    def self.generate_generalized_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, false)
+    end
+    #Find an image based on the src of the example
+    #
+    #*parameters*
+    #
+    #_doc_ - The containing document
+    #
+    #_example_ - The value of the src attribute of the img tag
+    #This is convenient, since if the users rigth-clicks an image and
+    #copies image location, this string will be copied to the clipboard
+    #and thus can be easily pasted as an examle
+    #
+    #_index_ - there might be more images with the same src on the page -
+    #most typically the user will need the 0th - but if this is not the
+    #case, there is the possibility to override this
+    def self.find_image(doc, example, index=0)
+      (doc/"img[@src='#{example}']")[index]
+    end
+    ##
+    #Used to find the parent of a node with the given name - for example
+    #find the <form> node which is the parent of the <input> node
+    def self.traverse_up_until_name(node, name)
+      while node.class != Hpricot::Doc do
+        break if node.name == name
+        node = node.parent
+      end
+      node
+    end
+private
+    #Find the index of the child inside the parent
+    #For example:
+    #
+    #         tr
+    #      /  |   \
+    #    td   td   td
+    #    0    1    2
+    #
+    #The last row contains the indices of the td's from the
+    #tow above.
+    #
+    #Note that in classic XPath, the indices start with 1 (rather
+    #than 0).
+    def self.find_index(node)
+     c = -1
+     node.parent.children.each do |child|
+       if child.class == Hpricot::Elem
+         c += 1 if (child.name == node.name)
+         break if (node == child)
+       end
+     end
+     c
+    end
+    def self.traverse_up(node, stopnode=nil)
+      path = []
+      while node.class != Hpricot::Doc do
+        break if node == stopnode
+        path.push node
+        node = node.parent
+      end
+    path
+    end
+    def self.traverse_for_text(node, text)
+      return if @found
+      if (node.instance_of? Hpricot::Elem)
+        @node = node
+        ft = full_text(node)
+        @found = true if (ft.gsub('&nbsp;'){' '} == text)
+      end
+      node.children.each do |child|
+        traverse_nodes child if child.instance_of? Hpricot::Doc
+        if child.instance_of? Hpricot::Elem
+          traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
+        end
+      end
+    end
+    def self.lowest_possible_node_with_text(node, text)
+      return if node.instance_of? Hpricot::Text
+      @node = node if full_text(node) == text
+      node.children.each do |child|
+        lowest_possible_node_with_text(child, text)
+      end
+    end #End of method lowest_possible_node_with_text
+  end #End of class XPathUtils
+end #End of module Scrubyt

data/test/unittests/constraint_test.rb ADDED

@@ -0,0 +1,106 @@
+#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
+#require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
+#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
+require 'scrubyt'
+require 'test/unit'
+class ConstraintTest < Test::Unit::TestCase
+  def test_presence_of_attribute_constraints
+    data = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute('color' => 'red').
+                          ensure_absence_of_attribute('fill' => 'small_circles')
+    end
+    assert_equal(data.children[0].filters[0].constraints[0].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
+    assert_equal(data.children[0].filters[0].constraints[1].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
+  end
+  def test_presence_of_ancestor_node_constraints
+    data = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
+                     ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
+    end
+    assert_equal(data.children[0].filters[0].constraints[0].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
+    assert_equal(data.children[0].filters[0].constraints[1].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
+  end
+  def test_ancestor_node_constraints
+    data0 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham')
+    end
+    data1 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
+    end
+    data2 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
+                     ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
+    end
+    data3 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'line').ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
+                     ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
+    end
+    data4 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'chunky_bacon').
+                     ensure_absence_of_attribute 'thickness' => '2'
+    end
+    assert_equal(data0.to_xml.to_s, "<root><shape>blue_circle</shape><shape>splatted_ellipse</shape></root>")
+    assert_equal(data1.to_xml.to_s, "<root><shape>splatted_ellipse</shape></root>")
+    assert_equal(data2.to_xml.to_s, "<root><shape>blue_circle</shape></root>")
+    assert_equal(data3.to_xml.to_s, "<root><shape>big_rectangle</shape></root>")
+    assert_equal(data4.to_xml.to_s, "<root><shape>ruby_diamond</shape></root>")
+  end
+  def test_attribute_constraints
+    data0 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red'
+    end
+    data1 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => '10x20'
+    end
+    data2 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => nil
+    end
+    data3 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'thickness' => nil
+    end
+    assert_equal(data0.to_xml.to_s, "<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape><shape>twinky_line</shape></root>")
+    assert_equal(data1.to_xml.to_s, "<root><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>")
+    assert_equal(data2.to_xml.to_s, "<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>")
+    assert_equal(data3.to_xml.to_s, "<root><shape>twinky_line</shape><shape>line</shape><shape>chunky_line</shape></root>")
+  end
+end

data/test/unittests/extractor_test.rb ADDED

@@ -0,0 +1,93 @@
+#require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
+#require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
+require 'scrubyt'
+require 'test/unit'
+class ExtractorTest < Test::Unit::TestCase
+  def test_create_one_pattern
+    pattern = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      pattern "x"
+    end
+    assert_instance_of(Scrubyt::Pattern, pattern)
+    assert_equal(pattern.name, "root")
+    assert_equal(pattern.children[0].name, 'pattern')
+    assert_equal(pattern.type, Scrubyt::Pattern::PATTERN_TYPE_ROOT)
+    assert_equal(pattern.output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
+    assert_equal(pattern.generalize, false)
+    assert_equal(pattern.children[0].generalize, true)
+  end
+  def test_create_child_pattern
+    pattern = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      parent { child "x" }
+    end
+    assert_equal(pattern.name, "root")
+    assert_equal(pattern.type, Scrubyt::Pattern::PATTERN_TYPE_ROOT)
+    assert_equal(pattern.output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
+    assert_equal(pattern.children[0].name, "parent")
+    assert_equal(pattern.children[0].type, Scrubyt::Pattern::PATTERN_TYPE_TREE)
+    assert_equal(pattern.children[0].output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
+  end
+  def test_create_more_children
+    pattern = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      parent do
+        child1 'x'
+        child2 'y'
+        child3 'z'
+        child4 'a'
+      end
+    end
+    assert_equal(pattern.children[0].children.size, 4)
+    i = 0
+    3.times do
+      assert_equal(pattern.children[0].children[i].parent,
+                   pattern.children[0].children[i+=1].parent)
+      assert_equal(pattern.children[0].children[i].children, [])
+    end
+    assert_equal(pattern.children[0].children[3].parent, pattern.children[0])
+    assert_equal(pattern.children[0].children[3].parent.parent, pattern)
+  end
+  def test_create_hierarchy
+    tree = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      a { b { c { d { e "x" } } } }
+    end
+    assert_equal(tree.name,"root")
+    assert_equal(tree.children[0].name,"a")
+    assert_equal(tree.children[0].children[0].name,"b")
+    assert_equal(tree.children[0].children[0].children[0].name,"c")
+    assert_equal(tree.children[0].children[0].children[0].children[0].name,"d")
+  end
+  def test_empty_filter
+    tree = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      a do
+        b 'x'
+        c 'y'
+      end
+    end
+    assert_not_nil(tree.filters[0])
+    assert_nil(tree.example)
+    assert_not_nil(tree.children[0].filters[0])
+    assert_nil(tree.children[0].example)
+    assert_not_nil(tree.children[0].children[0].filters[0])
+    assert_equal(tree.children[0].children[0].example,'x')
+    assert_not_nil(tree.children[0].children[1].filters[0])
+    assert_equal(tree.children[0].children[1].example,'y')
+  end
+end