RubyGems - scrubyt - Versions diffs - 0.1.0 - Mend

scrubyt 0.1.0

Files changed (19) hide show

data/README +41 -0
data/Rakefile +55 -0
data/lib/scrubyt.rb +9 -0
data/lib/scrubyt/constraint.rb +185 -0
data/lib/scrubyt/constraint_adder.rb +86 -0
data/lib/scrubyt/export.rb +187 -0
data/lib/scrubyt/extractor.rb +187 -0
data/lib/scrubyt/filter.rb +144 -0
data/lib/scrubyt/pattern.rb +263 -0
data/lib/scrubyt/result.rb +43 -0
data/lib/scrubyt/result_dumper.rb +84 -0
data/lib/scrubyt/xpathutils.rb +196 -0
data/test/unittests/constraint_test.rb +106 -0
data/test/unittests/extractor_test.rb +93 -0
data/test/unittests/filter_test.rb +71 -0
data/test/unittests/input/constraint_test.html +55 -0
data/test/unittests/input/test.html +39 -0
data/test/unittests/xpathutils_test.rb +165 -0
metadata +63 -0

@@ -0,0 +1,43 @@
+module Scrubyt
+  ##
+  #=<tt>Represents the results of a pattern</tt>
+  class Result
+    attr_reader :childmap, :instances
+    def initialize
+      @childmap ||= []
+    end
+    def add_result(source, result)
+      @childmap.each do |hash|
+        if hash.keys[0] == source
+          hash[source] << result
+          return
+        end
+      end
+      @childmap << {source => [result]}
+    end
+    def lookup(last_result)
+      @childmap.each do |hashes|
+        hashes.each { |key, value| return value if (key == last_result) }
+      end
+      nil
+    end#end of method lookup
+  end#end of class Result
+end#end of module Scrubyt
+  #It roughly works like this:
+  #
+  # root
+  # source:         nil
+  # childmap:       [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
+  #table
+  #  source:         doc1
+  #  childmap:       [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
+  #row
+  #  source:         table[1]s1, table[2]s1, table[3]s1
+  #  childmap:       [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row4s1, row5s1]},
+  #                    {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row4s2, row5s2]}]

data/lib/scrubyt/result_dumper.rb ADDED

@@ -0,0 +1,84 @@
+require 'rexml/document'
+module Scrubyt
+  ##
+  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
+  class ResultDumper
+    ##
+    #Output the results as XML
+    def self.to_xml(pattern)
+      doc = REXML::Document.new
+      root = REXML::Element.new('root')
+      doc.add_element(root)
+      all_extracted_docs = pattern.last_result
+      all_extracted_docs.each do |lr|
+        pattern.last_result = lr
+        to_xml_recursive(pattern, root)
+      end
+      doc
+    end
+    ##
+    #Output the text of the pattern; If this pattern is a tree, collect the text from its
+    #result instance node; otherwise rely on the last_result
+    def self.to_text(pattern)
+       last_result = pattern.last_result
+       result = ""
+       if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_TREE
+         last_result.traverse_text { |t| result += t.to_s }
+       else
+         result = last_result
+       end
+       result
+    end
+    ##
+    #Print some simple statistics on the extracted results, like the count of extracted
+    #instances by each pattern
+    def self.print_statistics(pattern)
+      puts "\n" * 2
+      print_statistics_recursive(pattern,0)
+      puts
+    end
+private
+    def self.to_xml_recursive(pattern, element)
+      pattern.children.each do |child|
+        childresults = child.result.lookup(child.parent.last_result)
+        #Output text for leaf nodes only; Maybe add possibility to customize this later
+        if (childresults == nil)
+          res = ""
+          child.parent.last_result.traverse_text { |t| res += t.to_s }
+          if (child.parent.size == 0)
+            element.text = (res.gsub('&nbsp;'){' '}).strip unless element.parent.is_a? REXML::Document
+          end
+          next
+        end
+        generate_children(child, childresults, element)
+      end
+    end
+    def self.generate_children(child, childresults, element)
+        childresults.size.times do |num|
+          child.last_result = childresults[num]
+          res = ""
+          if child.last_result.instance_of? String
+            res = child.last_result
+          else
+            child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
+          end
+          child_node = REXML::Element.new(child.name)
+          child_node.text = (res.gsub('&nbsp;'){' '}).strip if (child.children.size == 0)
+          element.add_element(child_node)
+          to_xml_recursive(child, child_node)
+        end
+    end
+    def self.print_statistics_recursive(pattern, depth)
+      puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.") if pattern.name != 'root'
+      pattern.children.each do |child|
+        print_statistics_recursive(child, depth + 4)
+      end
+    end#end of method print_statistics_recursive
+  end #end of class ResultDumper
+end #end of module Scrubyt

data/lib/scrubyt/xpathutils.rb ADDED

@@ -0,0 +1,196 @@
+require 'rubygems'
+require 'hpricot'
+module Scrubyt
+  ##
+  #=<tt>Various XPath utility functions</tt>
+  class XPathUtils
+    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
+    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
+    #From the example text defined by the user, find the lowest possible node with the text 'text'.
+    #The text can be also a mixed content text, e.g.
+    #
+    # <a>Bon <b>nuit</b>, monsieur!</a>
+    #
+    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
+    def self.find_node_from_text(doc, text)
+      @node = nil
+      @found = false
+      self.traverse_for_text(doc,text)
+      self.lowest_possible_node_with_text(@node, text)
+      #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
+      puts "Node for example #{text} Not found!" if (@found == false)
+      @node
+    end
+    #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
+    #replaced if Hpricot 0.5 will be released
+    def self.full_text(node)
+      result = ""
+      node.traverse_text { |t| result += t.to_s }
+      result
+    end
+    #Find the LCA (Lowest Common Ancestor) of two nodes
+    def self.lowest_common_ancestor(node1, node2)
+      path1 = traverse_up(node1)
+      path2 = traverse_up(node2)
+      return node1.parent if path1 == path2
+      closure = nil
+      while (!path1.empty? && !path2.empty?)
+	    closure = path1.pop
+	    return closure.parent if (closure != path2.pop)
+      end
+      path1.size > path2.size ? path1.last.parent : path2.last.parent
+    end
+    ##
+    #Generate XPath for the given node
+    #
+    #*parameters*
+    #
+    #_node_ - The node we are looking the XPath for
+    #
+    #_stopnode_ - The Xpath generation is stopped and the XPath that
+    #was generated so far is returned if this node is reached.
+    #
+    #_write_indices_ - whether the index inside the parent shuold be
+    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
+    def self.generate_XPath(node, stopnode=nil, write_indices=false)
+      path = []
+      indices = []
+      found = false
+      while node.class != Hpricot::Doc do
+        if node == stopnode
+          found = true
+          break
+        end
+        path.push node.name
+        indices.push find_index(node) if write_indices
+        node = node.parent
+      end
+      #This condition ensures that if there is a stopnode, and we did not found it along the way,
+      #we return nil (since the stopnode is not contained in the path at all)
+      return nil if stopnode != nil && !found
+      result = ""
+      if write_indices
+        path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
+      else
+        path.reverse.each{ |node| result += "#{node}/" }
+      end
+      "/" + result.chop
+    end
+    #Generate an XPath of the node with indices, relatively to the given
+    #relative_root.
+    #
+    #For example if the elem's absolute XPath is /a/b/c,
+    #and the relative root's Xpath is a/b, the result of the function will
+    #be /c.
+    def self.generate_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, true)
+    end
+    #Generate a generalized XPath (i.e. without indices) of the node,
+    #relatively to the given relative_root.
+    #
+    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
+    #and the relative root's Xpath is a[1]/b[3], the result of the function will
+    #be /c.
+    def self.generate_generalized_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, false)
+    end
+    #Find an image based on the src of the example
+    #
+    #*parameters*
+    #
+    #_doc_ - The containing document
+    #
+    #_example_ - The value of the src attribute of the img tag
+    #This is convenient, since if the users rigth-clicks an image and
+    #copies image location, this string will be copied to the clipboard
+    #and thus can be easily pasted as an examle
+    #
+    #_index_ - there might be more images with the same src on the page -
+    #most typically the user will need the 0th - but if this is not the
+    #case, there is the possibility to override this
+    def self.find_image(doc, example, index=0)
+      (doc/"img[@src='#{example}']")[index]
+    end
+    ##
+    #Used to find the parent of a node with the given name - for example
+    #find the <form> node which is the parent of the <input> node
+    def self.traverse_up_until_name(node, name)
+      while node.class != Hpricot::Doc do
+        break if node.name == name
+        node = node.parent
+      end
+      node
+    end
+private
+    #Find the index of the child inside the parent
+    #For example:
+    #
+    #         tr
+    #      /  |   \
+    #    td   td   td
+    #    0    1    2
+    #
+    #The last row contains the indices of the td's from the
+    #tow above.
+    #
+    #Note that in classic XPath, the indices start with 1 (rather
+    #than 0).
+    def self.find_index(node)
+     c = -1
+     node.parent.children.each do |child|
+       if child.class == Hpricot::Elem
+         c += 1 if (child.name == node.name)
+         break if (node == child)
+       end
+     end
+     c
+    end
+    def self.traverse_up(node, stopnode=nil)
+      path = []
+      while node.class != Hpricot::Doc do
+        break if node == stopnode
+        path.push node
+        node = node.parent
+      end
+    path
+    end
+    def self.traverse_for_text(node, text)
+      return if @found
+      if (node.instance_of? Hpricot::Elem)
+        @node = node
+        ft = full_text(node)
+        @found = true if (ft.gsub('&nbsp;'){' '} == text)
+      end
+      node.children.each do |child|
+        traverse_nodes child if child.instance_of? Hpricot::Doc
+        if child.instance_of? Hpricot::Elem
+          traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
+        end
+      end
+    end
+    def self.lowest_possible_node_with_text(node, text)
+      return if node.instance_of? Hpricot::Text
+      @node = node if full_text(node) == text
+      node.children.each do |child|
+        lowest_possible_node_with_text(child, text)
+      end
+    end #End of method lowest_possible_node_with_text
+  end #End of class XPathUtils
+end #End of module Scrubyt

data/test/unittests/constraint_test.rb ADDED

@@ -0,0 +1,106 @@
+#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
+#require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
+#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
+require 'scrubyt'
+require 'test/unit'
+class ConstraintTest < Test::Unit::TestCase
+  def test_presence_of_attribute_constraints
+    data = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute('color' => 'red').
+                          ensure_absence_of_attribute('fill' => 'small_circles')
+    end
+    assert_equal(data.children[0].filters[0].constraints[0].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
+    assert_equal(data.children[0].filters[0].constraints[1].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
+  end
+  def test_presence_of_ancestor_node_constraints
+    data = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
+                     ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
+    end
+    assert_equal(data.children[0].filters[0].constraints[0].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
+    assert_equal(data.children[0].filters[0].constraints[1].type,
+                 Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
+  end
+  def test_ancestor_node_constraints
+    data0 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham')
+    end
+    data1 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
+    end
+    data2 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
+                     ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
+    end
+    data3 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'line').ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
+                     ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
+    end
+    data4 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'chunky_bacon').
+                     ensure_absence_of_attribute 'thickness' => '2'
+    end
+    assert_equal(data0.to_xml.to_s, "<root><shape>blue_circle</shape><shape>splatted_ellipse</shape></root>")
+    assert_equal(data1.to_xml.to_s, "<root><shape>splatted_ellipse</shape></root>")
+    assert_equal(data2.to_xml.to_s, "<root><shape>blue_circle</shape></root>")
+    assert_equal(data3.to_xml.to_s, "<root><shape>big_rectangle</shape></root>")
+    assert_equal(data4.to_xml.to_s, "<root><shape>ruby_diamond</shape></root>")
+  end
+  def test_attribute_constraints
+    data0 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red'
+    end
+    data1 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => '10x20'
+    end
+    data2 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => nil
+    end
+    data3 = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
+      (shape 'ruby_diamond').ensure_presence_of_attribute 'thickness' => nil
+    end
+    assert_equal(data0.to_xml.to_s, "<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape><shape>twinky_line</shape></root>")
+    assert_equal(data1.to_xml.to_s, "<root><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>")
+    assert_equal(data2.to_xml.to_s, "<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>")
+    assert_equal(data3.to_xml.to_s, "<root><shape>twinky_line</shape><shape>line</shape><shape>chunky_line</shape></root>")
+  end
+end

data/test/unittests/extractor_test.rb ADDED

@@ -0,0 +1,93 @@
+#require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
+#require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
+require 'scrubyt'
+require 'test/unit'
+class ExtractorTest < Test::Unit::TestCase
+  def test_create_one_pattern
+    pattern = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      pattern "x"
+    end
+    assert_instance_of(Scrubyt::Pattern, pattern)
+    assert_equal(pattern.name, "root")
+    assert_equal(pattern.children[0].name, 'pattern')
+    assert_equal(pattern.type, Scrubyt::Pattern::PATTERN_TYPE_ROOT)
+    assert_equal(pattern.output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
+    assert_equal(pattern.generalize, false)
+    assert_equal(pattern.children[0].generalize, true)
+  end
+  def test_create_child_pattern
+    pattern = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      parent { child "x" }
+    end
+    assert_equal(pattern.name, "root")
+    assert_equal(pattern.type, Scrubyt::Pattern::PATTERN_TYPE_ROOT)
+    assert_equal(pattern.output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
+    assert_equal(pattern.children[0].name, "parent")
+    assert_equal(pattern.children[0].type, Scrubyt::Pattern::PATTERN_TYPE_TREE)
+    assert_equal(pattern.children[0].output_type, Scrubyt::Pattern::OUTPUT_TYPE_MODEL)
+  end
+  def test_create_more_children
+    pattern = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      parent do
+        child1 'x'
+        child2 'y'
+        child3 'z'
+        child4 'a'
+      end
+    end
+    assert_equal(pattern.children[0].children.size, 4)
+    i = 0
+    3.times do
+      assert_equal(pattern.children[0].children[i].parent,
+                   pattern.children[0].children[i+=1].parent)
+      assert_equal(pattern.children[0].children[i].children, [])
+    end
+    assert_equal(pattern.children[0].children[3].parent, pattern.children[0])
+    assert_equal(pattern.children[0].children[3].parent.parent, pattern)
+  end
+  def test_create_hierarchy
+    tree = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      a { b { c { d { e "x" } } } }
+    end
+    assert_equal(tree.name,"root")
+    assert_equal(tree.children[0].name,"a")
+    assert_equal(tree.children[0].children[0].name,"b")
+    assert_equal(tree.children[0].children[0].children[0].name,"c")
+    assert_equal(tree.children[0].children[0].children[0].children[0].name,"d")
+  end
+  def test_empty_filter
+    tree = Scrubyt::Extractor.define do
+      fetch File.join(File.dirname(__FILE__), "input/test.html")
+      a do
+        b 'x'
+        c 'y'
+      end
+    end
+    assert_not_nil(tree.filters[0])
+    assert_nil(tree.example)
+    assert_not_nil(tree.children[0].filters[0])
+    assert_nil(tree.children[0].example)
+    assert_not_nil(tree.children[0].children[0].filters[0])
+    assert_equal(tree.children[0].children[0].example,'x')
+    assert_not_nil(tree.children[0].children[1].filters[0])
+    assert_equal(tree.children[0].children[1].example,'y')
+  end
+end