RubyGems - scrubyt - Versions diffs - 0.1.0 → 0.1.9 - Mend

scrubyt 0.1.0 → 0.1.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (19)

data/CHANGELOG +34 -0
data/COPYING +340 -0
data/README +34 -5
data/Rakefile +6 -5
data/lib/scrubyt.rb +1 -0
data/lib/scrubyt/constraint.rb +12 -24
data/lib/scrubyt/constraint_adder.rb +3 -17
data/lib/scrubyt/export.rb +33 -17
data/lib/scrubyt/extractor.rb +74 -23
data/lib/scrubyt/filter.rb +52 -37
data/lib/scrubyt/pattern.rb +74 -30
data/lib/scrubyt/post_processor.rb +58 -0
data/lib/scrubyt/result.rb +2 -2
data/lib/scrubyt/result_dumper.rb +6 -0
data/lib/scrubyt/xpathutils.rb +52 -15
data/test/unittests/constraint_test.rb +0 -3
data/test/unittests/extractor_test.rb +11 -13
data/test/unittests/xpathutils_test.rb +31 -31
metadata +8 -5

data/lib/scrubyt/pattern.rb CHANGED

@@ -1,6 +1,5 @@
 require 'rubygems'
 require 'hpricot'
-require 'open-uri'
 module Scrubyt
   ##
@@ -43,7 +42,8 @@ module Scrubyt
     attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
                   :last_result, :result, :root_pattern, :example,  :block_count,
-                  :next_page, :limit, :extractor, :extracted_docs, :source, :sink
+                  :next_page, :limit, :extractor, :extracted_docs,
+                  :examples, :parent_of_leaf
     attr_reader :type, :generalize_set, :next_page_url
     def initialize (name, *args)
@@ -52,12 +52,17 @@ module Scrubyt
       @root_pattern = nil         #root pattern of the wrapper
       @children = []              #child patterns
       @filters = []               #filters of the wrapper
-      @sink = []                  #output of a pattern
-      @source = []                #input of a pattern
       @result = Result.new        #hierarchical results of the pattern
       @@instance_count = Hash.new(0)
+      @evaluated_examples = []
       @next_page = nil
-      filters << Scrubyt::Filter.new(self) #create a filter
+      if @examples == nil
+        filters << Scrubyt::Filter.new(self) #create a default filter
+      else
+        @examples.each do |example|
+          filters << Scrubyt::Filter.new(self,example) #create a filter
+        end
+      end
     end
     #Parse the args passed as *args; There is only one compulsory parameter to pattern: it's name
@@ -66,10 +71,8 @@ module Scrubyt
     #If an example is specified, it *MUST* be the first parameter; the order of the other
     #parameters is irrelevant
     def parse_args(args)
-      #If an example id defined, not only get it but also remove it so it
-      #does not interfere with the other possible string parameters
-      @example = args.delete_at(0) if args[0].instance_of? String
-      @example = args.delete_at(0) if args[0].instance_of? Regexp
+      #Grab any examples that are defined!
+      look_for_examples(args)
       args.each do |arg|
         arg.each do |k,v|
           #Set only the setable fields
@@ -96,7 +99,7 @@ module Scrubyt
       #This flag indicates that the user set 'generalize' to some value;
       #This way we can ensure that the explicit setting will not be overridden
       @generalize_set ||= false
-    end
+    end
     #Dispatcher function; The class was already too big so I have decided to factor
     #out some methods based on their functionality (like output, adding constraints)
@@ -162,7 +165,7 @@ module Scrubyt
       temp_document = generate_next_page_link(@next_page)
       return nil if temp_document == nil
       clear_sources_and_sinks(@root_pattern)
-      @root_pattern.extractor.fetch(temp_document, nil)
+      @root_pattern.extractor.fetch(temp_document)
       attach_current_document
     end
@@ -171,17 +174,18 @@ module Scrubyt
     #crawling to a new page
     def attach_current_document
       doc = @root_pattern.extractor.get_hpricot_doc
-      @source << doc
-      @sink << doc
+      filters[0].source << doc
+      filters[0].sink << doc
       @last_result ||= []
       @last_result << doc
-      @result.add_result(@source, @sink)
+      @result.add_result(filters[0].source, filters[0].sink)
     end
     ##
     #Based on the given examples, calculate the XPaths for the tree patterns
     def setup_examples
       get_root_pattern(self)
+      mark_leaf_parents(self)
       set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
       generate_examples(@root_pattern)
     end
@@ -192,10 +196,14 @@ module Scrubyt
     def evaluate
       #No need to evaluate if there is no parent pattern
       return if @parent == nil
-      @source = @parent.sink
-      @source.each do |source|
-        @filters.each do |filter|
+      all_filter_results = []
+      @filters.each do |filter|
+        filter_index = @filters.index(filter)
+        filter_index = 0 if @parent.filters.size <= filter_index
+        filter.source = @parent.filters[filter_index].sink
+        filter.source.each do |source|
           r = filter.evaluate(source)
+          next if r == nil
           if filter.constraints.size > 0
             #in the beginning, keys of result_hash are made up of all the results of the filter
             #with value = true; Later on, only those results will have 'true' value which are
@@ -206,24 +214,51 @@ module Scrubyt
               filter.constraints.each { |constraint| result_hash[res] &&= constraint.check(res) }
             end
             result = result_hash.reject {|k,v| k if !v}
-            sorted_result = r.reject {|e| !result.keys.include? e}
-            add_result(source, sorted_result)
+            sorted_result = r.reject {|e| !result.keys.include? e}
+            add_result(filter, source, sorted_result)
           else
-            add_result(source, r)
-          end
-        end
-      end
+            if ( (xe = @result.lookup(source)) != nil )
+              #puts "ha"; p xe
+            end
+            add_result(filter, source, r)
+          end#end of constraint check
+        end#end of source iteration
+      end#end of filter iteration
     end
     def get_instance_count
       @@instance_count
     end
+    def get_constraints
+      filters[0].constraints
+    end
 private
-    def add_result(source, results)
+    def look_for_examples(args)
+      if (args[0].is_a? String)
+        @examples = args.select {|e| e.is_a? String}
+        #Check if all the String parameters are really the first
+        #parameters
+        args[0..@examples.size-1].each do |example|
+          if !example.is_a? String
+            puts 'FATAL: Problem with example specification'
+          end
+        end
+      elsif (args[0].is_a? Regexp)
+        #Check if all the String parameters are really the first
+        #parameters
+        args[0..@examples.size].each do |example|
+          if !example.is_a? Regexp
+            puts 'FATAL: Problem with example specification'
+          end
+        end
+      end
+    end
+    def add_result(filter, source, results)
       results.each do |res|
-          @sink << res
+        filter.sink << res
         @result.add_result(source, res)
         @@instance_count[@name] += 1
       end
@@ -238,6 +273,13 @@ private
       end
     end
+    def mark_leaf_parents(pattern)
+      pattern.children.each { |child|
+        pattern.parent_of_leaf = true if child.children.size == 0
+      }
+      pattern.children.each { |child| mark_leaf_parents(child) }
+    end
     def set_root_pattern_whole_wrapper(pattern, root_pattern)
       pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
       pattern.root_pattern = root_pattern
@@ -249,15 +291,17 @@ private
     end
     def clear_sources_and_sinks(pattern)
-      pattern.source = []
-      pattern.sink = []
+      pattern.filters.each do |filter|
+        filter.source = []
+        filter.sink = []
+      end
       pattern.children.each {|child| clear_sources_and_sinks child}
     end
     def generate_next_page_link(example)
-      node = XPathUtils.find_node_from_text(@root_pattern.source[0], example)
+      node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example)
       return nil if node == nil
-      node.attributes['href']
+      node.attributes['href'].gsub('&amp;') {'&'}
     end # end of method generate_next_page_link
   end #end of class Pattern
 end #end of module Scrubyt

data/lib/scrubyt/post_processor.rb ADDED

@@ -0,0 +1,58 @@
+module Scrubyt
+##
+#=<tt>Post processing results after the extraction</tt>
+#Some things can not be carried out during evaluation - for example
+#the ensure_presence_of_pattern constraint (since the evaluation is top
+#to bottom, at a given point we don't know yet whether the currently
+#evaluated pattern will have a child pattern or not) or removing unneeded
+#results caused by evaluating multiple filters.
+#
+#The sole purpose of this class is to execute these post-processing tasks.
+  class PostProcessor
+    ##
+    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
+    #See for example the B&N scenario - the book titles are extracted two times
+    #for every pattern (since both examples generate the same XPath for them)
+    #but since always only one of the results has a price, the other is discarded
+    def self.remove_multiple_filter_duplicates(pattern)
+      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
+      pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
+    end
+private
+    def self.remove_multiple_filter_duplicates_intern(pattern)
+      possible_duplicates = {}
+      longest_result = 0
+      pattern.result.childmap.each { |r|
+        r.each do |k,v|
+          v.each do |x|
+            all_child_results = []
+            pattern.children.each { |child|
+              temp_res = child.result.lookup(x)
+              all_child_results << temp_res if temp_res != nil
+            }
+            next if all_child_results.size <= 1
+            longest_result = all_child_results.map {|e| e.size}.max
+            all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
+            possible_duplicates[x] = all_child_results.transpose
+          end
+        end
+      }
+      #Determine the 'real' duplicates
+      real_duplicates = {}
+      possible_duplicates.each { |k,v|
+        next if v.size == 1
+        v.each { |r| real_duplicates[k] = r }
+      }
+      #Finally, remove them!
+      pattern.children.each { |child|
+        child.result.childmap.each { |r|
+          r.each { |k,v|
+           real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
+          }
+        }
+      }
+    end
+  end
+end

data/lib/scrubyt/result.rb CHANGED

@@ -11,7 +11,7 @@ module Scrubyt
     def add_result(source, result)
       @childmap.each do |hash|
         if hash.keys[0] == source
-          hash[source] << result
+          hash[source] << result if !hash[source].include? result
           return
         end
       end
@@ -35,7 +35,7 @@ end#end of module Scrubyt
   #table
   #  source:         doc1
-  #  childmap        [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, doc2 => [table[1]s2, table[2]s2, table[3]s2] ]
+  #  childmap        [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
   #row
   #  source:         table1s1, table2s1, table3s1

data/lib/scrubyt/result_dumper.rb CHANGED

@@ -15,9 +15,15 @@ module Scrubyt
         pattern.last_result = lr
         to_xml_recursive(pattern, root)
       end
+      remove_empty_leaves(doc)
       doc
     end
+    def self.remove_empty_leaves(node)
+      node.remove if  node.elements.empty? && node.text == nil
+      node.elements.each {|child| remove_empty_leaves child }
+    end
     ##
     #Output the text of the pattern; If this pattern is a tree, collect the text from its
     #result instance node; otherwise rely on the last_result

data/lib/scrubyt/xpathutils.rb CHANGED

@@ -7,6 +7,13 @@ module Scrubyt
   class XPathUtils
     #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
     NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
+    ENTITIES = {
+        'quot'      => '"',
+        'apos'      => "'",
+        'amp'       => '&',
+        'lt'        => '<',
+        'gt'        => '>',
+        'nbsp'      => ' '}
     #From the example text defined by the user, find the lowest possible node with the text 'text'.
     #The text can be also a mixed content text, e.g.
@@ -17,14 +24,23 @@ module Scrubyt
     def self.find_node_from_text(doc, text)
       @node = nil
       @found = false
-      self.traverse_for_text(doc,text)
-      self.lowest_possible_node_with_text(@node, text)
+      self.traverse_for_full_text(doc,text)
+      self.lowest_possible_node_with_text(@node, text) if @node != nil
       #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
-      puts "Node for example #{text} Not found!" if (@found == false)
+      if (@found == false)
+        #Fallback to per node text lookup
+        self.traverse_for_node_text(doc,text)
+        if (@found == false)
+          puts "FATAL: Node for example #{text} Not found!"
+          puts "Please make sure your specified the example properly"
+        end
+      end
+      p @node
       @node
     end
-    #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
+    #Full text of the node; this is equivalent to Hpricot's inner_text
+    #(? be sure to check). Will be
     #replaced if Hpricot 0.5 will be released
     def self.full_text(node)
       result = ""
@@ -119,7 +135,7 @@ module Scrubyt
     #_index_ - there might be more images with the same src on the page -
     #most typically the user will need the 0th - but if this is not the
     #case, there is the possibility to override this
-    def self.find_image(doc, example, index=0)
+    def self.find_image(doc, example, index=1)
       (doc/"img[@src='#{example}']")[index]
     end
@@ -150,7 +166,7 @@ private
     #Note that in classic XPath, the indices start with 1 (rather
     #than 0).
     def self.find_index(node)
-     c = -1
+     c = 0
      node.parent.children.each do |child|
        if child.class == Hpricot::Elem
          c += 1 if (child.name == node.name)
@@ -170,27 +186,48 @@ private
     path
     end
-    def self.traverse_for_text(node, text)
+    def self.traverse_for_node_text(node, text)
       return if @found
       if (node.instance_of? Hpricot::Elem)
-        @node = node
-        ft = full_text(node)
-        @found = true if (ft.gsub('&nbsp;'){' '} == text)
+        node.traverse_text do |t|
+          if (t.to_s == text)
+            @found = true
+            @node = t.parent
+          end
+       end
+      end
+      node.children.each do |child|
+        if child.instance_of? Hpricot::Elem
+          traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
+        end
+      end
+    end
+    def self.traverse_for_full_text(node, text)
+      return if @found
+      if (node.instance_of? Hpricot::Elem)
+        ft = unescape_entities(full_text(node)).strip
+        if (ft == text)
+          @found = true
+          @node = node
+        end
       end
       node.children.each do |child|
-        traverse_nodes child if child.instance_of? Hpricot::Doc
         if child.instance_of? Hpricot::Elem
-          traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
+          traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
         end
       end
     end
+    def self.unescape_entities(text)
+        ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
+        text
+    end
     def self.lowest_possible_node_with_text(node, text)
       return if node.instance_of? Hpricot::Text
       @node = node if full_text(node) == text
-      node.children.each do |child|
-        lowest_possible_node_with_text(child, text)
-      end
+      node.children.each  { |child| lowest_possible_node_with_text(child, text) }
     end #End of method lowest_possible_node_with_text
   end #End of class XPathUtils
 end #End of module Scrubyt

data/test/unittests/constraint_test.rb CHANGED

@@ -1,6 +1,3 @@
-#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
-#require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
-#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
 require 'scrubyt'
 require 'test/unit'

data/test/unittests/extractor_test.rb CHANGED

@@ -1,5 +1,3 @@
-#require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
-#require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
 require 'scrubyt'
 require 'test/unit'
@@ -7,7 +5,7 @@ class ExtractorTest < Test::Unit::TestCase
   def test_create_one_pattern
     pattern = Scrubyt::Extractor.define do
       fetch File.join(File.dirname(__FILE__), "input/test.html")
-      pattern "x"
+      pattern "1"
     end
     assert_instance_of(Scrubyt::Pattern, pattern)
@@ -23,7 +21,7 @@ class ExtractorTest < Test::Unit::TestCase
   def test_create_child_pattern
     pattern = Scrubyt::Extractor.define do
       fetch File.join(File.dirname(__FILE__), "input/test.html")
-      parent { child "x" }
+      parent { child "2" }
     end
     assert_equal(pattern.name, "root")
@@ -39,10 +37,10 @@ class ExtractorTest < Test::Unit::TestCase
     pattern = Scrubyt::Extractor.define do
       fetch File.join(File.dirname(__FILE__), "input/test.html")
       parent do
-        child1 'x'
-        child2 'y'
-        child3 'z'
-        child4 'a'
+        child1 '1'
+        child2 '2'
+        child3 '3'
+        child4 '4'
       end
     end
@@ -61,7 +59,7 @@ class ExtractorTest < Test::Unit::TestCase
   def test_create_hierarchy
     tree = Scrubyt::Extractor.define do
       fetch File.join(File.dirname(__FILE__), "input/test.html")
-      a { b { c { d { e "x" } } } }
+      a { b { c { d { e "1" } } } }
     end
     assert_equal(tree.name,"root")
@@ -76,8 +74,8 @@ class ExtractorTest < Test::Unit::TestCase
     tree = Scrubyt::Extractor.define do
       fetch File.join(File.dirname(__FILE__), "input/test.html")
       a do
-        b 'x'
-        c 'y'
+        b '1'
+        c '2'
       end
     end
@@ -86,8 +84,8 @@ class ExtractorTest < Test::Unit::TestCase
     assert_not_nil(tree.children[0].filters[0])
     assert_nil(tree.children[0].example)
     assert_not_nil(tree.children[0].children[0].filters[0])
-    assert_equal(tree.children[0].children[0].example,'x')
+    assert_equal(tree.children[0].children[0].filters[0].example,'1')
     assert_not_nil(tree.children[0].children[1].filters[0])
-    assert_equal(tree.children[0].children[1].example,'y')
+    assert_equal(tree.children[0].children[1].filters[0].example,'2')
   end
 end