RubyGems - scrubyt - Versions diffs - 0.2.0 → 0.2.3 - Mend

scrubyt 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

data/CHANGELOG +132 -1
data/Rakefile +4 -2
data/lib/scrubyt.rb +15 -10
data/lib/scrubyt/core/navigation/fetch_action.rb +152 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +106 -0
data/lib/scrubyt/{constraint.rb → core/scraping/constraint.rb} +0 -0
data/lib/scrubyt/{constraint_adder.rb → core/scraping/constraint_adder.rb} +0 -0
data/lib/scrubyt/{filter.rb → core/scraping/filter.rb} +22 -4
data/lib/scrubyt/{pattern.rb → core/scraping/pattern.rb} +21 -98
data/lib/scrubyt/core/scraping/pre_filter_document.rb +13 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +88 -0
data/lib/scrubyt/core/shared/evaluation_context.rb +97 -0
data/lib/scrubyt/core/shared/extractor.rb +116 -0
data/lib/scrubyt/{export.rb → output/export.rb} +14 -8
data/lib/scrubyt/output/post_processor.rb +137 -0
data/lib/scrubyt/{result.rb → output/result.rb} +0 -0
data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} +0 -7
data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} +5 -2
data/test/unittests/pattern_test.rb +27 -0
metadata +40 -17
data/lib/scrubyt/extractor.rb +0 -279
data/lib/scrubyt/post_processor.rb +0 -73

data/lib/scrubyt/{export.rb → output/export.rb} RENAMED

@@ -109,7 +109,7 @@ private
       first_line = contents.scan(/.*Extractor\.define.*/)
       #During wrapper construction, we count the number of blocks; add one occurrence of
       #end (to close the block of the extractor definition)
-      count = pattern.root_pattern.block_count + 1
+      count = pattern.evaluation_context.block_count + 1
       #Construct the extractor definition matching regexp based on the number of ends
       definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
       #Since the regexp matching the extractor definition was multiline, get the first
@@ -117,14 +117,13 @@ private
       rows = definition[0].split("\n")
       #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
       #patterns could be matched very easily from the extractor definition (because they begun
-      #with 'P.'). Now that P has been removed, mimick it!
-      keywords = ['fetch', 'fill_textfield', 'submit', 'end', 'click_link']
+      #with 'P.'). Now that P has been removed, mimick it!
       rows.each do |row|
         #Do not prepend P. to comments and empty lines
         next if (row.strip =~ /^#/ || row.strip == '')
         #Do not prepend P. to any of the reserved keywords
         jump_to_next = false
-        keywords.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
+        NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
         next if jump_to_next
         #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
         row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
@@ -180,8 +179,13 @@ private
     def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
       return if name=='root'
-      full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
-      examples = full_line.split(",")
+      parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
+      if parens.empty?
+        full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
+      else
+        full_line = parens[0][0]
+      end
+      examples = full_line.split(",")
       examples.reject! {|exa| exa.strip!;  exa[0..0] != %q{"} && exa[0..0] != %q{'} }
       all_xpaths = ""
       examples.each do |e|
@@ -193,9 +197,11 @@ private
       end
       replacing_xpath = full_line.include?('{') ? "P.#{name}('#{all_xpaths}')" :
                                                   "P.#{name} #{all_xpaths}"
-      @full_definition.sub!(/P\.#{name}\s+#{left_delimiter}(.*)#{right_delimiter}/) do
+      optional_paren_escaped = parens.empty? ? '' : '\('
+      optional_paren = parens.empty? ? '' : '('
+      @full_definition.sub!(/P\.#{name}\s*#{optional_paren_escaped}#{left_delimiter}(.*)#{right_delimiter}/) do
         @name_to_xpath_map.delete("#{name}")
-        replacing_xpath
+        optional_paren + replacing_xpath
       end
     end

data/lib/scrubyt/output/post_processor.rb ADDED

@@ -0,0 +1,137 @@
+module Scrubyt
+require 'set'
+##
+#=<tt>Post processing results after the extraction</tt>
+#Some things can not be carried out during evaluation - for example
+#the ensure_presence_of_pattern constraint (since the evaluation is top
+#to bottom, at a given point we don't know yet whether the currently
+#evaluated pattern will have a child pattern or not) or removing unneeded
+#results caused by evaluating multiple filters.
+#
+#The sole purpose of this class is to execute these post-processing tasks.
+  class PostProcessor
+    ##
+    #This is just a convenience method do call all the postprocessing
+    #functionality and checks
+    def self.apply_post_processing(root_pattern)
+      ensure_presence_of_pattern_full(root_pattern)
+      remove_multiple_filter_duplicates(root_pattern) if root_pattern.children[0].filters.size > 1
+      report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
+    end
+    ##
+    #Apply the ensure_presence_of_pattern constraint on
+    #the full extractor
+    def self.ensure_presence_of_pattern_full(pattern)
+      ensure_presence_of_pattern(pattern)
+      pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
+    end
+    ##
+    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
+    #See for example the B&N scenario - the book titles are extracted two times
+    #for every pattern (since both examples generate the same XPath for them)
+    #but since always only one of the results has a price, the other is discarded
+    def self.remove_multiple_filter_duplicates(pattern)
+      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
+      pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
+    end
+    ##
+    #Issue an error report if the document did not extract anything.
+    #Probably this is because the structure of the page changed or
+    #because of some rather nasty bug - in any case, something wrong
+    #is going on, and we need to inform the user about this!
+    def self.report_if_no_results(root_pattern)
+      results_found = false
+      root_pattern.children.each {|child| return if (child.result.childmap.size > 0)}
+      puts
+      puts "!!!!!! WARNING: The extractor did not find any result instances"
+      puts "Most probably this is wrong. Check your extractor and if you are"
+      puts "sure it should work, report a bug!"
+      puts
+    end
+private
+    def self.ensure_presence_of_pattern(pattern)
+      #holds the name of those child patterns which have to be present as children of the input parameter
+      epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
+      return if epop_names.empty?
+      #all_parent_values holds instances extracted by pattern
+      all_parent_values = []
+      pattern.result.childmap.each { |h| all_parent_values << h.values }
+      all_parent_values.flatten!
+      #indices of result instances (of pattern) we are going to remove
+      results_to_remove = Set.new
+      pattern.children.each do |child_pattern|
+        #all_child_values holds instances extracted by child_pattern
+        all_child_values = []
+        child_pattern.result.childmap.each { |h| all_child_values << h.values }
+        all_child_values.flatten!
+        #populate results_to_remove
+        i = 0
+        all_parent_values.each do |parent_value|
+          #Hey! Not just the direct children but all the ancestors
+          @found_ancestor = false
+          check_ancestors(parent_value, all_child_values)
+          results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
+          i += 1
+        end
+      end
+      #based on results_to_remove, populate the array 'rejected' which holds the actual instances
+      #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
+      #results_to_remove indices to their actual instances
+      rejected = []
+      i = -1
+      pattern.result.childmap.each do |h|
+        h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
+      end
+      #Finally, do the actual delete!
+      pattern.result.childmap.each { |h| h.each { |k,v| rejected.each  { |r| v.delete(r)} } }
+    end
+    def self.check_ancestors(parent_value, all_child_values)
+      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
+      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
+    end
+    def self.remove_multiple_filter_duplicates_intern(pattern)
+      possible_duplicates = {}
+      longest_result = 0
+      pattern.result.childmap.each { |r|
+        r.each do |k,v|
+          v.each do |x|
+            all_child_results = []
+            pattern.children.each { |child|
+              temp_res = child.result.lookup(x)
+              all_child_results << temp_res if temp_res != nil
+            }
+            next if all_child_results.size <= 1
+            longest_result = all_child_results.map {|e| e.size}.max
+            all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
+            possible_duplicates[x] = all_child_results.transpose
+          end
+        end
+      }
+      #Determine the 'real' duplicates
+      real_duplicates = {}
+      possible_duplicates.each { |k,v|
+        next if v.size == 1
+        v.each { |r| real_duplicates[k] = r }
+      }
+      #Finally, remove them!
+      pattern.children.each { |child|
+        child.result.childmap.each { |r|
+          r.each { |k,v|
+           real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
+          }
+        }
+      }
+    end #end of function
+  end #end of class PostProcessor
+end #end of module Scrubyt

data/lib/scrubyt/{result.rb → output/result.rb} RENAMED

File without changes

data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} RENAMED

@@ -81,13 +81,6 @@ private
         end
     end
-    def self.print_old_sta(pattern, depth)
-      puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.") if pattern.name != 'root'
-      pattern.children.each do |child|
-        print_statistics_recursive(child, depth + 4)
-      end
-    end
     def self.print_statistics_recursive(pattern, depth)
       if pattern.name != 'root'
         count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size

data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} RENAMED

@@ -24,6 +24,8 @@ module Scrubyt
     def self.find_node_from_text(doc, text, next_link)
       @node = nil
       @found = false
+      #digg next page hack
+      text.gsub!('»', '&#187;')
       self.traverse_for_full_text(doc,text)
       self.lowest_possible_node_with_text(@node, text) if @node != nil
       if (@found == false)
@@ -138,7 +140,7 @@ module Scrubyt
     #most typically the user will need the 0th - but if this is not the
     #case, there is the possibility to override this
     def self.find_image(doc, example, index=0)
-      (doc/"img[@src='#{example}']")[index]
+      (doc/"//img[@src='#{example}']")[index]
     end
     ##
@@ -208,7 +210,8 @@ private
     def self.traverse_for_full_text(node, text)
       return if @found
       if (node.instance_of? Hpricot::Elem)
-        ft = unescape_entities(full_text(node)).strip
+        ft = unescape_entities(full_text(node)).strip
+        #puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
         if (ft == text)
           @found = true
           @node = node

data/test/unittests/pattern_test.rb ADDED

@@ -0,0 +1,27 @@
+require 'rubygems'
+require 'scrubyt'
+require 'test/unit'
+class PatternTest < Test::Unit::TestCase
+  def test_select_indices
+    some_pattern =  Scrubyt::Pattern.new('some_pattern')
+    some_pattern.select_indices(1..3)
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
+    some_pattern.select_indices([1])
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [1])
+    some_pattern.select_indices([1,2,3])
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
+    some_pattern.select_indices(:first)
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [:first])
+    some_pattern.select_indices([:first, :last])
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,:last])
+    some_pattern.select_indices([:first, [5,6]])
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,5,6])
+    some_pattern.select_indices([:first, 1..2])
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,1,2])
+    some_pattern.select_indices([4..5, :first, [5,6]])
+    assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,4,5,6])
+  end
+end

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: scrubyt
 version: !ruby/object:Gem::Version
-  version: 0.2.0
-date: 2007-02-04 00:00:00 +01:00
+  version: 0.2.3
+date: 2007-02-20 00:00:00 +01:00
 summary: A powerful Web-scraping framework
 require_paths:
 - lib
@@ -34,24 +34,30 @@ files:
 - CHANGELOG
 - Rakefile
 - lib/scrubyt.rb
-- lib/scrubyt/constraint.rb
-- lib/scrubyt/pattern.rb
-- lib/scrubyt/result.rb
-- lib/scrubyt/export.rb
-- lib/scrubyt/constraint_adder.rb
-- lib/scrubyt/post_processor.rb
-- lib/scrubyt/filter.rb
-- lib/scrubyt/xpathutils.rb
-- lib/scrubyt/result_dumper.rb
-- lib/scrubyt/extractor.rb
+- lib/scrubyt/utils/xpathutils.rb
+- lib/scrubyt/output/result_dumper.rb
+- lib/scrubyt/output/export.rb
+- lib/scrubyt/output/post_processor.rb
+- lib/scrubyt/output/result.rb
+- lib/scrubyt/core/navigation/fetch_action.rb
+- lib/scrubyt/core/navigation/navigation_actions.rb
+- lib/scrubyt/core/scraping/result_indexer.rb
+- lib/scrubyt/core/scraping/constraint_adder.rb
+- lib/scrubyt/core/scraping/constraint.rb
+- lib/scrubyt/core/scraping/filter.rb
+- lib/scrubyt/core/scraping/pattern.rb
+- lib/scrubyt/core/scraping/pre_filter_document.rb
+- lib/scrubyt/core/shared/evaluation_context.rb
+- lib/scrubyt/core/shared/extractor.rb
 test_files:
 - test/unittests/input
-- test/unittests/constraint_test.rb
 - test/unittests/filter_test.rb
-- test/unittests/xpathutils_test.rb
+- test/unittests/pattern_test.rb
 - test/unittests/extractor_test.rb
-- test/unittests/input/test.html
+- test/unittests/xpathutils_test.rb
+- test/unittests/constraint_test.rb
 - test/unittests/input/constraint_test.html
+- test/unittests/input/test.html
 rdoc_options: []
 extra_rdoc_files: []
@@ -62,5 +68,22 @@ extensions: []
 requirements: []
-dependencies: []
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.5"
+    version:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.6.3
+    version:

data/lib/scrubyt/extractor.rb DELETED

@@ -1,279 +0,0 @@
-require 'logger'
-require 'open-uri'
-require 'rubygems'
-require 'mechanize'
-require 'hpricot'
-require 'pp'
-require 'set'
-module Scrubyt
-##
-#=<tt>Driving the whole extraction process</tt>
-#Extractor is a performer class - it gets an extractor definition and carries
-#out the actions and evaluates the wrappers sequentially.
-#
-#It also defines the actions as class methods - check out the section
-#commented with ############# Actions.
-  class Extractor
-    #The definition of the extractor is passed through this method
-    def self.define(&extractor_definition)
-      @@current_doc_url = nil
-      @@current_form = nil
-      @@current_doc_protocol = nil
-      @@base_dir = nil
-      @@host_name = nil
-      @@agent = WWW::Mechanize.new
-      #Hack up an artificial root pattern (i.e. do not return the pattern which
-      #is the root one in the user's definition, but rather the real (invisible)
-      #root pattern
-      root_pattern = (class_eval(&extractor_definition)).parent
-      #A little hack here: upon wrapper construction we are counting the number
-      #of blocks, so we know the count of the 'end's/'}'s which end the extractor
-      #definition
-      #Recursively match data based on examples
-      root_pattern.setup_examples
-      #Once all is set up, evaluate the wrapper from the root pattern!
-      if root_pattern.next_page
-        current_page_count = 1
-        loop do
-          evaluate_wrapper(root_pattern)
-          break if (root_pattern.limit == current_page_count || root_pattern.crawl_to_new_page == nil)
-          current_page_count += 1 if root_pattern.limit != nil
-        end
-      else
-        evaluate_wrapper(root_pattern)
-      end
-      ensure_all_postconditions(root_pattern)
-      PostProcessor.remove_multiple_filter_duplicates(root_pattern)
-      PostProcessor.report_if_no_results(root_pattern)
-      #Return the root pattern
-      root_pattern
-    end
-  #build the current wrapper
-  def self.method_missing(method_name, *args, &block)
-    pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
-    if @parent == nil
-      if method_name.to_s == 'next_page'
-        @@root_pattern.next_page = args[0]
-        @@root_pattern.limit = args[1][:limit] if args.size > 1
-        return @@last_pattern
-      else
-        #Create a root pattern
-        root_pattern = Scrubyt::Pattern.new('root', :type => :root)
-        @@root_pattern = root_pattern
-        @@root_pattern.root_pattern = root_pattern
-        @@root_pattern.root_pattern.extractor = self
-        #add the currently active document to the root pattern
-        @@root_pattern.attach_current_document
-        @@root_pattern.add_child_pattern(pattern)
-        @@root_pattern.block_count = 0
-        @@root_pattern.extractor = self
-      end
-    else
-      @parent.add_child_pattern(pattern) if @parent != nil
-    end
-    if block_given?
-      @@root_pattern.block_count = @@root_pattern.block_count + 1
-      @stack ||=[]
-      @parent = pattern
-      @stack.push @parent
-      class_eval(&block)
-      @stack.pop
-      @parent = @stack.last
-    end
-    @@last_pattern = pattern
-  end
-  #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
-  #(You should not be :)
-  def self.get_block_count
-    @@root_pattern.block_count
-  end
-############# Actions
-#
-  ##
-  # At any given point, the current document can be queried with this method; Typically used
-  # when the navigation is over and the result document is passed to the wrapper
-  def self.get_current_doc_url
-    @@current_doc_url
-  end
-  def self.get_hpricot_doc
-    @@hpricot_doc
-  end
-  ##
-  #Action to fetch a document (either a file or a http address)
-  #
-  #*parameters*
-  #
-  #_doc_url_ - the url or file name to fetch
-  def self.fetch(doc_url, mechanize_doc=nil)
-    if (mechanize_doc == nil)
-      @@current_doc_url = doc_url
-      @@current_doc_protocol = ((doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file)
-      if @@base_dir == nil
-        @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
-      else
-        @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
-      end
-      if @@host_name != nil
-        if doc_url !~ /#{@@host_name}/
-          @@current_doc_url = (@@host_name + doc_url)
-          #remove duplicate parts, like /blogs/en/blogs/en
-          @@current_doc_url = @@current_doc_url.split('/').uniq.reject{|x| x == ""}.join('/')
-          @@current_doc_url.sub!('http:/', 'http://')
-        end
-      end
-      puts "[ACTION] fetching document: #{@@current_doc_url}"
-      if @@current_doc_protocol == :http
-        @@mechanize_doc = @@agent.get(@@current_doc_url)
-        @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
-        @@host_name = doc_url if @@host_name == nil
-      end
-    else
-      @@current_doc_url = doc_url
-      @@mechanize_doc = mechanize_doc
-      @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
-      @@host_name = doc_url if @@host_name == nil
-    end
-    @@hpricot_doc = Hpricot(open(@@current_doc_url))
-  end
-  ##
-  #Action to fill a textfield with a query string
-  #
-  ##*parameters*
-  #
-  #_textfield_name_ - the name of the textfield (e.g. the name of the google search
-  #textfield is 'q'
-  #
-  #_query_string_ - the string that should be entered into the textfield
-  def self.fill_textfield(textfield_name, query_string)
-    puts "[ACTION] typing #{query_string} into the textfield named '#{textfield_name}'"
-    textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").map()[0]
-    form_tag = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form')
-    #Refactor this code, it's a total mess
-    formname = form_tag.attributes['name']
-    if formname == nil
-      id_string = form_tag.attributes['id']
-      if id_string == nil
-        action_string = form_tag.attributes['action']
-        if action_string == nil
-          #If even this fails, do it with a button
-        else
-          puts "Finding from action"
-          puts action_string
-          find_form_with_attribute('action', action_string)
-        end
-      else
-        puts "Finding from id"
-        find_form_with_attribute('id', id_string)
-      end
-    else
-      puts "Finding from name"
-      @@current_form = @@mechanize_doc.forms.with.name(formname).first
-    end
-    eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
-  end
-  def self.find_form_with_attribute(attr, expected_value)
-    puts "attr: #{attr}"
-    i = 0
-    loop do
-      @@current_form = @@mechanize_doc.forms[i]
-      print "current a: "
-      puts @@current_form.form_node.attributes[attr]
-      return nil if @@current_form == nil
-      break if @@current_form.form_node.attributes[attr] == expected_value
-      i+= 1
-    end
-  end
-  #Submit the last form;
-  def self.submit
-    puts '[ACTION] submitting form...'
-    result_page = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
-    @@current_doc_url = result_page.uri.to_s
-    puts "[ACTION] fetched #{@@current_doc_url}"
-    fetch(@@current_doc_url, result_page)
-  end
-  def self.click_link(link_text)
-    puts "[ACTION] clicking link: #{link_text}"
-    link = @@mechanize_doc.links.text(/^#{Regexp.escape(link_text)}$/)
-    result_page = @@agent.click(link)
-    @@current_doc_url = result_page.uri.to_s
-    fetch(@@current_doc_url, result_page)
-  end
-#
-#############
-private
-  def self.ensure_all_postconditions(pattern)
-    ensure_postconditions(pattern)
-    pattern.children.each {|child| ensure_all_postconditions(child)}
-  end
-  def self.ensure_postconditions(pattern)
-    #holds the name of those child patterns which have to be present as children of the input parameter
-    epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
-    return if epop_names.empty?
-    #all_parent_values holds instances extracted by pattern
-    all_parent_values = []
-    pattern.result.childmap.each { |h| all_parent_values << h.values }
-    all_parent_values.flatten!
-    #indices of result instances (of pattern) we are going to remove
-    results_to_remove = Set.new
-    pattern.children.each do |child_pattern|
-      #all_child_values holds instances extracted by child_pattern
-      all_child_values = []
-      child_pattern.result.childmap.each { |h| all_child_values << h.values }
-      all_child_values.flatten!
-      #populate results_to_remove
-      i = 0
-      all_parent_values.each do |parent_value|
-        #Hey! Not just the direct children but all the ancestors
-        @found_ancestor = false
-        check_ancestors(parent_value, all_child_values)
-        results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
-        i += 1
-      end
-    end
-    #based on results_to_remove, populate the array 'rejected' which holds the actual instances
-    #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
-    #results_to_remove indices to their actual instances
-    rejected = []
-    i = -1
-    pattern.result.childmap.each do |h|
-      h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
-    end
-    #Correct the statistics
-    pattern.get_instance_count[pattern.name] -= rejected.size
-    #Finally, do the actual delete!
-    pattern.result.childmap.each { |h| h.each { |k,v| rejected.each  { |r| v.delete(r)} } }
-  end
-  def self.check_ancestors(parent_value, all_child_values)
-    parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
-    parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
-  end
-    def self.evaluate_wrapper(pattern)
-      pattern.evaluate
-      pattern.children.each { |child| evaluate_wrapper child }
-    end #end of method evaluate_wrapper
-  end #end of class Extractor
-end #end of module Scrubyt