RubyGems - scrubber-scrubyt - Versions diffs - 0.4.11 - Mend

scrubber-scrubyt 0.4.11

Files changed (45) hide show

data/CHANGELOG +343 -0
data/COPYING +340 -0
data/README +99 -0
data/Rakefile +101 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +167 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +140 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/lib/scrubyt.rb +43 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +115 -0

data/lib/scrubyt/core/navigation/agents/mechanize.rb ADDED Viewed

@@ -0,0 +1,253 @@
+require 'rubygems'
+require 'mechanize'
+module Scrubyt
+  ##
+  #=<tt>Fetching pages (and related functionality)</tt>
+  #
+  #Since a lot of things happen during (and before)
+  #the fetching of a document, the fetching-related functionality
+  #was moved out to a separate module - so if you are looking for anything
+  #that loads a document (even by submitting a form or clicking a link),
+  #or for related things like setting a proxy, you should find it here.
+  module Navigation
+    module Mechanize
+      def self.included(base)
+        base.module_eval do
+          @@agent = WWW::Mechanize.new
+          @@current_doc_url = nil
+          @@current_doc_protocol = nil
+          @@base_dir = nil
+          @@host_name = nil
+          @@history = []
+          ##
+          #Action to fetch a document (either a file or a http address)
+          #
+          #*parameters*
+          #
+          #_doc_url_ - the url or file name to fetch
+          def self.fetch(doc_url, *args)
+            #Refactor this crap!!! with option_accessor stuff
+            if args.size > 0
+              mechanize_doc = args[0][:mechanize_doc]
+              html = args[0][:html]
+              resolve = args[0][:resolve]
+              basic_auth = args[0][:basic_auth]
+              parse_and_set_basic_auth(basic_auth) if basic_auth
+              if html
+                @@current_doc_protocol = 'string'
+                mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
+              end
+            else
+              mechanize_doc = nil
+              resolve = :full
+            end
+            @@current_doc_url = doc_url
+            @@current_doc_protocol = determine_protocol
+            if mechanize_doc.nil? && @@current_doc_protocol != 'file'
+              handle_relative_path(doc_url)
+              handle_relative_url(doc_url, resolve)
+              Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
+              unless 'file' == @@current_doc_protocol
+                @@mechanize_doc = @@agent.get(@@current_doc_url)
+              end
+            else
+              @@mechanize_doc = mechanize_doc
+            end
+            if @@current_doc_protocol == 'file'
+              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
+            else
+              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
+              store_host_name(self.get_current_doc_url) #if self.get_current_doc_url   # in case we're on a new host
+            end
+          end
+          ##
+          #Submit the current form
+          def self.submit(index=nil, sleep_time=nil, type=nil)
+            Scrubyt.log :ACTION, 'Submitting form...'
+            if index == nil
+              result_page = @@agent.submit(@@current_form)
+              process_submit(@@current_form)
+              #----- added by nickmerwin@gmail.com -----
+            elsif index.class == String && !type.nil?
+              button = @@current_form.buttons.detect{|b| b.name == index}
+              result_page = @@current_form.submit(button)
+              process_submit(@@current_form, button,type)
+              #-----------------------------------------
+            else
+              result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
+            end
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          ##
+          #Click the link specified by the text
+          def self.click_link(link_spec,index = 0,wait_secs=0)
+            Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
+            if link_spec.is_a? Hash
+              clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
+            else
+              clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
+            end
+            clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
+            result_page = @@agent.click(clicked_elem)
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          def self.click_image_map(index = 0)
+            Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
+            uri = @@mechanize_doc.search("//area")[index]['href']
+            result_page = @@agent.get(uri)
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          def self.store_host_name(doc_url)
+            @@host_name = 'http://' + @@mechanize_doc.uri.to_s.match(%r{http://(.+?)/+})[0] if @@current_doc_protocol == 'http'
+            @@host_name = 'https://' + @@mechanize_doc.uri.to_s.match(%r{https://(.+?)/+})[0] if @@current_doc_protocol == 'https'
+            @@host_name = doc_url if @@host_name == nil
+            @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
+            @@original_host_name ||= @@host_name
+          end #end of method store_host_name
+          def self.determine_protocol
+            old_protocol = @@current_doc_protocol
+            new_protocol = case @@current_doc_url
+              when /^https/
+                'https'
+              when /^http/
+                'http'
+              when /^www/
+                'http'
+              else
+                'file'
+              end
+            return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
+            return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
+            new_protocol
+          end
+          def self.handle_relative_path(doc_url)
+            if @@base_dir == nil
+              @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
+            else
+              @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
+            end
+          end
+          # Turn a relative doc_url into an absolute @@current_doc_url.
+          #
+          # doc_url - the url given to fetch; nothing is done when it starts with 'http'
+          # resolve - :full prefixes @@host_name, :host prefixes only the scheme-and-host
+          #           part of @@host_name, any other value is itself used as the prefix
+          def self.handle_relative_url(doc_url, resolve)
+            return if doc_url =~ /^http/
+            #No leading slash: a query-only url ('?...') is appended to the current page
+            #uri below, anything else just gets a '/' prepended
+            if doc_url !~ /^\//
+              first_char = doc_url[0..0]
+              doc_url = ( first_char == '?'  ? '' : '/'  ) + doc_url
+              if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
+                current_uri = @@mechanize_doc.uri.to_s
+                #For a uri containing /popup/, use the first page of the agent history instead
+                current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
+                #Cut back to the last '/' when the uri contains a '?';
+                #otherwise make sure it ends with '/'
+                if (current_uri.include? '?')
+                  current_uri = current_uri.scan(/.+\//)[0]
+                else
+                  current_uri += '/' unless current_uri[-1..-1] == '/'
+                end
+                @@current_doc_url = current_uri + doc_url
+                return
+              end
+            end
+            case resolve
+              when :full
+                #Prefix the host unless doc_url already matches it (host is used as a regexp here)
+                @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
+                #NOTE(review): uniq drops every repeated '/'-separated segment of the url,
+                #including legitimate repeats in the path - confirm this is intended
+                @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
+              when :host
+                #A host name with exactly two '/' is used as is; otherwise it is cut
+                #before the first '/' that follows the '//'
+                base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
+                @@current_doc_url = base_host_name + doc_url
+              else
+                #custom resolving: the value of resolve is the prefix
+                @@current_doc_url = resolve + doc_url
+            end
+          end
+          def self.fill_textfield(textfield_name, query_string, *unused)
+            lookup_form_for_tag('input','textfield',textfield_name,query_string)
+            eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
+          end
+          ##
+          #Action to fill a textarea with text
+          def self.fill_textarea(textarea_name, text)
+            lookup_form_for_tag('textarea','textarea',textarea_name,text)
+            eval("@@current_form['#{textarea_name}'] = '#{text}'")
+          end
+          ##
+          #Action for selecting an option from a dropdown box
+          def self.select_option(selectlist_name, option)
+            lookup_form_for_tag('select','select list',selectlist_name,option)
+            select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
+            searched_option = select_list.options.find{|f| f.text.strip == option}
+            searched_option.click
+          end
+          def self.check_checkbox(checkbox_name)
+            lookup_form_for_tag('input','checkbox',checkbox_name, '')
+            @@current_form.checkboxes.name(checkbox_name).check
+          end
+          def self.check_radiobutton(checkbox_name, index=0)
+            lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
+            @@current_form.radiobuttons.name(checkbox_name)[index].check
+          end
+          #private
+          def self.process_submit(current_form, button=nil, type=nil)
+            if button == nil
+              result_page = @@agent.submit(current_form)
+            elsif type
+              result_page = current_form.submit(button)
+            else
+              result_page = @@agent.submit(current_form, button)
+            end
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
+            Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
+            widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
+            form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
+            find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
+          end
+          def self.find_form_based_on_tag(tag, possible_attrs)
+            lookup_attribute_name = nil
+            lookup_attribute_value = nil
+            possible_attrs.each { |a|
+              lookup_attribute_name = a
+              lookup_attribute_value = tag.attributes[a]
+              break if lookup_attribute_value != nil
+            }
+            i = 0
+            loop do
+              @@current_form = FetchAction.get_mechanize_doc.forms[i]
+              return nil if @@current_form == nil
+              break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
+              i+= 1
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/scrubyt/core/navigation/fetch_action.rb ADDED Viewed

@@ -0,0 +1,54 @@
+module Scrubyt
+  ##
+  #=<tt>Fetching pages (and related functionality)</tt>
+  #
+  #This module holds the navigation state (current url, protocol, base
+  #directory, host name, page history, current form) and a few readers and
+  #save/restore helpers for it. The code which actually loads a document,
+  #submits a form or clicks a link is not defined in this file.
+  module FetchAction
+    #URL of the current document
+    @@current_doc_url = nil
+    #Protocol of the current document; restore_host_name does nothing for 'file'
+    @@current_doc_protocol = nil
+    @@base_dir = nil
+    #Host name returned by get_host_name
+    @@host_name = nil
+    #Stack of documents saved by store_page and popped by restore_page
+    @@history = []
+    @@current_form = nil
+    ##
+    # At any given point, the current document can be queried with this method; Typically used
+    # when the navigation is over and the result document is passed to the wrapper
+    def self.get_current_doc_url
+      @@current_doc_url
+    end
+    #NOTE(review): @@mechanize_doc is not initialised in this module - presumably it
+    #is assigned by the navigation agent's fetch before this is called; confirm
+    def self.get_mechanize_doc
+      @@mechanize_doc
+    end
+    #NOTE(review): @@hpricot_doc is not initialised in this module either - confirm
+    #that a fetch always precedes this call
+    def self.get_hpricot_doc
+      @@hpricot_doc
+    end
+    #Instance-level reader for the stored host name
+    def get_host_name
+      @@host_name
+    end
+    #Reset the host name to @@original_host_name, unless the current protocol is 'file'
+    #NOTE(review): @@original_host_name is not assigned anywhere in this module - confirm
+    #it is set before this is called
+    def restore_host_name
+      return if @@current_doc_protocol == 'file'
+      @@host_name = @@original_host_name
+    end
+    #Push the current Hpricot document onto the history stack
+    def store_page
+      @@history.push @@hpricot_doc
+    end
+    #Make the most recently stored document the current one again (nil if the stack is empty)
+    def restore_page
+      @@hpricot_doc = @@history.pop
+    end
+    #Instance-level forwarder to the module-level FetchAction.store_host_name,
+    #which is not defined in this file
+    def store_host_name(doc_url)
+      FetchAction.store_host_name(doc_url)
+    end
+  end
+end

data/lib/scrubyt/core/navigation/navigation_actions.rb ADDED Viewed

@@ -0,0 +1,95 @@
+module Scrubyt
+  ##
+  #=<tt>Describing actions which interact with the page</tt>
+  #
+  #This class contains all the actions that are used to navigate on web pages;
+  #first of all, *fetch* for downloading the pages - then various actions
+  #like filling textfields, submitting formst, clicking links and more
+  module NavigationActions
+    def self.extend_object(obj)
+      super(obj)
+      obj.instance_eval do
+        @current_form = nil
+      end
+    end
+    ##
+    #Action to fill a textfield with a query string
+    #
+    ##*parameters*
+    #
+    #_textfield_name_ - the name of the textfield (e.g. the name of the google search
+    #textfield is 'q'
+    #
+    #_query_string_ - the string that should be entered into the textfield
+    def fill_textfield(textfield_name, query_string, use_value = nil)
+      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
+    end
+    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
+      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
+    end
+    ##
+    #Action to fill a textarea with text
+    def fill_textarea(textarea_name, text)
+      FetchAction.fill_textarea(textarea_name, text)
+    end
+    ##
+    #Action for selecting an option from a dropdown box
+    def select_option(selectlist_name, option)
+      FetchAction.select_option(selectlist_name, option)
+    end
+    def check_checkbox(checkbox_name)
+      FetchAction.check_checkbox(checkbox_name)
+    end
+    def check_radiobutton(checkbox_name, index=0)
+      FetchAction.check_radiobutton(checkbox_name, index=0)
+    end
+    ##
+    #Fetch the document
+    def fetch(*args)
+      FetchAction.fetch(*args)
+    end
+    ##
+    #Submit the current form
+    def submit(index=nil, type=nil)
+      FetchAction.submit(nil, index, type)
+    end
+    def submit_and_wait(sleep_time, index=nil, type=nil)
+      FetchAction.submit(index, sleep_time,  type)
+    end
+    ##
+    #Click the link specified by the text
+    def click_link(link_spec,index=0)
+      FetchAction.click_link(link_spec,index, 0)
+    end
+    def click_link_and_wait(link_spec, sleep_secs=0)
+      FetchAction.click_link(link_spec, 0, sleep_secs)
+    end
+    def click_by_xpath(xpath)
+      FetchAction.click_by_xpath(xpath)
+    end
+    def click_image_map(index=0)
+      FetchAction.click_image_map(index)
+    end
+    def frame(attribute,value)
+      FetchAction.frame(attribute,value)
+    end
+    def wait(time=1)
+      FetchAction.wait(time)
+    end
+  end
+end

data/lib/scrubyt/core/scraping/compound_example.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Scrubyt
+  ##
+  #=<tt>Represents a compund example</tt>
+  #
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example. The simple example
+  #is specified by a string, and a compound example is specified with
+  #:contains, :begins_with and :ends_with descriptors - which can be
+  #both regexps or strings
+  class CompoundExample
+    DESCRIPTORS = [:contains, :begins_with, :ends_with]
+    attr_accessor :descriptor_hash
+    def initialize(descriptor_hash)
+      @descriptor_hash = descriptor_hash
+    end
+    ##
+    #Is the hash passed to this function a compound example descriptor hash?
+    #Need to decide this when parsing pattern parameters
+    def self.compound_example?(hash)
+      hash.each do |k,v|
+        return false if !DESCRIPTORS.include? k
+      end
+      true
+    end# end of method
+  end# #end of class CompoundExample
+end# end of module Scrubyt

data/lib/scrubyt/core/scraping/constraint.rb ADDED Viewed

@@ -0,0 +1,169 @@
+module Scrubyt
+  ##
+  #=<tt>Rejecting result instances based on further rules</tt>
+  #
+  #The two  most trivial problems with a set of rules is that they match either less
+  #or more instances than we would like them to. Constraints are a way to remedy the second problem:
+  #they serve as a tool to filter out some result instances based on rules. A typical
+  #example:
+  #
+  #* *ensure_presence_of_ancestor_pattern* consider this model:
+  #    <book>
+  #      <author>...</author>
+  #      <title>...</title>
+  #    </book>
+  #
+  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
+  #'author' and 'title', only those books will be matched which have an author and a
+  #title (i.e.the child patterns author and title must extract something). This is a way
+  #to say 'a book MUST have an author and a title'.
+  class Constraint
+    #There are more possible ways of applying/checking constraints in the case of
+    #ones that can not be checked in the context node (e.g. ensure_presence_of -
+    #since it may require the evaluation of child patterns of the context pattern to
+    #arbitray level)
+    #
+    #In such cases, the possibilities are:
+    #
+    #1) make a depth-first evaluation from the context pattern until the needed ancestor
+    #   pattern is evaluated. This can mess things up, since if any ancestor node uses
+    #   the sinks of predecessor(s) other than the context node, those need to be evaluated
+    #   too, and we may run into a cyclyc dependency or at least a complicated recursion
+    #
+    #2) Post processing - evaluate normally and throw out results which do not pass the
+    #   constraint
+    #
+    #2b) Do it on the XML level - most probably this solution will be implemented
+    # Different constraint types
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
+    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
+    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
+    attr_reader :type, :target
+    #Add 'ensure presence of ancestor pattern' constraint
+    #If this type of constraint is added to a pattern, it must have an ancestor pattern
+    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
+    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
+    #(just by looking at the wrapper model, the ancestor pattern is always present)
+    #Note that from this type of constraint there is no 'ensure_absence' version, since
+    #I could not think about an use case for that
+    def self.add_ensure_presence_of_pattern(ancestor)
+      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
+    end
+    #Add 'ensure absence of attribute' constraint
+    #If this type of constraint is added to a pattern, the HTML node it targets
+    #must NOT have an attribute named "attribute_name" with the value "attribute_value"
+    def self.add_ensure_absence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
+    end
+    #Add 'ensure presence of attribute' constraint
+    #If this type of constraint is added to a pattern, the HTML node it targets
+    #must have an attribute named "attribute_name" with the value "attribute_value"
+    def self.add_ensure_presence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
+    end
+    #Add 'ensure absence of ancestor node' constraint
+    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
+    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
+    #
+    #"attributes" is an array of hashes, for example
+    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
+    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and '
+    #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
+    #
+    #"attributes" can be empty - in this case just the 'node_name' is checked
+    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
+    end
+    #Add 'ensure presence of ancestor node' constraint
+    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
+    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
+    #
+    #"attributes" is an array of hashes, for example
+    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
+    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and '
+    #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
+    #
+    #"attributes" can be empty - in this case just the 'node_name' is checked
+    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
+    end
+    #Evaluate the constraint; if this function returns true,
+    #it means that the constraint passed, i.e. its filter will be added to the exctracted
+    #content of the pattern
+    def check(result)
+      case @type
+        #checked after evaluation, so here always return true
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
+          return true
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
+          attribute_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
+          !attribute_present(result)
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
+          ancestor_node_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
+          !ancestor_node_present(result)
+      end
+    end
+  private
+    #We would not like these to be called from outside
+    def initialize(target, type)
+      @target = target
+      @type = type
+    end
+    #Implementation of the ancestor node presence test
+    #Check the documentation of the add_ensure_presence_of_ancestor_node method
+    #for further information on the result parameter
+    def ancestor_node_present(result)
+      found = false
+      node_name = @target[0]
+      node_attributes = @target[1]
+      node_attributes.each do |pair|
+        return true if !result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']").empty?
+      end
+      if node_attributes.empty?
+        return true if !result.search("//#{node_name}").empty?
+      end
+      false
+    end
+    def attribute_present(result)
+      return unless result.is_a? Hpricot::Elem
+      match = true
+      #If v = nil, the value of the attribute can be arbitrary;
+      #Therefore, in this case we just have to make sure that the attribute is
+      #present (i.e. != nil), we don't care about the value
+      @target.each do |k,v|
+        if v == nil
+            match &&= (result.attributes[k.to_s] != nil)
+          else
+            match &&= (result.attributes[k.to_s] == v.to_s)
+        end
+      end
+      match
+    end
+  end #end of class
+end #end of module

data/lib/scrubyt/core/scraping/constraint_adder.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module Scrubyt
+  ##
+  #=<tt>Utility class for adding constraints</tt>
+  #
+  #Originally methods of Pattern - but since Pattern was already too heavy (and after
+  #all, adding a constraint (logically) does not belong to Pattern anyway) it was moved
+  #to this utility class. In pattern everything that begins with ensure_
+  #is automatically dispatched here.
+  #
+  #I will not document the functions since these are just forwarders; See the 'real'
+  #functions with their documentation in Scrubyt::Constraint.rb
+  class ConstraintAdder
+    def self.ensure_presence_of_pattern(ancestor_node_name)
+      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
+    end
+    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
+                                                          prepare_attributes(attributes))
+    end
+    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
+                                                         prepare_attributes(attributes))
+    end
+    def self.ensure_presence_of_attribute(attribute_hash)
+      Constraint.add_ensure_presence_of_attribute(attribute_hash)
+    end
+    def self.ensure_absence_of_attribute(attribute_hash)
+      Constraint.add_ensure_absence_of_attribute(attribute_hash)
+    end
+    private
+    def self.prepare_attributes(attributes)
+      attribute_pairs = []
+      attributes.each do |key, value|
+        if (value.instance_of? Array)
+          value.each {|val| attribute_pairs << [key,val]}
+        else
+          attribute_pairs << [key, value]
+        end
+      end
+      return attribute_pairs
+    end #end of method prepare_attributes
+  end #end of class ConstraintAddere
+end #end of module Scrubyt

data/lib/scrubyt/core/scraping/filters/attribute_filter.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Scrubyt
+  class AttributeFilter < BaseFilter
+    def evaluate(source)
+      elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
+      if elem.is_a? Hpricot::Elem
+        return [elem.attributes[@example]]
+      else
+        return nil
+      end
+    end
+  end #End of class AttributeFilter
+end #End of module Scrubyt