RubyGems - jspradlin-scrubyt - Versions diffs - 0.4.16 - Mend

jspradlin-scrubyt 0.4.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data/CHANGELOG +343 -0
data/COPYING +340 -0
data/README +120 -0
data/Rakefile +101 -0
data/lib/scrubyt.rb +45 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +167 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +142 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +117 -0

data/lib/scrubyt/core/navigation/agents/mechanize.rb ADDED Viewed

@@ -0,0 +1,289 @@
+require 'rubygems'
+require 'mechanize'
+module Scrubyt
+  ##
+  #=<tt>Fetching pages (and related functionality)</tt>
+  #
+  #Since a lot of things are happening during (and before)
+  #the fetching of a document, I decided to move the fetching related
+  #functionality out to a separate module - so if you are looking for anything
+  #which is loading a document (even by submitting a form or clicking a link)
+  #or related things like setting a proxy etc., you should find it here.
+  module Navigation
+    module Mechanize
+      def self.included(base)
+        base.module_eval do
+          @@agent = WWW::Mechanize.new
+          @@current_doc_url = nil
+          @@current_doc_protocol = nil
+          @@base_dir = nil
+          @@host_name = nil
+          @@history = []
+          ##
+          #Action to fetch a document (either a file or a http address)
+          #
+          #*parameters*
+          #
+          #_doc_url_ - the url or file name to fetch
+          def self.fetch(doc_url, *args)
+            #Refactor this crap!!! with option_accessor stuff
+            if args.size > 0
+              mechanize_doc = args[0][:mechanize_doc]
+              html = args[0][:html]
+              resolve = args[0][:resolve]
+              basic_auth = args[0][:basic_auth]
+              parse_and_set_basic_auth(basic_auth) if basic_auth
+              proxy = args[0][:proxy]
+              parse_and_set_proxy(proxy) if proxy
+              if html
+                @@current_doc_protocol = 'string'
+                mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
+              end
+            else
+              mechanize_doc = nil
+              resolve = :full
+            end
+            @@current_doc_url = doc_url
+            @@current_doc_protocol = determine_protocol
+            if mechanize_doc.nil? && @@current_doc_protocol != 'file'
+              handle_relative_path(doc_url)
+              handle_relative_url(doc_url, resolve)
+              Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
+              unless 'file' == @@current_doc_protocol
+                @@mechanize_doc = @@agent.get(@@current_doc_url)
+              end
+            else
+              @@mechanize_doc = mechanize_doc
+            end
+            if @@current_doc_protocol == 'file'
+              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
+            else
+              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
+              store_host_name(self.get_current_doc_url) #if self.get_current_doc_url   # in case we're on a new host
+            end
+          end
+          ##
+          #Submit the last form;
+          def self.submit(index=nil, sleep_time=nil, type=nil)
+            Scrubyt.log :ACTION, 'Submitting form...'
+            if index == nil
+              result_page = @@agent.submit(@@current_form)
+              process_submit(@@current_form)
+              #----- added by nickmerwin@gmail.com -----
+            elsif index.class == String && !type.nil?
+              button = @@current_form.buttons.detect{|b| b.name == index}
+              result_page = @@current_form.submit(button)
+              process_submit(@@current_form, button,type)
+              #-----------------------------------------
+            else
+              result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
+            end
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          ##
+          #Click the link specified by the text
+          def self.click_link(link_spec,index = 0,wait_secs=0)
+            Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
+            if link_spec.is_a? Hash
+              clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
+            else
+              clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
+            end
+            clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
+            result_page = @@agent.click(clicked_elem)
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          def self.click_image_map(index = 0)
+            Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
+            uri = @@mechanize_doc.search("//area")[index]['href']
+            result_page = @@agent.get(uri)
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          def self.store_host_name(doc_url)
+            @@host_name = 'http://' + @@mechanize_doc.uri.to_s.match(%r{http://(.+?)/+})[0] if @@current_doc_protocol == 'http'
+            @@host_name = 'https://' + @@mechanize_doc.uri.to_s.match(%r{https://(.+?)/+})[0] if @@current_doc_protocol == 'https'
+            @@host_name = doc_url if @@host_name == nil
+            @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
+            @@original_host_name ||= @@host_name
+          end #end of method store_host_name
+          def self.parse_and_set_proxy(proxy)
+            @@proxy_user = @@proxy_pass = nil
+            if proxy.downcase.include?('localhost')
+              @@host = 'localhost'
+              @@port = proxy.split(':').last
+            else
+              parts = proxy.split(':')
+              if (parts.size > 2)
+                user_pass = parts[1].split('@')
+                  @@proxy_user = parts[0]
+                  @@proxy_pass = user_pass[0]
+                  @@host = user_pass[1]
+                  @@port = parts[2]
+              else
+                if (parts[0].include?('@'))
+                  user_host = parts[0].split('@')
+                    @@proxy_user = user_host[0]
+                    @@host = user_host[1]
+                    @@port = parts[1]
+                else
+                  @@host = parts[0]
+                  @@port = parts[1]
+                end
+              end
+              if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
+                puts "Invalid proxy specification..."
+                puts "neither host nor port can be nil!"
+                exit
+              end
+            end
+            Scrubyt.log :ACTION, "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>, username=<#{@@proxy_user}>, password=<#{@@proxy_pass}>"
+            @@agent.set_proxy(@@host, @@port, @@proxy_user, @@proxy_pass)
+          end
+          def self.determine_protocol
+            old_protocol = @@current_doc_protocol
+            new_protocol = case @@current_doc_url
+              when /^https/
+                'https'
+              when /^http/
+                'http'
+              when /^www/
+                'http'
+              else
+                'file'
+              end
+            return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
+            return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
+            new_protocol
+          end
+          def self.handle_relative_path(doc_url)
+            if @@base_dir == nil
+              @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
+            else
+              @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
+            end
+          end
+          ##
+          #Turn a relative _doc_url_ into the url stored in @@current_doc_url
+          #
+          #*parameters*
+          #
+          #_doc_url_ - the url as requested; urls starting with 'http' are left alone
+          #
+          #_resolve_ - :full prepends @@host_name, :host prepends only the
+          #scheme-and-host part of @@host_name, any other value is itself used as
+          #the prefix
+          def self.handle_relative_url(doc_url, resolve)
+            return if doc_url =~ /^http/
+            if doc_url !~ /^\//
+              first_char = doc_url[0..0]
+              #Make the url start with a slash, unless it is a bare query string
+              doc_url = ( first_char == '?'  ? '' : '/'  ) + doc_url
+              if first_char == '?' #This is an ugly hack... this should really be thrown out in favour of mechanize's own resolving
+                #A bare query string is appended to the uri of the page in @@mechanize_doc
+                current_uri = @@mechanize_doc.uri.to_s
+                #For uris containing /popup/ the first page in the agent's history is used instead
+                current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
+                if (current_uri.include? '?')
+                  #Keep everything up to and including the last slash
+                  current_uri = current_uri.scan(/.+\//)[0]
+                else
+                  current_uri += '/' unless current_uri[-1..-1] == '/'
+                end
+                @@current_doc_url = current_uri + doc_url
+                return
+              end
+            end
+            case resolve
+              when :full
+                @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
+                #Drops every repeated '/'-separated segment.
+                #NOTE(review): this also removes legitimately repeated path segments - confirm
+                @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
+              when :host
+                #With exactly two slashes @@host_name is used as it is; otherwise
+                #only the part in front of the first slash after the '//' is kept
+                base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
+                @@current_doc_url = base_host_name + doc_url
+              else
+                #custom resolving
+                @@current_doc_url = resolve + doc_url
+            end
+          end
+          def self.fill_textfield(textfield_name, query_string, *unused)
+            lookup_form_for_tag('input','textfield',textfield_name,query_string)
+            eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
+          end
+          ##
+          #Action to fill a textarea with text
+          def self.fill_textarea(textarea_name, text)
+            lookup_form_for_tag('textarea','textarea',textarea_name,text)
+            eval("@@current_form['#{textarea_name}'] = '#{text}'")
+          end
+          ##
+          #Action for selecting an option from a dropdown box
+          def self.select_option(selectlist_name, option)
+            lookup_form_for_tag('select','select list',selectlist_name,option)
+            select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
+            searched_option = select_list.options.find{|f| f.text.strip == option}
+            searched_option.click
+          end
+          def self.check_checkbox(checkbox_name)
+            lookup_form_for_tag('input','checkbox',checkbox_name, '')
+            @@current_form.checkboxes.name(checkbox_name).check
+          end
+          def self.check_radiobutton(checkbox_name, index=0)
+            lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
+            @@current_form.radiobuttons.name(checkbox_name)[index].check
+          end
+          #private
+          def self.process_submit(current_form, button=nil, type=nil)
+            if button == nil
+              result_page = @@agent.submit(current_form)
+            elsif type
+              result_page = current_form.submit(button)
+            else
+              result_page = @@agent.submit(current_form, button)
+            end
+            @@current_doc_url = result_page.uri.to_s
+            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+            fetch(@@current_doc_url, :mechanize_doc => result_page)
+          end
+          def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
+            Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
+            widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
+            form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
+            find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
+          end
+          def self.find_form_based_on_tag(tag, possible_attrs)
+            lookup_attribute_name = nil
+            lookup_attribute_value = nil
+            possible_attrs.each { |a|
+              lookup_attribute_name = a
+              lookup_attribute_value = tag.attributes[a]
+              break if lookup_attribute_value != nil
+            }
+            i = 0
+            loop do
+              @@current_form = FetchAction.get_mechanize_doc.forms[i]
+              return nil if @@current_form == nil
+              break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
+              i+= 1
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/scrubyt/core/navigation/fetch_action.rb ADDED Viewed

@@ -0,0 +1,54 @@
+module Scrubyt
+  ##
+  #=<tt>Fetching pages (and related functionality)</tt>
+  #
+  #Since a lot of things are happening during (and before)
+  #the fetching of a document, I decided to move the fetching related
+  #functionality out to a separate module - so if you are looking for anything
+  #which is loading a document (even by submitting a form or clicking a link)
+  #or related things like setting a proxy etc., you should find it here.
+  #
+  #This module itself only holds the shared state and its accessors.
+  module FetchAction
+    #Url (or file name) of the current document
+    @@current_doc_url = nil
+    #Protocol of the current document
+    @@current_doc_protocol = nil
+    @@base_dir = nil
+    #Host name of the current document
+    @@host_name = nil
+    #Stack of documents saved by store_page
+    @@history = []
+    #The form that form related actions operate on
+    @@current_form = nil
+    ##
+    # At any given point, the current document can be queried with this method; Typically used
+    # when the navigation is over and the result document is passed to the wrapper
+    def self.get_current_doc_url
+      @@current_doc_url
+    end
+    ##
+    #Returns @@mechanize_doc. It is not initialised in this module;
+    #presumably the included agent sets it when fetching - verify
+    def self.get_mechanize_doc
+      @@mechanize_doc
+    end
+    ##
+    #Returns @@hpricot_doc. It is not initialised in this module;
+    #presumably the included agent sets it when fetching - verify
+    def self.get_hpricot_doc
+      @@hpricot_doc
+    end
+    ##
+    #Returns the stored host name (instance method, unlike the getters above)
+    def get_host_name
+      @@host_name
+    end
+    ##
+    #Reset the host name to @@original_host_name; does nothing for the 'file'
+    #protocol. @@original_host_name is not initialised in this module.
+    def restore_host_name
+      return if @@current_doc_protocol == 'file'
+      @@host_name = @@original_host_name
+    end
+    ##
+    #Push the current hpricot document onto the history stack
+    def store_page
+      @@history.push @@hpricot_doc
+    end
+    ##
+    #Make the most recently stored document the current one again (nil if
+    #nothing was stored)
+    def restore_page
+      @@hpricot_doc = @@history.pop
+    end
+    ##
+    #Instance level shortcut for FetchAction.store_host_name, which is not
+    #defined in this module - presumably supplied by the included agent
+    def store_host_name(doc_url)
+      FetchAction.store_host_name(doc_url)
+    end
+  end
+end

data/lib/scrubyt/core/navigation/navigation_actions.rb ADDED Viewed

@@ -0,0 +1,95 @@
+module Scrubyt
+  ##
+  #=<tt>Describing actions which interact with the page</tt>
+  #
+  #This class contains all the actions that are used to navigate on web pages;
+  #first of all, *fetch* for downloading the pages - then various actions
+  #like filling textfields, submitting formst, clicking links and more
+  module NavigationActions
+    def self.extend_object(obj)
+      super(obj)
+      obj.instance_eval do
+        @current_form = nil
+      end
+    end
+    ##
+    #Action to fill a textfield with a query string
+    #
+    ##*parameters*
+    #
+    #_textfield_name_ - the name of the textfield (e.g. the name of the google search
+    #textfield is 'q'
+    #
+    #_query_string_ - the string that should be entered into the textfield
+    def fill_textfield(textfield_name, query_string, use_value = nil)
+      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
+    end
+    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
+      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
+    end
+    ##
+    #Action to fill a textarea with text
+    def fill_textarea(textarea_name, text)
+      FetchAction.fill_textarea(textarea_name, text)
+    end
+    ##
+    #Action for selecting an option from a dropdown box
+    def select_option(selectlist_name, option)
+      FetchAction.select_option(selectlist_name, option)
+    end
+    def check_checkbox(checkbox_name)
+      FetchAction.check_checkbox(checkbox_name)
+    end
+    def check_radiobutton(checkbox_name, index=0)
+      FetchAction.check_radiobutton(checkbox_name, index=0)
+    end
+    ##
+    #Fetch the document
+    def fetch(*args)
+      FetchAction.fetch(*args)
+    end
+    ##
+    #Submit the current form
+    def submit(index=nil, type=nil)
+      FetchAction.submit(nil, index, type)
+    end
+    def submit_and_wait(sleep_time, index=nil, type=nil)
+      FetchAction.submit(index, sleep_time,  type)
+    end
+    ##
+    #Click the link specified by the text
+    def click_link(link_spec,index=0)
+      FetchAction.click_link(link_spec,index, 0)
+    end
+    def click_link_and_wait(link_spec, sleep_secs=0)
+      FetchAction.click_link(link_spec, 0, sleep_secs)
+    end
+    def click_by_xpath(xpath)
+      FetchAction.click_by_xpath(xpath)
+    end
+    def click_image_map(index=0)
+      FetchAction.click_image_map(index)
+    end
+    def frame(attribute,value)
+      FetchAction.frame(attribute,value)
+    end
+    def wait(time=1)
+      FetchAction.wait(time)
+    end
+  end
+end

data/lib/scrubyt/core/scraping/compound_example.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Scrubyt
+  ##
+  #=<tt>Represents a compund example</tt>
+  #
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example. The simple example
+  #is specified by a string, and a compound example is specified with
+  #:contains, :begins_with and :ends_with descriptors - which can be
+  #both regexps or strings
+  class CompoundExample
+    DESCRIPTORS = [:contains, :begins_with, :ends_with]
+    attr_accessor :descriptor_hash
+    def initialize(descriptor_hash)
+      @descriptor_hash = descriptor_hash
+    end
+    ##
+    #Is the hash passed to this function a compound example descriptor hash?
+    #Need to decide this when parsing pattern parameters
+    def self.compound_example?(hash)
+      hash.each do |k,v|
+        return false if !DESCRIPTORS.include? k
+      end
+      true
+    end# end of method
+  end# #end of class CompoundExample
+end# end of module Scrubyt