RubyGems - scrubyt - Versions diffs - 0.3.4 → 0.4.1 - Mend

scrubyt 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

data/CHANGELOG +31 -0
data/README +1 -1
data/Rakefile +4 -9
data/lib/scrubyt.rb +37 -56
data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
data/lib/scrubyt/core/scraping/pattern.rb +6 -27
data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
data/lib/scrubyt/core/shared/extractor.rb +15 -1
data/lib/scrubyt/output/result_node.rb +42 -6
data/lib/scrubyt/output/scrubyt_result.rb +35 -30
data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
data/lib/scrubyt/utils/xpathutils.rb +2 -1
metadata +84 -119
data/lib/scrubyt/output/export.rb +0 -157

data/lib/scrubyt/core/navigation/fetch_action.rb CHANGED

@@ -8,103 +8,13 @@ module Scrubyt
   #which is loading a document (even by submitting a form or clicking a link)
   #and related things like setting a proxy etc. you should find it here.
   module FetchAction
     @@current_doc_url = nil
     @@current_doc_protocol = nil
     @@base_dir = nil
     @@host_name = nil
-    @@agent = WWW::Mechanize.new
     @@history = []
-    ##
-    #Action to fetch a document (either a file or a http address)
-    #
-    #*parameters*
-    #
-    #_doc_url_ - the url or file name to fetch
-    def self.fetch(doc_url, *args)
-      #Refactor this crap!!! with option_accessor stuff
-      if args.size > 0
-        proxy = args[0][:proxy]
-        mechanize_doc = args[0][:mechanize_doc]
-        resolve = args[0][:resolve]
-        basic_auth = args[0][:basic_auth]
-        user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
-        #Refactor this whole stuff as well!!! It looks awful...
-        parse_and_set_proxy(proxy) if proxy
-        set_user_agent(user_agent)
-        parse_and_set_basic_auth(basic_auth) if basic_auth
-      else
-        mechanize_doc = nil
-        resolve = :full
-      end
-      @@current_doc_url = doc_url
-      @@current_doc_protocol = determine_protocol
-      if mechanize_doc.nil? && @@current_doc_protocol != 'file'
-        handle_relative_path(doc_url)
-        handle_relative_url(doc_url, resolve)
-        Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
-        unless 'file' == @@current_doc_protocol
-          @@mechanize_doc = @@agent.get(@@current_doc_url)
-        end
-      else
-        @@mechanize_doc = mechanize_doc
-      end
-      if @@current_doc_protocol == 'file'
-        @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
-      else
-        @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
-        store_host_name(self.get_current_doc_url)   # in case we're on a new host
-      end
-    end
-    ##
-    #Submit the last form;
-    def self.submit(current_form, button=nil, type=nil)
-      Scrubyt.log :ACTION, 'Submitting form...'
-      if button == nil
-        result_page = @@agent.submit(current_form)
-      elsif type
-        result_page = current_form.submit(button)
-      else
-        result_page = @@agent.submit(current_form, button)
-      end
-      @@current_doc_url = result_page.uri.to_s
-      Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
-      fetch(@@current_doc_url, :mechanize_doc => result_page)
-    end
-    ##
-    #Click the link specified by the text
-    def self.click_link(link_spec,index = 0)
-      Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
-      if link_spec.is_a? Hash
-        clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
-      else
-        clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
-      end
-      clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
-      result_page = @@agent.click(clicked_elem)
-      @@current_doc_url = result_page.uri.to_s
-      Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
-      fetch(@@current_doc_url, :mechanize_doc => result_page)
-    end
-    def self.click_image_map(index = 0)
-      Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
-      uri = @@mechanize_doc.search("//area")[index]['href']
-      result_page = @@agent.get(uri)
-      @@current_doc_url = result_page.uri.to_s
-      Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
-      fetch(@@current_doc_url, :mechanize_doc => result_page)
-    end
+    @@current_form = nil
     ##
     # At any given point, the current document can be queried with this method; Typically used
     # when the navigation is over and the result document is passed to the wrapper
@@ -140,96 +50,5 @@ module Scrubyt
     def store_host_name(doc_url)
       FetchAction.store_host_name(doc_url)
     end
-    def self.store_host_name(doc_url)
-      @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
-      @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
-      @@host_name = doc_url if @@host_name == nil
-      @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
-      @@original_host_name ||= @@host_name
-    end #end of method store_host_name
-    def self.determine_protocol
-      old_protocol = @@current_doc_protocol
-      new_protocol = case @@current_doc_url
-        when /^https/
-          'https'
-        when /^http/
-          'http'
-        when /^www/
-          'http'
-        else
-          'file'
-        end
-      return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
-      return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
-      new_protocol
-    end
-    def self.parse_and_set_proxy(proxy)
-      if proxy.downcase == 'localhost'
-        @@host = 'localhost'
-        @@port = proxy.split(':').last
-      else
-        parts = proxy.split(':')
-        @@port = parts.delete_at(-1)
-        @@host = parts.join(':')
-        if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
-          Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
-          exit
-        end
-      end
-      Scrubyt.log :ACTION, "Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
-      @@agent.set_proxy(@@host, @@port)
-    end
-    def self.parse_and_set_basic_auth(basic_auth)
-      login, pass = basic_auth.split('@')
-      Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
-      @@agent.basic_auth(login, pass)
-    end
-    def self.set_user_agent(user_agent)
-      Scrubyt.log :ACTION, "Setting user-agent to #{user_agent}"
-      @@agent.user_agent = user_agent
-    end
-    def self.handle_relative_path(doc_url)
-      if @@base_dir == nil
-        @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
-      else
-        @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
-      end
-    end
-    def self.handle_relative_url(doc_url, resolve)
-      return if doc_url =~ /^http/
-      if doc_url !~ /^\//
-        first_char = doc_url[0..0]
-        doc_url = ( first_char == '?'  ? '' : '/'  ) + doc_url
-        if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
-          current_uri = @@mechanize_doc.uri.to_s
-          current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
-          if (current_uri.include? '?')
-            current_uri = current_uri.scan(/.+\//)[0]
-          else
-            current_uri += '/' unless current_uri[-1..-1] == '/'
-          end
-          @@current_doc_url = current_uri + doc_url
-          return
-        end
-      end
-      case resolve
-        when :full
-          @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
-          @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
-        when :host
-          base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
-          @@current_doc_url = base_host_name + doc_url
-        else
-          #custom resilving
-          @@current_doc_url = resolve + doc_url
-      end
-    end
   end
 end

data/lib/scrubyt/core/navigation/navigation_actions.rb CHANGED

@@ -23,35 +23,32 @@ module Scrubyt
     #textfield is 'q'
     #
     #_query_string_ - the string that should be entered into the textfield
-    def fill_textfield(textfield_name, query_string)
-      lookup_form_for_tag('input','textfield',textfield_name,query_string)
-      eval("@current_form['#{textfield_name}'] = '#{query_string}'")
+    def fill_textfield(textfield_name, query_string, use_value = nil)
+      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
+    end
+    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
+      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
     end
     ##
     #Action to fill a textarea with text
     def fill_textarea(textarea_name, text)
-      lookup_form_for_tag('textarea','textarea',textarea_name,text)
-      eval("@current_form['#{textarea_name}'] = '#{text}'")
+      FetchAction.fill_textarea(textarea_name, text)
     end
     ##
     #Action for selecting an option from a dropdown box
     def select_option(selectlist_name, option)
-      lookup_form_for_tag('select','select list',selectlist_name,option)
-      select_list = @current_form.fields.find {|f| f.name == selectlist_name}
-      searched_option = select_list.options.find{|f| f.text.strip == option}
-      searched_option.click
+      FetchAction.select_option(selectlist_name, option)
     end
     def check_checkbox(checkbox_name)
-      lookup_form_for_tag('input','checkbox',checkbox_name, '')
-      @current_form.checkboxes.name(checkbox_name).check
+      FetchAction.check_checkbox(checkbox_name)
     end
     def check_radiobutton(checkbox_name, index=0)
-      lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
-      @current_form.radiobuttons.name(checkbox_name)[index].check
+      FetchAction.check_radiobutton(checkbox_name, index=0)
     end
     ##
@@ -62,52 +59,37 @@ module Scrubyt
     ##
     #Submit the current form
     def submit(index=nil, type=nil)
-      if index == nil
-        FetchAction.submit(@current_form)
-        #----- added by nickmerwin@gmail.com -----
-      elsif index.class == String
-        button = @current_form.buttons.detect{|b| b.name == index}
-        FetchAction.submit(@current_form, button,type)
-        #-----------------------------------------
-      else
-        FetchAction.submit(@current_form, @current_form.buttons[index])
-      end
+      FetchAction.submit(nil, index, type)
+    end
+    def submit_and_wait(sleep_time, index=nil, type=nil)
+      FetchAction.submit(index, sleep_time,  type)
     end
     ##
     #Click the link specified by the text
     def click_link(link_spec,index=0)
-      FetchAction.click_link(link_spec,index)
+      FetchAction.click_link(link_spec,index, 0)
+    end
+    def click_link_and_wait(link_spec, sleep_secs=0)
+      FetchAction.click_link(link_spec, 0, sleep_secs)
+    end
+    def click_by_xpath(xpath)
+      FetchAction.click_by_xpath(xpath)
     end
     def click_image_map(index=0)
       FetchAction.click_image_map(index)
     end
-    private
-    def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
-      Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
-      widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
-      form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
-      find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
-    end
-    def find_form_based_on_tag(tag, possible_attrs)
-      lookup_attribute_name = nil
-      lookup_attribute_value = nil
-      possible_attrs.each { |a|
-        lookup_attribute_name = a
-        lookup_attribute_value = tag.attributes[a]
-        break if lookup_attribute_value != nil
-      }
-      i = 0
-      loop do
-        @current_form = FetchAction.get_mechanize_doc.forms[i]
-        return nil if @current_form == nil
-        break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
-        i+= 1
-      end
+    def frame(attribute,value)
+      FetchAction.frame(attribute,value)
+    end
+    def wait(time=1)
+      FetchAction.wait(time)
     end
   end
 end

data/lib/scrubyt/core/scraping/filters/attribute_filter.rb CHANGED

@@ -10,8 +10,5 @@ module Scrubyt
       end
     end
-    def to_sexp
-      [:str, @example]
-    end #end of method to_sexp
   end #End of class AttributeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/base_filter.rb CHANGED

@@ -53,7 +53,6 @@ module Scrubyt
                   :constraints, :xpath, :regexp, :example, :final_result)
     def self.create(parent_pattern, example=nil)
       filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
       if filter_name == 'RootFilter'
         BaseFilter.new(parent_pattern, example)
@@ -76,14 +75,15 @@ module Scrubyt
       end
     end
-    def to_sexp
-      nil
-    end
     private
     #We don't want this to be accessible from outside
     def initialize(parent_pattern, example)
-      @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
+      case parent_pattern.example_type
+      when :xpath
+        @example_type = EXAMPLE_TYPE_XPATH
+      else
+        @example_type = BaseFilter.determine_example_type(example)
+      end
       @parent_pattern = parent_pattern
       @example = example
       @xpath = nil                #The xpath to evaluate this filter

data/lib/scrubyt/core/scraping/filters/constant_filter.rb CHANGED

@@ -5,8 +5,5 @@ module Scrubyt
       return @example
     end
-    def to_sexp
-      [:str, @example]
-    end #end of method to_sexp
   end #End of class ConstantFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb CHANGED

@@ -7,12 +7,16 @@ module Scrubyt
       else
         url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
       end
       @parent_pattern.extractor.store_page
       original_host_name = @parent_pattern.extractor.get_host_name
       @parent_pattern.extractor.restore_host_name
-      FetchAction.fetch url, :resolve => @parent_pattern.resolve
+      begin
+        FetchAction.fetch url, :resolve => @parent_pattern.resolve
+      rescue
+        Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
+      end
       if @detail_extractor.nil?
         @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
@@ -29,9 +33,5 @@ module Scrubyt
       root_results
     end
-    def get_detail_sexp
-      [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
-    end
   end
 end

data/lib/scrubyt/core/scraping/filters/download_filter.rb CHANGED

@@ -8,10 +8,6 @@ module Scrubyt
       download_file(source)
     end #end of method
-    def to_sexp
-      [:str, @example]
-    end #end of method to_sexp
 private
     def download_file(source)
       return '' if source.size < 4

data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb CHANGED

@@ -5,8 +5,5 @@ module Scrubyt
       source.inner_html
     end
-    def to_sexp
-      nil
-    end #end of method
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/regexp_filter.rb CHANGED

@@ -9,9 +9,5 @@ module Scrubyt
       end
     end
-    def to_sexp
-      [:lit, @example]
-    end
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/script_filter.rb CHANGED

@@ -7,8 +7,5 @@ module Scrubyt
       @example.call param
     end
-    def to_sexp
-      [:str, "FIXME!!! Can't dump Proc"]
-    end #end of method to_sexp
   end #End of class ConstantFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/text_filter.rb CHANGED

@@ -9,7 +9,6 @@ module Scrubyt
         index = @example.scan(/\]:(.+)/).flatten
         index = 0 if index.empty?
         index = index[0].to_i unless index[0] == "all"
         result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
         return "" unless result
@@ -22,7 +21,7 @@ module Scrubyt
     def find_string(source)
       str = @example.scan(/find\((.+)\)/).flatten[0]
-      strings_to_find = str.include? ('|') ? str.split('|') : [str]
+      strings_to_find = str.include?('|') ? str.split('|') : [str]
       strings_to_find.each do |s|
         result = SharedUtils.traverse_for_match(source,/#{s}/i)
         return [s] unless result.empty?
@@ -30,9 +29,6 @@ module Scrubyt
       return []
     end
-    def to_sexp
-      [:str, @example]
-    end #end of method to_sexp
   end #End of class TextFilter
 end #End of module Scrubyt