scrubyt 0.2.8 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +32 -2
- data/Rakefile +25 -20
- data/lib/scrubyt.rb +24 -5
- data/lib/scrubyt/core/navigation/fetch_action.rb +76 -42
- data/lib/scrubyt/core/navigation/navigation_actions.rb +24 -6
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +2 -2
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +2 -1
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -2
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +37 -12
- data/lib/scrubyt/core/scraping/pattern.rb +82 -90
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +2 -1
- data/lib/scrubyt/core/shared/evaluation_context.rb +14 -37
- data/lib/scrubyt/core/shared/extractor.rb +55 -54
- data/lib/scrubyt/logging.rb +16 -0
- data/lib/scrubyt/output/export.rb +1 -1
- data/lib/scrubyt/output/post_processor.rb +6 -5
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +4 -3
- data/lib/scrubyt/output/result_node.rb +73 -0
- data/lib/scrubyt/output/scrubyt_result.rb +28 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +8 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +14 -1
- data/lib/scrubyt/utils/xpathutils.rb +11 -0
- metadata +7 -12
- data/test/unittests/constraint_test.rb +0 -107
- data/test/unittests/extractor_test.rb +0 -91
- data/test/unittests/filter_test.rb +0 -79
- data/test/unittests/input/constraint_test.html +0 -55
- data/test/unittests/input/test.html +0 -39
- data/test/unittests/pattern_test.rb +0 -27
- data/test/unittests/simple_example_lookup_test.rb +0 -68
- data/test/unittests/xpathutils_test.rb +0 -152
data/CHANGELOG
CHANGED
@@ -1,7 +1,38 @@
 = scRUBYt! Changelog

+== 0.3.0
+=== 21st May, 2007
+
+=<tt>changes:</tt>
+
+[NEW] complete rewrite of the output system, creating
+      a solid foundation for more robust output functions
+      (credit: Neelance)
+[NEW] logging - no annoying puts messages anymore! (credit: Tim Fletcher)
+[NEW] can index an example - e.g.
+          link 'more[5]'
+      semantics: give me the 6th element with the text 'more'
+[NEW] can use XPath checking an attribute value, like "//div[@id='content']"
+[NEW] default values for missing elements (first version was done in 0.2.8
+      but it did not work for all cases)
+[NEW] possibility to click a button with its text (instead of its index)
+      (credit: Nick Merwin)
+[NEW] can click on image buttons (by specifying the name of the button)
+[NEW] possibility to extract a URL in one step, like so:
+          link 'The Difference/@href'
+      i.e. give me the href attribute of the element matched by the example 'The Difference'
+[NEW] new way to match an element of the page:
+          div 'div[The Difference]'
+      means 'return the div which contains the string "The Difference"'. This is
+      useful if the XPath of the element is non-constant across the same site (e.g.
+      sometimes a banner or ad is added, sometimes not, etc.)
+[FIX] replacing \240 (&nbsp;) with space in the preprocessing phase automatically
+[FIX] correctly downloading the image if the src
+      attribute had a leading space, as in
+          <img src=' /files/downloads/images/image.jpg'/>
+
 == 0.2.7
-===
+=== 12th April, 2007

 =<tt>changes:</tt>

@@ -9,7 +40,6 @@
       parent pattern
 [NEW] checking checkboxes
 [NEW] basic authentication support
-[NEW] default values for missing elements
 [NEW] possibility to resolve relative paths against a custom url
 [NEW] first simple version of to_csv and to_hash
 [NEW] complete rewrite of the exporting system (Credit: Neelance)

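To make the new matching features concrete, a minimal extractor sketch using them might look as follows (the URL and pattern names such as more_link and detail_url are hypothetical; the Extractor.define shape is scRUBYt!'s usual DSL):

    require 'rubygems'
    require 'scrubyt'

    data = Scrubyt::Extractor.define do
      fetch 'http://www.example.com/'

      record do
        # indexed example: the 6th element whose text is 'more'
        more_link 'more[5]'
        # XPath with an attribute check
        content "//div[@id='content']"
        # one-step attribute extraction: the href of the element matched by 'The Difference'
        detail_url 'The Difference/@href'
        # the div that contains the string 'The Difference'
        teaser 'div[The Difference]'
      end
    end

    puts data.to_xml
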
data/Rakefile
CHANGED
@@ -7,8 +7,7 @@ require 'rake/packagetask'
 # Dependencies
 ###################################################

-task "default" => ["
-task "fulltest" => ["test", "blackbox"]
+task "default" => ["test_all"]
 task "generate_rdoc" => ["cleanup_readme"]
 task "cleanup_readme" => ["rdoc"]

@@ -16,22 +15,24 @@ task "cleanup_readme" => ["rdoc"]
 # Gem specification
 ###################################################

-gem_spec = Gem::Specification.new do |s|
-  s.name = 'scrubyt'
-  s.version = '0.2.8'
+gem_spec = Gem::Specification.new do |s|
+  s.name = 'scrubyt'
+  s.version = '0.3.0'
   s.summary = 'A powerful Web-scraping framework'
-  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
-  # Files containing Test::Unit test cases.
-  s.test_files = FileList['test/unittests/**/*']
-  # List of other files to be included.
+  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
+  # Files containing Test::Unit test cases.
+  s.test_files = FileList['test/unittests/**/*']
+  # List of other files to be included.
   s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
   s.author = 'Peter Szinek'
-  s.email = 'peter@rubyrailways.com'
+  s.email = 'peter@rubyrailways.com'
   s.homepage = 'http://www.scrubyt.org'
   s.add_dependency('hpricot', '>= 0.5')
-  s.add_dependency('mechanize', '>= 0.6.3')
+  s.add_dependency('mechanize', '>= 0.6.3')
+  #s.add_dependency('parsetree', '>= 1.7.0')
+  #s.add_dependency('ruby2ruby', '>= 1.1.5')
   s.has_rdoc = 'true'
-end
+end

 ###################################################
 # Tasks

@@ -47,12 +48,16 @@ Rake::RDocTask.new do |generate_rdoc|
   generate_rdoc.options << '--line-numbers' << '--inline-source'
 end

-Rake::TestTask.new do |
-
-end
+Rake::TestTask.new(:test_all) do |task|
+  task.pattern = 'test/*_test.rb'
+end
+
+Rake::TestTask.new(:test_blackbox) do |task|
+  task.test_files = ['test/blackbox_test.rb']
+end

-
-
+Rake::TestTask.new(:test_non_blackbox) do |task|
+  task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
 end

 task "cleanup_readme" do

@@ -77,12 +82,12 @@ end
 task "generate_rdoc" do
 end

-Rake::GemPackageTask.new(gem_spec) do |pkg|
+Rake::GemPackageTask.new(gem_spec) do |pkg|
   pkg.need_zip = false
-  pkg.need_tar = false
+  pkg.need_tar = false
 end

-Rake::PackageTask.new('scrubyt-examples', '0.2.8') do |pkg|
+Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
   pkg.need_zip = true
   pkg.need_tar = true
   pkg.package_files.include("examples/**/*")

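The single fulltest task is thus split into three named Rake::TestTask definitions; with this Rakefile they are invoked by task name, e.g.:

    rake test_all           # every file matching test/*_test.rb
    rake test_blackbox      # only test/blackbox_test.rb
    rake test_non_blackbox  # everything except the blackbox suite
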
data/lib/scrubyt.rb
CHANGED
@@ -1,14 +1,34 @@
 #ruby core
 require 'open-uri'
+require 'erb'

 #gems
 require 'rubygems'
 require 'mechanize'
 require 'hpricot'
-require '
+require 'parse_tree_reloaded'
+
+#little hack to avoid that ruby2ruby tries to load the original parse_tree
+if Gem
+  module Gem
+    class << self
+      alias_method :activate_orig, :activate
+      def activate(gem, autorequire, *version_requirements)
+        activate_orig(gem, autorequire, *version_requirements) unless gem.is_a?(Gem::Dependency) && gem.name == 'ParseTree'
+      end
+    end
+  end
+end
+module Kernel
+  alias_method :require_orig, :require
+  def require(path)
+    require_orig(path) unless path == 'parse_tree'
+  end
+end
 require 'ruby2ruby'

 #scrubyt
+require 'scrubyt/logging'
 require 'scrubyt/utils/ruby_extensions.rb'
 require 'scrubyt/utils/xpathutils.rb'
 require 'scrubyt/utils/shared_utils.rb'

@@ -19,6 +39,8 @@ require 'scrubyt/core/scraping/constraint.rb'
 require 'scrubyt/core/scraping/result_indexer.rb'
 require 'scrubyt/core/scraping/pre_filter_document.rb'
 require 'scrubyt/core/scraping/compound_example.rb'
+require 'scrubyt/output/result_node.rb'
+require 'scrubyt/output/scrubyt_result.rb'
 require 'scrubyt/output/export.rb'
 require 'scrubyt/core/shared/extractor.rb'
 require 'scrubyt/core/scraping/filters/base_filter.rb'

@@ -29,10 +51,7 @@ require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
 require 'scrubyt/core/scraping/filters/regexp_filter.rb'
 require 'scrubyt/core/scraping/filters/tree_filter.rb'
 require 'scrubyt/core/scraping/pattern.rb'
-require 'scrubyt/output/result_dumper.rb'
-require 'scrubyt/output/result.rb'
-require 'scrubyt/output/post_processor.rb'
 require 'scrubyt/core/navigation/navigation_actions.rb'
 require 'scrubyt/core/navigation/fetch_action.rb'
 require 'scrubyt/core/shared/evaluation_context.rb'
-require 'scrubyt/core/shared/u_r_i_builder.rb'
+require 'scrubyt/core/shared/u_r_i_builder.rb'

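The Gem.activate and Kernel#require wrappers above use the classic alias_method interception pattern: keep the original method under a new name, then redefine the method to delegate to it except for the one case being blocked (here, loads of the original parse_tree, which the bundled parse_tree_reloaded replaces). The Kernel half of the trick, isolated as a self-contained sketch:

    module Kernel
      alias_method :require_orig, :require

      # Delegate to the real require for everything except the shadowed
      # library, so a `require 'parse_tree'` inside ruby2ruby becomes a no-op.
      def require(path)
        require_orig(path) unless path == 'parse_tree'
      end
    end
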
data/lib/scrubyt/core/navigation/fetch_action.rb
CHANGED

@@ -8,14 +8,13 @@ module Scrubyt
   #which is loading a document (even by submitting a form or clicking a link)
   #and related things like setting a proxy etc. you should find it here.
   class FetchAction
-
-
-
-
-
-
-
-    end
+
+    @@current_doc_url = nil
+    @@current_doc_protocol = nil
+    @@base_dir = nil
+    @@host_name = nil
+    @@agent = WWW::Mechanize.new
+    @@history = []

     ##
     #Action to fetch a document (either a file or a http address)

@@ -25,29 +24,38 @@ module Scrubyt
     #_doc_url_ - the url or file name to fetch
     def self.fetch(doc_url, *args)
       #Refactor this crap!!! with option_accessor stuff
-
-
-
-
-
-
-
-
-
-
-
-
+
+      if args.size > 0
+        proxy = args[0][:proxy]
+        mechanize_doc = args[0][:mechanize_doc]
+        resolve = args[0][:resolve]
+        basic_auth = args[0][:basic_auth]
+        user_agent = args[0][:user_agent] || "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"
+        #Refactor this whole stuff as well!!! It looks awful...
+        parse_and_set_proxy(proxy) if proxy
+        set_user_agent(user_agent)
+        parse_and_set_basic_auth(basic_auth) if basic_auth
+      else
+        mechanize_doc = nil
+        resolve = :full
+      end
+
+      @@current_doc_url = doc_url
+      @@current_doc_protocol = determine_protocol
+
+      if mechanize_doc.nil? && @@current_doc_protocol != 'file'
         handle_relative_path(doc_url)
-        handle_relative_url(doc_url,resolve)
-
-
+        handle_relative_url(doc_url, resolve)
+
+        Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
+
+        unless 'file' == @@current_doc_protocol
           @@mechanize_doc = @@agent.get(@@current_doc_url)
         end
       else
-        @@current_doc_url = doc_url
         @@mechanize_doc = mechanize_doc
-        @@current_doc_protocol = determine_protocol
       end
+
       if @@current_doc_protocol == 'file'
         @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
       else

@@ -58,22 +66,24 @@ module Scrubyt

     ##
     #Submit the last form;
-    def self.submit(current_form, button=nil)
-
+    def self.submit(current_form, button=nil, type=nil)
+      Scrubyt.log :ACTION, 'Submitting form...'
       if button == nil
         result_page = @@agent.submit(current_form)
+      elsif type
+        result_page = current_form.submit(button)
       else
         result_page = @@agent.submit(current_form, button)
       end
       @@current_doc_url = result_page.uri.to_s
-
+      Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
       fetch(@@current_doc_url, :mechanize_doc => result_page)
     end

     ##
     #Click the link specified by the text
     def self.click_link(link_spec,index = 0)
-
+      Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
       if link_spec.is_a? Hash
         clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
       else

@@ -82,7 +92,16 @@ module Scrubyt
       clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
       result_page = @@agent.click(clicked_elem)
       @@current_doc_url = result_page.uri.to_s
-
+      Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+      fetch(@@current_doc_url, :mechanize_doc => result_page)
+    end
+
+    def self.click_image_map(index = 0)
+      Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
+      uri = @@mechanize_doc.search("//area")[index]['href']
+      result_page = @@agent.get(uri)
+      @@current_doc_url = result_page.uri.to_s
+      Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
       fetch(@@current_doc_url, :mechanize_doc => result_page)
     end

@@ -118,6 +137,14 @@ module Scrubyt
       @@hpricot_doc = @@history.pop
     end

+    def self.store_host_name(doc_url)
+      @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
+      @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
+      @@host_name = doc_url if @@host_name == nil
+      @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
+      @@original_host_name ||= @@host_name
+    end #end of method store_host_name
+
     def self.determine_protocol
       old_protocol = @@current_doc_protocol
       new_protocol = case @@current_doc_url

@@ -149,18 +176,18 @@ module Scrubyt
           exit
         end
       end
-
+      Scrubyt.log :ACTION, "Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
       @@agent.set_proxy(@@host, @@port)
     end

     def self.parse_and_set_basic_auth(basic_auth)
       login, pass = basic_auth.split('@')
-
+      Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
       @@agent.basic_auth(login, pass)
     end

     def self.set_user_agent(user_agent)
-
+      Scrubyt.log :ACTION, "Setting user-agent to #{user_agent}"
       @@agent.user_agent = user_agent
     end

@@ -172,22 +199,29 @@ module Scrubyt
       end
     end

-    def self.store_host_name(doc_url)
-      @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
-      @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
-      @@host_name = doc_url if @@host_name == nil
-      @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
-      @@original_host_name ||= @@host_name
-    end #end of method store_host_name
-
     def self.handle_relative_url(doc_url, resolve)
       return if doc_url =~ /^http/
+      if doc_url !~ /^\//
+        first_char = doc_url[0..0]
+        doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
+        if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
+          current_uri = @@mechanize_doc.uri.to_s
+          current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
+          if (current_uri.include? '?')
+            current_uri = current_uri.scan(/.+\//)[0]
+          else
+            current_uri += '/' unless current_uri[-1..-1] == '/'
+          end
+          @@current_doc_url = current_uri + doc_url
+          return
+        end
+      end
       case resolve
       when :full
         @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
         @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
       when :host
-        base_host_name = @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0]
+        base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
         @@current_doc_url = base_host_name + doc_url
       else
         #custom resilving

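As reworked above, fetch reads its options from a trailing hash. A hedged example of a call exercising them (all values hypothetical; :proxy is expected as host:port and :basic_auth as login@password, per parse_and_set_proxy and parse_and_set_basic_auth):

    Scrubyt::FetchAction.fetch('http://www.example.com/',
                               :proxy      => 'localhost:8080',
                               :basic_auth => 'admin@secret',
                               :user_agent => 'MyAgent/1.0')
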
data/lib/scrubyt/core/navigation/navigation_actions.rb
CHANGED

@@ -13,8 +13,10 @@
                 'fill_textarea',
                 'submit',
                 'click_link',
+                'click_image_map',
                 'select_option',
                 'check_checkbox',
+                'check_radiobutton',
                 'end']

     def initialize

@@ -48,16 +50,20 @@
     def self.select_option(selectlist_name, option)
       lookup_form_for_tag('select','select list',selectlist_name,option)
       select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
-      searched_option = select_list.options.find{|f| f.text == option}
+      searched_option = select_list.options.find{|f| f.text.strip == option}
       searched_option.click
     end

     def self.check_checkbox(checkbox_name)
-      puts checkbox_name
       lookup_form_for_tag('input','checkbox',checkbox_name, '')
       @@current_form.checkboxes.name(checkbox_name).check
     end

+    def self.check_radiobutton(checkbox_name, index=0)
+      lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
+      @@current_form.radiobuttons.name(checkbox_name)[index].check
+    end
+
     ##
     #Fetch the document
     def self.fetch(*args)

@@ -65,9 +71,14 @@
     end
     ##
     #Submit the current form (delegate it to NavigationActions)
-    def self.submit(index=nil)
+    def self.submit(index=nil, type=nil)
       if index == nil
         FetchAction.submit(@@current_form)
+      #----- added by nickmerwin@gmail.com -----
+      elsif index.class == String
+        button = @@current_form.buttons.detect{|b| b.name == index}
+        FetchAction.submit(@@current_form, button,type)
+      #-----------------------------------------
       else
         FetchAction.submit(@@current_form, @@current_form.buttons[index])
       end

@@ -79,6 +90,10 @@
       FetchAction.click_link(link_spec,index)
     end

+    def self.click_image_map(index=0)
+      FetchAction.click_image_map(index)
+    end
+
     def self.get_hpricot_doc
       FetchAction.get_hpricot_doc
     end

@@ -92,10 +107,12 @@
     end

 private
-    def self.lookup_form_for_tag(tag,widget_name,name_attribute,query_string)
-
-    widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[
+    def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
+      Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
+      widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
+      p widget
       form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
+      p form_tag
       find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
     end

@@ -112,6 +129,7 @@ private
     loop do
       @@current_form = FetchAction.get_mechanize_doc.forms[i]
       return nil if @@current_form == nil
+      puts i
       break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
       i+= 1
     end

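Combined with the FetchAction change, submit can now be driven by a button's name: a String argument is looked up via buttons.detect {|b| b.name == index}, while an Integer still selects by position. A sketch of these form actions inside an extractor definition (the URL, field, and button names are hypothetical):

    Scrubyt::Extractor.define do
      fetch 'http://www.example.com/search'
      fill_textfield 'q', 'scrubyt'
      check_radiobutton 'format', 1   # check the 2nd radio button named 'format'
      submit 'search_button'          # submit via the button with this name
    end
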