RubyGems - scrubyt - Versions diffs - 0.3.0 → 0.3.4 - Mend

scrubyt 0.3.0 → 0.3.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (25)

data/CHANGELOG +13 -6
data/Rakefile +22 -10
data/lib/scrubyt.rb +9 -4
data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
data/lib/scrubyt/core/scraping/pattern.rb +25 -18
data/lib/scrubyt/core/shared/extractor.rb +109 -128
data/lib/scrubyt/logging.rb +146 -8
data/lib/scrubyt/output/export.rb +60 -44
data/lib/scrubyt/output/result_node.rb +34 -3
data/lib/scrubyt/output/scrubyt_result.rb +18 -9
data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
data/lib/scrubyt/utils/shared_utils.rb +1 -1
data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
metadata +52 -6
data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67

data/CHANGELOG CHANGED

@@ -1,7 +1,7 @@
 = scRUBYt! Changelog
-== 0.3.0
-=== 21st May, 2007
+== 0.3.1
+=== 29th May, 2007
 =<tt>changes:</tt>
@@ -17,19 +17,26 @@
       but it did not work for all cases)
 [NEW] possibility to click button with it's text (instead of it's index)
       (credit: Nick Merwin)
+[NEW] clicking radio buttons
 [NEW] can click on image buttons (by specifying the name of the button)
 [NEW] possibility to extract an URL with one step, like so:
       link 'The Difference/@href'
-      i.e. give me the href attribute of the element matched by the example 'The Difference'
+      i.e. give me the href attribute of the element matched by the example 'The      Difference'
 [NEW] new way to match an element of the page:
       div 'div[The Difference]'
       means 'return the div which contains the string "The Difference"'. This is
-      useful if the XPath of the element is non-constant across the same site (e.g.
-      sometimes a banner or add is added, sometimes not etc.)
-[FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase automatically
+      useful if the XPath of the element is non-constant across the same site
+      (e.g.sometimes a banner or add is added, sometimes not etc.)
+[NEW] Clicking image maps; At the moment this is achieved by specifying an
+      index, like
+      click_image_map 3
+      which means click the 4th link in the image map
+[FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase
+      automatically
 [FIX] Fixed: correctly downloading image if the src
       attribute had a leading space, as in
       <img src=' /files/downloads/images/image.jpg'/>
+[FIX] Other misc fixes - a ton of them!
 == 0.2.7
 === 12th April, 2007

data/Rakefile CHANGED

@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
 gem_spec = Gem::Specification.new do |s|
   s.name = 'scrubyt'
-  s.version = '0.3.0'
-  s.summary = 'A powerful Web-scraping framework'
+  s.version = '0.3.4'
+  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
   s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
   # Files containing Test::Unit test cases.
   s.test_files = FileList['test/unittests/**/*']
@@ -29,9 +29,12 @@ gem_spec = Gem::Specification.new do |s|
   s.homepage = 'http://www.scrubyt.org'
   s.add_dependency('hpricot', '>= 0.5')
   s.add_dependency('mechanize', '>= 0.6.3')
-  #s.add_dependency('parsetree', '>= 1.7.0')
-  #s.add_dependency('ruby2ruby', '>= 1.1.5')
-  s.has_rdoc = 'true'
+  s.add_dependency('ParseTreeReloaded')
+  s.add_dependency('RubyInlineAcceleration')
+  s.add_dependency('RubyInline', '= 3.6.3')
+  s.add_dependency('ParseTree', '= 1.7.1')
+  s.add_dependency('ruby2ruby', '= 1.1.6')
+  #s.has_rdoc = 'true'
 end
 ###################################################
@@ -56,10 +59,19 @@ Rake::TestTask.new(:test_blackbox) do |task|
   task.test_files = ['test/blackbox_test.rb']
 end
+task "test_specific" do
+  ruby "test/blackbox_test.rb #{ARGV[1]}"
+end
 Rake::TestTask.new(:test_non_blackbox) do |task|
   task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
 end
+task "rcov" do
+  sh 'rcov --xrefs test/*.rb'
+  puts 'Report done.'
+end
 task "cleanup_readme" do
   puts "Cleaning up README..."
   readme_in = open('./doc/files/README.html')
@@ -87,8 +99,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
   pkg.need_tar = false
 end
-Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
-  pkg.need_zip = true
-  pkg.need_tar = true
-  pkg.package_files.include("examples/**/*")
-end
+#Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
+#  pkg.need_zip = true
+#  pkg.need_tar = true
+#  pkg.package_files.include("examples/**/*")
+#end

data/lib/scrubyt.rb CHANGED

@@ -1,3 +1,6 @@
+$KCODE = 'u'
+require 'jcode'
 #ruby core
 require 'open-uri'
 require 'erb'
@@ -7,6 +10,7 @@ require 'rubygems'
 require 'mechanize'
 require 'hpricot'
 require 'parse_tree_reloaded'
+require 'rexml/text'
 #little hack to avoid that ruby2ruby tries to load the original parse_tree
 if Gem
@@ -42,16 +46,17 @@ require 'scrubyt/core/scraping/compound_example.rb'
 require 'scrubyt/output/result_node.rb'
 require 'scrubyt/output/scrubyt_result.rb'
 require 'scrubyt/output/export.rb'
+require 'scrubyt/core/navigation/navigation_actions.rb'
+require 'scrubyt/core/navigation/fetch_action.rb'
 require 'scrubyt/core/shared/extractor.rb'
 require 'scrubyt/core/scraping/filters/base_filter.rb'
 require 'scrubyt/core/scraping/filters/attribute_filter.rb'
+require 'scrubyt/core/scraping/filters/constant_filter.rb'
+require 'scrubyt/core/scraping/filters/script_filter.rb'
+require 'scrubyt/core/scraping/filters/text_filter.rb'
 require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
 require 'scrubyt/core/scraping/filters/download_filter.rb'
 require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
 require 'scrubyt/core/scraping/filters/regexp_filter.rb'
 require 'scrubyt/core/scraping/filters/tree_filter.rb'
 require 'scrubyt/core/scraping/pattern.rb'
-require 'scrubyt/core/navigation/navigation_actions.rb'
-require 'scrubyt/core/navigation/fetch_action.rb'
-require 'scrubyt/core/shared/evaluation_context.rb'
-require 'scrubyt/core/shared/u_r_i_builder.rb'

data/lib/scrubyt/core/navigation/fetch_action.rb CHANGED

@@ -7,7 +7,7 @@ module Scrubyt
   #functionality to a separate class - so if you are looking for anything
   #which is loading a document (even by submitting a form or clicking a link)
   #and related things like setting a proxy etc. you should find it here.
-  class FetchAction
+  module FetchAction
     @@current_doc_url = nil
     @@current_doc_protocol = nil
@@ -30,7 +30,7 @@ module Scrubyt
         mechanize_doc = args[0][:mechanize_doc]
         resolve = args[0][:resolve]
         basic_auth = args[0][:basic_auth]
-        user_agent = args[0][:user_agent] || "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"
+        user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
         #Refactor this whole stuff as well!!! It looks awful...
         parse_and_set_proxy(proxy) if proxy
         set_user_agent(user_agent)
@@ -120,23 +120,27 @@ module Scrubyt
       @@hpricot_doc
     end
-    def self.get_host_name
+    def get_host_name
       @@host_name
     end
-    def self.restore_host_name
+    def restore_host_name
       return if @@current_doc_protocol == 'file'
       @@host_name = @@original_host_name
     end
-    def self.store_page
+    def store_page
       @@history.push @@hpricot_doc
     end
-    def self.restore_page
+    def restore_page
       @@hpricot_doc = @@history.pop
     end
+    def store_host_name(doc_url)
+      FetchAction.store_host_name(doc_url)
+    end
     def self.store_host_name(doc_url)
       @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
       @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
@@ -171,8 +175,7 @@ module Scrubyt
         @@port = parts.delete_at(-1)
         @@host = parts.join(':')
         if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
-          puts "Invalid proxy specification..."
-          puts "neither host nor port can be nil!"
+          Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
           exit
         end
       end
@@ -227,6 +230,6 @@ module Scrubyt
           #custom resilving
           @@current_doc_url = resolve + doc_url
       end
-    end #end of function handle_relative_url
-  end #end of class FetchAction
-end #end of module Scrubyt
+    end
+  end
+end

data/lib/scrubyt/core/navigation/navigation_actions.rb CHANGED

@@ -5,25 +5,15 @@ module Scrubyt
   #This class contains all the actions that are used to navigate on web pages;
   #first of all, *fetch* for downloading the pages - then various actions
   #like filling textfields, submitting formst, clicking links and more
-  class NavigationActions
-    #These are reserved keywords - they can not be the name of any pattern
-    #since they are reserved for describing the navigation
-    KEYWORDS = ['fetch',
-                'fill_textfield',
-                'fill_textarea',
-                'submit',
-                'click_link',
-                'click_image_map',
-                'select_option',
-                'check_checkbox',
-                'check_radiobutton',
-                'end']
-    def initialize
-        @@current_form = nil
-        FetchAction.new
+  module NavigationActions
+    def self.extend_object(obj)
+      super(obj)
+      obj.instance_eval do
+        @current_form = nil
+      end
     end
     ##
     #Action to fill a textfield with a query string
     #
@@ -33,90 +23,76 @@ module Scrubyt
     #textfield is 'q'
     #
     #_query_string_ - the string that should be entered into the textfield
-    def self.fill_textfield(textfield_name, query_string)
+    def fill_textfield(textfield_name, query_string)
       lookup_form_for_tag('input','textfield',textfield_name,query_string)
-      eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
+      eval("@current_form['#{textfield_name}'] = '#{query_string}'")
     end
     ##
     #Action to fill a textarea with text
-    def self.fill_textarea(textarea_name, text)
+    def fill_textarea(textarea_name, text)
       lookup_form_for_tag('textarea','textarea',textarea_name,text)
-      eval("@@current_form['#{textarea_name}'] = '#{text}'")
+      eval("@current_form['#{textarea_name}'] = '#{text}'")
     end
     ##
     #Action for selecting an option from a dropdown box
-    def self.select_option(selectlist_name, option)
+    def select_option(selectlist_name, option)
       lookup_form_for_tag('select','select list',selectlist_name,option)
-      select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
+      select_list = @current_form.fields.find {|f| f.name == selectlist_name}
       searched_option = select_list.options.find{|f| f.text.strip == option}
       searched_option.click
     end
-    def self.check_checkbox(checkbox_name)
+    def check_checkbox(checkbox_name)
       lookup_form_for_tag('input','checkbox',checkbox_name, '')
-      @@current_form.checkboxes.name(checkbox_name).check
+      @current_form.checkboxes.name(checkbox_name).check
     end
-    def self.check_radiobutton(checkbox_name, index=0)
+    def check_radiobutton(checkbox_name, index=0)
       lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
-      @@current_form.radiobuttons.name(checkbox_name)[index].check
+      @current_form.radiobuttons.name(checkbox_name)[index].check
     end
     ##
     #Fetch the document
-    def self.fetch(*args)
+    def fetch(*args)
       FetchAction.fetch(*args)
     end
     ##
-    #Submit the current form (delegate it to NavigationActions)
-    def self.submit(index=nil, type=nil)
+    #Submit the current form
+    def submit(index=nil, type=nil)
       if index == nil
-        FetchAction.submit(@@current_form)
-      #----- added by nickmerwin@gmail.com -----
+        FetchAction.submit(@current_form)
+        #----- added by nickmerwin@gmail.com -----
       elsif index.class == String
-        button = @@current_form.buttons.detect{|b| b.name == index}
-        FetchAction.submit(@@current_form, button,type)
-      #-----------------------------------------
+        button = @current_form.buttons.detect{|b| b.name == index}
+        FetchAction.submit(@current_form, button,type)
+        #-----------------------------------------
       else
-        FetchAction.submit(@@current_form, @@current_form.buttons[index])
+        FetchAction.submit(@current_form, @current_form.buttons[index])
       end
     end
     ##
-    #Click the link specified by the text ((delegate it to NavigationActions)
-    def self.click_link(link_spec,index=0)
+    #Click the link specified by the text
+    def click_link(link_spec,index=0)
       FetchAction.click_link(link_spec,index)
     end
-    def self.click_image_map(index=0)
+    def click_image_map(index=0)
       FetchAction.click_image_map(index)
     end
-    def self.get_hpricot_doc
-      FetchAction.get_hpricot_doc
-    end
-    def self.get_current_doc_url
-      FetchAction.get_current_doc_url
-    end
-    def self.get_host_name
-      FetchAction.get_host_name
-    end
-private
-    def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
+    private
+    def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
       Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
       widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
-      p widget
       form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
-      p form_tag
       find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
     end
-    def self.find_form_based_on_tag(tag, possible_attrs)
+    def find_form_based_on_tag(tag, possible_attrs)
       lookup_attribute_name = nil
       lookup_attribute_value = nil
@@ -127,12 +103,11 @@ private
       }
       i = 0
       loop do
-        @@current_form = FetchAction.get_mechanize_doc.forms[i]
-        return nil if @@current_form == nil
-        puts i
-        break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
+        @current_form = FetchAction.get_mechanize_doc.forms[i]
+        return nil if @current_form == nil
+        break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
         i+= 1
       end
-    end#find_form_based_on_tag
-  end#end of class NavigationActions
-end#end of module Scrubyt
+    end
+  end
+end

data/lib/scrubyt/core/scraping/filters/base_filter.rb CHANGED

@@ -66,12 +66,13 @@ module Scrubyt
     #should not be called directly
     #TODO still used?
+    alias_method :throw_method_missing, :method_missing
     def method_missing(method_name, *args, &block)
       case method_name.to_s
       when /^ensure.+/
         constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
       else
-        raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
+        throw_method_missing(method_name, *args, &block)
       end
     end
@@ -82,7 +83,7 @@ module Scrubyt
     private
     #We don't want this to be accessible from outside
     def initialize(parent_pattern, example)
-      @example_type = BaseFilter.determine_example_type(example)
+      @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
       @parent_pattern = parent_pattern
       @example = example
       @xpath = nil                #The xpath to evaluate this filter

data/lib/scrubyt/core/scraping/filters/constant_filter.rb ADDED

@@ -0,0 +1,12 @@
+module Scrubyt
+  class ConstantFilter < BaseFilter
+    def evaluate(source)
+      return @example
+    end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
+  end #End of class ConstantFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb CHANGED

@@ -2,13 +2,36 @@ module Scrubyt
   class DetailPageFilter < BaseFilter
     def evaluate(source)
-      if source.is_a? String
-        @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
+      if source.is_a?(String)
+        url = source
       else
-        @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
-          XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
-          @parent_pattern, @parent_pattern.resolve)
+        url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
       end
-    end #end of method
-  end #End of class DetailPageFilter
-end #End of module Scrubyt
+      @parent_pattern.extractor.store_page
+      original_host_name = @parent_pattern.extractor.get_host_name
+      @parent_pattern.extractor.restore_host_name
+      FetchAction.fetch url, :resolve => @parent_pattern.resolve
+      if @detail_extractor.nil?
+        @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
+        root_results = @detail_extractor.result
+      else
+        root_results = @detail_extractor.evaluate_extractor
+      end
+      @parent_pattern.extractor.restore_page
+      @parent_pattern.extractor.store_host_name original_host_name
+      root_results
+    end
+    def get_detail_sexp
+      [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
+    end
+  end
+end