RubyGems - scrubyt - Versions diffs - 0.3.0 → 0.3.4 - Mend

scrubyt 0.3.0 → 0.3.4

Files changed (25)

data/CHANGELOG +13 -6
data/Rakefile +22 -10
data/lib/scrubyt.rb +9 -4
data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
data/lib/scrubyt/core/scraping/pattern.rb +25 -18
data/lib/scrubyt/core/shared/extractor.rb +109 -128
data/lib/scrubyt/logging.rb +146 -8
data/lib/scrubyt/output/export.rb +60 -44
data/lib/scrubyt/output/result_node.rb +34 -3
data/lib/scrubyt/output/scrubyt_result.rb +18 -9
data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
data/lib/scrubyt/utils/shared_utils.rb +1 -1
data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
metadata +52 -6
data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67

data/CHANGELOG CHANGED

@@ -1,7 +1,7 @@
 = scRUBYt! Changelog
-== 0.3.0
-=== 21st May, 2007
+== 0.3.1
+=== 29th May, 2007
 =<tt>changes:</tt>
@@ -17,19 +17,26 @@
       but it did not work for all cases)
 [NEW] possibility to click button with its text (instead of its index)
       (credit: Nick Merwin)
+[NEW] clicking radio buttons
 [NEW] can click on image buttons (by specifying the name of the button)
 [NEW] possibility to extract an URL with one step, like so:
       link 'The Difference/@href'
-      i.e. give me the href attribute of the element matched by the example 'The Difference'
+      i.e. give me the href attribute of the element matched by the example 'The      Difference'
 [NEW] new way to match an element of the page:
       div 'div[The Difference]'
       means 'return the div which contains the string "The Difference"'. This is
-      useful if the XPath of the element is non-constant across the same site (e.g.
-      sometimes a banner or add is added, sometimes not etc.)
-[FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase automatically
+      useful if the XPath of the element is non-constant across the same site
+      (e.g.sometimes a banner or add is added, sometimes not etc.)
+[NEW] Clicking image maps; At the moment this is achieved by specifying an
+      index, like
+      click_image_map 3
+      which means click the 4th link in the image map
+[FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase
+      automatically
 [FIX] Fixed: correctly downloading image if the src
       attribute had a leading space, as in
       <img src=' /files/downloads/images/image.jpg'/>
+[FIX] Other misc fixes - a ton of them!
 == 0.2.7
 === 12th April, 2007

data/Rakefile CHANGED

@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
 gem_spec = Gem::Specification.new do |s|
   s.name = 'scrubyt'
-  s.version = '0.3.0'
-  s.summary = 'A powerful Web-scraping framework'
+  s.version = '0.3.4'
+  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
   s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
   # Files containing Test::Unit test cases.
   s.test_files = FileList['test/unittests/**/*']
@@ -29,9 +29,12 @@ gem_spec = Gem::Specification.new do |s|
   s.homepage = 'http://www.scrubyt.org'
   s.add_dependency('hpricot', '>= 0.5')
   s.add_dependency('mechanize', '>= 0.6.3')
-  #s.add_dependency('parsetree', '>= 1.7.0')
-  #s.add_dependency('ruby2ruby', '>= 1.1.5')
-  s.has_rdoc = 'true'
+  s.add_dependency('ParseTreeReloaded')
+  s.add_dependency('RubyInlineAcceleration')
+  s.add_dependency('RubyInline', '= 3.6.3')
+  s.add_dependency('ParseTree', '= 1.7.1')
+  s.add_dependency('ruby2ruby', '= 1.1.6')
+  #s.has_rdoc = 'true'
 end
 ###################################################
@@ -56,10 +59,19 @@ Rake::TestTask.new(:test_blackbox) do |task|
   task.test_files = ['test/blackbox_test.rb']
 end
+task "test_specific" do
+  ruby "test/blackbox_test.rb #{ARGV[1]}"
+end
 Rake::TestTask.new(:test_non_blackbox) do |task|
   task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
 end
+task "rcov" do
+  sh 'rcov --xrefs test/*.rb'
+  puts 'Report done.'
+end
 task "cleanup_readme" do
   puts "Cleaning up README..."
   readme_in = open('./doc/files/README.html')
@@ -87,8 +99,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
   pkg.need_tar = false
 end
-Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
-  pkg.need_zip = true
-  pkg.need_tar = true
-  pkg.package_files.include("examples/**/*")
-end
+#Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
+#  pkg.need_zip = true
+#  pkg.need_tar = true
+#  pkg.package_files.include("examples/**/*")
+#end

data/lib/scrubyt.rb CHANGED

@@ -1,3 +1,6 @@
+$KCODE = 'u'
+require 'jcode'
 #ruby core
 require 'open-uri'
 require 'erb'
@@ -7,6 +10,7 @@ require 'rubygems'
 require 'mechanize'
 require 'hpricot'
 require 'parse_tree_reloaded'
+require 'rexml/text'
 #little hack to avoid that ruby2ruby tries to load the original parse_tree
 if Gem
@@ -42,16 +46,17 @@ require 'scrubyt/core/scraping/compound_example.rb'
 require 'scrubyt/output/result_node.rb'
 require 'scrubyt/output/scrubyt_result.rb'
 require 'scrubyt/output/export.rb'
+require 'scrubyt/core/navigation/navigation_actions.rb'
+require 'scrubyt/core/navigation/fetch_action.rb'
 require 'scrubyt/core/shared/extractor.rb'
 require 'scrubyt/core/scraping/filters/base_filter.rb'
 require 'scrubyt/core/scraping/filters/attribute_filter.rb'
+require 'scrubyt/core/scraping/filters/constant_filter.rb'
+require 'scrubyt/core/scraping/filters/script_filter.rb'
+require 'scrubyt/core/scraping/filters/text_filter.rb'
 require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
 require 'scrubyt/core/scraping/filters/download_filter.rb'
 require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
 require 'scrubyt/core/scraping/filters/regexp_filter.rb'
 require 'scrubyt/core/scraping/filters/tree_filter.rb'
 require 'scrubyt/core/scraping/pattern.rb'
-require 'scrubyt/core/navigation/navigation_actions.rb'
-require 'scrubyt/core/navigation/fetch_action.rb'
-require 'scrubyt/core/shared/evaluation_context.rb'
-require 'scrubyt/core/shared/u_r_i_builder.rb'

data/lib/scrubyt/core/navigation/fetch_action.rb CHANGED

@@ -7,7 +7,7 @@ module Scrubyt
   #functionality to a separate class - so if you are looking for anything
   #which is loading a document (even by submitting a form or clicking a link)
   #and related things like setting a proxy etc. you should find it here.
-  class FetchAction
+  module FetchAction
     @@current_doc_url = nil
     @@current_doc_protocol = nil
@@ -30,7 +30,7 @@ module Scrubyt
         mechanize_doc = args[0][:mechanize_doc]
         resolve = args[0][:resolve]
         basic_auth = args[0][:basic_auth]
-        user_agent = args[0][:user_agent] || "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"
+        user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
         #Refactor this whole stuff as well!!! It looks awful...
         parse_and_set_proxy(proxy) if proxy
         set_user_agent(user_agent)
@@ -120,23 +120,27 @@ module Scrubyt
       @@hpricot_doc
     end
-    def self.get_host_name
+    def get_host_name
       @@host_name
     end
-    def self.restore_host_name
+    def restore_host_name
       return if @@current_doc_protocol == 'file'
       @@host_name = @@original_host_name
     end
-    def self.store_page
+    def store_page
       @@history.push @@hpricot_doc
     end
-    def self.restore_page
+    def restore_page
       @@hpricot_doc = @@history.pop
     end
+    def store_host_name(doc_url)
+      FetchAction.store_host_name(doc_url)
+    end
     def self.store_host_name(doc_url)
       @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
       @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
@@ -171,8 +175,7 @@ module Scrubyt
         @@port = parts.delete_at(-1)
         @@host = parts.join(':')
         if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
-          puts "Invalid proxy specification..."
-          puts "neither host nor port can be nil!"
+          Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
           exit
         end
       end
@@ -227,6 +230,6 @@ module Scrubyt
           #custom resolving
           @@current_doc_url = resolve + doc_url
       end
-    end #end of function handle_relative_url
-  end #end of class FetchAction
-end #end of module Scrubyt
+    end
+  end
+end

data/lib/scrubyt/core/navigation/navigation_actions.rb CHANGED

@@ -5,25 +5,15 @@ module Scrubyt
   #This class contains all the actions that are used to navigate on web pages;
   #first of all, *fetch* for downloading the pages - then various actions
   #like filling textfields, submitting forms, clicking links and more
-  class NavigationActions
-    #These are reserved keywords - they can not be the name of any pattern
-    #since they are reserved for describing the navigation
-    KEYWORDS = ['fetch',
-                'fill_textfield',
-                'fill_textarea',
-                'submit',
-                'click_link',
-                'click_image_map',
-                'select_option',
-                'check_checkbox',
-                'check_radiobutton',
-                'end']
-    def initialize
-        @@current_form = nil
-        FetchAction.new
+  module NavigationActions
+    def self.extend_object(obj)
+      super(obj)
+      obj.instance_eval do
+        @current_form = nil
+      end
     end
     ##
     #Action to fill a textfield with a query string
     #
@@ -33,90 +23,76 @@ module Scrubyt
     #textfield is 'q'
     #
     #_query_string_ - the string that should be entered into the textfield
-    def self.fill_textfield(textfield_name, query_string)
+    def fill_textfield(textfield_name, query_string)
       lookup_form_for_tag('input','textfield',textfield_name,query_string)
-      eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
+      eval("@current_form['#{textfield_name}'] = '#{query_string}'")
     end
     ##
     #Action to fill a textarea with text
-    def self.fill_textarea(textarea_name, text)
+    def fill_textarea(textarea_name, text)
       lookup_form_for_tag('textarea','textarea',textarea_name,text)
-      eval("@@current_form['#{textarea_name}'] = '#{text}'")
+      eval("@current_form['#{textarea_name}'] = '#{text}'")
     end
     ##
     #Action for selecting an option from a dropdown box
-    def self.select_option(selectlist_name, option)
+    def select_option(selectlist_name, option)
       lookup_form_for_tag('select','select list',selectlist_name,option)
-      select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
+      select_list = @current_form.fields.find {|f| f.name == selectlist_name}
       searched_option = select_list.options.find{|f| f.text.strip == option}
       searched_option.click
     end
-    def self.check_checkbox(checkbox_name)
+    def check_checkbox(checkbox_name)
       lookup_form_for_tag('input','checkbox',checkbox_name, '')
-      @@current_form.checkboxes.name(checkbox_name).check
+      @current_form.checkboxes.name(checkbox_name).check
     end
-    def self.check_radiobutton(checkbox_name, index=0)
+    def check_radiobutton(checkbox_name, index=0)
       lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
-      @@current_form.radiobuttons.name(checkbox_name)[index].check
+      @current_form.radiobuttons.name(checkbox_name)[index].check
     end
     ##
     #Fetch the document
-    def self.fetch(*args)
+    def fetch(*args)
       FetchAction.fetch(*args)
     end
     ##
-    #Submit the current form (delegate it to NavigationActions)
-    def self.submit(index=nil, type=nil)
+    #Submit the current form
+    def submit(index=nil, type=nil)
       if index == nil
-        FetchAction.submit(@@current_form)
-      #----- added by nickmerwin@gmail.com -----
+        FetchAction.submit(@current_form)
+        #----- added by nickmerwin@gmail.com -----
       elsif index.class == String
-        button = @@current_form.buttons.detect{|b| b.name == index}
-        FetchAction.submit(@@current_form, button,type)
-      #-----------------------------------------
+        button = @current_form.buttons.detect{|b| b.name == index}
+        FetchAction.submit(@current_form, button,type)
+        #-----------------------------------------
       else
-        FetchAction.submit(@@current_form, @@current_form.buttons[index])
+        FetchAction.submit(@current_form, @current_form.buttons[index])
       end
     end
     ##
-    #Click the link specified by the text ((delegate it to NavigationActions)
-    def self.click_link(link_spec,index=0)
+    #Click the link specified by the text
+    def click_link(link_spec,index=0)
       FetchAction.click_link(link_spec,index)
     end
-    def self.click_image_map(index=0)
+    def click_image_map(index=0)
       FetchAction.click_image_map(index)
     end
-    def self.get_hpricot_doc
-      FetchAction.get_hpricot_doc
-    end
-    def self.get_current_doc_url
-      FetchAction.get_current_doc_url
-    end
-    def self.get_host_name
-      FetchAction.get_host_name
-    end
-private
-    def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
+    private
+    def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
       Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
       widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
-      p widget
       form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
-      p form_tag
       find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
     end
-    def self.find_form_based_on_tag(tag, possible_attrs)
+    def find_form_based_on_tag(tag, possible_attrs)
       lookup_attribute_name = nil
       lookup_attribute_value = nil
@@ -127,12 +103,11 @@ private
       }
       i = 0
       loop do
-        @@current_form = FetchAction.get_mechanize_doc.forms[i]
-        return nil if @@current_form == nil
-        puts i
-        break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
+        @current_form = FetchAction.get_mechanize_doc.forms[i]
+        return nil if @current_form == nil
+        break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
         i+= 1
       end
-    end#find_form_based_on_tag
-  end#end of class NavigationActions
-end#end of module Scrubyt
+    end
+  end
+end

data/lib/scrubyt/core/scraping/filters/base_filter.rb CHANGED

@@ -66,12 +66,13 @@ module Scrubyt
     #should not be called directly
     #TODO still used?
+    alias_method :throw_method_missing, :method_missing
     def method_missing(method_name, *args, &block)
       case method_name.to_s
       when /^ensure.+/
         constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
       else
-        raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
+        throw_method_missing(method_name, *args, &block)
       end
     end
@@ -82,7 +83,7 @@ module Scrubyt
     private
     #We don't want this to be accessible from outside
     def initialize(parent_pattern, example)
-      @example_type = BaseFilter.determine_example_type(example)
+      @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
       @parent_pattern = parent_pattern
       @example = example
       @xpath = nil                #The xpath to evaluate this filter

data/lib/scrubyt/core/scraping/filters/constant_filter.rb ADDED

@@ -0,0 +1,12 @@
+module Scrubyt
+  class ConstantFilter < BaseFilter
+    def evaluate(source)
+      return @example
+    end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
+  end #End of class ConstantFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb CHANGED

@@ -2,13 +2,36 @@ module Scrubyt
   class DetailPageFilter < BaseFilter
     def evaluate(source)
-      if source.is_a? String
-        @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
+      if source.is_a?(String)
+        url = source
       else
-        @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
-          XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
-          @parent_pattern, @parent_pattern.resolve)
+        url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
       end
-    end #end of method
-  end #End of class DetailPageFilter
-end #End of module Scrubyt
+      @parent_pattern.extractor.store_page
+      original_host_name = @parent_pattern.extractor.get_host_name
+      @parent_pattern.extractor.restore_host_name
+      FetchAction.fetch url, :resolve => @parent_pattern.resolve
+      if @detail_extractor.nil?
+        @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
+        root_results = @detail_extractor.result
+      else
+        root_results = @detail_extractor.evaluate_extractor
+      end
+      @parent_pattern.extractor.restore_page
+      @parent_pattern.extractor.store_host_name original_host_name
+      root_results
+    end
+    def get_detail_sexp
+      [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
+    end
+  end
+end