RubyGems - scrubyt - Versions diffs - 0.4.1 → 0.4.05 - Mend

scrubyt 0.4.1 → 0.4.05

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

data/CHANGELOG CHANGED

@@ -1,20 +1,15 @@
 = scRUBYt! Changelog
 == 0.4.05
-== 20th October
+== 14th November
 =<tt>changes:</tt>
-- [NEW] possibility to use FireWatir as the agent for scraping (credit: Glenn Gillen, Glen Gillen and... did I mention Glenn already?)
+- [NEW] possibility to use FireWatir as the agent for scraping (credit: Glen Gillen)
 - [FIX] navigation doesn't crash if a 404/500 is returned (credit: Glen Gillen)
-- [NEW] navigation action: click_by_xpath to click arbitrary elements
+- [NEW] navigation actions: click_by_xpath, click_link_and_wait
 - [MOD] dropped dependencies: RubyInline, ParseTree, Ruby2Ruby (hooray for win32 users)
-- [NEW] scraping through frames (e.g. google analytics)
 - [MOD] exporting temporarily doesn't work - for now, generated XPaths are printed to the screen
-- [MOD] possibility to wait after clicking link/filling textfield (to be able to scrape inserted AJAX stuff)
-- [NEW] possibility to fetch from a string, by specifying nil as the url and the html string with the :html option
-- [FIX] firewatir slowness (credit: jak4)
 - [FIX] lot of bugfixes and stability fixes
--
 == 0.4.0 (unofficial)
 === 31st October, 2007
@@ -27,10 +22,6 @@
 == 0.3.4
 === 26th September, 2007
-=<tt>changes:</tt>
-It seems I have been too busy to update the changelog ;)
 == 0.3.1
 === 29th May, 2007

data/Rakefile CHANGED

@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
 gem_spec = Gem::Specification.new do |s|
   s.name = 'scrubyt'
-  s.version = '0.4.1'
+  s.version = '0.4.05'
   s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
   s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
   # Files containing Test::Unit test cases.
@@ -94,8 +94,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
   pkg.need_tar = false
 end
-#Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
-#  pkg.need_zip = true
-#  pkg.need_tar = true
-#  pkg.package_files.include("examples/**/*")
-#end
+Rake::PackageTask.new('scrubyt-examples', gem_spec.version) do |pkg|
+  pkg.need_zip = true
+  pkg.need_tar = true
+  pkg.package_files.include("examples/**/*")
+end

data/lib/scrubyt/core/navigation/agents/firewatir.rb CHANGED

@@ -21,7 +21,6 @@ module Scrubyt
           @@host_name = nil
           @@history = []
           @@current_form = nil
-          @@current_frame = nil
           ##
           #Action to fetch a document (either a file or a http address)
@@ -59,33 +58,12 @@ module Scrubyt
             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
             store_host_name(@@agent.url)   # in case we're on a new host
           end
-          def self.frame(attribute, value)
-            if @@current_frame
-              @@current_frame.frame(attribute, value)
-            else
-              @@current_frame = @@agent.frame(attribute, value)
-            end
-          end
           ##
           #Submit the last form;
-          def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
-            if @@current_frame
-              #BRUTAL hax but FW is such a shitty piece of software
-              #this sucks FAIL omg
-              @@current_frame.locate
-              form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
-              form.submit
-            else
-              @@agent.element_by_xpath(@@current_form).submit
-            end
-            if sleep_time
-              sleep sleep_time
-              @@agent.wait
-            end
+          def self.submit(current_form, button=nil, type=nil)
+            @@agent.element_by_xpath(@@current_form).submit
+            @@agent.wait
             @@current_doc_url = @@agent.url
             @@mechanize_doc = "<html>#{@@agent.html}</html>"
             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -196,18 +174,9 @@ module Scrubyt
             end
           end
-          def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
+          def self.fill_textfield(textfield_name, query_string)
             @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
-            target = @@current_frame || @@agent
-            if useValue
-              target.text_field(:name,textfield_name).value = query_string
-            else
-              target.text_field(:name,textfield_name).set(query_string)
-            end
-            sleep(wait_secs) if wait_secs > 0
-            @@mechanize_doc = "<html>#{@@agent.html}</html>"
-            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
+            @@agent.text_field(:name,textfield_name).set(query_string)
           end
           ##

data/lib/scrubyt/core/navigation/agents/mechanize.rb CHANGED

@@ -32,21 +32,16 @@ module Scrubyt
             if args.size > 0
               mechanize_doc = args[0][:mechanize_doc]
-              html = args[0][:html]
               resolve = args[0][:resolve]
               basic_auth = args[0][:basic_auth]
               parse_and_set_basic_auth(basic_auth) if basic_auth
-              if html
-                @@current_doc_protocol = 'string'
-                mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
-              end
             else
               mechanize_doc = nil
               resolve = :full
             end
             @@current_doc_url = doc_url
-            @@current_doc_protocol ||= determine_protocol
+            @@current_doc_protocol = determine_protocol
             if mechanize_doc.nil? && @@current_doc_protocol != 'file'
               handle_relative_path(doc_url)
@@ -64,13 +59,13 @@ module Scrubyt
               @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
             else
               @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
-              store_host_name(self.get_current_doc_url) if self.get_current_doc_url   # in case we're on a new host
+              store_host_name(self.get_current_doc_url)   # in case we're on a new host
             end
           end
           ##
           #Submit the last form;
-          def self.submit(index=nil, sleep_time=nil, type=nil)
+          def self.submit(index=nil, type=nil)
             Scrubyt.log :ACTION, 'Submitting form...'
             if index == nil
               result_page = @@agent.submit(@@current_form)
@@ -91,7 +86,7 @@ module Scrubyt
           ##
           #Click the link specified by the text
-          def self.click_link(link_spec,index = 0,wait_secs=0)
+          def self.click_link(link_spec,index = 0)
             Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
             if link_spec.is_a? Hash
               clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
@@ -177,7 +172,7 @@ module Scrubyt
             end
           end
-          def self.fill_textfield(textfield_name, query_string, *unused)
+          def self.fill_textfield(textfield_name, query_string)
             lookup_form_for_tag('input','textfield',textfield_name,query_string)
             eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
           end

data/lib/scrubyt/core/navigation/navigation_actions.rb CHANGED

@@ -23,12 +23,8 @@ module Scrubyt
     #textfield is 'q'
     #
     #_query_string_ - the string that should be entered into the textfield
-    def fill_textfield(textfield_name, query_string, use_value = nil)
-      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
-    end
-    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
-      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
+    def fill_textfield(textfield_name, query_string)
+      FetchAction.fill_textfield(textfield_name, query_string)
     end
     ##
@@ -59,17 +55,13 @@ module Scrubyt
     ##
     #Submit the current form
     def submit(index=nil, type=nil)
-      FetchAction.submit(nil, index, type)
-    end
-    def submit_and_wait(sleep_time, index=nil, type=nil)
-      FetchAction.submit(index, sleep_time,  type)
+      FetchAction.submit(index, type)
     end
     ##
     #Click the link specified by the text
-    def click_link(link_spec,index=0)
-      FetchAction.click_link(link_spec,index, 0)
+    def click_link(link_spec,index=0, sleep_secs=0)
+      FetchAction.click_link(link_spec,index, sleep_secs)
     end
     def click_link_and_wait(link_spec, sleep_secs=0)
@@ -84,10 +76,6 @@ module Scrubyt
       FetchAction.click_image_map(index)
     end
-    def frame(attribute,value)
-      FetchAction.frame(attribute,value)
-    end
     def wait(time=1)
       FetchAction.wait(time)
     end

data/lib/scrubyt/core/scraping/filters/attribute_filter.rb CHANGED

@@ -10,5 +10,8 @@ module Scrubyt
       end
     end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
   end #End of class AttributeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/constant_filter.rb CHANGED

@@ -5,5 +5,8 @@ module Scrubyt
       return @example
     end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
   end #End of class ConstantFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb CHANGED

@@ -33,5 +33,9 @@ module Scrubyt
       root_results
     end
+    def get_detail_sexp
+      [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
+    end
   end
 end

data/lib/scrubyt/core/scraping/filters/download_filter.rb CHANGED

@@ -8,6 +8,10 @@ module Scrubyt
       download_file(source)
     end #end of method
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
 private
     def download_file(source)
       return '' if source.size < 4

data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb CHANGED

@@ -5,5 +5,8 @@ module Scrubyt
       source.inner_html
     end
+    def to_sexp
+      nil
+    end #end of method
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/regexp_filter.rb CHANGED

@@ -9,5 +9,9 @@ module Scrubyt
       end
     end
+    def to_sexp
+      [:lit, @example]
+    end
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/script_filter.rb CHANGED

@@ -7,5 +7,8 @@ module Scrubyt
       @example.call param
     end
+    def to_sexp
+      [:str, "FIXME!!! Can't dump Proc"]
+    end #end of method to_sexp
   end #End of class ConstantFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/text_filter.rb CHANGED

@@ -29,6 +29,9 @@ module Scrubyt
       return []
     end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
   end #End of class TextFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/tree_filter.rb CHANGED

@@ -134,5 +134,13 @@ module Scrubyt
       @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
     end
+    def to_sexp
+      if @example =~ /.+\[@.+\]$/
+        [:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
+      else
+        [:str, @xpath]
+      end
+    end
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/pattern.rb CHANGED

@@ -78,7 +78,6 @@ module Scrubyt
       #grab any examples that are defined
       examples = look_for_examples(args)
       #parse the options hash if provided
       parse_options_hash(args[-1]) if args[-1].is_a? Hash
@@ -309,6 +308,32 @@ module Scrubyt
       end
     end
+    def to_sexp
+      #collect arguments
+      args = []
+      args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
+      args.push(@options.to_sexp) if !@options.empty?
+      #build main call
+      sexp = [:fcall, @name, [:array, *args]]
+      if type == :detail_page
+        #add detail page extractor
+        sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
+      else
+        #add child block if the pattern has children
+        sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
+      end
+      #add modifier calls - TODO: remove when everything is exported to the options hash
+      @modifier_calls.each do |modifier_sexp|
+        sexp = [:call, sexp, *modifier_sexp]
+      end
+      #return complete sexp
+      sexp
+    end
     private
     def parse_options_hash(hash)
       #merge provided hash

data/lib/scrubyt/core/scraping/result_indexer.rb CHANGED

@@ -48,6 +48,10 @@ module Scrubyt
       ary
     end
+    #    def to_sexp
+    #      [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
+    #    end
     private
     ##
     #Do not return the whole result set, just specified indices - like

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrubyt
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.4.05
 platform: ruby
 authors:
 - Peter Szinek
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-12-10 00:00:00 +01:00
+date: 2008-11-15 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.1
+rubygems_version: 1.2.0
 signing_key:
 specification_version: 2
 summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)