RubyGems - scrubyt - Versions diffs - 0.3.4 → 0.4.1 - Mend

scrubyt 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

data/CHANGELOG +31 -0
data/README +1 -1
data/Rakefile +4 -9
data/lib/scrubyt.rb +37 -56
data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
data/lib/scrubyt/core/scraping/pattern.rb +6 -27
data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
data/lib/scrubyt/core/shared/extractor.rb +15 -1
data/lib/scrubyt/output/result_node.rb +42 -6
data/lib/scrubyt/output/scrubyt_result.rb +35 -30
data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
data/lib/scrubyt/utils/xpathutils.rb +2 -1
metadata +84 -119
data/lib/scrubyt/output/export.rb +0 -157

data/lib/scrubyt/core/scraping/filters/tree_filter.rb CHANGED

@@ -134,13 +134,5 @@ module Scrubyt
       @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
     end
-    def to_sexp
-      if @example =~ /.+\[@.+\]$/
-        [:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
-      else
-        [:str, @xpath]
-      end
-    end
   end #End of class TreeFilter
 end #End of module Scrubyt

data/lib/scrubyt/core/scraping/pattern.rb CHANGED

@@ -35,6 +35,10 @@ module Scrubyt
     VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
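+    # :determine - default value; the type of the example still needs to be determined
+    # :xpath     - presumably marks the example as an XPath (this comment originally read ":string - represent node with example type EXAMPLE_TYPE_STRING", which does not match the list below - TODO confirm)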
+    VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]
     #The pattern can be either a model pattern (in this case it is
     #written to the output) or a temp pattern (in this case it is skipped)
     #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
@@ -59,7 +63,7 @@ module Scrubyt
     option_reader(:type => :tree, :output_type => :model, :generalize => false,
                   :write_text => lambda { @children.size == 0 }, :limit => nil,
-                  :default => nil, :resolve => :full, :except => nil, :example_type => nil)
+                  :default => nil, :resolve => :full, :except => nil, :example_type => :determine)
     def initialize(name, args=[], extractor=nil, parent=nil, &block)
       #init attributes
@@ -305,32 +309,6 @@ module Scrubyt
       end
     end
-    def to_sexp
-      #collect arguments
-      args = []
-      args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
-      args.push(@options.to_sexp) if !@options.empty?
-      #build main call
-      sexp = [:fcall, @name, [:array, *args]]
-      if type == :detail_page
-        #add detail page extractor
-        sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
-      else
-        #add child block if the pattern has children
-        sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
-      end
-      #add modifier calls - TODO: remove when everything is exported to the options hash
-      @modifier_calls.each do |modifier_sexp|
-        sexp = [:call, sexp, *modifier_sexp]
-      end
-      #return complete sexp
-      sexp
-    end
     private
     def parse_options_hash(hash)
       #merge provided hash
@@ -339,6 +317,7 @@ module Scrubyt
       hash.each { |key, value| check_option(key.to_sym) }
       raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
       raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
+      raise "Invalid example type: #{example_type.to_s}" if VALID_PATTERN_EXAMPLE_TYPES.index(example_type.to_sym).nil?
     end
     def check_option(option)

data/lib/scrubyt/core/scraping/result_indexer.rb CHANGED

@@ -48,10 +48,6 @@ module Scrubyt
       ary
     end
-    #    def to_sexp
-    #      [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
-    #    end
     private
     ##
     #Do not return the whole result set, just specified indices - like

data/lib/scrubyt/core/shared/extractor.rb CHANGED

@@ -14,6 +14,21 @@ module Scrubyt
     #The definition of the extractor is passed through this method
     def self.define(mode=nil, &extractor_definition)
       # Entry point for defining and running an extractor.
       # Chooses the navigation module mixed into FetchAction: Navigation::Firewatir
       # when mode is a Hash whose :agent is :firefox, Navigation::Mechanize in every
       # other case (mode is nil, not a Hash, or a Hash with any other :agent value).
       # Then builds an extractor from mode and the definition block and returns its result.
       # NOTE(review): class_eval/include changes the FetchAction class itself, so the
       # choice is not scoped to this call; a module included by an earlier call stays
       # included - confirm that switching agents between calls behaves as intended.
+      if mode.is_a?(Hash)
+        if mode[:agent]==:firefox
+          FetchAction.class_eval do
+            include Navigation::Firewatir
+          end
+        else
+          FetchAction.class_eval do
+            include Navigation::Mechanize
+          end
+        end
+      else
+        FetchAction.class_eval do
+          include Navigation::Mechanize
+        end
+      end
       extractor = self.new(mode, extractor_definition)
       extractor.result
     end
@@ -117,7 +132,6 @@ module Scrubyt
       catch :quit_next_page_loop do
         loop do
           url = get_current_doc_url #TODO need absolute address here 2/4
-          puts url
           @processed_pages << url
           @root_patterns.each do |root_pattern|
             root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))

data/lib/scrubyt/output/result_node.rb CHANGED

@@ -23,7 +23,11 @@ module Scrubyt
       text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
       text = SharedUtils.unescape_entities(text)
       text.strip!
-      text
+      if (@options[:default] && ((text == '') || (text == @options[:default])))
+        @options[:default]
+      else
+        text
+      end
     end
     def to_libxml
@@ -41,26 +45,54 @@ module Scrubyt
     def to_hash(delimiter=',')
       # Returns an array with one hash per element yielded by self.each. Each hash maps
       # name.to_sym => to_s for that element and for everything reached through the nested
       # each calls; when a name occurs more than once the texts are joined with delimiter.
       # The '-' line is the 0.3.4 condition (write_text and non-empty text only); the '+'
       # line is the 0.4.1 condition, which also records elements that have options[:default].
       result = []
       flat_hash_inner = lambda {|e, hash|
-        hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s  if e.write_text && !e.to_s.empty?
+        hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s  if ((e.write_text && !e.to_s.empty?) || e.options[:default])
         # Recurse into the children with the same accumulator hash.
         e.each {|c| flat_hash_inner.call(c, hash)  }
         hash
       }
       # A fresh hash is started for every top-level element.
       self.each {|e| result << flat_hash_inner.call(e, {}) }
       result
     end
+    def to_flat_hash()
       # Turns the per-element hashes from to_hash into one hash per record:
       #   1. to_hash joins repeated values with the '@@@@@@' separator,
       #   2. the first hash becomes the accumulator and the values of the remaining
       #      hashes are appended to it key by key with the same separator,
       #   3. every accumulated value is split on the separator and the value lists are
       #      transposed, so each row holds one value per key,
       #   4. each row is rebuilt into a hash using the accumulator's key order.
       # NOTE(review): delete_at 0 returns nil when to_hash yields an empty array, so the
       # following merged_hash.keys call would raise; only keys of the first hash are
       # considered, and a missing hash[key] interpolates as an empty string.
       # NOTE(review): Array#transpose raises IndexError when the split lists differ in
       # length (String#split also drops trailing empty fields) - confirm inputs are uniform.
+      hash_result = self.to_hash('@@@@@@')
+      merged_hash = hash_result.delete_at 0
+      hash_result.each do |hash|
+        merged_hash.keys.each do |key|
+          merged_hash[key] += "@@@@@@#{hash[key]}"
+        end
+      end
+      result_sets = merged_hash.values.map!{|x| x.split('@@@@@@')}.transpose
+      final_result = []
+      result_sets.each do |rs|
+        temp_result = {}
+        merged_hash.keys.each do |k|
           # Position of k in the key list selects the matching column of the row.
+          temp_result[k] = rs[merged_hash.keys.index(k)]
+        end
+        final_result << temp_result
+      end
+      final_result
+    end
     def to_flat_xml(delimiter=nil)
       lines = []
       hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
+      merged_hash = hash_result.delete_at 0
+      hash_result.each do |hash|
+        merged_hash.keys.each do |key|
+          merged_hash[key] += "#{delimiter}#{hash[key]}"
+        end
+      end
       if delimiter
-        result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
+        result_sets = merged_hash.values.map!{|x| x.split(delimiter)}.transpose
         final_result = []
         result_sets.each do |rs|
           temp_result = {}
-          hash_result[0].keys.each do |k|
-            temp_result[k] = rs[hash_result[0].keys.index(k)]
+          merged_hash.keys.each do |k|
+            temp_result[k] = rs[merged_hash.keys.index(k)]
           end
           final_result << temp_result
         end
@@ -89,7 +121,11 @@ module Scrubyt
         elsif write_text && !to_s.empty?
           lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
         else
-          lines << "<#{name}/>"
+          if @options[:default]
+            lines << "<#{name}>#{@options[:default]}</#{name}>"
+          else
+            lines << "<#{name}/>"
+          end
         end
       else
         lines << "<#{name}>"

data/lib/scrubyt/output/scrubyt_result.rb CHANGED

@@ -2,36 +2,41 @@ module Scrubyt
   class ScrubytResult < ResultNode
     attr_accessor :root_patterns, :source_file, :source_proc
-    def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
-      if arg1.is_a? String
-        if File.exists? arg1
-          export_old1(arg1, output_file_name, extractor_result_file_name)
-        else
-          export_old2(arg1, output_file_name, extractor_result_file_name)
-        end
-      else
-        export_new(arg1)
-      end
-    end
-    def show_stats
-      #Implement me...
-    end
-    def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
-      contents = open(input_file).read
-      wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
-      export_old2(wrapper_name, output_file_name, extractor_result_file_name)
-    end
-    def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
-      export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
-    end
-    def export_new(data)
-      data[:source_file] = @source_file
-      data[:source_proc] = @source_proc
-      Scrubyt::Export.export(@root_patterns, data)
+    def export
+      #Temporary solution; the real one will be back later - or not
       # Placeholder for the removed exporter: returns a String made of the explanation
       # text below followed by an ASCII tree of the pattern hierarchy. It takes no
       # arguments, unlike the export(arg1, ...) variants removed in this diff.
       # NOTE(review): the message reads "removal or ParseTree" - probably meant "removal of".
+     result = <<-EXPLANATION
+     === Extractor tree ===
+     export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
+     For now, in case you are using examples, you can replace them by hand based on the output below.
+     So if your pattern in the learning extractor looks like
+     book "Ruby Cookbook"
+     and you see the following below:
+     [book] /table[1]/tr/td[2]
+     then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!
+     EXPLANATION
       # tree_builder appends one entry per node: for level 0 just "[name]", for deeper
       # levels a "|" connector line and a "+-- " prefix indented by (level-1) * 3 spaces.
       # Nodes whose type is :tree also get the xpath of their first filter.
       # NOTE(review): current_level is assigned but not read again in this method.
+     tree_builder = lambda do |node, level|
+       result += current_level = ("   " * (level == 0 ? 0 : level-1) +
+                                  "|\n" * (level == 0 ? 0 : 1) +
+                                  "   " * (level == 0 ? 0 : level-1) +
+                                 "+-- " * (level == 0 ? 0 : 1) +
+                                 "[#{node.name}]")
+       result += " #{node.filters[0].xpath}" if node.type == :tree
+       result += "\n"
+       node.children.each {|c| tree_builder[c, level+1]}
+     end
       # NOTE(review): only root_patterns[0] is rendered; any further root patterns are ignored.
+     tree_builder[root_patterns[0],0]
       # The value of this last expression (the accumulated text plus a newline) is returned.
+     result += "\n"
+    end
   end
 end

data/lib/scrubyt/utils/ruby_extensions.rb CHANGED

@@ -65,57 +65,4 @@ class String
   def write(stringio, add_indent=0)
     stringio.write((self.split("\n").collect { |line| ('  ' * add_indent) + line }).join("\n"))
   end
-end
-class Array
-  def to_sexp
-    [:array, *to_sexp_array]
-  end
-  def to_sexp_array
-    collect { |element| element.to_sexp }
-  end
-end
-class Hash
-  def to_sexp
-    [:hash, *to_sexp_array]
-  end
-  def to_sexp_array
-    sexp = []
-    each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
-    sexp
-  end
-end
-class Symbol
-  def to_sexp
-    [:lit, self]
-  end
-end
-class String
-  def to_sexp
-    [:str, self]
-  end
-end
-class TrueClass
-  def to_sexp
-    [:true]
-  end
-end
-class FalseClass
-  def to_sexp
-    [:false]
-  end
-end
-class Proc
-  alias_method :parse_tree_to_sexp, :to_sexp
-  def to_sexp
-    [:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
-  end
 end

data/lib/scrubyt/utils/xpathutils.rb CHANGED

@@ -107,7 +107,8 @@ module Scrubyt
     #find the <form> node which is the parent of the <input> node
     def self.traverse_up_until_name(node, name)
       while node.class != Hpricot::Doc do
-        raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
+        #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
+        return nil unless node
         break if node.name == name
         node = node.parent
       end

metadata CHANGED

@@ -1,87 +1,22 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.0
-specification_version: 1
 name: scrubyt
 version: !ruby/object:Gem::Version
-  version: 0.3.4
-date: 2007-09-26 00:00:00 +02:00
-summary: A powerful Web-scraping framework built on Mechanize and Hpricot
-require_paths:
-- lib
-email: peter@rubyrailways.com
-homepage: http://www.scrubyt.org
-rubyforge_project:
-description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
-autorequire:
-default_executable:
-bindir: bin
-has_rdoc: false
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-  - - ">"
-    - !ruby/object:Gem::Version
-      version: 0.0.0
-  version:
+  version: 0.4.1
 platform: ruby
-signing_key:
-cert_chain:
-post_install_message:
 authors:
 - Peter Szinek
-files:
-- COPYING
-- README
-- CHANGELOG
-- Rakefile
-- lib/scrubyt.rb
-- lib/scrubyt/logging.rb
-- lib/scrubyt/output/result_dumper.rb
-- lib/scrubyt/output/result.rb
-- lib/scrubyt/output/export.rb
-- lib/scrubyt/output/post_processor.rb
-- lib/scrubyt/output/result_node.rb
-- lib/scrubyt/output/scrubyt_result.rb
-- lib/scrubyt/utils/compound_example_lookup.rb
-- lib/scrubyt/utils/simple_example_lookup.rb
-- lib/scrubyt/utils/ruby_extensions.rb
-- lib/scrubyt/utils/xpathutils.rb
-- lib/scrubyt/utils/shared_utils.rb
-- lib/scrubyt/core/navigation/navigation_actions.rb
-- lib/scrubyt/core/navigation/fetch_action.rb
-- lib/scrubyt/core/scraping/constraint.rb
-- lib/scrubyt/core/scraping/pattern.rb
-- lib/scrubyt/core/scraping/pre_filter_document.rb
-- lib/scrubyt/core/scraping/compound_example.rb
-- lib/scrubyt/core/scraping/constraint_adder.rb
-- lib/scrubyt/core/scraping/result_indexer.rb
-- lib/scrubyt/core/scraping/filters/attribute_filter.rb
-- lib/scrubyt/core/scraping/filters/base_filter.rb
-- lib/scrubyt/core/scraping/filters/regexp_filter.rb
-- lib/scrubyt/core/scraping/filters/tree_filter.rb
-- lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
-- lib/scrubyt/core/scraping/filters/detail_page_filter.rb
-- lib/scrubyt/core/scraping/filters/download_filter.rb
-- lib/scrubyt/core/scraping/filters/text_filter.rb
-- lib/scrubyt/core/scraping/filters/constant_filter.rb
-- lib/scrubyt/core/scraping/filters/script_filter.rb
-- lib/scrubyt/core/shared/extractor.rb
-test_files: []
-rdoc_options: []
-extra_rdoc_files: []
-executables: []
-extensions: []
-requirements: []
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2008-12-10 00:00:00 +01:00
+default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: hpricot
+  type: :runtime
   version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
+  version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
@@ -89,55 +24,85 @@ dependencies:
     version:
 - !ruby/object:Gem::Dependency
   name: mechanize
+  type: :runtime
   version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
+  version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.6.3
     version:
-- !ruby/object:Gem::Dependency
-  name: ParseTreeReloaded
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">"
-      - !ruby/object:Gem::Version
-        version: 0.0.0
-    version:
-- !ruby/object:Gem::Dependency
-  name: RubyInlineAcceleration
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">"
-      - !ruby/object:Gem::Version
-        version: 0.0.0
-    version:
-- !ruby/object:Gem::Dependency
-  name: RubyInline
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - "="
-      - !ruby/object:Gem::Version
-        version: 3.6.3
-    version:
-- !ruby/object:Gem::Dependency
-  name: ParseTree
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - "="
-      - !ruby/object:Gem::Version
-        version: 1.7.1
-    version:
-- !ruby/object:Gem::Dependency
-  name: ruby2ruby
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - "="
-      - !ruby/object:Gem::Version
-        version: 1.1.6
-    version:
+description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
+email: peter@rubyrailways.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- COPYING
+- README
+- CHANGELOG
+- Rakefile
+- lib/scrubyt/core/navigation/agents/firewatir.rb
+- lib/scrubyt/core/navigation/agents/mechanize.rb
+- lib/scrubyt/core/navigation/fetch_action.rb
+- lib/scrubyt/core/navigation/navigation_actions.rb
+- lib/scrubyt/core/scraping/compound_example.rb
+- lib/scrubyt/core/scraping/constraint.rb
+- lib/scrubyt/core/scraping/constraint_adder.rb
+- lib/scrubyt/core/scraping/filters/attribute_filter.rb
+- lib/scrubyt/core/scraping/filters/base_filter.rb
+- lib/scrubyt/core/scraping/filters/constant_filter.rb
+- lib/scrubyt/core/scraping/filters/detail_page_filter.rb
+- lib/scrubyt/core/scraping/filters/download_filter.rb
+- lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
+- lib/scrubyt/core/scraping/filters/regexp_filter.rb
+- lib/scrubyt/core/scraping/filters/script_filter.rb
+- lib/scrubyt/core/scraping/filters/text_filter.rb
+- lib/scrubyt/core/scraping/filters/tree_filter.rb
+- lib/scrubyt/core/scraping/pattern.rb
+- lib/scrubyt/core/scraping/pre_filter_document.rb
+- lib/scrubyt/core/scraping/result_indexer.rb
+- lib/scrubyt/core/shared/extractor.rb
+- lib/scrubyt/logging.rb
+- lib/scrubyt/output/post_processor.rb
+- lib/scrubyt/output/result.rb
+- lib/scrubyt/output/result_dumper.rb
+- lib/scrubyt/output/result_node.rb
+- lib/scrubyt/output/scrubyt_result.rb
+- lib/scrubyt/utils/compound_example_lookup.rb
+- lib/scrubyt/utils/ruby_extensions.rb
+- lib/scrubyt/utils/shared_utils.rb
+- lib/scrubyt/utils/simple_example_lookup.rb
+- lib/scrubyt/utils/xpathutils.rb
+- lib/scrubyt.rb
+has_rdoc: "true"
+homepage: http://www.scrubyt.org
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)
+test_files: []