jspradlin-scrubyt 0.4.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +120 -0
- data/Rakefile +101 -0
- data/lib/scrubyt.rb +45 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +142 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +117 -0
data/lib/scrubyt/utils/shared_utils.rb
ADDED
@@ -0,0 +1,58 @@
+module Scrubyt
+  ##
+  #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
+  #
+  class SharedUtils
+    #Entities to replace - need to make this more complete, or install htmlentities or similar package
+    ENTITIES = {
+      'quot' => '"',
+      'apos' => "'",
+      'amp' => '&',
+      'lt' => '<',
+      'gt' => '>',
+      'nbsp' => ' '}
+
+    def self.prepare_text_for_comparison(text)
+      unescape_entities text
+      text.strip!
+      text
+    end
+
+    #Unescape the entities in the HTML!
+    def self.unescape_entities(text)
+      ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
+      text
+    end
+
+    #Entry point for finding the elements specified by examples
+    def self.traverse_for_match(node, regexp)
+      results = []
+      traverse_for_match_inner = lambda { |node, regexp|
+        ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
+        if ft =~ regexp
+          node.instance_eval do
+            @match_data = $~
+            def match_data
+              @match_data
+            end
+          end
+          results << node
+          results.delete node.parent if node.is_a? Hpricot::Elem
+        end
+        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
+      }
+      traverse_for_match_inner.call(node,regexp)
+      results
+    end
+
+    def self.get_backtrace
+      begin
+        raise
+      rescue Exception => ex
+        backtrace = ex.backtrace
+      end
+      backtrace.slice!(0)
+      backtrace
+    end
+  end #end of class SharedUtils
+end #end of module Scrubyt
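The SharedUtils helpers above are the text-matching core the example-lookup classes build on: traverse_for_match walks an Hpricot tree, compares each element's tag-stripped text against a regexp, and returns the lowest matching elements with their MatchData attached. A minimal usage sketch, not part of the diff (the markup and pattern are hypothetical; it assumes the gem is installed so that require 'scrubyt' loads the utility classes):

    require 'rubygems'
    require 'hpricot'
    require 'scrubyt'

    doc = Hpricot("<html><body><p>Ruby 1.8</p><p>Python 2.5</p></body></html>")

    # Returns the lowest elements whose tag-stripped text matches the pattern;
    # ancestors that also matched are dropped along the way.
    matches = Scrubyt::SharedUtils.traverse_for_match(doc, /^Ruby .*$/)
    matches.each do |node|
      puts node.name           # "p"
      puts node.match_data[0]  # "Ruby 1.8" - the MatchData captured during traversal
    end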
data/lib/scrubyt/utils/simple_example_lookup.rb
ADDED
@@ -0,0 +1,40 @@
+module Scrubyt
+  #=<tt>Lookup of simple examples</tt>
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example.
+  #
+  #This class is responsible for finding elements matched by simple examples.
+  #In the futre probably more sophisticated matching algorithms will be added
+  #(e.g. match the n-th which matches the text, or element that matches the
+  #text but also contains a specific attribute etc.)
+  class SimpleExampleLookup
+    #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
+    #The text can be also a mixed content text, e.g.
+    #
+    #  <a>Bon <b>nuit</b>, monsieur!</a>
+    #
+    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
+    def self.find_node_from_text(doc, text, next_link=false, index = 0)
+      text.gsub!('»', '&#187;')
+      #Process immediate attribute extraction (like "go to google.com/@href")
+      if text =~ /.+\/@.+$/
+        text = text.scan(/^(.+?)\/@.+$/)[0][0]
+      elsif text =~ /.+\[\d+\]$/
+        res = text.scan(/(.+)\[(\d+)\]$/)
+        text = res[0][0]
+        index = res[0][1].to_i
+      elsif text =~ /.+\[.+\]$/
+        final_element_name = text.scan(/^(.+?)\[/)[0][0]
+        text = text.scan(/\[(.+?)\]/)[0][0]
+      end
+      if final_element_name
+        text = Regexp.escape(text) if text.is_a? String
+        result = SharedUtils.traverse_for_match(doc,/#{text}/)[index]
+        result = XPathUtils.traverse_up_until_name(result,final_element_name)
+      else
+        text = Regexp.escape(text) if text.is_a? String
+        result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
+      end
+    end
+  end #End of class SimpleExampleLookup
+end #End of module Scrubyt
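SimpleExampleLookup.find_node_from_text above resolves a user-supplied example string to the lowest node containing that text, with optional suffixes for an index ("text[1]") or an enclosing element name ("tr[text]"). A rough usage sketch, not part of the diff (the markup and example strings are hypothetical):

    require 'rubygems'
    require 'hpricot'
    require 'scrubyt'

    doc = Hpricot("<table><tr><td>Camera</td><td>199 USD</td></tr></table>")

    # Plain example: the lowest element whose text is exactly the example.
    cell = Scrubyt::SimpleExampleLookup.find_node_from_text(doc, 'Camera')
    puts cell.name   # "td"

    # "name[text]" form: match the text, then walk up to the named ancestor.
    row = Scrubyt::SimpleExampleLookup.find_node_from_text(doc, 'tr[Camera]')
    puts row.name    # "tr"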
data/lib/scrubyt/utils/xpathutils.rb
ADDED
@@ -0,0 +1,202 @@
+require 'rubygems'
+require 'hpricot'
+
+module Scrubyt
+  ##
+  #=<tt>Various XPath utility functions</tt>
+  class XPathUtils
+
+    #Find the LCA (Lowest Common Ancestor) of two nodes
+    def self.lowest_common_ancestor(node1, node2)
+      path1 = traverse_up(node1)
+      path2 = traverse_up(node2)
+      return node1.parent if path1 == path2
+
+      closure = nil
+      while (!path1.empty? && !path2.empty?)
+        closure = path1.pop
+        return closure.parent if (closure != path2.pop)
+      end
+      path1.size > path2.size ? path1.last.parent : path2.last.parent
+    end
+
+    ##
+    #Generate XPath for the given node
+    #
+    #*parameters*
+    #
+    #_node_ - The node we are looking up the XPath for
+    #
+    #_stopnode_ - The Xpath generation is stopped and the XPath that
+    #was generated so far is returned if this node is reached.
+    #
+    #_write_indices_ - whether the index inside the parent shuold be
+    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
+    def self.generate_XPath(node, stopnode=nil, write_indices=false)
+      path = []
+      indices = []
+      found = false
+      while !node.nil? && node.class != Hpricot::Doc do
+        if node == stopnode
+          found = true
+          break
+        end
+        path.push node.name
+        indices.push find_index(node) if write_indices
+        node = node.parent
+      end
+      #This condition ensures that if there is a stopnode, and we did not found it along the way,
+      #we return nil (since the stopnode is not contained in the path at all)
+      return nil if stopnode != nil && !found
+      result = ""
+      if write_indices
+        path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
+      else
+        path.reverse.each{ |node| result += "#{node}/" }
+      end
+      "/" + result.chop
+    end
+
+    #Generate an XPath of the node with indices, relatively to the given
+    #relative_root.
+    #
+    #For example if the elem's absolute XPath is /a/b/c,
+    #and the relative root's Xpath is a/b, the result of the function will
+    #be /c.
+    def self.generate_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, true)
+    end
+
+    #Generate a generalized XPath (i.e. without indices) of the node,
+    #relatively to the given relative_root.
+    #
+    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
+    #and the relative root's Xpath is a[1]/b[3], the result of the function will
+    #be /c.
+    def self.generate_generalized_relative_XPath( elem,relative_root )
+      return nil if (elem == relative_root)
+      generate_XPath(elem, relative_root, false)
+    end
+
+    #Find an image based on the src of the example
+    #
+    #*parameters*
+    #
+    #_doc_ - The containing document
+    #
+    #_example_ - The value of the src attribute of the img tag
+    #This is convenient, since if the users rigth-clicks an image and
+    #copies image location, this string will be copied to the clipboard
+    #and thus can be easily pasted as an examle
+    #
+    #_index_ - there might be more images with the same src on the page -
+    #most typically the user will need the 0th - but if this is not the
+    #case, there is the possibility to override this
+    def self.find_image(doc, example, index=0)
+      if example =~ /\.(jpg|png|gif|jpeg)(\[\d+\])$/
+        res = example.scan(/(.+)\[(\d+)\]$/)
+        example = res[0][0]
+        index = res[0][1].to_i
+      end
+      (doc/"//img[@src='#{example}']")[index]
+    end
+
+    ##
+    #Used to find the parent of a node with the given name - for example
+    #find the <form> node which is the parent of the <input> node
+    def self.traverse_up_until_name(node, name)
+      while node.class != Hpricot::Doc do
+        #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
+        return nil unless node
+        break if node.name == name
+        node = node.parent
+      end
+      node
+    end
+
+    ##
+    #Used when automatically looking up href attributes (for detail or next links)
+    #If the detail pattern did not extract a link, we first look up it's
+    #children - and if we don't find a link, traverse up
+    def self.find_nearest_node_with_attribute(node, attribute)
+      @node = nil
+      return node if node.is_a? Hpricot::Elem and node[attribute]
+      first_child_node_with_attribute(node, attribute)
+      first_parent_node_with_attribute(node, attribute) if !@node
+      @node
+    end
+
+    ##
+    #Generalre relative XPath from two XPaths: a parent one, (which points higher in the tree),
+    #and a child one. The result of the method is the relative XPath of the node pointed to
+    #by the second XPath to the node pointed to by the firs XPath.
+    def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
+      original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
+      pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
+      i = 0
+      pairs.each_with_index do |pair,index|
+        i = index
+        break if pair[0] != pair[1]
+      end
+      "/" + original_child_xpath_parts[i..-1].join('/')
+    end
+
+    def self.to_full_XPath(doc, xpath, generalize)
+      elem = doc/xpath
+      elem = elem.map[0] if elem.is_a? Hpricot::Elements
+      XPathUtils.generate_XPath(elem, nil, generalize)
+    end
+
+    private
+    #Find the index of the child inside the parent
+    #For example:
+    #
+    #     tr
+    #   / | \
+    #  td td td
+    #  0  1  2
+    #
+    #The last row contains the indices of the td's from the
+    #tow above.
+    #
+    #Note that in classic XPath, the indices start with 1 (rather
+    #than 0).
+    def self.find_index(node)
+      c = 0
+      node.parent.children.each do |child|
+        if child.class == Hpricot::Elem
+          c += 1 if (child.name == node.name)
+          break if (node == child)
+        end
+      end
+      c
+    end
+
+    def self.traverse_up(node, stopnode=nil)
+      path = []
+      while node.class != Hpricot::Doc do
+        break if node == stopnode
+        path.push node
+        node = node.parent
+      end
+      path
+    end
+
+    def self.first_child_node_with_attribute(node, attribute)
+      return if !node.instance_of? Hpricot::Elem || @node
+      @node = node if node.attributes[attribute]
+      node.children.each { |child| first_child_node_with_attribute(child, attribute) }
+    end
+
+    def self.first_parent_node_with_attribute(node, attribute)
+      return if !node.instance_of? Hpricot::Elem || @node
+      @node = node if node.attributes[attribute]
+      first_parent_node_with_attribute(node.parent, attribute)
+    end
+
+    def self.to_general_XPath(xpath)
+      xpath.gsub(/\[.+?\]/) {""}
+    end #End of method to_general_XPath
+  end #End of class XPathUtils
+end #End of module Scrubyt
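XPathUtils above turns Hpricot nodes back into XPath strings (absolute, indexed, or relative to another node), which is how scRUBYt! generalizes the example nodes it finds. A small sketch, not part of the diff (the markup is hypothetical):

    require 'rubygems'
    require 'hpricot'
    require 'scrubyt'

    doc  = Hpricot("<html><body><table><tr><td>a</td><td>b</td></tr></table></body></html>")
    cell = (doc/"td").last
    row  = (doc/"tr").first

    puts Scrubyt::XPathUtils.generate_XPath(cell)                # "/html/body/table/tr/td"
    puts Scrubyt::XPathUtils.generate_XPath(cell, nil, true)     # "/html[1]/body[1]/table[1]/tr[1]/td[2]"
    puts Scrubyt::XPathUtils.generate_relative_XPath(cell, row)  # "/td[2]"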
data/test/blackbox_test.rb
ADDED
@@ -0,0 +1,60 @@
+$lib_path = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+$:.unshift $lib_path
+
+require 'scrubyt'
+require 'test/unit'
+
+def perform_test(test_path, detailed = false)
+  out = $stdout
+  $stdout = StringIO.new unless detailed
+  cwd = Dir.getwd
+  Dir.chdir(File.dirname(test_path))
+
+  out.puts "Test: #{test_path}" if detailed
+  out.puts "========== Print Output ==========" if detailed
+
+  begin
+    expected_xml = File.read(File.basename(test_path)[0..-4] + ".expected.xml")
+
+    scrubyt_result_native = Scrubyt::Extractor.load(File.basename(test_path))
+
+    exported_code = scrubyt_result_native.export({:template => 'lambda'})
+    scrubyt_result_exported = Scrubyt::Extractor.define(&eval(exported_code))
+  ensure
+    if detailed
+      out.puts "========== Native Extractor =========="
+      out.puts IO.read(File.basename(test_path))
+      out.puts "========== Exported Extractor =========="
+      out.puts exported_code
+      out.puts "========== Expected =========="
+      out.puts expected_xml
+      out.puts "========== Result (native) =========="
+      out.puts scrubyt_result_native.to_xml
+      out.puts "========== Result (exported) =========="
+      out.puts scrubyt_result_exported.to_xml
+    end
+  end
+
+  assert_equal expected_xml, scrubyt_result_native.to_xml
+  assert_equal expected_xml, scrubyt_result_exported.to_xml
+ensure
+  Dir.chdir(cwd)
+  $stdout = out
+end
+
+if $0 == __FILE__ && ARGV[0]
+  include Test::Unit::Assertions
+  perform_test(ARGV[0], true)
+  exit
+end
+
+class BlackboxTest < Test::Unit::TestCase
+  tests = Dir.glob(File.join(File.dirname(__FILE__), 'blackbox_tests', '**', '*.rb'))
+  tests = tests.sort
+
+  tests.each do |test_path|
+    define_method("test_#{test_path.gsub('/', '_')}") do
+      perform_test(test_path)
+    end
+  end
+end
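A note on how this harness is used, based only on the code above: run with no arguments under Test::Unit, it defines one test per extractor script found under test/blackbox_tests/ and checks each extractor's output, both the native script and its re-exported 'lambda' form, against the sibling *.expected.xml fixture; run directly with a single test path as the argument, it executes just that extractor and prints the intermediate extractors and XML for inspection.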
metadata
ADDED
@@ -0,0 +1,117 @@
+--- !ruby/object:Gem::Specification
+name: jspradlin-scrubyt
+version: !ruby/object:Gem::Version
+  version: 0.4.16
+platform: ruby
+authors:
+- Peter Szinek
+- Glenn Gillen
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-01-31 00:00:00 -08:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.5"
+    version:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.6.3
+    version:
+description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
+email:
+- peter@rubyrailways.com
+- glenn.gillen@gmail.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- COPYING
+- README
+- CHANGELOG
+- Rakefile
+- lib/scrubyt/core/navigation/agents/firewatir.rb
+- lib/scrubyt/core/navigation/agents/mechanize.rb
+- lib/scrubyt/core/navigation/fetch_action.rb
+- lib/scrubyt/core/navigation/navigation_actions.rb
+- lib/scrubyt/core/scraping/compound_example.rb
+- lib/scrubyt/core/scraping/constraint.rb
+- lib/scrubyt/core/scraping/constraint_adder.rb
+- lib/scrubyt/core/scraping/filters/attribute_filter.rb
+- lib/scrubyt/core/scraping/filters/base_filter.rb
+- lib/scrubyt/core/scraping/filters/constant_filter.rb
+- lib/scrubyt/core/scraping/filters/detail_page_filter.rb
+- lib/scrubyt/core/scraping/filters/download_filter.rb
+- lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
+- lib/scrubyt/core/scraping/filters/regexp_filter.rb
+- lib/scrubyt/core/scraping/filters/script_filter.rb
+- lib/scrubyt/core/scraping/filters/text_filter.rb
+- lib/scrubyt/core/scraping/filters/tree_filter.rb
+- lib/scrubyt/core/scraping/pattern.rb
+- lib/scrubyt/core/scraping/pre_filter_document.rb
+- lib/scrubyt/core/scraping/result_indexer.rb
+- lib/scrubyt/core/shared/extractor.rb
+- lib/scrubyt/logging.rb
+- lib/scrubyt/output/post_processor.rb
+- lib/scrubyt/output/result.rb
+- lib/scrubyt/output/result_dumper.rb
+- lib/scrubyt/output/result_node.rb
+- lib/scrubyt/output/scrubyt_result.rb
+- lib/scrubyt/utils/compound_example_lookup.rb
+- lib/scrubyt/utils/ruby_extensions.rb
+- lib/scrubyt/utils/shared_utils.rb
+- lib/scrubyt/utils/simple_example_lookup.rb
+- lib/scrubyt/utils/xpathutils.rb
+- lib/scrubyt.rb
+has_rdoc: true
+homepage: http://scrubyt.org/
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project: scrubyt
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 2
+summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)
+test_files:
+- test/blackbox_test.rb
+- test/blackbox_tests/basic/multi_root.rb
+- test/blackbox_tests/basic/simple.rb
+- test/blackbox_tests/detail_page/one_detail_page.rb
+- test/blackbox_tests/detail_page/two_detail_pages.rb
+- test/blackbox_tests/next_page/next_page_link.rb
+- test/blackbox_tests/next_page/page_list_links.rb
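The gem description in the metadata above summarizes the DSL; for orientation, an extractor in scRUBYt!'s example-driven style looks roughly like the sketch below. This is an illustration, not part of the diff: the URL, pattern names, and example strings are hypothetical, and only Extractor.define and to_xml are taken from the test harness in this package.

    require 'rubygems'
    require 'scrubyt'

    # Navigate to a page, then describe one sample record; scRUBYt! generalizes
    # the example strings into XPaths and extracts every record shaped like it.
    data = Scrubyt::Extractor.define do
      fetch 'http://www.example.com/products'   # hypothetical URL

      product do                                # pattern names are arbitrary
        name  'Example Camera'                  # example text copied from the page
        price '199 USD'
      end
    end

    puts data.to_xml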