scrapes 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README +123 -0
- data/demo/demo.rb +33 -0
- data/demo/pages/about.rb +32 -0
- data/demo/pages/main.rb +32 -0
- data/lib/scrapes.rb +41 -0
- data/lib/scrapes/cache.rb +110 -0
- data/lib/scrapes/cookbook.rb +53 -0
- data/lib/scrapes/cookies.rb +45 -0
- data/lib/scrapes/crawler.rb +97 -0
- data/lib/scrapes/hpricot.rb +110 -0
- data/lib/scrapes/initializer.rb +86 -0
- data/lib/scrapes/page.rb +319 -0
- data/lib/scrapes/rule_parser.rb +327 -0
- data/lib/scrapes/session.rb +155 -0
- data/lib/scrapes/to_proxy.rb +50 -0
- data/test/cache.rb +75 -0
- data/test/cookies.rb +34 -0
- data/test/crawler.rb +69 -0
- data/test/hpricot.rb +55 -0
- data/test/initializer.rb +54 -0
- data/test/lib/server.rb +63 -0
- data/test/page.rb +77 -0
- data/test/pages/foils.rb +61 -0
- data/test/pages/foils2.rb +38 -0
- data/test/pages/redhanded_entries.rb +36 -0
- data/test/pages/redhanded_main.rb +58 -0
- data/test/pages/rule_parser.rb +81 -0
- data/test/pages/simple.rb +21 -0
- data/test/public/foil72.html +10 -0
- data/test/public/foil73.html +9 -0
- data/test/public/foil74.html +11 -0
- data/test/public/foo.txt +1 -0
- data/test/public/index.html +20 -0
- data/test/public/redhanded.html +1208 -0
- data/test/public/rule_parser.html +21 -0
- data/test/public/simple.html +8 -0
- data/test/rule_parser.rb +151 -0
- data/test/session.rb +45 -0
- data/test/textcontent.rb +71 -0
- metadata +123 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
module Scrapes
  ################################################################################
  # A thin Hash subclass that tracks HTTP cookies by name.
  class Cookies < Hash
    ################################################################################
    # Render every stored cookie as a single Cookie request-header value,
    # e.g. "sid=abc;user=bob".
    def to_header
      pairs = []
      each {|name, value| pairs << "#{name}=#{value}"}
      pairs.join(';')
    end

    ################################################################################
    # Record the first name=value pair found in a Set-Cookie header,
    # discarding any attributes after the first semicolon (path, expires, ...).
    def from_header (header)
      first_pair = header.sub(/;.*$/, '')
      name, value = first_pair.split(/\s*=\s*/, 2)
      self[name] = value
    end

  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,97 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'net/http'
require 'pathname'
require 'scrapes/cache'
################################################################################
module Scrapes
  ################################################################################
  # Try to suck down a URI.  Performs the actual HTTP traffic for a session:
  # resolves relative URIs, consults the cache for GETs, sends session
  # cookies, follows redirects and records Set-Cookie responses.
  class Crawler
    ################################################################################
    # The cache object that this crawler is using
    attr_accessor :cache

    ################################################################################
    # The optional log object that this crawler is using
    attr_accessor :log

    ################################################################################
    # Create a new crawler for the given session
    def initialize (session)
      @session = session
      @log = nil
      @verbose = 0
      # politeness delay (seconds) slept before every network fetch
      @delay = 0.5
      @cache = Cache.new
    end

    ################################################################################
    # Fetch a URI, using HTTP GET unless you supply <tt>post</tt>.
    def fetch (uri, post={}, headers={})
      @session.refresh
      # resolve a possibly-relative URI against the session's base URI
      uri = URI.parse(@session.absolute_uri(uri))

      # only GET requests (empty post hash) are eligible for the cache
      post.empty? and cached = @cache.check(uri)
      # log prefix: 'C ' = cache hit, 'N ' = network fetch
      @log.info((cached ? 'C ' : 'N ') + uri.to_s) if @log

      # NOTE(review): on a cache hit this appears to return the cached body
      # (the cache is updated with res.body below), not a Net::HTTP response
      # object — callers must handle both shapes; hence the original FIXME.
      return cached if cached # FIXME
      sleep(@delay) if @delay != 0

      # Net::HTTP request classes want path + query, not a full URI
      path = uri.path.dup
      path << "/" if path.empty?
      path << "?" + uri.query if uri.query

      req = post.empty? ? Net::HTTP::Get.new(path) : Net::HTTP::Post.new(path)
      req.set_form_data(post) unless post.empty?

      # send the session's accumulated cookies; caller headers may override
      req['Cookie'] = @session.cookies.to_header
      headers.each {|k,v| req[k] = v}

      # NOTE(review): plain HTTP only — uri.scheme is never consulted and
      # use_ssl is never set, so https URIs will not work; confirm callers
      # only pass http URLs.
      res = Net::HTTP.new(uri.host, uri.port).start {|http| http.request(req)}

      # debugging aid: dump response class and headers to stderr
      if @verbose >= 2
        STDERR.puts "-----------------------------------------------"
        STDERR.puts res.class
        res.each_header {|k,v| STDERR.puts "#{k}: #{v}"}
      end

      # FIXME, what to do about more than one cookie
      @session.cookies.from_header(res['set-cookie']) if res.key?('set-cookie')

      # follow redirects by recursing with a GET to the Location header.
      # NOTE(review): there is no redirect-depth limit — a redirect loop
      # would recurse until stack exhaustion.
      case res
      when Net::HTTPRedirection
        @session.base_uris[-1] = @session.absolute_uri(res['location'])
        res = fetch(res['location'], {}, headers)
      end

      # store the (possibly post-redirect) body in the cache for GETs only
      post.empty? and @cache.update(uri, res.body)
      res
    end

  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,110 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'cgi'
|
26
|
+
require 'hpricot'
|
27
|
+
################################################################################
|
28
|
+
module Scrapes
|
29
|
+
################################################################################
|
30
|
+
module Hpricot # :nodoc:
|
31
|
+
################################################################################
|
32
|
+
module Extractors
|
33
|
+
################################################################################
|
34
|
+
# Returns the text of any child text nodes recursively concatenated.
|
35
|
+
def text(node)
|
36
|
+
text_process(node,String) do |e| text(e) end
|
37
|
+
end
|
38
|
+
|
39
|
+
################################################################################
|
40
|
+
# Returns the text of any child text nodes recursively as nested Array.
|
41
|
+
def texts(node)
|
42
|
+
text_process(node,Array) do |e| texts(e) end
|
43
|
+
end
|
44
|
+
|
45
|
+
################################################################################
|
46
|
+
# Returns the text of any child text nodes concatenated.
|
47
|
+
def content(node)
|
48
|
+
text_process(node,String) do |e| e.content end
|
49
|
+
end
|
50
|
+
|
51
|
+
################################################################################
|
52
|
+
# Returns the text of any child text nodes as an Array.
|
53
|
+
def contents(node)
|
54
|
+
text_process(node,Array) do |e| e.content end
|
55
|
+
end
|
56
|
+
|
57
|
+
################################################################################
|
58
|
+
# The result of text() with whitespace reduceded to single spaces and striped.
|
59
|
+
def word(node)
|
60
|
+
text_process(node,String) do |e| word(e).gsub(/\s+/,' ').strip end
|
61
|
+
end
|
62
|
+
|
63
|
+
################################################################################
|
64
|
+
# The result of texts() striped, flattened, whitespace reduced to single spaces, and
|
65
|
+
# with all blank?s rejected.
|
66
|
+
def words(node)
|
67
|
+
texts(node).flatten.compact.map{|e|e.gsub(/\s+/,' ').strip}.reject{|e| e.blank?}
|
68
|
+
end
|
69
|
+
|
70
|
+
################################################################################
|
71
|
+
# Just reuturn the yielded node.
|
72
|
+
def xml(node)
|
73
|
+
node
|
74
|
+
end
|
75
|
+
|
76
|
+
protected
|
77
|
+
################################################################################
|
78
|
+
def unescape
|
79
|
+
case result = yield
|
80
|
+
when String then CGI::unescapeHTML(result).gsub(' ', ' ')
|
81
|
+
when Array then result.map{|e| Extractors::unescape{e}}
|
82
|
+
when NilClass then nil
|
83
|
+
else raise "should be Array or String, was: #{result.class}"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
################################################################################
|
87
|
+
def text_process(node, klass, &block)
|
88
|
+
Extractors::unescape do
|
89
|
+
case node
|
90
|
+
when Array, ::Hpricot::Elements
|
91
|
+
node.map do |elem|
|
92
|
+
text_process(elem,klass,&block)
|
93
|
+
end
|
94
|
+
when ::Hpricot::Elem, ::Hpricot::Doc
|
95
|
+
node.children.inject(klass.new) do |value,child|
|
96
|
+
(value << block.call(child)) rescue nil
|
97
|
+
value
|
98
|
+
end
|
99
|
+
when ::Hpricot::Text then node.content
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
module_function :word, :words, :text, :texts, :content, :contents, :text_process
|
105
|
+
end
|
106
|
+
################################################################################
|
107
|
+
end
|
108
|
+
################################################################################
|
109
|
+
end
|
110
|
+
################################################################################
|
@@ -0,0 +1,86 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
module Scrapes
  ################################################################################
  # Bootstraps the Scrapes library by locating and loading page classes.
  class Initializer
    ################################################################################
    # The directory name where the pages classes are kept
    attr_accessor :pages_dir

    ################################################################################
    # The parent directory where the pages_dir can be found
    attr_accessor :pages_parent

    ################################################################################
    # Build an Initializer, hand it to the optional configuration block,
    # and return it.
    def self.run (&block)
      instance = new
      block.call(instance) if block
      instance
    end

    ################################################################################
    # Establish all the defaults
    def initialize
      @pages_dir    = 'pages'
      @pages_parent = File.dirname($0)
    end

    ################################################################################
    # Run every initialization step (currently just page loading).
    def process
      load_pages
    end

    ################################################################################
    private

    ################################################################################
    # Load every Ruby file found in the configured pages directory,
    # in sorted order.
    def load_pages
      sources = Dir.glob("#{@pages_parent}/#{@pages_dir}/*.rb").sort
      reloader(sources)
    end

    ################################################################################
    # Load each file; files raising NameError (usually a not-yet-loaded
    # dependency on a sibling file) are collected and retried until the
    # retry budget is exhausted, at which point the error propagates.
    def reloader (files, limit=4)
      deferred = files.reject do |file|
        begin
          load File.expand_path(file)
          true
        rescue NameError
          raise if limit <= 0
          false
        end
      end

      reloader(deferred, limit - 1) unless deferred.empty?
    end

  end
end
################################################################################
data/lib/scrapes/page.rb
ADDED
@@ -0,0 +1,319 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'scrapes/rule_parser'
require 'hpricot'
require 'rextra'
################################################################################
module Scrapes
  ################################################################################
  # The page class is used as a base class for scraping data out of one web
  # page. To use it, you inherit from it and setup some rules. You can also
  # use validators to ensure that the page was scraped correctly.
  #
  # == Setup
  #
  #  class MyPageScraper < Scrapes::Page
  #    rule :rule_name, blah
  #  end
  #
  # Scrapes::RuleParser explains the use of rules.
  #
  # == Auto Loading
  #
  # Scrapes will automatically 'require' ruby files placed in a special 'pages' directory.
  # The idea is to place one Scrapes::Page derived class per file in the pages directory,
  # and have it required for you.
  #
  # == Validations
  #
  # There are a few class methods that you can use to validate the contents you scraped
  # from a given web page.
  class Page
    include Scrapes::Hpricot::Extractors

    # External command used to preprocess HTML when with_xslt is enabled.
    XSLTPROC = 'xsltproc' # :nodoc

    ################################################################################
    # RuleParser is used to extract data from web pages using CSS selectors
    # and raw element access by using procs.
    include RuleParser

    ################################################################################
    # Access the URI where this page's data came from
    attr_accessor :uri

    ################################################################################
    # Access the session object that was used to fetch this page's data
    attr_accessor :session

    ################################################################################
    # Access the Hpricot object that the selectors are passed
    attr_accessor :hpricot

    ################################################################################
    # If the page that you are parsing is paginated (one page in many of similar data)
    # you can use this class method to automatically fetch all pages. In order for this
    # to work, you need to provide a few special methods:
    #
    # === Next Page
    #
    # If you know the URL to the next page, then provide a instance method called
    # <tt>next_page</tt>. It should return the URL for the next page, or nil when
    # the current page is the last page.
    #
    #  class NextPageExample < Scrapes::Page
    #    rule(:next_page, 'a[href~=next]', '@href', 1)
    #  end
    #
    # === Link for Page
    #
    # Alternatively, you can provide a instance method <tt>link_for_page</tt> and
    # another one called <tt>pages</tt>. The <tt>pages</tt> method should return the
    # number of pages in this paginated set. The <tt>link_for_page</tt> method should
    # take a page number, and return a URL to fetch that page.
    #
    #  class LinkForPageExample < Scrapes::Page
    #    rule_1(:page) {|e| m = e.text.match(/Page\s+\d+\s+of\s+(\d+)/) and m[1].to_i}
    #
    #    def link_for_page (page)
    #      uri.sub(/page=\d+/, "page=#{page}")
    #    end
    #  end
    #
    # === Append to Page
    #
    # Finally, you must provide a <tt>append_page</tt> method. It takes an instance
    # of your Scrapes::Page derived class as an argument. Its job is to add the data
    # found on the current page to its instance variables. This is because when you use
    # paginated, it only returns one instance of your class.
    #
    # NOTE(review): meta_eval is not core Ruby — presumably supplied by the
    # rextra dependency; it appears to evaluate the block against the class's
    # singleton, storing per-subclass state in class-instance variables.
    def self.paginated
      meta_eval { @paginated = true }
    end

    ################################################################################
    # Make Page.extract return an array by calling the given method. This can be
    # very useful for when your class does nothing more than collect a set of links
    # for some other page to process. It causes Session#page to call the given block
    # once for each object returned from method_to_call.
    def self.acts_as_array (method_to_call)
      meta_eval { @as_array = method_to_call }
    end

    ################################################################################
    # Preprocess the HTML by sending it through an XSLT stylesheet. The stylesheet
    # should return a document that can be then processed using your rules. Using
    # this feature requires that you have the xsltproc utility in your PATH.
    # You can get xsltproc from libxslt: http://xmlsoft.org/XSLT/
    def self.with_xslt (filename)
      # fail fast at class-definition time if xsltproc is missing
      raise "#{XSLTPROC} could not be found" unless `#{XSLTPROC} --version 2>&1`.match(/libxslt/)
      meta_eval { @with_xslt = filename }
    end

    ################################################################################
    # Ensure that the given attributes have been set by matching rules
    def self.validates_presence_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.nil?})
    end

    ################################################################################
    # Ensure that the given attributes are not #blank?
    def self.validates_not_blank (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.blank?})
    end

    ################################################################################
    # Ensure that the given attributes have the correct format
    # (option :with is the regular expression, default matches anything).
    def self.validates_format_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'did not match regular expression',
        :with => /.*/,
      })

      validates_from(attrs, options, lambda {|a| a.to_s.match(options[:with])})
    end

    ################################################################################
    # Ensure that the given attributes have values in the given list
    # (option :in is the list of accepted values).
    def self.validates_inclusion_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not in the list of accepted values',
        :in => [],
      })

      validates_from(attrs, options, lambda {|a| options[:in].include?(a)})
    end

    ################################################################################
    # Ensure that the given attribute is a number
    def self.validates_numericality_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not a number',
      })

      # Kernel.Float accepts anything Float() can parse (so "1e3" passes);
      # nil raises TypeError and is therefore rejected too
      closure = lambda do |a|
        begin
          Kernel.Float(a.to_s)
        rescue ArgumentError, TypeError
          false
        else
          true
        end
      end

      validates_from(attrs, options, closure)
    end

    ################################################################################
    # If using acts_as_array that returns links, send them to another class
    def self.to (other_class)
      ToProxy.new(self, other_class)
    end

    ################################################################################
    # Called by the crawler to process a web page.  Parses +data+ into an
    # instance, follows pagination if enabled, optionally converts the result
    # to an array (acts_as_array), and yields each result to the block.
    def self.extract (data, uri, session, &block)
      obj = process_page(data, uri, session)

      if meta_eval {@paginated}
        if obj.respond_to?(:next_page)
          sister = obj

          # follow next_page links until one returns nil; each sister page is
          # merged into obj via the subclass's append_page (see extract_sister)
          while sister_uri = sister.next_page
            sister = extract_sister(session, obj, sister_uri)
          end
        elsif obj.respond_to?(:link_for_page)
          # page 1 is obj itself; fetch the remaining pages explicitly
          (2 .. obj.pages).each do |page|
            sister_uri = obj.link_for_page(page)
            extract_sister(session, obj, sister_uri)
          end
        end
      end

      as_array = meta_eval {@as_array}
      obj = obj.send(as_array) if as_array

      return obj unless block
      obj.respond_to?(:each) ? obj.each {|o| yield(o)} : yield(obj)
    end

    ################################################################################
    # Have a chance to do something after parsing, but before validation
    def after_parse
    end

    ################################################################################
    # Called by the extract method to validate scraped data. If you override this
    # method, you should call super. This method will probably be changed in the
    # future so that you don't have to call super.
    def validate
      validations = self.class.meta_eval { @validations }

      # each validation is {:name, :options, :proc}; raise on the first
      # attribute whose proc rejects its current value
      validations.each do |v|
        raise "#{self.class}.#{v[:name]} #{v[:options][:message]}" unless
          v[:proc].call(send(v[:name]))
      end

      self
    end

    ################################################################################
    protected

    ################################################################################
    # Called by extract to process a page object: optionally pipe the HTML
    # through xsltproc, parse it with Hpricot/RuleParser, then run the
    # after_parse hook and validations.
    def self.process_page (data, uri, session)
      if file = meta_eval { @with_xslt }
        # NOTE(review): the stylesheet filename is interpolated into a shell
        # command — ensure with_xslt is only ever given trusted paths
        options = "--html '#{file}' -"

        # feed the raw HTML to xsltproc on stdin and read back the transform
        open("|#{XSLTPROC} #{options} 2> /dev/null", 'w+') do |xsltproc|
          xsltproc << data
          xsltproc.close_write
          data = xsltproc.read
        end
      end

      # parse() is provided by RuleParser; it builds the instance from rules
      obj = parse(Hpricot(data))
      obj.uri = uri
      obj.session = session
      obj.after_parse
      obj.validate
      obj
    end

    ################################################################################
    # Called by extract to process paginated objects: fetch one sister page,
    # parse it, and merge it into obj via the subclass's append_page.
    def self.extract_sister (session, obj, sister_uri)
      res = session.crawler.fetch(sister_uri)
      sister = process_page(res.body, sister_uri, session)
      obj.append_page(sister)
      sister
    end

    ################################################################################
    private

    ################################################################################
    # Add some things to sub-classes: fresh per-subclass validation list and
    # pagination/array flags stored as class-instance variables.
    def self.inherited (klass)
      klass.meta_eval do
        @validations = []
        @paginated = false
        @as_array = false
      end
    end

    ################################################################################
    # generic way to add validation: record one {:name, :options, :proc}
    # entry per attribute for validate() to run later
    def self.validates_from (attrs, options, closure)
      meta_eval do
        attrs.each do |a|
          @validations << {
            :name => a,
            :options => options,
            :proc => closure,
          }
        end
      end
    end

    ################################################################################
    # helper to correctly parse the validate calls: a trailing Hash in the
    # attribute list overrides the supplied defaults
    def self.attrs_options (attrs, options)
      ops = attrs.pop if attrs.last.is_a?(Hash)
      options.update(ops) if ops
      [attrs, options]
    end

  end
  ################################################################################
end
################################################################################