RubyGems - scrapes - Versions diffs - 0.2.0 - Mend

scrapes 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

data/LICENSE +22 -0
data/README +123 -0
data/demo/demo.rb +33 -0
data/demo/pages/about.rb +32 -0
data/demo/pages/main.rb +32 -0
data/lib/scrapes.rb +41 -0
data/lib/scrapes/cache.rb +110 -0
data/lib/scrapes/cookbook.rb +53 -0
data/lib/scrapes/cookies.rb +45 -0
data/lib/scrapes/crawler.rb +97 -0
data/lib/scrapes/hpricot.rb +110 -0
data/lib/scrapes/initializer.rb +86 -0
data/lib/scrapes/page.rb +319 -0
data/lib/scrapes/rule_parser.rb +327 -0
data/lib/scrapes/session.rb +155 -0
data/lib/scrapes/to_proxy.rb +50 -0
data/test/cache.rb +75 -0
data/test/cookies.rb +34 -0
data/test/crawler.rb +69 -0
data/test/hpricot.rb +55 -0
data/test/initializer.rb +54 -0
data/test/lib/server.rb +63 -0
data/test/page.rb +77 -0
data/test/pages/foils.rb +61 -0
data/test/pages/foils2.rb +38 -0
data/test/pages/redhanded_entries.rb +36 -0
data/test/pages/redhanded_main.rb +58 -0
data/test/pages/rule_parser.rb +81 -0
data/test/pages/simple.rb +21 -0
data/test/public/foil72.html +10 -0
data/test/public/foil73.html +9 -0
data/test/public/foil74.html +11 -0
data/test/public/foo.txt +1 -0
data/test/public/index.html +20 -0
data/test/public/redhanded.html +1208 -0
data/test/public/rule_parser.html +21 -0
data/test/public/simple.html +8 -0
data/test/rule_parser.rb +151 -0
data/test/session.rb +45 -0
data/test/textcontent.rb +71 -0
metadata +123 -0

data/lib/scrapes/rule_parser.rb ADDED

@@ -0,0 +1,327 @@
+################################################################################
+#
+# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+#--
+# This started as a branch of the uformatparser lib by:
+#   Author:: Assaf Arkin assaf@labnotes.org
+#   Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
+#   Copyright:: Copyright (c) 2005 Assaf Arkin
+#   License:: Creative Commons Attribution-ShareAlike
+# Rewrite and Hpricot support by Michael Garriss
+#++
+################################################################################
+require 'yaml'
+require 'scrapes/hpricot'
+################################################################################
+module Scrapes
+  ################################################################################
+  # The methods defined here are available at the class scope level of a Scrapes::Page
+  # subclass.  For example:
+  #   class Foobar < Scrapes::Page
+  #     rule :foo, 'foo'
+  #     rule_1 :bar, 'bar', 'text()'
+  #   end
+  #--
+  # === Using <tt>rule</tt>
+  # === Using <tt>rule_1</tt>
+  # === Using <tt>selector</tt>
+  # === Using <tt>extractor</tt>
+  #++
+  module RuleParser
+    ################################################################################
+    # name:: the name later used to invoke this rule
+    # select:: the selector to use, String or Symbol
+    # extract:: the extractor to use, String, Symbol, or Class. See RuleParser#extractor
+    # limit:: the limit of nodes to send to extractor
+    # block:: a block extractor, must not be defined if extract is non-nil
+    # Example:
+    #   class Foobar < Scrapes::Page
+    #     rule :foo, 'foo'
+    #   end
+    # Later it's used as an instance method on the Scrapes::Page objects like this:
+    #   foobar.foo.each do |foo|
+    #     example.attr << foo
+    #   end
+    def rule(name, select = '', extract = nil, limit = -1, &block)
+      raise InvalidRuleException, "First argument (rule name) is required" unless name
+      # attr_accessor defines the same reader/writer pair as the obsolete
+      # boolean form `attr name, true`, without the deprecation warning.
+      attr_accessor name
+      # The selector and extractor are built anonymously (nil name) for this rule.
+      rules << Rule.new(name, selector(nil, select), extractor(nil, extract, &block), limit)
+    end
+    ################################################################################
+    # Almost the same as rule except forces limit to be 1.  The other difference is
+    # that RuleParser#rule returns collections of matches (an Array, even if of size 1) whereas
+    # RuleParser#rule_1 just returns the match.
+    # name:: the name later used to invoke this rule
+    # select:: the selector to use, String or Symbol
+    # extract:: the extractor to use, String, Symbol, or Class
+    # block:: a block extractor, must not be defined if extract is non-nil
+    # Example:
+    #   class Foobar < Scrapes::Page
+    #     rule_1 :bar, 'tr'
+    #   end
+    # Later it's used as an instance method on the Scrapes::Page objects like this:
+    #   example.attr = foobar.bar
+    def rule_1(name, selector = '', extractor = nil, &block)
+      # Same as rule, with the node limit pinned to a single match.
+      single_match = 1
+      rule name, selector, extractor, single_match, &block
+    end
+    ################################################################################
+    # Creates a standalone selector that can later be used in a rule.  Example:
+    #   class Foobar < Scrapes::Page
+    #     selector :foo_select, 'table'
+    #     rule_1 :bar, :foo_select # a Symbol triggers use of the selector
+    #   end
+    # name:: the name later used to invoke this selector
+    # select:: the selector to use, String or NilClass
+    # block:: a block selector, must not be defined if select is non-nil
+    # A block selector is yielded the Hpricot doc object just once.  The collection it
+    # returns is iterated over and each match is passed to the extractor.  Example:
+    #   class Foobar < Scrapes::Page
+    #     selector :foo_select_2 do |hpricot_doc|
+    #       hpricot_doc.search('table')
+    #     end
+    #     rule_1 :bar, :foo_select_2 # a Symbol triggers use of the selector
+    #   end
+    # String selectors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted as Hpricot
+    # search strings.  See http://code.whytheluckystiff.net/hpricot/wiki/AnHpricotShowcase
+    def selector(name, select = nil, &block)
+      # Delegate to the shared builder, filing the result under the selectors.
+      tor('@selector', name, select, &block)
+    end
+    ################################################################################
+    # Creates a standalone extractor that can later be used in a rule.  Example:
+    #   class Foobar < Scrapes::Page
+    #     extractor :mailto_extract do |elem|
+    #       elem.attributes['href'].sub(/mailto:/,'') # remove the mailto: string
+    #     end
+    #     rule :emails, 'a[@href^="mailto:"]', :mailto_extract
+    #   end
+    # name:: the name later used to invoke this extractor
+    # extract:: the extractor to use, String or NilClass
+    # block:: a block extractor, must not be defined if extract is non-nil
+    # A block extractor is yielded each object that matched the rule's selector.
+    #
+    # Extractors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted based on
+    # the class of the extractor as follows
+    # ==== NilClass
+    # The result of the selector is just re-returned.  Thus <tt>foo.my_rule</tt> would
+    # just return the selector results defined on the :my_rule rule.
+    # ==== Symbol
+    # A custom extractor is used.  See above docs for this method for an example.
+    # ==== Class
+    # A nested class of the given name is used as a new inner-parser. An instance of that
+    # class is returned from each invocation of the extractor.  Example:
+    #    class Outer < Scrapes::Page
+    #      class Inner < Scrapes::Page
+    #       rule_1 :bold_text, 'b', 'text()'
+    #       rule_1 :img_src, 'img[@src]', '@src'
+    #      end
+    #      rule :items, 'tr', Inner
+    #    end
+    # Now calling <tt>my_page.items</tt> returns an Array of Inner objects that each
+    # separately parses out the bold text and image source of each table row in the
+    # document.
+    # ==== String
+    # Two patterns:
+    # @foobar:: extract out the contents of an attribute named 'foobar'
+    # foobar():: invoke the foobar builtin extractor, see Scrapes::Hpricot::Extractors
+    def extractor(name, extract = nil, &block)
+      # Delegate to the shared builder, filing the result under the extractors.
+      tor('@extractor', name, extract, &block)
+    end
+    ################################################################################
+    # Runs every rule against +node+, letting each rule store its result on
+    # +context+.
+    # node:: the document or element handed to each rule
+    # context:: object that receives the results; defaults to a new instance
+    #           of the receiver
+    # rules:: the rules to run; defaults to the rules declared on the receiver
+    # Returns the context.
+    def parse(node, context = nil, rules = nil) # :nodoc:
+      context ||= new
+      rules   ||= self.rules
+      # Falsy entries are skipped.  The former +less_rules+ bookkeeping was
+      # written but never read, so it has been dropped.
+      rules.each { |rule| rule.process(node, context) if rule }
+      context
+    end
+    ################################################################################
+    # The list of Rule objects declared on the receiver, created on first use.
+    def rules # :nodoc:
+      @microparser_rules = [] unless @microparser_rules
+      @microparser_rules
+    end
+    private
+    ################################################################################
+    # Shared builder behind selector and extractor.  Turns +tor_arg+ (or the
+    # block) into the object a Rule will later use, registers it under +name+
+    # and, when +name+ is given, defines a method of that name (via class_def)
+    # that invokes the registered object.
+    # type:: '@selector' or '@extractor'; anything other than '@selector' is
+    #        handled as an extractor
+    # name:: registry key and method name, or nil for an anonymous entry
+    # tor_arg:: nil, String, Symbol, Proc, Method or a Class that provides parse
+    # block:: alternative to tor_arg; giving both raises InvalidRuleException
+    # Returns the object that was built.
+    def tor(type, name, tor_arg = nil, &block)
+      raise InvalidRuleException, "can't use both arg and block" if tor_arg and block
+      result = case (tor_arg ||= block)
+      when NilClass     then proc {|node| node}
+      when String
+        if type == '@selector'
+          proc {|node| node.search(tor_arg)}
+        else
+          Extractor.new self, tor_arg
+        end
+      when Proc, Method then tor_arg
+      when Symbol       then proc {|node| send(tor_arg,node) }
+      when Class
+        begin
+          tor_arg.method(:parse)
+        rescue NameError=>error
+          raise InvalidRuleException,
+            "Selector class must implement the method parse", error.backtrace
+        end
+        tor_arg
+      else
+        raise InvalidRuleException,
+          "Invalid tor type: must be a string, parser class, block or nil"
+      end
+      # Selectors and extractors are registered the same way; only the name of
+      # the registry instance variable differs.
+      # NOTE(review): the registry is kept on self.class.  When this module is
+      # mixed in through extend (see self.included) that is presumably Class
+      # itself, i.e. one registry shared by every page class -- confirm intended.
+      registry_ivar = (type == "@selector") ? '@selector' : '@extractor'
+      owner = self.class
+      registry = owner.instance_variable_get(registry_ivar) ||
+        owner.instance_variable_set(registry_ivar, {})
+      registry[name] = result
+      class_def(name) do |node|
+        self.class.instance_variable_get(registry_ivar)[name].call(node)
+      end if name
+      result
+    end
+    ################################################################################
+    # Hook run on include: the including module also gains the rule DSL and
+    # the Hpricot extractors as module-level methods.
+    def self.included(mod) # :nodoc:
+      mod.extend(self).extend(Scrapes::Hpricot::Extractors)
+    end
+    ################################################################################
+    # One named extraction rule: a selector that picks nodes and an extractor
+    # that turns each picked node into a value.
+    class Rule #:nodoc:all
+      attr_reader :name
+      attr_accessor :limit
+      attr_reader :selector
+      attr_reader :extractor
+      ################################################################################
+      # name:: rule name, stored as a Symbol
+      # selector:: object responding to call(node)
+      # extractor:: UnboundMethod, Extractor, Proc, Method or Class (see process)
+      # limit:: maximum number of values to keep; 0 disables the rule and 1
+      #         stores a single value instead of an Array
+      def initialize(name, selector, extractor, limit)
+        @name = name.to_s.intern
+        @selector, @extractor, @limit = selector, extractor, limit
+      end
+      ################################################################################
+      # Applies the rule to +node+ and stores the outcome on +context+ in an
+      # instance variable named after the rule.  +node+ itself is stored on
+      # +context+ as @hpricot.  Always returns true.
+      def process(node, context)
+        context.instance_variable_set '@hpricot', node
+        return true if @limit == 0
+        result = @selector.call(node)
+        result = [result] unless result.respond_to? :each
+        current = context.instance_variable_set "@#{@name}", []
+        # The block parameter is deliberately not called +node+, so it cannot
+        # shadow (or, on old interpreters, overwrite) the method parameter.
+        result.compact.each do |match|
+          value = case @extractor
+          when UnboundMethod then @extractor.bind(context).call(match)
+          when Extractor     then @extractor.extract(match)
+          when Proc, Method  then @extractor.call(match)
+          when Class         then @extractor.parse(match)
+          end
+          # Falsy extraction results are not collected.
+          next unless value
+          current << value
+          break if current.size == @limit
+        end
+        # A limit of 1 exposes the single value (or nil) rather than an Array.
+        context.instance_variable_set "@#{@name}", current[0] if @limit == 1
+        true
+      end
+      ################################################################################
+      def inspect
+        if @selector
+          "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]"
+        else
+          "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
+        end
+      end
+    end
+    ################################################################################
+    # Compiles a String extraction statement (alternatives separated by '|')
+    # into a list of callables; extract returns the first truthy result.
+    class Extractor # :nodoc:all
+      # TODO review this
+      # Parse each extractor into three parts:
+      # $1 function name (excluding parentheses)
+      # $2 element name
+      # $3 attribute name (including leading @)
+      # If a match is found the result is either $1, or $2 and/or $3
+      # NOTE(review): only the first alternative is anchored at the start and
+      # only the second at the end, so surrounding text is tolerated -- confirm
+      # whether that is intended before tightening the pattern.
+      REGEX = /^(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?$/
+      ################################################################################
+      # context:: object whose methods back the "name()" form
+      # statement:: the extraction statement; it is not modified
+      # Raises InvalidRuleException for unknown functions and for invalid or
+      # empty statements.
+      def initialize(context, statement) # :nodoc:
+        # Strip a copy: strip! would alter the caller's string and raises on
+        # frozen strings.
+        statement = statement.strip
+        @extracts = []
+        statement.split('|').each do |piece|
+          parts = REGEX.match(piece)
+          if parts[1]
+            begin
+              @extracts << context.method(parts[1])
+            rescue NameError=>error
+              raise InvalidRuleException, error.message, error.backtrace
+            end
+          elsif parts[2] and parts[3]
+            attr_name = parts[3][1..-1]
+            @extracts << proc do |node|
+              node.attributes[attr_name] if node.name == parts[2]
+            end
+          elsif parts[2]
+            # NOTE(review): text is called without a receiver, so it resolves
+            # against this Extractor, which defines no such method in this
+            # file -- confirm where it is expected to come from.
+            @extracts << proc { |node| text(node) if node.name == parts[2] }
+          elsif parts[3]
+            attr_name = parts[3][1..-1]
+            @extracts << proc do |node|
+              if node.respond_to? :each
+                node.all.attributes.all[attr_name]
+              else
+                node.attributes[attr_name]
+              end
+            end
+          else
+            raise InvalidRuleException, "Invalid extraction statement"
+          end
+        end
+        raise InvalidRuleException, "Invalid (empty) extraction statement" if
+          @extracts.size == 0
+      end
+      ################################################################################
+      # Tries each compiled alternative in order and returns the first truthy
+      # result; otherwise returns whatever the last alternative produced.
+      def extract(node) # :nodoc:
+        value = nil
+        @extracts.each do |candidate|
+          value = candidate.call(node)
+          break if value
+        end
+        value
+      end
+      ################################################################################
+      def inspect # :nodoc:
+        @extracts.join('|')
+      end
+    end
+    ################################################################################
+    # Raised for malformed rule, selector and extractor definitions.
+    # Inherits from StandardError (not Exception) so that a plain +rescue+
+    # handles it; code rescuing InvalidRuleException or Exception is unaffected.
+    class InvalidRuleException < StandardError # :nodoc:all
+    end
+  end
+end

data/lib/scrapes/session.rb ADDED

@@ -0,0 +1,155 @@
+################################################################################
+#
+# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+require 'scrapes/crawler'
+module Scrapes
+  ################################################################################
+  # Session is used to process all web pages under a single session.  This may
+  # be necessary when some web sites need you to login, or otherwise create
+  # a session ID with a cookie before you can continue processing pages.
+  class Session
+    ################################################################################
+    attr_reader :log
+    ################################################################################
+    attr_accessor :post
+    ################################################################################
+    attr_accessor :timeout
+    ################################################################################
+    attr_accessor :cookies
+    ################################################################################
+    attr_reader :uri
+    ################################################################################
+    attr_reader :crawler
+    ################################################################################
+    attr_reader :base_uris
+    ################################################################################
+    # Start a session using a HTTP GET
+    def self.from_get(uri, &block)
+      session = new
+      session.uri = uri
+      # Without a block the session itself is returned; with one, the block's value.
+      return session unless block
+      yield(session)
+    end
+    ################################################################################
+    # Start a session using HTTP POST
+    def self.from_post(uri, post, &block)
+      session = new
+      session.uri = uri
+      session.post = post
+      # Without a block the session itself is returned; with one, the block's value.
+      return session unless block
+      yield(session)
+    end
+    ################################################################################
+    # Start a session without having to create a session with the web site first.
+    def self.start(log = nil, &block)
+      session = new(log)
+      # Without a block the session itself is returned; with one, the block's value.
+      return session unless block
+      yield(session)
+    end
+    ################################################################################
+    # log:: optional logger object; it is also handed to the crawler
+    def initialize(log = nil)
+      @uri       = nil
+      @post      = {}
+      # The epoch as "last refresh" makes the first refresh always due.
+      @when      = Time.at(0)
+      @timeout   = 900
+      @cookies   = Cookies.new
+      @base_uris = []
+      @crawler   = Crawler.new(self)
+      @log       = log
+      @crawler.log = log
+      @refreshing = false
+    end
+    ################################################################################
+    # Sets the session URI and also records it as a base for relative links.
+    def uri=(uri)
+      @uri = uri
+      @base_uris.push(uri)
+    end
+    ################################################################################
+    # Process a web page
+    def page(page_class, link, post = {}, &block)
+      return if link.nil?
+      # Accept either one link or anything array-like.
+      links = link.respond_to?(:to_ary) ? link : [link]
+      # Without a block the extracted data is passed through unchanged.
+      block ||= lambda {|data| data}
+      result = nil
+      links.each do |u|
+        fetch(u, post) { |res| result = page_class.extract(res.body, u, self, &block) }
+      end
+      # Only the result for the last link is returned.
+      result
+    end
+    ################################################################################
+    # Fetch a URL in the session, but without a Scrapes::Page
+    def fetch(uri, post = {}, &block)
+      u = absolute_uri(uri)
+      # While the block runs, the fetched URI is the base for relative links.
+      @base_uris.push(u)
+      popped = nil
+      begin
+        yield(@crawler.fetch(u, post))
+      ensure
+        # Always unwind the base-URI stack, even when the crawler or the block
+        # raises; otherwise later relative links would resolve against a stale
+        # entry.
+        popped = @base_uris.pop
+      end
+      # As before, the return value is the entry removed from the stack.
+      popped
+    end
+    ################################################################################
+    # Refresh the session, sometimes necessary when you are getting pages out of the
+    # cache, but then go to the real web site and the session has expired.
+    def refresh
+      # Due only when no refresh is already running, a session URI is set and
+      # the timeout has elapsed since the last refresh.
+      due = !@refreshing && @uri && (Time.now - @when) > @timeout
+      return self unless due
+      @refreshing = true
+      begin
+        @when = Time.now
+        @cookies.clear
+        @crawler.cache.without_cache { @crawler.fetch(uri, post) }
+      ensure
+        @refreshing = false
+      end
+      self
+    end
+    ################################################################################
+    # Convert a relative URI to an absolute URI
+    def absolute_uri(uri)
+      # With no base recorded yet the URI is returned untouched.
+      return uri if @base_uris.empty?
+      # Otherwise resolve it against the most recently pushed base URI.
+      URI.parse(@base_uris.last).merge(uri).to_s
+    end
+  end
+  ################################################################################
+end
+################################################################################