scrapi 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "uri"
9
+ require "net/http"
10
+ require "net/https"
11
+ begin
12
+ require "rubygems"
13
+ require "tidy"
14
+ rescue LoadError
15
+ end
16
+
17
+
18
+ module Scraper
19
+
20
+ module Reader
21
+
22
+ class HTTPError < StandardError
23
+
24
+ attr_reader :cause
25
+
26
+ def initialize(cause = nil)
27
+ @cause = cause
28
+ end
29
+
30
+
31
+ def to_s
32
+ @cause ? "#{super}: #{@cause}" : super
33
+ end
34
+
35
+ end
36
+
37
+ class HTTPTimeoutError < HTTPError ; end
38
+ class HTTPUnspecifiedError < HTTPError ; end
39
+ class HTTPNotFoundError < HTTPError ; end
40
+ class HTTPNoAccessError < HTTPError ; end
41
+ class HTTPInvalidURLError < HTTPError ; end
42
+ class HTTPRedirectLimitError < HTTPError ; end
43
+
44
+
45
+ class HTMLParseError < StandardError
46
+
47
+ attr_reader :cause
48
+
49
+ def initialize(cause = nil)
50
+ @cause = cause
51
+ end
52
+
53
+ def to_s
54
+ @cause ? "#{super}: #{@cause}" : super
55
+ end
56
+
57
+ end
58
+
59
+
60
+ unless const_defined? :REDIRECT_LIMIT
61
+ REDIRECT_LIMIT = 3
62
+ DEFAULT_TIMEOUT = 30
63
+ PARSERS = [:tidy, :html_parser]
64
+ end
65
+
66
+ unless const_defined? :TIDY_OPTIONS
67
+ TIDY_OPTIONS = {
68
+ :output_xhtml=>true,
69
+ :show_errors=>0,
70
+ :show_warnings=>false,
71
+ :wrap=>0,
72
+ :wrap_sections=>false,
73
+ :force_output=>true,
74
+ :quiet=>true,
75
+ :tidy_mark=>false
76
+ }
77
+ end
78
+
79
+
80
+ Page = Struct.new(:url, :content, :encoding, :last_modified, :etag)
81
+ Parsed = Struct.new(:document, :encoding)
82
+
83
+
84
+ module_function
85
+
86
+ # :call-seq:
87
+ # read_page(url, options?) => response
88
+ #
89
+ # Reads a Web page and returns its URL, content and cache control headers.
90
+ #
91
+ # The request reads a Web page at the specified URL (must be a URI object).
92
+ # It accepts the following options:
93
+ # * :last_modified -- Last modified header (from a previous request).
94
+ # * :etag -- ETag header (from a previous request).
95
+ # * :redirect_limit -- Number of redirects allowed (default is 3).
96
+ # * :user_agent -- The User-Agent header to send.
97
+ # * :http_timeout -- HTTP open connection/read timeouts (in seconds).
98
+ #
99
+ # It returns a hash with the following information:
100
+ # * :url -- The URL of the requested page (may change by permanent redirect)
101
+ # * :content -- The content of the response (may be nil if cached)
102
+ # * :content_type -- The HTML page Content-Type header
103
+ # * :last_modified -- Last modified cache control header (may be nil)
104
+ # * :etag -- ETag cache control header (may be nil)
105
+ # * :encoding -- Document encoding for the page
106
+ # If the page has not been modified from the last request, the content is nil.
107
+ #
108
+ # Raises HTTPError if an error prevents it from reading the page.
109
+ def read_page(url, options = nil)
110
+ options ||= {}
111
+ redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
112
+ raise HTTPRedirectLimitError if redirect_limit == 0
113
+ if url.is_a?(URI)
114
+ uri = url
115
+ else
116
+ begin
117
+ uri = URI.parse(url)
118
+ rescue Exception=>error
119
+ raise HTTPInvalidURLError.new(error)
120
+ end
121
+ end
122
+ raise HTTPInvalidURLError unless uri.scheme =~ /^http(s?)$/
123
+ begin
124
+ http = Net::HTTP.new(uri.host, uri.port)
125
+ http.use_ssl = (uri.scheme == "https")
126
+ http.close_on_empty_response = true
127
+ http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
128
+ path = uri.path.dup # required so we don't modify path
129
+ path << "?#{uri.query}" if uri.query
130
+ # TODO: Specify which content types are accepted.
131
+ # TODO: GZip support.
132
+ headers = {}
133
+ headers["User-Agent"] = options[:user_agent] if options[:user_agent]
134
+ headers["Last-Modified"] = options[:last_modified] if options[:last_modified]
135
+ headers["ETag"] = options[:etag] if options[:etag]
136
+ response = http.request_get(path, headers)
137
+ # TODO: Ignore content types that do not map to HTML.
138
+ rescue TimeoutError=>error
139
+ raise HTTPTimeoutError.new(error)
140
+ rescue Exception=>error
141
+ raise HTTPUnspecifiedError.new(error)
142
+ end
143
+ case response
144
+ when Net::HTTPSuccess
145
+ encoding = if content_type = response["Content-Type"]
146
+ if match = content_type.match(/charset=([^\s]+)/i)
147
+ match[1]
148
+ end
149
+ end
150
+ return Page[(options[:source_url] || uri), response.body, encoding,
151
+ response["Last-Modified"], response["ETag"]]
152
+ when Net::HTTPNotModified
153
+ return Page[(options[:source_url] || uri), nil, nil,
154
+ options[:last_modified], options[:etag]]
155
+ when Net::HTTPMovedPermanently
156
+ return read_page(response["location"], # New URL takes effect
157
+ :last_modified=>options[:last_modified],
158
+ :etag=>options[:etag],
159
+ :redirect_limit=>redirect_limit-1)
160
+ when Net::HTTPRedirection
161
+ return read_page(response["location"],
162
+ :last_modified=>options[:last_modified],
163
+ :etag=>options[:etag],
164
+ :redirect_limit=>redirect_limit-1,
165
+ :source_url=>(options[:source_url] || uri)) # Old URL still in effect
166
+ when Net::HTTPNotFound
167
+ raise HTTPNotFoundError
168
+ when Net::HTTPUnauthorized, Net::HTTPForbidden
169
+ raise HTTPNoAccessError
170
+ when Net::HTTPRequestTimeOut
171
+ raise HTTPTimeoutError
172
+ else
173
+ raise HTTPUnspecifiedError
174
+ end
175
+ end
176
+
177
+
178
+ # :call-seq:
179
+ # parse_page(html, encoding?, options?, parser) => html
180
+ #
181
+ # Parses an HTML page and returns the encoding and HTML element.
182
+ # Raises HTMLParseError exceptions if it cannot parse the HTML.
183
+ #
184
+ # Options are passed to the parser. For example, when using Tidy
185
+ # you can pass Tidy cleanup options in the hash.
186
+ #
187
+ # The last option specifies which parser to use (see PARSERS).
188
+ # By default Tidy is used.
189
+ def parse_page(content, encoding = nil, options = nil, parser = :tidy)
190
+ begin
191
+ # Get the document encoding from the meta header.
192
+ if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
193
+ if meta = meta[0].match(/charset=([\w-]*)/i)
194
+ encoding = meta[1]
195
+ end
196
+ end
197
+ encoding ||= "utf8"
198
+ case (parser || :tidy)
199
+ when :tidy
200
+ # Make sure the Tidy path is set and always apply the default
201
+ # options (these only control things like errors, output type).
202
+ find_tidy
203
+ options = (options || {}).update(TIDY_OPTIONS)
204
+ options[:input_encoding] = encoding.gsub("-", "").downcase
205
+ document = Tidy.open(options) do |tidy|
206
+ html = tidy.clean(content)
207
+ HTML::Document.new(html).find(:tag=>"html")
208
+ end
209
+ when :html_parser
210
+ document = HTML::HTMLParser.parse(content).root
211
+ else
212
+ raise HTMLParseError, "No parser #{parser || "unspecified"}"
213
+ end
214
+ return Parsed[document, encoding]
215
+ rescue Exception=>error
216
+ raise HTMLParseError.new(error)
217
+ end
218
+ end
219
+
220
+
221
+ protected
222
+ module_function
223
+
224
+ def find_tidy()
225
+ return if Tidy.path
226
+ begin
227
+ Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
228
+ rescue LoadError
229
+ begin
230
+ Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
231
+ rescue LoadError
232
+ Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
233
+ end
234
+ end
235
+ end
236
+
237
+ end
238
+
239
+ end
@@ -0,0 +1,8 @@
1
+ # Conditional loads, since we may have these libraries elsewhere,
2
+ # e.g. when using Rails with assert_select plugin.
3
+ require File.join(File.dirname(__FILE__), "html", "document") unless defined?(HTML::Document)
4
+ require File.join(File.dirname(__FILE__), "html", "node_ext") unless defined?(HTML::Node.detach)
5
+ require File.join(File.dirname(__FILE__), "html", "selector") unless defined?(HTML::Selector)
6
+ require File.join(File.dirname(__FILE__), "html", "htmlparser") unless defined?(HTML::HTMLParser)
7
+
8
+ require File.join(File.dirname(__FILE__), "scraper", "base") unless defined?(Scraper::Base)
Binary file
Binary file
@@ -0,0 +1,54 @@
1
+ require "net/http"
2
+
3
+ class Net::HTTP
4
+
5
+ @@on_get = nil
6
+
7
+ # Reset get method to default behavior.
8
+ def self.reset_on_get
9
+ @@on_get = nil
10
+ end
11
+
12
+
13
+ # :call-seq:
14
+ # on_get { |address, path, headers| ... => [response, body] }
15
+ #
16
+ # Specify alternative behavior for next execution of get method.
17
+ # This change applies to all instances of Net::HTTP, so do not use
18
+ # this method when running tests in parallel.
19
+ #
20
+ # The method takes a single block that accepts three arguments:
21
+ # the address (host), path and headers (hash). It must return an
22
+ # array with two values: the Net::HTTPResponse object and the
23
+ # content of the response body.
24
+ def self.on_get(&block)
25
+ @@on_get = block
26
+ end
27
+
28
+
29
+ unless method_defined?(:mocked_request_get)
30
+ alias :mocked_request_get :request_get
31
+
32
+ def request_get(path, headers)
33
+ # If we have prescribed behavior for the next search, execute it,
34
+ # otherwise, go with the default.
35
+ if @@on_get
36
+ response, body = @@on_get.call(@address, path, headers)
37
+ # Stuff the body into the response. No other way, since read_body
38
+ # attempts to read from a socket and we're too lazy to stub a socket.
39
+ response.instance_variable_set(:@mock_body, body.to_s)
40
+ class << response
41
+ def read_body()
42
+ @mock_body
43
+ end
44
+ end
45
+ response
46
+ else
47
+ mocked_request_get(path, headers)
48
+ end
49
+ end
50
+
51
+ end
52
+
53
+ end
54
+
@@ -0,0 +1,24 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "test/unit"
10
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
11
+
12
+
13
+ class NodeExtTest < Test::Unit::TestCase
14
+
15
+ def setup
16
+ end
17
+
18
+ def teardown
19
+ end
20
+
21
+ def test_add_tests
22
+ end
23
+
24
+ end
@@ -0,0 +1,299 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "test/unit"
10
+ require "time" # rfc2822
11
+ require "webrick"
12
+ require "webrick/https"
13
+ require "logger"
14
+ require "stringio"
15
+ require File.join(File.dirname(__FILE__), "mock_net_http")
16
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
17
+
18
+
19
+ class ReaderTest < Test::Unit::TestCase
20
+
21
+ include Scraper
22
+
23
+
24
+ WEBRICK_OPTIONS = {
25
+ :BindAddredd=>"127.0.0.1",
26
+ :Port=>2000,
27
+ :Logger=>Logger.new(StringIO.new) # /dev/null
28
+ }
29
+
30
+ WEBRICK_TEST_URL = "http://127.0.0.1:2000/test.html"
31
+
32
+
33
+ def setup
34
+ Net::HTTP.reset_on_get
35
+ end
36
+
37
+ def teardown
38
+ Net::HTTP.reset_on_get
39
+ end
40
+
41
+
42
+ #
43
+ # Tests read_page.
44
+ #
45
+
46
+ def test_should_pass_path_and_user_agent
47
+ # Test path, query string and user agent.
48
+ Net::HTTP.on_get do |address, path, headers|
49
+ assert_equal "localhost", address
50
+ assert_equal "/path?query", path
51
+ assert_equal "MyUserAgent", headers["User-Agent"]
52
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
53
+ end
54
+ response = Reader.read_page("http://localhost/path?query", :user_agent=>"MyUserAgent")
55
+ assert_equal "http://localhost/path?query", response.url.to_s
56
+ assert_equal "nothing", response.content
57
+ assert_equal nil, response.last_modified
58
+ assert_equal nil, response.etag
59
+ end
60
+
61
+
62
+ def test_should_handle_http_and_timeout_errors
63
+ # Test timeout error and HTTP status that we can't process.
64
+ Net::HTTP.on_get { |address, path, headers| raise TimeoutError }
65
+ assert_raise(Reader::HTTPTimeoutError) do
66
+ response = Reader.read_page("http://localhost/path?query")
67
+ end
68
+ Net::HTTP.on_get { |address, path, headers| [Net::HTTPRequestTimeOut.new(Net::HTTP.version_1_2, 408, "Timeout"),""] }
69
+ assert_raise(Reader::HTTPTimeoutError) do
70
+ response = Reader.read_page("http://localhost/path?query")
71
+ end
72
+ end
73
+
74
+
75
+ def test_should_fail_on_too_many_redirects
76
+ # Test too many redirections.
77
+ Net::HTTP.on_get do |address, path, headers|
78
+ response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
79
+ response["location"] = "http://localhost"
80
+ [response, ""]
81
+ end
82
+ assert_raise(Reader::HTTPRedirectLimitError) do
83
+ response = Reader.read_page("http://localhost/path?query")
84
+ end
85
+ Net::HTTP.on_get do |address, path, headers|
86
+ response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
87
+ response["location"] = "http://localhost"
88
+ [response, ""]
89
+ end
90
+ assert_raise(Reader::HTTPRedirectLimitError) do
91
+ response = Reader.read_page("http://localhost/path?query")
92
+ end
93
+ end
94
+
95
+
96
+ def test_should_validate_redirect_url
97
+ # Test validation of redirection URI.
98
+ Net::HTTP.on_get do |address, path, headers|
99
+ response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
100
+ response["location"] = "ftp://notsupported"
101
+ [response, ""]
102
+ end
103
+ assert_raise(Reader::HTTPInvalidURLError) do
104
+ response = Reader.read_page("http://localhost/path?query")
105
+ end
106
+ end
107
+
108
+
109
+ def test_should_support_redirection
110
+ # Test working redirection. Redirect only once and test response URL.
111
+ # Should be new URL for permanent redirect, same URL for all other redirects.
112
+ Net::HTTP.on_get do |address, path, headers|
113
+ if path.empty?
114
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
115
+ else
116
+ response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
117
+ response["Location"] = "http://localhost"
118
+ [response, ""]
119
+ end
120
+ end
121
+ assert_nothing_raised() do
122
+ response = Reader.read_page("http://localhost/path?query")
123
+ assert_equal "http://localhost/path?query", response.url.to_s
124
+ end
125
+ end
126
+
127
+
128
+ def test_should_support_permanent_redirection
129
+ # Test working redirection. Redirect only once and test response URL.
130
+ # Should be new URL for permanent redirect, same URL for all other redirects.
131
+ Net::HTTP.on_get do |address, path, headers|
132
+ if path == "/"
133
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
134
+ else
135
+ response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
136
+ response["location"] = "http://localhost/"
137
+ [response, ""]
138
+ end
139
+ end
140
+ assert_nothing_raised() do
141
+ response = Reader.read_page("http://localhost/path?query")
142
+ assert_equal "http://localhost/", response.url.to_s
143
+ end
144
+ end
145
+
146
+
147
+ def test_should_use_cache_control
148
+ # Test Last Modified and ETag headers. First, that they are correctly
149
+ # returned from headers to response object. Next, that passing right
150
+ # headers in options returns nil body and same values (no change),
151
+ # passing wrong/no headers, returns page.
152
+ time = Time.new.rfc2822
153
+ Net::HTTP.on_get do |address, path, headers|
154
+ response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
155
+ response["Last-Modified"] = time
156
+ response["ETag"] = "etag"
157
+ [response, "nothing"]
158
+ end
159
+ response = Reader.read_page("http://localhost/path?query")
160
+ assert_equal time, response.last_modified
161
+ assert_equal "etag", response.etag
162
+ Net::HTTP.on_get do |address, path, headers|
163
+ if headers["Last-Modified"] == time and headers["ETag"] == "etag"
164
+ [Net::HTTPNotModified.new(Net::HTTP.version_1_2, 304, "Same"), ""]
165
+ else
166
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
167
+ end
168
+ end
169
+ response = Reader.read_page("http://localhost/path?query")
170
+ assert_equal "nothing", response.content
171
+ response = Reader.read_page("http://localhost/path?query", :last_modified=>time, :etag=>"etag")
172
+ assert_equal nil, response.content
173
+ assert_equal time, response.last_modified
174
+ assert_equal "etag", response.etag
175
+ end
176
+
177
+
178
+ def test_should_find_encoding
179
+ # Test working redirection. Redirect only once and test response URL.
180
+ # Should be new URL for permanent redirect, same URL for all other redirects.
181
+ Net::HTTP.on_get do |address, path, headers|
182
+ response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
183
+ response["content-type"] = "text/html; charset=bogus"
184
+ [response, ""]
185
+ end
186
+ response = Reader.read_page("http://localhost/path?query")
187
+ assert_equal "bogus", response.encoding
188
+ end
189
+
190
+
191
+ #
192
+ # Tests parse_page.
193
+ #
194
+
195
+ def test_should_parse_html_page
196
+ html = Reader.parse_page("<html><head></head><body><p>something</p></body></html>").document
197
+ assert_equal 1, html.find_all(:tag=>"head").size
198
+ assert_equal 1, html.find_all(:tag=>"body").size
199
+ assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
200
+ assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
201
+ end
202
+
203
+
204
+ def test_should_use_tidy_if_specified
205
+ # This will only work with Tidy which adds the head/body parts,
206
+ # HTMLParser doesn't fix the HTML.
207
+ html = Reader.parse_page("<p>something</p>", nil, {}).document
208
+ assert_equal 1, html.find_all(:tag=>"head").size
209
+ assert_equal 1, html.find_all(:tag=>"body").size
210
+ assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
211
+ assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
212
+ end
213
+
214
+
215
+ #
216
+ # Other tests.
217
+ #
218
+
219
+ def test_should_handle_encoding_correctly
220
+ # Test content encoding returned from HTTP server.
221
+ with_webrick do |server, params|
222
+ server.mount_proc "/test.html" do |req,resp|
223
+ resp["Content-Type"] = "text/html; charset=my-encoding"
224
+ resp.body = "Content comes here"
225
+ end
226
+ page = Reader.read_page(WEBRICK_TEST_URL)
227
+ page = Reader.parse_page(page.content, page.encoding)
228
+ assert_equal "my-encoding", page.encoding
229
+ end
230
+ # Test content encoding in HTML http-equiv header
231
+ # that overrides content encoding returned in HTTP.
232
+ with_webrick do |server, params|
233
+ server.mount_proc "/test.html" do |req,resp|
234
+ resp["Content-Type"] = "text/html; charset=my-encoding"
235
+ resp.body = %Q{
236
+ <html>
237
+ <head>
238
+ <meta http-equiv="content-type" value="text/html; charset=other-encoding">
239
+ </head>
240
+ <body></body>
241
+ </html>
242
+ }
243
+ end
244
+ page = Reader.read_page(WEBRICK_TEST_URL)
245
+ page = Reader.parse_page(page.content, page.encoding)
246
+ assert_equal "other-encoding", page.encoding
247
+ end
248
+ end
249
+
250
+ def test_should_support_https
251
+ begin
252
+ options = WEBRICK_OPTIONS.dup.update(
253
+ :SSLEnable=>true,
254
+ :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
255
+ :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
256
+ )
257
+ server = WEBrick::HTTPServer.new(options)
258
+ trap("INT") { server.shutdown }
259
+ Thread.new { server.start }
260
+ server.mount_proc "/test.html" do |req,resp|
261
+ resp.body = %Q{
262
+ <html>
263
+ <head>
264
+ <title>test https</title>
265
+ </head>
266
+ <body></body>
267
+ </html>
268
+ }
269
+ end
270
+ # Make sure page not HTTP accessible.
271
+ assert_raises(Reader::HTTPUnspecifiedError) do
272
+ Reader.read_page(WEBRICK_TEST_URL)
273
+ end
274
+ page = Reader.read_page(WEBRICK_TEST_URL.gsub("http", "https"))
275
+ page = Reader.parse_page(page.content, page.encoding)
276
+ assert_equal "<title>test https</title>",
277
+ page.document.find(:tag=>"title").to_s
278
+ server.shutdown
279
+ ensure
280
+ server.shutdown if server
281
+ end
282
+ end
283
+
284
+
285
+ private
286
+
287
+ def with_webrick(params = nil)
288
+ begin
289
+ server = WEBrick::HTTPServer.new(WEBRICK_OPTIONS)
290
+ trap("INT") { server.shutdown }
291
+ Thread.new { server.start }
292
+ yield server, params
293
+ server.shutdown
294
+ ensure
295
+ server.shutdown if server
296
+ end
297
+ end
298
+
299
+ end