RubyGems - scrapi - Versions diffs - 1.2.0 → 2.0.0 - Mend

scrapi 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/CHANGELOG CHANGED

@@ -1,3 +1,13 @@
+Version 2.0.0 (November 10, 2010)
+* Ruby 1.9.2 support using Tidy FFI, by Christoph Lupprich.
+Version 1.2.1 (Upcoming)
+* Added: Cheat sheets.
+* Fixed: Support for redirects that return path-only locations.
+         Credit: Rick Wargo (http://www.rickwargo.com)
 Version 1.2.0 (August 27, 2006)
 * Added: collect() method called just before result().

data/MIT-LICENSE CHANGED

File without changes

data/{README → README.rdoc} RENAMED

@@ -40,13 +40,22 @@ To get the latest source code with regular updates:
 svn co http://labnotes.org/svn/public/ruby/scrapi
+== Version of Ruby
+ScrAPI 1.2.x tested with Ruby 1.8.6 and 1.8.7, but will not work on Ruby 1.9.x.
+ScrAPI 2.0.x switches to TidyFFI to run on Ruby 1.9.2 and newer.
+Due to a bug in Ruby's visibility context handling (see changelog #29578 and bug
+#3406 on the official Ruby page), you need to declare all result attributes
+explicitly, using result method or attr_reader/_accessor.
 == Using TIDY
-By default scrAPI uses Tidy to cleanup the HTML.
+By default scrAPI uses Tidy (actually Tidy-FFI) to cleanup the HTML.
 You need to install the Tidy Gem for Ruby:
-  gem install tidy
+  gem install tidy_ffi
 And the Tidy binary libraries, available here:
@@ -56,15 +65,15 @@ By default scrAPI looks for the Tidy DLL (Windows) or shared library (Linux) in
 Alternatively, just point Tidy to the library with:
-  Tidy.path = "...."
+  TidyFFI.library_path = "...."
 On Linux this would probably be:
-  Tidy.path = "/usr/local/lib/libtidy.so"
+  TidyFFI.library_path = "/usr/local/lib/libtidy.so"
 On OS/X this would probably be:
-  Tidy.path = “/usr/lib/libtidy.dylib”
+  TidyFFI.library_path = “/usr/lib/libtidy.dylib”
 For testing purposes, you can also use the built in HTML parser. It's useful for testing and getting to grips with scrAPI, but it doesn't deal well with broken HTML. So for testing only:
@@ -86,3 +95,5 @@ HTML DOM extracted from Rails, Copyright (c) 2004 David Heinemeier Hansson. Unde
 HTML parser by Takahiro Maebashi and Katsuyuki Komatsu, Ruby license.
 http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html
+Porting to Ruby 1.9.x by Christoph Lupprich, http://lupprich.info

data/Rakefile CHANGED

@@ -1,12 +1,11 @@
 require "benchmark"
 require "rubygems"
-Gem::manage_gems
 require "rake"
 require "rake/testtask"
 require "rake/rdoctask"
-require "rake/gempackagetask"
+spec = Gem::Specification.load(File.join(File.dirname(__FILE__), 'scrapi.gemspec'))
 desc "Generate documentation"
 Rake::RDocTask.new(:rdoc) do |rdoc|
@@ -14,7 +13,7 @@ Rake::RDocTask.new(:rdoc) do |rdoc|
   rdoc.title    = "Scraper"
   rdoc.options << "--line-numbers"
   rdoc.options << "--inline-source"
-  rdoc.rdoc_files.include("README")
+  rdoc.rdoc_files.include("README.rdoc")
   rdoc.rdoc_files.include("lib/**/*.rb")
 end
@@ -25,42 +24,28 @@ Rake::TestTask.new(:test) do |test|
   test.pattern = "test/**/*_test.rb"
   test.verbose = true
 end
+task :default=>:test
-desc "Package as a Gem"
-gem_spec = Gem::Specification.new do |spec|
+spec = Gem::Specification.load(Dir["*.gemspec"].first)
-  version = nil
-  File.readlines("CHANGELOG").each do |line|
-    if line =~ /Version (\d+\.\d+\.\d+)/
-      version = $1
-      break
-    end
-  end
-  raise RuntimeError, "Can't find version number in changelog" unless version
-  spec.name = "scrapi"
-  spec.version = version
-  spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
-  spec.description = <<-EOF
-scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
-EOF
-  spec.author = "Assaf Arkin"
-  spec.email = "assaf.arkin@gmail.com"
-  spec.homepage = "http://blog.labnotes.org/category/scrapi/"
+desc "Build the Gem"
+task :build do
+  sh "gem build #{spec.name}.gemspec"
+end
-  spec.files = FileList["{test,lib}/**/*", "README", "CHANGELOG", "Rakefile", "MIT-LICENSE"].to_a
-  spec.require_path = "lib"
-  spec.autorequire = "scrapi.rb"
-  spec.requirements << "Tidy"
-  spec.add_dependency "tidy",  ">=1.1.0"
-  spec.has_rdoc = true
-  spec.rdoc_options << "--main" << "README" << "--title" <<  "scrAPI toolkit for Ruby" << "--line-numbers"
-  spec.extra_rdoc_files = ["README"]
-  spec.rubyforge_project = "scrapi"
+desc "Install #{spec.name} locally"
+task :install=>:build do
+  sudo = "sudo" unless File.writable?( Gem::ConfigMap[:bindir])
+  sh "#{sudo} gem install #{spec.name}-#{spec.version}.gem"
 end
-gem = Rake::GemPackageTask.new(gem_spec) do |pkg|
-  pkg.need_tar = true
-  pkg.need_zip = true
+desc "Push new release to gemcutter and git tag"
+task :push=>["test", "build"] do
+  sh "git push"
+  puts "Tagging version #{spec.version} .."
+  sh "git tag v#{spec.version}"
+  sh "git push --tag"
+  puts "Building and pushing gem .."
+  sh "gem push #{spec.name}-#{spec.version}.gem"
 end

data/lib/html/htmlparser.rb CHANGED

File without changes

data/lib/html/selector.rb CHANGED

File without changes

data/lib/scraper/base.rb CHANGED

@@ -327,7 +327,7 @@ module Scraper
       # The following options are supported for parsing the HTML:
       # * <tt>:root_element</tt> -- The root element to scrape, see
       #   also #root_elements.
-      # * <tt>:parser_options</tt> -- Specifies which parser to use.
+      # * <tt>:parser</tt> -- Specifies which parser to use.
       #   (Typically, you set this for the class).
       # * <tt>:parser_options</tt> -- Options to pass to the parser.
       #
@@ -906,10 +906,10 @@ module Scraper
     #   end
     def skip(elements = nil)
       case elements
-      when Array: @skip.concat elements
-      when HTML::Node: @skip << elements
-      when nil: @skip << true
-      when true, false: @skip << elements
+      when Array then @skip.concat elements
+      when HTML::Node then @skip << elements
+      when nil then @skip << true
+      when true, false then @skip << elements
       end
       # Calling skip(element) as the last statement is
       # redundant by design.

data/lib/scraper/reader.rb CHANGED

@@ -10,7 +10,7 @@ require "net/http"
 require "net/https"
 begin
   require "rubygems"
-  require "tidy"
+  require "tidy_ffi"
 rescue LoadError
 end
@@ -95,6 +95,7 @@ module Scraper
     # * :redirect_limit -- Number of redirects allowed (default is 3).
     # * :user_agent -- The User-Agent header to send.
     # * :timeout -- HTTP open connection/read timeouts (in second).
+    # * :ssl_verify_mode -- SSL verification mode, defaults to OpenSSL::SSL::VERIFY_NONE
     #
     # It returns a hash with the following information:
     # * :url -- The URL of the requested page (may change by permanent redirect)
@@ -123,6 +124,7 @@ module Scraper
       begin
         http = Net::HTTP.new(uri.host, uri.port)
         http.use_ssl = (uri.scheme == "https")
+        http.verify_mode = options[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
         http.close_on_empty_response = true
         http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
         path = uri.path.dup # required so we don't modify path
@@ -153,12 +155,12 @@ module Scraper
         return Page[(options[:source_url] || uri), nil, nil,
                     options[:last_modified], options[:etag]]
       when Net::HTTPMovedPermanently
-        return read_page(response["location"], # New URL takes effect
+        return read_page((uri.merge(response["location"]) rescue nil), # New URL takes effect
                          :last_modified=>options[:last_modified],
                          :etag=>options[:etag],
                          :redirect_limit=>redirect_limit-1)
       when Net::HTTPRedirection
-        return read_page(response["location"],
+        return read_page((uri.merge(response["location"]) rescue nil),
                          :last_modified=>options[:last_modified],
                          :etag=>options[:etag],
                          :redirect_limit=>redirect_limit-1,
@@ -202,10 +204,8 @@ module Scraper
           find_tidy
           options = (options || {}).update(TIDY_OPTIONS)
           options[:input_encoding] = encoding.gsub("-", "").downcase
-          document = Tidy.open(options) do |tidy|
-            html = tidy.clean(content)
-            HTML::Document.new(html).find(:tag=>"html")
-          end
+          html = TidyFFI::Tidy.with_options(options).clean(content)
+          document = HTML::Document.new(html).find(:tag=>"html")
         when :html_parser
           document = HTML::HTMLParser.parse(content).root
         else
@@ -219,17 +219,18 @@ module Scraper
   protected
   module_function
     def find_tidy()
-      return if Tidy.path
+      return if TidyFFI.library_path
       begin
-        Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
+        TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
       rescue LoadError
         begin
-          Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
+          TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
         rescue LoadError
-          Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
+          TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
         end
       end
     end

data/lib/tidy/libtidy.dll CHANGED

File without changes

data/lib/tidy/libtidy.so CHANGED

File without changes

data/test/mock_net_http.rb CHANGED

File without changes

data/test/node_ext_test.rb CHANGED

@@ -7,7 +7,7 @@
 require "rubygems"
 require "test/unit"
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./lib/scrapi"
 class NodeExtTest < Test::Unit::TestCase

data/test/reader_test.rb CHANGED

@@ -12,8 +12,8 @@ require "webrick"
 require "webrick/https"
 require "logger"
 require "stringio"
-require File.join(File.dirname(__FILE__), "mock_net_http")
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./test/mock_net_http"
+require "./lib/scrapi"
 class ReaderTest < Test::Unit::TestCase
@@ -144,6 +144,25 @@ class ReaderTest < Test::Unit::TestCase
   end
+  def test_should_support_partial_location_redirection
+    # Test working redirection. Redirect only once and test response URL.
+    # Should be new URL for permanent redirect, same URL for all other redirects.
+    Net::HTTP.on_get do |address, path, headers|
+      if path == "/somewhere"
+        [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
+      else
+        response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
+        response["location"] = "somewhere"
+        [response, ""]
+      end
+    end
+    assert_nothing_raised() do
+      response = Reader.read_page("http://localhost/path?query")
+      assert_equal "http://localhost/somewhere", response.url.to_s
+    end
+  end
   def test_should_use_cache_control
     # Test Last Modified and ETag headers. First, that they are correctly
     # returned from headers to response object. Next, that passing right
@@ -220,22 +239,22 @@ class ReaderTest < Test::Unit::TestCase
     # Test content encoding returned from HTTP server.
     with_webrick do |server, params|
       server.mount_proc "/test.html" do |req,resp|
-        resp["Content-Type"] = "text/html; charset=my-encoding"
+        resp["Content-Type"] = "text/html; charset=ASCII"
         resp.body = "Content comes here"
       end
       page = Reader.read_page(WEBRICK_TEST_URL)
       page = Reader.parse_page(page.content, page.encoding)
-      assert_equal "my-encoding", page.encoding
+      assert_equal "ASCII", page.encoding
     end
     # Test content encoding in HTML http-equiv header
     # that overrides content encoding returned in HTTP.
     with_webrick do |server, params|
       server.mount_proc "/test.html" do |req,resp|
-        resp["Content-Type"] = "text/html; charset=my-encoding"
+        resp["Content-Type"] = "text/html; charset=ASCII"
         resp.body = %Q{
 <html>
 <head>
-<meta http-equiv="content-type" value="text/html; charset=other-encoding">
+<meta http-equiv="content-type" value="text/html; charset=UTF-8">
 </head>
 <body></body>
 </html>
@@ -243,7 +262,7 @@ class ReaderTest < Test::Unit::TestCase
       end
       page = Reader.read_page(WEBRICK_TEST_URL)
       page = Reader.parse_page(page.content, page.encoding)
-      assert_equal "other-encoding", page.encoding
+      assert_equal "UTF-8", page.encoding
     end
   end
@@ -251,7 +270,7 @@ class ReaderTest < Test::Unit::TestCase
     begin
       options = WEBRICK_OPTIONS.dup.update(
         :SSLEnable=>true,
-        :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
+        :SSLVerifyClient => OpenSSL::SSL::VERIFY_NONE,
         :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
       )
       server = WEBrick::HTTPServer.new(options)

data/test/scraper_test.rb CHANGED

@@ -8,8 +8,8 @@
 require "rubygems"
 require "time"
 require "test/unit"
-require File.join(File.dirname(__FILE__), "mock_net_http")
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./test/mock_net_http"
+require "./lib/scrapi"
 class ScraperTest < Test::Unit::TestCase
@@ -287,6 +287,7 @@ class ScraperTest < Test::Unit::TestCase
     scraper = new_scraper(html) do
       process "#1", :this1=>:text
       process "#1", :this2=>:text
+      attr_reader :this1, :this2
     end
     scraper.scrape
     assert_equal "this", scraper.this1
@@ -295,16 +296,18 @@ class ScraperTest < Test::Unit::TestCase
     scraper = new_scraper(html) do
       process "#1", :this1=>:text, :skip=>false
       process "#1", :this2=>:text
+      attr_reader :this1, :this2
     end
     scraper.scrape
     assert_equal "this", scraper.this1
     assert_equal "this", scraper.this2
     scraper = new_scraper(html) do
-      process "#1", :this1=>:text, :skip=>true do
-        false
+      process "#1", :this1=>:text, :skip=>true do |element|
+        element
       end
       process "#1", :this2=>:text
+      attr_reader :this1, :this2
     end
     scraper.scrape
     assert_equal "this", scraper.this1
@@ -351,7 +354,7 @@ class ScraperTest < Test::Unit::TestCase
         [response, <<-EOF
           <html>
             <head>
-              <meta http-equiv="content-type" value="text/html; charset=other-encoding">
+              <meta http-equiv="content-type" value="text/html; charset=ASCII">
             </head>
             <body>
               <div id="x"/>
@@ -371,7 +374,7 @@ class ScraperTest < Test::Unit::TestCase
     assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
     assert_equal time, scraper.page_info.last_modified
     assert_equal "etag", scraper.page_info.etag
-    assert_equal "other-encoding", scraper.page_info.encoding
+    assert_equal "ASCII", scraper.page_info.encoding
   end
@@ -563,6 +566,7 @@ class ScraperTest < Test::Unit::TestCase
       process "h1", [:text, :kls]=>Scraper.define {
         process "*", :text=>:text, :kls=>"@class"
       }
+      attr_reader :text, :kls
     end
     result = scraper.scrape
     assert "first",   result.text
@@ -618,6 +622,7 @@ class ScraperTest < Test::Unit::TestCase
     scraper = new_scraper(DIVS_ST_ND) do
       process_first "div", :div_id=>"@id", :div_text=>:text
+      attr_reader :div_id, :div_text
     end
     value = scraper.scrape
     assert_equal "1",     value.div_id
@@ -721,7 +726,7 @@ class ScraperTest < Test::Unit::TestCase
     # Extracting the attribute skips the second match.
     scraper = new_scraper(DIVS123) do
       process("div") { |element| @count +=1 }
-      define_method(:prepare) { @count = 1 }
+      define_method(:prepare) { |element| @count = 1 }
       define_method(:result) { @count }
     end
     result = scraper.scrape

data/test/selector_test.rb CHANGED

@@ -4,7 +4,7 @@
 # Developed for http://co.mments.com
 # Code and documention: http://labnotes.org
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./lib/scrapi"
 class SelectorTest < Test::Unit::TestCase

metadata CHANGED

@@ -1,82 +1,107 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.0
-specification_version: 1
 name: scrapi
 version: !ruby/object:Gem::Version
-  version: 1.2.0
-date: 2006-08-27 00:00:00 -07:00
-summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
-require_paths:
-- lib
-email: assaf.arkin@gmail.com
-homepage: http://blog.labnotes.org/category/scrapi/
-rubyforge_project: scrapi
-description: scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
-autorequire: scrapi.rb
-default_executable:
-bindir: bin
-has_rdoc: true
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-  - - ">"
-    - !ruby/object:Gem::Version
-      version: 0.0.0
-  version:
+  prerelease: false
+  segments:
+  - 2
+  - 0
+  - 0
+  version: 2.0.0
 platform: ruby
-signing_key:
-cert_chain:
-post_install_message:
 authors:
 - Assaf Arkin
+autorequire: scrapi.rb
+bindir: bin
+cert_chain: []
+date: 2010-11-10 00:00:00 -08:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: tidy_ffi
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 1
+        - 2
+        version: 0.1.2
+  type: :runtime
+  version_requirements: *id001
+description: |
+  scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
+email: assaf@labnotes.org
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.rdoc
 files:
+- test/mock_net_http.rb
 - test/node_ext_test.rb
+- test/reader_test.rb
 - test/scraper_test.rb
-- test/mock_net_http.rb
 - test/selector_test.rb
-- test/reader_test.rb
-- lib/scrapi.rb
-- lib/scraper
-- lib/tidy
-- lib/html
-- lib/scraper/reader.rb
-- lib/scraper/base.rb
-- lib/scraper/microformats.rb
-- lib/tidy/libtidy.so
-- lib/tidy/libtidy.dll
+- lib/html/document.rb
+- lib/html/htmlparser.rb
+- lib/html/node.rb
 - lib/html/node_ext.rb
 - lib/html/selector.rb
-- lib/html/node.rb
-- lib/html/version.rb
 - lib/html/tokenizer.rb
-- lib/html/document.rb
-- lib/html/htmlparser.rb
-- README
+- lib/html/version.rb
+- lib/scraper/base.rb
+- lib/scraper/microformats.rb
+- lib/scraper/reader.rb
+- lib/scrapi.rb
+- lib/tidy/libtidy.dll
+- lib/tidy/libtidy.so
+- README.rdoc
 - CHANGELOG
 - Rakefile
 - MIT-LICENSE
-test_files: []
+has_rdoc: true
+homepage: http://github.com/assaf/scrapi
+licenses: []
+post_install_message:
 rdoc_options:
 - --main
-- README
+- README.rdoc
 - --title
 - scrAPI toolkit for Ruby
 - --line-numbers
-extra_rdoc_files:
-- README
-executables: []
-extensions: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 1
+      - 9
+      - 1
+      version: 1.9.1
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
 requirements:
-- Tidy
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: tidy
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.1.0
-    version:
+- Tidy_ffi
+rubyforge_project: scrapi
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
+test_files: []