RubyGems - metainspector - Versions diffs - 1.9.2 → 1.9.3 - Mend

metainspector 1.9.2 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/README.rdoc +10 -4
data/lib/meta_inspector/scraper.rb +17 -11
data/lib/meta_inspector/version.rb +1 -1
data/meta_inspector.gemspec +1 -1
data/spec/metainspector_spec.rb +46 -0
metadata +7 -7

data/README.rdoc CHANGED

@@ -1,5 +1,3 @@
-= MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
 MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, and meta tags.
 = Installation
@@ -69,6 +67,16 @@ The full scraped document if accessible from:
   page.document # Nokogiri doc that you can use it to get any element from the page
+= Errors handling
+You can check if the page has been successfully parsed with:
+  page.parsed?                # Will return true if everything looks OK
+In case there have been any errors, you can check them with:
+  page.errors                 # Will return an array with the error messages
 = Examples
 You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
@@ -114,9 +122,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
 * Get page.base_dir from the URL
 * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
-* Be able to set a timeout in seconds
 * If keywords seem to be separated by blank spaces, replace them with commas
-* Mocks
 * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
 * Autodiscover all available meta tags

data/lib/meta_inspector/scraper.rb CHANGED

@@ -9,14 +9,16 @@ require 'timeout'
 # MetaInspector provides an easy way to scrape web pages and get its elements
 module MetaInspector
   class Scraper
-    attr_reader :url, :scheme
+    attr_reader :url, :scheme, :errors
     # Initializes a new instance of MetaInspector, setting the URL to the one given
     # If no scheme given, set it to http:// by default
     def initialize(url, timeout = 20)
       @url      = URI.parse(url).scheme.nil? ? 'http://' + url : url
       @scheme   = URI.parse(url).scheme || 'http'
       @timeout  = timeout
       @data     = Hashie::Rash.new('url' => @url)
+      @errors   = []
     end
     # Returns the parsed document title, from the content of the <title> tag.
@@ -83,13 +85,16 @@ module MetaInspector
       @data.to_hash
     end
+    # Returns true if parsing has been successful
+    def parsed?
+      !@parsed_document.nil?
+    end
     # Returns the whole parsed document
     def parsed_document
       @parsed_document ||= Nokogiri::HTML(document)
       rescue Exception => e
-        warn 'An exception occurred while trying to scrape the page!'
-        warn e.message
+        add_fatal_error "Parsing exception: #{e.message}"
     end
     # Returns the original, unparsed document
@@ -97,15 +102,11 @@ module MetaInspector
       @document ||= Timeout::timeout(@timeout) { open(@url).read }
       rescue SocketError
-        warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
-        @scraped = false
+        add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
       rescue TimeoutError
-        warn 'Timeout!!!'
-        @scraped = false
+        add_fatal_error 'Timeout!!!'
       rescue Exception => e
-        warn 'An exception occurred while trying to fetch the page!'
-        warn e.message
-        @scraped = false
+        add_fatal_error "Scraping exception: #{e.message}"
     end
     # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
@@ -145,6 +146,11 @@ module MetaInspector
     private
+    # Stores the error for later inspection
+    def add_fatal_error(error)
+      @errors << error
+    end
     # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
     # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
     def absolutify_url(url)

data/lib/meta_inspector/version.rb CHANGED

@@ -1,5 +1,5 @@
 # -*- encoding: utf-8 -*-
 module MetaInspector
-  VERSION = "1.9.2"
+  VERSION = "1.9.3"
 end

data/meta_inspector.gemspec CHANGED

@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
   gem.add_dependency 'charguess', '1.3.20111021164500'
   gem.add_dependency 'rash', '0.3.2'
-  gem.add_development_dependency 'rspec', '2.10.0'
+  gem.add_development_dependency 'rspec', '2.11.0'
   gem.add_development_dependency 'fakeweb', '1.3.0'
   gem.add_development_dependency 'awesome_print', '1.0.2'
   gem.add_development_dependency 'rake', '0.9.2.2'

data/spec/metainspector_spec.rb CHANGED

@@ -231,6 +231,52 @@ describe MetaInspector do
     end
   end
+  describe 'exception handling' do
+    before(:each) do
+      FakeWeb.allow_net_connect = true
+    end
+    after(:each) do
+      FakeWeb.allow_net_connect = false
+    end
+    it "should handle timeouts" do
+      impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
+      expect {
+        title = impatient.title
+      }.to change { impatient.errors.size }
+      impatient.errors.first.should == "Timeout!!!"
+    end
+    it "should handle socket errors" do
+      nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
+      expect {
+        title = nowhere.title
+      }.to change { nowhere.errors.size }
+      nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
+    end
+    describe "parsed?" do
+      it "should return true if we have a parsed document" do
+        good  = MetaInspector.new('http://w3clove.com')
+        title = good.title
+        good.parsed?.should == true
+      end
+      it "should return false if we don't have a parsed document" do
+        bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
+        title = bad.title
+        bad.parsed?.should == false
+      end
+    end
+  end
   describe "regression tests" do
     describe "get image" do
       it "should find image on youtube" do

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  hash: 55
+  hash: 53
   prerelease:
   segments:
   - 1
   - 9
-  - 2
-  version: 1.9.2
+  - 3
+  version: 1.9.3
 platform: ruby
 authors:
 - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-13 00:00:00 Z
+date: 2012-07-22 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,12 +72,12 @@ dependencies:
     requirements:
     - - "="
       - !ruby/object:Gem::Version
-        hash: 39
+        hash: 35
         segments:
         - 2
-        - 10
+        - 11
         - 0
-        version: 2.10.0
+        version: 2.11.0
   type: :development
   version_requirements: *id004
 - !ruby/object:Gem::Dependency