metainspector 1.9.2 → 1.9.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,3 @@
1
- = MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
2
-
3
1
  MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, and meta tags.
4
2
 
5
3
  = Installation
@@ -69,6 +67,16 @@ The full scraped document if accessible from:
69
67
 
70
68
  page.document # Nokogiri doc that you can use it to get any element from the page
71
69
 
70
+ = Error handling
71
+
72
+ You can check if the page has been successfully parsed with:
73
+
74
+ page.parsed? # Will return true if everything looks OK
75
+
76
+ In case there have been any errors, you can check them with:
77
+
78
+ page.errors # Will return an array with the error messages
79
+
72
80
  = Examples
73
81
 
74
82
  You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
@@ -114,9 +122,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
114
122
 
115
123
  * Get page.base_dir from the URL
116
124
  * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
117
- * Be able to set a timeout in seconds
118
125
  * If keywords seem to be separated by blank spaces, replace them with commas
119
- * Mocks
120
126
  * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
121
127
  * Autodiscover all available meta tags
122
128
 
@@ -9,14 +9,16 @@ require 'timeout'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :url, :scheme
12
+ attr_reader :url, :scheme, :errors
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
14
  # If no scheme given, set it to http:// by default
15
+
15
16
  def initialize(url, timeout = 20)
16
17
  @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
18
  @scheme = URI.parse(url).scheme || 'http'
18
19
  @timeout = timeout
19
20
  @data = Hashie::Rash.new('url' => @url)
21
+ @errors = []
20
22
  end
21
23
 
22
24
  # Returns the parsed document title, from the content of the <title> tag.
@@ -83,13 +85,16 @@ module MetaInspector
83
85
  @data.to_hash
84
86
  end
85
87
 
88
+ # Returns true if parsing has been successful
89
+ def parsed?
90
+ !@parsed_document.nil?
91
+ end
92
+
86
93
  # Returns the whole parsed document
87
94
  def parsed_document
88
95
  @parsed_document ||= Nokogiri::HTML(document)
89
-
90
96
  rescue Exception => e
91
- warn 'An exception occurred while trying to scrape the page!'
92
- warn e.message
97
+ add_fatal_error "Parsing exception: #{e.message}"
93
98
  end
94
99
 
95
100
  # Returns the original, unparsed document
@@ -97,15 +102,11 @@ module MetaInspector
97
102
  @document ||= Timeout::timeout(@timeout) { open(@url).read }
98
103
 
99
104
  rescue SocketError
100
- warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
101
- @scraped = false
105
+ add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
102
106
  rescue TimeoutError
103
- warn 'Timeout!!!'
104
- @scraped = false
107
+ add_fatal_error 'Timeout!!!'
105
108
  rescue Exception => e
106
- warn 'An exception occurred while trying to fetch the page!'
107
- warn e.message
108
- @scraped = false
109
+ add_fatal_error "Scraping exception: #{e.message}"
109
110
  end
110
111
 
111
112
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
@@ -145,6 +146,11 @@ module MetaInspector
145
146
 
146
147
  private
147
148
 
149
+ # Stores the error for later inspection
150
+ def add_fatal_error(error)
151
+ @errors << error
152
+ end
153
+
148
154
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
149
155
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
150
156
  def absolutify_url(url)
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.9.2"
4
+ VERSION = "1.9.3"
5
5
  end
@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'charguess', '1.3.20111021164500'
19
19
  gem.add_dependency 'rash', '0.3.2'
20
20
 
21
- gem.add_development_dependency 'rspec', '2.10.0'
21
+ gem.add_development_dependency 'rspec', '2.11.0'
22
22
  gem.add_development_dependency 'fakeweb', '1.3.0'
23
23
  gem.add_development_dependency 'awesome_print', '1.0.2'
24
24
  gem.add_development_dependency 'rake', '0.9.2.2'
@@ -231,6 +231,52 @@ describe MetaInspector do
231
231
  end
232
232
  end
233
233
 
234
+ describe 'exception handling' do
235
+ before(:each) do
236
+ FakeWeb.allow_net_connect = true
237
+ end
238
+
239
+ after(:each) do
240
+ FakeWeb.allow_net_connect = false
241
+ end
242
+
243
+ it "should handle timeouts" do
244
+ impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
245
+
246
+ expect {
247
+ title = impatient.title
248
+ }.to change { impatient.errors.size }
249
+
250
+ impatient.errors.first.should == "Timeout!!!"
251
+ end
252
+
253
+ it "should handle socket errors" do
254
+ nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
255
+
256
+ expect {
257
+ title = nowhere.title
258
+ }.to change { nowhere.errors.size }
259
+
260
+ nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
261
+ end
262
+
263
+ describe "parsed?" do
264
+ it "should return true if we have a parsed document" do
265
+ good = MetaInspector.new('http://w3clove.com')
266
+ title = good.title
267
+
268
+ good.parsed?.should == true
269
+ end
270
+
271
+ it "should return false if we don't have a parsed document" do
272
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
273
+ title = bad.title
274
+
275
+ bad.parsed?.should == false
276
+ end
277
+ end
278
+ end
279
+
234
280
  describe "regression tests" do
235
281
  describe "get image" do
236
282
  it "should find image on youtube" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 55
4
+ hash: 53
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 9
9
- - 2
10
- version: 1.9.2
9
+ - 3
10
+ version: 1.9.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-07-13 00:00:00 Z
18
+ date: 2012-07-22 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -72,12 +72,12 @@ dependencies:
72
72
  requirements:
73
73
  - - "="
74
74
  - !ruby/object:Gem::Version
75
- hash: 39
75
+ hash: 35
76
76
  segments:
77
77
  - 2
78
- - 10
78
+ - 11
79
79
  - 0
80
- version: 2.10.0
80
+ version: 2.11.0
81
81
  type: :development
82
82
  version_requirements: *id004
83
83
  - !ruby/object:Gem::Dependency