metainspector 1.16.1 → 1.17.0

spec/redirections_spec.rb CHANGED
@@ -9,7 +9,7 @@ describe MetaInspector do
        m = MetaInspector.new("http://facebook.com")
        m.title.should be_nil
        m.should_not be_ok
-       m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
+       m.exceptions.first.message.should == "redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
      end

      it "allows safe redirections when :allow_redirections => :safe" do
@@ -30,14 +30,14 @@ describe MetaInspector do
        m = MetaInspector.new("https://unsafe-facebook.com")
        m.title.should be_nil
        m.should_not be_ok
-       m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
+       m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
      end

      it "disallows unsafe redirections when :allow_redirections => :safe" do
        m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
        m.title.should be_nil
        m.should_not be_ok
-       m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
+       m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
      end

      it "allows unsafe redirections when :allow_redirections => :all" do
@@ -46,5 +46,22 @@ describe MetaInspector do
        m.should be_ok
      end
    end
+
+    describe "Redirections should update the base_uri" do
+      it "updates the base_uri on safe redirections" do
+        m = MetaInspector.new("http://facebook.com", :allow_redirections => :safe)
+        # Check for the title to make sure the request happens
+        m.title.should == "Hello From Facebook"
+        m.url.should == "https://www.facebook.com/"
+      end
+
+      it "updates the base_uri on all redirections" do
+        m = MetaInspector.new("http://facebook.com", :allow_redirections => :all)
+        # Check for the title to make sure the request happens
+        m.title.should == "Hello From Facebook"
+
+        m.url.should == "https://www.facebook.com/"
+      end
+    end
  end
end
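
The substantive change in this file: scraping failures are now surfaced as exception objects rather than prefixed strings. A minimal before/after sketch, using only names exercised by the specs above (the Facebook URLs come from the suite's FakeWeb stubs, not live requests):

    m = MetaInspector.new("http://facebook.com")
    m.ok?  # => false; the HTTP -> HTTPS redirection is forbidden by default

    # 1.16.1 collected formatted strings:
    #   m.errors.first  # => "Scraping exception: redirection forbidden: ..."

    # 1.17.0 keeps the exception objects, so callers can branch on class or message:
    m.exceptions.first.message  # => "redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"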
spec/request_spec.rb ADDED
@@ -0,0 +1,64 @@
+# -*- encoding: utf-8 -*-
+
+require File.join(File.dirname(__FILE__), "/spec_helper")
+
+describe MetaInspector::Request do
+
+  describe "read" do
+    it "should return the content of the page" do
+      page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
+
+      page_request.read[0..14].should == "<!DOCTYPE html>"
+    end
+  end
+
+  describe "content_type" do
+    it "should return the correct content type of the url for html pages" do
+      page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
+
+      page_request.content_type.should == "text/html"
+    end
+
+    it "should return the correct content type of the url for non html pages" do
+      image_request = MetaInspector::Request.new(url('http://pagerankalert.com/image.png'))
+
+      image_request.content_type.should == "image/png"
+    end
+  end
+
+  describe 'exception handling' do
+    before(:each) do
+      FakeWeb.allow_net_connect = true
+    end
+
+    after(:each) do
+      FakeWeb.allow_net_connect = false
+    end
+
+    it "should handle timeouts" do
+      impatient = MetaInspector::Request.new(url('http://example.com'), timeout: 0.0000000000001)
+
+      expect {
+        impatient.read.should be_nil
+      }.to change { impatient.exceptions.size }
+
+      impatient.exceptions.first.class.should == Timeout::Error
+    end
+
+    it "should handle socket errors" do
+      nowhere = MetaInspector::Request.new(url('http://caca232dsdsaer3sdsd-asd343.org'))
+
+      expect {
+        nowhere.read.should be_nil
+      }.to change { nowhere.exceptions.size }
+
+      nowhere.exceptions.first.class.should == SocketError
+    end
+  end
+
+  private
+
+  def url(initial_url)
+    MetaInspector::URL.new(initial_url)
+  end
+end
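
Taken together, these specs pin down the contract of the new MetaInspector::Request class: it wraps a MetaInspector::URL, exposes the fetched body and content type, and records network failures instead of raising them. A usage sketch limited to the calls shown above (the timeout option is the one from the timeout spec):

    url     = MetaInspector::URL.new('http://pagerankalert.com')
    request = MetaInspector::Request.new(url, timeout: 20)

    request.content_type  # => "text/html"
    html = request.read   # page body; nil when a Timeout::Error or SocketError was caught
    request.exceptions    # => [] on success; holds the caught exception objects on failure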
spec/url_spec.rb ADDED
@@ -0,0 +1,74 @@
+# -*- encoding: utf-8 -*-
+
+require File.join(File.dirname(__FILE__), "/spec_helper")
+
+describe MetaInspector::URL do
+  it "should normalize URLs" do
+    MetaInspector::URL.new('http://example.com').url.should == 'http://example.com/'
+  end
+
+  it 'should accept an URL with a scheme' do
+    MetaInspector::URL.new('http://example.com/').url.should == 'http://example.com/'
+  end
+
+  it "should use http:// as a default scheme" do
+    MetaInspector::URL.new('example.com').url.should == 'http://example.com/'
+  end
+
+  it "should accept an URL with international characters" do
+    MetaInspector::URL.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
+  end
+
+  it "should return the scheme" do
+    MetaInspector::URL.new('http://example.com').scheme.should == 'http'
+    MetaInspector::URL.new('https://example.com').scheme.should == 'https'
+    MetaInspector::URL.new('example.com').scheme.should == 'http'
+  end
+
+  it "should return the host" do
+    MetaInspector::URL.new('http://example.com').host.should == 'example.com'
+    MetaInspector::URL.new('https://example.com').host.should == 'example.com'
+    MetaInspector::URL.new('example.com').host.should == 'example.com'
+  end
+
+  it "should return the root url" do
+    MetaInspector::URL.new('http://example.com').root_url.should == 'http://example.com/'
+    MetaInspector::URL.new('https://example.com').root_url.should == 'https://example.com/'
+    MetaInspector::URL.new('example.com').root_url.should == 'http://example.com/'
+    MetaInspector::URL.new('http://example.com/faqs').root_url.should == 'http://example.com/'
+  end
+
+  describe "url=" do
+    it "should update the url" do
+      url = MetaInspector::URL.new('http://first.com/')
+
+      url.url = 'http://second.com/'
+      url.url.should == 'http://second.com/'
+    end
+
+    it "should add the missing scheme and normalize" do
+      url = MetaInspector::URL.new('http://first.com/')
+
+      url.url = 'second.com'
+      url.url.should == 'http://second.com/'
+    end
+  end
+
+  describe "exception handling" do
+    it "should handle URI::InvalidURIError" do
+      expect {
+        @malformed = MetaInspector::URL.new('javascript://')
+      }.to_not raise_error
+
+      @malformed.exceptions.first.class.should == URI::InvalidURIError
+    end
+
+    it "should handle URI::InvalidComponentError" do
+      expect {
+        @malformed = MetaInspector::URL.new('mailto:email(at)example.com')
+      }.to_not raise_error
+
+      @malformed.exceptions.first.class.should == URI::InvalidComponentError
+    end
+  end
+end
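
These specs also fix the construction contract: MetaInspector::URL normalizes input on the way in and never raises on malformed input, logging the failure instead. Condensed from the examples above:

    url = MetaInspector::URL.new('example.com')
    url.url       # => "http://example.com/"  (default scheme added, then normalized)
    url.root_url  # => "http://example.com/"

    bad = MetaInspector::URL.new('javascript://')  # does not raise
    bad.exceptions.first.class                     # => URI::InvalidURIError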
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  version: 1.16.1
+  version: 1.17.0
 platform: ruby
 authors:
 - Jaime Iniesta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-01 00:00:00.000000000 Z
+date: 2013-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -138,12 +138,20 @@ files:
 - README.md
 - Rakefile
 - lib/meta_inspector.rb
-- lib/meta_inspector/scraper.rb
+- lib/meta_inspector/deprecations.rb
+- lib/meta_inspector/document.rb
+- lib/meta_inspector/exception_log.rb
+- lib/meta_inspector/exceptionable.rb
+- lib/meta_inspector/parser.rb
+- lib/meta_inspector/request.rb
+- lib/meta_inspector/url.rb
 - lib/meta_inspector/version.rb
 - lib/metainspector.rb
 - meta_inspector.gemspec
 - samples/basic_scraping.rb
 - samples/spider.rb
+- spec/document_spec.rb
+- spec/exception_log_spec.rb
 - spec/fixtures/alazan.com.response
 - spec/fixtures/alazan_websolution.response
 - spec/fixtures/charset_000.response
@@ -171,9 +179,12 @@ files:
 - spec/fixtures/unsafe_https.facebook.com.response
 - spec/fixtures/wordpress_site.response
 - spec/fixtures/youtube.response
-- spec/metainspector_spec.rb
+- spec/meta_inspector_spec.rb
+- spec/parser_spec.rb
 - spec/redirections_spec.rb
+- spec/request_spec.rb
 - spec/spec_helper.rb
+- spec/url_spec.rb
 homepage: http://jaimeiniesta.github.io/metainspector/
 licenses: []
 metadata: {}
@@ -183,17 +194,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.3
+rubygems_version: 2.0.5
 signing_key:
 specification_version: 4
 summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
lib/meta_inspector/scraper.rb DELETED
@@ -1,283 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-require 'open-uri'
-require 'open_uri_redirections'
-require 'addressable/uri'
-require 'nokogiri'
-require 'hashie/rash'
-require 'timeout'
-
-# MetaInspector provides an easy way to scrape web pages and get its elements
-module MetaInspector
-  class Scraper
-    attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
-    attr_reader :allow_redirections, :verbose
-
-    # Initializes a new instance of MetaInspector, setting the URL to the one given
-    # Options:
-    # => timeout: defaults to 20 seconds
-    # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
-    # => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
-    # => document: the html of the url as a string
-    # => verbose: if the errors should be logged to the screen
-    def initialize(url, options = {})
-      options = defaults.merge(options)
-
-      @url                = with_default_scheme(normalize_url(url))
-      @scheme             = URI.parse(@url).scheme
-      @host               = URI.parse(@url).host
-      @root_url           = "#{@scheme}://#{@host}/"
-      @timeout            = options[:timeout]
-      @data               = Hashie::Rash.new
-      @errors             = []
-      @html_content_only  = options[:html_content_only]
-      @allow_redirections = options[:allow_redirections]
-      @verbose            = options[:verbose]
-      @document           = options[:document]
-    end
-
-    # Returns the parsed document title, from the content of the <title> tag.
-    # This is not the same as the meta_title tag
-    def title
-      @title ||= parsed_document.css('title').inner_text rescue nil
-    end
-
-    # A description getter that first checks for a meta description and if not present will
-    # guess by looking at the first paragraph with more than 120 characters
-    def description
-      meta_description.nil? ? secondary_description : meta_description
-    end
-
-    # Links found on the page, as absolute URLs
-    def links
-      @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
-    end
-
-    # Internal links found on the page, as absolute URLs
-    def internal_links
-      @internal_links ||= links.select {|link| host_from_url(link) == host }
-    end
-
-    # External links found on the page, as absolute URLs
-    def external_links
-      @external_links ||= links.select {|link| host_from_url(link) != host }
-    end
-
-    # Images found on the page, as absolute URLs
-    def images
-      @images ||= parsed_images.map{ |i| absolutify_url(i) }
-    end
-
-    # Returns the parsed image from Facebook's open graph property tags
-    # Most all major websites now define this property and is usually very relevant
-    # See doc at http://developers.facebook.com/docs/opengraph/
-    def image
-      meta_og_image || meta_twitter_image
-    end
-
-    # Returns the parsed document meta rss link
-    def feed
-      @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
-    end
-
-    # Returns the charset from the meta tags, looking for it in the following order:
-    # <meta charset='utf-8' />
-    # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
-    def charset
-      @charset ||= (charset_from_meta_charset || charset_from_content_type)
-    end
-
-    # Returns all parsed data as a nested Hash
-    def to_hash
-      scrape_meta_data
-
-      {
-        'url'            => url,
-        'title'          => title,
-        'links'          => links,
-        'internal_links' => internal_links,
-        'external_links' => external_links,
-        'images'         => images,
-        'charset'        => charset,
-        'feed'           => feed,
-        'content_type'   => content_type
-      }.merge @data.to_hash
-    end
-
-    # Returns the whole parsed document
-    def parsed_document
-      @parsed_document ||= Nokogiri::HTML(document)
-    rescue Exception => e
-      add_fatal_error "Parsing exception: #{e.message}"
-    end
-
-    # Returns the original, unparsed document
-    def document
-      @document ||= if html_content_only && content_type != "text/html"
-                      raise "The url provided contains #{content_type} content instead of text/html content" and nil
-                    else
-                      request.read
-                    end
-    rescue Exception => e
-      add_fatal_error "Scraping exception: #{e.message}"
-    end
-
-    # Returns the content_type of the fetched document
-    def content_type
-      @content_type ||= request.content_type
-    end
-
-    # Returns true if there are no errors
-    def ok?
-      errors.empty?
-    end
-
-    private
-
-    def defaults
-      {
-        :timeout           => 20,
-        :html_content_only => false,
-        :verbose           => false
-      }
-    end
-
-    # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
-    # meta name: keywords, description, robots, generator
-    # meta http-equiv: content-language, Content-Type
-    #
-    # It will first try with meta name="..." and if nothing found,
-    # with meta http-equiv="...", substituting "_" by "-"
-    # TODO: define respond_to? to return true on the meta_name methods
-    def method_missing(method_name)
-      if method_name.to_s =~ /^meta_(.*)/
-        key = $1
-
-        # special treatment for opengraph (og:) and twitter card (twitter:) tags
-        key.gsub!("_",":") if key =~ /^og_(.*)/ || key =~ /^twitter_(.*)/
-
-        scrape_meta_data
-
-        @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
-      else
-        super
-      end
-    end
-
-    # Makes the request to the server
-    def request
-      Timeout::timeout(timeout) { @request ||= open(url, {:allow_redirections => allow_redirections}) }
-
-    rescue TimeoutError
-      add_fatal_error 'Timeout!!!'
-    rescue SocketError
-      add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
-    rescue Exception => e
-      add_fatal_error "Scraping exception: #{e.message}"
-    end
-
-    # Scrapes all meta tags found
-    def scrape_meta_data
-      unless @data.meta
-        @data.meta!.name!
-        @data.meta!.property!
-        parsed_document.xpath("//meta").each do |element|
-          get_meta_name_or_property(element)
-        end
-      end
-    end
-
-    # Store meta tag value, looking at meta name or meta property
-    def get_meta_name_or_property(element)
-      name_or_property = element.attributes["name"] ? "name" : (element.attributes["property"] ? "property" : nil)
-      content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"] ? "value" : nil)
-
-      if !name_or_property.nil? && !content_or_value.nil?
-        @data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
-      end
-    end
-
-    def parsed_feed(format)
-      feed = parsed_document.search("//link[@type='application/#{format}+xml']").first
-      feed ? absolutify_url(feed.attributes['href'].value) : nil
-    end
-
-    def parsed_links
-      @parsed_links ||= cleanup_nokogiri_values(parsed_document.search("//a/@href"))
-    end
-
-    def parsed_images
-      @parsed_images ||= cleanup_nokogiri_values(parsed_document.search('//img/@src'))
-    end
-
-    # Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
-    def cleanup_nokogiri_values(results)
-      results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
-    end
-
-    # Stores the error for later inspection
-    def add_fatal_error(error)
-      warn error if verbose
-      @errors << error
-    end
-
-    # Normalize url to deal with characters that should be encodes, add trailing slash, convert to downcase...
-    def normalize_url(url)
-      Addressable::URI.parse(url).normalize.to_s
-    end
-
-    # Adds 'http' as default scheme, if there if none
-    def with_default_scheme(url)
-      URI.parse(url).scheme.nil? ? 'http://' + url : url
-    end
-
-    # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
-    # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
-    def absolutify_url(uri)
-      if uri =~ /^\w*\:/i
-        normalize_url(uri)
-      else
-        Addressable::URI.join(base_url, uri).normalize.to_s
-      end
-    rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
-      add_fatal_error "Link parsing exception: #{e.message}" and nil
-    end
-
-    # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
-    # or the url of the document if no <base> tag was found.
-    def base_url
-      base_href || @url
-    end
-
-    # Returns the value of the href attribute on the <base /> tag, if it exists
-    def base_href
-      parsed_document.search('base').first.attributes['href'].value rescue nil
-    end
-
-    # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
-    def unrelativize_url(url)
-      url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
-    end
-
-    # Extracts the host from a given URL
-    def host_from_url(url)
-      URI.parse(url).host
-    rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
-      add_fatal_error "Link parsing exception: #{e.message}" and nil
-    end
-
-    # Look for the first <p> block with 120 characters or more
-    def secondary_description
-      first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
-      first_long_paragraph ? first_long_paragraph.text : ''
-    end
-
-    def charset_from_meta_charset
-      parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
-    end
-
-    def charset_from_content_type
-      parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
-    end
-  end
-end
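
The monolithic Scraper deleted above is what the 1.17.0 file list replaces with smaller objects (deprecations.rb, document.rb, exception_log.rb, exceptionable.rb, parser.rb, request.rb, url.rb). Only Request and URL are confirmed by the specs in this diff; the mapping below is inferred from the file names and the old method names, not a documented API:

    # Old Scraper responsibility          -> new home (inferred)
    # normalize_url / with_default_scheme -> MetaInspector::URL (url_spec.rb)
    # request / document / content_type   -> MetaInspector::Request (request_spec.rb)
    # @errors / add_fatal_error           -> MetaInspector::ExceptionLog / Exceptionable
    # title, links, images, meta_* tags   -> MetaInspector::Parser / Document
    url = MetaInspector::URL.new('example.com')
    MetaInspector::Request.new(url).read  # fetch; failures land in request.exceptions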