RubyGems - metainspector - Versions diffs - 1.10.2 → 1.11.0 - Mend

metainspector 1.10.2 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

data/README.rdoc +2 -2
data/lib/meta_inspector/scraper.rb +98 -67
data/lib/meta_inspector/version.rb +1 -1
data/spec/metainspector_spec.rb +17 -33
metadata +4 -4

data/README.rdoc CHANGED Viewed

@@ -47,7 +47,7 @@ Then you can see the scraped data like this:
   page.title              # title of the page, as string
   page.links              # array of strings, with every link found on the page as an absolute URL
   page.internal_links     # array of strings, with every internal link found on the page as an absolute URL
-  page.extrenal_links     # array of strings, with every external link found on the page as an absolute URL
+  page.external_links     # array of strings, with every external link found on the page as an absolute URL
   page.meta_description   # meta description, as string
   page.description        # returns the meta description, or the first long paragraph if no meta description is found
   page.meta_keywords      # meta keywords, as string
@@ -85,7 +85,7 @@ The full scraped document if accessible from:
 You can check if the page has been successfully parsed with:
-  page.parsed?                # Will return true if everything looks OK
+  page.ok?                    # Will return true if everything looks OK
 In case there have been any errors, you can check them with:

data/lib/meta_inspector/scraper.rb CHANGED Viewed

@@ -8,90 +8,93 @@ require 'timeout'
 # MetaInspector provides an easy way to scrape web pages and get its elements
 module MetaInspector
   class Scraper
-    attr_reader :url, :scheme, :host, :root_url, :errors, :content_type
+    attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
     # Initializes a new instance of MetaInspector, setting the URL to the one given
-    # If no scheme given, set it to http:// by default
     # Options:
     # => timeout: defaults to 20 seconds
     # => html_content_only: if an exception should be raised if request content-type is not text/html. Defaults to false
     def initialize(url, options = {})
-      url       = encode_url(url)
-      @url      = URI.parse(url).scheme.nil? ? 'http://' + url : url
+      @url      = with_default_scheme(encode_url(url))
       @scheme   = URI.parse(@url).scheme
       @host     = URI.parse(@url).host
       @root_url = "#{@scheme}://#{@host}/"
       @timeout  = options[:timeout] || 20
-      @data     = Hashie::Rash.new('url' => @url)
+      @data     = Hashie::Rash.new
       @errors   = []
       @html_content_only = options[:html_content_only] || false
     end
     # Returns the parsed document title, from the content of the <title> tag.
-    # This is not the same as the meta_tite tag
+    # This is not the same as the meta_title tag
     def title
-      @data.title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
+      @title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
     end
     # A description getter that first checks for a meta description and if not present will
-    # guess by looking grabbing the first paragraph > 120 characters
+    # guess by looking at the first paragraph with more than 120 characters
     def description
       meta_description.nil? ? secondary_description : meta_description
     end
     # Links found on the page, as absolute URLs
     def links
-      @data.links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
+      @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
     end
     # Internal links found on the page, as absolute URLs
     def internal_links
-      @data.internal_links ||= links.select {|link| URI.parse(link).host == @host }
+      @internal_links ||= links.select {|link| URI.parse(link).host == host }
     end
     # External links found on the page, as absolute URLs
     def external_links
-      @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
+      @external_links ||= links.select {|link| URI.parse(link).host != host }
     end
     # Images found on the page, as absolute URLs
     def images
-      @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
+      @images ||= parsed_images.map{ |i| absolutify_url(i) }
+    end
+    # Returns the parsed image from Facebook's open graph property tags
+    # Most all major websites now define this property and is usually very relevant
+    # See doc at http://developers.facebook.com/docs/opengraph/
+    def image
+      meta_og_image
     end
     # Returns the parsed document meta rss links
     def feed
-      @data.feed ||= parsed_document.xpath("//link").select{ |link|
+      @feed ||= parsed_document.xpath("//link").select{ |link|
           link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
         }.map { |link|
           absolutify_url(link.attributes["href"].value)
         }.first rescue nil
     end
-    # Returns the parsed image from Facebook's open graph property tags
-    # Most all major websites now define this property and is usually very relevant
-    # See doc at http://developers.facebook.com/docs/opengraph/
-    def image
-      meta_og_image
-    end
     # Returns the charset from the meta tags, looking for it in the following order:
     # <meta charset='utf-8' />
     # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
     def charset
-      @data.charset ||= (charset_from_meta_charset || charset_from_content_type)
+      @charset ||= (charset_from_meta_charset || charset_from_content_type)
     end
     # Returns all parsed data as a nested Hash
     def to_hash
-      # TODO: find a better option to populate the data to the Hash
-      image;images;feed;links;charset;title;meta_keywords;internal_links;external_links
-      @data.to_hash
-    end
+      scrape_meta_data
-    # Returns true if parsing has been successful
-    def parsed?
-      !@parsed_document.nil?
+      {
+        'url' => url,
+        'title' => title,
+        'links' => links,
+        'internal_links' => internal_links,
+        'external_links' => external_links,
+        'images' => images,
+        'charset' => charset,
+        'feed' => feed,
+        'content_type' => content_type
+      }.merge @data.to_hash
     end
     # Returns the whole parsed document
@@ -103,24 +106,33 @@ module MetaInspector
     # Returns the original, unparsed document
     def document
-      @document ||= Timeout::timeout(@timeout) {
-        req = open(@url)
-        @content_type = @data.content_type = req.content_type
+      @document ||= if html_content_only && content_type != "text/html"
+                      raise "The url provided contains #{content_type} content instead of text/html content" and nil
+                    else
+                      request.read
+                    end
+      rescue Exception => e
+        add_fatal_error "Scraping exception: #{e.message}"
+    end
-        if @html_content_only && @content_type != "text/html"
-           raise "The url provided contains #{@content_type} content instead of text/html content"
-        end
+    # Returns the content_type of the fetched document
+    def content_type
+      @content_type ||= request.content_type
+    end
-        req.read
-      }
+    # Returns true if there are no errors
+    def ok?
+      errors.empty?
+    end
-      rescue SocketError
-        add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
-      rescue TimeoutError
-        add_fatal_error 'Timeout!!!'
-      rescue Exception => e
-        add_fatal_error "Scraping exception: #{e.message}"
+    ##### DEPRECATIONS ####
+    def parsed?
+      warn "the parsed? method has been deprecated, please use ok? instead"
+      !@parsed_document.nil?
     end
+    ##### DEPRECATIONS ####
+    private
     # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
     # meta name: keywords, description, robots, generator
@@ -132,43 +144,57 @@ module MetaInspector
     def method_missing(method_name)
       if method_name.to_s =~ /^meta_(.*)/
         key = $1
-        #special treatment for og:
-        if key =~ /^og_(.*)/
-          key = "og:#{$1}"
-        end
-        unless @data.meta
-          @data.meta!.name!
-          @data.meta!.property!
-          parsed_document.xpath("//meta").each do |element|
-            if element.attributes["content"]
-              if element.attributes["name"]
-                @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
-              end
-              if element.attributes["property"]
-                @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
-              end
-            end
-          end
-        end
+        key = "og:#{$1}" if key =~ /^og_(.*)/ # special treatment for og:
+        scrape_meta_data
         @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
       else
         super
       end
     end
-    private
+    # Makes the request to the server
+    def request
+      Timeout::timeout(timeout) { @request ||= open(url) }
+      rescue TimeoutError
+        add_fatal_error 'Timeout!!!'
+      rescue SocketError
+        add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
+      rescue Exception => e
+        add_fatal_error "Scraping exception: #{e.message}"
+    end
+    # Scrapes all meta tags found
+    def scrape_meta_data
+      unless @data.meta
+        @data.meta!.name!
+        @data.meta!.property!
+        parsed_document.xpath("//meta").each do |element|
+          if element.attributes["content"]
+            if element.attributes["name"]
+              @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
+            end
+            if element.attributes["property"]
+              @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
+            end
+          end
+        end
+      end
+    end
     def parsed_links
       @parsed_links ||= parsed_document.search("//a") \
-                        .map {|link| link.attributes["href"] \
-                        .to_s.strip}.uniq rescue []
+                          .map {|link| link.attributes["href"] \
+                          .to_s.strip}.uniq rescue []
     end
     def parsed_images
       @parsed_images ||= parsed_document.search('//img') \
-                                        .reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
-                                        .map{ |i| i.attributes['src'].value }.uniq
+                           .reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
+                           .map{ |i| i.attributes['src'].value }.uniq
     end
     # Stores the error for later inspection
@@ -182,13 +208,18 @@ module MetaInspector
       URI.encode(url).to_s.gsub("%23", "#")
     end
+    # Adds 'http' as default scheme, if there is none
+    def with_default_scheme(url)
+      URI.parse(url).scheme.nil? ? 'http://' + url : url
+    end
     # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
     # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
     def absolutify_url(url)
       if url =~ /^\w*\:/i
         encode_url(url)
       else
-        URI.parse(@root_url).merge(encode_url(url)).to_s
+        URI.parse(root_url).merge(encode_url(url)).to_s
       end
     rescue URI::InvalidURIError => e
       add_fatal_error "Link parsing exception: #{e.message}" and nil

data/lib/meta_inspector/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # -*- encoding: utf-8 -*-
 module MetaInspector
-  VERSION = "1.10.2"
+  VERSION = "1.11.0"
 end

data/spec/metainspector_spec.rb CHANGED Viewed

@@ -25,7 +25,7 @@ describe MetaInspector do
   FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
   FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
   FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
-  FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
+  FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
   FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
   describe 'Initialization' do
@@ -373,16 +373,14 @@ describe MetaInspector do
       image_url = MetaInspector.new('http://pagerankalert.com/image.png')
       desc = image_url.description
-      image_url.errors == nil
-      image_url.parsed? == true
+      image_url.should be_ok
     end
     it "should parse images when parse_html_content_type_only is false" do
       image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
       desc = image_url.description
-      image_url.errors == nil
-      image_url.parsed? == true
+      image_url.should be_ok
     end
     it "should handle errors when content is image/jpeg and html_content_type_only is true" do
@@ -392,7 +390,7 @@ describe MetaInspector do
         title = image_url.title
       }.to change { image_url.errors.size }
-      image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
+      image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
     end
     it "should handle errors when content is not text/html and html_content_type_only is true" do
@@ -405,55 +403,41 @@ describe MetaInspector do
       tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
     end
-    describe "parsed?" do
-      it "should return true if we have a parsed document" do
+    describe "ok?" do
+      it "should return true if we have no errors" do
         good  = MetaInspector.new('http://pagerankalert.com')
-        title = good.title
+        good.to_hash
-        good.parsed?.should == true
+        good.should be_ok
       end
-      it "should return false if we don't have a parsed document" do
-        bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
-        title = bad.title
+      it "should return false if there are errors" do
+        bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
+        bad.title
-        bad.parsed?.should == false
+        bad.should_not be_ok
       end
       it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
         tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
         title = tar.title
-        tar.parsed?.should == false
+        tar.should_not be_ok
       end
     end
   end
   describe "content_type" do
-    it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
+    it "should return the correct content type of the url for non html pages" do
       good = MetaInspector.new('http://pagerankalert.com/image.png')
-      title = good.title
-      good.parsed?.should == true
-      good.content_type == "image/jpeg"
+      good.content_type.should == "image/png"
     end
-    it "should return the correct content type of the url if it is parsed correctly even for html pages" do
+    it "should return the correct content type of the url for html pages" do
       good = MetaInspector.new('http://pagerankalert.com')
-      title = good.title
-      good.parsed?.should == true
-      good.content_type == "text/html"
+      good.content_type.should == "text/html"
     end
-    it "should return the correct content type of the url if it is not parsed correctly" do
-      bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
-      title = bad.title
-      bad.parsed?.should == false
-      bad.content_type == "image/jpeg"
-    end
   end
 end

metadata CHANGED Viewed

@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
   prerelease:
   segments:
   - 1
-  - 10
-  - 2
-  version: 1.10.2
+  - 11
+  - 0
+  version: 1.11.0
 platform: ruby
 authors:
 - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-17 00:00:00 Z
+date: 2012-11-26 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   version_requirements: &id001 !ruby/object:Gem::Requirement