RubyGems - metainspector - Versions diffs - 1.9.11 → 1.10.0 - Mend

metainspector 1.9.11 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/README.rdoc +22 -18
data/lib/meta_inspector.rb +2 -2
data/lib/meta_inspector/scraper.rb +29 -6
data/lib/meta_inspector/version.rb +1 -1
data/spec/fixtures/{w3clove_faqs.response → markupvalidator_faqs.response} +23 -23
data/spec/fixtures/{twitter_w3clove.response → twitter_markupvalidator.response} +926 -926
data/spec/metainspector_spec.rb +89 -30
metadata +7 -7

data/spec/metainspector_spec.rb CHANGED

@@ -16,14 +16,16 @@ describe MetaInspector do
   FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
   FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
   FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
-  FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
-  FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
+  FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
+  FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
   FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
   FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
   FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
   FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
   FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
   FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
+  FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
+  FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
   describe 'Initialization' do
     it 'should accept an URL with a scheme' do
@@ -88,7 +90,7 @@ describe MetaInspector do
       end
       it "should find images on twitter" do
-        m = MetaInspector.new('https://twitter.com/w3clove')
+        m = MetaInspector.new('https://twitter.com/markupvalidator')
         m.images.length.should == 6
         m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
       end
@@ -150,30 +152,17 @@ describe MetaInspector do
     end
     it "should get correct absolute links for internal pages" do
-      m = MetaInspector.new('http://w3clove.com/faqs')
-      m.links.should == [ "http://w3clove.com/#",
-                          "http://w3clove.com/",
-                          "http://w3clove.com/faqs",
-                          "http://w3clove.com/plans-and-pricing",
-                          "http://w3clove.com/contact",
-                          "http://w3clove.com/charts/errors",
-                          "http://w3clove.com/credits",
-                          "http://w3clove.com/signin",
-                          "http://validator.w3.org",
-                          "http://www.sitemaps.org/",
-                          "http://jaimeiniesta.com/",
-                          "http://mendicantuniversity.org/",
-                          "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
-                          "http://majesticseacreature.com/",
-                          "http://school.mendicantuniversity.org/alumni/2011",
-                          "https://github.com/jaimeiniesta/w3clove",
-                          "http://w3clove.com",
-                          "http://w3clove.com/api_v1_reference",
-                          "https://twitter.com/w3clove",
-                          "http://twitter.com/share",
-                          "http://w3clove.com/terms_of_service",
-                          "http://twitter.com/W3CLove",
-                          "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
+      @m.internal_links.should == [ "http://pagerankalert.com/",
+                           "http://pagerankalert.com/es?language=es",
+                           "http://pagerankalert.com/users/sign_up",
+                           "http://pagerankalert.com/users/sign_in" ]
+    end
+    it "should get correct absolute links for external pages" do
+      @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
+                           "http://pagerankalert.posterous.com",
+                           "http://twitter.com/pagerankalert",
+                           "http://twitter.com/share" ]
     end
     it "should get correct absolute links, correcting relative links from URL not ending with slash" do
@@ -302,7 +291,7 @@ describe MetaInspector do
   describe 'to_hash' do
     it "should return a hash with all the values set" do
       @m = MetaInspector.new('http://pagerankalert.com')
-      @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
+      @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
     end
   end
@@ -316,7 +305,7 @@ describe MetaInspector do
     end
     it "should handle timeouts" do
-      impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
+      impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
       expect {
         title = impatient.title
@@ -335,6 +324,42 @@ describe MetaInspector do
       nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
     end
+    it "should parse images when parse_html_content_type_only is not specified" do
+      image_url = MetaInspector.new('http://pagerankalert.com/image.png')
+      desc = image_url.description
+      image_url.errors == nil
+      image_url.parsed? == true
+    end
+    it "should parse images when parse_html_content_type_only is false" do
+      image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
+      desc = image_url.description
+      image_url.errors == nil
+      image_url.parsed? == true
+    end
+    it "should handle errors when content is image/jpeg and html_content_type_only is true" do
+      image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
+      expect {
+        title = image_url.title
+      }.to change { image_url.errors.size }
+      image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
+    end
+    it "should handle errors when content is not text/html and html_content_type_only is true" do
+      tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
+      expect {
+        title = tar_url.title
+      }.to change { tar_url.errors.size }
+      tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
+    end
     describe "parsed?" do
       it "should return true if we have a parsed document" do
         good  = MetaInspector.new('http://pagerankalert.com')
@@ -344,12 +369,46 @@ describe MetaInspector do
       end
       it "should return false if we don't have a parsed document" do
-        bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
+        bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
         title = bad.title
         bad.parsed?.should == false
       end
+      it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
+        tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
+        title = tar.title
+        tar.parsed?.should == false
+      end
     end
   end
+  describe "content_type" do
+    it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
+      good = MetaInspector.new('http://pagerankalert.com/image.png')
+      title = good.title
+      good.parsed?.should == true
+      good.content_type == "image/jpeg"
+    end
+    it "should return the correct content type of the url if it is parsed correctly even for html pages" do
+      good = MetaInspector.new('http://pagerankalert.com')
+      title = good.title
+      good.parsed?.should == true
+      good.content_type == "text/html"
+    end
+    it "should return the correct content type of the url if it is not parsed correctly" do
+      bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
+      title = bad.title
+      bad.parsed?.should == false
+      bad.content_type == "image/jpeg"
+    end
+  end
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  hash: 37
+  hash: 63
   prerelease:
   segments:
   - 1
-  - 9
-  - 11
-  version: 1.9.11
+  - 10
+  - 0
+  version: 1.10.0
 platform: ruby
 authors:
 - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-09 00:00:00 Z
+date: 2012-11-15 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -146,14 +146,14 @@ files:
 - spec/fixtures/guardian.co.uk.response
 - spec/fixtures/international.response
 - spec/fixtures/iteh.at.response
+- spec/fixtures/markupvalidator_faqs.response
 - spec/fixtures/nonhttp.response
 - spec/fixtures/pagerankalert.com.response
 - spec/fixtures/protocol_relative.response
 - spec/fixtures/tea-tron.com.response
 - spec/fixtures/theonion-no-description.com.response
 - spec/fixtures/theonion.com.response
-- spec/fixtures/twitter_w3clove.response
-- spec/fixtures/w3clove_faqs.response
+- spec/fixtures/twitter_markupvalidator.response
 - spec/fixtures/wordpress_site.response
 - spec/fixtures/youtube.response
 - spec/metainspector_spec.rb