RubyGems - metainspector - Versions diffs - 1.10.0 → 1.10.1 - Mend

metainspector 1.10.0 → 1.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/README.rdoc +1 -1
data/lib/meta_inspector/scraper.rb +9 -13
data/lib/meta_inspector/version.rb +1 -1
data/spec/fixtures/international.response +8 -0
data/spec/metainspector_spec.rb +46 -18
metadata +4 -4

data/README.rdoc CHANGED Viewed

@@ -34,7 +34,7 @@ You can set a different timeout with a second parameter, like this:
   page = MetaInspector.new('markupvalidator.com', :timeout => 5) # this would wait just 5 seconds to timeout
-Metainspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
+MetaInspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
   page = MetaInspector.new('markupvalidator.com', :html_content_only => true)

data/lib/meta_inspector/scraper.rb CHANGED Viewed

@@ -16,6 +16,7 @@ module MetaInspector
     # => timeout: defaults to 20 seconds
     # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
     def initialize(url, options = {})
+      url       = encode_url(url)
       @url      = URI.parse(url).scheme.nil? ? 'http://' + url : url
       @scheme   = URI.parse(@url).scheme
       @host     = URI.parse(@url).host
@@ -53,21 +54,11 @@ module MetaInspector
       @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
     end
-    def absolute_links
-      warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
-      links
-    end
     # Images found on the page, as absolute URLs
     def images
       @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
     end
-    def absolute_images
-      warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
-      images
-    end
     # Returns the parsed document meta rss links
     def feed
       @data.feed ||= parsed_document.xpath("//link").select{ |link|
@@ -112,7 +103,7 @@ module MetaInspector
     # Returns the original, unparsed document
     def document
-      @document ||= Timeout::timeout(@timeout) {
+      @document ||= Timeout::timeout(@timeout) {
         req = open(@url)
         @content_type = @data.content_type = req.content_type
@@ -186,13 +177,18 @@ module MetaInspector
       @errors << error
     end
+    # Encode url to deal with international characters
+    def encode_url(url)
+      URI.encode(url).to_s.gsub("%23", "#")
+    end
     # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
     # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
     def absolutify_url(url)
       if url =~ /^\w*\:/i
-        url
+        encode_url(url)
       else
-        URI.parse(@root_url).merge(URI.encode(url)).to_s.gsub("%23", "#")
+        URI.parse(@root_url).merge(encode_url(url)).to_s
       end
     end

data/lib/meta_inspector/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # -*- encoding: utf-8 -*-
 module MetaInspector
-  VERSION = "1.10.0"
+  VERSION = "1.10.1"
 end

data/spec/fixtures/international.response CHANGED Viewed

@@ -17,10 +17,18 @@ Cache-control: private
   <title>International chars</title>
 </head>
 <body>
+  <h1>Internal links:</h1>
   <a href="/españa.asp">España</a>
   <a href="/romanée">Romanée</a>
   <a href="/faqs#camión">FAQs camión</a>
   <a href="/search?q=camión">Search camión</a>
   <a href="/search?q=españa#top">Search España at top</a>
+  <h1>External links:</h1>
+  <a href="http://example.com/españa.asp">España</a>
+  <a href="http://example.com/romanée">Romanée</a>
+  <a href="http://example.com/faqs#camión">FAQs camión</a>
+  <a href="http://example.com/search?q=camión">Search camión</a>
+  <a href="http://example.com/search?q=españa#top">Search España at top</a>
 </body>
 </html>

data/spec/metainspector_spec.rb CHANGED Viewed

@@ -29,13 +29,15 @@ describe MetaInspector do
   describe 'Initialization' do
     it 'should accept an URL with a scheme' do
-      @m = MetaInspector.new('http://pagerankalert.com')
-      @m.url.should == 'http://pagerankalert.com'
+      MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
     end
     it "should use http:// as a default scheme" do
-      @m = MetaInspector.new('pagerankalert.com')
-      @m.url.should == 'http://pagerankalert.com'
+      MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
+    end
+    it "should accept an URL with international characters" do
+      MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
     end
     it "should store the scheme" do
@@ -51,9 +53,10 @@ describe MetaInspector do
     end
     it "should store the root url" do
-      MetaInspector.new('http://pagerankalert.com').root_url.should   == 'http://pagerankalert.com/'
-      MetaInspector.new('https://pagerankalert.com').root_url.should  == 'https://pagerankalert.com/'
-      MetaInspector.new('pagerankalert.com').root_url.should          == 'http://pagerankalert.com/'
+      MetaInspector.new('http://pagerankalert.com').root_url.should     == 'http://pagerankalert.com/'
+      MetaInspector.new('https://pagerankalert.com').root_url.should    == 'https://pagerankalert.com/'
+      MetaInspector.new('pagerankalert.com').root_url.should            == 'http://pagerankalert.com/'
+      MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
     end
   end
@@ -171,19 +174,44 @@ describe MetaInspector do
                           "http://alazan.com/faqs.asp" ]
     end
-    it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
-      m = MetaInspector.new('http://international.com')
-      m.links.should == [ "http://international.com/espa%C3%B1a.asp",
-                          "http://international.com/roman%C3%A9e",
-                          "http://international.com/faqs#cami%C3%B3n",
-                          "http://international.com/search?q=cami%C3%B3n",
-                          "http://international.com/search?q=espa%C3%B1a#top"]
-    end
     it "should return empty array if no links found" do
       m = MetaInspector.new('http://example.com/empty')
       m.links.should == []
     end
+    describe "links with international characters" do
+      it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
+        m = MetaInspector.new('http://international.com')
+        m.links.should == [ "http://international.com/espa%C3%B1a.asp",
+                            "http://international.com/roman%C3%A9e",
+                            "http://international.com/faqs#cami%C3%B3n",
+                            "http://international.com/search?q=cami%C3%B3n",
+                            "http://international.com/search?q=espa%C3%B1a#top",
+                            "http://example.com/espa%C3%B1a.asp",
+                            "http://example.com/roman%C3%A9e",
+                            "http://example.com/faqs#cami%C3%B3n",
+                            "http://example.com/search?q=cami%C3%B3n",
+                            "http://example.com/search?q=espa%C3%B1a#top"]
+      end
+      it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
+        m = MetaInspector.new('http://international.com')
+        m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
+                                     "http://international.com/roman%C3%A9e",
+                                     "http://international.com/faqs#cami%C3%B3n",
+                                     "http://international.com/search?q=cami%C3%B3n",
+                                     "http://international.com/search?q=espa%C3%B1a#top"]
+      end
+      it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
+        m = MetaInspector.new('http://international.com')
+        m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
+                                     "http://example.com/roman%C3%A9e",
+                                     "http://example.com/faqs#cami%C3%B3n",
+                                     "http://example.com/search?q=cami%C3%B3n",
+                                     "http://example.com/search?q=espa%C3%B1a#top"]
+      end
+    end
   end
   describe 'Non-HTTP links' do
@@ -342,7 +370,7 @@ describe MetaInspector do
     it "should handle errors when content is image/jpeg and html_content_type_only is true" do
       image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
       expect {
         title = image_url.title
       }.to change { image_url.errors.size }
@@ -352,7 +380,7 @@ describe MetaInspector do
     it "should handle errors when content is not text/html and html_content_type_only is true" do
       tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
       expect {
         title = tar_url.title
       }.to change { tar_url.errors.size }

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  hash: 63
+  hash: 61
   prerelease:
   segments:
   - 1
   - 10
-  - 0
-  version: 1.10.0
+  - 1
+  version: 1.10.1
 platform: ruby
 authors:
 - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-15 00:00:00 Z
+date: 2012-11-16 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   version_requirements: &id001 !ruby/object:Gem::Requirement