metainspector 1.10.0 → 1.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
 - data/lib/meta_inspector/scraper.rb +9 -13
 - data/lib/meta_inspector/version.rb +1 -1
 - data/spec/fixtures/international.response +8 -0
 - data/spec/metainspector_spec.rb +46 -18
 - metadata +4 -4
 
    
        data/README.rdoc
    CHANGED
    
    | 
         @@ -34,7 +34,7 @@ You can set a different timeout with a second parameter, like this: 
     | 
|
| 
       34 
34 
     | 
    
         | 
| 
       35 
35 
     | 
    
         
             
              page = MetaInspector.new('markupvalidator.com', :timeout => 5) # this would wait just 5 seconds to timeout
         
     | 
| 
       36 
36 
     | 
    
         | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
      
 37 
     | 
    
         
            +
            MetaInspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
         
     | 
| 
       38 
38 
     | 
    
         | 
| 
       39 
39 
     | 
    
         
             
              page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
         
     | 
| 
       40 
40 
     | 
    
         | 
| 
         @@ -16,6 +16,7 @@ module MetaInspector 
     | 
|
| 
       16 
16 
     | 
    
         
             
                # => timeout: defaults to 20 seconds
         
     | 
| 
       17 
17 
     | 
    
         
             
                # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
         
     | 
| 
       18 
18 
     | 
    
         
             
                def initialize(url, options = {})
         
     | 
| 
      
 19 
     | 
    
         
            +
                  url       = encode_url(url)
         
     | 
| 
       19 
20 
     | 
    
         
             
                  @url      = URI.parse(url).scheme.nil? ? 'http://' + url : url
         
     | 
| 
       20 
21 
     | 
    
         
             
                  @scheme   = URI.parse(@url).scheme
         
     | 
| 
       21 
22 
     | 
    
         
             
                  @host     = URI.parse(@url).host
         
     | 
| 
         @@ -53,21 +54,11 @@ module MetaInspector 
     | 
|
| 
       53 
54 
     | 
    
         
             
                  @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
         
     | 
| 
       54 
55 
     | 
    
         
             
                end
         
     | 
| 
       55 
56 
     | 
    
         | 
| 
       56 
     | 
    
         
            -
                def absolute_links
         
     | 
| 
       57 
     | 
    
         
            -
                  warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
         
     | 
| 
       58 
     | 
    
         
            -
                  links
         
     | 
| 
       59 
     | 
    
         
            -
                end
         
     | 
| 
       60 
     | 
    
         
            -
             
     | 
| 
       61 
57 
     | 
    
         
             
                # Images found on the page, as absolute URLs
         
     | 
| 
       62 
58 
     | 
    
         
             
                def images
         
     | 
| 
       63 
59 
     | 
    
         
             
                  @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
         
     | 
| 
       64 
60 
     | 
    
         
             
                end
         
     | 
| 
       65 
61 
     | 
    
         | 
| 
       66 
     | 
    
         
            -
                def absolute_images
         
     | 
| 
       67 
     | 
    
         
            -
                  warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
         
     | 
| 
       68 
     | 
    
         
            -
                  images
         
     | 
| 
       69 
     | 
    
         
            -
                end
         
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
62 
     | 
    
         
             
                # Returns the parsed document meta rss links
         
     | 
| 
       72 
63 
     | 
    
         
             
                def feed
         
     | 
| 
       73 
64 
     | 
    
         
             
                  @data.feed ||= parsed_document.xpath("//link").select{ |link|
         
     | 
| 
         @@ -112,7 +103,7 @@ module MetaInspector 
     | 
|
| 
       112 
103 
     | 
    
         | 
| 
       113 
104 
     | 
    
         
             
                # Returns the original, unparsed document
         
     | 
| 
       114 
105 
     | 
    
         
             
                def document
         
     | 
| 
       115 
     | 
    
         
            -
                  @document ||= Timeout::timeout(@timeout) { 
     | 
| 
      
 106 
     | 
    
         
            +
                  @document ||= Timeout::timeout(@timeout) {
         
     | 
| 
       116 
107 
     | 
    
         
             
                    req = open(@url)
         
     | 
| 
       117 
108 
     | 
    
         
             
                    @content_type = @data.content_type = req.content_type
         
     | 
| 
       118 
109 
     | 
    
         | 
| 
         @@ -186,13 +177,18 @@ module MetaInspector 
     | 
|
| 
       186 
177 
     | 
    
         
             
                  @errors << error
         
     | 
| 
       187 
178 
     | 
    
         
             
                end
         
     | 
| 
       188 
179 
     | 
    
         | 
| 
      
 180 
     | 
    
         
            +
                # Encode url to deal with international characters
         
     | 
| 
      
 181 
     | 
    
         
            +
                def encode_url(url)
         
     | 
| 
      
 182 
     | 
    
         
            +
                  URI.encode(url).to_s.gsub("%23", "#")
         
     | 
| 
      
 183 
     | 
    
         
            +
                end
         
     | 
| 
      
 184 
     | 
    
         
            +
             
     | 
| 
       189 
185 
     | 
    
         
             
                # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
         
     | 
| 
       190 
186 
     | 
    
         
             
                # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
         
     | 
| 
       191 
187 
     | 
    
         
             
                def absolutify_url(url)
         
     | 
| 
       192 
188 
     | 
    
         
             
                  if url =~ /^\w*\:/i
         
     | 
| 
       193 
     | 
    
         
            -
                    url
         
     | 
| 
      
 189 
     | 
    
         
            +
                    encode_url(url)
         
     | 
| 
       194 
190 
     | 
    
         
             
                  else
         
     | 
| 
       195 
     | 
    
         
            -
                    URI.parse(@root_url).merge( 
     | 
| 
      
 191 
     | 
    
         
            +
                    URI.parse(@root_url).merge(encode_url(url)).to_s
         
     | 
| 
       196 
192 
     | 
    
         
             
                  end
         
     | 
| 
       197 
193 
     | 
    
         
             
                end
         
     | 
| 
       198 
194 
     | 
    
         | 
| 
         @@ -17,10 +17,18 @@ Cache-control: private 
     | 
|
| 
       17 
17 
     | 
    
         
             
              <title>International chars</title>
         
     | 
| 
       18 
18 
     | 
    
         
             
            </head>
         
     | 
| 
       19 
19 
     | 
    
         
             
            <body>
         
     | 
| 
      
 20 
     | 
    
         
            +
              <h1>Internal links:</h1>
         
     | 
| 
       20 
21 
     | 
    
         
             
              <a href="/españa.asp">España</a>
         
     | 
| 
       21 
22 
     | 
    
         
             
              <a href="/romanée">Romanée</a>
         
     | 
| 
       22 
23 
     | 
    
         
             
              <a href="/faqs#camión">FAQs camión</a>
         
     | 
| 
       23 
24 
     | 
    
         
             
              <a href="/search?q=camión">Search camión</a>
         
     | 
| 
       24 
25 
     | 
    
         
             
              <a href="/search?q=españa#top">Search España at top</a>
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
              <h1>External links:</h1>
         
     | 
| 
      
 28 
     | 
    
         
            +
              <a href="http://example.com/españa.asp">España</a>
         
     | 
| 
      
 29 
     | 
    
         
            +
              <a href="http://example.com/romanée">Romanée</a>
         
     | 
| 
      
 30 
     | 
    
         
            +
              <a href="http://example.com/faqs#camión">FAQs camión</a>
         
     | 
| 
      
 31 
     | 
    
         
            +
              <a href="http://example.com/search?q=camión">Search camión</a>
         
     | 
| 
      
 32 
     | 
    
         
            +
              <a href="http://example.com/search?q=españa#top">Search España at top</a>
         
     | 
| 
       25 
33 
     | 
    
         
             
            </body>
         
     | 
| 
       26 
34 
     | 
    
         
             
            </html>
         
     | 
    
        data/spec/metainspector_spec.rb
    CHANGED
    
    | 
         @@ -29,13 +29,15 @@ describe MetaInspector do 
     | 
|
| 
       29 
29 
     | 
    
         | 
| 
       30 
30 
     | 
    
         
             
              describe 'Initialization' do
         
     | 
| 
       31 
31 
     | 
    
         
             
                it 'should accept an URL with a scheme' do
         
     | 
| 
       32 
     | 
    
         
            -
                   
     | 
| 
       33 
     | 
    
         
            -
                  @m.url.should == 'http://pagerankalert.com'
         
     | 
| 
      
 32 
     | 
    
         
            +
                  MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
         
     | 
| 
       34 
33 
     | 
    
         
             
                end
         
     | 
| 
       35 
34 
     | 
    
         | 
| 
       36 
35 
     | 
    
         
             
                it "should use http:// as a default scheme" do
         
     | 
| 
       37 
     | 
    
         
            -
                   
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
      
 36 
     | 
    
         
            +
                  MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
         
     | 
| 
      
 37 
     | 
    
         
            +
                end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                it "should accept an URL with international characters" do
         
     | 
| 
      
 40 
     | 
    
         
            +
                  MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
         
     | 
| 
       39 
41 
     | 
    
         
             
                end
         
     | 
| 
       40 
42 
     | 
    
         | 
| 
       41 
43 
     | 
    
         
             
                it "should store the scheme" do
         
     | 
| 
         @@ -51,9 +53,10 @@ describe MetaInspector do 
     | 
|
| 
       51 
53 
     | 
    
         
             
                end
         
     | 
| 
       52 
54 
     | 
    
         | 
| 
       53 
55 
     | 
    
         
             
                it "should store the root url" do
         
     | 
| 
       54 
     | 
    
         
            -
                  MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
     | 
| 
       55 
     | 
    
         
            -
                  MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
     | 
| 
       56 
     | 
    
         
            -
                  MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
     | 
| 
      
 56 
     | 
    
         
            +
                  MetaInspector.new('http://pagerankalert.com').root_url.should     == 'http://pagerankalert.com/'
         
     | 
| 
      
 57 
     | 
    
         
            +
                  MetaInspector.new('https://pagerankalert.com').root_url.should    == 'https://pagerankalert.com/'
         
     | 
| 
      
 58 
     | 
    
         
            +
                  MetaInspector.new('pagerankalert.com').root_url.should            == 'http://pagerankalert.com/'
         
     | 
| 
      
 59 
     | 
    
         
            +
                  MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
         
     | 
| 
       57 
60 
     | 
    
         
             
                end
         
     | 
| 
       58 
61 
     | 
    
         
             
              end
         
     | 
| 
       59 
62 
     | 
    
         | 
| 
         @@ -171,19 +174,44 @@ describe MetaInspector do 
     | 
|
| 
       171 
174 
     | 
    
         
             
                                      "http://alazan.com/faqs.asp" ]
         
     | 
| 
       172 
175 
     | 
    
         
             
                end
         
     | 
| 
       173 
176 
     | 
    
         | 
| 
       174 
     | 
    
         
            -
                it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
         
     | 
| 
       175 
     | 
    
         
            -
                  m = MetaInspector.new('http://international.com')
         
     | 
| 
       176 
     | 
    
         
            -
                  m.links.should == [ "http://international.com/espa%C3%B1a.asp",
         
     | 
| 
       177 
     | 
    
         
            -
                                      "http://international.com/roman%C3%A9e",
         
     | 
| 
       178 
     | 
    
         
            -
                                      "http://international.com/faqs#cami%C3%B3n",
         
     | 
| 
       179 
     | 
    
         
            -
                                      "http://international.com/search?q=cami%C3%B3n",
         
     | 
| 
       180 
     | 
    
         
            -
                                      "http://international.com/search?q=espa%C3%B1a#top"]
         
     | 
| 
       181 
     | 
    
         
            -
                end
         
     | 
| 
       182 
     | 
    
         
            -
             
     | 
| 
       183 
177 
     | 
    
         
             
                it "should return empty array if no links found" do
         
     | 
| 
       184 
178 
     | 
    
         
             
                  m = MetaInspector.new('http://example.com/empty')
         
     | 
| 
       185 
179 
     | 
    
         
             
                  m.links.should == []
         
     | 
| 
       186 
180 
     | 
    
         
             
                end
         
     | 
| 
      
 181 
     | 
    
         
            +
             
     | 
| 
      
 182 
     | 
    
         
            +
                describe "links with international characters" do
         
     | 
| 
      
 183 
     | 
    
         
            +
                  it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
         
     | 
| 
      
 184 
     | 
    
         
            +
                    m = MetaInspector.new('http://international.com')
         
     | 
| 
      
 185 
     | 
    
         
            +
                    m.links.should == [ "http://international.com/espa%C3%B1a.asp",
         
     | 
| 
      
 186 
     | 
    
         
            +
                                        "http://international.com/roman%C3%A9e",
         
     | 
| 
      
 187 
     | 
    
         
            +
                                        "http://international.com/faqs#cami%C3%B3n",
         
     | 
| 
      
 188 
     | 
    
         
            +
                                        "http://international.com/search?q=cami%C3%B3n",
         
     | 
| 
      
 189 
     | 
    
         
            +
                                        "http://international.com/search?q=espa%C3%B1a#top",
         
     | 
| 
      
 190 
     | 
    
         
            +
                                        "http://example.com/espa%C3%B1a.asp",
         
     | 
| 
      
 191 
     | 
    
         
            +
                                        "http://example.com/roman%C3%A9e",
         
     | 
| 
      
 192 
     | 
    
         
            +
                                        "http://example.com/faqs#cami%C3%B3n",
         
     | 
| 
      
 193 
     | 
    
         
            +
                                        "http://example.com/search?q=cami%C3%B3n",
         
     | 
| 
      
 194 
     | 
    
         
            +
                                        "http://example.com/search?q=espa%C3%B1a#top"]
         
     | 
| 
      
 195 
     | 
    
         
            +
                  end
         
     | 
| 
      
 196 
     | 
    
         
            +
             
     | 
| 
      
 197 
     | 
    
         
            +
                  it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
         
     | 
| 
      
 198 
     | 
    
         
            +
                    m = MetaInspector.new('http://international.com')
         
     | 
| 
      
 199 
     | 
    
         
            +
                    m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
         
     | 
| 
      
 200 
     | 
    
         
            +
                                                 "http://international.com/roman%C3%A9e",
         
     | 
| 
      
 201 
     | 
    
         
            +
                                                 "http://international.com/faqs#cami%C3%B3n",
         
     | 
| 
      
 202 
     | 
    
         
            +
                                                 "http://international.com/search?q=cami%C3%B3n",
         
     | 
| 
      
 203 
     | 
    
         
            +
                                                 "http://international.com/search?q=espa%C3%B1a#top"]
         
     | 
| 
      
 204 
     | 
    
         
            +
                  end
         
     | 
| 
      
 205 
     | 
    
         
            +
             
     | 
| 
      
 206 
     | 
    
         
            +
                  it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
         
     | 
| 
      
 207 
     | 
    
         
            +
                    m = MetaInspector.new('http://international.com')
         
     | 
| 
      
 208 
     | 
    
         
            +
                    m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
         
     | 
| 
      
 209 
     | 
    
         
            +
                                                 "http://example.com/roman%C3%A9e",
         
     | 
| 
      
 210 
     | 
    
         
            +
                                                 "http://example.com/faqs#cami%C3%B3n",
         
     | 
| 
      
 211 
     | 
    
         
            +
                                                 "http://example.com/search?q=cami%C3%B3n",
         
     | 
| 
      
 212 
     | 
    
         
            +
                                                 "http://example.com/search?q=espa%C3%B1a#top"]
         
     | 
| 
      
 213 
     | 
    
         
            +
                  end
         
     | 
| 
      
 214 
     | 
    
         
            +
                end
         
     | 
| 
       187 
215 
     | 
    
         
             
              end
         
     | 
| 
       188 
216 
     | 
    
         | 
| 
       189 
217 
     | 
    
         
             
              describe 'Non-HTTP links' do
         
     | 
| 
         @@ -342,7 +370,7 @@ describe MetaInspector do 
     | 
|
| 
       342 
370 
     | 
    
         | 
| 
       343 
371 
     | 
    
         
             
                it "should handle errors when content is image/jpeg and html_content_type_only is true" do
         
     | 
| 
       344 
372 
     | 
    
         
             
                  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
         
     | 
| 
       345 
     | 
    
         
            -
             
     | 
| 
      
 373 
     | 
    
         
            +
             
     | 
| 
       346 
374 
     | 
    
         
             
                  expect {
         
     | 
| 
       347 
375 
     | 
    
         
             
                    title = image_url.title
         
     | 
| 
       348 
376 
     | 
    
         
             
                  }.to change { image_url.errors.size }
         
     | 
| 
         @@ -352,7 +380,7 @@ describe MetaInspector do 
     | 
|
| 
       352 
380 
     | 
    
         | 
| 
       353 
381 
     | 
    
         
             
                it "should handle errors when content is not text/html and html_content_type_only is true" do
         
     | 
| 
       354 
382 
     | 
    
         
             
                  tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
         
     | 
| 
       355 
     | 
    
         
            -
             
     | 
| 
      
 383 
     | 
    
         
            +
             
     | 
| 
       356 
384 
     | 
    
         
             
                  expect {
         
     | 
| 
       357 
385 
     | 
    
         
             
                    title = tar_url.title
         
     | 
| 
       358 
386 
     | 
    
         
             
                  }.to change { tar_url.errors.size }
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,13 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification 
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: metainspector
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version 
         
     | 
| 
       4 
     | 
    
         
            -
              hash:  
     | 
| 
      
 4 
     | 
    
         
            +
              hash: 61
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
              segments: 
         
     | 
| 
       7 
7 
     | 
    
         
             
              - 1
         
     | 
| 
       8 
8 
     | 
    
         
             
              - 10
         
     | 
| 
       9 
     | 
    
         
            -
              - 0
     | 
| 
       10 
     | 
    
         
            -
              version: 1.10.0
     | 
| 
      
 9 
     | 
    
         
            +
              - 1
         
     | 
| 
      
 10 
     | 
    
         
            +
              version: 1.10.1
         
     | 
| 
       11 
11 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       12 
12 
     | 
    
         
             
            authors: 
         
     | 
| 
       13 
13 
     | 
    
         
             
            - Jaime Iniesta
         
     | 
| 
         @@ -15,7 +15,7 @@ autorequire: 
     | 
|
| 
       15 
15 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       16 
16 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
       18 
     | 
    
         
            -
            date: 2012-11- 
     | 
| 
      
 18 
     | 
    
         
            +
            date: 2012-11-16 00:00:00 Z
         
     | 
| 
       19 
19 
     | 
    
         
             
            dependencies: 
         
     | 
| 
       20 
20 
     | 
    
         
             
            - !ruby/object:Gem::Dependency 
         
     | 
| 
       21 
21 
     | 
    
         
             
              version_requirements: &id001 !ruby/object:Gem::Requirement 
         
     |