metainspector 1.9.11 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +22 -18
- data/lib/meta_inspector.rb +2 -2
- data/lib/meta_inspector/scraper.rb +29 -6
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/{w3clove_faqs.response → markupvalidator_faqs.response} +23 -23
- data/spec/fixtures/{twitter_w3clove.response → twitter_markupvalidator.response} +926 -926
- data/spec/metainspector_spec.rb +89 -30
- metadata +7 -7
    
        data/spec/metainspector_spec.rb
    CHANGED
    
    | @@ -16,14 +16,16 @@ describe MetaInspector do | |
| 16 16 | 
             
              FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
         | 
| 17 17 | 
             
              FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
         | 
| 18 18 | 
             
              FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
         | 
| 19 | 
            -
              FakeWeb.register_uri(:get, "http:// | 
| 20 | 
            -
              FakeWeb.register_uri(:get, "https://twitter.com/ | 
| 19 | 
            +
              FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
         | 
| 20 | 
            +
              FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
         | 
| 21 21 | 
             
              FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
         | 
| 22 22 | 
             
              FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
         | 
| 23 23 | 
             
              FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
         | 
| 24 24 | 
             
              FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
         | 
| 25 25 | 
             
              FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
         | 
| 26 26 | 
             
              FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
         | 
| 27 | 
            +
              FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
         | 
| 28 | 
            +
              FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
         | 
| 27 29 |  | 
| 28 30 | 
             
              describe 'Initialization' do
         | 
| 29 31 | 
             
                it 'should accept an URL with a scheme' do
         | 
| @@ -88,7 +90,7 @@ describe MetaInspector do | |
| 88 90 | 
             
                  end
         | 
| 89 91 |  | 
| 90 92 | 
             
                  it "should find images on twitter" do
         | 
| 91 | 
            -
                    m = MetaInspector.new('https://twitter.com/ | 
| 93 | 
            +
                    m = MetaInspector.new('https://twitter.com/markupvalidator')
         | 
| 92 94 | 
             
                    m.images.length.should == 6
         | 
| 93 95 | 
             
                    m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
         | 
| 94 96 | 
             
                  end
         | 
| @@ -150,30 +152,17 @@ describe MetaInspector do | |
| 150 152 | 
             
                end
         | 
| 151 153 |  | 
| 152 154 | 
             
                it "should get correct absolute links for internal pages" do
         | 
| 153 | 
            -
                  m  | 
| 154 | 
            -
             | 
| 155 | 
            -
             | 
| 156 | 
            -
             | 
| 157 | 
            -
             | 
| 158 | 
            -
             | 
| 159 | 
            -
             | 
| 160 | 
            -
             | 
| 161 | 
            -
             | 
| 162 | 
            -
             | 
| 163 | 
            -
             | 
| 164 | 
            -
                                      "http://jaimeiniesta.com/",
         | 
| 165 | 
            -
                                      "http://mendicantuniversity.org/",
         | 
| 166 | 
            -
                                      "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
         | 
| 167 | 
            -
                                      "http://majesticseacreature.com/",
         | 
| 168 | 
            -
                                      "http://school.mendicantuniversity.org/alumni/2011",
         | 
| 169 | 
            -
                                      "https://github.com/jaimeiniesta/w3clove",
         | 
| 170 | 
            -
                                      "http://w3clove.com",
         | 
| 171 | 
            -
                                      "http://w3clove.com/api_v1_reference",
         | 
| 172 | 
            -
                                      "https://twitter.com/w3clove",
         | 
| 173 | 
            -
                                      "http://twitter.com/share",
         | 
| 174 | 
            -
                                      "http://w3clove.com/terms_of_service",
         | 
| 175 | 
            -
                                      "http://twitter.com/W3CLove",
         | 
| 176 | 
            -
                                      "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
         | 
| 155 | 
            +
                  @m.internal_links.should == [ "http://pagerankalert.com/",
         | 
| 156 | 
            +
                                       "http://pagerankalert.com/es?language=es",
         | 
| 157 | 
            +
                                       "http://pagerankalert.com/users/sign_up",
         | 
| 158 | 
            +
                                       "http://pagerankalert.com/users/sign_in" ]
         | 
| 159 | 
            +
                end
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                it "should get correct absolute links for external pages" do
         | 
| 162 | 
            +
                  @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
         | 
| 163 | 
            +
                                       "http://pagerankalert.posterous.com",
         | 
| 164 | 
            +
                                       "http://twitter.com/pagerankalert",
         | 
| 165 | 
            +
                                       "http://twitter.com/share" ]
         | 
| 177 166 | 
             
                end
         | 
| 178 167 |  | 
| 179 168 | 
             
                it "should get correct absolute links, correcting relative links from URL not ending with slash" do
         | 
| @@ -302,7 +291,7 @@ describe MetaInspector do | |
| 302 291 | 
             
              describe 'to_hash' do
         | 
| 303 292 | 
             
                it "should return a hash with all the values set" do
         | 
| 304 293 | 
             
                  @m = MetaInspector.new('http://pagerankalert.com')
         | 
| 305 | 
            -
                  @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
         | 
| 294 | 
            +
                  @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
         | 
| 306 295 | 
             
                end
         | 
| 307 296 | 
             
              end
         | 
| 308 297 |  | 
| @@ -316,7 +305,7 @@ describe MetaInspector do | |
| 316 305 | 
             
                end
         | 
| 317 306 |  | 
| 318 307 | 
             
                it "should handle timeouts" do
         | 
| 319 | 
            -
                  impatient = MetaInspector.new('http:// | 
| 308 | 
            +
                  impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
         | 
| 320 309 |  | 
| 321 310 | 
             
                  expect {
         | 
| 322 311 | 
             
                    title = impatient.title
         | 
| @@ -335,6 +324,42 @@ describe MetaInspector do | |
| 335 324 | 
             
                  nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
         | 
| 336 325 | 
             
                end
         | 
| 337 326 |  | 
| 327 | 
            +
                it "should parse images when parse_html_content_type_only is not specified" do
         | 
| 328 | 
            +
                  image_url = MetaInspector.new('http://pagerankalert.com/image.png')
         | 
| 329 | 
            +
                  desc = image_url.description
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                  image_url.errors == nil
         | 
| 332 | 
            +
                  image_url.parsed? == true
         | 
| 333 | 
            +
                end
         | 
| 334 | 
            +
             | 
| 335 | 
            +
                it "should parse images when parse_html_content_type_only is false" do
         | 
| 336 | 
            +
                  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
         | 
| 337 | 
            +
                  desc = image_url.description
         | 
| 338 | 
            +
             | 
| 339 | 
            +
                  image_url.errors == nil
         | 
| 340 | 
            +
                  image_url.parsed? == true
         | 
| 341 | 
            +
                end
         | 
| 342 | 
            +
             | 
| 343 | 
            +
                it "should handle errors when content is image/jpeg and html_content_type_only is true" do
         | 
| 344 | 
            +
                  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
         | 
| 345 | 
            +
                  
         | 
| 346 | 
            +
                  expect {
         | 
| 347 | 
            +
                    title = image_url.title
         | 
| 348 | 
            +
                  }.to change { image_url.errors.size }
         | 
| 349 | 
            +
             | 
| 350 | 
            +
                  image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
         | 
| 351 | 
            +
                end
         | 
| 352 | 
            +
             | 
| 353 | 
            +
                it "should handle errors when content is not text/html and html_content_type_only is true" do
         | 
| 354 | 
            +
                  tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
         | 
| 355 | 
            +
                  
         | 
| 356 | 
            +
                  expect {
         | 
| 357 | 
            +
                    title = tar_url.title
         | 
| 358 | 
            +
                  }.to change { tar_url.errors.size }
         | 
| 359 | 
            +
             | 
| 360 | 
            +
                  tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
         | 
| 361 | 
            +
                end
         | 
| 362 | 
            +
             | 
| 338 363 | 
             
                describe "parsed?" do
         | 
| 339 364 | 
             
                  it "should return true if we have a parsed document" do
         | 
| 340 365 | 
             
                    good  = MetaInspector.new('http://pagerankalert.com')
         | 
| @@ -344,12 +369,46 @@ describe MetaInspector do | |
| 344 369 | 
             
                  end
         | 
| 345 370 |  | 
| 346 371 | 
             
                  it "should return false if we don't have a parsed document" do
         | 
| 347 | 
            -
                    bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
         | 
| 372 | 
            +
                    bad  = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
         | 
| 348 373 | 
             
                    title = bad.title
         | 
| 349 374 |  | 
| 350 375 | 
             
                    bad.parsed?.should == false
         | 
| 351 376 | 
             
                  end
         | 
| 377 | 
            +
             | 
| 378 | 
            +
                  it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
         | 
| 379 | 
            +
                    tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
         | 
| 380 | 
            +
                    title = tar.title
         | 
| 381 | 
            +
             | 
| 382 | 
            +
                    tar.parsed?.should == false
         | 
| 383 | 
            +
                  end
         | 
| 352 384 | 
             
                end
         | 
| 353 385 | 
             
              end
         | 
| 354 386 |  | 
| 387 | 
            +
              describe "content_type" do
         | 
| 388 | 
            +
                it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
         | 
| 389 | 
            +
                  good = MetaInspector.new('http://pagerankalert.com/image.png')
         | 
| 390 | 
            +
                  title = good.title
         | 
| 391 | 
            +
             | 
| 392 | 
            +
                  good.parsed?.should == true
         | 
| 393 | 
            +
                  good.content_type == "image/jpeg"
         | 
| 394 | 
            +
                end
         | 
| 395 | 
            +
             | 
| 396 | 
            +
                it "should return the correct content type of the url if it is parsed correctly even for html pages" do
         | 
| 397 | 
            +
                  good = MetaInspector.new('http://pagerankalert.com')
         | 
| 398 | 
            +
                  title = good.title
         | 
| 399 | 
            +
             | 
| 400 | 
            +
                  good.parsed?.should == true
         | 
| 401 | 
            +
                  good.content_type == "text/html"
         | 
| 402 | 
            +
                end
         | 
| 403 | 
            +
             | 
| 404 | 
            +
                it "should return the correct content type of the url if it is not parsed correctly" do
         | 
| 405 | 
            +
                  bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
         | 
| 406 | 
            +
                  title = bad.title
         | 
| 407 | 
            +
             | 
| 408 | 
            +
                  bad.parsed?.should == false
         | 
| 409 | 
            +
                  bad.content_type == "image/jpeg"
         | 
| 410 | 
            +
                end
         | 
| 411 | 
            +
             | 
| 412 | 
            +
              end
         | 
| 413 | 
            +
             | 
| 355 414 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: metainspector
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 63
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 1
         | 
| 8 | 
            -
              -  | 
| 9 | 
            -
              -  | 
| 10 | 
            -
              version: 1. | 
| 8 | 
            +
              - 10
         | 
| 9 | 
            +
              - 0
         | 
| 10 | 
            +
              version: 1.10.0
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Jaime Iniesta
         | 
| @@ -15,7 +15,7 @@ autorequire: | |
| 15 15 | 
             
            bindir: bin
         | 
| 16 16 | 
             
            cert_chain: []
         | 
| 17 17 |  | 
| 18 | 
            -
            date: 2012-11- | 
| 18 | 
            +
            date: 2012-11-15 00:00:00 Z
         | 
| 19 19 | 
             
            dependencies: 
         | 
| 20 20 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 21 21 | 
             
              version_requirements: &id001 !ruby/object:Gem::Requirement 
         | 
| @@ -146,14 +146,14 @@ files: | |
| 146 146 | 
             
            - spec/fixtures/guardian.co.uk.response
         | 
| 147 147 | 
             
            - spec/fixtures/international.response
         | 
| 148 148 | 
             
            - spec/fixtures/iteh.at.response
         | 
| 149 | 
            +
            - spec/fixtures/markupvalidator_faqs.response
         | 
| 149 150 | 
             
            - spec/fixtures/nonhttp.response
         | 
| 150 151 | 
             
            - spec/fixtures/pagerankalert.com.response
         | 
| 151 152 | 
             
            - spec/fixtures/protocol_relative.response
         | 
| 152 153 | 
             
            - spec/fixtures/tea-tron.com.response
         | 
| 153 154 | 
             
            - spec/fixtures/theonion-no-description.com.response
         | 
| 154 155 | 
             
            - spec/fixtures/theonion.com.response
         | 
| 155 | 
            -
            - spec/fixtures/ | 
| 156 | 
            -
            - spec/fixtures/w3clove_faqs.response
         | 
| 156 | 
            +
            - spec/fixtures/twitter_markupvalidator.response
         | 
| 157 157 | 
             
            - spec/fixtures/wordpress_site.response
         | 
| 158 158 | 
             
            - spec/fixtures/youtube.response
         | 
| 159 159 | 
             
            - spec/metainspector_spec.rb
         |