metainspector 1.9.11 → 1.10.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -16,14 +16,16 @@ describe MetaInspector do
16
16
  FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
17
17
  FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
18
18
  FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
19
- FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
20
- FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
19
+ FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
20
+ FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
21
21
  FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
22
22
  FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
23
23
  FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
24
24
  FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
25
25
  FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
26
26
  FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
27
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
28
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
27
29
 
28
30
  describe 'Initialization' do
29
31
  it 'should accept an URL with a scheme' do
@@ -88,7 +90,7 @@ describe MetaInspector do
88
90
  end
89
91
 
90
92
  it "should find images on twitter" do
91
- m = MetaInspector.new('https://twitter.com/w3clove')
93
+ m = MetaInspector.new('https://twitter.com/markupvalidator')
92
94
  m.images.length.should == 6
93
95
  m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
94
96
  end
@@ -150,30 +152,17 @@ describe MetaInspector do
150
152
  end
151
153
 
152
154
  it "should get correct absolute links for internal pages" do
153
- m = MetaInspector.new('http://w3clove.com/faqs')
154
- m.links.should == [ "http://w3clove.com/#",
155
- "http://w3clove.com/",
156
- "http://w3clove.com/faqs",
157
- "http://w3clove.com/plans-and-pricing",
158
- "http://w3clove.com/contact",
159
- "http://w3clove.com/charts/errors",
160
- "http://w3clove.com/credits",
161
- "http://w3clove.com/signin",
162
- "http://validator.w3.org",
163
- "http://www.sitemaps.org/",
164
- "http://jaimeiniesta.com/",
165
- "http://mendicantuniversity.org/",
166
- "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
167
- "http://majesticseacreature.com/",
168
- "http://school.mendicantuniversity.org/alumni/2011",
169
- "https://github.com/jaimeiniesta/w3clove",
170
- "http://w3clove.com",
171
- "http://w3clove.com/api_v1_reference",
172
- "https://twitter.com/w3clove",
173
- "http://twitter.com/share",
174
- "http://w3clove.com/terms_of_service",
175
- "http://twitter.com/W3CLove",
176
- "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
155
+ @m.internal_links.should == [ "http://pagerankalert.com/",
156
+ "http://pagerankalert.com/es?language=es",
157
+ "http://pagerankalert.com/users/sign_up",
158
+ "http://pagerankalert.com/users/sign_in" ]
159
+ end
160
+
161
+ it "should get correct absolute links for external pages" do
162
+ @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
163
+ "http://pagerankalert.posterous.com",
164
+ "http://twitter.com/pagerankalert",
165
+ "http://twitter.com/share" ]
177
166
  end
178
167
 
179
168
  it "should get correct absolute links, correcting relative links from URL not ending with slash" do
@@ -302,7 +291,7 @@ describe MetaInspector do
302
291
  describe 'to_hash' do
303
292
  it "should return a hash with all the values set" do
304
293
  @m = MetaInspector.new('http://pagerankalert.com')
305
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
294
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
306
295
  end
307
296
  end
308
297
 
@@ -316,7 +305,7 @@ describe MetaInspector do
316
305
  end
317
306
 
318
307
  it "should handle timeouts" do
319
- impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
308
+ impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
320
309
 
321
310
  expect {
322
311
  title = impatient.title
@@ -335,6 +324,42 @@ describe MetaInspector do
335
324
  nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
336
325
  end
337
326
 
327
+ it "should parse images when parse_html_content_type_only is not specified" do
328
+ image_url = MetaInspector.new('http://pagerankalert.com/image.png')
329
+ desc = image_url.description
330
+
331
+ image_url.errors == nil
332
+ image_url.parsed? == true
333
+ end
334
+
335
+ it "should parse images when parse_html_content_type_only is false" do
336
+ image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
337
+ desc = image_url.description
338
+
339
+ image_url.errors == nil
340
+ image_url.parsed? == true
341
+ end
342
+
343
+ it "should handle errors when content is image/jpeg and html_content_type_only is true" do
344
+ image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
345
+
346
+ expect {
347
+ title = image_url.title
348
+ }.to change { image_url.errors.size }
349
+
350
+ image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
351
+ end
352
+
353
+ it "should handle errors when content is not text/html and html_content_type_only is true" do
354
+ tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
355
+
356
+ expect {
357
+ title = tar_url.title
358
+ }.to change { tar_url.errors.size }
359
+
360
+ tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
361
+ end
362
+
338
363
  describe "parsed?" do
339
364
  it "should return true if we have a parsed document" do
340
365
  good = MetaInspector.new('http://pagerankalert.com')
@@ -344,12 +369,46 @@ describe MetaInspector do
344
369
  end
345
370
 
346
371
  it "should return false if we don't have a parsed document" do
347
- bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
372
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
348
373
  title = bad.title
349
374
 
350
375
  bad.parsed?.should == false
351
376
  end
377
+
378
+ it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
379
+ tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
380
+ title = tar.title
381
+
382
+ tar.parsed?.should == false
383
+ end
352
384
  end
353
385
  end
354
386
 
387
+ describe "content_type" do
388
+ it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
389
+ good = MetaInspector.new('http://pagerankalert.com/image.png')
390
+ title = good.title
391
+
392
+ good.parsed?.should == true
393
+ good.content_type == "image/jpeg"
394
+ end
395
+
396
+ it "should return the correct content type of the url if it is parsed correctly even for html pages" do
397
+ good = MetaInspector.new('http://pagerankalert.com')
398
+ title = good.title
399
+
400
+ good.parsed?.should == true
401
+ good.content_type == "text/html"
402
+ end
403
+
404
+ it "should return the correct content type of the url if it is not parsed correctly" do
405
+ bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
406
+ title = bad.title
407
+
408
+ bad.parsed?.should == false
409
+ bad.content_type == "image/jpeg"
410
+ end
411
+
412
+ end
413
+
355
414
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 37
4
+ hash: 63
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 9
9
- - 11
10
- version: 1.9.11
8
+ - 10
9
+ - 0
10
+ version: 1.10.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-09 00:00:00 Z
18
+ date: 2012-11-15 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -146,14 +146,14 @@ files:
146
146
  - spec/fixtures/guardian.co.uk.response
147
147
  - spec/fixtures/international.response
148
148
  - spec/fixtures/iteh.at.response
149
+ - spec/fixtures/markupvalidator_faqs.response
149
150
  - spec/fixtures/nonhttp.response
150
151
  - spec/fixtures/pagerankalert.com.response
151
152
  - spec/fixtures/protocol_relative.response
152
153
  - spec/fixtures/tea-tron.com.response
153
154
  - spec/fixtures/theonion-no-description.com.response
154
155
  - spec/fixtures/theonion.com.response
155
- - spec/fixtures/twitter_w3clove.response
156
- - spec/fixtures/w3clove_faqs.response
156
+ - spec/fixtures/twitter_markupvalidator.response
157
157
  - spec/fixtures/wordpress_site.response
158
158
  - spec/fixtures/youtube.response
159
159
  - spec/metainspector_spec.rb