metainspector 1.9.11 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,14 +16,16 @@ describe MetaInspector do
16
16
  FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
17
17
  FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
18
18
  FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
19
- FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
20
- FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
19
+ FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
20
+ FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
21
21
  FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
22
22
  FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
23
23
  FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
24
24
  FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
25
25
  FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
26
26
  FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
27
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
28
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
27
29
 
28
30
  describe 'Initialization' do
29
31
  it 'should accept an URL with a scheme' do
@@ -88,7 +90,7 @@ describe MetaInspector do
88
90
  end
89
91
 
90
92
  it "should find images on twitter" do
91
- m = MetaInspector.new('https://twitter.com/w3clove')
93
+ m = MetaInspector.new('https://twitter.com/markupvalidator')
92
94
  m.images.length.should == 6
93
95
  m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
94
96
  end
@@ -150,30 +152,17 @@ describe MetaInspector do
150
152
  end
151
153
 
152
154
  it "should get correct absolute links for internal pages" do
153
- m = MetaInspector.new('http://w3clove.com/faqs')
154
- m.links.should == [ "http://w3clove.com/#",
155
- "http://w3clove.com/",
156
- "http://w3clove.com/faqs",
157
- "http://w3clove.com/plans-and-pricing",
158
- "http://w3clove.com/contact",
159
- "http://w3clove.com/charts/errors",
160
- "http://w3clove.com/credits",
161
- "http://w3clove.com/signin",
162
- "http://validator.w3.org",
163
- "http://www.sitemaps.org/",
164
- "http://jaimeiniesta.com/",
165
- "http://mendicantuniversity.org/",
166
- "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
167
- "http://majesticseacreature.com/",
168
- "http://school.mendicantuniversity.org/alumni/2011",
169
- "https://github.com/jaimeiniesta/w3clove",
170
- "http://w3clove.com",
171
- "http://w3clove.com/api_v1_reference",
172
- "https://twitter.com/w3clove",
173
- "http://twitter.com/share",
174
- "http://w3clove.com/terms_of_service",
175
- "http://twitter.com/W3CLove",
176
- "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
155
+ @m.internal_links.should == [ "http://pagerankalert.com/",
156
+ "http://pagerankalert.com/es?language=es",
157
+ "http://pagerankalert.com/users/sign_up",
158
+ "http://pagerankalert.com/users/sign_in" ]
159
+ end
160
+
161
+ it "should get correct absolute links for external pages" do
162
+ @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
163
+ "http://pagerankalert.posterous.com",
164
+ "http://twitter.com/pagerankalert",
165
+ "http://twitter.com/share" ]
177
166
  end
178
167
 
179
168
  it "should get correct absolute links, correcting relative links from URL not ending with slash" do
@@ -302,7 +291,7 @@ describe MetaInspector do
302
291
  describe 'to_hash' do
303
292
  it "should return a hash with all the values set" do
304
293
  @m = MetaInspector.new('http://pagerankalert.com')
305
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
294
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
306
295
  end
307
296
  end
308
297
 
@@ -316,7 +305,7 @@ describe MetaInspector do
316
305
  end
317
306
 
318
307
  it "should handle timeouts" do
319
- impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
308
+ impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
320
309
 
321
310
  expect {
322
311
  title = impatient.title
@@ -335,6 +324,42 @@ describe MetaInspector do
335
324
  nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
336
325
  end
337
326
 
327
+ it "should parse images when parse_html_content_type_only is not specified" do
328
+ image_url = MetaInspector.new('http://pagerankalert.com/image.png')
329
+ desc = image_url.description
330
+
331
+ image_url.errors == nil
332
+ image_url.parsed? == true
333
+ end
334
+
335
+ it "should parse images when parse_html_content_type_only is false" do
336
+ image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
337
+ desc = image_url.description
338
+
339
+ image_url.errors == nil
340
+ image_url.parsed? == true
341
+ end
342
+
343
+ it "should handle errors when content is image/jpeg and html_content_type_only is true" do
344
+ image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
345
+
346
+ expect {
347
+ title = image_url.title
348
+ }.to change { image_url.errors.size }
349
+
350
+ image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
351
+ end
352
+
353
+ it "should handle errors when content is not text/html and html_content_type_only is true" do
354
+ tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
355
+
356
+ expect {
357
+ title = tar_url.title
358
+ }.to change { tar_url.errors.size }
359
+
360
+ tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
361
+ end
362
+
338
363
  describe "parsed?" do
339
364
  it "should return true if we have a parsed document" do
340
365
  good = MetaInspector.new('http://pagerankalert.com')
@@ -344,12 +369,46 @@ describe MetaInspector do
344
369
  end
345
370
 
346
371
  it "should return false if we don't have a parsed document" do
347
- bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
372
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
348
373
  title = bad.title
349
374
 
350
375
  bad.parsed?.should == false
351
376
  end
377
+
378
+ it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
379
+ tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
380
+ title = tar.title
381
+
382
+ tar.parsed?.should == false
383
+ end
352
384
  end
353
385
  end
354
386
 
387
+ describe "content_type" do
388
+ it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
389
+ good = MetaInspector.new('http://pagerankalert.com/image.png')
390
+ title = good.title
391
+
392
+ good.parsed?.should == true
393
+ good.content_type == "image/jpeg"
394
+ end
395
+
396
+ it "should return the correct content type of the url if it is parsed correctly even for html pages" do
397
+ good = MetaInspector.new('http://pagerankalert.com')
398
+ title = good.title
399
+
400
+ good.parsed?.should == true
401
+ good.content_type == "text/html"
402
+ end
403
+
404
+ it "should return the correct content type of the url if it is not parsed correctly" do
405
+ bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
406
+ title = bad.title
407
+
408
+ bad.parsed?.should == false
409
+ bad.content_type == "image/jpeg"
410
+ end
411
+
412
+ end
413
+
355
414
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 37
4
+ hash: 63
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 9
9
- - 11
10
- version: 1.9.11
8
+ - 10
9
+ - 0
10
+ version: 1.10.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-09 00:00:00 Z
18
+ date: 2012-11-15 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -146,14 +146,14 @@ files:
146
146
  - spec/fixtures/guardian.co.uk.response
147
147
  - spec/fixtures/international.response
148
148
  - spec/fixtures/iteh.at.response
149
+ - spec/fixtures/markupvalidator_faqs.response
149
150
  - spec/fixtures/nonhttp.response
150
151
  - spec/fixtures/pagerankalert.com.response
151
152
  - spec/fixtures/protocol_relative.response
152
153
  - spec/fixtures/tea-tron.com.response
153
154
  - spec/fixtures/theonion-no-description.com.response
154
155
  - spec/fixtures/theonion.com.response
155
- - spec/fixtures/twitter_w3clove.response
156
- - spec/fixtures/w3clove_faqs.response
156
+ - spec/fixtures/twitter_markupvalidator.response
157
157
  - spec/fixtures/wordpress_site.response
158
158
  - spec/fixtures/youtube.response
159
159
  - spec/metainspector_spec.rb