metainspector 1.9.11 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +22 -18
- data/lib/meta_inspector.rb +2 -2
- data/lib/meta_inspector/scraper.rb +29 -6
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/{w3clove_faqs.response → markupvalidator_faqs.response} +23 -23
- data/spec/fixtures/{twitter_w3clove.response → twitter_markupvalidator.response} +926 -926
- data/spec/metainspector_spec.rb +89 -30
- metadata +7 -7
data/spec/metainspector_spec.rb
CHANGED
@@ -16,14 +16,16 @@ describe MetaInspector do
|
|
16
16
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
17
17
|
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
18
18
|
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
19
|
-
FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
|
20
|
-
FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
|
19
|
+
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
20
|
+
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
21
21
|
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
22
22
|
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
23
23
|
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
24
24
|
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
25
25
|
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
26
26
|
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
27
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
|
28
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
27
29
|
|
28
30
|
describe 'Initialization' do
|
29
31
|
it 'should accept an URL with a scheme' do
|
@@ -88,7 +90,7 @@ describe MetaInspector do
|
|
88
90
|
end
|
89
91
|
|
90
92
|
it "should find images on twitter" do
|
91
|
-
m = MetaInspector.new('https://twitter.com/w3clove')
|
93
|
+
m = MetaInspector.new('https://twitter.com/markupvalidator')
|
92
94
|
m.images.length.should == 6
|
93
95
|
m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
|
94
96
|
end
|
@@ -150,30 +152,17 @@ describe MetaInspector do
|
|
150
152
|
end
|
151
153
|
|
152
154
|
it "should get correct absolute links for internal pages" do
|
153
|
-
m
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
"http://jaimeiniesta.com/",
|
165
|
-
"http://mendicantuniversity.org/",
|
166
|
-
"http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
|
167
|
-
"http://majesticseacreature.com/",
|
168
|
-
"http://school.mendicantuniversity.org/alumni/2011",
|
169
|
-
"https://github.com/jaimeiniesta/w3clove",
|
170
|
-
"http://w3clove.com",
|
171
|
-
"http://w3clove.com/api_v1_reference",
|
172
|
-
"https://twitter.com/w3clove",
|
173
|
-
"http://twitter.com/share",
|
174
|
-
"http://w3clove.com/terms_of_service",
|
175
|
-
"http://twitter.com/W3CLove",
|
176
|
-
"http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
|
155
|
+
@m.internal_links.should == [ "http://pagerankalert.com/",
|
156
|
+
"http://pagerankalert.com/es?language=es",
|
157
|
+
"http://pagerankalert.com/users/sign_up",
|
158
|
+
"http://pagerankalert.com/users/sign_in" ]
|
159
|
+
end
|
160
|
+
|
161
|
+
it "should get correct absolute links for external pages" do
|
162
|
+
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
163
|
+
"http://pagerankalert.posterous.com",
|
164
|
+
"http://twitter.com/pagerankalert",
|
165
|
+
"http://twitter.com/share" ]
|
177
166
|
end
|
178
167
|
|
179
168
|
it "should get correct absolute links, correcting relative links from URL not ending with slash" do
|
@@ -302,7 +291,7 @@ describe MetaInspector do
|
|
302
291
|
describe 'to_hash' do
|
303
292
|
it "should return a hash with all the values set" do
|
304
293
|
@m = MetaInspector.new('http://pagerankalert.com')
|
305
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
294
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
|
306
295
|
end
|
307
296
|
end
|
308
297
|
|
@@ -316,7 +305,7 @@ describe MetaInspector do
|
|
316
305
|
end
|
317
306
|
|
318
307
|
it "should handle timeouts" do
|
319
|
-
impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
|
308
|
+
impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
|
320
309
|
|
321
310
|
expect {
|
322
311
|
title = impatient.title
|
@@ -335,6 +324,42 @@ describe MetaInspector do
|
|
335
324
|
nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
|
336
325
|
end
|
337
326
|
|
327
|
+
it "should parse images when parse_html_content_type_only is not specified" do
|
328
|
+
image_url = MetaInspector.new('http://pagerankalert.com/image.png')
|
329
|
+
desc = image_url.description
|
330
|
+
|
331
|
+
image_url.errors == nil
|
332
|
+
image_url.parsed? == true
|
333
|
+
end
|
334
|
+
|
335
|
+
it "should parse images when parse_html_content_type_only is false" do
|
336
|
+
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
|
337
|
+
desc = image_url.description
|
338
|
+
|
339
|
+
image_url.errors == nil
|
340
|
+
image_url.parsed? == true
|
341
|
+
end
|
342
|
+
|
343
|
+
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
344
|
+
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
345
|
+
|
346
|
+
expect {
|
347
|
+
title = image_url.title
|
348
|
+
}.to change { image_url.errors.size }
|
349
|
+
|
350
|
+
image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
|
351
|
+
end
|
352
|
+
|
353
|
+
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
354
|
+
tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
355
|
+
|
356
|
+
expect {
|
357
|
+
title = tar_url.title
|
358
|
+
}.to change { tar_url.errors.size }
|
359
|
+
|
360
|
+
tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
|
361
|
+
end
|
362
|
+
|
338
363
|
describe "parsed?" do
|
339
364
|
it "should return true if we have a parsed document" do
|
340
365
|
good = MetaInspector.new('http://pagerankalert.com')
|
@@ -344,12 +369,46 @@ describe MetaInspector do
|
|
344
369
|
end
|
345
370
|
|
346
371
|
it "should return false if we don't have a parsed document" do
|
347
|
-
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
|
372
|
+
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
|
348
373
|
title = bad.title
|
349
374
|
|
350
375
|
bad.parsed?.should == false
|
351
376
|
end
|
377
|
+
|
378
|
+
it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
|
379
|
+
tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
380
|
+
title = tar.title
|
381
|
+
|
382
|
+
tar.parsed?.should == false
|
383
|
+
end
|
352
384
|
end
|
353
385
|
end
|
354
386
|
|
387
|
+
describe "content_type" do
|
388
|
+
it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
|
389
|
+
good = MetaInspector.new('http://pagerankalert.com/image.png')
|
390
|
+
title = good.title
|
391
|
+
|
392
|
+
good.parsed?.should == true
|
393
|
+
good.content_type == "image/jpeg"
|
394
|
+
end
|
395
|
+
|
396
|
+
it "should return the correct content type of the url if it is parsed correctly even for html pages" do
|
397
|
+
good = MetaInspector.new('http://pagerankalert.com')
|
398
|
+
title = good.title
|
399
|
+
|
400
|
+
good.parsed?.should == true
|
401
|
+
good.content_type == "text/html"
|
402
|
+
end
|
403
|
+
|
404
|
+
it "should return the correct content type of the url if it is not parsed correctly" do
|
405
|
+
bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
406
|
+
title = bad.title
|
407
|
+
|
408
|
+
bad.parsed?.should == false
|
409
|
+
bad.content_type == "image/jpeg"
|
410
|
+
end
|
411
|
+
|
412
|
+
end
|
413
|
+
|
355
414
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 63
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 9
|
9
|
-
- 11
|
10
|
-
version: 1.9.11
|
8
|
+
- 10
|
9
|
+
- 0
|
10
|
+
version: 1.10.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-11-
|
18
|
+
date: 2012-11-15 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -146,14 +146,14 @@ files:
|
|
146
146
|
- spec/fixtures/guardian.co.uk.response
|
147
147
|
- spec/fixtures/international.response
|
148
148
|
- spec/fixtures/iteh.at.response
|
149
|
+
- spec/fixtures/markupvalidator_faqs.response
|
149
150
|
- spec/fixtures/nonhttp.response
|
150
151
|
- spec/fixtures/pagerankalert.com.response
|
151
152
|
- spec/fixtures/protocol_relative.response
|
152
153
|
- spec/fixtures/tea-tron.com.response
|
153
154
|
- spec/fixtures/theonion-no-description.com.response
|
154
155
|
- spec/fixtures/theonion.com.response
|
155
|
-
- spec/fixtures/twitter_w3clove.response
|
156
|
-
- spec/fixtures/w3clove_faqs.response
|
156
|
+
- spec/fixtures/twitter_markupvalidator.response
|
157
157
|
- spec/fixtures/wordpress_site.response
|
158
158
|
- spec/fixtures/youtube.response
|
159
159
|
- spec/metainspector_spec.rb
|