metainspector 1.9.11 → 1.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +22 -18
- data/lib/meta_inspector.rb +2 -2
- data/lib/meta_inspector/scraper.rb +29 -6
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/{w3clove_faqs.response → markupvalidator_faqs.response} +23 -23
- data/spec/fixtures/{twitter_w3clove.response → twitter_markupvalidator.response} +926 -926
- data/spec/metainspector_spec.rb +89 -30
- metadata +7 -7
data/spec/metainspector_spec.rb
CHANGED
@@ -16,14 +16,16 @@ describe MetaInspector do
|
|
16
16
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
17
17
|
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
18
18
|
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
19
|
-
FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
|
20
|
-
FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
|
19
|
+
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
20
|
+
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
21
21
|
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
22
22
|
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
23
23
|
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
24
24
|
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
25
25
|
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
26
26
|
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
27
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
|
28
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
27
29
|
|
28
30
|
describe 'Initialization' do
|
29
31
|
it 'should accept an URL with a scheme' do
|
@@ -88,7 +90,7 @@ describe MetaInspector do
|
|
88
90
|
end
|
89
91
|
|
90
92
|
it "should find images on twitter" do
|
91
|
-
m = MetaInspector.new('https://twitter.com/w3clove')
|
93
|
+
m = MetaInspector.new('https://twitter.com/markupvalidator')
|
92
94
|
m.images.length.should == 6
|
93
95
|
m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
|
94
96
|
end
|
@@ -150,30 +152,17 @@ describe MetaInspector do
|
|
150
152
|
end
|
151
153
|
|
152
154
|
it "should get correct absolute links for internal pages" do
|
153
|
-
m
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
"http://jaimeiniesta.com/",
|
165
|
-
"http://mendicantuniversity.org/",
|
166
|
-
"http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
|
167
|
-
"http://majesticseacreature.com/",
|
168
|
-
"http://school.mendicantuniversity.org/alumni/2011",
|
169
|
-
"https://github.com/jaimeiniesta/w3clove",
|
170
|
-
"http://w3clove.com",
|
171
|
-
"http://w3clove.com/api_v1_reference",
|
172
|
-
"https://twitter.com/w3clove",
|
173
|
-
"http://twitter.com/share",
|
174
|
-
"http://w3clove.com/terms_of_service",
|
175
|
-
"http://twitter.com/W3CLove",
|
176
|
-
"http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
|
155
|
+
@m.internal_links.should == [ "http://pagerankalert.com/",
|
156
|
+
"http://pagerankalert.com/es?language=es",
|
157
|
+
"http://pagerankalert.com/users/sign_up",
|
158
|
+
"http://pagerankalert.com/users/sign_in" ]
|
159
|
+
end
|
160
|
+
|
161
|
+
it "should get correct absolute links for external pages" do
|
162
|
+
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
163
|
+
"http://pagerankalert.posterous.com",
|
164
|
+
"http://twitter.com/pagerankalert",
|
165
|
+
"http://twitter.com/share" ]
|
177
166
|
end
|
178
167
|
|
179
168
|
it "should get correct absolute links, correcting relative links from URL not ending with slash" do
|
@@ -302,7 +291,7 @@ describe MetaInspector do
|
|
302
291
|
describe 'to_hash' do
|
303
292
|
it "should return a hash with all the values set" do
|
304
293
|
@m = MetaInspector.new('http://pagerankalert.com')
|
305
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
294
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
|
306
295
|
end
|
307
296
|
end
|
308
297
|
|
@@ -316,7 +305,7 @@ describe MetaInspector do
|
|
316
305
|
end
|
317
306
|
|
318
307
|
it "should handle timeouts" do
|
319
|
-
impatient = MetaInspector.new('http://
|
308
|
+
impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
|
320
309
|
|
321
310
|
expect {
|
322
311
|
title = impatient.title
|
@@ -335,6 +324,42 @@ describe MetaInspector do
|
|
335
324
|
nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
|
336
325
|
end
|
337
326
|
|
327
|
+
it "should parse images when parse_html_content_type_only is not specified" do
|
328
|
+
image_url = MetaInspector.new('http://pagerankalert.com/image.png')
|
329
|
+
desc = image_url.description
|
330
|
+
|
331
|
+
image_url.errors == nil
|
332
|
+
image_url.parsed? == true
|
333
|
+
end
|
334
|
+
|
335
|
+
it "should parse images when parse_html_content_type_only is false" do
|
336
|
+
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
|
337
|
+
desc = image_url.description
|
338
|
+
|
339
|
+
image_url.errors == nil
|
340
|
+
image_url.parsed? == true
|
341
|
+
end
|
342
|
+
|
343
|
+
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
344
|
+
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
345
|
+
|
346
|
+
expect {
|
347
|
+
title = image_url.title
|
348
|
+
}.to change { image_url.errors.size }
|
349
|
+
|
350
|
+
image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
|
351
|
+
end
|
352
|
+
|
353
|
+
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
354
|
+
tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
355
|
+
|
356
|
+
expect {
|
357
|
+
title = tar_url.title
|
358
|
+
}.to change { tar_url.errors.size }
|
359
|
+
|
360
|
+
tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
|
361
|
+
end
|
362
|
+
|
338
363
|
describe "parsed?" do
|
339
364
|
it "should return true if we have a parsed document" do
|
340
365
|
good = MetaInspector.new('http://pagerankalert.com')
|
@@ -344,12 +369,46 @@ describe MetaInspector do
|
|
344
369
|
end
|
345
370
|
|
346
371
|
it "should return false if we don't have a parsed document" do
|
347
|
-
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
|
372
|
+
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
|
348
373
|
title = bad.title
|
349
374
|
|
350
375
|
bad.parsed?.should == false
|
351
376
|
end
|
377
|
+
|
378
|
+
it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
|
379
|
+
tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
380
|
+
title = tar.title
|
381
|
+
|
382
|
+
tar.parsed?.should == false
|
383
|
+
end
|
352
384
|
end
|
353
385
|
end
|
354
386
|
|
387
|
+
describe "content_type" do
|
388
|
+
it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
|
389
|
+
good = MetaInspector.new('http://pagerankalert.com/image.png')
|
390
|
+
title = good.title
|
391
|
+
|
392
|
+
good.parsed?.should == true
|
393
|
+
good.content_type == "image/jpeg"
|
394
|
+
end
|
395
|
+
|
396
|
+
it "should return the correct content type of the url if it is parsed correctly even for html pages" do
|
397
|
+
good = MetaInspector.new('http://pagerankalert.com')
|
398
|
+
title = good.title
|
399
|
+
|
400
|
+
good.parsed?.should == true
|
401
|
+
good.content_type == "text/html"
|
402
|
+
end
|
403
|
+
|
404
|
+
it "should return the correct content type of the url if it is not parsed correctly" do
|
405
|
+
bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
406
|
+
title = bad.title
|
407
|
+
|
408
|
+
bad.parsed?.should == false
|
409
|
+
bad.content_type == "image/jpeg"
|
410
|
+
end
|
411
|
+
|
412
|
+
end
|
413
|
+
|
355
414
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 63
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 9
|
9
|
-
- 11
|
10
|
-
version: 1.9.11
|
8
|
+
- 10
|
9
|
+
- 0
|
10
|
+
version: 1.10.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-11-
|
18
|
+
date: 2012-11-15 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -146,14 +146,14 @@ files:
|
|
146
146
|
- spec/fixtures/guardian.co.uk.response
|
147
147
|
- spec/fixtures/international.response
|
148
148
|
- spec/fixtures/iteh.at.response
|
149
|
+
- spec/fixtures/markupvalidator_faqs.response
|
149
150
|
- spec/fixtures/nonhttp.response
|
150
151
|
- spec/fixtures/pagerankalert.com.response
|
151
152
|
- spec/fixtures/protocol_relative.response
|
152
153
|
- spec/fixtures/tea-tron.com.response
|
153
154
|
- spec/fixtures/theonion-no-description.com.response
|
154
155
|
- spec/fixtures/theonion.com.response
|
155
|
-
- spec/fixtures/twitter_w3clove.response
|
156
|
-
- spec/fixtures/w3clove_faqs.response
|
156
|
+
- spec/fixtures/twitter_markupvalidator.response
|
157
157
|
- spec/fixtures/wordpress_site.response
|
158
158
|
- spec/fixtures/youtube.response
|
159
159
|
- spec/metainspector_spec.rb
|