metainspector 1.16.1 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,547 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
- require File.join(File.dirname(__FILE__), "/spec_helper")
4
-
5
- describe MetaInspector do
6
- describe 'Initialization' do
7
- it 'should accept an URL with a scheme' do
8
- MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
9
- end
10
-
11
- it "should use http:// as a default scheme" do
12
- MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
13
- end
14
-
15
- it "should accept an URL with international characters" do
16
- MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
17
- end
18
-
19
- it "should store the scheme" do
20
- MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
21
- MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
22
- MetaInspector.new('pagerankalert.com').scheme.should == 'http'
23
- end
24
-
25
- it "should store the host" do
26
- MetaInspector.new('http://pagerankalert.com').host.should == 'pagerankalert.com'
27
- MetaInspector.new('https://pagerankalert.com').host.should == 'pagerankalert.com'
28
- MetaInspector.new('pagerankalert.com').host.should == 'pagerankalert.com'
29
- end
30
-
31
- it "should store the root url" do
32
- MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
33
- MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
34
- MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
35
- MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
36
- end
37
- end
38
-
39
- describe 'Doing a basic scrape' do
40
- EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes & receive alerts'
41
-
42
- before(:each) do
43
- @m = MetaInspector.new('http://pagerankalert.com')
44
- end
45
-
46
- it "should get the title" do
47
- @m.title.should == EXPECTED_TITLE
48
- end
49
-
50
- it "should not find an image" do
51
- @m.image.should == nil
52
- end
53
-
54
- describe "get image" do
55
- it "should find the og image" do
56
- @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
57
- @m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
58
- @m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
59
- end
60
-
61
- it "should find image on youtube" do
62
- MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc').image.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
63
- end
64
- end
65
-
66
- describe "get images" do
67
- it "should find all page images" do
68
- @m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
69
- end
70
-
71
- it "should find images on twitter" do
72
- m = MetaInspector.new('https://twitter.com/markupvalidator')
73
- m.images.length.should == 6
74
- m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
75
- end
76
- end
77
-
78
- it "should ignore malformed image tags" do
79
- # There is an image tag without a source. The scraper should not fatal.
80
- @m = MetaInspector.new("http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups")
81
- @m.images.size.should == 11
82
- end
83
-
84
- it "should have a Nokogiri::HTML::Document as parsed_document" do
85
- @m.parsed_document.class.should == Nokogiri::HTML::Document
86
- end
87
-
88
- it "should have a String as document" do
89
- @m.document.class.should == String
90
- end
91
-
92
- describe "Feed" do
93
- it "should get rss feed" do
94
- @m = MetaInspector.new('http://www.iteh.at')
95
- @m.feed.should == 'http://www.iteh.at/de/rss/'
96
- end
97
-
98
- it "should get atom feed" do
99
- @m = MetaInspector.new('http://www.tea-tron.com/jbravo/blog/')
100
- @m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
101
- end
102
-
103
- it "should return nil if no feed found" do
104
- @m = MetaInspector.new('http://www.alazan.com')
105
- @m.feed.should == nil
106
- end
107
- end
108
-
109
- describe "get description" do
110
- it "should find description on youtube" do
111
- MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc').description.should == ""
112
- end
113
- end
114
- end
115
-
116
- describe 'Doing a basic scrape from passed url html' do
117
-
118
- before(:each) do
119
- @m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
120
- end
121
-
122
- it "should get correct links when the url html is passed as an option" do
123
- @m.links.should == ["http://cnn.com/hello"]
124
- end
125
-
126
- it "should get the title" do
127
- @m.title.should == "Hello From Passed Html"
128
- end
129
- end
130
-
131
- describe 'Page with missing meta description' do
132
- it "should find secondary description" do
133
- @m = MetaInspector.new('http://theonion-no-description.com')
134
- @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
135
- " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
136
- end
137
- end
138
-
139
- describe 'Links' do
140
- before(:each) do
141
- @m = MetaInspector.new('http://pagerankalert.com')
142
- end
143
-
144
- it "should get the links" do
145
- @m.links.should == [ "http://pagerankalert.com/",
146
- "http://pagerankalert.com/es?language=es",
147
- "http://pagerankalert.com/users/sign_up",
148
- "http://pagerankalert.com/users/sign_in",
149
- "mailto:pagerankalert@gmail.com",
150
- "http://pagerankalert.posterous.com/",
151
- "http://twitter.com/pagerankalert",
152
- "http://twitter.com/share" ]
153
- end
154
-
155
- it "should get correct absolute links for internal pages" do
156
- @m.internal_links.should == [ "http://pagerankalert.com/",
157
- "http://pagerankalert.com/es?language=es",
158
- "http://pagerankalert.com/users/sign_up",
159
- "http://pagerankalert.com/users/sign_in" ]
160
- end
161
-
162
- it "should get correct absolute links for external pages" do
163
- @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
164
- "http://pagerankalert.posterous.com/",
165
- "http://twitter.com/pagerankalert",
166
- "http://twitter.com/share" ]
167
- end
168
-
169
- it "should get correct absolute links, correcting relative links from URL not ending with slash" do
170
- m = MetaInspector.new('http://alazan.com/websolution.asp')
171
- m.links.should == [ "http://alazan.com/index.asp",
172
- "http://alazan.com/faqs.asp" ]
173
- end
174
-
175
- it "should return empty array if no links found" do
176
- m = MetaInspector.new('http://example.com/empty')
177
- m.links.should == []
178
- end
179
-
180
- describe "links with international characters" do
181
- it "should get correct absolute links, encoding the URLs as needed" do
182
- m = MetaInspector.new('http://international.com')
183
- m.links.should == [ "http://international.com/espa%C3%B1a.asp",
184
- "http://international.com/roman%C3%A9e",
185
- "http://international.com/faqs#cami%C3%B3n",
186
- "http://international.com/search?q=cami%C3%B3n",
187
- "http://international.com/search?q=espa%C3%B1a#top",
188
- "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
189
- "http://example.com/espa%C3%B1a.asp",
190
- "http://example.com/roman%C3%A9e",
191
- "http://example.com/faqs#cami%C3%B3n",
192
- "http://example.com/search?q=cami%C3%B3n",
193
- "http://example.com/search?q=espa%C3%B1a#top"]
194
- end
195
-
196
- describe "internal links" do
197
- it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
198
- m = MetaInspector.new('http://international.com')
199
- m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
200
- "http://international.com/roman%C3%A9e",
201
- "http://international.com/faqs#cami%C3%B3n",
202
- "http://international.com/search?q=cami%C3%B3n",
203
- "http://international.com/search?q=espa%C3%B1a#top",
204
- "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
205
- end
206
-
207
- it "should not crash when processing malformed hrefs" do
208
- m = MetaInspector.new('http://example.com/malformed_href')
209
- expect {
210
- m.internal_links.should == [ "http://example.com/faqs" ]
211
- m.should_not be_ok
212
- }.to_not raise_error
213
- end
214
- end
215
-
216
- describe "external links" do
217
- it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
218
- m = MetaInspector.new('http://international.com')
219
- m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
220
- "http://example.com/roman%C3%A9e",
221
- "http://example.com/faqs#cami%C3%B3n",
222
- "http://example.com/search?q=cami%C3%B3n",
223
- "http://example.com/search?q=espa%C3%B1a#top"]
224
- end
225
-
226
- it "should not crash when processing malformed hrefs" do
227
- m = MetaInspector.new('http://example.com/malformed_href')
228
- expect {
229
- m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
230
- "javascript:alert('ok');", "javascript://", "mailto:email(at)example.com"]
231
- m.should_not be_ok
232
- }.to_not raise_error
233
- end
234
- end
235
- end
236
-
237
- it "should not crash with links that have weird href values" do
238
- m = MetaInspector.new('http://example.com/invalid_href')
239
- m.links.should == ["%3Cp%3Eftp://ftp.cdrom.com", "skype:joeuser?call", "telnet://telnet.cdrom.com"]
240
- end
241
- end
242
-
243
- describe 'Relative links' do
244
- describe 'From a root URL' do
245
- before(:each) do
246
- @m = MetaInspector.new('http://relative.com/')
247
- end
248
-
249
- it 'should get the relative links' do
250
- @m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
251
- end
252
- end
253
-
254
- describe 'From a document' do
255
- before(:each) do
256
- @m = MetaInspector.new('http://relative.com/company')
257
- end
258
-
259
- it 'should get the relative links' do
260
- @m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
261
- end
262
- end
263
-
264
- describe 'From a directory' do
265
- before(:each) do
266
- @m = MetaInspector.new('http://relative.com/company/')
267
- end
268
-
269
- it 'should get the relative links' do
270
- @m.internal_links.should == ['http://relative.com/company/about', 'http://relative.com/sitemap']
271
- end
272
- end
273
- end
274
-
275
- describe 'Relative links with base' do
276
- it 'should get the relative links from a document' do
277
- m = MetaInspector.new('http://relativewithbase.com/company/page2')
278
- m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
279
- end
280
-
281
- it 'should get the relative links from a directory' do
282
- m = MetaInspector.new('http://relativewithbase.com/company/page2/')
283
- m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
284
- end
285
- end
286
-
287
- describe 'Non-HTTP links' do
288
- before(:each) do
289
- @m = MetaInspector.new('http://example.com/nonhttp')
290
- end
291
-
292
- it "should get the links" do
293
- @m.links.sort.should == [
294
- "ftp://ftp.cdrom.com/",
295
- "javascript:alert('hey');",
296
- "mailto:user@example.com",
297
- "skype:joeuser?call",
298
- "telnet://telnet.cdrom.com"
299
- ]
300
- end
301
- end
302
-
303
- describe 'Protocol-relative URLs' do
304
- before(:each) do
305
- @m_http = MetaInspector.new('http://protocol-relative.com')
306
- @m_https = MetaInspector.new('https://protocol-relative.com')
307
- end
308
-
309
- it "should convert protocol-relative links to http" do
310
- @m_http.links.should include('http://protocol-relative.com/contact')
311
- @m_http.links.should include('http://yahoo.com/')
312
- end
313
-
314
- it "should convert protocol-relative links to https" do
315
- @m_https.links.should include('https://protocol-relative.com/contact')
316
- @m_https.links.should include('https://yahoo.com/')
317
- end
318
- end
319
-
320
- describe 'Getting meta tags by ghost methods' do
321
- before(:each) do
322
- @m = MetaInspector.new('http://pagerankalert.com')
323
- end
324
-
325
- it "should get the robots meta tag" do
326
- @m.meta_robots.should == 'all,follow'
327
- end
328
-
329
- it "should get the robots meta tag" do
330
- @m.meta_RoBoTs.should == 'all,follow'
331
- end
332
-
333
- it "should get the description meta tag" do
334
- @m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
335
- end
336
-
337
- it "should get the keywords meta tag" do
338
- @m.meta_keywords.should == "pagerank, seo, optimization, google"
339
- end
340
-
341
- it "should get the content-language meta tag" do
342
- pending "mocks"
343
- @m.meta_content_language.should == "en"
344
- end
345
-
346
- it "should get the Csrf_pAram meta tag" do
347
- @m.meta_Csrf_pAram.should == "authenticity_token"
348
- end
349
-
350
- it "should return nil for nonfound meta_tags" do
351
- @m.meta_lollypop.should == nil
352
- end
353
-
354
- it "should get the generator meta tag" do
355
- @m = MetaInspector.new('http://www.inkthemes.com/')
356
- @m.meta_generator.should == 'WordPress 3.4.2'
357
- end
358
-
359
- it "should find a meta_og_title" do
360
- @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
361
- @m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
362
- end
363
-
364
- it "should not find a meta_og_something" do
365
- @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
366
- @m.meta_og_something.should == nil
367
- end
368
-
369
- it "should find a meta_twitter_site" do
370
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
371
- @m.meta_twitter_site.should == "@youtube"
372
- end
373
-
374
- it "should find a meta_twitter_player_width" do
375
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
376
- @m.meta_twitter_player_width.should == "1920"
377
- end
378
-
379
- it "should not find a meta_twitter_dummy" do
380
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
381
- @m.meta_twitter_dummy.should == nil
382
- end
383
-
384
- it "should find a meta_og_video_width" do
385
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
386
- @m.meta_og_video_width.should == "1920"
387
- end
388
- end
389
-
390
- describe 'Charset detection' do
391
- it "should get the charset from <meta charset />" do
392
- @m = MetaInspector.new('http://charset001.com')
393
- @m.charset.should == "utf-8"
394
- end
395
-
396
- it "should get the charset from meta content type" do
397
- @m = MetaInspector.new('http://charset002.com')
398
- @m.charset.should == "windows-1252"
399
- end
400
-
401
- it "should get nil if no declared charset is found" do
402
- @m = MetaInspector.new('http://charset000.com')
403
- @m.charset.should == nil
404
- end
405
- end
406
-
407
- describe 'to_hash' do
408
- it "should return a hash with all the values set" do
409
- @m = MetaInspector.new('http://pagerankalert.com')
410
- @m.to_hash.should == {
411
- "url" =>"http://pagerankalert.com/",
412
- "title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
413
- "links" => ["http://pagerankalert.com/",
414
- "http://pagerankalert.com/es?language=es",
415
- "http://pagerankalert.com/users/sign_up",
416
- "http://pagerankalert.com/users/sign_in",
417
- "mailto:pagerankalert@gmail.com",
418
- "http://pagerankalert.posterous.com/",
419
- "http://twitter.com/pagerankalert",
420
- "http://twitter.com/share"],
421
- "internal_links" => ["http://pagerankalert.com/",
422
- "http://pagerankalert.com/es?language=es",
423
- "http://pagerankalert.com/users/sign_up",
424
- "http://pagerankalert.com/users/sign_in"],
425
- "external_links" => ["mailto:pagerankalert@gmail.com",
426
- "http://pagerankalert.posterous.com/",
427
- "http://twitter.com/pagerankalert",
428
- "http://twitter.com/share"],
429
- "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
430
- "charset" => "utf-8",
431
- "feed" => "http://feeds.feedburner.com/PageRankAlert",
432
- "content_type" =>"text/html",
433
- "meta" => {
434
- "name" => {
435
- "description"=> "Track your PageRank(TM) changes and receive alerts by email",
436
- "keywords" => "pagerank, seo, optimization, google",
437
- "robots" => "all,follow",
438
- "csrf_param" => "authenticity_token",
439
- "csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
440
- },
441
- "property"=>{}
442
- }
443
- }
444
- end
445
- end
446
-
447
- describe 'exception handling' do
448
- before(:each) do
449
- FakeWeb.allow_net_connect = true
450
- end
451
-
452
- after(:each) do
453
- FakeWeb.allow_net_connect = false
454
- end
455
-
456
- it "should handle timeouts" do
457
- impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
458
-
459
- expect {
460
- title = impatient.title
461
- }.to change { impatient.errors.size }
462
-
463
- impatient.errors.first.should == "Timeout!!!"
464
- end
465
-
466
- it "should handle socket errors" do
467
- nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
468
-
469
- expect {
470
- title = nowhere.title
471
- }.to change { nowhere.errors.size }
472
-
473
- nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
474
- end
475
-
476
- it "should parse images when parse_html_content_type_only is not specified" do
477
- image_url = MetaInspector.new('http://pagerankalert.com/image.png')
478
- desc = image_url.description
479
-
480
- image_url.should be_ok
481
- end
482
-
483
- it "should parse images when parse_html_content_type_only is false" do
484
- image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
485
- desc = image_url.description
486
-
487
- image_url.should be_ok
488
- end
489
-
490
- it "should handle errors when content is image/jpeg and html_content_type_only is true" do
491
- image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
492
-
493
- expect {
494
- title = image_url.title
495
- }.to change { image_url.errors.size }
496
-
497
- image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
498
- end
499
-
500
- it "should handle errors when content is not text/html and html_content_type_only is true" do
501
- tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
502
-
503
- expect {
504
- title = tar_url.title
505
- }.to change { tar_url.errors.size }
506
-
507
- tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
508
- end
509
-
510
- describe "ok?" do
511
- it "should return true if we have no errors" do
512
- good = MetaInspector.new('http://pagerankalert.com')
513
- good.to_hash
514
-
515
- good.should be_ok
516
- end
517
-
518
- it "should return false if there are errors" do
519
- bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
520
- bad.title
521
-
522
- bad.should_not be_ok
523
- end
524
-
525
- it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
526
- tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
527
- title = tar.title
528
-
529
- tar.should_not be_ok
530
- end
531
- end
532
- end
533
-
534
- describe "content_type" do
535
- it "should return the correct content type of the url for non html pages" do
536
- good = MetaInspector.new('http://pagerankalert.com/image.png')
537
-
538
- good.content_type.should == "image/png"
539
- end
540
-
541
- it "should return the correct content type of the url for html pages" do
542
- good = MetaInspector.new('http://pagerankalert.com')
543
-
544
- good.content_type.should == "text/html"
545
- end
546
- end
547
- end