metainspector 1.16.1 → 1.17.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,547 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
- require File.join(File.dirname(__FILE__), "/spec_helper")
4
-
5
- describe MetaInspector do
6
- describe 'Initialization' do
7
- it 'should accept an URL with a scheme' do
8
- MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
9
- end
10
-
11
- it "should use http:// as a default scheme" do
12
- MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
13
- end
14
-
15
- it "should accept an URL with international characters" do
16
- MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
17
- end
18
-
19
- it "should store the scheme" do
20
- MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
21
- MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
22
- MetaInspector.new('pagerankalert.com').scheme.should == 'http'
23
- end
24
-
25
- it "should store the host" do
26
- MetaInspector.new('http://pagerankalert.com').host.should == 'pagerankalert.com'
27
- MetaInspector.new('https://pagerankalert.com').host.should == 'pagerankalert.com'
28
- MetaInspector.new('pagerankalert.com').host.should == 'pagerankalert.com'
29
- end
30
-
31
- it "should store the root url" do
32
- MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
33
- MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
34
- MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
35
- MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
36
- end
37
- end
38
-
39
- describe 'Doing a basic scrape' do
40
- EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes & receive alerts'
41
-
42
- before(:each) do
43
- @m = MetaInspector.new('http://pagerankalert.com')
44
- end
45
-
46
- it "should get the title" do
47
- @m.title.should == EXPECTED_TITLE
48
- end
49
-
50
- it "should not find an image" do
51
- @m.image.should == nil
52
- end
53
-
54
- describe "get image" do
55
- it "should find the og image" do
56
- @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
57
- @m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
58
- @m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
59
- end
60
-
61
- it "should find image on youtube" do
62
- MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc').image.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
63
- end
64
- end
65
-
66
- describe "get images" do
67
- it "should find all page images" do
68
- @m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
69
- end
70
-
71
- it "should find images on twitter" do
72
- m = MetaInspector.new('https://twitter.com/markupvalidator')
73
- m.images.length.should == 6
74
- m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
75
- end
76
- end
77
-
78
- it "should ignore malformed image tags" do
79
- # There is an image tag without a source. The scraper should not fatal.
80
- @m = MetaInspector.new("http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups")
81
- @m.images.size.should == 11
82
- end
83
-
84
- it "should have a Nokogiri::HTML::Document as parsed_document" do
85
- @m.parsed_document.class.should == Nokogiri::HTML::Document
86
- end
87
-
88
- it "should have a String as document" do
89
- @m.document.class.should == String
90
- end
91
-
92
- describe "Feed" do
93
- it "should get rss feed" do
94
- @m = MetaInspector.new('http://www.iteh.at')
95
- @m.feed.should == 'http://www.iteh.at/de/rss/'
96
- end
97
-
98
- it "should get atom feed" do
99
- @m = MetaInspector.new('http://www.tea-tron.com/jbravo/blog/')
100
- @m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
101
- end
102
-
103
- it "should return nil if no feed found" do
104
- @m = MetaInspector.new('http://www.alazan.com')
105
- @m.feed.should == nil
106
- end
107
- end
108
-
109
- describe "get description" do
110
- it "should find description on youtube" do
111
- MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc').description.should == ""
112
- end
113
- end
114
- end
115
-
116
- describe 'Doing a basic scrape from passed url html' do
117
-
118
- before(:each) do
119
- @m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
120
- end
121
-
122
- it "should get correct links when the url html is passed as an option" do
123
- @m.links.should == ["http://cnn.com/hello"]
124
- end
125
-
126
- it "should get the title" do
127
- @m.title.should == "Hello From Passed Html"
128
- end
129
- end
130
-
131
- describe 'Page with missing meta description' do
132
- it "should find secondary description" do
133
- @m = MetaInspector.new('http://theonion-no-description.com')
134
- @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
135
- " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
136
- end
137
- end
138
-
139
- describe 'Links' do
140
- before(:each) do
141
- @m = MetaInspector.new('http://pagerankalert.com')
142
- end
143
-
144
- it "should get the links" do
145
- @m.links.should == [ "http://pagerankalert.com/",
146
- "http://pagerankalert.com/es?language=es",
147
- "http://pagerankalert.com/users/sign_up",
148
- "http://pagerankalert.com/users/sign_in",
149
- "mailto:pagerankalert@gmail.com",
150
- "http://pagerankalert.posterous.com/",
151
- "http://twitter.com/pagerankalert",
152
- "http://twitter.com/share" ]
153
- end
154
-
155
- it "should get correct absolute links for internal pages" do
156
- @m.internal_links.should == [ "http://pagerankalert.com/",
157
- "http://pagerankalert.com/es?language=es",
158
- "http://pagerankalert.com/users/sign_up",
159
- "http://pagerankalert.com/users/sign_in" ]
160
- end
161
-
162
- it "should get correct absolute links for external pages" do
163
- @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
164
- "http://pagerankalert.posterous.com/",
165
- "http://twitter.com/pagerankalert",
166
- "http://twitter.com/share" ]
167
- end
168
-
169
- it "should get correct absolute links, correcting relative links from URL not ending with slash" do
170
- m = MetaInspector.new('http://alazan.com/websolution.asp')
171
- m.links.should == [ "http://alazan.com/index.asp",
172
- "http://alazan.com/faqs.asp" ]
173
- end
174
-
175
- it "should return empty array if no links found" do
176
- m = MetaInspector.new('http://example.com/empty')
177
- m.links.should == []
178
- end
179
-
180
- describe "links with international characters" do
181
- it "should get correct absolute links, encoding the URLs as needed" do
182
- m = MetaInspector.new('http://international.com')
183
- m.links.should == [ "http://international.com/espa%C3%B1a.asp",
184
- "http://international.com/roman%C3%A9e",
185
- "http://international.com/faqs#cami%C3%B3n",
186
- "http://international.com/search?q=cami%C3%B3n",
187
- "http://international.com/search?q=espa%C3%B1a#top",
188
- "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
189
- "http://example.com/espa%C3%B1a.asp",
190
- "http://example.com/roman%C3%A9e",
191
- "http://example.com/faqs#cami%C3%B3n",
192
- "http://example.com/search?q=cami%C3%B3n",
193
- "http://example.com/search?q=espa%C3%B1a#top"]
194
- end
195
-
196
- describe "internal links" do
197
- it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
198
- m = MetaInspector.new('http://international.com')
199
- m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
200
- "http://international.com/roman%C3%A9e",
201
- "http://international.com/faqs#cami%C3%B3n",
202
- "http://international.com/search?q=cami%C3%B3n",
203
- "http://international.com/search?q=espa%C3%B1a#top",
204
- "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
205
- end
206
-
207
- it "should not crash when processing malformed hrefs" do
208
- m = MetaInspector.new('http://example.com/malformed_href')
209
- expect {
210
- m.internal_links.should == [ "http://example.com/faqs" ]
211
- m.should_not be_ok
212
- }.to_not raise_error
213
- end
214
- end
215
-
216
- describe "external links" do
217
- it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
218
- m = MetaInspector.new('http://international.com')
219
- m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
220
- "http://example.com/roman%C3%A9e",
221
- "http://example.com/faqs#cami%C3%B3n",
222
- "http://example.com/search?q=cami%C3%B3n",
223
- "http://example.com/search?q=espa%C3%B1a#top"]
224
- end
225
-
226
- it "should not crash when processing malformed hrefs" do
227
- m = MetaInspector.new('http://example.com/malformed_href')
228
- expect {
229
- m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
230
- "javascript:alert('ok');", "javascript://", "mailto:email(at)example.com"]
231
- m.should_not be_ok
232
- }.to_not raise_error
233
- end
234
- end
235
- end
236
-
237
- it "should not crash with links that have weird href values" do
238
- m = MetaInspector.new('http://example.com/invalid_href')
239
- m.links.should == ["%3Cp%3Eftp://ftp.cdrom.com", "skype:joeuser?call", "telnet://telnet.cdrom.com"]
240
- end
241
- end
242
-
243
- describe 'Relative links' do
244
- describe 'From a root URL' do
245
- before(:each) do
246
- @m = MetaInspector.new('http://relative.com/')
247
- end
248
-
249
- it 'should get the relative links' do
250
- @m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
251
- end
252
- end
253
-
254
- describe 'From a document' do
255
- before(:each) do
256
- @m = MetaInspector.new('http://relative.com/company')
257
- end
258
-
259
- it 'should get the relative links' do
260
- @m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
261
- end
262
- end
263
-
264
- describe 'From a directory' do
265
- before(:each) do
266
- @m = MetaInspector.new('http://relative.com/company/')
267
- end
268
-
269
- it 'should get the relative links' do
270
- @m.internal_links.should == ['http://relative.com/company/about', 'http://relative.com/sitemap']
271
- end
272
- end
273
- end
274
-
275
- describe 'Relative links with base' do
276
- it 'should get the relative links from a document' do
277
- m = MetaInspector.new('http://relativewithbase.com/company/page2')
278
- m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
279
- end
280
-
281
- it 'should get the relative links from a directory' do
282
- m = MetaInspector.new('http://relativewithbase.com/company/page2/')
283
- m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
284
- end
285
- end
286
-
287
- describe 'Non-HTTP links' do
288
- before(:each) do
289
- @m = MetaInspector.new('http://example.com/nonhttp')
290
- end
291
-
292
- it "should get the links" do
293
- @m.links.sort.should == [
294
- "ftp://ftp.cdrom.com/",
295
- "javascript:alert('hey');",
296
- "mailto:user@example.com",
297
- "skype:joeuser?call",
298
- "telnet://telnet.cdrom.com"
299
- ]
300
- end
301
- end
302
-
303
- describe 'Protocol-relative URLs' do
304
- before(:each) do
305
- @m_http = MetaInspector.new('http://protocol-relative.com')
306
- @m_https = MetaInspector.new('https://protocol-relative.com')
307
- end
308
-
309
- it "should convert protocol-relative links to http" do
310
- @m_http.links.should include('http://protocol-relative.com/contact')
311
- @m_http.links.should include('http://yahoo.com/')
312
- end
313
-
314
- it "should convert protocol-relative links to https" do
315
- @m_https.links.should include('https://protocol-relative.com/contact')
316
- @m_https.links.should include('https://yahoo.com/')
317
- end
318
- end
319
-
320
- describe 'Getting meta tags by ghost methods' do
321
- before(:each) do
322
- @m = MetaInspector.new('http://pagerankalert.com')
323
- end
324
-
325
- it "should get the robots meta tag" do
326
- @m.meta_robots.should == 'all,follow'
327
- end
328
-
329
- it "should get the robots meta tag" do
330
- @m.meta_RoBoTs.should == 'all,follow'
331
- end
332
-
333
- it "should get the description meta tag" do
334
- @m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
335
- end
336
-
337
- it "should get the keywords meta tag" do
338
- @m.meta_keywords.should == "pagerank, seo, optimization, google"
339
- end
340
-
341
- it "should get the content-language meta tag" do
342
- pending "mocks"
343
- @m.meta_content_language.should == "en"
344
- end
345
-
346
- it "should get the Csrf_pAram meta tag" do
347
- @m.meta_Csrf_pAram.should == "authenticity_token"
348
- end
349
-
350
- it "should return nil for nonfound meta_tags" do
351
- @m.meta_lollypop.should == nil
352
- end
353
-
354
- it "should get the generator meta tag" do
355
- @m = MetaInspector.new('http://www.inkthemes.com/')
356
- @m.meta_generator.should == 'WordPress 3.4.2'
357
- end
358
-
359
- it "should find a meta_og_title" do
360
- @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
361
- @m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
362
- end
363
-
364
- it "should not find a meta_og_something" do
365
- @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
366
- @m.meta_og_something.should == nil
367
- end
368
-
369
- it "should find a meta_twitter_site" do
370
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
371
- @m.meta_twitter_site.should == "@youtube"
372
- end
373
-
374
- it "should find a meta_twitter_player_width" do
375
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
376
- @m.meta_twitter_player_width.should == "1920"
377
- end
378
-
379
- it "should not find a meta_twitter_dummy" do
380
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
381
- @m.meta_twitter_dummy.should == nil
382
- end
383
-
384
- it "should find a meta_og_video_width" do
385
- @m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
386
- @m.meta_og_video_width.should == "1920"
387
- end
388
- end
389
-
390
- describe 'Charset detection' do
391
- it "should get the charset from <meta charset />" do
392
- @m = MetaInspector.new('http://charset001.com')
393
- @m.charset.should == "utf-8"
394
- end
395
-
396
- it "should get the charset from meta content type" do
397
- @m = MetaInspector.new('http://charset002.com')
398
- @m.charset.should == "windows-1252"
399
- end
400
-
401
- it "should get nil if no declared charset is found" do
402
- @m = MetaInspector.new('http://charset000.com')
403
- @m.charset.should == nil
404
- end
405
- end
406
-
407
- describe 'to_hash' do
408
- it "should return a hash with all the values set" do
409
- @m = MetaInspector.new('http://pagerankalert.com')
410
- @m.to_hash.should == {
411
- "url" =>"http://pagerankalert.com/",
412
- "title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
413
- "links" => ["http://pagerankalert.com/",
414
- "http://pagerankalert.com/es?language=es",
415
- "http://pagerankalert.com/users/sign_up",
416
- "http://pagerankalert.com/users/sign_in",
417
- "mailto:pagerankalert@gmail.com",
418
- "http://pagerankalert.posterous.com/",
419
- "http://twitter.com/pagerankalert",
420
- "http://twitter.com/share"],
421
- "internal_links" => ["http://pagerankalert.com/",
422
- "http://pagerankalert.com/es?language=es",
423
- "http://pagerankalert.com/users/sign_up",
424
- "http://pagerankalert.com/users/sign_in"],
425
- "external_links" => ["mailto:pagerankalert@gmail.com",
426
- "http://pagerankalert.posterous.com/",
427
- "http://twitter.com/pagerankalert",
428
- "http://twitter.com/share"],
429
- "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
430
- "charset" => "utf-8",
431
- "feed" => "http://feeds.feedburner.com/PageRankAlert",
432
- "content_type" =>"text/html",
433
- "meta" => {
434
- "name" => {
435
- "description"=> "Track your PageRank(TM) changes and receive alerts by email",
436
- "keywords" => "pagerank, seo, optimization, google",
437
- "robots" => "all,follow",
438
- "csrf_param" => "authenticity_token",
439
- "csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
440
- },
441
- "property"=>{}
442
- }
443
- }
444
- end
445
- end
446
-
447
- describe 'exception handling' do
448
- before(:each) do
449
- FakeWeb.allow_net_connect = true
450
- end
451
-
452
- after(:each) do
453
- FakeWeb.allow_net_connect = false
454
- end
455
-
456
- it "should handle timeouts" do
457
- impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
458
-
459
- expect {
460
- title = impatient.title
461
- }.to change { impatient.errors.size }
462
-
463
- impatient.errors.first.should == "Timeout!!!"
464
- end
465
-
466
- it "should handle socket errors" do
467
- nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
468
-
469
- expect {
470
- title = nowhere.title
471
- }.to change { nowhere.errors.size }
472
-
473
- nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
474
- end
475
-
476
- it "should parse images when parse_html_content_type_only is not specified" do
477
- image_url = MetaInspector.new('http://pagerankalert.com/image.png')
478
- desc = image_url.description
479
-
480
- image_url.should be_ok
481
- end
482
-
483
- it "should parse images when parse_html_content_type_only is false" do
484
- image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
485
- desc = image_url.description
486
-
487
- image_url.should be_ok
488
- end
489
-
490
- it "should handle errors when content is image/jpeg and html_content_type_only is true" do
491
- image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
492
-
493
- expect {
494
- title = image_url.title
495
- }.to change { image_url.errors.size }
496
-
497
- image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
498
- end
499
-
500
- it "should handle errors when content is not text/html and html_content_type_only is true" do
501
- tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
502
-
503
- expect {
504
- title = tar_url.title
505
- }.to change { tar_url.errors.size }
506
-
507
- tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
508
- end
509
-
510
- describe "ok?" do
511
- it "should return true if we have no errors" do
512
- good = MetaInspector.new('http://pagerankalert.com')
513
- good.to_hash
514
-
515
- good.should be_ok
516
- end
517
-
518
- it "should return false if there are errors" do
519
- bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
520
- bad.title
521
-
522
- bad.should_not be_ok
523
- end
524
-
525
- it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
526
- tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
527
- title = tar.title
528
-
529
- tar.should_not be_ok
530
- end
531
- end
532
- end
533
-
534
- describe "content_type" do
535
- it "should return the correct content type of the url for non html pages" do
536
- good = MetaInspector.new('http://pagerankalert.com/image.png')
537
-
538
- good.content_type.should == "image/png"
539
- end
540
-
541
- it "should return the correct content type of the url for html pages" do
542
- good = MetaInspector.new('http://pagerankalert.com')
543
-
544
- good.content_type.should == "text/html"
545
- end
546
- end
547
- end