metainspector 1.16.1 → 1.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/meta_inspector.rb +10 -3
- data/lib/meta_inspector/deprecations.rb +19 -0
- data/lib/meta_inspector/document.rb +81 -0
- data/lib/meta_inspector/exception_log.rb +29 -0
- data/lib/meta_inspector/exceptionable.rb +11 -0
- data/lib/meta_inspector/parser.rb +178 -0
- data/lib/meta_inspector/request.rb +55 -0
- data/lib/meta_inspector/url.rb +76 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +97 -0
- data/spec/exception_log_spec.rb +59 -0
- data/spec/meta_inspector_spec.rb +9 -0
- data/spec/parser_spec.rb +374 -0
- data/spec/redirections_spec.rb +20 -3
- data/spec/request_spec.rb +64 -0
- data/spec/url_spec.rb +74 -0
- metadata +18 -7
- data/lib/meta_inspector/scraper.rb +0 -283
- data/spec/metainspector_spec.rb +0 -547
data/spec/metainspector_spec.rb
DELETED
@@ -1,547 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
-
|
5
|
-
describe MetaInspector do
|
6
|
-
describe 'Initialization' do
|
7
|
-
it 'should accept an URL with a scheme' do
|
8
|
-
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
|
9
|
-
end
|
10
|
-
|
11
|
-
it "should use http:// as a default scheme" do
|
12
|
-
MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
|
13
|
-
end
|
14
|
-
|
15
|
-
it "should accept an URL with international characters" do
|
16
|
-
MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
|
17
|
-
end
|
18
|
-
|
19
|
-
it "should store the scheme" do
|
20
|
-
MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
|
21
|
-
MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
|
22
|
-
MetaInspector.new('pagerankalert.com').scheme.should == 'http'
|
23
|
-
end
|
24
|
-
|
25
|
-
it "should store the host" do
|
26
|
-
MetaInspector.new('http://pagerankalert.com').host.should == 'pagerankalert.com'
|
27
|
-
MetaInspector.new('https://pagerankalert.com').host.should == 'pagerankalert.com'
|
28
|
-
MetaInspector.new('pagerankalert.com').host.should == 'pagerankalert.com'
|
29
|
-
end
|
30
|
-
|
31
|
-
it "should store the root url" do
|
32
|
-
MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
33
|
-
MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
|
34
|
-
MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
35
|
-
MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
describe 'Doing a basic scrape' do
|
40
|
-
EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes & receive alerts'
|
41
|
-
|
42
|
-
before(:each) do
|
43
|
-
@m = MetaInspector.new('http://pagerankalert.com')
|
44
|
-
end
|
45
|
-
|
46
|
-
it "should get the title" do
|
47
|
-
@m.title.should == EXPECTED_TITLE
|
48
|
-
end
|
49
|
-
|
50
|
-
it "should not find an image" do
|
51
|
-
@m.image.should == nil
|
52
|
-
end
|
53
|
-
|
54
|
-
describe "get image" do
|
55
|
-
it "should find the og image" do
|
56
|
-
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
57
|
-
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
58
|
-
@m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
59
|
-
end
|
60
|
-
|
61
|
-
it "should find image on youtube" do
|
62
|
-
MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc').image.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
describe "get images" do
|
67
|
-
it "should find all page images" do
|
68
|
-
@m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
|
69
|
-
end
|
70
|
-
|
71
|
-
it "should find images on twitter" do
|
72
|
-
m = MetaInspector.new('https://twitter.com/markupvalidator')
|
73
|
-
m.images.length.should == 6
|
74
|
-
m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
it "should ignore malformed image tags" do
|
79
|
-
# There is an image tag without a source. The scraper should not fatal.
|
80
|
-
@m = MetaInspector.new("http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups")
|
81
|
-
@m.images.size.should == 11
|
82
|
-
end
|
83
|
-
|
84
|
-
it "should have a Nokogiri::HTML::Document as parsed_document" do
|
85
|
-
@m.parsed_document.class.should == Nokogiri::HTML::Document
|
86
|
-
end
|
87
|
-
|
88
|
-
it "should have a String as document" do
|
89
|
-
@m.document.class.should == String
|
90
|
-
end
|
91
|
-
|
92
|
-
describe "Feed" do
|
93
|
-
it "should get rss feed" do
|
94
|
-
@m = MetaInspector.new('http://www.iteh.at')
|
95
|
-
@m.feed.should == 'http://www.iteh.at/de/rss/'
|
96
|
-
end
|
97
|
-
|
98
|
-
it "should get atom feed" do
|
99
|
-
@m = MetaInspector.new('http://www.tea-tron.com/jbravo/blog/')
|
100
|
-
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
|
101
|
-
end
|
102
|
-
|
103
|
-
it "should return nil if no feed found" do
|
104
|
-
@m = MetaInspector.new('http://www.alazan.com')
|
105
|
-
@m.feed.should == nil
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
describe "get description" do
|
110
|
-
it "should find description on youtube" do
|
111
|
-
MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc').description.should == ""
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
describe 'Doing a basic scrape from passed url html' do
|
117
|
-
|
118
|
-
before(:each) do
|
119
|
-
@m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
120
|
-
end
|
121
|
-
|
122
|
-
it "should get correct links when the url html is passed as an option" do
|
123
|
-
@m.links.should == ["http://cnn.com/hello"]
|
124
|
-
end
|
125
|
-
|
126
|
-
it "should get the title" do
|
127
|
-
@m.title.should == "Hello From Passed Html"
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
describe 'Page with missing meta description' do
|
132
|
-
it "should find secondary description" do
|
133
|
-
@m = MetaInspector.new('http://theonion-no-description.com')
|
134
|
-
@m.description == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
|
135
|
-
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
describe 'Links' do
|
140
|
-
before(:each) do
|
141
|
-
@m = MetaInspector.new('http://pagerankalert.com')
|
142
|
-
end
|
143
|
-
|
144
|
-
it "should get the links" do
|
145
|
-
@m.links.should == [ "http://pagerankalert.com/",
|
146
|
-
"http://pagerankalert.com/es?language=es",
|
147
|
-
"http://pagerankalert.com/users/sign_up",
|
148
|
-
"http://pagerankalert.com/users/sign_in",
|
149
|
-
"mailto:pagerankalert@gmail.com",
|
150
|
-
"http://pagerankalert.posterous.com/",
|
151
|
-
"http://twitter.com/pagerankalert",
|
152
|
-
"http://twitter.com/share" ]
|
153
|
-
end
|
154
|
-
|
155
|
-
it "should get correct absolute links for internal pages" do
|
156
|
-
@m.internal_links.should == [ "http://pagerankalert.com/",
|
157
|
-
"http://pagerankalert.com/es?language=es",
|
158
|
-
"http://pagerankalert.com/users/sign_up",
|
159
|
-
"http://pagerankalert.com/users/sign_in" ]
|
160
|
-
end
|
161
|
-
|
162
|
-
it "should get correct absolute links for external pages" do
|
163
|
-
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
164
|
-
"http://pagerankalert.posterous.com/",
|
165
|
-
"http://twitter.com/pagerankalert",
|
166
|
-
"http://twitter.com/share" ]
|
167
|
-
end
|
168
|
-
|
169
|
-
it "should get correct absolute links, correcting relative links from URL not ending with slash" do
|
170
|
-
m = MetaInspector.new('http://alazan.com/websolution.asp')
|
171
|
-
m.links.should == [ "http://alazan.com/index.asp",
|
172
|
-
"http://alazan.com/faqs.asp" ]
|
173
|
-
end
|
174
|
-
|
175
|
-
it "should return empty array if no links found" do
|
176
|
-
m = MetaInspector.new('http://example.com/empty')
|
177
|
-
m.links.should == []
|
178
|
-
end
|
179
|
-
|
180
|
-
describe "links with international characters" do
|
181
|
-
it "should get correct absolute links, encoding the URLs as needed" do
|
182
|
-
m = MetaInspector.new('http://international.com')
|
183
|
-
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
184
|
-
"http://international.com/roman%C3%A9e",
|
185
|
-
"http://international.com/faqs#cami%C3%B3n",
|
186
|
-
"http://international.com/search?q=cami%C3%B3n",
|
187
|
-
"http://international.com/search?q=espa%C3%B1a#top",
|
188
|
-
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
|
189
|
-
"http://example.com/espa%C3%B1a.asp",
|
190
|
-
"http://example.com/roman%C3%A9e",
|
191
|
-
"http://example.com/faqs#cami%C3%B3n",
|
192
|
-
"http://example.com/search?q=cami%C3%B3n",
|
193
|
-
"http://example.com/search?q=espa%C3%B1a#top"]
|
194
|
-
end
|
195
|
-
|
196
|
-
describe "internal links" do
|
197
|
-
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
198
|
-
m = MetaInspector.new('http://international.com')
|
199
|
-
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
200
|
-
"http://international.com/roman%C3%A9e",
|
201
|
-
"http://international.com/faqs#cami%C3%B3n",
|
202
|
-
"http://international.com/search?q=cami%C3%B3n",
|
203
|
-
"http://international.com/search?q=espa%C3%B1a#top",
|
204
|
-
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
205
|
-
end
|
206
|
-
|
207
|
-
it "should not crash when processing malformed hrefs" do
|
208
|
-
m = MetaInspector.new('http://example.com/malformed_href')
|
209
|
-
expect {
|
210
|
-
m.internal_links.should == [ "http://example.com/faqs" ]
|
211
|
-
m.should_not be_ok
|
212
|
-
}.to_not raise_error
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
describe "external links" do
|
217
|
-
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
218
|
-
m = MetaInspector.new('http://international.com')
|
219
|
-
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
220
|
-
"http://example.com/roman%C3%A9e",
|
221
|
-
"http://example.com/faqs#cami%C3%B3n",
|
222
|
-
"http://example.com/search?q=cami%C3%B3n",
|
223
|
-
"http://example.com/search?q=espa%C3%B1a#top"]
|
224
|
-
end
|
225
|
-
|
226
|
-
it "should not crash when processing malformed hrefs" do
|
227
|
-
m = MetaInspector.new('http://example.com/malformed_href')
|
228
|
-
expect {
|
229
|
-
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
|
230
|
-
"javascript:alert('ok');", "javascript://", "mailto:email(at)example.com"]
|
231
|
-
m.should_not be_ok
|
232
|
-
}.to_not raise_error
|
233
|
-
end
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
it "should not crash with links that have weird href values" do
|
238
|
-
m = MetaInspector.new('http://example.com/invalid_href')
|
239
|
-
m.links.should == ["%3Cp%3Eftp://ftp.cdrom.com", "skype:joeuser?call", "telnet://telnet.cdrom.com"]
|
240
|
-
end
|
241
|
-
end
|
242
|
-
|
243
|
-
describe 'Relative links' do
|
244
|
-
describe 'From a root URL' do
|
245
|
-
before(:each) do
|
246
|
-
@m = MetaInspector.new('http://relative.com/')
|
247
|
-
end
|
248
|
-
|
249
|
-
it 'should get the relative links' do
|
250
|
-
@m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
251
|
-
end
|
252
|
-
end
|
253
|
-
|
254
|
-
describe 'From a document' do
|
255
|
-
before(:each) do
|
256
|
-
@m = MetaInspector.new('http://relative.com/company')
|
257
|
-
end
|
258
|
-
|
259
|
-
it 'should get the relative links' do
|
260
|
-
@m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
261
|
-
end
|
262
|
-
end
|
263
|
-
|
264
|
-
describe 'From a directory' do
|
265
|
-
before(:each) do
|
266
|
-
@m = MetaInspector.new('http://relative.com/company/')
|
267
|
-
end
|
268
|
-
|
269
|
-
it 'should get the relative links' do
|
270
|
-
@m.internal_links.should == ['http://relative.com/company/about', 'http://relative.com/sitemap']
|
271
|
-
end
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
describe 'Relative links with base' do
|
276
|
-
it 'should get the relative links from a document' do
|
277
|
-
m = MetaInspector.new('http://relativewithbase.com/company/page2')
|
278
|
-
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
279
|
-
end
|
280
|
-
|
281
|
-
it 'should get the relative links from a directory' do
|
282
|
-
m = MetaInspector.new('http://relativewithbase.com/company/page2/')
|
283
|
-
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
284
|
-
end
|
285
|
-
end
|
286
|
-
|
287
|
-
describe 'Non-HTTP links' do
|
288
|
-
before(:each) do
|
289
|
-
@m = MetaInspector.new('http://example.com/nonhttp')
|
290
|
-
end
|
291
|
-
|
292
|
-
it "should get the links" do
|
293
|
-
@m.links.sort.should == [
|
294
|
-
"ftp://ftp.cdrom.com/",
|
295
|
-
"javascript:alert('hey');",
|
296
|
-
"mailto:user@example.com",
|
297
|
-
"skype:joeuser?call",
|
298
|
-
"telnet://telnet.cdrom.com"
|
299
|
-
]
|
300
|
-
end
|
301
|
-
end
|
302
|
-
|
303
|
-
describe 'Protocol-relative URLs' do
|
304
|
-
before(:each) do
|
305
|
-
@m_http = MetaInspector.new('http://protocol-relative.com')
|
306
|
-
@m_https = MetaInspector.new('https://protocol-relative.com')
|
307
|
-
end
|
308
|
-
|
309
|
-
it "should convert protocol-relative links to http" do
|
310
|
-
@m_http.links.should include('http://protocol-relative.com/contact')
|
311
|
-
@m_http.links.should include('http://yahoo.com/')
|
312
|
-
end
|
313
|
-
|
314
|
-
it "should convert protocol-relative links to https" do
|
315
|
-
@m_https.links.should include('https://protocol-relative.com/contact')
|
316
|
-
@m_https.links.should include('https://yahoo.com/')
|
317
|
-
end
|
318
|
-
end
|
319
|
-
|
320
|
-
describe 'Getting meta tags by ghost methods' do
|
321
|
-
before(:each) do
|
322
|
-
@m = MetaInspector.new('http://pagerankalert.com')
|
323
|
-
end
|
324
|
-
|
325
|
-
it "should get the robots meta tag" do
|
326
|
-
@m.meta_robots.should == 'all,follow'
|
327
|
-
end
|
328
|
-
|
329
|
-
it "should get the robots meta tag" do
|
330
|
-
@m.meta_RoBoTs.should == 'all,follow'
|
331
|
-
end
|
332
|
-
|
333
|
-
it "should get the description meta tag" do
|
334
|
-
@m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
|
335
|
-
end
|
336
|
-
|
337
|
-
it "should get the keywords meta tag" do
|
338
|
-
@m.meta_keywords.should == "pagerank, seo, optimization, google"
|
339
|
-
end
|
340
|
-
|
341
|
-
it "should get the content-language meta tag" do
|
342
|
-
pending "mocks"
|
343
|
-
@m.meta_content_language.should == "en"
|
344
|
-
end
|
345
|
-
|
346
|
-
it "should get the Csrf_pAram meta tag" do
|
347
|
-
@m.meta_Csrf_pAram.should == "authenticity_token"
|
348
|
-
end
|
349
|
-
|
350
|
-
it "should return nil for nonfound meta_tags" do
|
351
|
-
@m.meta_lollypop.should == nil
|
352
|
-
end
|
353
|
-
|
354
|
-
it "should get the generator meta tag" do
|
355
|
-
@m = MetaInspector.new('http://www.inkthemes.com/')
|
356
|
-
@m.meta_generator.should == 'WordPress 3.4.2'
|
357
|
-
end
|
358
|
-
|
359
|
-
it "should find a meta_og_title" do
|
360
|
-
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
361
|
-
@m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
|
362
|
-
end
|
363
|
-
|
364
|
-
it "should not find a meta_og_something" do
|
365
|
-
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
366
|
-
@m.meta_og_something.should == nil
|
367
|
-
end
|
368
|
-
|
369
|
-
it "should find a meta_twitter_site" do
|
370
|
-
@m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
371
|
-
@m.meta_twitter_site.should == "@youtube"
|
372
|
-
end
|
373
|
-
|
374
|
-
it "should find a meta_twitter_player_width" do
|
375
|
-
@m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
376
|
-
@m.meta_twitter_player_width.should == "1920"
|
377
|
-
end
|
378
|
-
|
379
|
-
it "should not find a meta_twitter_dummy" do
|
380
|
-
@m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
381
|
-
@m.meta_twitter_dummy.should == nil
|
382
|
-
end
|
383
|
-
|
384
|
-
it "should find a meta_og_video_width" do
|
385
|
-
@m = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
386
|
-
@m.meta_og_video_width.should == "1920"
|
387
|
-
end
|
388
|
-
end
|
389
|
-
|
390
|
-
describe 'Charset detection' do
|
391
|
-
it "should get the charset from <meta charset />" do
|
392
|
-
@m = MetaInspector.new('http://charset001.com')
|
393
|
-
@m.charset.should == "utf-8"
|
394
|
-
end
|
395
|
-
|
396
|
-
it "should get the charset from meta content type" do
|
397
|
-
@m = MetaInspector.new('http://charset002.com')
|
398
|
-
@m.charset.should == "windows-1252"
|
399
|
-
end
|
400
|
-
|
401
|
-
it "should get nil if no declared charset is found" do
|
402
|
-
@m = MetaInspector.new('http://charset000.com')
|
403
|
-
@m.charset.should == nil
|
404
|
-
end
|
405
|
-
end
|
406
|
-
|
407
|
-
describe 'to_hash' do
|
408
|
-
it "should return a hash with all the values set" do
|
409
|
-
@m = MetaInspector.new('http://pagerankalert.com')
|
410
|
-
@m.to_hash.should == {
|
411
|
-
"url" =>"http://pagerankalert.com/",
|
412
|
-
"title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
|
413
|
-
"links" => ["http://pagerankalert.com/",
|
414
|
-
"http://pagerankalert.com/es?language=es",
|
415
|
-
"http://pagerankalert.com/users/sign_up",
|
416
|
-
"http://pagerankalert.com/users/sign_in",
|
417
|
-
"mailto:pagerankalert@gmail.com",
|
418
|
-
"http://pagerankalert.posterous.com/",
|
419
|
-
"http://twitter.com/pagerankalert",
|
420
|
-
"http://twitter.com/share"],
|
421
|
-
"internal_links" => ["http://pagerankalert.com/",
|
422
|
-
"http://pagerankalert.com/es?language=es",
|
423
|
-
"http://pagerankalert.com/users/sign_up",
|
424
|
-
"http://pagerankalert.com/users/sign_in"],
|
425
|
-
"external_links" => ["mailto:pagerankalert@gmail.com",
|
426
|
-
"http://pagerankalert.posterous.com/",
|
427
|
-
"http://twitter.com/pagerankalert",
|
428
|
-
"http://twitter.com/share"],
|
429
|
-
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
430
|
-
"charset" => "utf-8",
|
431
|
-
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
432
|
-
"content_type" =>"text/html",
|
433
|
-
"meta" => {
|
434
|
-
"name" => {
|
435
|
-
"description"=> "Track your PageRank(TM) changes and receive alerts by email",
|
436
|
-
"keywords" => "pagerank, seo, optimization, google",
|
437
|
-
"robots" => "all,follow",
|
438
|
-
"csrf_param" => "authenticity_token",
|
439
|
-
"csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
|
440
|
-
},
|
441
|
-
"property"=>{}
|
442
|
-
}
|
443
|
-
}
|
444
|
-
end
|
445
|
-
end
|
446
|
-
|
447
|
-
describe 'exception handling' do
|
448
|
-
before(:each) do
|
449
|
-
FakeWeb.allow_net_connect = true
|
450
|
-
end
|
451
|
-
|
452
|
-
after(:each) do
|
453
|
-
FakeWeb.allow_net_connect = false
|
454
|
-
end
|
455
|
-
|
456
|
-
it "should handle timeouts" do
|
457
|
-
impatient = MetaInspector.new('http://markupvalidator.com', :timeout => 0.0000000000001)
|
458
|
-
|
459
|
-
expect {
|
460
|
-
title = impatient.title
|
461
|
-
}.to change { impatient.errors.size }
|
462
|
-
|
463
|
-
impatient.errors.first.should == "Timeout!!!"
|
464
|
-
end
|
465
|
-
|
466
|
-
it "should handle socket errors" do
|
467
|
-
nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
|
468
|
-
|
469
|
-
expect {
|
470
|
-
title = nowhere.title
|
471
|
-
}.to change { nowhere.errors.size }
|
472
|
-
|
473
|
-
nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
|
474
|
-
end
|
475
|
-
|
476
|
-
it "should parse images when parse_html_content_type_only is not specified" do
|
477
|
-
image_url = MetaInspector.new('http://pagerankalert.com/image.png')
|
478
|
-
desc = image_url.description
|
479
|
-
|
480
|
-
image_url.should be_ok
|
481
|
-
end
|
482
|
-
|
483
|
-
it "should parse images when parse_html_content_type_only is false" do
|
484
|
-
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
|
485
|
-
desc = image_url.description
|
486
|
-
|
487
|
-
image_url.should be_ok
|
488
|
-
end
|
489
|
-
|
490
|
-
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
491
|
-
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
492
|
-
|
493
|
-
expect {
|
494
|
-
title = image_url.title
|
495
|
-
}.to change { image_url.errors.size }
|
496
|
-
|
497
|
-
image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
|
498
|
-
end
|
499
|
-
|
500
|
-
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
501
|
-
tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
502
|
-
|
503
|
-
expect {
|
504
|
-
title = tar_url.title
|
505
|
-
}.to change { tar_url.errors.size }
|
506
|
-
|
507
|
-
tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
|
508
|
-
end
|
509
|
-
|
510
|
-
describe "ok?" do
|
511
|
-
it "should return true if we have no errors" do
|
512
|
-
good = MetaInspector.new('http://pagerankalert.com')
|
513
|
-
good.to_hash
|
514
|
-
|
515
|
-
good.should be_ok
|
516
|
-
end
|
517
|
-
|
518
|
-
it "should return false if there are errors" do
|
519
|
-
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
|
520
|
-
bad.title
|
521
|
-
|
522
|
-
bad.should_not be_ok
|
523
|
-
end
|
524
|
-
|
525
|
-
it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
|
526
|
-
tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
527
|
-
title = tar.title
|
528
|
-
|
529
|
-
tar.should_not be_ok
|
530
|
-
end
|
531
|
-
end
|
532
|
-
end
|
533
|
-
|
534
|
-
describe "content_type" do
|
535
|
-
it "should return the correct content type of the url for non html pages" do
|
536
|
-
good = MetaInspector.new('http://pagerankalert.com/image.png')
|
537
|
-
|
538
|
-
good.content_type.should == "image/png"
|
539
|
-
end
|
540
|
-
|
541
|
-
it "should return the correct content type of the url for html pages" do
|
542
|
-
good = MetaInspector.new('http://pagerankalert.com')
|
543
|
-
|
544
|
-
good.content_type.should == "text/html"
|
545
|
-
end
|
546
|
-
end
|
547
|
-
end
|