metainspector 3.3.0 → 4.0.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Guardfile +5 -0
- data/README.md +26 -8
- data/lib/meta_inspector/document.rb +4 -8
- data/lib/meta_inspector/exception_log.rb +0 -2
- data/lib/meta_inspector/exceptionable.rb +0 -2
- data/lib/meta_inspector/parser.rb +17 -162
- data/lib/meta_inspector/parsers/base.rb +30 -0
- data/lib/meta_inspector/parsers/images.rb +45 -0
- data/lib/meta_inspector/parsers/links.rb +69 -0
- data/lib/meta_inspector/parsers/meta_tags.rb +72 -0
- data/lib/meta_inspector/parsers/texts.rb +27 -0
- data/lib/meta_inspector/request.rb +0 -2
- data/lib/meta_inspector/url.rb +0 -2
- data/lib/meta_inspector/version.rb +1 -3
- data/lib/meta_inspector.rb +5 -2
- data/lib/metainspector.rb +0 -2
- data/meta_inspector.gemspec +2 -1
- data/spec/document_spec.rb +16 -26
- data/spec/exception_log_spec.rb +1 -3
- data/spec/fixtures/example.response +17 -0
- data/spec/meta_inspector/images_spec.rb +111 -0
- data/spec/meta_inspector/links_spec.rb +203 -0
- data/spec/{meta_inspector_spec.rb → meta_inspector/meta_inspector_spec.rb} +1 -3
- data/spec/meta_inspector/meta_tags_spec.rb +108 -0
- data/spec/meta_inspector/redirections_spec.rb +48 -0
- data/spec/meta_inspector/texts_spec.rb +22 -0
- data/spec/parser_spec.rb +7 -393
- data/spec/request_spec.rb +1 -3
- data/spec/spec_helper.rb +0 -2
- data/spec/url_spec.rb +1 -3
- metadata +44 -6
- data/spec/redirections_spec.rb +0 -47
data/spec/parser_spec.rb
CHANGED
@@ -1,400 +1,14 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), "/spec_helper")
|
1
|
+
require 'spec_helper'
|
4
2
|
|
5
3
|
describe MetaInspector::Parser do
|
6
|
-
let(:logger) { MetaInspector::ExceptionLog.new }
|
7
|
-
|
8
|
-
describe 'Doing a basic scrape' do
|
9
|
-
|
10
|
-
before(:each) do
|
11
|
-
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com')
|
12
|
-
end
|
13
|
-
|
14
|
-
it "should not find an image" do
|
15
|
-
@m.image.should == nil
|
16
|
-
end
|
17
|
-
|
18
|
-
describe "get image" do
|
19
|
-
it "should find the og image" do
|
20
|
-
@m = MetaInspector::Parser.new(doc 'http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
21
|
-
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
22
|
-
end
|
23
|
-
|
24
|
-
it "should find image on youtube" do
|
25
|
-
MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc').image.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
describe "get images" do
|
30
|
-
it "should find all page images" do
|
31
|
-
@m.images.should == ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"]
|
32
|
-
end
|
33
|
-
|
34
|
-
it "should find images on twitter" do
|
35
|
-
m = MetaInspector::Parser.new(doc 'https://twitter.com/markupvalidator')
|
36
|
-
m.images.length.should == 6
|
37
|
-
m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
it "should ignore malformed image tags" do
|
42
|
-
# There is an image tag without a source. The scraper should not fatal.
|
43
|
-
@m = MetaInspector::Parser.new(doc "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups")
|
44
|
-
@m.images.size.should == 11
|
45
|
-
end
|
46
|
-
|
47
|
-
it "should have a Nokogiri::HTML::Document as parsed" do
|
48
|
-
@m.parsed.class.should == Nokogiri::HTML::Document
|
49
|
-
end
|
50
|
-
|
51
|
-
it "should return the document as a string" do
|
52
|
-
@m.to_s.class.should == String
|
53
|
-
end
|
54
|
-
|
55
|
-
describe "Feed" do
|
56
|
-
it "should get rss feed" do
|
57
|
-
@m = MetaInspector::Parser.new(doc 'http://www.iteh.at')
|
58
|
-
@m.feed.should == 'http://www.iteh.at/de/rss/'
|
59
|
-
end
|
60
|
-
|
61
|
-
it "should get atom feed" do
|
62
|
-
@m = MetaInspector::Parser.new(doc 'http://www.tea-tron.com/jbravo/blog/')
|
63
|
-
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
|
64
|
-
end
|
65
|
-
|
66
|
-
it "should return nil if no feed found" do
|
67
|
-
@m = MetaInspector::Parser.new(doc 'http://www.alazan.com')
|
68
|
-
@m.feed.should == nil
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
it "should get the title from the head section" do
|
75
|
-
p = MetaInspector::Parser.new(doc 'http://example.com')
|
76
|
-
p.title.should == 'An example page'
|
77
|
-
end
|
78
|
-
|
79
|
-
describe '#description' do
|
80
|
-
it "should find description from meta description" do
|
81
|
-
page = MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc')
|
82
|
-
|
83
|
-
page.description.should == "This is Youtube"
|
84
|
-
end
|
85
|
-
|
86
|
-
it "should find a secondary description if no meta description" do
|
87
|
-
@m = MetaInspector::Parser.new(doc 'http://theonion-no-description.com')
|
88
|
-
@m.description.should == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday, an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
describe '#favicon' do
|
93
|
-
it "should get favicon link when marked as icon" do
|
94
|
-
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com/')
|
95
|
-
@m.favicon.should == 'http://pagerankalert.com/src/favicon.ico'
|
96
|
-
end
|
97
|
-
|
98
|
-
it "should get favicon link when marked as shortcut" do
|
99
|
-
@m = MetaInspector::Parser.new(doc 'http://pagerankalert-shortcut.com/')
|
100
|
-
@m.favicon.should == 'http://pagerankalert-shortcut.com/src/favicon.ico'
|
101
|
-
end
|
102
|
-
|
103
|
-
it "should get favicon link when marked as shorcut and icon" do
|
104
|
-
@m = MetaInspector::Parser.new(doc 'http://pagerankalert-shortcut-and-icon.com/')
|
105
|
-
@m.favicon.should == 'http://pagerankalert-shortcut-and-icon.com/src/favicon.ico'
|
106
|
-
end
|
107
|
-
|
108
|
-
it "should get favicon link when there is also a touch icon" do
|
109
|
-
@m = MetaInspector::Parser.new(doc 'http://pagerankalert-touch-icon.com/')
|
110
|
-
@m.favicon.should == 'http://pagerankalert-touch-icon.com/src/favicon.ico'
|
111
|
-
end
|
112
|
-
|
113
|
-
it "should get favicon link of nil" do
|
114
|
-
@m = MetaInspector::Parser.new(doc 'http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
115
|
-
@m.favicon.should == nil
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
describe 'Links' do
|
120
|
-
before(:each) do
|
121
|
-
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com')
|
122
|
-
end
|
123
|
-
|
124
|
-
it "should get the links" do
|
125
|
-
@m.links.should == [ "http://pagerankalert.com/",
|
126
|
-
"http://pagerankalert.com/es?language=es",
|
127
|
-
"http://pagerankalert.com/users/sign_up",
|
128
|
-
"http://pagerankalert.com/users/sign_in",
|
129
|
-
"mailto:pagerankalert@gmail.com",
|
130
|
-
"http://pagerankalert.posterous.com/",
|
131
|
-
"http://twitter.com/pagerankalert",
|
132
|
-
"http://twitter.com/share" ]
|
133
|
-
end
|
134
|
-
|
135
|
-
it "should get correct absolute links for internal pages" do
|
136
|
-
@m.internal_links.should == [ "http://pagerankalert.com/",
|
137
|
-
"http://pagerankalert.com/es?language=es",
|
138
|
-
"http://pagerankalert.com/users/sign_up",
|
139
|
-
"http://pagerankalert.com/users/sign_in" ]
|
140
|
-
end
|
141
|
-
|
142
|
-
it "should get correct absolute links for external pages" do
|
143
|
-
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
144
|
-
"http://pagerankalert.posterous.com/",
|
145
|
-
"http://twitter.com/pagerankalert",
|
146
|
-
"http://twitter.com/share" ]
|
147
|
-
end
|
148
|
-
|
149
|
-
it "should get correct absolute links, correcting relative links from URL not ending with slash" do
|
150
|
-
m = MetaInspector::Parser.new(doc 'http://alazan.com/websolution.asp')
|
151
|
-
m.links.should == [ "http://alazan.com/index.asp",
|
152
|
-
"http://alazan.com/faqs.asp" ]
|
153
|
-
end
|
154
|
-
|
155
|
-
it "should return empty array if no links found" do
|
156
|
-
m = MetaInspector::Parser.new(doc 'http://example.com/empty')
|
157
|
-
m.links.should == []
|
158
|
-
end
|
159
|
-
|
160
|
-
describe "links with international characters" do
|
161
|
-
it "should get correct absolute links, encoding the URLs as needed" do
|
162
|
-
m = MetaInspector::Parser.new(doc 'http://international.com')
|
163
|
-
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
164
|
-
"http://international.com/roman%C3%A9e",
|
165
|
-
"http://international.com/faqs#cami%C3%B3n",
|
166
|
-
"http://international.com/search?q=cami%C3%B3n",
|
167
|
-
"http://international.com/search?q=espa%C3%B1a#top",
|
168
|
-
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
|
169
|
-
"http://example.com/espa%C3%B1a.asp",
|
170
|
-
"http://example.com/roman%C3%A9e",
|
171
|
-
"http://example.com/faqs#cami%C3%B3n",
|
172
|
-
"http://example.com/search?q=cami%C3%B3n",
|
173
|
-
"http://example.com/search?q=espa%C3%B1a#top"]
|
174
|
-
end
|
175
|
-
|
176
|
-
describe "internal links" do
|
177
|
-
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
178
|
-
m = MetaInspector::Parser.new(doc 'http://international.com')
|
179
|
-
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
180
|
-
"http://international.com/roman%C3%A9e",
|
181
|
-
"http://international.com/faqs#cami%C3%B3n",
|
182
|
-
"http://international.com/search?q=cami%C3%B3n",
|
183
|
-
"http://international.com/search?q=espa%C3%B1a#top",
|
184
|
-
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
185
|
-
end
|
4
|
+
let(:doc) { MetaInspector::Document.new('http://pagerankalert.com') }
|
5
|
+
let(:parser) { MetaInspector::Parser.new(doc) }
|
186
6
|
|
187
|
-
|
188
|
-
|
189
|
-
m.internal_links.should == [ "http://example.com/faqs" ]
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
describe "external links" do
|
194
|
-
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
195
|
-
m = MetaInspector::Parser.new(doc 'http://international.com')
|
196
|
-
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
197
|
-
"http://example.com/roman%C3%A9e",
|
198
|
-
"http://example.com/faqs#cami%C3%B3n",
|
199
|
-
"http://example.com/search?q=cami%C3%B3n",
|
200
|
-
"http://example.com/search?q=espa%C3%B1a#top"]
|
201
|
-
end
|
202
|
-
|
203
|
-
it "should not crash when processing malformed hrefs" do
|
204
|
-
m = MetaInspector::Parser.new(doc 'http://example.com/malformed_href')
|
205
|
-
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com", "javascript:alert('ok');",
|
206
|
-
"javascript://", "mailto:email(at)example.com"]
|
207
|
-
end
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
it "should not crash with links that have weird href values" do
|
212
|
-
m = MetaInspector::Parser.new(doc 'http://example.com/invalid_href')
|
213
|
-
m.links.should == ["%3Cp%3Eftp://ftp.cdrom.com", "skype:joeuser?call", "telnet://telnet.cdrom.com"]
|
214
|
-
end
|
215
|
-
end
|
216
|
-
|
217
|
-
describe 'Relative links' do
|
218
|
-
describe 'From a root URL' do
|
219
|
-
before(:each) do
|
220
|
-
@m = MetaInspector::Parser.new(doc 'http://relative.com/')
|
221
|
-
end
|
222
|
-
|
223
|
-
it 'should get the relative links' do
|
224
|
-
@m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
225
|
-
end
|
226
|
-
end
|
227
|
-
|
228
|
-
describe 'From a document' do
|
229
|
-
before(:each) do
|
230
|
-
@m = MetaInspector::Parser.new(doc 'http://relative.com/company')
|
231
|
-
end
|
232
|
-
|
233
|
-
it 'should get the relative links' do
|
234
|
-
@m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
235
|
-
end
|
236
|
-
end
|
237
|
-
|
238
|
-
describe 'From a directory' do
|
239
|
-
before(:each) do
|
240
|
-
@m = MetaInspector::Parser.new(doc 'http://relative.com/company/')
|
241
|
-
end
|
242
|
-
|
243
|
-
it 'should get the relative links' do
|
244
|
-
@m.internal_links.should == ['http://relative.com/company/about', 'http://relative.com/sitemap']
|
245
|
-
end
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
describe 'Relative links with base' do
|
250
|
-
it 'should get the relative links from a document' do
|
251
|
-
m = MetaInspector::Parser.new(doc 'http://relativewithbase.com/company/page2')
|
252
|
-
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
253
|
-
end
|
254
|
-
|
255
|
-
it 'should get the relative links from a directory' do
|
256
|
-
m = MetaInspector::Parser.new(doc 'http://relativewithbase.com/company/page2/')
|
257
|
-
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
describe 'Non-HTTP links' do
|
262
|
-
before(:each) do
|
263
|
-
@m = MetaInspector::Parser.new(doc 'http://example.com/nonhttp')
|
264
|
-
end
|
265
|
-
|
266
|
-
it "should get the links" do
|
267
|
-
@m.links.sort.should == [
|
268
|
-
"ftp://ftp.cdrom.com/",
|
269
|
-
"javascript:alert('hey');",
|
270
|
-
"mailto:user@example.com",
|
271
|
-
"skype:joeuser?call",
|
272
|
-
"telnet://telnet.cdrom.com"
|
273
|
-
]
|
274
|
-
end
|
275
|
-
end
|
276
|
-
|
277
|
-
describe 'Protocol-relative URLs' do
|
278
|
-
before(:each) do
|
279
|
-
@m_http = MetaInspector::Parser.new(doc 'http://protocol-relative.com')
|
280
|
-
@m_https = MetaInspector::Parser.new(doc 'https://protocol-relative.com')
|
281
|
-
end
|
282
|
-
|
283
|
-
it "should convert protocol-relative links to http" do
|
284
|
-
@m_http.links.should include('http://protocol-relative.com/contact')
|
285
|
-
@m_http.links.should include('http://yahoo.com/')
|
286
|
-
end
|
287
|
-
|
288
|
-
it "should convert protocol-relative links to https" do
|
289
|
-
@m_https.links.should include('https://protocol-relative.com/contact')
|
290
|
-
@m_https.links.should include('https://yahoo.com/')
|
291
|
-
end
|
292
|
-
end
|
293
|
-
|
294
|
-
describe 'Getting meta tags' do
|
295
|
-
let(:page) { MetaInspector::Parser.new(doc 'http://example.com/meta-tags') }
|
296
|
-
|
297
|
-
it "#meta_tags" do
|
298
|
-
page.meta_tags.should == {
|
299
|
-
'name' => {
|
300
|
-
'keywords' => ['one, two, three'],
|
301
|
-
'description' => ['the description'],
|
302
|
-
'author' => ['Joe Sample'],
|
303
|
-
'robots' => ['index,follow'],
|
304
|
-
'revisit' => ['15 days'],
|
305
|
-
'dc.date.issued' => ['2011-09-15']
|
306
|
-
},
|
307
|
-
|
308
|
-
'http-equiv' => {
|
309
|
-
'content-type' => ['text/html; charset=UTF-8'],
|
310
|
-
'content-style-type' => ['text/css']
|
311
|
-
},
|
312
|
-
|
313
|
-
'property' => {
|
314
|
-
'og:title' => ['An OG title'],
|
315
|
-
'og:type' => ['website'],
|
316
|
-
'og:url' => ['http://example.com/meta-tags'],
|
317
|
-
'og:image' => ['http://example.com/rock.jpg',
|
318
|
-
'http://example.com/rock2.jpg',
|
319
|
-
'http://example.com/rock3.jpg'],
|
320
|
-
'og:image:width' => ['300'],
|
321
|
-
'og:image:height' => ['300', '1000']
|
322
|
-
},
|
323
|
-
|
324
|
-
'charset' => ['UTF-8']
|
325
|
-
}
|
326
|
-
end
|
327
|
-
|
328
|
-
it "#meta_tag" do
|
329
|
-
page.meta_tag.should == {
|
330
|
-
'name' => {
|
331
|
-
'keywords' => 'one, two, three',
|
332
|
-
'description' => 'the description',
|
333
|
-
'author' => 'Joe Sample',
|
334
|
-
'robots' => 'index,follow',
|
335
|
-
'revisit' => '15 days',
|
336
|
-
'dc.date.issued' => '2011-09-15'
|
337
|
-
},
|
338
|
-
|
339
|
-
'http-equiv' => {
|
340
|
-
'content-type' => 'text/html; charset=UTF-8',
|
341
|
-
'content-style-type' => 'text/css'
|
342
|
-
},
|
343
|
-
|
344
|
-
'property' => {
|
345
|
-
'og:title' => 'An OG title',
|
346
|
-
'og:type' => 'website',
|
347
|
-
'og:url' => 'http://example.com/meta-tags',
|
348
|
-
'og:image' => 'http://example.com/rock.jpg',
|
349
|
-
'og:image:width' => '300',
|
350
|
-
'og:image:height' => '300'
|
351
|
-
},
|
352
|
-
|
353
|
-
'charset' => 'UTF-8'
|
354
|
-
}
|
355
|
-
end
|
356
|
-
|
357
|
-
it "#meta" do
|
358
|
-
page.meta.should == {
|
359
|
-
'keywords' => 'one, two, three',
|
360
|
-
'description' => 'the description',
|
361
|
-
'author' => 'Joe Sample',
|
362
|
-
'robots' => 'index,follow',
|
363
|
-
'revisit' => '15 days',
|
364
|
-
'dc.date.issued' => '2011-09-15',
|
365
|
-
'content-type' => 'text/html; charset=UTF-8',
|
366
|
-
'content-style-type' => 'text/css',
|
367
|
-
'og:title' => 'An OG title',
|
368
|
-
'og:type' => 'website',
|
369
|
-
'og:url' => 'http://example.com/meta-tags',
|
370
|
-
'og:image' => 'http://example.com/rock.jpg',
|
371
|
-
'og:image:width' => '300',
|
372
|
-
'og:image:height' => '300',
|
373
|
-
'charset' => 'UTF-8'
|
374
|
-
}
|
375
|
-
end
|
7
|
+
it "should have a Nokogiri::HTML::Document as parsed" do
|
8
|
+
parser.parsed.class.should == Nokogiri::HTML::Document
|
376
9
|
end
|
377
10
|
|
378
|
-
|
379
|
-
|
380
|
-
@m = MetaInspector::Parser.new(doc 'http://charset001.com')
|
381
|
-
@m.charset.should == "utf-8"
|
382
|
-
end
|
383
|
-
|
384
|
-
it "should get the charset from meta content type" do
|
385
|
-
@m = MetaInspector::Parser.new(doc 'http://charset002.com')
|
386
|
-
@m.charset.should == "windows-1252"
|
387
|
-
end
|
388
|
-
|
389
|
-
it "should get nil if no declared charset is found" do
|
390
|
-
@m = MetaInspector::Parser.new(doc 'http://charset000.com')
|
391
|
-
@m.charset.should == nil
|
392
|
-
end
|
393
|
-
end
|
394
|
-
|
395
|
-
private
|
396
|
-
|
397
|
-
def doc(url, options = { exception_log: logger })
|
398
|
-
MetaInspector::Document.new(url, options)
|
11
|
+
it "should return the document as a string" do
|
12
|
+
parser.to_s.class.should == String
|
399
13
|
end
|
400
14
|
end
|
data/spec/request_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
data/spec/url_spec.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.0
|
4
|
+
version: 4.0.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -164,6 +164,34 @@ dependencies:
|
|
164
164
|
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: guard
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: guard-rspec
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - ">="
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
167
195
|
description: MetaInspector lets you scrape a web page and get its title, charset,
|
168
196
|
link and meta tags
|
169
197
|
email:
|
@@ -176,6 +204,7 @@ files:
|
|
176
204
|
- ".rspec.example"
|
177
205
|
- ".travis.yml"
|
178
206
|
- Gemfile
|
207
|
+
- Guardfile
|
179
208
|
- MIT-LICENSE
|
180
209
|
- README.md
|
181
210
|
- Rakefile
|
@@ -187,6 +216,11 @@ files:
|
|
187
216
|
- lib/meta_inspector/exception_log.rb
|
188
217
|
- lib/meta_inspector/exceptionable.rb
|
189
218
|
- lib/meta_inspector/parser.rb
|
219
|
+
- lib/meta_inspector/parsers/base.rb
|
220
|
+
- lib/meta_inspector/parsers/images.rb
|
221
|
+
- lib/meta_inspector/parsers/links.rb
|
222
|
+
- lib/meta_inspector/parsers/meta_tags.rb
|
223
|
+
- lib/meta_inspector/parsers/texts.rb
|
190
224
|
- lib/meta_inspector/request.rb
|
191
225
|
- lib/meta_inspector/url.rb
|
192
226
|
- lib/meta_inspector/version.rb
|
@@ -227,9 +261,13 @@ files:
|
|
227
261
|
- spec/fixtures/unsafe_https.facebook.com.response
|
228
262
|
- spec/fixtures/wordpress_site.response
|
229
263
|
- spec/fixtures/youtube.response
|
230
|
-
- spec/meta_inspector_spec.rb
|
264
|
+
- spec/meta_inspector/images_spec.rb
|
265
|
+
- spec/meta_inspector/links_spec.rb
|
266
|
+
- spec/meta_inspector/meta_inspector_spec.rb
|
267
|
+
- spec/meta_inspector/meta_tags_spec.rb
|
268
|
+
- spec/meta_inspector/redirections_spec.rb
|
269
|
+
- spec/meta_inspector/texts_spec.rb
|
231
270
|
- spec/parser_spec.rb
|
232
|
-
- spec/redirections_spec.rb
|
233
271
|
- spec/request_spec.rb
|
234
272
|
- spec/spec_helper.rb
|
235
273
|
- spec/url_spec.rb
|
@@ -248,9 +286,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
248
286
|
version: '0'
|
249
287
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
250
288
|
requirements:
|
251
|
-
- - ">="
|
289
|
+
- - ">"
|
252
290
|
- !ruby/object:Gem::Version
|
253
|
-
version: '0'
|
291
|
+
version: 1.3.1
|
254
292
|
requirements: []
|
255
293
|
rubyforge_project:
|
256
294
|
rubygems_version: 2.2.2
|
data/spec/redirections_spec.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
-
|
5
|
-
describe MetaInspector do
|
6
|
-
describe "redirections" do
|
7
|
-
let(:logger) { MetaInspector::ExceptionLog.new }
|
8
|
-
|
9
|
-
context "when redirections are turned off" do
|
10
|
-
it "disallows redirections" do
|
11
|
-
m = MetaInspector.new("http://facebook.com", :allow_redirections => false, exception_log: logger)
|
12
|
-
m.url.should == "http://facebook.com/"
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
context "when redirections are on (default)" do
|
17
|
-
it "allows follows redirections" do
|
18
|
-
logger.should_not receive(:<<)
|
19
|
-
|
20
|
-
m = MetaInspector.new("http://facebook.com", exception_log: logger)
|
21
|
-
|
22
|
-
m.url.should == "https://www.facebook.com/"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
context "when there are cookies required for proper redirection" do
|
27
|
-
before(:all){WebMock.enable!}
|
28
|
-
after(:all){WebMock.disable!}
|
29
|
-
|
30
|
-
it "allows follows redirections while sending the cookies" do
|
31
|
-
stub_request(:get, "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/").to_return(
|
32
|
-
:status => 302,
|
33
|
-
:headers => {
|
34
|
-
"Location" => "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1",
|
35
|
-
"Set-Cookie" => "EMETA_COOKIE_CHECK=1; path=/; domain=clarionledger.com"
|
36
|
-
})
|
37
|
-
stub_request(:get, "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1")
|
38
|
-
.with(:headers => {"Cookie" => "EMETA_COOKIE_CHECK=1"})
|
39
|
-
logger.should_not receive(:<<)
|
40
|
-
|
41
|
-
m = MetaInspector.new("http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/", exception_log: logger)
|
42
|
-
|
43
|
-
m.url.should == "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1"
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|