url_parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
@@ -0,0 +1,515 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe UrlParser::Parser do
|
4
|
+
|
5
|
+
let(:url) { 'http://example.com/path' }
|
6
|
+
|
7
|
+
context ".new" do
|
8
|
+
|
9
|
+
it "sets #uri" do
|
10
|
+
expect(described_class.new('#').uri).to eq '#'
|
11
|
+
end
|
12
|
+
|
13
|
+
it "sets options" do
|
14
|
+
opts = { host: 'localhost' }
|
15
|
+
expect(described_class.new('#', opts).options).to eq opts
|
16
|
+
end
|
17
|
+
|
18
|
+
context "by default" do
|
19
|
+
|
20
|
+
it "uses the library configured embedded_params" do
|
21
|
+
expect(described_class.new('#').embedded_params)
|
22
|
+
.to eq UrlParser.configuration.embedded_params
|
23
|
+
end
|
24
|
+
|
25
|
+
it "does not return the raw uri" do
|
26
|
+
expect(described_class.new('#')).not_to be_raw
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
context "options" do
|
32
|
+
|
33
|
+
it "accepts a :base_uri option" do
|
34
|
+
expect(described_class.new('#', base_uri: 'http://example.com').base_uri)
|
35
|
+
.to eq 'http://example.com'
|
36
|
+
end
|
37
|
+
|
38
|
+
it "accepts a :raw option" do
|
39
|
+
expect(described_class.new('#', raw: true)).to be_raw
|
40
|
+
end
|
41
|
+
|
42
|
+
it "accepts an :embedded_params option" do
|
43
|
+
expect(described_class.new('#', embedded_params: 'ref').embedded_params)
|
44
|
+
.to eq [ 'ref' ]
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
context ".call" do
|
52
|
+
|
53
|
+
it "is aliased to .parse" do
|
54
|
+
expect(described_class.method(:call)).to eq described_class.method(:parse)
|
55
|
+
end
|
56
|
+
|
57
|
+
it "returns an Addressable::URI" do
|
58
|
+
expect(described_class.call('#id')).to be_an Addressable::URI
|
59
|
+
end
|
60
|
+
|
61
|
+
it "returns nil if the uri argument is nil" do
|
62
|
+
expect(described_class.call(nil)).to be_nil
|
63
|
+
end
|
64
|
+
|
65
|
+
it "uses the default scheme if only a host is present" do
|
66
|
+
expect(described_class.call('//example.com', default_scheme: 'https').scheme)
|
67
|
+
.to eq 'https'
|
68
|
+
end
|
69
|
+
|
70
|
+
it "does not fail with host labels that exceed size limitations" do
|
71
|
+
expect(described_class.call('a'*64+'.ca').host).to eq nil
|
72
|
+
end
|
73
|
+
|
74
|
+
%w(javascript mailto xmpp).each do |scheme|
|
75
|
+
|
76
|
+
context "with host-less schemes" do
|
77
|
+
|
78
|
+
let(:instance) { described_class.call("#{scheme}:void(0);") }
|
79
|
+
|
80
|
+
it "sets the scheme for #{scheme} links" do
|
81
|
+
expect(instance.scheme).to eq "#{scheme}"
|
82
|
+
end
|
83
|
+
|
84
|
+
it "sets the path for #{scheme} links" do
|
85
|
+
expect(instance.path).to eq 'void(0);'
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
it "accepts a custom host" do
|
93
|
+
expect(described_class.call('/path', host: 'localhost').to_s).to eq 'http://localhost/path'
|
94
|
+
end
|
95
|
+
|
96
|
+
context "with a block" do
|
97
|
+
|
98
|
+
it "can call parser methods to modify the uri" do
|
99
|
+
blk = ->(uri){ uri.unembed! }
|
100
|
+
uri = described_class.call('http://energy.gov/exit?url=https%3A//twitter.com/energy', &blk)
|
101
|
+
expect(uri).to eq described_class.call('https://twitter.com/energy')
|
102
|
+
end
|
103
|
+
|
104
|
+
it "accepts the :raw option" do
|
105
|
+
expect(described_class.call('https://twitter.com/energy', raw: true))
|
106
|
+
.to eq 'https://twitter.com/energy'
|
107
|
+
end
|
108
|
+
|
109
|
+
end
|
110
|
+
|
111
|
+
end
|
112
|
+
|
113
|
+
context "#parse" do
|
114
|
+
|
115
|
+
let(:instance) { described_class.new(url) }
|
116
|
+
|
117
|
+
it "returns a parsed Addressable::URI" do
|
118
|
+
expect(instance.parse).to be_an Addressable::URI
|
119
|
+
end
|
120
|
+
|
121
|
+
it "joins URIs with a :base_uri option" do
|
122
|
+
instance = described_class.new('/bar#id', base_uri: 'http://foo.com/zee/zaw/zoom.html')
|
123
|
+
expect(instance.parse).to eq described_class.call('http://foo.com/bar#id')
|
124
|
+
end
|
125
|
+
|
126
|
+
it "does not change the value of #uri" do
|
127
|
+
expect{
|
128
|
+
instance.parse
|
129
|
+
}.not_to change{
|
130
|
+
instance.uri
|
131
|
+
}
|
132
|
+
end
|
133
|
+
|
134
|
+
end
|
135
|
+
|
136
|
+
context "#parse!" do
|
137
|
+
|
138
|
+
let(:instance) { described_class.new(url) }
|
139
|
+
|
140
|
+
it "updates #uri with the parsed Addressable::URI" do
|
141
|
+
expect{
|
142
|
+
instance.parse!
|
143
|
+
}.to change{
|
144
|
+
instance.uri
|
145
|
+
}
|
146
|
+
end
|
147
|
+
|
148
|
+
it "is idempotent" do
|
149
|
+
instance.parse!
|
150
|
+
expect{
|
151
|
+
instance.parse!
|
152
|
+
}.not_to change{
|
153
|
+
instance.uri
|
154
|
+
}
|
155
|
+
end
|
156
|
+
|
157
|
+
end
|
158
|
+
|
159
|
+
context "#unescape" do
|
160
|
+
|
161
|
+
let(:instance) { described_class.new('http://example.com/path?id%3D1') }
|
162
|
+
|
163
|
+
it "returns an unescaped string" do
|
164
|
+
expect(instance.unescape).to eq 'http://example.com/path?id=1'
|
165
|
+
end
|
166
|
+
|
167
|
+
it "does not change the value of #uri" do
|
168
|
+
expect{
|
169
|
+
instance.unescape
|
170
|
+
}.not_to change{
|
171
|
+
instance.uri
|
172
|
+
}
|
173
|
+
end
|
174
|
+
|
175
|
+
end
|
176
|
+
|
177
|
+
context "#unescape!" do
|
178
|
+
|
179
|
+
let(:instance) { described_class.new('http://example.com/path?id%3D1') }
|
180
|
+
|
181
|
+
it "updates #uri with the unescaped string" do
|
182
|
+
expect{
|
183
|
+
instance.unescape!
|
184
|
+
}.to change{
|
185
|
+
instance.uri
|
186
|
+
}
|
187
|
+
end
|
188
|
+
|
189
|
+
it "is idempotent" do
|
190
|
+
instance.unescape!
|
191
|
+
expect{
|
192
|
+
instance.unescape!
|
193
|
+
}.not_to change{
|
194
|
+
instance.uri
|
195
|
+
}
|
196
|
+
end
|
197
|
+
|
198
|
+
end
|
199
|
+
|
200
|
+
context "#unembed" do
|
201
|
+
|
202
|
+
it "extracts an embedded url from a 'u' param" do
|
203
|
+
url = 'http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fexample.com%2Fnews'
|
204
|
+
instance = described_class.new(url)
|
205
|
+
expect(instance.unembed).to eq described_class.call('http://example.com/news')
|
206
|
+
end
|
207
|
+
|
208
|
+
it "extracts an embedded url from a 'url' param" do
|
209
|
+
url = 'http://energy.gov/exit?url=https%3A//twitter.com/energy'
|
210
|
+
instance = described_class.new(url)
|
211
|
+
expect(instance.unembed).to eq described_class.call('https://twitter.com/energy')
|
212
|
+
end
|
213
|
+
|
214
|
+
it "accepts a custom embedded param key" do
|
215
|
+
url = 'https://www.upwork.com/leaving?ref=https%3A%2F%2Fwww.solaraccreditation.com.au' +
|
216
|
+
'%2Fconsumers%2Ffind-an-installer.html'
|
217
|
+
instance = described_class.new(url, embedded_params: 'ref')
|
218
|
+
expect(instance.unembed)
|
219
|
+
.to eq described_class.call('https://www.solaraccreditation.com.au/consumers/find-an-installer.html')
|
220
|
+
end
|
221
|
+
|
222
|
+
it "accepts custom embedded param keys" do
|
223
|
+
url = 'https://www.upwork.com/leaving?ref=https%3A%2F%2Fwww.solaraccreditation.com.au' +
|
224
|
+
'%2Fconsumers%2Ffind-an-installer.html'
|
225
|
+
instance = described_class.new(url, embedded_params: [ 'u', 'url', 'ref'])
|
226
|
+
expect(instance.unembed)
|
227
|
+
.to eq described_class.call('https://www.solaraccreditation.com.au/consumers/find-an-installer.html')
|
228
|
+
end
|
229
|
+
|
230
|
+
end
|
231
|
+
|
232
|
+
context "#unembed!" do
|
233
|
+
|
234
|
+
let(:instance) { described_class.new('http://energy.gov/exit?url=https%3A//twitter.com/energy') }
|
235
|
+
|
236
|
+
it "updates #uri with the unembedded URI" do
|
237
|
+
expect{
|
238
|
+
instance.unembed!
|
239
|
+
}.to change{
|
240
|
+
instance.uri
|
241
|
+
}
|
242
|
+
end
|
243
|
+
|
244
|
+
it "is idempotent" do
|
245
|
+
instance.unembed!
|
246
|
+
expect{
|
247
|
+
instance.unembed!
|
248
|
+
}.not_to change{
|
249
|
+
instance.uri
|
250
|
+
}
|
251
|
+
end
|
252
|
+
|
253
|
+
end
|
254
|
+
|
255
|
+
context "#normalize" do
|
256
|
+
|
257
|
+
let(:example) { described_class.call('http://example.com/') }
|
258
|
+
|
259
|
+
# Shorthand helper: passes +uri+ to described_class.normalize and returns the result as a String.
# NOTE(review): no example in the lines shown here calls this helper — confirm it is still needed.
def n(uri)
|
260
|
+
described_class.normalize(uri).to_s
|
261
|
+
end
|
262
|
+
|
263
|
+
it "normalizes paths" do
|
264
|
+
expect(described_class.new('http://example.com/').normalize).to eq example
|
265
|
+
expect(described_class.new('http://example.com').normalize).to eq example
|
266
|
+
expect(described_class.new('http://example.com///').normalize).to eq example
|
267
|
+
expect(described_class.new('http://example.com/../').normalize).to eq example
|
268
|
+
expect(described_class.new('http://example.com/a/b/../../').normalize).to eq example
|
269
|
+
expect(described_class.new('http://example.com/a/b/../..').normalize).to eq example
|
270
|
+
end
|
271
|
+
|
272
|
+
it "normalizes query strings" do
|
273
|
+
expect(described_class.new('http://example.com/?').normalize).to eq example
|
274
|
+
expect(described_class.new('http://example.com?').normalize).to eq example
|
275
|
+
expect(described_class.new('http://example.com/a/../?').normalize).to eq example
|
276
|
+
end
|
277
|
+
|
278
|
+
it "normalizes anchors" do
|
279
|
+
expect(described_class.new('http://example.com#test').normalize).to eq example
|
280
|
+
expect(described_class.new('http://example.com#test#test').normalize).to eq example
|
281
|
+
expect(described_class.new('http://example.com/a/../?#test').normalize).to eq example
|
282
|
+
end
|
283
|
+
|
284
|
+
it "cleans whitespace" do
|
285
|
+
expect(described_class.new('http://example.com/a/../? ').normalize).to eq example
|
286
|
+
expect(described_class.new('http://example.com/a/../? #test').normalize).to eq example
|
287
|
+
expect(described_class.new('http://example.com/ /../').normalize).to eq example
|
288
|
+
end
|
289
|
+
|
290
|
+
it "normalizes the hostname" do
|
291
|
+
expect(described_class.new('EXAMPLE.COM').normalize).to eq example
|
292
|
+
expect(described_class.new('EXAMPLE.COM/ABC').normalize).to eq (example + 'ABC')
|
293
|
+
expect(described_class.new("💩.la").normalize).to eq described_class.call("xn--ls8h.la")
|
294
|
+
end
|
295
|
+
|
296
|
+
it "defaults to http scheme if missing" do
|
297
|
+
expect(described_class.new('example.com').normalize).to eq example
|
298
|
+
expect(described_class.new('https://example.com/').normalize)
|
299
|
+
.to eq described_class.call('https://example.com/')
|
300
|
+
end
|
301
|
+
|
302
|
+
it "removes trailing slashes on paths" do
|
303
|
+
expect(described_class.new('http://example.com/').normalize).to eq example
|
304
|
+
expect(described_class.new('http://example.com/a').normalize).to eq (example + 'a')
|
305
|
+
expect(described_class.new('http://example.com/a/').normalize).to eq (example + 'a')
|
306
|
+
expect(described_class.new('http://example.com/a/b').normalize).to eq (example + 'a/b')
|
307
|
+
expect(described_class.new('http://example.com/a/b/').normalize).to eq (example + 'a/b')
|
308
|
+
end
|
309
|
+
|
310
|
+
end
|
311
|
+
|
312
|
+
context "#normalize!" do
|
313
|
+
|
314
|
+
let(:instance) { described_class.new('http://example.com///') }
|
315
|
+
|
316
|
+
it "updates #uri with the normalized string" do
|
317
|
+
expect{
|
318
|
+
instance.normalize!
|
319
|
+
}.to change{
|
320
|
+
instance.uri
|
321
|
+
}
|
322
|
+
end
|
323
|
+
|
324
|
+
it "is idempotent" do
|
325
|
+
instance.normalize!
|
326
|
+
expect{
|
327
|
+
instance.normalize!
|
328
|
+
}.not_to change{
|
329
|
+
instance.uri
|
330
|
+
}
|
331
|
+
end
|
332
|
+
|
333
|
+
end
|
334
|
+
|
335
|
+
context "#canonicalize" do
|
336
|
+
|
337
|
+
let(:instance) { described_class.new('https://wikipedia.org/?source=ABCD&utm_source=EFGH') }
|
338
|
+
|
339
|
+
it "is aliased to #c14n" do
|
340
|
+
expect(instance.method(:canonicalize)).to eq instance.method(:c14n)
|
341
|
+
end
|
342
|
+
|
343
|
+
it "returns a canonicalized Addressable::URI" do
|
344
|
+
expect(instance.canonicalize).to eq Addressable::URI.parse('https://wikipedia.org/')
|
345
|
+
end
|
346
|
+
|
347
|
+
it "does not change the value of #uri" do
|
348
|
+
expect{
|
349
|
+
instance.canonicalize
|
350
|
+
}.not_to change{
|
351
|
+
instance.uri
|
352
|
+
}
|
353
|
+
end
|
354
|
+
|
355
|
+
end
|
356
|
+
|
357
|
+
context "#canonicalize!" do
|
358
|
+
|
359
|
+
let(:instance) { described_class.new('https://wikipedia.org/?source=ABCD&utm_source=EFGH') }
|
360
|
+
|
361
|
+
it "is aliased to #c14n!" do
|
362
|
+
expect(instance.method(:canonicalize!)).to eq instance.method(:c14n!)
|
363
|
+
end
|
364
|
+
|
365
|
+
it "updates #uri with the canonicalized URI" do
|
366
|
+
expect{
|
367
|
+
instance.canonicalize!
|
368
|
+
}.to change{
|
369
|
+
instance.uri
|
370
|
+
}
|
371
|
+
end
|
372
|
+
|
373
|
+
it "is idempotent" do
|
374
|
+
instance.canonicalize!
|
375
|
+
expect{
|
376
|
+
instance.canonicalize!
|
377
|
+
}.not_to change{
|
378
|
+
instance.uri
|
379
|
+
}
|
380
|
+
end
|
381
|
+
|
382
|
+
end
|
383
|
+
|
384
|
+
context "#raw" do
|
385
|
+
|
386
|
+
let(:instance) { described_class.new('https://example.com') }
|
387
|
+
|
388
|
+
it "returns a string" do
|
389
|
+
instance.parse!
|
390
|
+
expect(instance.raw).to eq 'https://example.com/'
|
391
|
+
end
|
392
|
+
|
393
|
+
it "does not change the value of #uri" do
|
394
|
+
expect{
|
395
|
+
instance.raw
|
396
|
+
}.not_to change{
|
397
|
+
instance.uri
|
398
|
+
}
|
399
|
+
end
|
400
|
+
|
401
|
+
end
|
402
|
+
|
403
|
+
context "#raw!" do
|
404
|
+
|
405
|
+
let(:instance) { described_class.new('https://example.com') }
|
406
|
+
|
407
|
+
before do
|
408
|
+
instance.parse!
|
409
|
+
end
|
410
|
+
|
411
|
+
it "updates #uri with the raw string" do
|
412
|
+
expect{
|
413
|
+
instance.raw!
|
414
|
+
}.to change{
|
415
|
+
instance.uri
|
416
|
+
}
|
417
|
+
end
|
418
|
+
|
419
|
+
it "is idempotent" do
|
420
|
+
instance.raw!
|
421
|
+
expect{
|
422
|
+
instance.raw!
|
423
|
+
}.not_to change{
|
424
|
+
instance.uri
|
425
|
+
}
|
426
|
+
end
|
427
|
+
|
428
|
+
end
|
429
|
+
|
430
|
+
context "#clean!" do
|
431
|
+
|
432
|
+
let(:instance) { described_class.new('#') }
|
433
|
+
|
434
|
+
it "unescapes the URI" do
|
435
|
+
expect(instance).to receive :unescape!
|
436
|
+
instance.clean!
|
437
|
+
end
|
438
|
+
|
439
|
+
it "parses the URI" do
|
440
|
+
expect(instance).to receive :parse!
|
441
|
+
instance.clean!
|
442
|
+
end
|
443
|
+
|
444
|
+
it "unembeds the URI" do
|
445
|
+
expect(instance).to receive :unembed!
|
446
|
+
instance.clean!
|
447
|
+
end
|
448
|
+
|
449
|
+
it "canonicalizes the URI" do
|
450
|
+
expect(instance).to receive :canonicalize!
|
451
|
+
instance.clean!
|
452
|
+
end
|
453
|
+
|
454
|
+
it "normalizes the URI" do
|
455
|
+
expect(instance).to receive :normalize!
|
456
|
+
instance.clean!
|
457
|
+
end
|
458
|
+
|
459
|
+
it "does not convert the URI to a string by default" do
|
460
|
+
expect(instance).not_to receive :raw!
|
461
|
+
instance.clean!
|
462
|
+
end
|
463
|
+
|
464
|
+
it "returns a string with the :raw option enabled" do
|
465
|
+
instance = described_class.new('#', raw: true)
|
466
|
+
expect(instance).to receive :raw!
|
467
|
+
instance.clean!
|
468
|
+
end
|
469
|
+
|
470
|
+
end
|
471
|
+
|
472
|
+
context "#sha1" do
|
473
|
+
|
474
|
+
let(:instance) { described_class.new('http://example.com') }
|
475
|
+
|
476
|
+
it "is aliased to #hash" do
|
477
|
+
expect(instance.method(:sha1)).to eq instance.method(:hash)
|
478
|
+
end
|
479
|
+
|
480
|
+
it "returns a SHA1 hash representation of the raw uri" do
|
481
|
+
expect(instance.sha1).to eq "89dce6a446a69d6b9bdc01ac75251e4c322bcdff"
|
482
|
+
end
|
483
|
+
|
484
|
+
end
|
485
|
+
|
486
|
+
context "#==" do
|
487
|
+
|
488
|
+
it "is true if two URIs have the same SHA1" do
|
489
|
+
expect(
|
490
|
+
described_class.new('http://example.com/') == 'http://example.com'
|
491
|
+
).to be true
|
492
|
+
end
|
493
|
+
|
494
|
+
it "is false if two URIs do not have the same SHA1" do
|
495
|
+
expect(
|
496
|
+
described_class.new('http://example.com/') == 'http://example.org'
|
497
|
+
).to be false
|
498
|
+
end
|
499
|
+
|
500
|
+
it "cleans both URIs before comparing" do
|
501
|
+
expect(
|
502
|
+
described_class.new('http://example.com/?utm_source=google') ==
|
503
|
+
'http://example.com/?utm_source=yahoo'
|
504
|
+
).to be true
|
505
|
+
end
|
506
|
+
|
507
|
+
it "compares two URIs with the :raw option enabled" do
|
508
|
+
expect(
|
509
|
+
described_class.new('http://example.com/?utm_source=google', raw: true) ==
|
510
|
+
'http://example.com/?utm_source=yahoo'
|
511
|
+
).to be true
|
512
|
+
end
|
513
|
+
|
514
|
+
end
|
515
|
+
end
|