postrank-uri 1.0.18 → 1.0.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Appraisals +19 -0
- data/LICENSE +21 -0
- data/README.md +26 -1
- data/Rakefile +3 -0
- data/lib/postrank-uri.rb +12 -7
- data/lib/postrank-uri/version.rb +1 -1
- data/postrank-uri.gemspec +6 -3
- data/spec/postrank-uri_spec.rb +163 -148
- metadata +73 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a453df060f3bb0d7ea04c7031d17f9e2039e8fc
|
4
|
+
data.tar.gz: fc665791c45e60179c706e9e0e2b3c8c50a58f73
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c6eec4d6e64e4c400d1ba9e2508a6bca95008452d3e64f22511b56236892a12afd582a7d8be28b5044e0e03f709459f5bba238359ac6408d42221912e8970a3
|
7
|
+
data.tar.gz: 7816c251f9ba449f2f3ea3c8e8e4fb1f80c2e76f8fd004c2c73021b3e47fc7cc1068467f90e17986f041ea948c375b24ce7248a837d8f0d13dedc1f0c4773ed1
|
data/.gitignore
CHANGED
data/Appraisals
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
appraise "nokogiri-1.7" do
|
2
|
+
gem "nokogiri", "~> 1.7.0"
|
3
|
+
end
|
4
|
+
|
5
|
+
appraise "nokogiri-1.6" do
|
6
|
+
gem "nokogiri", "~> 1.6.1"
|
7
|
+
end
|
8
|
+
|
9
|
+
appraise "addressable-2.3" do
|
10
|
+
gem "addressable", "~> 2.3.0"
|
11
|
+
end
|
12
|
+
|
13
|
+
appraise "addressable-2.4" do
|
14
|
+
gem "addressable", "~> 2.4.0"
|
15
|
+
end
|
16
|
+
|
17
|
+
appraise "addressable-2.5" do
|
18
|
+
gem "addressable", "~> 2.5.0"
|
19
|
+
end
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2011 Ilya Grigorik
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
CHANGED
@@ -37,4 +37,29 @@ In a nutshell, we need to make sure that creative cases like the ones below all
|
|
37
37
|
|
38
38
|
As part of URI canonicalization the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, which has no effect on the content - hence, it is removed from the URI. For the full list, see the c14n.yml file.
|
39
39
|
|
40
|
-
Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), instead we are compiling a manually assembled database. If you find cases which are missing, please do report them, or send us a pull request!
|
40
|
+
Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), instead we are compiling a manually assembled database. If you find cases which are missing, please do report them, or send us a pull request!
|
41
|
+
|
42
|
+
## Development
|
43
|
+
|
44
|
+
### Setup
|
45
|
+
|
46
|
+
```
|
47
|
+
bundle install
|
48
|
+
```
|
49
|
+
|
50
|
+
### Running tests
|
51
|
+
|
52
|
+
```
|
53
|
+
bundle exec rake
|
54
|
+
```
|
55
|
+
|
56
|
+
### Running dependency appraisals
|
57
|
+
|
58
|
+
To verify `postrank-uri` works with different versions of its runtime dependencies you can run:
|
59
|
+
|
60
|
+
```
|
61
|
+
bundle exec appraisal install
|
62
|
+
bundle exec rake appraisal
|
63
|
+
```
|
64
|
+
|
65
|
+
This will execute the test suite with different versions of the dependencies.
|
data/Rakefile
CHANGED
data/lib/postrank-uri.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# encoding: utf-8
|
2
2
|
require 'addressable/uri'
|
3
3
|
require 'digest/md5'
|
4
4
|
require 'nokogiri'
|
@@ -9,7 +9,7 @@ module Addressable
|
|
9
9
|
class URI
|
10
10
|
def domain
|
11
11
|
host = self.host
|
12
|
-
(host && PublicSuffix.valid?(host)) ? PublicSuffix.parse(host).domain : nil
|
12
|
+
(host && PublicSuffix.valid?(host, default_rule: nil)) ? PublicSuffix.parse(host).domain : nil
|
13
13
|
end
|
14
14
|
|
15
15
|
def normalized_query
|
@@ -86,8 +86,9 @@ module PostRank
|
|
86
86
|
)
|
87
87
|
}iox;
|
88
88
|
|
89
|
+
URIREGEX[:reserved_characters] = /%3F|%26/i
|
89
90
|
URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x
|
90
|
-
URIREGEX[:unescape] = /(
|
91
|
+
URIREGEX[:unescape] = /(%[0-9a-fA-F]{2})/x
|
91
92
|
URIREGEX.each_pair{|k,v| v.freeze }
|
92
93
|
|
93
94
|
module_function
|
@@ -97,7 +98,7 @@ module PostRank
|
|
97
98
|
urls = []
|
98
99
|
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
99
100
|
# Only extract the URL if the domain is valid
|
100
|
-
if PublicSuffix.valid?(domain)
|
101
|
+
if PublicSuffix.valid?(domain, default_rule: nil)
|
101
102
|
url = clean(url)
|
102
103
|
urls.push url.to_s
|
103
104
|
end
|
@@ -131,8 +132,12 @@ module PostRank
|
|
131
132
|
def unescape(uri)
|
132
133
|
u = parse(uri)
|
133
134
|
u.query = u.query.tr('+', ' ') if u.query
|
134
|
-
u.to_s.gsub(URIREGEX[:unescape]) do
|
135
|
-
[
|
135
|
+
u.to_s.gsub(URIREGEX[:unescape]) do |encoded|
|
136
|
+
if encoded.match? URIREGEX[:reserved_characters]
|
137
|
+
encoded
|
138
|
+
else
|
139
|
+
[encoded.delete('%')].pack('H*')
|
140
|
+
end
|
136
141
|
end
|
137
142
|
end
|
138
143
|
|
@@ -225,7 +230,7 @@ module PostRank
|
|
225
230
|
cleaned_uri = clean(uri, :raw => true)
|
226
231
|
|
227
232
|
if host = cleaned_uri.host
|
228
|
-
is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host))
|
233
|
+
is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
|
229
234
|
end
|
230
235
|
|
231
236
|
is_valid
|
data/lib/postrank-uri/version.rb
CHANGED
data/postrank-uri.gemspec
CHANGED
@@ -11,14 +11,17 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.homepage = "http://github.com/postrank-labs/postrank-uri"
|
12
12
|
s.summary = "URI normalization, c14n, escaping, and extraction"
|
13
13
|
s.description = s.summary
|
14
|
+
s.license = 'MIT'
|
14
15
|
|
15
16
|
s.rubyforge_project = "postrank-uri"
|
16
17
|
|
17
|
-
s.add_dependency "addressable", "
|
18
|
-
s.add_dependency "public_suffix", "
|
19
|
-
s.add_dependency "nokogiri", "
|
18
|
+
s.add_dependency "addressable", ">= 2.3.0", "< 2.6"
|
19
|
+
s.add_dependency "public_suffix", ">= 2.0.0", "< 2.1"
|
20
|
+
s.add_dependency "nokogiri", ">= 1.6.1", "< 1.8"
|
20
21
|
|
22
|
+
s.add_development_dependency "rake"
|
21
23
|
s.add_development_dependency "rspec"
|
24
|
+
s.add_development_dependency "appraisal", ">= 2.0.0", "< 3.0"
|
22
25
|
|
23
26
|
s.files = `git ls-files`.split("\n")
|
24
27
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/spec/postrank-uri_spec.rb
CHANGED
@@ -3,103 +3,101 @@
|
|
3
3
|
require 'helper'
|
4
4
|
|
5
5
|
describe PostRank::URI do
|
6
|
-
|
7
|
-
let(:igvita) { 'http://igvita.com/' }
|
8
|
-
|
9
6
|
context "escaping" do
|
10
|
-
it "
|
11
|
-
PostRank::URI.escape('id=1').
|
7
|
+
it "escapes PostRank::URI string" do
|
8
|
+
expect(PostRank::URI.escape('id=1')).to eq('id%3D1')
|
12
9
|
end
|
13
10
|
|
14
|
-
it "
|
15
|
-
PostRank::URI.escape('id= 1').
|
11
|
+
it "escapes spaces as %20's" do
|
12
|
+
expect(PostRank::URI.escape('id= 1')).to match('%20')
|
16
13
|
end
|
17
14
|
end
|
18
15
|
|
19
16
|
context "unescape" do
|
20
|
-
it "
|
21
|
-
PostRank::URI.unescape(PostRank::URI.escape('id=1')).
|
17
|
+
it "unescapes PostRank::URI" do
|
18
|
+
expect(PostRank::URI.unescape(PostRank::URI.escape('id=1'))).to eq('id=1')
|
22
19
|
end
|
23
20
|
|
24
|
-
it "
|
25
|
-
PostRank::URI.unescape(PostRank::URI.escape('id= 1')).
|
21
|
+
it "unescapes PostRank::URI with spaces" do
|
22
|
+
expect(PostRank::URI.unescape(PostRank::URI.escape('id= 1'))).to eq('id= 1')
|
26
23
|
end
|
27
24
|
|
28
25
|
context "accept improperly escaped PostRank::URI strings" do
|
29
26
|
# See http://tools.ietf.org/html/rfc3986#section-2.3
|
30
27
|
|
31
|
-
it "
|
32
|
-
PostRank::URI.unescape('?id=+1').
|
28
|
+
it "unescapes PostRank::URI with spaces encoded as '+'" do
|
29
|
+
expect(PostRank::URI.unescape('?id=+1')).to eq('?id= 1')
|
33
30
|
end
|
34
31
|
|
35
|
-
it "
|
36
|
-
PostRank::URI.unescape('?id%3D+1').
|
32
|
+
it "unescapes PostRank::URI with spaces encoded as '+'" do
|
33
|
+
expect(PostRank::URI.unescape('?id%3D+1')).to eq('?id= 1')
|
37
34
|
end
|
38
35
|
|
39
|
-
it "
|
40
|
-
PostRank::URI.unescape('?id=%201').
|
36
|
+
it "unescapes PostRank::URI with spaces encoded as %20" do
|
37
|
+
expect(PostRank::URI.unescape('?id=%201')).to eq('?id= 1')
|
41
38
|
end
|
42
39
|
|
43
|
-
it "
|
44
|
-
PostRank::URI.unescape('/foo+bar?id=foo+bar').
|
40
|
+
it "does not unescape '+' to spaces in paths" do
|
41
|
+
expect(PostRank::URI.unescape('/foo+bar?id=foo+bar')).to eq('/foo+bar?id=foo bar')
|
45
42
|
end
|
46
43
|
end
|
47
44
|
|
48
45
|
end
|
49
46
|
|
50
47
|
context "normalize" do
|
48
|
+
let(:igvita) { 'http://igvita.com/' }
|
49
|
+
|
51
50
|
def n(uri)
|
52
51
|
PostRank::URI.normalize(uri).to_s
|
53
52
|
end
|
54
53
|
|
55
|
-
it "
|
56
|
-
n('http://igvita.com/').
|
57
|
-
n('http://igvita.com').to_s.
|
58
|
-
n('http://igvita.com///').
|
54
|
+
it "normalizes paths in PostRank::URIs" do
|
55
|
+
expect(n('http://igvita.com/')).to eq(igvita)
|
56
|
+
expect(n('http://igvita.com').to_s).to eq(igvita)
|
57
|
+
expect(n('http://igvita.com///')).to eq(igvita)
|
59
58
|
|
60
|
-
n('http://igvita.com/../').
|
61
|
-
n('http://igvita.com/a/b/../../').
|
62
|
-
n('http://igvita.com/a/b/../..').
|
59
|
+
expect(n('http://igvita.com/../')).to eq(igvita)
|
60
|
+
expect(n('http://igvita.com/a/b/../../')).to eq(igvita)
|
61
|
+
expect(n('http://igvita.com/a/b/../..')).to eq(igvita)
|
63
62
|
end
|
64
63
|
|
65
|
-
it "
|
66
|
-
n('http://igvita.com/?').
|
67
|
-
n('http://igvita.com?').
|
68
|
-
n('http://igvita.com/a/../?').
|
64
|
+
it "normalizes query strings in PostRank::URIs" do
|
65
|
+
expect(n('http://igvita.com/?')).to eq(igvita)
|
66
|
+
expect(n('http://igvita.com?')).to eq(igvita)
|
67
|
+
expect(n('http://igvita.com/a/../?')).to eq(igvita)
|
69
68
|
end
|
70
69
|
|
71
|
-
it "
|
72
|
-
n('http://igvita.com#test').
|
73
|
-
n('http://igvita.com#test#test').
|
74
|
-
n('http://igvita.com/a/../?#test').
|
70
|
+
it "normalizes anchors in PostRank::URIs" do
|
71
|
+
expect(n('http://igvita.com#test')).to eq(igvita)
|
72
|
+
expect(n('http://igvita.com#test#test')).to eq(igvita)
|
73
|
+
expect(n('http://igvita.com/a/../?#test')).to eq(igvita)
|
75
74
|
end
|
76
75
|
|
77
|
-
it "
|
78
|
-
n('http://igvita.com/a/../? ').
|
79
|
-
n('http://igvita.com/a/../? #test').
|
80
|
-
n('http://igvita.com/ /../').
|
76
|
+
it "cleans whitespace in PostRank::URIs" do
|
77
|
+
expect(n('http://igvita.com/a/../? ')).to eq(igvita)
|
78
|
+
expect(n('http://igvita.com/a/../? #test')).to eq(igvita)
|
79
|
+
expect(n('http://igvita.com/ /../')).to eq(igvita)
|
81
80
|
end
|
82
81
|
|
83
|
-
it "
|
84
|
-
n('igvita.com').
|
85
|
-
n('https://test.com/').to_s.
|
82
|
+
it "defaults to http scheme if missing" do
|
83
|
+
expect(n('igvita.com')).to eq(igvita)
|
84
|
+
expect(n('https://test.com/').to_s).to eq('https://test.com/')
|
86
85
|
end
|
87
86
|
|
88
|
-
it "
|
89
|
-
n('IGVITA.COM').
|
90
|
-
n('IGVITA.COM/ABC').
|
87
|
+
it "downcases the hostname" do
|
88
|
+
expect(n('IGVITA.COM')).to eq(igvita)
|
89
|
+
expect(n('IGVITA.COM/ABC')).to eq(igvita + "ABC")
|
91
90
|
end
|
92
91
|
|
93
|
-
it "
|
94
|
-
n('http://igvita.com/').
|
92
|
+
it "removes trailing slash on paths" do
|
93
|
+
expect(n('http://igvita.com/')).to eq('http://igvita.com/')
|
95
94
|
|
96
|
-
n('http://igvita.com/a').
|
97
|
-
n('http://igvita.com/a/').
|
95
|
+
expect(n('http://igvita.com/a')).to eq('http://igvita.com/a')
|
96
|
+
expect(n('http://igvita.com/a/')).to eq('http://igvita.com/a')
|
98
97
|
|
99
|
-
n('http://igvita.com/a/b').
|
100
|
-
n('http://igvita.com/a/b/').
|
98
|
+
expect(n('http://igvita.com/a/b')).to eq('http://igvita.com/a/b')
|
99
|
+
expect(n('http://igvita.com/a/b/')).to eq('http://igvita.com/a/b')
|
101
100
|
end
|
102
|
-
|
103
101
|
end
|
104
102
|
|
105
103
|
context "canonicalization" do
|
@@ -109,60 +107,60 @@ describe PostRank::URI do
|
|
109
107
|
|
110
108
|
context "query parameters" do
|
111
109
|
it "should handle nester parameters" do
|
112
|
-
c('igvita.com/?id=a&utm_source=a').
|
110
|
+
expect(c('igvita.com/?id=a&utm_source=a')).to eq('http://igvita.com/?id=a')
|
113
111
|
end
|
114
112
|
|
115
|
-
it "
|
113
|
+
it "preserves the order of parameters" do
|
116
114
|
url = 'http://a.com/?'+('a'..'z').to_a.shuffle.map {|e| "#{e}=#{e}"}.join("&")
|
117
|
-
c(url).
|
115
|
+
expect(c(url)).to eq(url)
|
118
116
|
end
|
119
117
|
|
120
|
-
it "
|
121
|
-
c('igvita.com/?id=a&utm_source=a').
|
122
|
-
c('igvita.com/?id=a&utm_source=a&utm_valid').
|
118
|
+
it "removes Google Analytics parameters" do
|
119
|
+
expect(c('igvita.com/?id=a&utm_source=a')).to eq('http://igvita.com/?id=a')
|
120
|
+
expect(c('igvita.com/?id=a&utm_source=a&utm_valid')).to eq('http://igvita.com/?id=a&utm_valid')
|
123
121
|
end
|
124
122
|
|
125
|
-
it "
|
126
|
-
c('igvita.com/?id=a&utm_source=a&awesm=b').
|
127
|
-
c('igvita.com/?id=a&sms_ss=a').
|
123
|
+
it "removes awesm/sms parameters" do
|
124
|
+
expect(c('igvita.com/?id=a&utm_source=a&awesm=b')).to eq('http://igvita.com/?id=a')
|
125
|
+
expect(c('igvita.com/?id=a&sms_ss=a')).to eq('http://igvita.com/?id=a')
|
128
126
|
end
|
129
127
|
|
130
|
-
it "
|
131
|
-
c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').
|
132
|
-
c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').
|
133
|
-
c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').
|
128
|
+
it "removes PHPSESSID parameter" do
|
129
|
+
expect(c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum?')
|
130
|
+
expect(c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum/?')
|
131
|
+
expect(c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum?id=123')
|
134
132
|
end
|
135
133
|
end
|
136
134
|
|
137
135
|
context "hashbang" do
|
138
|
-
it "
|
139
|
-
c('http://twitter.com/#!/igrigorik').
|
140
|
-
c('http://twitter.com/#!/a/statuses/1').
|
141
|
-
c('http://nontwitter.com/#!/a/statuses/1').
|
136
|
+
it "rewrites twitter links to crawlable versions" do
|
137
|
+
expect(c('http://twitter.com/#!/igrigorik')).to eq('http://twitter.com/igrigorik')
|
138
|
+
expect(c('http://twitter.com/#!/a/statuses/1')).to eq('http://twitter.com/a/statuses/1')
|
139
|
+
expect(c('http://nontwitter.com/#!/a/statuses/1')).to eq('http://nontwitter.com/#!/a/statuses/1')
|
142
140
|
end
|
143
141
|
end
|
144
142
|
|
145
143
|
context "tumblr" do
|
146
|
-
it "
|
147
|
-
c('http://test.tumblr.com/post/4533459403/some-text').
|
148
|
-
c('http://tumblr.com/xjl2evo3hh').
|
144
|
+
it "strips the slug" do
|
145
|
+
expect(c('http://test.tumblr.com/post/4533459403/some-text')).to eq('http://test.tumblr.com/post/4533459403/')
|
146
|
+
expect(c('http://tumblr.com/xjl2evo3hh')).to eq('http://tumblr.com/xjl2evo3hh')
|
149
147
|
end
|
150
148
|
end
|
151
149
|
|
152
150
|
context "embedded links" do
|
153
|
-
it "
|
151
|
+
it "extracts embedded redirects from Google News" do
|
154
152
|
u = c('http://news.google.com/news/url?sa=t&fd=R&&url=http://www.ctv.ca/CTVNews/Politics/20110111/')
|
155
|
-
u.
|
153
|
+
expect(u).to eq('http://www.ctv.ca/CTVNews/Politics/20110111')
|
156
154
|
end
|
157
155
|
|
158
|
-
it "
|
156
|
+
it "extracts embedded redirects from xfruits.com" do
|
159
157
|
u = c('http://xfruits.com/MrGroar/?url=http%3A%2F%2Faap.lesroyaumes.com%2Fdepeches%2Fdepeche351820908.html')
|
160
|
-
u.
|
158
|
+
expect(u).to eq('http://aap.lesroyaumes.com/depeches/depeche351820908.html')
|
161
159
|
end
|
162
160
|
|
163
|
-
it "
|
161
|
+
it "extracts embedded redirects from MySpace" do
|
164
162
|
u = c('http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fghanaian-chronicle.com%2Fnews%2Fother-news%2Fcanadian-high-commissioner-urges-media%2F&t=Canadian%20High%20Commissioner%20urges%20media')
|
165
|
-
u.
|
163
|
+
expect(u).to eq('http://ghanaian-chronicle.com/news/other-news/canadian-high-commissioner-urges-media')
|
166
164
|
end
|
167
165
|
end
|
168
166
|
end
|
@@ -172,25 +170,42 @@ describe PostRank::URI do
|
|
172
170
|
PostRank::URI.clean(uri)
|
173
171
|
end
|
174
172
|
|
175
|
-
it "
|
176
|
-
c('http://igvita.com/?id=1').
|
177
|
-
c('igvita.com/?id=1').
|
173
|
+
it "unescapes, canonicalizes and normalizes" do
|
174
|
+
expect(c('http://igvita.com/?id=1')).to eq('http://igvita.com/?id=1')
|
175
|
+
expect(c('igvita.com/?id=1')).to eq('http://igvita.com/?id=1')
|
178
176
|
|
179
|
-
c('http://igvita.com/?id= 1').
|
180
|
-
c('http://igvita.com/?id=+1').
|
181
|
-
c('http://igvita.com/?id%3D%201').
|
177
|
+
expect(c('http://igvita.com/?id= 1')).to eq('http://igvita.com/?id=%201')
|
178
|
+
expect(c('http://igvita.com/?id=+1')).to eq('http://igvita.com/?id=%201')
|
179
|
+
expect(c('http://igvita.com/?id%3D%201')).to eq('http://igvita.com/?id=%201')
|
182
180
|
|
183
|
-
c('igvita.com/a/..?id=1&utm_source=a&awesm=b#c').
|
181
|
+
expect(c('igvita.com/a/..?id=1&utm_source=a&awesm=b#c')).to eq('http://igvita.com/?id=1')
|
184
182
|
|
185
|
-
c('igvita.com?id=<>').
|
186
|
-
c('igvita.com?id="').
|
183
|
+
expect(c('igvita.com?id=<>')).to eq('http://igvita.com/?id=%3C%3E')
|
184
|
+
expect(c('igvita.com?id="')).to eq('http://igvita.com/?id=%22')
|
187
185
|
|
188
|
-
c('test.tumblr.com/post/23223/text-stub').
|
186
|
+
expect(c('test.tumblr.com/post/23223/text-stub')).to eq('http://test.tumblr.com/post/23223')
|
189
187
|
end
|
190
188
|
|
191
|
-
it "
|
189
|
+
it "cleans host specific parameters" do
|
192
190
|
YAML.load_file('spec/c14n_hosts.yml').each do |orig, clean|
|
193
|
-
c(orig).
|
191
|
+
expect(c(orig)).to eq(clean)
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
context "reserved characters" do
|
196
|
+
it "preserves encoded question marks" do
|
197
|
+
expect(c('http://en.wikipedia.org/wiki/Whose_Line_Is_It_Anyway%3F_%28U.S._TV_series%29')).
|
198
|
+
to eq('http://en.wikipedia.org/wiki/Whose_Line_Is_It_Anyway%3F_(U.S._TV_series)')
|
199
|
+
end
|
200
|
+
|
201
|
+
it "preserves encoded ampersands" do
|
202
|
+
expect(c('http://example.com/?foo=BAR%26BAZ')).
|
203
|
+
to eq('http://example.com/?foo=BAR%26BAZ')
|
204
|
+
end
|
205
|
+
|
206
|
+
it "preserves consecutive reserved characters" do
|
207
|
+
expect(c('http://example.com/so-quizical%3F%3F%3F?foo=bar')).
|
208
|
+
to eq('http://example.com/so-quizical%3F%3F%3F?foo=bar')
|
194
209
|
end
|
195
210
|
end
|
196
211
|
end
|
@@ -200,20 +215,20 @@ describe PostRank::URI do
|
|
200
215
|
PostRank::URI.hash(uri, opts)
|
201
216
|
end
|
202
217
|
|
203
|
-
it "
|
218
|
+
it "computes the MD5 hash without cleaning the URI" do
|
204
219
|
hash = '55fae8910d312b7878a3201ed653b881'
|
205
220
|
|
206
|
-
h('http://everburning.com/feed/post/1').
|
207
|
-
h('everburning.com/feed/post/1').
|
221
|
+
expect(h('http://everburning.com/feed/post/1')).to eq(hash)
|
222
|
+
expect(h('everburning.com/feed/post/1')).not_to eq(hash)
|
208
223
|
end
|
209
224
|
|
210
|
-
it "
|
225
|
+
it "normalizes the URI if requested and compute MD5 hash" do
|
211
226
|
hash = '55fae8910d312b7878a3201ed653b881'
|
212
227
|
|
213
|
-
h('http://EverBurning.Com/feed/post/1', :clean => true).
|
214
|
-
h('Everburning.com/feed/post/1', :clean => true).
|
215
|
-
h('everburning.com/feed/post/1', :clean => true).
|
216
|
-
h('everburning.com/feed/post/1/', :clean => true).
|
228
|
+
expect(h('http://EverBurning.Com/feed/post/1', :clean => true)).to eq(hash)
|
229
|
+
expect(h('Everburning.com/feed/post/1', :clean => true)).to eq(hash)
|
230
|
+
expect(h('everburning.com/feed/post/1', :clean => true)).to eq(hash)
|
231
|
+
expect(h('everburning.com/feed/post/1/', :clean => true)).to eq(hash)
|
217
232
|
end
|
218
233
|
end
|
219
234
|
|
@@ -223,81 +238,81 @@ describe PostRank::URI do
|
|
223
238
|
end
|
224
239
|
|
225
240
|
context "TLDs" do
|
226
|
-
it "
|
227
|
-
e("yah.lets").
|
241
|
+
it "does not pick up bad grammar as a domain name and think it has a link" do
|
242
|
+
expect(e("yah.lets")).to be_empty
|
228
243
|
end
|
229
244
|
|
230
|
-
it "
|
231
|
-
e('stuff.zz a.b.c d.zq').
|
245
|
+
it "does not pickup bad TLDS" do
|
246
|
+
expect(e('stuff.zz a.b.c d.zq')).to be_empty
|
232
247
|
end
|
233
248
|
end
|
234
249
|
|
235
|
-
it "
|
236
|
-
e('test http://twitter.com/#!/igrigorik').
|
250
|
+
it "extracts twitter links with hashbangs" do
|
251
|
+
expect(e('test http://twitter.com/#!/igrigorik')).to include('http://twitter.com/igrigorik')
|
237
252
|
end
|
238
253
|
|
239
|
-
it "
|
240
|
-
e('test http://mobile.twitter.com/#!/_mm6').
|
254
|
+
it "extracts mobile twitter links with hashbangs" do
|
255
|
+
expect(e('test http://mobile.twitter.com/#!/_mm6')).to include('http://mobile.twitter.com/_mm6')
|
241
256
|
end
|
242
257
|
|
243
|
-
it "
|
244
|
-
e("text:http://spn.tw/tfnLT").
|
245
|
-
e("text;http://spn.tw/tfnLT").
|
246
|
-
e("text.http://spn.tw/tfnLT").
|
247
|
-
e("text-http://spn.tw/tfnLT").
|
258
|
+
it "handles a URL that comes after text without a space" do
|
259
|
+
expect(e("text:http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
260
|
+
expect(e("text;http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
261
|
+
expect(e("text.http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
262
|
+
expect(e("text-http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
248
263
|
end
|
249
264
|
|
250
|
-
it "
|
251
|
-
e("http://bit.ly/9cJ2mz......if ur pickin up anythign here, u FAIL.").
|
265
|
+
it "does not pick up anything on or after the first . in the path of a URL with a shortener domain" do
|
266
|
+
expect(e("http://bit.ly/9cJ2mz......if ur pickin up anythign here, u FAIL.")).to eq(["http://bit.ly/9cJ2mz"])
|
252
267
|
end
|
253
268
|
|
254
|
-
it "
|
269
|
+
it "picks up urls without protocol" do
|
255
270
|
u = e('abc.com abc.co')
|
256
|
-
u.
|
257
|
-
u.
|
271
|
+
expect(u).to include('http://abc.com/')
|
272
|
+
expect(u).to include('http://abc.co/')
|
258
273
|
end
|
259
274
|
|
260
|
-
it "
|
275
|
+
it "picks up urls inside tags" do
|
261
276
|
u = e("<a href='http://bit.ly/3fds3'>abc.com</a>")
|
262
|
-
u.
|
277
|
+
expect(u).to include('http://abc.com/')
|
263
278
|
end
|
264
279
|
|
265
280
|
context "multibyte characters" do
|
266
|
-
it "
|
267
|
-
e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食").
|
281
|
+
it "stops extracting URLs at the full-width CJK space character" do
|
282
|
+
expect(e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食")).to eq(["http://www.youtube.com/watch?v=w_j4Lda25jA"])
|
268
283
|
end
|
269
284
|
end
|
270
285
|
|
271
286
|
end
|
272
287
|
|
273
288
|
context "href extract" do
|
274
|
-
it "
|
289
|
+
it "extracts links from html text" do
|
275
290
|
g,b = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
|
276
291
|
|
277
|
-
g.first.
|
278
|
-
b.first.
|
292
|
+
expect(g.first).to eq('http://google.com/')
|
293
|
+
expect(b.first).to eq('http://b.com/')
|
279
294
|
|
280
|
-
g.last.
|
281
|
-
b.last.
|
295
|
+
expect(g.last).to eq('link to google')
|
296
|
+
expect(b.last).to eq('stuff')
|
282
297
|
end
|
283
298
|
|
284
|
-
it "
|
285
|
-
|
299
|
+
it "handles empty hrefs" do
|
300
|
+
expect do
|
286
301
|
l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
|
287
|
-
l.
|
288
|
-
end.
|
302
|
+
expect(l).to be_empty
|
303
|
+
end.not_to raise_error
|
289
304
|
end
|
290
305
|
|
291
306
|
context "relative paths" do
|
292
|
-
it "
|
307
|
+
it "rejects relative paths" do
|
293
308
|
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
|
294
|
-
l.
|
309
|
+
expect(l).to be_empty
|
295
310
|
end
|
296
311
|
|
297
|
-
it "
|
312
|
+
it "resolves relative paths if host is provided" do
|
298
313
|
i = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com").first
|
299
|
-
i.first.
|
300
|
-
i.last.
|
314
|
+
expect(i.first).to eq('http://igvita.com/stuff')
|
315
|
+
expect(i.last).to eq('link to stuff')
|
301
316
|
end
|
302
317
|
end
|
303
318
|
|
@@ -322,51 +337,51 @@ describe PostRank::URI do
|
|
322
337
|
}
|
323
338
|
|
324
339
|
url_list.each_pair do |url, expected_result|
|
325
|
-
it "
|
340
|
+
it "extracts #{expected_result.inspect} from #{url}" do
|
326
341
|
u = PostRank::URI.clean(url, :raw => true)
|
327
|
-
u.domain.
|
342
|
+
expect(u.domain).to eq(expected_result)
|
328
343
|
end
|
329
344
|
end
|
330
345
|
end
|
331
346
|
end
|
332
347
|
|
333
348
|
context "parse" do
|
334
|
-
it '
|
335
|
-
PostRank::URI.parse('a'*64+'.ca').host.
|
349
|
+
it 'does not fail on large host-part look-alikes' do
|
350
|
+
expect(PostRank::URI.parse('a'*64+'.ca').host).to eq(nil)
|
336
351
|
end
|
337
352
|
|
338
|
-
it '
|
339
|
-
PostRank::URI.parse('javascript:void(0);').scheme.
|
353
|
+
it 'does not pancake javascript scheme URIs' do
|
354
|
+
expect(PostRank::URI.parse('javascript:void(0);').scheme).to eq('javascript')
|
340
355
|
end
|
341
356
|
|
342
|
-
it '
|
343
|
-
PostRank::URI.parse('mailto:void(0);').scheme.
|
357
|
+
it 'does not pancake mailto scheme URIs' do
|
358
|
+
expect(PostRank::URI.parse('mailto:void(0);').scheme).to eq('mailto')
|
344
359
|
end
|
345
360
|
|
346
|
-
it '
|
347
|
-
PostRank::URI.parse('xmpp:void(0);').scheme.
|
361
|
+
it 'does not pancake xmpp scheme URIs' do
|
362
|
+
expect(PostRank::URI.parse('xmpp:void(0);').scheme).to eq('xmpp')
|
348
363
|
end
|
349
364
|
end
|
350
365
|
|
351
366
|
context 'valid?' do
|
352
367
|
it 'marks incomplete URI string as invalid' do
|
353
|
-
PostRank::URI.valid?('/path/page.html').
|
368
|
+
expect(PostRank::URI.valid?('/path/page.html')).to be false
|
354
369
|
end
|
355
370
|
|
356
371
|
it 'marks www.test.c as invalid' do
|
357
|
-
PostRank::URI.valid?('http://www.test.c').
|
372
|
+
expect(PostRank::URI.valid?('http://www.test.c')).to be false
|
358
373
|
end
|
359
374
|
|
360
375
|
it 'marks www.test.com as valid' do
|
361
|
-
PostRank::URI.valid?('http://www.test.com').
|
376
|
+
expect(PostRank::URI.valid?('http://www.test.com')).to be true
|
362
377
|
end
|
363
378
|
|
364
379
|
it 'marks Unicode domain as valid (NOTE: works only with a scheme)' do
|
365
|
-
PostRank::URI.valid?('http://президент.рф').
|
380
|
+
expect(PostRank::URI.valid?('http://президент.рф')).to be true
|
366
381
|
end
|
367
382
|
|
368
383
|
it 'marks punycode domain domain as valid' do
|
369
|
-
PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai').
|
384
|
+
expect(PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai')).to be true
|
370
385
|
end
|
371
386
|
end
|
372
387
|
end
|
metadata
CHANGED
@@ -1,71 +1,123 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: postrank-uri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ilya Grigorik
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 2.3.0
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '2.6'
|
20
23
|
type: :runtime
|
21
24
|
prerelease: false
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
23
26
|
requirements:
|
24
|
-
- -
|
27
|
+
- - ">="
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: 2.3.0
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '2.6'
|
27
33
|
- !ruby/object:Gem::Dependency
|
28
34
|
name: public_suffix
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|
30
36
|
requirements:
|
31
|
-
- -
|
37
|
+
- - ">="
|
32
38
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
39
|
+
version: 2.0.0
|
40
|
+
- - "<"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '2.1'
|
34
43
|
type: :runtime
|
35
44
|
prerelease: false
|
36
45
|
version_requirements: !ruby/object:Gem::Requirement
|
37
46
|
requirements:
|
38
|
-
- -
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 2.0.0
|
50
|
+
- - "<"
|
39
51
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
52
|
+
version: '2.1'
|
41
53
|
- !ruby/object:Gem::Dependency
|
42
54
|
name: nokogiri
|
43
55
|
requirement: !ruby/object:Gem::Requirement
|
44
56
|
requirements:
|
45
|
-
- -
|
57
|
+
- - ">="
|
46
58
|
- !ruby/object:Gem::Version
|
47
59
|
version: 1.6.1
|
60
|
+
- - "<"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '1.8'
|
48
63
|
type: :runtime
|
49
64
|
prerelease: false
|
50
65
|
version_requirements: !ruby/object:Gem::Requirement
|
51
66
|
requirements:
|
52
|
-
- -
|
67
|
+
- - ">="
|
53
68
|
- !ruby/object:Gem::Version
|
54
69
|
version: 1.6.1
|
70
|
+
- - "<"
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '1.8'
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: rake
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '0'
|
80
|
+
type: :development
|
81
|
+
prerelease: false
|
82
|
+
version_requirements: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
55
87
|
- !ruby/object:Gem::Dependency
|
56
88
|
name: rspec
|
57
89
|
requirement: !ruby/object:Gem::Requirement
|
58
90
|
requirements:
|
59
|
-
- -
|
91
|
+
- - ">="
|
60
92
|
- !ruby/object:Gem::Version
|
61
93
|
version: '0'
|
62
94
|
type: :development
|
63
95
|
prerelease: false
|
64
96
|
version_requirements: !ruby/object:Gem::Requirement
|
65
97
|
requirements:
|
66
|
-
- -
|
98
|
+
- - ">="
|
67
99
|
- !ruby/object:Gem::Version
|
68
100
|
version: '0'
|
101
|
+
- !ruby/object:Gem::Dependency
|
102
|
+
name: appraisal
|
103
|
+
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 2.0.0
|
108
|
+
- - "<"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.0'
|
111
|
+
type: :development
|
112
|
+
prerelease: false
|
113
|
+
version_requirements: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 2.0.0
|
118
|
+
- - "<"
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '3.0'
|
69
121
|
description: URI normalization, c14n, escaping, and extraction
|
70
122
|
email:
|
71
123
|
- ilya@igvita.com
|
@@ -73,9 +125,11 @@ executables: []
|
|
73
125
|
extensions: []
|
74
126
|
extra_rdoc_files: []
|
75
127
|
files:
|
76
|
-
- .gitignore
|
77
|
-
- .rspec
|
128
|
+
- ".gitignore"
|
129
|
+
- ".rspec"
|
130
|
+
- Appraisals
|
78
131
|
- Gemfile
|
132
|
+
- LICENSE
|
79
133
|
- README.md
|
80
134
|
- Rakefile
|
81
135
|
- lib/postrank-uri.rb
|
@@ -86,7 +140,8 @@ files:
|
|
86
140
|
- spec/helper.rb
|
87
141
|
- spec/postrank-uri_spec.rb
|
88
142
|
homepage: http://github.com/postrank-labs/postrank-uri
|
89
|
-
licenses:
|
143
|
+
licenses:
|
144
|
+
- MIT
|
90
145
|
metadata: {}
|
91
146
|
post_install_message:
|
92
147
|
rdoc_options: []
|
@@ -94,17 +149,17 @@ require_paths:
|
|
94
149
|
- lib
|
95
150
|
required_ruby_version: !ruby/object:Gem::Requirement
|
96
151
|
requirements:
|
97
|
-
- -
|
152
|
+
- - ">="
|
98
153
|
- !ruby/object:Gem::Version
|
99
154
|
version: '0'
|
100
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
156
|
requirements:
|
102
|
-
- -
|
157
|
+
- - ">="
|
103
158
|
- !ruby/object:Gem::Version
|
104
159
|
version: '0'
|
105
160
|
requirements: []
|
106
161
|
rubyforge_project: postrank-uri
|
107
|
-
rubygems_version: 2.
|
162
|
+
rubygems_version: 2.6.8
|
108
163
|
signing_key:
|
109
164
|
specification_version: 4
|
110
165
|
summary: URI normalization, c14n, escaping, and extraction
|
@@ -112,4 +167,3 @@ test_files:
|
|
112
167
|
- spec/c14n_hosts.yml
|
113
168
|
- spec/helper.rb
|
114
169
|
- spec/postrank-uri_spec.rb
|
115
|
-
has_rdoc:
|