postrank-uri 1.0.18 → 1.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Appraisals +19 -0
- data/LICENSE +21 -0
- data/README.md +26 -1
- data/Rakefile +3 -0
- data/lib/postrank-uri.rb +12 -7
- data/lib/postrank-uri/version.rb +1 -1
- data/postrank-uri.gemspec +6 -3
- data/spec/postrank-uri_spec.rb +163 -148
- metadata +73 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a453df060f3bb0d7ea04c7031d17f9e2039e8fc
|
4
|
+
data.tar.gz: fc665791c45e60179c706e9e0e2b3c8c50a58f73
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c6eec4d6e64e4c400d1ba9e2508a6bca95008452d3e64f22511b56236892a12afd582a7d8be28b5044e0e03f709459f5bba238359ac6408d42221912e8970a3
|
7
|
+
data.tar.gz: 7816c251f9ba449f2f3ea3c8e8e4fb1f80c2e76f8fd004c2c73021b3e47fc7cc1068467f90e17986f041ea948c375b24ce7248a837d8f0d13dedc1f0c4773ed1
|
data/.gitignore
CHANGED
data/Appraisals
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
appraise "nokogiri-1.7" do
|
2
|
+
gem "nokogiri", "~> 1.7.0"
|
3
|
+
end
|
4
|
+
|
5
|
+
appraise "nokogiri-1.6" do
|
6
|
+
gem "nokogiri", "~> 1.6.1"
|
7
|
+
end
|
8
|
+
|
9
|
+
appraise "addressable-2.3" do
|
10
|
+
gem "addressable", "~> 2.3.0"
|
11
|
+
end
|
12
|
+
|
13
|
+
appraise "addressable-2.4" do
|
14
|
+
gem "addressable", "~> 2.4.0"
|
15
|
+
end
|
16
|
+
|
17
|
+
appraise "addressable-2.5" do
|
18
|
+
gem "addressable", "~> 2.5.0"
|
19
|
+
end
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2011 Ilya Grigorik
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
CHANGED
@@ -37,4 +37,29 @@ In a nutshell, we need to make sure that creative cases like the ones below all
|
|
37
37
|
|
38
38
|
As part of URI canonicalization the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, which has no effect on the content - hence, it is removed from the URI. For the full list, see the c14n.yml file.
|
39
39
|
|
40
|
-
Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), instead we are compiling a manually assembled database. If you find cases which are missing, please do report them, or send us a pull request!
|
40
|
+
Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), instead we are compiling a manually assembled database. If you find cases which are missing, please do report them, or send us a pull request!
|
41
|
+
|
42
|
+
## Development
|
43
|
+
|
44
|
+
### Setup
|
45
|
+
|
46
|
+
```
|
47
|
+
bundle install
|
48
|
+
```
|
49
|
+
|
50
|
+
### Running tests
|
51
|
+
|
52
|
+
```
|
53
|
+
bundle exec rake
|
54
|
+
```
|
55
|
+
|
56
|
+
### Running dependency appraisals
|
57
|
+
|
58
|
+
To verify that `postrank-uri` works with different versions of its runtime dependencies, you can run:
|
59
|
+
|
60
|
+
```
|
61
|
+
bundle exec appraisal install
|
62
|
+
bundle exec rake appraisal
|
63
|
+
```
|
64
|
+
|
65
|
+
This will execute the test suite with different versions of the dependencies.
|
data/Rakefile
CHANGED
data/lib/postrank-uri.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# encoding: utf-8
|
2
2
|
require 'addressable/uri'
|
3
3
|
require 'digest/md5'
|
4
4
|
require 'nokogiri'
|
@@ -9,7 +9,7 @@ module Addressable
|
|
9
9
|
class URI
|
10
10
|
def domain
|
11
11
|
host = self.host
|
12
|
-
(host && PublicSuffix.valid?(host)) ? PublicSuffix.parse(host).domain : nil
|
12
|
+
(host && PublicSuffix.valid?(host, default_rule: nil)) ? PublicSuffix.parse(host).domain : nil
|
13
13
|
end
|
14
14
|
|
15
15
|
def normalized_query
|
@@ -86,8 +86,9 @@ module PostRank
|
|
86
86
|
)
|
87
87
|
}iox;
|
88
88
|
|
89
|
+
URIREGEX[:reserved_characters] = /%3F|%26/i
|
89
90
|
URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x
|
90
|
-
URIREGEX[:unescape] = /(
|
91
|
+
URIREGEX[:unescape] = /(%[0-9a-fA-F]{2})/x
|
91
92
|
URIREGEX.each_pair{|k,v| v.freeze }
|
92
93
|
|
93
94
|
module_function
|
@@ -97,7 +98,7 @@ module PostRank
|
|
97
98
|
urls = []
|
98
99
|
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
99
100
|
# Only extract the URL if the domain is valid
|
100
|
-
if PublicSuffix.valid?(domain)
|
101
|
+
if PublicSuffix.valid?(domain, default_rule: nil)
|
101
102
|
url = clean(url)
|
102
103
|
urls.push url.to_s
|
103
104
|
end
|
@@ -131,8 +132,12 @@ module PostRank
|
|
131
132
|
def unescape(uri)
|
132
133
|
u = parse(uri)
|
133
134
|
u.query = u.query.tr('+', ' ') if u.query
|
134
|
-
u.to_s.gsub(URIREGEX[:unescape]) do
|
135
|
-
[
|
135
|
+
u.to_s.gsub(URIREGEX[:unescape]) do |encoded|
|
136
|
+
if encoded.match? URIREGEX[:reserved_characters]
|
137
|
+
encoded
|
138
|
+
else
|
139
|
+
[encoded.delete('%')].pack('H*')
|
140
|
+
end
|
136
141
|
end
|
137
142
|
end
|
138
143
|
|
@@ -225,7 +230,7 @@ module PostRank
|
|
225
230
|
cleaned_uri = clean(uri, :raw => true)
|
226
231
|
|
227
232
|
if host = cleaned_uri.host
|
228
|
-
is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host))
|
233
|
+
is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
|
229
234
|
end
|
230
235
|
|
231
236
|
is_valid
|
data/lib/postrank-uri/version.rb
CHANGED
data/postrank-uri.gemspec
CHANGED
@@ -11,14 +11,17 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.homepage = "http://github.com/postrank-labs/postrank-uri"
|
12
12
|
s.summary = "URI normalization, c14n, escaping, and extraction"
|
13
13
|
s.description = s.summary
|
14
|
+
s.license = 'MIT'
|
14
15
|
|
15
16
|
s.rubyforge_project = "postrank-uri"
|
16
17
|
|
17
|
-
s.add_dependency "addressable", "
|
18
|
-
s.add_dependency "public_suffix", "
|
19
|
-
s.add_dependency "nokogiri", "
|
18
|
+
s.add_dependency "addressable", ">= 2.3.0", "< 2.6"
|
19
|
+
s.add_dependency "public_suffix", ">= 2.0.0", "< 2.1"
|
20
|
+
s.add_dependency "nokogiri", ">= 1.6.1", "< 1.8"
|
20
21
|
|
22
|
+
s.add_development_dependency "rake"
|
21
23
|
s.add_development_dependency "rspec"
|
24
|
+
s.add_development_dependency "appraisal", ">= 2.0.0", "< 3.0"
|
22
25
|
|
23
26
|
s.files = `git ls-files`.split("\n")
|
24
27
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/spec/postrank-uri_spec.rb
CHANGED
@@ -3,103 +3,101 @@
|
|
3
3
|
require 'helper'
|
4
4
|
|
5
5
|
describe PostRank::URI do
|
6
|
-
|
7
|
-
let(:igvita) { 'http://igvita.com/' }
|
8
|
-
|
9
6
|
context "escaping" do
|
10
|
-
it "
|
11
|
-
PostRank::URI.escape('id=1').
|
7
|
+
it "escapes PostRank::URI string" do
|
8
|
+
expect(PostRank::URI.escape('id=1')).to eq('id%3D1')
|
12
9
|
end
|
13
10
|
|
14
|
-
it "
|
15
|
-
PostRank::URI.escape('id= 1').
|
11
|
+
it "escapes spaces as %20's" do
|
12
|
+
expect(PostRank::URI.escape('id= 1')).to match('%20')
|
16
13
|
end
|
17
14
|
end
|
18
15
|
|
19
16
|
context "unescape" do
|
20
|
-
it "
|
21
|
-
PostRank::URI.unescape(PostRank::URI.escape('id=1')).
|
17
|
+
it "unescapes PostRank::URI" do
|
18
|
+
expect(PostRank::URI.unescape(PostRank::URI.escape('id=1'))).to eq('id=1')
|
22
19
|
end
|
23
20
|
|
24
|
-
it "
|
25
|
-
PostRank::URI.unescape(PostRank::URI.escape('id= 1')).
|
21
|
+
it "unescapes PostRank::URI with spaces" do
|
22
|
+
expect(PostRank::URI.unescape(PostRank::URI.escape('id= 1'))).to eq('id= 1')
|
26
23
|
end
|
27
24
|
|
28
25
|
context "accept improperly escaped PostRank::URI strings" do
|
29
26
|
# See http://tools.ietf.org/html/rfc3986#section-2.3
|
30
27
|
|
31
|
-
it "
|
32
|
-
PostRank::URI.unescape('?id=+1').
|
28
|
+
it "unescapes PostRank::URI with spaces encoded as '+'" do
|
29
|
+
expect(PostRank::URI.unescape('?id=+1')).to eq('?id= 1')
|
33
30
|
end
|
34
31
|
|
35
|
-
it "
|
36
|
-
PostRank::URI.unescape('?id%3D+1').
|
32
|
+
it "unescapes PostRank::URI with spaces encoded as '+'" do
|
33
|
+
expect(PostRank::URI.unescape('?id%3D+1')).to eq('?id= 1')
|
37
34
|
end
|
38
35
|
|
39
|
-
it "
|
40
|
-
PostRank::URI.unescape('?id=%201').
|
36
|
+
it "unescapes PostRank::URI with spaces encoded as %20" do
|
37
|
+
expect(PostRank::URI.unescape('?id=%201')).to eq('?id= 1')
|
41
38
|
end
|
42
39
|
|
43
|
-
it "
|
44
|
-
PostRank::URI.unescape('/foo+bar?id=foo+bar').
|
40
|
+
it "does not unescape '+' to spaces in paths" do
|
41
|
+
expect(PostRank::URI.unescape('/foo+bar?id=foo+bar')).to eq('/foo+bar?id=foo bar')
|
45
42
|
end
|
46
43
|
end
|
47
44
|
|
48
45
|
end
|
49
46
|
|
50
47
|
context "normalize" do
|
48
|
+
let(:igvita) { 'http://igvita.com/' }
|
49
|
+
|
51
50
|
def n(uri)
|
52
51
|
PostRank::URI.normalize(uri).to_s
|
53
52
|
end
|
54
53
|
|
55
|
-
it "
|
56
|
-
n('http://igvita.com/').
|
57
|
-
n('http://igvita.com').to_s.
|
58
|
-
n('http://igvita.com///').
|
54
|
+
it "normalizes paths in PostRank::URIs" do
|
55
|
+
expect(n('http://igvita.com/')).to eq(igvita)
|
56
|
+
expect(n('http://igvita.com').to_s).to eq(igvita)
|
57
|
+
expect(n('http://igvita.com///')).to eq(igvita)
|
59
58
|
|
60
|
-
n('http://igvita.com/../').
|
61
|
-
n('http://igvita.com/a/b/../../').
|
62
|
-
n('http://igvita.com/a/b/../..').
|
59
|
+
expect(n('http://igvita.com/../')).to eq(igvita)
|
60
|
+
expect(n('http://igvita.com/a/b/../../')).to eq(igvita)
|
61
|
+
expect(n('http://igvita.com/a/b/../..')).to eq(igvita)
|
63
62
|
end
|
64
63
|
|
65
|
-
it "
|
66
|
-
n('http://igvita.com/?').
|
67
|
-
n('http://igvita.com?').
|
68
|
-
n('http://igvita.com/a/../?').
|
64
|
+
it "normalizes query strings in PostRank::URIs" do
|
65
|
+
expect(n('http://igvita.com/?')).to eq(igvita)
|
66
|
+
expect(n('http://igvita.com?')).to eq(igvita)
|
67
|
+
expect(n('http://igvita.com/a/../?')).to eq(igvita)
|
69
68
|
end
|
70
69
|
|
71
|
-
it "
|
72
|
-
n('http://igvita.com#test').
|
73
|
-
n('http://igvita.com#test#test').
|
74
|
-
n('http://igvita.com/a/../?#test').
|
70
|
+
it "normalizes anchors in PostRank::URIs" do
|
71
|
+
expect(n('http://igvita.com#test')).to eq(igvita)
|
72
|
+
expect(n('http://igvita.com#test#test')).to eq(igvita)
|
73
|
+
expect(n('http://igvita.com/a/../?#test')).to eq(igvita)
|
75
74
|
end
|
76
75
|
|
77
|
-
it "
|
78
|
-
n('http://igvita.com/a/../? ').
|
79
|
-
n('http://igvita.com/a/../? #test').
|
80
|
-
n('http://igvita.com/ /../').
|
76
|
+
it "cleans whitespace in PostRank::URIs" do
|
77
|
+
expect(n('http://igvita.com/a/../? ')).to eq(igvita)
|
78
|
+
expect(n('http://igvita.com/a/../? #test')).to eq(igvita)
|
79
|
+
expect(n('http://igvita.com/ /../')).to eq(igvita)
|
81
80
|
end
|
82
81
|
|
83
|
-
it "
|
84
|
-
n('igvita.com').
|
85
|
-
n('https://test.com/').to_s.
|
82
|
+
it "defaults to http scheme if missing" do
|
83
|
+
expect(n('igvita.com')).to eq(igvita)
|
84
|
+
expect(n('https://test.com/').to_s).to eq('https://test.com/')
|
86
85
|
end
|
87
86
|
|
88
|
-
it "
|
89
|
-
n('IGVITA.COM').
|
90
|
-
n('IGVITA.COM/ABC').
|
87
|
+
it "downcases the hostname" do
|
88
|
+
expect(n('IGVITA.COM')).to eq(igvita)
|
89
|
+
expect(n('IGVITA.COM/ABC')).to eq(igvita + "ABC")
|
91
90
|
end
|
92
91
|
|
93
|
-
it "
|
94
|
-
n('http://igvita.com/').
|
92
|
+
it "removes trailing slash on paths" do
|
93
|
+
expect(n('http://igvita.com/')).to eq('http://igvita.com/')
|
95
94
|
|
96
|
-
n('http://igvita.com/a').
|
97
|
-
n('http://igvita.com/a/').
|
95
|
+
expect(n('http://igvita.com/a')).to eq('http://igvita.com/a')
|
96
|
+
expect(n('http://igvita.com/a/')).to eq('http://igvita.com/a')
|
98
97
|
|
99
|
-
n('http://igvita.com/a/b').
|
100
|
-
n('http://igvita.com/a/b/').
|
98
|
+
expect(n('http://igvita.com/a/b')).to eq('http://igvita.com/a/b')
|
99
|
+
expect(n('http://igvita.com/a/b/')).to eq('http://igvita.com/a/b')
|
101
100
|
end
|
102
|
-
|
103
101
|
end
|
104
102
|
|
105
103
|
context "canonicalization" do
|
@@ -109,60 +107,60 @@ describe PostRank::URI do
|
|
109
107
|
|
110
108
|
context "query parameters" do
|
111
109
|
it "should handle nester parameters" do
|
112
|
-
c('igvita.com/?id=a&utm_source=a').
|
110
|
+
expect(c('igvita.com/?id=a&utm_source=a')).to eq('http://igvita.com/?id=a')
|
113
111
|
end
|
114
112
|
|
115
|
-
it "
|
113
|
+
it "preserves the order of parameters" do
|
116
114
|
url = 'http://a.com/?'+('a'..'z').to_a.shuffle.map {|e| "#{e}=#{e}"}.join("&")
|
117
|
-
c(url).
|
115
|
+
expect(c(url)).to eq(url)
|
118
116
|
end
|
119
117
|
|
120
|
-
it "
|
121
|
-
c('igvita.com/?id=a&utm_source=a').
|
122
|
-
c('igvita.com/?id=a&utm_source=a&utm_valid').
|
118
|
+
it "removes Google Analytics parameters" do
|
119
|
+
expect(c('igvita.com/?id=a&utm_source=a')).to eq('http://igvita.com/?id=a')
|
120
|
+
expect(c('igvita.com/?id=a&utm_source=a&utm_valid')).to eq('http://igvita.com/?id=a&utm_valid')
|
123
121
|
end
|
124
122
|
|
125
|
-
it "
|
126
|
-
c('igvita.com/?id=a&utm_source=a&awesm=b').
|
127
|
-
c('igvita.com/?id=a&sms_ss=a').
|
123
|
+
it "removes awesm/sms parameters" do
|
124
|
+
expect(c('igvita.com/?id=a&utm_source=a&awesm=b')).to eq('http://igvita.com/?id=a')
|
125
|
+
expect(c('igvita.com/?id=a&sms_ss=a')).to eq('http://igvita.com/?id=a')
|
128
126
|
end
|
129
127
|
|
130
|
-
it "
|
131
|
-
c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').
|
132
|
-
c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').
|
133
|
-
c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').
|
128
|
+
it "removes PHPSESSID parameter" do
|
129
|
+
expect(c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum?')
|
130
|
+
expect(c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum/?')
|
131
|
+
expect(c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum?id=123')
|
134
132
|
end
|
135
133
|
end
|
136
134
|
|
137
135
|
context "hashbang" do
|
138
|
-
it "
|
139
|
-
c('http://twitter.com/#!/igrigorik').
|
140
|
-
c('http://twitter.com/#!/a/statuses/1').
|
141
|
-
c('http://nontwitter.com/#!/a/statuses/1').
|
136
|
+
it "rewrites twitter links to crawlable versions" do
|
137
|
+
expect(c('http://twitter.com/#!/igrigorik')).to eq('http://twitter.com/igrigorik')
|
138
|
+
expect(c('http://twitter.com/#!/a/statuses/1')).to eq('http://twitter.com/a/statuses/1')
|
139
|
+
expect(c('http://nontwitter.com/#!/a/statuses/1')).to eq('http://nontwitter.com/#!/a/statuses/1')
|
142
140
|
end
|
143
141
|
end
|
144
142
|
|
145
143
|
context "tumblr" do
|
146
|
-
it "
|
147
|
-
c('http://test.tumblr.com/post/4533459403/some-text').
|
148
|
-
c('http://tumblr.com/xjl2evo3hh').
|
144
|
+
it "strips the slug" do
|
145
|
+
expect(c('http://test.tumblr.com/post/4533459403/some-text')).to eq('http://test.tumblr.com/post/4533459403/')
|
146
|
+
expect(c('http://tumblr.com/xjl2evo3hh')).to eq('http://tumblr.com/xjl2evo3hh')
|
149
147
|
end
|
150
148
|
end
|
151
149
|
|
152
150
|
context "embedded links" do
|
153
|
-
it "
|
151
|
+
it "extracts embedded redirects from Google News" do
|
154
152
|
u = c('http://news.google.com/news/url?sa=t&fd=R&&url=http://www.ctv.ca/CTVNews/Politics/20110111/')
|
155
|
-
u.
|
153
|
+
expect(u).to eq('http://www.ctv.ca/CTVNews/Politics/20110111')
|
156
154
|
end
|
157
155
|
|
158
|
-
it "
|
156
|
+
it "extracts embedded redirects from xfruits.com" do
|
159
157
|
u = c('http://xfruits.com/MrGroar/?url=http%3A%2F%2Faap.lesroyaumes.com%2Fdepeches%2Fdepeche351820908.html')
|
160
|
-
u.
|
158
|
+
expect(u).to eq('http://aap.lesroyaumes.com/depeches/depeche351820908.html')
|
161
159
|
end
|
162
160
|
|
163
|
-
it "
|
161
|
+
it "extracts embedded redirects from MySpace" do
|
164
162
|
u = c('http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fghanaian-chronicle.com%2Fnews%2Fother-news%2Fcanadian-high-commissioner-urges-media%2F&t=Canadian%20High%20Commissioner%20urges%20media')
|
165
|
-
u.
|
163
|
+
expect(u).to eq('http://ghanaian-chronicle.com/news/other-news/canadian-high-commissioner-urges-media')
|
166
164
|
end
|
167
165
|
end
|
168
166
|
end
|
@@ -172,25 +170,42 @@ describe PostRank::URI do
|
|
172
170
|
PostRank::URI.clean(uri)
|
173
171
|
end
|
174
172
|
|
175
|
-
it "
|
176
|
-
c('http://igvita.com/?id=1').
|
177
|
-
c('igvita.com/?id=1').
|
173
|
+
it "unescapes, canonicalizes and normalizes" do
|
174
|
+
expect(c('http://igvita.com/?id=1')).to eq('http://igvita.com/?id=1')
|
175
|
+
expect(c('igvita.com/?id=1')).to eq('http://igvita.com/?id=1')
|
178
176
|
|
179
|
-
c('http://igvita.com/?id= 1').
|
180
|
-
c('http://igvita.com/?id=+1').
|
181
|
-
c('http://igvita.com/?id%3D%201').
|
177
|
+
expect(c('http://igvita.com/?id= 1')).to eq('http://igvita.com/?id=%201')
|
178
|
+
expect(c('http://igvita.com/?id=+1')).to eq('http://igvita.com/?id=%201')
|
179
|
+
expect(c('http://igvita.com/?id%3D%201')).to eq('http://igvita.com/?id=%201')
|
182
180
|
|
183
|
-
c('igvita.com/a/..?id=1&utm_source=a&awesm=b#c').
|
181
|
+
expect(c('igvita.com/a/..?id=1&utm_source=a&awesm=b#c')).to eq('http://igvita.com/?id=1')
|
184
182
|
|
185
|
-
c('igvita.com?id=<>').
|
186
|
-
c('igvita.com?id="').
|
183
|
+
expect(c('igvita.com?id=<>')).to eq('http://igvita.com/?id=%3C%3E')
|
184
|
+
expect(c('igvita.com?id="')).to eq('http://igvita.com/?id=%22')
|
187
185
|
|
188
|
-
c('test.tumblr.com/post/23223/text-stub').
|
186
|
+
expect(c('test.tumblr.com/post/23223/text-stub')).to eq('http://test.tumblr.com/post/23223')
|
189
187
|
end
|
190
188
|
|
191
|
-
it "
|
189
|
+
it "cleans host specific parameters" do
|
192
190
|
YAML.load_file('spec/c14n_hosts.yml').each do |orig, clean|
|
193
|
-
c(orig).
|
191
|
+
expect(c(orig)).to eq(clean)
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
context "reserved characters" do
|
196
|
+
it "preserves encoded question marks" do
|
197
|
+
expect(c('http://en.wikipedia.org/wiki/Whose_Line_Is_It_Anyway%3F_%28U.S._TV_series%29')).
|
198
|
+
to eq('http://en.wikipedia.org/wiki/Whose_Line_Is_It_Anyway%3F_(U.S._TV_series)')
|
199
|
+
end
|
200
|
+
|
201
|
+
it "preserves encoded ampersands" do
|
202
|
+
expect(c('http://example.com/?foo=BAR%26BAZ')).
|
203
|
+
to eq('http://example.com/?foo=BAR%26BAZ')
|
204
|
+
end
|
205
|
+
|
206
|
+
it "preserves consecutive reserved characters" do
|
207
|
+
expect(c('http://example.com/so-quizical%3F%3F%3F?foo=bar')).
|
208
|
+
to eq('http://example.com/so-quizical%3F%3F%3F?foo=bar')
|
194
209
|
end
|
195
210
|
end
|
196
211
|
end
|
@@ -200,20 +215,20 @@ describe PostRank::URI do
|
|
200
215
|
PostRank::URI.hash(uri, opts)
|
201
216
|
end
|
202
217
|
|
203
|
-
it "
|
218
|
+
it "computes the MD5 hash without cleaning the URI" do
|
204
219
|
hash = '55fae8910d312b7878a3201ed653b881'
|
205
220
|
|
206
|
-
h('http://everburning.com/feed/post/1').
|
207
|
-
h('everburning.com/feed/post/1').
|
221
|
+
expect(h('http://everburning.com/feed/post/1')).to eq(hash)
|
222
|
+
expect(h('everburning.com/feed/post/1')).not_to eq(hash)
|
208
223
|
end
|
209
224
|
|
210
|
-
it "
|
225
|
+
it "normalizes the URI if requested and compute MD5 hash" do
|
211
226
|
hash = '55fae8910d312b7878a3201ed653b881'
|
212
227
|
|
213
|
-
h('http://EverBurning.Com/feed/post/1', :clean => true).
|
214
|
-
h('Everburning.com/feed/post/1', :clean => true).
|
215
|
-
h('everburning.com/feed/post/1', :clean => true).
|
216
|
-
h('everburning.com/feed/post/1/', :clean => true).
|
228
|
+
expect(h('http://EverBurning.Com/feed/post/1', :clean => true)).to eq(hash)
|
229
|
+
expect(h('Everburning.com/feed/post/1', :clean => true)).to eq(hash)
|
230
|
+
expect(h('everburning.com/feed/post/1', :clean => true)).to eq(hash)
|
231
|
+
expect(h('everburning.com/feed/post/1/', :clean => true)).to eq(hash)
|
217
232
|
end
|
218
233
|
end
|
219
234
|
|
@@ -223,81 +238,81 @@ describe PostRank::URI do
|
|
223
238
|
end
|
224
239
|
|
225
240
|
context "TLDs" do
|
226
|
-
it "
|
227
|
-
e("yah.lets").
|
241
|
+
it "does not pick up bad grammar as a domain name and think it has a link" do
|
242
|
+
expect(e("yah.lets")).to be_empty
|
228
243
|
end
|
229
244
|
|
230
|
-
it "
|
231
|
-
e('stuff.zz a.b.c d.zq').
|
245
|
+
it "does not pickup bad TLDS" do
|
246
|
+
expect(e('stuff.zz a.b.c d.zq')).to be_empty
|
232
247
|
end
|
233
248
|
end
|
234
249
|
|
235
|
-
it "
|
236
|
-
e('test http://twitter.com/#!/igrigorik').
|
250
|
+
it "extracts twitter links with hashbangs" do
|
251
|
+
expect(e('test http://twitter.com/#!/igrigorik')).to include('http://twitter.com/igrigorik')
|
237
252
|
end
|
238
253
|
|
239
|
-
it "
|
240
|
-
e('test http://mobile.twitter.com/#!/_mm6').
|
254
|
+
it "extracts mobile twitter links with hashbangs" do
|
255
|
+
expect(e('test http://mobile.twitter.com/#!/_mm6')).to include('http://mobile.twitter.com/_mm6')
|
241
256
|
end
|
242
257
|
|
243
|
-
it "
|
244
|
-
e("text:http://spn.tw/tfnLT").
|
245
|
-
e("text;http://spn.tw/tfnLT").
|
246
|
-
e("text.http://spn.tw/tfnLT").
|
247
|
-
e("text-http://spn.tw/tfnLT").
|
258
|
+
it "handles a URL that comes after text without a space" do
|
259
|
+
expect(e("text:http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
260
|
+
expect(e("text;http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
261
|
+
expect(e("text.http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
262
|
+
expect(e("text-http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
248
263
|
end
|
249
264
|
|
250
|
-
it "
|
251
|
-
e("http://bit.ly/9cJ2mz......if ur pickin up anythign here, u FAIL.").
|
265
|
+
it "does not pick up anything on or after the first . in the path of a URL with a shortener domain" do
|
266
|
+
expect(e("http://bit.ly/9cJ2mz......if ur pickin up anythign here, u FAIL.")).to eq(["http://bit.ly/9cJ2mz"])
|
252
267
|
end
|
253
268
|
|
254
|
-
it "
|
269
|
+
it "picks up urls without protocol" do
|
255
270
|
u = e('abc.com abc.co')
|
256
|
-
u.
|
257
|
-
u.
|
271
|
+
expect(u).to include('http://abc.com/')
|
272
|
+
expect(u).to include('http://abc.co/')
|
258
273
|
end
|
259
274
|
|
260
|
-
it "
|
275
|
+
it "picks up urls inside tags" do
|
261
276
|
u = e("<a href='http://bit.ly/3fds3'>abc.com</a>")
|
262
|
-
u.
|
277
|
+
expect(u).to include('http://abc.com/')
|
263
278
|
end
|
264
279
|
|
265
280
|
context "multibyte characters" do
|
266
|
-
it "
|
267
|
-
e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食").
|
281
|
+
it "stops extracting URLs at the full-width CJK space character" do
|
282
|
+
expect(e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食")).to eq(["http://www.youtube.com/watch?v=w_j4Lda25jA"])
|
268
283
|
end
|
269
284
|
end
|
270
285
|
|
271
286
|
end
|
272
287
|
|
273
288
|
context "href extract" do
|
274
|
-
it "
|
289
|
+
it "extracts links from html text" do
|
275
290
|
g,b = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
|
276
291
|
|
277
|
-
g.first.
|
278
|
-
b.first.
|
292
|
+
expect(g.first).to eq('http://google.com/')
|
293
|
+
expect(b.first).to eq('http://b.com/')
|
279
294
|
|
280
|
-
g.last.
|
281
|
-
b.last.
|
295
|
+
expect(g.last).to eq('link to google')
|
296
|
+
expect(b.last).to eq('stuff')
|
282
297
|
end
|
283
298
|
|
284
|
-
it "
|
285
|
-
|
299
|
+
it "handles empty hrefs" do
|
300
|
+
expect do
|
286
301
|
l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
|
287
|
-
l.
|
288
|
-
end.
|
302
|
+
expect(l).to be_empty
|
303
|
+
end.not_to raise_error
|
289
304
|
end
|
290
305
|
|
291
306
|
context "relative paths" do
|
292
|
-
it "
|
307
|
+
it "rejects relative paths" do
|
293
308
|
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
|
294
|
-
l.
|
309
|
+
expect(l).to be_empty
|
295
310
|
end
|
296
311
|
|
297
|
-
it "
|
312
|
+
it "resolves relative paths if host is provided" do
|
298
313
|
i = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com").first
|
299
|
-
i.first.
|
300
|
-
i.last.
|
314
|
+
expect(i.first).to eq('http://igvita.com/stuff')
|
315
|
+
expect(i.last).to eq('link to stuff')
|
301
316
|
end
|
302
317
|
end
|
303
318
|
|
@@ -322,51 +337,51 @@ describe PostRank::URI do
|
|
322
337
|
}
|
323
338
|
|
324
339
|
url_list.each_pair do |url, expected_result|
|
325
|
-
it "
|
340
|
+
it "extracts #{expected_result.inspect} from #{url}" do
|
326
341
|
u = PostRank::URI.clean(url, :raw => true)
|
327
|
-
u.domain.
|
342
|
+
expect(u.domain).to eq(expected_result)
|
328
343
|
end
|
329
344
|
end
|
330
345
|
end
|
331
346
|
end
|
332
347
|
|
333
348
|
context "parse" do
|
334
|
-
it '
|
335
|
-
PostRank::URI.parse('a'*64+'.ca').host.
|
349
|
+
it 'does not fail on large host-part look-alikes' do
|
350
|
+
expect(PostRank::URI.parse('a'*64+'.ca').host).to eq(nil)
|
336
351
|
end
|
337
352
|
|
338
|
-
it '
|
339
|
-
PostRank::URI.parse('javascript:void(0);').scheme.
|
353
|
+
it 'does not pancake javascript scheme URIs' do
|
354
|
+
expect(PostRank::URI.parse('javascript:void(0);').scheme).to eq('javascript')
|
340
355
|
end
|
341
356
|
|
342
|
-
it '
|
343
|
-
PostRank::URI.parse('mailto:void(0);').scheme.
|
357
|
+
it 'does not pancake mailto scheme URIs' do
|
358
|
+
expect(PostRank::URI.parse('mailto:void(0);').scheme).to eq('mailto')
|
344
359
|
end
|
345
360
|
|
346
|
-
it '
|
347
|
-
PostRank::URI.parse('xmpp:void(0);').scheme.
|
361
|
+
it 'does not pancake xmpp scheme URIs' do
|
362
|
+
expect(PostRank::URI.parse('xmpp:void(0);').scheme).to eq('xmpp')
|
348
363
|
end
|
349
364
|
end
|
350
365
|
|
351
366
|
context 'valid?' do
|
352
367
|
it 'marks incomplete URI string as invalid' do
|
353
|
-
PostRank::URI.valid?('/path/page.html').
|
368
|
+
expect(PostRank::URI.valid?('/path/page.html')).to be false
|
354
369
|
end
|
355
370
|
|
356
371
|
it 'marks www.test.c as invalid' do
|
357
|
-
PostRank::URI.valid?('http://www.test.c').
|
372
|
+
expect(PostRank::URI.valid?('http://www.test.c')).to be false
|
358
373
|
end
|
359
374
|
|
360
375
|
it 'marks www.test.com as valid' do
|
361
|
-
PostRank::URI.valid?('http://www.test.com').
|
376
|
+
expect(PostRank::URI.valid?('http://www.test.com')).to be true
|
362
377
|
end
|
363
378
|
|
364
379
|
it 'marks Unicode domain as valid (NOTE: works only with a scheme)' do
|
365
|
-
PostRank::URI.valid?('http://президент.рф').
|
380
|
+
expect(PostRank::URI.valid?('http://президент.рф')).to be true
|
366
381
|
end
|
367
382
|
|
368
383
|
it 'marks punycode domain domain as valid' do
|
369
|
-
PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai').
|
384
|
+
expect(PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai')).to be true
|
370
385
|
end
|
371
386
|
end
|
372
387
|
end
|
metadata
CHANGED
@@ -1,71 +1,123 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: postrank-uri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ilya Grigorik
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 2.3.0
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '2.6'
|
20
23
|
type: :runtime
|
21
24
|
prerelease: false
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
23
26
|
requirements:
|
24
|
-
- -
|
27
|
+
- - ">="
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: 2.3.0
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '2.6'
|
27
33
|
- !ruby/object:Gem::Dependency
|
28
34
|
name: public_suffix
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|
30
36
|
requirements:
|
31
|
-
- -
|
37
|
+
- - ">="
|
32
38
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
39
|
+
version: 2.0.0
|
40
|
+
- - "<"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '2.1'
|
34
43
|
type: :runtime
|
35
44
|
prerelease: false
|
36
45
|
version_requirements: !ruby/object:Gem::Requirement
|
37
46
|
requirements:
|
38
|
-
- -
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 2.0.0
|
50
|
+
- - "<"
|
39
51
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
52
|
+
version: '2.1'
|
41
53
|
- !ruby/object:Gem::Dependency
|
42
54
|
name: nokogiri
|
43
55
|
requirement: !ruby/object:Gem::Requirement
|
44
56
|
requirements:
|
45
|
-
- -
|
57
|
+
- - ">="
|
46
58
|
- !ruby/object:Gem::Version
|
47
59
|
version: 1.6.1
|
60
|
+
- - "<"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '1.8'
|
48
63
|
type: :runtime
|
49
64
|
prerelease: false
|
50
65
|
version_requirements: !ruby/object:Gem::Requirement
|
51
66
|
requirements:
|
52
|
-
- -
|
67
|
+
- - ">="
|
53
68
|
- !ruby/object:Gem::Version
|
54
69
|
version: 1.6.1
|
70
|
+
- - "<"
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '1.8'
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: rake
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '0'
|
80
|
+
type: :development
|
81
|
+
prerelease: false
|
82
|
+
version_requirements: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
55
87
|
- !ruby/object:Gem::Dependency
|
56
88
|
name: rspec
|
57
89
|
requirement: !ruby/object:Gem::Requirement
|
58
90
|
requirements:
|
59
|
-
- -
|
91
|
+
- - ">="
|
60
92
|
- !ruby/object:Gem::Version
|
61
93
|
version: '0'
|
62
94
|
type: :development
|
63
95
|
prerelease: false
|
64
96
|
version_requirements: !ruby/object:Gem::Requirement
|
65
97
|
requirements:
|
66
|
-
- -
|
98
|
+
- - ">="
|
67
99
|
- !ruby/object:Gem::Version
|
68
100
|
version: '0'
|
101
|
+
- !ruby/object:Gem::Dependency
|
102
|
+
name: appraisal
|
103
|
+
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 2.0.0
|
108
|
+
- - "<"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.0'
|
111
|
+
type: :development
|
112
|
+
prerelease: false
|
113
|
+
version_requirements: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 2.0.0
|
118
|
+
- - "<"
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '3.0'
|
69
121
|
description: URI normalization, c14n, escaping, and extraction
|
70
122
|
email:
|
71
123
|
- ilya@igvita.com
|
@@ -73,9 +125,11 @@ executables: []
|
|
73
125
|
extensions: []
|
74
126
|
extra_rdoc_files: []
|
75
127
|
files:
|
76
|
-
- .gitignore
|
77
|
-
- .rspec
|
128
|
+
- ".gitignore"
|
129
|
+
- ".rspec"
|
130
|
+
- Appraisals
|
78
131
|
- Gemfile
|
132
|
+
- LICENSE
|
79
133
|
- README.md
|
80
134
|
- Rakefile
|
81
135
|
- lib/postrank-uri.rb
|
@@ -86,7 +140,8 @@ files:
|
|
86
140
|
- spec/helper.rb
|
87
141
|
- spec/postrank-uri_spec.rb
|
88
142
|
homepage: http://github.com/postrank-labs/postrank-uri
|
89
|
-
licenses:
|
143
|
+
licenses:
|
144
|
+
- MIT
|
90
145
|
metadata: {}
|
91
146
|
post_install_message:
|
92
147
|
rdoc_options: []
|
@@ -94,17 +149,17 @@ require_paths:
|
|
94
149
|
- lib
|
95
150
|
required_ruby_version: !ruby/object:Gem::Requirement
|
96
151
|
requirements:
|
97
|
-
- -
|
152
|
+
- - ">="
|
98
153
|
- !ruby/object:Gem::Version
|
99
154
|
version: '0'
|
100
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
156
|
requirements:
|
102
|
-
- -
|
157
|
+
- - ">="
|
103
158
|
- !ruby/object:Gem::Version
|
104
159
|
version: '0'
|
105
160
|
requirements: []
|
106
161
|
rubyforge_project: postrank-uri
|
107
|
-
rubygems_version: 2.
|
162
|
+
rubygems_version: 2.6.8
|
108
163
|
signing_key:
|
109
164
|
specification_version: 4
|
110
165
|
summary: URI normalization, c14n, escaping, and extraction
|
@@ -112,4 +167,3 @@ test_files:
|
|
112
167
|
- spec/c14n_hosts.yml
|
113
168
|
- spec/helper.rb
|
114
169
|
- spec/postrank-uri_spec.rb
|
115
|
-
has_rdoc:
|