postrank-uri 1.0.17 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +3 -3
- data/lib/postrank-uri.rb +12 -11
- data/lib/postrank-uri/{c18n.yml → c14n.yml} +2 -0
- data/lib/postrank-uri/version.rb +1 -1
- data/postrank-uri.gemspec +2 -2
- data/spec/{c18n_hosts.yml → c14n_hosts.yml} +0 -0
- data/spec/postrank-uri_spec.rb +24 -6
- metadata +16 -26
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1f9a235d0c2287732278b672997027c7d4093d0f
|
4
|
+
data.tar.gz: 8c91e4b1c787c43e6ea2ccc51593be3809746fa7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c702922f05473c762d7223777eb48641db8c4b0e156539c729433f3aa9174b010bce707801ee8c7ec310e4181edb3402c77131d3a66b0bdc02e99f87c1d9982a
|
7
|
+
data.tar.gz: 06040dead1c6348febb64211e2787216c287c87baa09555947e03a4bea09b7293ed33faed08851e28aaa2ff08389d121a48746cc6a6f3af6c7e023f75db1917a
|
data/README.md
CHANGED
@@ -14,7 +14,7 @@ In a nutshell, we need to make sure that creative cases like the ones below all
|
|
14
14
|
## API
|
15
15
|
|
16
16
|
- **PostRank::URI.extract(text)** - Detect URIs in text, discard bad TLD's
|
17
|
-
- **PostRank::URI.clean(uri)** - Unescape, normalize, apply c18n filters - 95% use case.
|
17
|
+
- **PostRank::URI.clean(uri)** - Unescape, normalize, apply c14n filters - 95% use case.
|
18
18
|
|
19
19
|
- **PostRank::URI.normalize(uri)** - Apply RFC normalization rules, discard extra path characters, drop anchors
|
20
20
|
- **PostRank::URI.unescape(uri)** - Unescape URI entities, handle +/%20's, etc
|
@@ -33,8 +33,8 @@ In a nutshell, we need to make sure that creative cases like the ones below all
|
|
33
33
|
[0] "http://link.to/?a=b"
|
34
34
|
]
|
35
35
|
|
36
|
-
## C18N
|
36
|
+
## C14N
|
37
37
|
|
38
|
-
As part of URI canonicalization the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, but which has no effect on the content - hence, it is removed from the URI. For full list, see the c18n.yml file.
|
38
|
+
As part of URI canonicalization the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, but which has no effect on the content - hence, it is removed from the URI. For full list, see the c14n.yml file.
|
39
39
|
|
40
40
|
Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), instead we are compiling a manually assembled database. If you find cases which are missing, please do report them, or send us a pull request!
|
data/lib/postrank-uri.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
|
3
2
|
require 'addressable/uri'
|
4
3
|
require 'digest/md5'
|
@@ -34,11 +33,11 @@ end
|
|
34
33
|
module PostRank
|
35
34
|
module URI
|
36
35
|
|
37
|
-
|
36
|
+
c14ndb = YAML.load_file(File.dirname(__FILE__) + '/postrank-uri/c14n.yml')
|
38
37
|
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
C14N = {}
|
39
|
+
C14N[:global] = c14ndb[:all].freeze
|
40
|
+
C14N[:hosts] = c14ndb[:hosts].inject({}) {|h,(k,v)| h[/#{Regexp.escape(k)}$/.freeze] = v; h}
|
42
41
|
|
43
42
|
URIREGEX = {}
|
44
43
|
URIREGEX[:protocol] = /https?:\/\//i
|
@@ -130,13 +129,15 @@ module PostRank
|
|
130
129
|
end
|
131
130
|
|
132
131
|
def unescape(uri)
|
133
|
-
|
132
|
+
u = parse(uri)
|
133
|
+
u.query = u.query.tr('+', ' ') if u.query
|
134
|
+
u.to_s.gsub(URIREGEX[:unescape]) do
|
134
135
|
[$1.delete('%')].pack('H*')
|
135
136
|
end
|
136
137
|
end
|
137
138
|
|
138
139
|
def clean(uri, opts = {})
|
139
|
-
uri = normalize(
|
140
|
+
uri = normalize(c14n(unescape(uri), opts))
|
140
141
|
opts[:raw] ? uri : uri.to_s
|
141
142
|
end
|
142
143
|
|
@@ -153,13 +154,13 @@ module PostRank
|
|
153
154
|
u
|
154
155
|
end
|
155
156
|
|
156
|
-
def
|
157
|
+
def c14n(uri, opts = {})
|
157
158
|
u = parse(uri, opts)
|
158
159
|
u = embedded(u)
|
159
160
|
|
160
161
|
if q = u.query_values(Array)
|
161
|
-
q.delete_if { |k,v|
|
162
|
-
q.delete_if { |k,v|
|
162
|
+
q.delete_if { |k,v| C14N[:global].include?(k) }
|
163
|
+
q.delete_if { |k,v| C14N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } }
|
163
164
|
end
|
164
165
|
u.query_values = q
|
165
166
|
|
@@ -224,7 +225,7 @@ module PostRank
|
|
224
225
|
cleaned_uri = clean(uri, :raw => true)
|
225
226
|
|
226
227
|
if host = cleaned_uri.host
|
227
|
-
is_valid = PublicSuffix.valid?(host)
|
228
|
+
is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host))
|
228
229
|
end
|
229
230
|
|
230
231
|
is_valid
|
data/lib/postrank-uri/version.rb
CHANGED
data/postrank-uri.gemspec
CHANGED
@@ -9,14 +9,14 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.authors = ["Ilya Grigorik"]
|
10
10
|
s.email = ["ilya@igvita.com"]
|
11
11
|
s.homepage = "http://github.com/postrank-labs/postrank-uri"
|
12
|
-
s.summary = "URI normalization,
|
12
|
+
s.summary = "URI normalization, c14n, escaping, and extraction"
|
13
13
|
s.description = s.summary
|
14
14
|
|
15
15
|
s.rubyforge_project = "postrank-uri"
|
16
16
|
|
17
17
|
s.add_dependency "addressable", "~> 2.3.0"
|
18
18
|
s.add_dependency "public_suffix", "~> 1.1.3"
|
19
|
-
s.add_dependency "nokogiri", "~> 1.
|
19
|
+
s.add_dependency "nokogiri", "~> 1.6.1"
|
20
20
|
|
21
21
|
s.add_development_dependency "rspec"
|
22
22
|
|
File without changes
|
data/spec/postrank-uri_spec.rb
CHANGED
@@ -29,15 +29,19 @@ describe PostRank::URI do
|
|
29
29
|
# See http://tools.ietf.org/html/rfc3986#section-2.3
|
30
30
|
|
31
31
|
it "should unescape PostRank::URI with spaces encoded as '+'" do
|
32
|
-
PostRank::URI.unescape('id=+1').should == 'id= 1'
|
32
|
+
PostRank::URI.unescape('?id=+1').should == '?id= 1'
|
33
33
|
end
|
34
34
|
|
35
35
|
it "should unescape PostRank::URI with spaces encoded as '+'" do
|
36
|
-
PostRank::URI.unescape('id%3D+1').should == 'id= 1'
|
36
|
+
PostRank::URI.unescape('?id%3D+1').should == '?id= 1'
|
37
37
|
end
|
38
38
|
|
39
39
|
it "should unescape PostRank::URI with spaces encoded as %20" do
|
40
|
-
PostRank::URI.unescape('id=%201').should == 'id= 1'
|
40
|
+
PostRank::URI.unescape('?id=%201').should == '?id= 1'
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should not unescape '+' to spaces in paths" do
|
44
|
+
PostRank::URI.unescape('/foo+bar?id=foo+bar').should == '/foo+bar?id=foo bar'
|
41
45
|
end
|
42
46
|
end
|
43
47
|
|
@@ -100,7 +104,7 @@ describe PostRank::URI do
|
|
100
104
|
|
101
105
|
context "canonicalization" do
|
102
106
|
def c(uri)
|
103
|
-
PostRank::URI.
|
107
|
+
PostRank::URI.c14n(uri).to_s
|
104
108
|
end
|
105
109
|
|
106
110
|
context "query parameters" do
|
@@ -122,6 +126,12 @@ describe PostRank::URI do
|
|
122
126
|
c('igvita.com/?id=a&utm_source=a&awesm=b').should == 'http://igvita.com/?id=a'
|
123
127
|
c('igvita.com/?id=a&sms_ss=a').should == 'http://igvita.com/?id=a'
|
124
128
|
end
|
129
|
+
|
130
|
+
it "should remove PHPSESSID parameter" do
|
131
|
+
c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').should == 'http://www.nachi.org/forum?'
|
132
|
+
c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').should == 'http://www.nachi.org/forum/?'
|
133
|
+
c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').should == 'http://www.nachi.org/forum?id=123'
|
134
|
+
end
|
125
135
|
end
|
126
136
|
|
127
137
|
context "hashbang" do
|
@@ -162,7 +172,7 @@ describe PostRank::URI do
|
|
162
172
|
PostRank::URI.clean(uri)
|
163
173
|
end
|
164
174
|
|
165
|
-
it "should unescape,
|
175
|
+
it "should unescape, c14n and normalize" do
|
166
176
|
c('http://igvita.com/?id=1').should == 'http://igvita.com/?id=1'
|
167
177
|
c('igvita.com/?id=1').should == 'http://igvita.com/?id=1'
|
168
178
|
|
@@ -179,7 +189,7 @@ describe PostRank::URI do
|
|
179
189
|
end
|
180
190
|
|
181
191
|
it "should clean host specific parameters" do
|
182
|
-
YAML.load_file('spec/
|
192
|
+
YAML.load_file('spec/c14n_hosts.yml').each do |orig, clean|
|
183
193
|
c(orig).should == clean
|
184
194
|
end
|
185
195
|
end
|
@@ -350,5 +360,13 @@ describe PostRank::URI do
|
|
350
360
|
it 'marks www.test.com as valid' do
|
351
361
|
PostRank::URI.valid?('http://www.test.com').should be_true
|
352
362
|
end
|
363
|
+
|
364
|
+
it 'marks Unicode domain as valid (NOTE: works only with a scheme)' do
|
365
|
+
PostRank::URI.valid?('http://президент.рф').should be_true
|
366
|
+
end
|
367
|
+
|
368
|
+
it 'marks punycode domain domain as valid' do
|
369
|
+
PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai').should be_true
|
370
|
+
end
|
353
371
|
end
|
354
372
|
end
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: postrank-uri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.17
|
5
|
-
prerelease:
|
4
|
+
version: 1.0.18
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Ilya Grigorik
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-04-10 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: addressable
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ~>
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ~>
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: public_suffix
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ~>
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ~>
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,36 +41,32 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: nokogiri
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ~>
|
52
46
|
- !ruby/object:Gem::Version
|
53
|
-
version: 1.
|
47
|
+
version: 1.6.1
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
52
|
- - ~>
|
60
53
|
- !ruby/object:Gem::Version
|
61
|
-
version: 1.
|
54
|
+
version: 1.6.1
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: rspec
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- -
|
59
|
+
- - '>='
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '0'
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- -
|
66
|
+
- - '>='
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '0'
|
78
|
-
description: URI normalization, c18n, escaping, and extraction
|
69
|
+
description: URI normalization, c14n, escaping, and extraction
|
79
70
|
email:
|
80
71
|
- ilya@igvita.com
|
81
72
|
executables: []
|
@@ -88,38 +79,37 @@ files:
|
|
88
79
|
- README.md
|
89
80
|
- Rakefile
|
90
81
|
- lib/postrank-uri.rb
|
91
|
-
- lib/postrank-uri/c18n.yml
|
82
|
+
- lib/postrank-uri/c14n.yml
|
92
83
|
- lib/postrank-uri/version.rb
|
93
84
|
- postrank-uri.gemspec
|
94
|
-
- spec/c18n_hosts.yml
|
85
|
+
- spec/c14n_hosts.yml
|
95
86
|
- spec/helper.rb
|
96
87
|
- spec/postrank-uri_spec.rb
|
97
88
|
homepage: http://github.com/postrank-labs/postrank-uri
|
98
89
|
licenses: []
|
90
|
+
metadata: {}
|
99
91
|
post_install_message:
|
100
92
|
rdoc_options: []
|
101
93
|
require_paths:
|
102
94
|
- lib
|
103
95
|
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
-
none: false
|
105
96
|
requirements:
|
106
|
-
- -
|
97
|
+
- - '>='
|
107
98
|
- !ruby/object:Gem::Version
|
108
99
|
version: '0'
|
109
100
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
110
|
-
none: false
|
111
101
|
requirements:
|
112
|
-
- -
|
102
|
+
- - '>='
|
113
103
|
- !ruby/object:Gem::Version
|
114
104
|
version: '0'
|
115
105
|
requirements: []
|
116
106
|
rubyforge_project: postrank-uri
|
117
|
-
rubygems_version:
|
107
|
+
rubygems_version: 2.0.6
|
118
108
|
signing_key:
|
119
|
-
specification_version:
|
120
|
-
summary: URI normalization, c18n, escaping, and extraction
|
109
|
+
specification_version: 4
|
110
|
+
summary: URI normalization, c14n, escaping, and extraction
|
121
111
|
test_files:
|
122
|
-
- spec/c18n_hosts.yml
|
112
|
+
- spec/c14n_hosts.yml
|
123
113
|
- spec/helper.rb
|
124
114
|
- spec/postrank-uri_spec.rb
|
125
115
|
has_rdoc:
|