postrank-uri 1.0.17 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1f9a235d0c2287732278b672997027c7d4093d0f
4
+ data.tar.gz: 8c91e4b1c787c43e6ea2ccc51593be3809746fa7
5
+ SHA512:
6
+ metadata.gz: c702922f05473c762d7223777eb48641db8c4b0e156539c729433f3aa9174b010bce707801ee8c7ec310e4181edb3402c77131d3a66b0bdc02e99f87c1d9982a
7
+ data.tar.gz: 06040dead1c6348febb64211e2787216c287c87baa09555947e03a4bea09b7293ed33faed08851e28aaa2ff08389d121a48746cc6a6f3af6c7e023f75db1917a
data/README.md CHANGED
@@ -14,7 +14,7 @@ In a nutshell, we need to make sure that creative cases like the ones below all
14
14
  ## API
15
15
 
16
16
  - **PostRank::URI.extract(text)** - Detect URIs in text, discard bad TLDs
17
- - **PostRank::URI.clean(uri)** - Unescape, normalize, apply c18n filters - 95% use case.
17
+ - **PostRank::URI.clean(uri)** - Unescape, normalize, apply c14n filters - 95% use case.
18
18
 
19
19
  - **PostRank::URI.normalize(uri)** - Apply RFC normalization rules, discard extra path characters, drop anchors
20
20
  - **PostRank::URI.unescape(uri)** - Unescape URI entities, handle +/%20's, etc
@@ -33,8 +33,8 @@ In a nutshell, we need to make sure that creative cases like the ones below all
33
33
  [0] "http://link.to/?a=b"
34
34
  ]
35
35
 
36
- ## C18N
36
+ ## C14N
37
37
 
38
- As part of URI canonicalization the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, but which has no effect on the content - hence, it is removed from the URI. For full list, see the c18n.yml file.
38
+ As part of URI canonicalization the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, which has no effect on the content - hence, it is removed from the URI. For the full list, see the c14n.yml file.
39
39
 
40
40
  Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), so instead we are compiling a manually assembled database. If you find cases that are missing, please report them or send us a pull request!
@@ -1,4 +1,3 @@
1
- # -*- encoding: utf-8 -*-
2
1
 
3
2
  require 'addressable/uri'
4
3
  require 'digest/md5'
@@ -34,11 +33,11 @@ end
34
33
  module PostRank
35
34
  module URI
36
35
 
37
- c18ndb = YAML.load_file(File.dirname(__FILE__) + '/postrank-uri/c18n.yml')
36
+ c14ndb = YAML.load_file(File.dirname(__FILE__) + '/postrank-uri/c14n.yml')
38
37
 
39
- C18N = {}
40
- C18N[:global] = c18ndb[:all].freeze
41
- C18N[:hosts] = c18ndb[:hosts].inject({}) {|h,(k,v)| h[/#{Regexp.escape(k)}$/.freeze] = v; h}
38
+ C14N = {}
39
+ C14N[:global] = c14ndb[:all].freeze
40
+ C14N[:hosts] = c14ndb[:hosts].inject({}) {|h,(k,v)| h[/#{Regexp.escape(k)}$/.freeze] = v; h}
42
41
 
43
42
  URIREGEX = {}
44
43
  URIREGEX[:protocol] = /https?:\/\//i
@@ -130,13 +129,15 @@ module PostRank
130
129
  end
131
130
 
132
131
  def unescape(uri)
133
- uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do
132
+ u = parse(uri)
133
+ u.query = u.query.tr('+', ' ') if u.query
134
+ u.to_s.gsub(URIREGEX[:unescape]) do
134
135
  [$1.delete('%')].pack('H*')
135
136
  end
136
137
  end
137
138
 
138
139
  def clean(uri, opts = {})
139
- uri = normalize(c18n(unescape(uri), opts))
140
+ uri = normalize(c14n(unescape(uri), opts))
140
141
  opts[:raw] ? uri : uri.to_s
141
142
  end
142
143
 
@@ -153,13 +154,13 @@ module PostRank
153
154
  u
154
155
  end
155
156
 
156
- def c18n(uri, opts = {})
157
+ def c14n(uri, opts = {})
157
158
  u = parse(uri, opts)
158
159
  u = embedded(u)
159
160
 
160
161
  if q = u.query_values(Array)
161
- q.delete_if { |k,v| C18N[:global].include?(k) }
162
- q.delete_if { |k,v| C18N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } }
162
+ q.delete_if { |k,v| C14N[:global].include?(k) }
163
+ q.delete_if { |k,v| C14N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } }
163
164
  end
164
165
  u.query_values = q
165
166
 
@@ -224,7 +225,7 @@ module PostRank
224
225
  cleaned_uri = clean(uri, :raw => true)
225
226
 
226
227
  if host = cleaned_uri.host
227
- is_valid = PublicSuffix.valid?(host)
228
+ is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host))
228
229
  end
229
230
 
230
231
  is_valid
@@ -7,6 +7,8 @@
7
7
  - utm_campaign # Google Analytics: campaign name
8
8
  - sms_ss # addthis.com tracker
9
9
  - awesm # awe.sm tracker
10
+ - xtor # AT Internet tracker
11
+ - PHPSESSID # Legacy PHP session identifier
10
12
 
11
13
  :hosts:
12
14
  nytimes.com:
@@ -1,5 +1,5 @@
1
1
  module PostRank
2
2
  module URI
3
- VERSION = "1.0.17"
3
+ VERSION = "1.0.18"
4
4
  end
5
5
  end
@@ -9,14 +9,14 @@ Gem::Specification.new do |s|
9
9
  s.authors = ["Ilya Grigorik"]
10
10
  s.email = ["ilya@igvita.com"]
11
11
  s.homepage = "http://github.com/postrank-labs/postrank-uri"
12
- s.summary = "URI normalization, c18n, escaping, and extraction"
12
+ s.summary = "URI normalization, c14n, escaping, and extraction"
13
13
  s.description = s.summary
14
14
 
15
15
  s.rubyforge_project = "postrank-uri"
16
16
 
17
17
  s.add_dependency "addressable", "~> 2.3.0"
18
18
  s.add_dependency "public_suffix", "~> 1.1.3"
19
- s.add_dependency "nokogiri", "~> 1.5.5"
19
+ s.add_dependency "nokogiri", "~> 1.6.1"
20
20
 
21
21
  s.add_development_dependency "rspec"
22
22
 
File without changes
@@ -29,15 +29,19 @@ describe PostRank::URI do
29
29
  # See http://tools.ietf.org/html/rfc3986#section-2.3
30
30
 
31
31
  it "should unescape PostRank::URI with spaces encoded as '+'" do
32
- PostRank::URI.unescape('id=+1').should == 'id= 1'
32
+ PostRank::URI.unescape('?id=+1').should == '?id= 1'
33
33
  end
34
34
 
35
35
  it "should unescape PostRank::URI with spaces encoded as '+'" do
36
- PostRank::URI.unescape('id%3D+1').should == 'id= 1'
36
+ PostRank::URI.unescape('?id%3D+1').should == '?id= 1'
37
37
  end
38
38
 
39
39
  it "should unescape PostRank::URI with spaces encoded as %20" do
40
- PostRank::URI.unescape('id=%201').should == 'id= 1'
40
+ PostRank::URI.unescape('?id=%201').should == '?id= 1'
41
+ end
42
+
43
+ it "should not unescape '+' to spaces in paths" do
44
+ PostRank::URI.unescape('/foo+bar?id=foo+bar').should == '/foo+bar?id=foo bar'
41
45
  end
42
46
  end
43
47
 
@@ -100,7 +104,7 @@ describe PostRank::URI do
100
104
 
101
105
  context "canonicalization" do
102
106
  def c(uri)
103
- PostRank::URI.c18n(uri).to_s
107
+ PostRank::URI.c14n(uri).to_s
104
108
  end
105
109
 
106
110
  context "query parameters" do
@@ -122,6 +126,12 @@ describe PostRank::URI do
122
126
  c('igvita.com/?id=a&utm_source=a&awesm=b').should == 'http://igvita.com/?id=a'
123
127
  c('igvita.com/?id=a&sms_ss=a').should == 'http://igvita.com/?id=a'
124
128
  end
129
+
130
+ it "should remove PHPSESSID parameter" do
131
+ c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').should == 'http://www.nachi.org/forum?'
132
+ c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').should == 'http://www.nachi.org/forum/?'
133
+ c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd').should == 'http://www.nachi.org/forum?id=123'
134
+ end
125
135
  end
126
136
 
127
137
  context "hashbang" do
@@ -162,7 +172,7 @@ describe PostRank::URI do
162
172
  PostRank::URI.clean(uri)
163
173
  end
164
174
 
165
- it "should unescape, c18n and normalize" do
175
+ it "should unescape, c14n and normalize" do
166
176
  c('http://igvita.com/?id=1').should == 'http://igvita.com/?id=1'
167
177
  c('igvita.com/?id=1').should == 'http://igvita.com/?id=1'
168
178
 
@@ -179,7 +189,7 @@ describe PostRank::URI do
179
189
  end
180
190
 
181
191
  it "should clean host specific parameters" do
182
- YAML.load_file('spec/c18n_hosts.yml').each do |orig, clean|
192
+ YAML.load_file('spec/c14n_hosts.yml').each do |orig, clean|
183
193
  c(orig).should == clean
184
194
  end
185
195
  end
@@ -350,5 +360,13 @@ describe PostRank::URI do
350
360
  it 'marks www.test.com as valid' do
351
361
  PostRank::URI.valid?('http://www.test.com').should be_true
352
362
  end
363
+
364
+ it 'marks Unicode domain as valid (NOTE: works only with a scheme)' do
365
+ PostRank::URI.valid?('http://президент.рф').should be_true
366
+ end
367
+
368
+ it 'marks punycode domain domain as valid' do
369
+ PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai').should be_true
370
+ end
353
371
  end
354
372
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: postrank-uri
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.17
5
- prerelease:
4
+ version: 1.0.18
6
5
  platform: ruby
7
6
  authors:
8
7
  - Ilya Grigorik
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-10-07 00:00:00.000000000 Z
11
+ date: 2014-04-10 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: addressable
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ~>
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ~>
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: public_suffix
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ~>
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ~>
44
39
  - !ruby/object:Gem::Version
@@ -46,36 +41,32 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: nokogiri
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ~>
52
46
  - !ruby/object:Gem::Version
53
- version: 1.5.5
47
+ version: 1.6.1
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ~>
60
53
  - !ruby/object:Gem::Version
61
- version: 1.5.5
54
+ version: 1.6.1
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: rspec
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
- - - ! '>='
59
+ - - '>='
68
60
  - !ruby/object:Gem::Version
69
61
  version: '0'
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
- - - ! '>='
66
+ - - '>='
76
67
  - !ruby/object:Gem::Version
77
68
  version: '0'
78
- description: URI normalization, c18n, escaping, and extraction
69
+ description: URI normalization, c14n, escaping, and extraction
79
70
  email:
80
71
  - ilya@igvita.com
81
72
  executables: []
@@ -88,38 +79,37 @@ files:
88
79
  - README.md
89
80
  - Rakefile
90
81
  - lib/postrank-uri.rb
91
- - lib/postrank-uri/c18n.yml
82
+ - lib/postrank-uri/c14n.yml
92
83
  - lib/postrank-uri/version.rb
93
84
  - postrank-uri.gemspec
94
- - spec/c18n_hosts.yml
85
+ - spec/c14n_hosts.yml
95
86
  - spec/helper.rb
96
87
  - spec/postrank-uri_spec.rb
97
88
  homepage: http://github.com/postrank-labs/postrank-uri
98
89
  licenses: []
90
+ metadata: {}
99
91
  post_install_message:
100
92
  rdoc_options: []
101
93
  require_paths:
102
94
  - lib
103
95
  required_ruby_version: !ruby/object:Gem::Requirement
104
- none: false
105
96
  requirements:
106
- - - ! '>='
97
+ - - '>='
107
98
  - !ruby/object:Gem::Version
108
99
  version: '0'
109
100
  required_rubygems_version: !ruby/object:Gem::Requirement
110
- none: false
111
101
  requirements:
112
- - - ! '>='
102
+ - - '>='
113
103
  - !ruby/object:Gem::Version
114
104
  version: '0'
115
105
  requirements: []
116
106
  rubyforge_project: postrank-uri
117
- rubygems_version: 1.8.24
107
+ rubygems_version: 2.0.6
118
108
  signing_key:
119
- specification_version: 3
120
- summary: URI normalization, c18n, escaping, and extraction
109
+ specification_version: 4
110
+ summary: URI normalization, c14n, escaping, and extraction
121
111
  test_files:
122
- - spec/c18n_hosts.yml
112
+ - spec/c14n_hosts.yml
123
113
  - spec/helper.rb
124
114
  - spec/postrank-uri_spec.rb
125
115
  has_rdoc: