postrank-uri 1.0.0 → 1.0.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/.rspec +0 -0
- data/.travis.yml +15 -0
- data/Appraisals +15 -0
- data/LICENSE +21 -0
- data/README.md +32 -5
- data/Rakefile +3 -4
- data/lib/postrank-uri/{c18n.yml → c14n.yml} +20 -1
- data/lib/postrank-uri/version.rb +1 -1
- data/lib/postrank-uri.rb +148 -34
- data/postrank-uri.gemspec +10 -4
- data/spec/{c18n_hosts.yml → c14n_hosts.yml} +30 -3
- data/spec/helper.rb +4 -1
- data/spec/postrank-uri_spec.rb +287 -82
- metadata +119 -75
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8ea2cc7f1dc58cb559b9168ff0f83150f1ec6119
|
4
|
+
data.tar.gz: 73d97d1f7c56b4b0644eb9a8ad54490ca1561fbf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1fa2d5475a617ab8181554f4d15f4d72a1c67f40bb60c16502a575e16de6d721edcbe0c6ca1e5c331588510e0dd1cced56ea9dd4704dc7e9ab59b71c6a6385a5
|
7
|
+
data.tar.gz: 6ab0bf3e698d99127db88528a8fefba1b4d4c7667a6c42cc256a71dadb2dd78dd1f080e206312cc395b9a9c3c68b1fb62335c2f6a48346871cd85466ac86660b
|
data/.gitignore
ADDED
data/.rspec
ADDED
File without changes
|
data/.travis.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
rvm:
|
4
|
+
- 2.3.8
|
5
|
+
- 2.4.5
|
6
|
+
- 2.5.3
|
7
|
+
- 2.6.1
|
8
|
+
before_install:
|
9
|
+
- gem install bundler
|
10
|
+
install:
|
11
|
+
- bundle install --jobs=3 --retry=3
|
12
|
+
- bundle exec appraisal install
|
13
|
+
script:
|
14
|
+
- bundle exec rake
|
15
|
+
- bundle exec rake appraisal
|
data/Appraisals
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
nokogiri_versions = ["1.8", "1.9", "1.10"]
|
2
|
+
|
3
|
+
nokogiri_versions.each do |version|
|
4
|
+
appraise "nokogiri-#{version}" do
|
5
|
+
gem "nokogiri", "~> #{version}.0"
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
addressable_versions = ["2.4", "2.5", "2.6"]
|
10
|
+
|
11
|
+
addressable_versions.each do |version|
|
12
|
+
appraise "addressable-#{version}" do
|
13
|
+
gem "addressable", "~> #{version}.0"
|
14
|
+
end
|
15
|
+
end
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2011 Ilya Grigorik
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# PostRank URI
|
2
2
|
|
3
|
-
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/postrank-uri.svg)](https://rubygems.org/gems/postrank-uri) [![Build Status](https://travis-ci.org/postrank-labs/postrank-uri.svg?branch=master)](https://travis-ci.org/postrank-labs/postrank-uri)
|
4
|
+
|
5
|
+
A collection of convenience methods (Ruby 2.3+) for dealing with extracting, (un)escaping, normalization, and canonicalization of URIs. At PostRank we process over 20M URI associated activities each day, and we need to make sure that we can reliably extract the URIs from a variety of text formats, deal with all the numerous and creative ways users like to escape and unescape their URIs, normalize the resulting URIs, and finally apply a set of custom canonicalization rules to make sure that we can cross-reference when the users are talking about the same URL.
|
4
6
|
|
5
7
|
In a nutshell, we need to make sure that creative cases like the ones below all resolve to the same URI:
|
6
8
|
|
@@ -14,7 +16,7 @@ In a nutshell, we need to make sure that creative cases like the ones below all
|
|
14
16
|
## API
|
15
17
|
|
16
18
|
- **PostRank::URI.extract(text)** - Detect URIs in text, discard bad TLD's
|
17
|
-
- **PostRank::URI.clean(uri)** - Unescape, normalize, apply
|
19
|
+
- **PostRank::URI.clean(uri)** - Unescape, normalize, apply c14n filters - 95% use case.
|
18
20
|
|
19
21
|
- **PostRank::URI.normalize(uri)** - Apply RFC normalization rules, discard extra path characters, drop anchors
|
20
22
|
- **PostRank::URI.unescape(uri)** - Unescape URI entities, handle +/%20's, etc
|
@@ -33,8 +35,33 @@ In a nutshell, we need to make sure that creative cases like the ones below all
|
|
33
35
|
[0] "http://link.to/?a=b"
|
34
36
|
]
|
35
37
|
|
36
|
-
##
|
38
|
+
## C14N
|
39
|
+
|
40
|
+
As part of URI canonicalization, the library will remove common tracking parameters from Google Analytics and several other providers. Beyond that, host-specific rules are also applied. For example, nytimes.com likes to add a 'partner' query parameter for tracking purposes, which has no effect on the content - hence, it is removed from the URI. For the full list, see the c14n.yml file.
|
41
|
+
|
42
|
+
Detecting "duplicate URLs" is a hard problem to solve (expensive in all senses), so instead we are compiling a manually assembled database. If you find cases which are missing, please do report them, or send us a pull request!
|
43
|
+
|
44
|
+
## Development
|
45
|
+
|
46
|
+
### Setup
|
47
|
+
|
48
|
+
```
|
49
|
+
bundle install
|
50
|
+
```
|
51
|
+
|
52
|
+
### Running tests
|
53
|
+
|
54
|
+
```
|
55
|
+
bundle exec rake
|
56
|
+
```
|
57
|
+
|
58
|
+
### Running dependency appraisals
|
59
|
+
|
60
|
+
To verify `postrank-uri` works with different versions of its runtime dependencies, you can run:
|
37
61
|
|
38
|
-
|
62
|
+
```
|
63
|
+
bundle exec appraisal install
|
64
|
+
bundle exec rake appraisal
|
65
|
+
```
|
39
66
|
|
40
|
-
|
67
|
+
This will execute the test suite with different versions of the dependencies.
|
data/Rakefile
CHANGED
@@ -7,12 +7,19 @@
|
|
7
7
|
- utm_campaign # Google Analytics: campaign name
|
8
8
|
- sms_ss # addthis.com tracker
|
9
9
|
- awesm # awe.sm tracker
|
10
|
+
- xtor # AT Internet tracker
|
11
|
+
- PHPSESSID # Legacy PHP session identifier
|
10
12
|
|
11
13
|
:hosts:
|
12
14
|
nytimes.com:
|
13
15
|
- partner
|
16
|
+
- pagewanted
|
14
17
|
- emc
|
15
18
|
- _r
|
19
|
+
- ref
|
20
|
+
- src
|
21
|
+
diepresse.com:
|
22
|
+
- _vl_backlink
|
16
23
|
washingtonpost.com:
|
17
24
|
- nav
|
18
25
|
- wprss
|
@@ -34,4 +41,16 @@
|
|
34
41
|
welt.de:
|
35
42
|
- wtmc
|
36
43
|
usatoday.com:
|
37
|
-
- csp
|
44
|
+
- csp
|
45
|
+
cnet.com:
|
46
|
+
- part
|
47
|
+
- subj
|
48
|
+
- tag
|
49
|
+
wsj.com:
|
50
|
+
- mod
|
51
|
+
allthingsd.com:
|
52
|
+
- mod
|
53
|
+
waomarketing.com:
|
54
|
+
- nucrss
|
55
|
+
youtube.com:
|
56
|
+
- feature
|
data/lib/postrank-uri/version.rb
CHANGED
data/lib/postrank-uri.rb
CHANGED
@@ -1,22 +1,48 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# encoding: utf-8
|
3
2
|
require 'addressable/uri'
|
4
|
-
require '
|
3
|
+
require 'digest/md5'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'public_suffix'
|
5
6
|
require 'yaml'
|
6
7
|
|
8
|
+
module Addressable
|
9
|
+
class URI
|
10
|
+
def domain
|
11
|
+
host = self.host
|
12
|
+
(host && PublicSuffix.valid?(host, default_rule: nil)) ? PublicSuffix.parse(host).domain : nil
|
13
|
+
end
|
14
|
+
|
15
|
+
def normalized_query
|
16
|
+
@normalized_query ||= (begin
|
17
|
+
if self.query && self.query.strip != ''
|
18
|
+
(self.query.strip.split("&", -1).map do |pair|
|
19
|
+
Addressable::URI.normalize_component(
|
20
|
+
pair,
|
21
|
+
Addressable::URI::CharacterClasses::QUERY.sub("\\&", "")
|
22
|
+
)
|
23
|
+
end).join("&")
|
24
|
+
else
|
25
|
+
nil
|
26
|
+
end
|
27
|
+
end)
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
7
33
|
module PostRank
|
8
34
|
module URI
|
9
35
|
|
10
|
-
|
36
|
+
c14ndb = YAML.load_file(File.dirname(__FILE__) + '/postrank-uri/c14n.yml')
|
11
37
|
|
12
|
-
|
13
|
-
|
14
|
-
|
38
|
+
C14N = {}
|
39
|
+
C14N[:global] = c14ndb[:all].freeze
|
40
|
+
C14N[:hosts] = c14ndb[:hosts].inject({}) {|h,(k,v)| h[/#{Regexp.escape(k)}$/.freeze] = v; h}
|
15
41
|
|
16
42
|
URIREGEX = {}
|
17
43
|
URIREGEX[:protocol] = /https?:\/\//i
|
18
44
|
URIREGEX[:valid_preceding_chars] = /(?:|\.|[^-\/"':!=A-Z0-9_@@]|^|\:)/i
|
19
|
-
URIREGEX[:valid_domain] =
|
45
|
+
URIREGEX[:valid_domain] = /\b(?:[a-z0-9-]{1,63}\.){1,}[a-z]{2,63}(?::[0-9]+)?/i
|
20
46
|
URIREGEX[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i
|
21
47
|
|
22
48
|
# Allow URL paths to contain balanced parens
|
@@ -60,67 +86,155 @@ module PostRank
|
|
60
86
|
)
|
61
87
|
}iox;
|
62
88
|
|
89
|
+
URIREGEX[:reserved_characters] = /%3F|%26/i
|
63
90
|
URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x
|
64
|
-
URIREGEX[:unescape] = /(
|
91
|
+
URIREGEX[:unescape] = /(%[0-9a-fA-F]{2})/x
|
92
|
+
URIREGEX[:double_slash_outside_scheme] = /(?<!http:|https:)\/{2}/x
|
65
93
|
URIREGEX.each_pair{|k,v| v.freeze }
|
66
94
|
|
67
|
-
|
95
|
+
module_function
|
96
|
+
|
97
|
+
def extract(text)
|
68
98
|
return [] if !text
|
69
99
|
urls = []
|
70
100
|
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
urls.push url
|
75
|
-
rescue NoMethodError
|
101
|
+
# Only extract the URL if the domain is valid
|
102
|
+
if PublicSuffix.valid?(domain, default_rule: nil)
|
103
|
+
url = clean(url)
|
104
|
+
urls.push url.to_s
|
76
105
|
end
|
77
106
|
end
|
78
107
|
|
79
108
|
urls.compact
|
80
109
|
end
|
81
110
|
|
82
|
-
def
|
111
|
+
def extract_href(text, host = nil)
|
112
|
+
urls = []
|
113
|
+
Nokogiri.HTML(text).search('a').each do |a|
|
114
|
+
begin
|
115
|
+
url = clean(a.attr('href'), :raw => true, :host => host)
|
116
|
+
|
117
|
+
next unless url.absolute?
|
118
|
+
|
119
|
+
urls.push [url.to_s, a.text]
|
120
|
+
rescue
|
121
|
+
next
|
122
|
+
end
|
123
|
+
end
|
124
|
+
urls
|
125
|
+
end
|
126
|
+
|
127
|
+
def escape(uri)
|
83
128
|
uri.gsub(URIREGEX[:escape]) do
|
84
129
|
'%' + $1.unpack('H2' * $1.size).join('%').upcase
|
85
130
|
end.gsub(' ','%20')
|
86
131
|
end
|
87
132
|
|
88
|
-
def
|
89
|
-
|
90
|
-
|
133
|
+
def unescape(uri)
|
134
|
+
u = parse(uri)
|
135
|
+
u.query = u.query.tr('+', ' ') if u.query
|
136
|
+
u.to_s.gsub(URIREGEX[:unescape]) do |encoded|
|
137
|
+
if !encoded.match(URIREGEX[:reserved_characters]).nil?
|
138
|
+
encoded
|
139
|
+
else
|
140
|
+
[encoded.delete('%')].pack('H*')
|
141
|
+
end
|
91
142
|
end
|
92
143
|
end
|
93
144
|
|
94
|
-
def
|
95
|
-
normalize(
|
145
|
+
def clean(uri, opts = {})
|
146
|
+
uri = normalize(c14n(unescape(uri), opts))
|
147
|
+
opts[:raw] ? uri : uri.to_s
|
96
148
|
end
|
97
149
|
|
98
|
-
def
|
99
|
-
|
100
|
-
|
150
|
+
def hash(uri, opts = {})
|
151
|
+
Digest::MD5.hexdigest(opts[:clean] == true ? clean(uri) : uri)
|
152
|
+
end
|
153
|
+
|
154
|
+
def normalize(uri, opts = {})
|
155
|
+
u = parse(uri, opts)
|
156
|
+
u.path = u.path.gsub(URIREGEX[:double_slash_outside_scheme], '/')
|
157
|
+
u.path = u.path.chomp('/') if u.path.size != 1
|
101
158
|
u.query = nil if u.query && u.query.empty?
|
102
159
|
u.fragment = nil
|
103
160
|
u
|
104
161
|
end
|
105
162
|
|
106
|
-
def
|
107
|
-
u = parse(uri)
|
163
|
+
def c14n(uri, opts = {})
|
164
|
+
u = parse(uri, opts)
|
165
|
+
u = embedded(u)
|
108
166
|
|
109
|
-
if q = u.query_values(
|
110
|
-
q.delete_if { |k,v|
|
111
|
-
q.delete_if { |k,v|
|
167
|
+
if q = u.query_values(Array)
|
168
|
+
q.delete_if { |k,v| C14N[:global].include?(k) }
|
169
|
+
q.delete_if { |k,v| C14N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } }
|
112
170
|
end
|
113
|
-
|
114
171
|
u.query_values = q
|
172
|
+
|
173
|
+
if u.host =~ /^(mobile\.)?twitter\.com$/ && u.fragment && u.fragment.match(/^!(.*)/)
|
174
|
+
u.fragment = nil
|
175
|
+
u.path = $1
|
176
|
+
end
|
177
|
+
|
178
|
+
if u.host =~ /tumblr\.com$/ && u.path =~ /\/post\/\d+\//
|
179
|
+
u.path = u.path.gsub(/[^\/]+$/, '')
|
180
|
+
end
|
181
|
+
|
115
182
|
u
|
116
183
|
end
|
117
184
|
|
118
|
-
def
|
185
|
+
def embedded(uri)
|
186
|
+
embedded = if uri.host == 'news.google.com' && uri.path == '/news/url' \
|
187
|
+
|| uri.host == 'xfruits.com'
|
188
|
+
uri.query_values['url']
|
189
|
+
|
190
|
+
elsif uri.host =~ /myspace\.com/ && uri.path =~ /PostTo/
|
191
|
+
embedded = uri.query_values['u']
|
192
|
+
end
|
193
|
+
|
194
|
+
uri = clean(embedded, :raw => true) if embedded
|
195
|
+
uri
|
196
|
+
end
|
197
|
+
|
198
|
+
def parse(uri, opts = {})
|
119
199
|
return uri if uri.is_a? Addressable::URI
|
120
200
|
|
121
|
-
uri =
|
122
|
-
|
201
|
+
uri = Addressable::URI.parse(uri)
|
202
|
+
|
203
|
+
if !uri.host && uri.scheme !~ /^javascript|mailto|xmpp$/
|
204
|
+
if uri.scheme
|
205
|
+
# A scheme was parsed but no host, so the parser misread the input; retry with an explicit http:// prefix
|
206
|
+
return parse("http://#{uri}", opts)
|
207
|
+
end
|
208
|
+
|
209
|
+
if opts[:host]
|
210
|
+
uri.host = opts[:host]
|
211
|
+
else
|
212
|
+
parts = uri.path.to_s.split(/[\/:]/)
|
213
|
+
if parts.first =~ URIREGEX[:valid_domain]
|
214
|
+
host = parts.shift
|
215
|
+
uri.path = '/' + parts.join('/')
|
216
|
+
uri.host = host
|
217
|
+
end
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
uri.scheme = 'http' if uri.host && !uri.scheme
|
222
|
+
uri.normalize!
|
123
223
|
end
|
124
224
|
|
225
|
+
def valid?(uri)
|
226
|
+
# URI is only valid if it is not nil, parses cleanly as a URI,
|
227
|
+
# and the domain has a recognized, valid TLD component
|
228
|
+
return false if uri.nil?
|
229
|
+
|
230
|
+
is_valid = false
|
231
|
+
cleaned_uri = clean(uri, :raw => true)
|
232
|
+
|
233
|
+
if host = cleaned_uri.host
|
234
|
+
is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
|
235
|
+
end
|
236
|
+
|
237
|
+
is_valid
|
238
|
+
end
|
125
239
|
end
|
126
|
-
end
|
240
|
+
end
|
data/postrank-uri.gemspec
CHANGED
@@ -8,15 +8,21 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Ilya Grigorik"]
|
10
10
|
s.email = ["ilya@igvita.com"]
|
11
|
-
s.homepage = "http://
|
12
|
-
s.summary = "URI normalization,
|
11
|
+
s.homepage = "http://github.com/postrank-labs/postrank-uri"
|
12
|
+
s.summary = "URI normalization, c14n, escaping, and extraction"
|
13
13
|
s.description = s.summary
|
14
|
+
s.license = 'MIT'
|
15
|
+
s.required_ruby_version = ">= 2.3.0"
|
14
16
|
|
15
17
|
s.rubyforge_project = "postrank-uri"
|
16
18
|
|
17
|
-
s.add_dependency "addressable"
|
18
|
-
s.add_dependency "
|
19
|
+
s.add_dependency "addressable", ">= 2.4.0"
|
20
|
+
s.add_dependency "public_suffix", ">= 2.0.0", "< 2.1"
|
21
|
+
s.add_dependency "nokogiri", ">= 1.8.0"
|
22
|
+
|
23
|
+
s.add_development_dependency "rake"
|
19
24
|
s.add_development_dependency "rspec"
|
25
|
+
s.add_development_dependency "appraisal", ">= 2.0.0", "< 3.0"
|
20
26
|
|
21
27
|
s.files = `git ls-files`.split("\n")
|
22
28
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -2,8 +2,26 @@
|
|
2
2
|
- - http://www.nytimes.com/2010/12/16/world/europe/16russia.html?_r=1&partner=rss&emc=rss
|
3
3
|
- http://www.nytimes.com/2010/12/16/world/europe/16russia.html
|
4
4
|
|
5
|
+
- - http://www.nytimes.com/2011/02/20/magazine/20FOB-Medium-t.html?ref=magazine
|
6
|
+
- http://www.nytimes.com/2011/02/20/magazine/20FOB-Medium-t.html
|
7
|
+
|
8
|
+
- - http://www.nytimes.com/2011/03/13/business/13hire.html?pagewanted=1&_r=1&ref=technology
|
9
|
+
- http://www.nytimes.com/2011/03/13/business/13hire.html
|
10
|
+
|
11
|
+
- - http://www.nytimes.com/2011/03/15/business/media/15adco.html?_r=2&src=recg
|
12
|
+
- http://www.nytimes.com/2011/03/15/business/media/15adco.html
|
13
|
+
|
14
|
+
- - http://networkeffect.allthingsd.com/20110308/googles-approach-to-social/?mod=tweet
|
15
|
+
- http://networkeffect.allthingsd.com/20110308/googles-approach-to-social
|
16
|
+
|
17
|
+
- - http://online.wsj.com/article/SB10001424052748704657704576150191661959856.html?mod=WSJ_hp_LEFTWhatsNewsCollection
|
18
|
+
- http://online.wsj.com/article/SB10001424052748704657704576150191661959856.html
|
19
|
+
|
20
|
+
- - http://diepresse.com/home/wirtschaft/636448/Griechenland_Drachme-als-letzte-Rettung?_vl_backlink=%2Fhome
|
21
|
+
- http://diepresse.com/home/wirtschaft/636448/Griechenland_Drachme-als-letzte-Rettung
|
22
|
+
|
5
23
|
- - http://dotearth.blogs.nytimes.com/2010/12/14/beyond-political-science/?partner=rss&emc=rss
|
6
|
-
- http://dotearth.blogs.nytimes.com/2010/12/14/beyond-political-science
|
24
|
+
- http://dotearth.blogs.nytimes.com/2010/12/14/beyond-political-science
|
7
25
|
|
8
26
|
- - http://www.washingtonpost.com/wp-dyn/content/article/2010/12/14/AR2010121406045.html?nav=rss_email/components
|
9
27
|
- http://www.washingtonpost.com/wp-dyn/content/article/2010/12/14/AR2010121406045.html
|
@@ -30,10 +48,19 @@
|
|
30
48
|
- http://www.dw-world.de/dw/article/0,,6330472,00.html
|
31
49
|
|
32
50
|
- - http://www.repubblica.it/rubriche/il-caso-del-giorno/2010/12/13/news/riscossa_aeffe-10153565/?rss
|
33
|
-
- http://www.repubblica.it/rubriche/il-caso-del-giorno/2010/12/13/news/riscossa_aeffe-10153565
|
51
|
+
- http://www.repubblica.it/rubriche/il-caso-del-giorno/2010/12/13/news/riscossa_aeffe-10153565
|
34
52
|
|
35
53
|
- - http://www.welt.de/sport/Der-Hoellenritt-des-Fussball-Profis-Jean-Marc-Bosman.html?wtmc=RSS.Sport.Fussball
|
36
54
|
- http://www.welt.de/sport/Der-Hoellenritt-des-Fussball-Profis-Jean-Marc-Bosman.html
|
37
55
|
|
38
56
|
- - http://www.usatoday.com/life/television/news/2011-01-19-race19_ST_N.htm?csp=34life
|
39
|
-
- http://www.usatoday.com/life/television/news/2011-01-19-race19_ST_N.htm
|
57
|
+
- http://www.usatoday.com/life/television/news/2011-01-19-race19_ST_N.htm
|
58
|
+
|
59
|
+
- - http://news.cnet.com/8301-17938_105-20029409-1.html?part=rss&subj=news&tag=2547-1_3-0-20
|
60
|
+
- http://news.cnet.com/8301-17938_105-20029409-1.html
|
61
|
+
|
62
|
+
- - http://www.waomarketing.com/blog/at-internet-white-paper-series/?nucrss=1
|
63
|
+
- http://www.waomarketing.com/blog/at-internet-white-paper-series
|
64
|
+
|
65
|
+
- - http://www.youtube.com/watch?v=RRBoPveyETc&feature=player_embedded
|
66
|
+
- http://www.youtube.com/watch?v=RRBoPveyETc
|
data/spec/helper.rb
CHANGED
data/spec/postrank-uri_spec.rb
CHANGED
@@ -3,145 +3,237 @@
|
|
3
3
|
require 'helper'
|
4
4
|
|
5
5
|
describe PostRank::URI do
|
6
|
-
|
7
|
-
let(:igvita) { 'http://igvita.com/' }
|
8
|
-
|
9
6
|
context "escaping" do
|
10
|
-
it "
|
11
|
-
PostRank::URI.escape('id=1').
|
7
|
+
it "escapes PostRank::URI string" do
|
8
|
+
expect(PostRank::URI.escape('id=1')).to eq('id%3D1')
|
12
9
|
end
|
13
10
|
|
14
|
-
it "
|
15
|
-
PostRank::URI.escape('id= 1').
|
11
|
+
it "escapes spaces as %20's" do
|
12
|
+
expect(PostRank::URI.escape('id= 1')).to match('%20')
|
16
13
|
end
|
17
14
|
end
|
18
15
|
|
19
16
|
context "unescape" do
|
20
|
-
it "
|
21
|
-
PostRank::URI.unescape(PostRank::URI.escape('id=1')).
|
17
|
+
it "unescapes PostRank::URI" do
|
18
|
+
expect(PostRank::URI.unescape(PostRank::URI.escape('id=1'))).to eq('id=1')
|
22
19
|
end
|
23
20
|
|
24
|
-
it "
|
25
|
-
PostRank::URI.unescape(PostRank::URI.escape('id= 1')).
|
21
|
+
it "unescapes PostRank::URI with spaces" do
|
22
|
+
expect(PostRank::URI.unescape(PostRank::URI.escape('id= 1'))).to eq('id= 1')
|
26
23
|
end
|
27
24
|
|
28
25
|
context "accept improperly escaped PostRank::URI strings" do
|
29
26
|
# See http://tools.ietf.org/html/rfc3986#section-2.3
|
30
27
|
|
31
|
-
it "
|
32
|
-
PostRank::URI.unescape('id=+1').
|
28
|
+
it "unescapes PostRank::URI with spaces encoded as '+'" do
|
29
|
+
expect(PostRank::URI.unescape('?id=+1')).to eq('?id= 1')
|
30
|
+
end
|
31
|
+
|
32
|
+
it "unescapes PostRank::URI with spaces encoded as '+'" do
|
33
|
+
expect(PostRank::URI.unescape('?id%3D+1')).to eq('?id= 1')
|
33
34
|
end
|
34
35
|
|
35
|
-
it "
|
36
|
-
PostRank::URI.unescape('id
|
36
|
+
it "unescapes PostRank::URI with spaces encoded as %20" do
|
37
|
+
expect(PostRank::URI.unescape('?id=%201')).to eq('?id= 1')
|
37
38
|
end
|
38
39
|
|
39
|
-
it "
|
40
|
-
PostRank::URI.unescape('id
|
40
|
+
it "does not unescape '+' to spaces in paths" do
|
41
|
+
expect(PostRank::URI.unescape('/foo+bar?id=foo+bar')).to eq('/foo+bar?id=foo bar')
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
45
|
end
|
45
46
|
|
46
47
|
context "normalize" do
|
48
|
+
let(:igvita) { 'http://igvita.com/' }
|
49
|
+
|
47
50
|
def n(uri)
|
48
51
|
PostRank::URI.normalize(uri).to_s
|
49
52
|
end
|
50
53
|
|
51
|
-
it "
|
52
|
-
n('http://igvita.com/').
|
53
|
-
n('http://igvita.com').to_s.
|
54
|
-
n('http://igvita.com///').
|
54
|
+
it "normalizes paths in PostRank::URIs" do
|
55
|
+
expect(n('http://igvita.com/')).to eq(igvita)
|
56
|
+
expect(n('http://igvita.com').to_s).to eq(igvita)
|
57
|
+
expect(n('http://igvita.com///')).to eq(igvita)
|
55
58
|
|
56
|
-
n('http://igvita.com/../').
|
57
|
-
n('http://igvita.com/a/b/../../').
|
58
|
-
n('http://igvita.com/a/b/../..').
|
59
|
+
expect(n('http://igvita.com/../')).to eq(igvita)
|
60
|
+
expect(n('http://igvita.com/a/b/../../')).to eq(igvita)
|
61
|
+
expect(n('http://igvita.com/a/b/../..')).to eq(igvita)
|
59
62
|
end
|
60
63
|
|
61
|
-
it "
|
62
|
-
n('http://igvita.com/?').
|
63
|
-
n('http://igvita.com?').
|
64
|
-
n('http://igvita.com/a/../?').
|
64
|
+
it "normalizes query strings in PostRank::URIs" do
|
65
|
+
expect(n('http://igvita.com/?')).to eq(igvita)
|
66
|
+
expect(n('http://igvita.com?')).to eq(igvita)
|
67
|
+
expect(n('http://igvita.com/a/../?')).to eq(igvita)
|
65
68
|
end
|
66
69
|
|
67
|
-
it "
|
68
|
-
n('http://igvita.com#test').
|
69
|
-
n('http://igvita.com#test#test').
|
70
|
-
n('http://igvita.com/a/../?#test').
|
70
|
+
it "normalizes anchors in PostRank::URIs" do
|
71
|
+
expect(n('http://igvita.com#test')).to eq(igvita)
|
72
|
+
expect(n('http://igvita.com#test#test')).to eq(igvita)
|
73
|
+
expect(n('http://igvita.com/a/../?#test')).to eq(igvita)
|
71
74
|
end
|
72
75
|
|
73
|
-
it "
|
74
|
-
n('http://igvita.com/a/../? ').
|
75
|
-
n('http://igvita.com/a/../? #test').
|
76
|
-
n('http://igvita.com/ /../').
|
76
|
+
it "cleans whitespace in PostRank::URIs" do
|
77
|
+
expect(n('http://igvita.com/a/../? ')).to eq(igvita)
|
78
|
+
expect(n('http://igvita.com/a/../? #test')).to eq(igvita)
|
79
|
+
expect(n('http://igvita.com/ /../')).to eq(igvita)
|
77
80
|
end
|
78
81
|
|
79
|
-
it "
|
80
|
-
n('igvita.com').
|
81
|
-
n('https://test.com/').to_s.
|
82
|
+
it "defaults to http scheme if missing" do
|
83
|
+
expect(n('igvita.com')).to eq(igvita)
|
84
|
+
expect(n('https://test.com/').to_s).to eq('https://test.com/')
|
82
85
|
end
|
83
86
|
|
84
|
-
it "
|
85
|
-
n('IGVITA.COM').
|
86
|
-
n('IGVITA.COM/ABC').
|
87
|
+
it "downcases the hostname" do
|
88
|
+
expect(n('IGVITA.COM')).to eq(igvita)
|
89
|
+
expect(n('IGVITA.COM/ABC')).to eq(igvita + "ABC")
|
87
90
|
end
|
88
91
|
|
92
|
+
it "removes trailing slash on paths" do
|
93
|
+
expect(n('http://igvita.com/')).to eq('http://igvita.com/')
|
94
|
+
|
95
|
+
expect(n('http://igvita.com/a')).to eq('http://igvita.com/a')
|
96
|
+
expect(n('http://igvita.com/a/')).to eq('http://igvita.com/a')
|
97
|
+
|
98
|
+
expect(n('http://igvita.com/a/b')).to eq('http://igvita.com/a/b')
|
99
|
+
expect(n('http://igvita.com/a/b/')).to eq('http://igvita.com/a/b')
|
100
|
+
end
|
101
|
+
it 'preserves nested urls' do
|
102
|
+
expect(n('http://igvita.com/a/b/http://hello.com')).to eq('http://igvita.com/a/b/http://hello.com')
|
103
|
+
expect(n('http://igvita.com/a//b/https://hello.com')).to eq('http://igvita.com/a/b/https://hello.com')
|
104
|
+
end
|
89
105
|
end
|
90
106
|
|
91
107
|
context "canonicalization" do
|
92
108
|
def c(uri)
|
93
|
-
PostRank::URI.
|
109
|
+
PostRank::URI.c14n(uri).to_s
|
94
110
|
end
|
95
111
|
|
96
112
|
context "query parameters" do
|
97
113
|
it "should handle nester parameters" do
|
98
|
-
c('igvita.com/?id=a&utm_source=a').
|
114
|
+
expect(c('igvita.com/?id=a&utm_source=a')).to eq('http://igvita.com/?id=a')
|
99
115
|
end
|
100
116
|
|
101
|
-
it "
|
117
|
+
it "preserves the order of parameters" do
|
102
118
|
url = 'http://a.com/?'+('a'..'z').to_a.shuffle.map {|e| "#{e}=#{e}"}.join("&")
|
103
|
-
c(url).
|
119
|
+
expect(c(url)).to eq(url)
|
104
120
|
end
|
105
121
|
|
106
|
-
it "
|
107
|
-
c('igvita.com/?id=a&utm_source=a').
|
108
|
-
c('igvita.com/?id=a&utm_source=a&utm_valid').
|
122
|
+
it "removes Google Analytics parameters" do
|
123
|
+
expect(c('igvita.com/?id=a&utm_source=a')).to eq('http://igvita.com/?id=a')
|
124
|
+
expect(c('igvita.com/?id=a&utm_source=a&utm_valid')).to eq('http://igvita.com/?id=a&utm_valid')
|
109
125
|
end
|
110
126
|
|
111
|
-
it "
|
112
|
-
c('igvita.com/?id=a&utm_source=a&awesm=b').
|
113
|
-
c('igvita.com/?id=a&sms_ss=a').
|
127
|
+
it "removes awesm/sms parameters" do
|
128
|
+
expect(c('igvita.com/?id=a&utm_source=a&awesm=b')).to eq('http://igvita.com/?id=a')
|
129
|
+
expect(c('igvita.com/?id=a&sms_ss=a')).to eq('http://igvita.com/?id=a')
|
114
130
|
end
|
115
131
|
|
132
|
+
it "removes PHPSESSID parameter" do
|
133
|
+
expect(c('http://www.nachi.org/forum?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum?')
|
134
|
+
expect(c('http://www.nachi.org/forum/?PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum/?')
|
135
|
+
expect(c('http://www.nachi.org/forum?id=123&PHPSESSID=9ee2fb10b7274ef2b15d1d4006b8c8dd')).to eq('http://www.nachi.org/forum?id=123')
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
context "hashbang" do
|
140
|
+
it "rewrites twitter links to crawlable versions" do
|
141
|
+
expect(c('http://twitter.com/#!/igrigorik')).to eq('http://twitter.com/igrigorik')
|
142
|
+
expect(c('http://twitter.com/#!/a/statuses/1')).to eq('http://twitter.com/a/statuses/1')
|
143
|
+
expect(c('http://nontwitter.com/#!/a/statuses/1')).to eq('http://nontwitter.com/#!/a/statuses/1')
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
context "tumblr" do
|
148
|
+
it "strips the slug" do
|
149
|
+
expect(c('http://test.tumblr.com/post/4533459403/some-text')).to eq('http://test.tumblr.com/post/4533459403/')
|
150
|
+
expect(c('http://tumblr.com/xjl2evo3hh')).to eq('http://tumblr.com/xjl2evo3hh')
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
context "embedded links" do
|
155
|
+
it "extracts embedded redirects from Google News" do
|
156
|
+
u = c('http://news.google.com/news/url?sa=t&fd=R&&url=http://www.ctv.ca/CTVNews/Politics/20110111/')
|
157
|
+
expect(u).to eq('http://www.ctv.ca/CTVNews/Politics/20110111')
|
158
|
+
end
|
159
|
+
|
160
|
+
it "extracts embedded redirects from xfruits.com" do
|
161
|
+
u = c('http://xfruits.com/MrGroar/?url=http%3A%2F%2Faap.lesroyaumes.com%2Fdepeches%2Fdepeche351820908.html')
|
162
|
+
expect(u).to eq('http://aap.lesroyaumes.com/depeches/depeche351820908.html')
|
163
|
+
end
|
164
|
+
|
165
|
+
it "extracts embedded redirects from MySpace" do
|
166
|
+
u = c('http://www.myspace.com/Modules/PostTo/Pages/?u=http%3A%2F%2Fghanaian-chronicle.com%2Fnews%2Fother-news%2Fcanadian-high-commissioner-urges-media%2F&t=Canadian%20High%20Commissioner%20urges%20media')
|
167
|
+
expect(u).to eq('http://ghanaian-chronicle.com/news/other-news/canadian-high-commissioner-urges-media')
|
168
|
+
end
|
116
169
|
end
|
117
170
|
end
|
118
171
|
|
119
172
|
context "clean" do
|
120
|
-
|
121
173
|
def c(uri)
|
122
174
|
PostRank::URI.clean(uri)
|
123
175
|
end
|
124
176
|
|
125
|
-
it "
|
126
|
-
c('http://igvita.com/?id=1').
|
127
|
-
c('igvita.com/?id=1').
|
177
|
+
it "unescapes, canonicalizes and normalizes" do
|
178
|
+
expect(c('http://igvita.com/?id=1')).to eq('http://igvita.com/?id=1')
|
179
|
+
expect(c('igvita.com/?id=1')).to eq('http://igvita.com/?id=1')
|
180
|
+
|
181
|
+
expect(c('http://igvita.com/?id= 1')).to eq('http://igvita.com/?id=%201')
|
182
|
+
expect(c('http://igvita.com/?id=+1')).to eq('http://igvita.com/?id=%201')
|
183
|
+
expect(c('http://igvita.com/?id%3D%201')).to eq('http://igvita.com/?id=%201')
|
184
|
+
|
185
|
+
expect(c('igvita.com/a/..?id=1&utm_source=a&awesm=b#c')).to eq('http://igvita.com/?id=1')
|
128
186
|
|
129
|
-
c('
|
130
|
-
c('
|
131
|
-
c('http://igvita.com/?id%3D%201').should == 'http://igvita.com/?id=%201'
|
187
|
+
expect(c('igvita.com?id=<>')).to eq('http://igvita.com/?id=%3C%3E')
|
188
|
+
expect(c('igvita.com?id="')).to eq('http://igvita.com/?id=%22')
|
132
189
|
|
133
|
-
c('
|
190
|
+
expect(c('test.tumblr.com/post/23223/text-stub')).to eq('http://test.tumblr.com/post/23223')
|
191
|
+
end
|
134
192
|
|
135
|
-
|
136
|
-
|
193
|
+
it "cleans host specific parameters" do
|
194
|
+
YAML.load_file('spec/c14n_hosts.yml').each do |orig, clean|
|
195
|
+
expect(c(orig)).to eq(clean)
|
196
|
+
end
|
137
197
|
end
|
138
198
|
|
139
|
-
|
140
|
-
|
141
|
-
c(
|
199
|
+
context "reserved characters" do
|
200
|
+
it "preserves encoded question marks" do
|
201
|
+
expect(c('http://en.wikipedia.org/wiki/Whose_Line_Is_It_Anyway%3F_%28U.S._TV_series%29')).
|
202
|
+
to eq('http://en.wikipedia.org/wiki/Whose_Line_Is_It_Anyway%3F_(U.S._TV_series)')
|
203
|
+
end
|
204
|
+
|
205
|
+
it "preserves encoded ampersands" do
|
206
|
+
expect(c('http://example.com/?foo=BAR%26BAZ')).
|
207
|
+
to eq('http://example.com/?foo=BAR%26BAZ')
|
208
|
+
end
|
209
|
+
|
210
|
+
it "preserves consecutive reserved characters" do
|
211
|
+
expect(c('http://example.com/so-quizical%3F%3F%3F?foo=bar')).
|
212
|
+
to eq('http://example.com/so-quizical%3F%3F%3F?foo=bar')
|
142
213
|
end
|
143
214
|
end
|
215
|
+
end
|
216
|
+
|
217
|
+
context "hash" do
|
218
|
+
def h(uri, opts = {})
|
219
|
+
PostRank::URI.hash(uri, opts)
|
220
|
+
end
|
221
|
+
|
222
|
+
it "computes the MD5 hash without cleaning the URI" do
|
223
|
+
hash = '55fae8910d312b7878a3201ed653b881'
|
224
|
+
|
225
|
+
expect(h('http://everburning.com/feed/post/1')).to eq(hash)
|
226
|
+
expect(h('everburning.com/feed/post/1')).not_to eq(hash)
|
227
|
+
end
|
228
|
+
|
229
|
+
it "normalizes the URI if requested and compute MD5 hash" do
|
230
|
+
hash = '55fae8910d312b7878a3201ed653b881'
|
144
231
|
|
232
|
+
expect(h('http://EverBurning.Com/feed/post/1', :clean => true)).to eq(hash)
|
233
|
+
expect(h('Everburning.com/feed/post/1', :clean => true)).to eq(hash)
|
234
|
+
expect(h('everburning.com/feed/post/1', :clean => true)).to eq(hash)
|
235
|
+
expect(h('everburning.com/feed/post/1/', :clean => true)).to eq(hash)
|
236
|
+
end
|
145
237
|
end
|
146
238
|
|
147
239
|
context "extract" do
|
@@ -150,37 +242,150 @@ describe PostRank::URI do
|
|
150
242
|
end
|
151
243
|
|
152
244
|
context "TLDs" do
|
153
|
-
it "
|
154
|
-
e("yah.lets").
|
245
|
+
it "does not pick up bad grammar as a domain name and think it has a link" do
|
246
|
+
expect(e("yah.lets")).to be_empty
|
155
247
|
end
|
156
248
|
|
157
|
-
it "
|
158
|
-
e('stuff.zz a.b.c d.zq').
|
249
|
+
it "does not pickup bad TLDS" do
|
250
|
+
expect(e('stuff.zz a.b.c d.zq')).to be_empty
|
159
251
|
end
|
160
252
|
end
|
161
253
|
|
162
|
-
it "
|
163
|
-
e(
|
164
|
-
e("text;http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
|
165
|
-
e("text.http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
|
166
|
-
e("text-http://spn.tw/tfnLT").should include("http://spn.tw/tfnLT")
|
254
|
+
it "extracts twitter links with hashbangs" do
|
255
|
+
expect(e('test http://twitter.com/#!/igrigorik')).to include('http://twitter.com/igrigorik')
|
167
256
|
end
|
168
257
|
|
169
|
-
it "
|
170
|
-
e(
|
258
|
+
it "extracts mobile twitter links with hashbangs" do
|
259
|
+
expect(e('test http://mobile.twitter.com/#!/_mm6')).to include('http://mobile.twitter.com/_mm6')
|
171
260
|
end
|
172
261
|
|
173
|
-
it "
|
262
|
+
it "handles a URL that comes after text without a space" do
|
263
|
+
expect(e("text:http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
264
|
+
expect(e("text;http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
265
|
+
expect(e("text.http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
266
|
+
expect(e("text-http://spn.tw/tfnLT")).to include("http://spn.tw/tfnLT")
|
267
|
+
end
|
268
|
+
|
269
|
+
it "does not pick up anything on or after the first . in the path of a URL with a shortener domain" do
|
270
|
+
expect(e("http://bit.ly/9cJ2mz......if ur pickin up anythign here, u FAIL.")).to eq(["http://bit.ly/9cJ2mz"])
|
271
|
+
end
|
272
|
+
|
273
|
+
it "picks up urls without protocol" do
|
174
274
|
u = e('abc.com abc.co')
|
175
|
-
u.
|
176
|
-
u.
|
275
|
+
expect(u).to include('http://abc.com/')
|
276
|
+
expect(u).to include('http://abc.co/')
|
277
|
+
end
|
278
|
+
|
279
|
+
it "picks up urls inside tags" do
|
280
|
+
u = e("<a href='http://bit.ly/3fds3'>abc.com</a>")
|
281
|
+
expect(u).to include('http://abc.com/')
|
177
282
|
end
|
178
283
|
|
179
284
|
context "multibyte characters" do
|
180
|
-
it "
|
181
|
-
e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食").
|
285
|
+
it "stops extracting URLs at the full-width CJK space character" do
|
286
|
+
expect(e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食")).to eq(["http://www.youtube.com/watch?v=w_j4Lda25jA"])
|
287
|
+
end
|
288
|
+
end
|
289
|
+
|
290
|
+
end
|
291
|
+
|
292
|
+
context "href extract" do
|
293
|
+
it "extracts links from html text" do
|
294
|
+
g,b = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
|
295
|
+
|
296
|
+
expect(g.first).to eq('http://google.com/')
|
297
|
+
expect(b.first).to eq('http://b.com/')
|
298
|
+
|
299
|
+
expect(g.last).to eq('link to google')
|
300
|
+
expect(b.last).to eq('stuff')
|
301
|
+
end
|
302
|
+
|
303
|
+
it "handles empty hrefs" do
|
304
|
+
expect do
|
305
|
+
l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
|
306
|
+
expect(l).to be_empty
|
307
|
+
end.not_to raise_error
|
308
|
+
end
|
309
|
+
|
310
|
+
context "relative paths" do
|
311
|
+
it "rejects relative paths" do
|
312
|
+
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
|
313
|
+
expect(l).to be_empty
|
314
|
+
end
|
315
|
+
|
316
|
+
it "resolves relative paths if host is provided" do
|
317
|
+
i = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com").first
|
318
|
+
expect(i.first).to eq('http://igvita.com/stuff')
|
319
|
+
expect(i.last).to eq('link to stuff')
|
320
|
+
end
|
321
|
+
end
|
322
|
+
|
323
|
+
context "domain extraction" do
|
324
|
+
url_list = {
|
325
|
+
"http://alex.pages.example.com" => "example.com",
|
326
|
+
"alex.pages.example.com" => "example.com",
|
327
|
+
"http://example.com/2011/04/01/blah" => "example.com",
|
328
|
+
"http://example.com" => "example.com",
|
329
|
+
"example.com" => "example.com",
|
330
|
+
"ExampLe.com" => "example.com",
|
331
|
+
"ExampLe.com:3000" => "example.com",
|
332
|
+
"http://alex.pages.example.COM" => "example.com",
|
333
|
+
"http://www.example.ag.it/2011/04/01/blah" => "example.ag.it",
|
334
|
+
"ftp://www.example.com/2011/04/01/blah" => 'example.com',
|
335
|
+
"http://com" => nil,
|
336
|
+
"http://alex.pages.examplecom" => nil,
|
337
|
+
"example" => nil,
|
338
|
+
"http://127.0.0.1" => nil,
|
339
|
+
"localhost" => nil,
|
340
|
+
"hello-there.com/you" => "hello-there.com"
|
341
|
+
}
|
342
|
+
|
343
|
+
url_list.each_pair do |url, expected_result|
|
344
|
+
it "extracts #{expected_result.inspect} from #{url}" do
|
345
|
+
u = PostRank::URI.clean(url, :raw => true)
|
346
|
+
expect(u.domain).to eq(expected_result)
|
347
|
+
end
|
182
348
|
end
|
183
349
|
end
|
184
350
|
end
|
185
351
|
|
186
|
-
|
352
|
+
context "parse" do
|
353
|
+
it 'does not fail on large host-part look-alikes' do
|
354
|
+
expect(PostRank::URI.parse('a'*64+'.ca').host).to eq(nil)
|
355
|
+
end
|
356
|
+
|
357
|
+
it 'does not pancake javascript scheme URIs' do
|
358
|
+
expect(PostRank::URI.parse('javascript:void(0);').scheme).to eq('javascript')
|
359
|
+
end
|
360
|
+
|
361
|
+
it 'does not pancake mailto scheme URIs' do
|
362
|
+
expect(PostRank::URI.parse('mailto:void(0);').scheme).to eq('mailto')
|
363
|
+
end
|
364
|
+
|
365
|
+
it 'does not pancake xmpp scheme URIs' do
|
366
|
+
expect(PostRank::URI.parse('xmpp:void(0);').scheme).to eq('xmpp')
|
367
|
+
end
|
368
|
+
end
|
369
|
+
|
370
|
+
context 'valid?' do
|
371
|
+
it 'marks incomplete URI string as invalid' do
|
372
|
+
expect(PostRank::URI.valid?('/path/page.html')).to be false
|
373
|
+
end
|
374
|
+
|
375
|
+
it 'marks www.test.c as invalid' do
|
376
|
+
expect(PostRank::URI.valid?('http://www.test.c')).to be false
|
377
|
+
end
|
378
|
+
|
379
|
+
it 'marks www.test.com as valid' do
|
380
|
+
expect(PostRank::URI.valid?('http://www.test.com')).to be true
|
381
|
+
end
|
382
|
+
|
383
|
+
it 'marks Unicode domain as valid (NOTE: works only with a scheme)' do
|
384
|
+
expect(PostRank::URI.valid?('http://президент.рф')).to be true
|
385
|
+
end
|
386
|
+
|
387
|
+
it 'marks punycode domain domain as valid' do
|
388
|
+
expect(PostRank::URI.valid?('xn--d1abbgf6aiiy.xn--p1ai')).to be true
|
389
|
+
end
|
390
|
+
end
|
391
|
+
end
|
metadata
CHANGED
@@ -1,114 +1,158 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: postrank-uri
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
segments:
|
6
|
-
- 1
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
version: 1.0.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.24
|
10
5
|
platform: ruby
|
11
|
-
authors:
|
6
|
+
authors:
|
12
7
|
- Ilya Grigorik
|
13
8
|
autorequire:
|
14
9
|
bindir: bin
|
15
10
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
date: 2019-04-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
21
14
|
name: addressable
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.4.0
|
20
|
+
type: :runtime
|
22
21
|
prerelease: false
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.4.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: public_suffix
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
26
31
|
- - ">="
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
|
29
|
-
|
30
|
-
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.0.0
|
34
|
+
- - "<"
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '2.1'
|
31
37
|
type: :runtime
|
32
|
-
version_requirements: *id001
|
33
|
-
- !ruby/object:Gem::Dependency
|
34
|
-
name: domainatrix
|
35
38
|
prerelease: false
|
36
|
-
|
37
|
-
|
38
|
-
requirements:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
39
41
|
- - ">="
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
|
42
|
-
|
43
|
-
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 2.0.0
|
44
|
+
- - "<"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.1'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.8.0
|
44
54
|
type: :runtime
|
45
|
-
|
46
|
-
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.8.0
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: rake
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0'
|
68
|
+
type: :development
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
- !ruby/object:Gem::Dependency
|
47
76
|
name: rspec
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
type: :development
|
48
83
|
prerelease: false
|
49
|
-
|
50
|
-
|
51
|
-
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: appraisal
|
91
|
+
requirement: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
52
93
|
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
|
55
|
-
|
56
|
-
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: 2.0.0
|
96
|
+
- - "<"
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '3.0'
|
57
99
|
type: :development
|
58
|
-
|
59
|
-
|
60
|
-
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: 2.0.0
|
106
|
+
- - "<"
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '3.0'
|
109
|
+
description: URI normalization, c14n, escaping, and extraction
|
110
|
+
email:
|
61
111
|
- ilya@igvita.com
|
62
112
|
executables: []
|
63
|
-
|
64
113
|
extensions: []
|
65
|
-
|
66
114
|
extra_rdoc_files: []
|
67
|
-
|
68
|
-
|
115
|
+
files:
|
116
|
+
- ".gitignore"
|
117
|
+
- ".rspec"
|
118
|
+
- ".travis.yml"
|
119
|
+
- Appraisals
|
69
120
|
- Gemfile
|
121
|
+
- LICENSE
|
70
122
|
- README.md
|
71
123
|
- Rakefile
|
72
124
|
- lib/postrank-uri.rb
|
73
|
-
- lib/postrank-uri/
|
125
|
+
- lib/postrank-uri/c14n.yml
|
74
126
|
- lib/postrank-uri/version.rb
|
75
127
|
- postrank-uri.gemspec
|
76
|
-
- spec/
|
128
|
+
- spec/c14n_hosts.yml
|
77
129
|
- spec/helper.rb
|
78
130
|
- spec/postrank-uri_spec.rb
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
131
|
+
homepage: http://github.com/postrank-labs/postrank-uri
|
132
|
+
licenses:
|
133
|
+
- MIT
|
134
|
+
metadata: {}
|
83
135
|
post_install_message:
|
84
136
|
rdoc_options: []
|
85
|
-
|
86
|
-
require_paths:
|
137
|
+
require_paths:
|
87
138
|
- lib
|
88
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
-
|
90
|
-
requirements:
|
139
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
91
141
|
- - ">="
|
92
|
-
- !ruby/object:Gem::Version
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
|
-
requirements:
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: 2.3.0
|
144
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
|
+
requirements:
|
99
146
|
- - ">="
|
100
|
-
- !ruby/object:Gem::Version
|
101
|
-
|
102
|
-
- 0
|
103
|
-
version: "0"
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: '0'
|
104
149
|
requirements: []
|
105
|
-
|
106
150
|
rubyforge_project: postrank-uri
|
107
|
-
rubygems_version:
|
151
|
+
rubygems_version: 2.6.11
|
108
152
|
signing_key:
|
109
|
-
specification_version:
|
110
|
-
summary: URI normalization,
|
111
|
-
test_files:
|
112
|
-
- spec/
|
153
|
+
specification_version: 4
|
154
|
+
summary: URI normalization, c14n, escaping, and extraction
|
155
|
+
test_files:
|
156
|
+
- spec/c14n_hosts.yml
|
113
157
|
- spec/helper.rb
|
114
158
|
- spec/postrank-uri_spec.rb
|