twingly-url 4.2.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +136 -25
- data/lib/twingly/public_suffix_list.rb +37 -0
- data/lib/twingly/url.rb +14 -8
- data/lib/twingly/version.rb +1 -1
- metadata +19 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5c842338446cf451ce1bfc0530f3ef53552208a6
|
|
4
|
+
data.tar.gz: b36802a8c3f58444b84254d73dfc5666a0d11b13
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 803792e18a70bf53df2fc8e50b380e02244985e43c632ced88932ab0c14baa914835d514ddfaa03d0a4441275580acaa9562cca2f28a346ab803cdf20f0ff491
|
|
7
|
+
data.tar.gz: e169afed23eaabe22730db9416fa10c43a19629ee1374d97c83cf3a49de220b686137da39c2ab69e0f09fa7a6c097be3068d6a9f67ce077ae07d45ed99ae0bc6
|
data/README.md
CHANGED
|
@@ -27,33 +27,132 @@ Usage (this output was created with [`examples/url.rb`][examples]):
|
|
|
27
27
|
require "twingly/url"
|
|
28
28
|
|
|
29
29
|
url = Twingly::URL.parse("http://www.twingly.co.uk/search")
|
|
30
|
-
url.scheme
|
|
31
|
-
url.
|
|
32
|
-
url.
|
|
33
|
-
url.
|
|
34
|
-
url.
|
|
35
|
-
url.
|
|
36
|
-
url.
|
|
37
|
-
url.
|
|
38
|
-
url.
|
|
39
|
-
url.
|
|
40
|
-
url.
|
|
30
|
+
url.scheme # => "http"
|
|
31
|
+
url.normalized.scheme # => "http"
|
|
32
|
+
url.trd # => "www"
|
|
33
|
+
url.normalized.trd # => "www"
|
|
34
|
+
url.sld # => "twingly"
|
|
35
|
+
url.normalized.sld # => "twingly"
|
|
36
|
+
url.tld # => "co.uk"
|
|
37
|
+
url.normalized.tld # => "co.uk"
|
|
38
|
+
url.ttld # => "uk"
|
|
39
|
+
url.normalized.ttld # => "uk"
|
|
40
|
+
url.domain # => "twingly.co.uk"
|
|
41
|
+
url.normalized.domain # => "twingly.co.uk"
|
|
42
|
+
url.host # => "www.twingly.co.uk"
|
|
43
|
+
url.normalized.host # => "www.twingly.co.uk"
|
|
44
|
+
url.origin # => "http://www.twingly.co.uk"
|
|
45
|
+
url.normalized.origin # => "http://www.twingly.co.uk"
|
|
46
|
+
url.path # => "/search"
|
|
47
|
+
url.normalized.path # => "/search"
|
|
48
|
+
url.without_scheme # => "//www.twingly.co.uk/search"
|
|
49
|
+
url.normalized.without_scheme # => "//www.twingly.co.uk/search"
|
|
50
|
+
url.userinfo # => ""
|
|
51
|
+
url.normalized.userinfo # => ""
|
|
52
|
+
url.user # => ""
|
|
53
|
+
url.normalized.user # => ""
|
|
54
|
+
url.password # => ""
|
|
55
|
+
url.normalized.password # => ""
|
|
56
|
+
url.valid? # => "true"
|
|
57
|
+
url.normalized.valid? # => "true"
|
|
58
|
+
url.to_s # => "http://www.twingly.co.uk/search"
|
|
59
|
+
url.normalized.to_s # => "http://www.twingly.co.uk/search"
|
|
60
|
+
|
|
61
|
+
url = Twingly::URL.parse("http://räksmörgås.макдональдс.рф/foo")
|
|
62
|
+
url.scheme # => "http"
|
|
63
|
+
url.normalized.scheme # => "http"
|
|
64
|
+
url.trd # => "räksmörgås"
|
|
65
|
+
url.normalized.trd # => "xn--rksmrgs-5wao1o"
|
|
66
|
+
url.sld # => "макдональдс"
|
|
67
|
+
url.normalized.sld # => "xn--80aalb1aicli8a5i"
|
|
68
|
+
url.tld # => "рф"
|
|
69
|
+
url.normalized.tld # => "xn--p1ai"
|
|
70
|
+
url.ttld # => "рф"
|
|
71
|
+
url.normalized.ttld # => "xn--p1ai"
|
|
72
|
+
url.domain # => "макдональдс.рф"
|
|
73
|
+
url.normalized.domain # => "xn--80aalb1aicli8a5i.xn--p1ai"
|
|
74
|
+
url.host # => "räksmörgås.макдональдс.рф"
|
|
75
|
+
url.normalized.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
76
|
+
url.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
77
|
+
url.normalized.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
78
|
+
url.path # => "/foo"
|
|
79
|
+
url.normalized.path # => "/foo"
|
|
80
|
+
url.without_scheme # => "//räksmörgås.макдональдс.рф/foo"
|
|
81
|
+
url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
|
|
82
|
+
url.userinfo # => ""
|
|
83
|
+
url.normalized.userinfo # => ""
|
|
84
|
+
url.user # => ""
|
|
85
|
+
url.normalized.user # => ""
|
|
86
|
+
url.password # => ""
|
|
87
|
+
url.normalized.password # => ""
|
|
88
|
+
url.valid? # => "true"
|
|
89
|
+
url.normalized.valid? # => "true"
|
|
90
|
+
url.to_s # => "http://räksmörgås.макдональдс.рф/foo"
|
|
91
|
+
url.normalized.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
|
|
92
|
+
|
|
93
|
+
url = Twingly::URL.parse("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo")
|
|
94
|
+
url.scheme # => "http"
|
|
95
|
+
url.normalized.scheme # => "http"
|
|
96
|
+
url.trd # => "xn--rksmrgs-5wao1o"
|
|
97
|
+
url.normalized.trd # => "xn--rksmrgs-5wao1o"
|
|
98
|
+
url.sld # => "xn--80aalb1aicli8a5i"
|
|
99
|
+
url.normalized.sld # => "xn--80aalb1aicli8a5i"
|
|
100
|
+
url.tld # => "xn--p1ai"
|
|
101
|
+
url.normalized.tld # => "xn--p1ai"
|
|
102
|
+
url.ttld # => "xn--p1ai"
|
|
103
|
+
url.normalized.ttld # => "xn--p1ai"
|
|
104
|
+
url.domain # => "xn--80aalb1aicli8a5i.xn--p1ai"
|
|
105
|
+
url.normalized.domain # => "xn--80aalb1aicli8a5i.xn--p1ai"
|
|
106
|
+
url.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
107
|
+
url.normalized.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
108
|
+
url.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
109
|
+
url.normalized.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
|
|
110
|
+
url.path # => "/foo"
|
|
111
|
+
url.normalized.path # => "/foo"
|
|
112
|
+
url.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
|
|
113
|
+
url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
|
|
114
|
+
url.userinfo # => ""
|
|
115
|
+
url.normalized.userinfo # => ""
|
|
116
|
+
url.user # => ""
|
|
117
|
+
url.normalized.user # => ""
|
|
118
|
+
url.password # => ""
|
|
119
|
+
url.normalized.password # => ""
|
|
120
|
+
url.valid? # => "true"
|
|
121
|
+
url.normalized.valid? # => "true"
|
|
122
|
+
url.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
|
|
123
|
+
url.normalized.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
|
|
41
124
|
|
|
42
125
|
url = Twingly::URL.parse("https://admin:correcthorsebatterystaple@example.com/")
|
|
43
|
-
url.scheme
|
|
44
|
-
url.
|
|
45
|
-
url.
|
|
46
|
-
url.
|
|
47
|
-
url.
|
|
48
|
-
url.
|
|
49
|
-
url.
|
|
50
|
-
url.
|
|
51
|
-
url.
|
|
52
|
-
url.
|
|
53
|
-
url.
|
|
54
|
-
url.
|
|
55
|
-
url.
|
|
56
|
-
url.
|
|
126
|
+
url.scheme # => "https"
|
|
127
|
+
url.normalized.scheme # => "https"
|
|
128
|
+
url.trd # => ""
|
|
129
|
+
url.normalized.trd # => "www"
|
|
130
|
+
url.sld # => "example"
|
|
131
|
+
url.normalized.sld # => "example"
|
|
132
|
+
url.tld # => "com"
|
|
133
|
+
url.normalized.tld # => "com"
|
|
134
|
+
url.ttld # => "com"
|
|
135
|
+
url.normalized.ttld # => "com"
|
|
136
|
+
url.domain # => "example.com"
|
|
137
|
+
url.normalized.domain # => "example.com"
|
|
138
|
+
url.host # => "example.com"
|
|
139
|
+
url.normalized.host # => "www.example.com"
|
|
140
|
+
url.origin # => "https://example.com"
|
|
141
|
+
url.normalized.origin # => "https://www.example.com"
|
|
142
|
+
url.path # => "/"
|
|
143
|
+
url.normalized.path # => "/"
|
|
144
|
+
url.without_scheme # => "//admin:correcthorsebatterystaple@example.com/"
|
|
145
|
+
url.normalized.without_scheme # => "//admin:correcthorsebatterystaple@www.example.com/"
|
|
146
|
+
url.userinfo # => "admin:correcthorsebatterystaple"
|
|
147
|
+
url.normalized.userinfo # => "admin:correcthorsebatterystaple"
|
|
148
|
+
url.user # => "admin"
|
|
149
|
+
url.normalized.user # => "admin"
|
|
150
|
+
url.password # => "correcthorsebatterystaple"
|
|
151
|
+
url.normalized.password # => "correcthorsebatterystaple"
|
|
152
|
+
url.valid? # => "true"
|
|
153
|
+
url.normalized.valid? # => "true"
|
|
154
|
+
url.to_s # => "https://admin:correcthorsebatterystaple@example.com/"
|
|
155
|
+
url.normalized.to_s # => "https://admin:correcthorsebatterystaple@www.example.com/"
|
|
57
156
|
```
|
|
58
157
|
|
|
59
158
|
### Dependencies
|
|
@@ -63,6 +162,14 @@ The gem requires libidn.
|
|
|
63
162
|
sudo apt-get install libidn11 # Ubuntu
|
|
64
163
|
brew install libidn # OS X
|
|
65
164
|
|
|
165
|
+
## Development
|
|
166
|
+
|
|
167
|
+
To inspect the [Public Suffix List], this handy command can be used (also works in projects that use `twingly-url` as a dependency).
|
|
168
|
+
|
|
169
|
+
open $(bundle show public_suffix)/data/list.txt
|
|
170
|
+
|
|
171
|
+
[Public Suffix List]: https://github.com/weppos/publicsuffix-ruby
|
|
172
|
+
|
|
66
173
|
## Tests
|
|
67
174
|
|
|
68
175
|
Run tests with
|
|
@@ -91,6 +198,10 @@ Note that this isn't a benchmark, we're using [ruby-prof] which will slow things
|
|
|
91
198
|
|
|
92
199
|
bundle exec rake release
|
|
93
200
|
|
|
201
|
+
* Update the changelog with [GitHub Changelog Generator](https://github.com/skywinder/github-changelog-generator/) (`gem install github_changelog_generator` if you don't have it, set `CHANGELOG_GITHUB_TOKEN` to a personal access token to avoid rate limiting by GitHub). This command will update `CHANGELOG.md`; commit and push the changes manually.
|
|
202
|
+
|
|
203
|
+
github_changelog_generator
|
|
204
|
+
|
|
94
205
|
[twingly-rubygems]: https://rubygems.org/profiles/twingly
|
|
95
206
|
[ruby-prof]: http://ruby-prof.rubyforge.org/
|
|
96
207
|
[examples]: examples/url.rb
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require "public_suffix"
|
|
2
|
+
|
|
3
|
+
module Twingly
|
|
4
|
+
class PublicSuffixList
|
|
5
|
+
ACE_PREFIX = /\Axn\-\-/i.freeze
|
|
6
|
+
|
|
7
|
+
private_constant :ACE_PREFIX
|
|
8
|
+
|
|
9
|
+
# Extend the PSL with ASCII form of all internationalized domain names
|
|
10
|
+
def self.with_punycoded_names
|
|
11
|
+
list_data = File.read(PublicSuffix::List::DEFAULT_LIST_PATH)
|
|
12
|
+
list = PublicSuffix::List.parse(list_data, private_domains: false)
|
|
13
|
+
|
|
14
|
+
punycoded_names(list).each do |punycoded_name|
|
|
15
|
+
new_rule = PublicSuffix::Rule.factory(punycoded_name)
|
|
16
|
+
list.add(new_rule, reindex: false)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
list.reindex!
|
|
20
|
+
|
|
21
|
+
list
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
private_class_method \
|
|
25
|
+
def self.punycoded_names(list)
|
|
26
|
+
names = list.map { |rule| Addressable::IDNA.to_ascii(rule.value) }
|
|
27
|
+
names.select { |name| punycoded_name?(name) }
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private_class_method \
|
|
31
|
+
def self.punycoded_name?(name)
|
|
32
|
+
PublicSuffix::Domain.name_to_labels(name).any? do |label|
|
|
33
|
+
label =~ ACE_PREFIX
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
data/lib/twingly/url.rb
CHANGED
|
@@ -2,17 +2,17 @@ require "addressable/uri"
|
|
|
2
2
|
require "addressable/idna/native"
|
|
3
3
|
require "public_suffix"
|
|
4
4
|
|
|
5
|
+
require_relative "public_suffix_list"
|
|
5
6
|
require_relative "url/null_url"
|
|
6
7
|
require_relative "url/error"
|
|
7
8
|
require_relative "version"
|
|
8
9
|
|
|
9
|
-
PublicSuffix::List.private_domains = false
|
|
10
|
-
|
|
11
10
|
module Twingly
|
|
12
11
|
class URL
|
|
13
12
|
include Comparable
|
|
14
13
|
|
|
15
14
|
ACCEPTED_SCHEMES = /\Ahttps?\z/i
|
|
15
|
+
CUSTOM_PSL = PublicSuffixList.with_punycoded_names
|
|
16
16
|
ENDS_WITH_SLASH = /\/+$/
|
|
17
17
|
ERRORS_TO_EXTEND = [
|
|
18
18
|
Addressable::URI::InvalidURIError,
|
|
@@ -20,7 +20,10 @@ module Twingly
|
|
|
20
20
|
IDN::Idna::IdnaError,
|
|
21
21
|
]
|
|
22
22
|
|
|
23
|
-
private_constant :ACCEPTED_SCHEMES
|
|
23
|
+
private_constant :ACCEPTED_SCHEMES
|
|
24
|
+
private_constant :CUSTOM_PSL
|
|
25
|
+
private_constant :ENDS_WITH_SLASH
|
|
26
|
+
private_constant :ERRORS_TO_EXTEND
|
|
24
27
|
|
|
25
28
|
class << self
|
|
26
29
|
def parse(potential_url)
|
|
@@ -36,9 +39,12 @@ module Twingly
|
|
|
36
39
|
scheme = addressable_uri.scheme
|
|
37
40
|
raise Twingly::URL::Error::ParseError unless scheme =~ ACCEPTED_SCHEMES
|
|
38
41
|
|
|
39
|
-
|
|
42
|
+
# URLs that can't be normalized should not be valid
|
|
43
|
+
try_addressable_normalize(addressable_uri)
|
|
40
44
|
|
|
41
|
-
|
|
45
|
+
host = addressable_uri.host
|
|
46
|
+
public_suffix_domain = PublicSuffix.parse(host, list: CUSTOM_PSL,
|
|
47
|
+
default_rule: nil)
|
|
42
48
|
raise Twingly::URL::Error::ParseError if public_suffix_domain.nil?
|
|
43
49
|
|
|
44
50
|
raise Twingly::URL::Error::ParseError if public_suffix_domain.sld.nil?
|
|
@@ -63,8 +69,8 @@ module Twingly
|
|
|
63
69
|
|
|
64
70
|
# Workaround for the following bug in addressable:
|
|
65
71
|
# https://github.com/sporkmonger/addressable/issues/224
|
|
66
|
-
def
|
|
67
|
-
addressable_uri.
|
|
72
|
+
def try_addressable_normalize(addressable_uri)
|
|
73
|
+
addressable_uri.normalize
|
|
68
74
|
rescue ArgumentError => error
|
|
69
75
|
if error.message.include?("invalid byte sequence in UTF-8")
|
|
70
76
|
raise Twingly::URL::Error::ParseError
|
|
@@ -76,7 +82,7 @@ module Twingly
|
|
|
76
82
|
private :new
|
|
77
83
|
private :internal_parse
|
|
78
84
|
private :to_addressable_uri
|
|
79
|
-
private :
|
|
85
|
+
private :try_addressable_normalize
|
|
80
86
|
end
|
|
81
87
|
|
|
82
88
|
def initialize(addressable_uri, public_suffix_domain)
|
data/lib/twingly/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: twingly-url
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 5.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Twingly AB
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-
|
|
11
|
+
date: 2016-09-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: addressable
|
|
@@ -16,28 +16,40 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '2'
|
|
19
|
+
version: '2.4'
|
|
20
|
+
- - ">="
|
|
21
|
+
- !ruby/object:Gem::Version
|
|
22
|
+
version: 2.4.0
|
|
20
23
|
type: :runtime
|
|
21
24
|
prerelease: false
|
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
26
|
requirements:
|
|
24
27
|
- - "~>"
|
|
25
28
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '2'
|
|
29
|
+
version: '2.4'
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 2.4.0
|
|
27
33
|
- !ruby/object:Gem::Dependency
|
|
28
34
|
name: public_suffix
|
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|
|
30
36
|
requirements:
|
|
31
37
|
- - "~>"
|
|
32
38
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
39
|
+
version: '2.0'
|
|
40
|
+
- - ">="
|
|
41
|
+
- !ruby/object:Gem::Version
|
|
42
|
+
version: 2.0.2
|
|
34
43
|
type: :runtime
|
|
35
44
|
prerelease: false
|
|
36
45
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
46
|
requirements:
|
|
38
47
|
- - "~>"
|
|
39
48
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
49
|
+
version: '2.0'
|
|
50
|
+
- - ">="
|
|
51
|
+
- !ruby/object:Gem::Version
|
|
52
|
+
version: 2.0.2
|
|
41
53
|
- !ruby/object:Gem::Dependency
|
|
42
54
|
name: idn-ruby
|
|
43
55
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -116,6 +128,7 @@ extensions: []
|
|
|
116
128
|
extra_rdoc_files: []
|
|
117
129
|
files:
|
|
118
130
|
- README.md
|
|
131
|
+
- lib/twingly/public_suffix_list.rb
|
|
119
132
|
- lib/twingly/url.rb
|
|
120
133
|
- lib/twingly/url/error.rb
|
|
121
134
|
- lib/twingly/url/hasher.rb
|