url_grey 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/README.md +39 -0
- data/Rakefile +7 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/url_grey.rb +256 -0
- data/lib/url_grey/version.rb +3 -0
- data/url_grey.gemspec +27 -0
- metadata +124 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e95f9c5eda3bb27c6c30f0cdd08aa46969030b22
|
4
|
+
data.tar.gz: cf5c2c804e16c7463274f117f1ef5d2c2222bbc7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c658e0642443497b7c33cb90f9c359deedf4b30dabf6d63fd252577ca77ea08b0eede2fa454fa5898cfb752c93de7cece4f12d93e06db2346837e44771faa7bf
|
7
|
+
data.tar.gz: d0e08cb022a5d7b23cfa54fde82bd99cdd6a74cb361d1d140166f211f5601ac41dcf35b8e795be61c3b0a1c8d98baac53eb903ef67b2e791a67d22c503841512
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# URLGrey
|
2
|
+
|
3
|
+
This attempts to copy Chromium's algorithm for making sense of things
|
4
|
+
typed into the url bar. You can download the [chromium source] to play
|
5
|
+
along, but note that it is currently 2.1 GB.
|
6
|
+
|
7
|
+
The ported code is very similar to how it is written in the original
|
8
|
+
C++. It is a great example of the imperative style of programming by
|
9
|
+
state mutation. I'm not gonna lie, it's pretty gross. But hey, it passes
|
10
|
+
the tests.
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
Some examples from the tests:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
URLGrey.new("google.com").fixed
|
18
|
+
#=> "http://google.com/"
|
19
|
+
|
20
|
+
URLGrey.new("www.google.com#foo").fixed
|
21
|
+
#=> "http://www.google.com/#foo"
|
22
|
+
|
23
|
+
URLGrey.new("\u6C34.com").fixed
|
24
|
+
#=> "http://xn--1rw.com/"
|
25
|
+
|
26
|
+
URLGrey.new("http://foo.com/s?q=\uC5C5").fixed
|
27
|
+
#=> "http://foo.com/s?q=%EC%97%85"
|
28
|
+
|
29
|
+
URLGrey.new("http;/www.google.com/").fixed
|
30
|
+
#=> "http://www.google.com/"
|
31
|
+
|
32
|
+
URLGrey.new("  foo.com/asdf  bar").fixed
|
33
|
+
#=> "http://foo.com/asdf%20%20bar"
|
34
|
+
|
35
|
+
URLGrey.new("[::]:80/path").fixed
|
36
|
+
#=> "http://[::]/path"
|
37
|
+
```
|
38
|
+
|
39
|
+
[chromium source]: https://chromium.googlesource.com/chromium/chromium/
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "url_grey"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/lib/url_grey.rb
ADDED
@@ -0,0 +1,256 @@
|
|
1
|
+
require "simpleidn"
|
2
|
+
|
3
|
+
require "url_grey/version"
|
4
|
+
|
5
|
+
# Parses loosely typed "URL bar" input and rebuilds it as a normalized URL,
# following the approach of Chromium's URL fixer (see the per-method source
# references below).
#
# Usage:
#   URLGrey.new("google.com").fixed  #=> "http://google.com/"
#
# The instance exposes the raw parsed components (scheme, username, password,
# host, port, path, query, ref) and a `fixed_*` method for each component that
# returns its normalized textual form.
class URLGrey
  # Characters that end the authority (user info + host + port) section.
  AUTHORITY_TERMINATORS = "/\\?#".freeze
  ABOUT_BLANK_URL = "about:blank".freeze
  # Path characters copied through unchanged.
  PATH_PASS_CHARS = "!$&'()*+,/:;=@[]".freeze
  PATH_UNESCAPE_CHARS = "-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~".freeze
  HOST_ESCAPE_CHARS = " !\"\#$&'()*,<=>@`{|}".freeze
  # Host characters copied through unchanged; everything else is escaped.
  HOST_NORMAL_CHARS = "+-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz".freeze
  # Host substituted when an about:/chrome: URL has no host.
  HOST_CHROME_DEFAULT = "version".freeze
  # Query bytes copied through unchanged; everything else is escaped.
  QUERY_NORMAL_CHARS = "!$%&()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~".freeze
  # Ports that are omitted from the output for the given scheme.
  DEFAULT_PORTS = {
    ftp:    21,
    gopher: 70,
    http:   80,
    https:  443,
    ws:     80,
    wss:    443,
  }.freeze
  STANDARD_SCHEMES = ['http', 'https', 'file', 'ftp', 'gopher', 'ws', 'wss', 'filesystem'].freeze

  attr_accessor :original, :coerced
  attr_accessor :scheme, :username, :password, :host, :port, :path, :query, :ref
  attr_accessor :slashes

  # @param _original [String] the text as typed by the user; leading
  #   whitespace is discarded before parsing.
  def initialize(_original)
    self.original = _original.sub(%r{^\s*}, '')

    parse!
  end

  # @return [Hash{Symbol => String, nil}] the raw (un-normalized) components.
  #   `query` and `ref` are nil when absent; the others are "" when absent.
  def parts
    {
      scheme:   scheme,
      username: username,
      password: password,
      host:     host,
      port:     port,
      path:     path,
      query:    query,
      ref:      ref
    }
  end

  # @return [String] the fully normalized URL.
  def fixed
    # Return a copy so callers can never mutate the shared constant.
    return ABOUT_BLANK_URL.dup if original == ABOUT_BLANK_URL

    [
      fixed_scheme,
      fixed_credentials,
      fixed_host,
      fixed_port,
      fixed_path,
      fixed_query,
      fixed_ref
    ].join
  end

  # @return [String] "user@", "user:password@", or "" when there is no user
  #   info at all.
  def fixed_credentials
    return "" if username.empty? && password.empty?
    return "#{username}@" if password.empty?

    "#{username}:#{password}@"
  end

  # from components/url_formatter/url_fixer.cc FixupHost
  #
  # @return [String] the host with whitespace removed, lowercased, stray
  #   leading/trailing dots trimmed, and either percent-escaped (ASCII hosts)
  #   or converted with SimpleIDN.to_ascii (non-ASCII hosts).
  def fixed_host
    cleaned = host.gsub(%r{\s}, '').downcase

    # A host consisting only of dots (or nothing) is left untouched.
    unless cleaned.match(%r{^\.*$})
      cleaned = cleaned.sub(%r{^\.*}, '')
      # Keep a single trailing dot, drop any extras after it.
      cleaned = cleaned.sub(%r{(?<=\.)\.*$}, '')
    end

    if cleaned.empty? && ["about", "chrome"].include?(scheme)
      cleaned = HOST_CHROME_DEFAULT
    end

    return SimpleIDN.to_ascii(cleaned) unless cleaned.ascii_only?

    cleaned.chars.map do |char|
      HOST_NORMAL_CHARS.include?(char) ? char : percent_encode(char)
    end.join
  end

  # from url/url_canon_path.cc CanonicalizePath
  #
  # @return [String] the path with a leading "/" added for standard schemes
  #   and every character outside the pass-through sets percent-encoded as
  #   its UTF-8 octets.
  def fixed_path
    raw = path
    if raw[0] != '/' && (STANDARD_SCHEMES + ["about", "chrome"]).include?(scheme)
      raw = '/' + raw
    end

    raw.chars.map do |char|
      # TODO: if a dot is preceded by a slash, do directory stuff:
      # google.com/abc/.././def -> google.com/def
      # NOTE(review): "%" is not in the pass-through sets, so an existing
      # escape such as "%20" is re-escaped to "%2520" -- confirm intended.
      if PATH_PASS_CHARS.include?(char) || PATH_UNESCAPE_CHARS.include?(char) || char == "."
        char
      else
        percent_encode(char)
      end
    end.join
  end

  # @return [String] ":<port>", or "" when the port is absent or is the
  #   default port for the scheme.
  def fixed_port
    return "" if port.empty? || port.to_i == DEFAULT_PORTS[scheme.to_sym]

    ":#{port}"
  end

  # @return [String] "?<query>" with every byte outside QUERY_NORMAL_CHARS
  #   percent-encoded, or "" when there is no query.
  def fixed_query
    return "" if query.nil?

    # Computed once per call rather than once per byte.
    allowed = QUERY_NORMAL_CHARS.bytes
    escaped = query.bytes.map do |byte|
      allowed.include?(byte) ? [byte].pack("U") : format("%%%02X", byte)
    end

    "?#{escaped.join}"
  end

  # @return [String] "#<ref>", or "" when there is no fragment.
  def fixed_ref
    return "" if ref.nil?

    "\##{ref}"
  end

  # @return [String] the scheme followed by "://" for known schemes, or by
  #   ":" plus the slashes originally typed for anything else. "about" is
  #   rewritten to "chrome".
  def fixed_scheme
    name = scheme == "about" ? "chrome" : scheme

    if (STANDARD_SCHEMES + ["about", "chrome"]).include?(name)
      "#{name}://"
    else
      "#{name}:#{slashes}"
    end
  end

  private

  # Splits the coerced input into its components and stores them on self.
  def parse!
    parse_scheme!
    after_scheme = coerced.match(%r{:(.*)})[1]
    self.slashes, after_slashes = after_scheme.match(%r{^([\\\/]*)(.*)$})[1..2]

    authority, full_path = split_authority(after_slashes)

    # The last "@" separates user info from the server; no "@" means no
    # user info (rpartition then yields "" for the first element).
    user_info, _at, server_info = authority.rpartition("@")

    # The first ":" separates username from password; both default to "".
    self.username, _colon, self.password = user_info.partition(":")

    parse_server_info(server_info)
    parse_full_path(full_path)
  end

  # Splits text at the first authority terminator ('/', '\', '?', '#').
  # Returns [authority, rest]; rest keeps the terminator and is "" when no
  # terminator is present.
  def split_authority(text)
    cut = text.index(%r{[#{Regexp.escape(AUTHORITY_TERMINATORS)}]})
    return [text, ""] if cut.nil?

    [text[0...cut], text[cut..-1]]
  end

  # Stores host and port from "host", "host:port", "[v6]" or "[v6]:port".
  def parse_server_info(server_info)
    last_colon   = server_info.rindex(":")
    last_bracket = server_info.rindex("]")

    has_port_part =
      if last_colon.nil?
        false
      elsif last_bracket
        # Only a colon after the closing bracket introduces a port.
        last_colon > last_bracket
      else
        # An unterminated "[..." is treated as all host.
        !server_info.start_with?("[")
      end

    if has_port_part
      self.host = server_info[0...last_colon]
      self.port = server_info[(last_colon + 1)..-1]
    else
      self.host = server_info
      self.port = ""
    end
  end

  # Stores path, query and ref; query and ref are nil when their delimiter
  # is absent (as opposed to "" when present but empty).
  def parse_full_path(full_path)
    before_ref, hash_mark, fragment = full_path.partition("#")
    self.ref = hash_mark.empty? ? nil : fragment

    path_part, question_mark, query_part = before_ref.partition("?")
    self.path  = path_part
    self.query = question_mark.empty? ? nil : query_part
  end

  # Determines the scheme, coercing the input when none was typed:
  # a first ";" is treated as a mistyped ":" if that yields a scheme, and
  # otherwise "ftp://" (for hosts starting with "ftp.") or "http://" is
  # prepended. Sets `coerced` and `scheme`.
  def parse_scheme!
    self.coerced = original

    if !find_scheme(original) && original[0] != ";"
      semicolon_fixed = original.sub(";", ":")
      self.coerced = semicolon_fixed if find_scheme(semicolon_fixed)
    end

    unless find_scheme(coerced)
      prefix = coerced.match(%r{^ftp\.}i) ? "ftp://" : "http://"
      self.coerced = prefix + coerced
    end

    self.scheme = find_scheme(coerced) || ""
  end

  # Returns the lowercased scheme of text, "" when text starts with ":",
  # or false when text has no usable scheme.
  def find_scheme(text)
    # extract scheme
    found = text.match(%r{^(.*?):})
    return false unless found

    component = found[1].downcase
    return "" if component.empty?

    # first character must be a letter
    return false unless component.match(%r{^[a-z]})

    # reject anything with invalid characters
    return false unless component.match(%r{^[+\-0-9a-z]*$})

    # fix up segmentation for "www:123/"
    return false if has_port(text)

    component
  end

  # Truthy when whatever follows the first ":" (up to the next terminator
  # or the end of text) consists solely of digits.
  def has_port(text)
    return false unless text.include?(":")

    found = text.match(%r{:(.*?)[\\/\?#]}) || text.match(%r{:(.*)$})
    found[1].match(%r{^\d+$})
  end

  # Percent-encodes every byte of text as an uppercase two-digit "%XX"
  # escape, so multi-byte characters become one escape per UTF-8 octet and
  # small byte values keep their leading zero.
  def percent_encode(text)
    text.bytes.map { |byte| format("%%%02X", byte) }.join
  end
end
|
data/url_grey.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'url_grey/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "url_grey"
|
8
|
+
spec.version = URLGrey::VERSION
|
9
|
+
spec.authors = ["Stacey Touset"]
|
10
|
+
spec.email = ["capicue@gmail.com"]
|
11
|
+
|
12
|
+
spec.summary = "Coerce and normalize user inputted URLs"
|
13
|
+
spec.homepage = "https://github.com/capicue/url_grey"
|
14
|
+
spec.licenses = ["MIT"]
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
17
|
+
spec.bindir = "exe"
|
18
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_dependency "simpleidn", "~> 0.0.7"
|
22
|
+
|
23
|
+
spec.add_development_dependency "bundler", "~> 1.10"
|
24
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
25
|
+
spec.add_development_dependency "minitest", "~> 5.8"
|
26
|
+
spec.add_development_dependency "pry", "~> 0.10"
|
27
|
+
end
|
metadata
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_grey
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stacey Touset
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-02-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: simpleidn
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.0.7
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.0.7
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.10'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.10'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5.8'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '5.8'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.10'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.10'
|
83
|
+
description:
|
84
|
+
email:
|
85
|
+
- capicue@gmail.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".travis.yml"
|
92
|
+
- Gemfile
|
93
|
+
- README.md
|
94
|
+
- Rakefile
|
95
|
+
- bin/console
|
96
|
+
- bin/setup
|
97
|
+
- lib/url_grey.rb
|
98
|
+
- lib/url_grey/version.rb
|
99
|
+
- url_grey.gemspec
|
100
|
+
homepage: https://github.com/capicue/url_grey
|
101
|
+
licenses:
|
102
|
+
- MIT
|
103
|
+
metadata: {}
|
104
|
+
post_install_message:
|
105
|
+
rdoc_options: []
|
106
|
+
require_paths:
|
107
|
+
- lib
|
108
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
113
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
requirements: []
|
119
|
+
rubyforge_project:
|
120
|
+
rubygems_version: 2.5.1
|
121
|
+
signing_key:
|
122
|
+
specification_version: 4
|
123
|
+
summary: Coerce and normalize user inputted URLs
|
124
|
+
test_files: []
|