url_parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'forwardable'
|
3
|
+
require 'public_suffix'
|
4
|
+
|
5
|
+
module UrlParser
  # Wraps a host name: lower-cases it, strips one trailing dot, converts it
  # to its ASCII form and exposes the public-suffix breakdown (tld, sld,
  # trd, ...) together with a set of validations.
  class Domain
    extend Forwardable

    # A label may only contain letters, digits and hyphens and must neither
    # start nor end with a hyphen. \A and \z anchor the whole string (^ and $
    # would only anchor a single line), and the trailing check is a
    # look-behind so that a hyphen in the last position is really rejected.
    VALID_LABEL = /\A(?!-)[a-z0-9\-]*(?<!-)\z/i

    # Values answered by #suffix when the name cannot be parsed.
    SUFFIX_DEFAULTS = {
      subdomain: nil,
      domain: nil,
      tld: nil,
      sld: nil,
      trd: nil,
      to_s: ''
    }.freeze

    attr_reader :original, :name

    attr_accessor :errors

    # subdomain, domain, tld, sld, trd and to_s are answered by #suffix.
    def_delegators :suffix, *SUFFIX_DEFAULTS.keys

    # name    - anything responding to #to_s; it is lower-cased and a single
    #           trailing '.' is removed before normalization.
    # options - accepted for interface compatibility; not read here.
    def initialize(name, options = {})
      @original = name.to_s.downcase.chomp('.')
      @name = normalize
      @errors = []
      @validated = false
      @suffix = nil
    end

    # The labels of the name as returned by
    # PublicSuffix::Domain.domain_to_labels.
    def labels
      PublicSuffix::Domain.domain_to_labels(name)
    end

    # The parsed public-suffix representation of the name.
    #
    # The result is memoized: parsing happens once, and when it fails the
    # error message is recorded once and a placeholder answering
    # SUFFIX_DEFAULTS is kept. Without the memoization every delegated call
    # (tld, sld, ...) on an unparsable name appended another copy of the
    # same error.
    def suffix
      @suffix ||= begin
        PublicSuffix.parse(name)
      rescue
        errors << "'#{original}' is not a valid domain"
        OpenStruct.new(SUFFIX_DEFAULTS).tap do |os|
          # Remove the object's own to_s so the call is not answered by the
          # built-in implementation.
          os.instance_eval('undef to_s')
        end
      end
    end

    # Runs the validations on first use and returns true when no errors
    # were recorded.
    def valid?
      validate unless @validated
      errors.empty?
    end

    private

    # ASCII form of the original name.
    def normalize
      Addressable::IDNA.to_ascii(original)
    end

    def validate
      validate_labels
      validate_label_length
      validate_label_format
      validate_total_length
      validate_suffix

      @validated = true
    end

    def validate_labels
      if labels.count > 127
        errors << "exceeds 127 labels"
      end
    end

    # http://tools.ietf.org/html/rfc1034#section-3.1
    #
    def validate_label_length
      return if labels.empty?

      if labels.max_by(&:length).length > 63
        errors << "exceeds maximum label length of 63 characters"
      end
    end

    def validate_label_format
      if labels.any? { |label| !(label =~ VALID_LABEL) }
        errors << "contains invalid characters"
      end
    end

    # https://blogs.msdn.microsoft.com/oldnewthing/20120412-00/?p=7873/
    #
    def validate_total_length
      if name.length > 253
        errors << "exceeds 253 ASCII characters"
      end
    end

    # Parsing the suffix records an error when the name is not a valid
    # domain.
    def validate_suffix
      suffix
    end

  end
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
module UrlParser
  # Read-only view over a parsed URI and its parsed domain, exposing the
  # individual URI components under descriptive names.
  class Model

    attr_reader :parsed_uri, :parsed_domain

    # uri    - an Addressable::URI (raises RequiresAddressableURI otherwise).
    # domain - optional UrlParser::Domain (raises RequiresUrlParserDomain
    #          when given and of another type). When omitted, a domain is
    #          built from uri.hostname.
    def initialize(uri, domain = nil)
      unless uri.is_a?(Addressable::URI)
        raise RequiresAddressableURI,
          "#{uri} must be an Addressable::URI"
      end

      unless domain.is_a?(UrlParser::Domain)
        raise RequiresUrlParserDomain,
          "#{domain} must be a UrlParser::Domain"
      end if domain

      @parsed_uri = uri
      @parsed_domain = domain || UrlParser::Domain.new(uri.hostname)
    end

    # Top level URI naming structure / protocol.
    #
    def scheme
      parsed_uri.scheme
    end

    # Username portion of the userinfo.
    #
    def username
      parsed_uri.user
    end
    alias_method :user, :username

    # Password portion of the userinfo.
    #
    def password
      parsed_uri.password
    end

    # URI username and password for authentication.
    #
    def userinfo
      parsed_uri.userinfo
    end

    # Fully qualified domain name or IP address.
    #
    def hostname
      parsed_uri.host
    end

    # Fully qualified domain name or IP address without ww? prefix.
    #
    # Only a leading "<www>." is removed; the dot is matched literally so
    # no unrelated character is swallowed. Returns the hostname unchanged
    # (possibly nil) when there is no ww? prefix.
    #
    def naked_hostname
      prefix = www
      return hostname unless prefix && hostname

      hostname.sub(/\A#{Regexp.escape(prefix)}\./, '')
    end

    # Port number.
    #
    def port
      parsed_uri.port
    end

    # Hostname and port.
    #
    def host
      result = [ hostname, port ].compact.join(':')
      result.empty? ? nil : result
    end

    # The ww? portion of the subdomain.
    #
    # The first label of the subdomain must consist entirely of "ww" or
    # "www" followed by optional digits; labels that merely contain such a
    # sequence (e.g. "wwwfoo" or "mywww") are not treated as a ww? prefix.
    #
    def www
      trd.split('.').first.to_s[/\Awww?\d*\z/] if trd
    end

    # Returns the top level domain portion, aka the extension.
    #
    def tld
      parsed_domain.tld
    end
    alias_method :top_level_domain, :tld
    alias_method :extension, :tld

    # Returns the second level domain portion, aka the domain part.
    #
    def sld
      parsed_domain.sld
    end
    alias_method :second_level_domain, :sld
    alias_method :domain_name, :sld

    # Returns the third level domain portion, aka the subdomain part.
    #
    def trd
      parsed_domain.trd
    end
    alias_method :third_level_domain, :trd
    alias_method :subdomains, :trd

    # Any non-ww? subdomains.
    #
    # Returns nil when the subdomain consists of the ww? prefix only.
    #
    def naked_trd
      prefix = www
      return trd unless trd && prefix

      trd[/(?<=\A#{Regexp.escape(prefix)}\.).+/]
    end
    alias_method :naked_subdomain, :naked_trd

    # The domain name with the tld.
    #
    def domain
      parsed_domain.domain
    end

    # All subdomains, include ww?.
    #
    def subdomain
      parsed_domain.subdomain
    end

    # Scheme and host.
    #
    # The literal string "null" reported by the parsed URI is mapped to nil.
    #
    def origin
      original_origin = parsed_uri.origin
      original_origin == "null" ? nil : original_origin
    end

    # Userinfo and host.
    #
    def authority
      parsed_uri.authority
    end

    # Scheme, userinfo, and host.
    #
    def site
      parsed_uri.site
    end

    # Directory and segment.
    #
    def path
      parsed_uri.path
    end

    # Last portion of the path.
    #
    # nil when there is no path or when the path ends with a slash.
    #
    def segment
      (path =~ /\/\z/ ? nil : path.split('/').last) if path
    end

    # Any directories following the site within the URI.
    #
    # nil for a nil or empty path, '/' when the path holds nothing but
    # slashes, otherwise the path without its trailing segment.
    #
    def directory
      unless path.nil? || path.empty?
        parts = path.split('/')
        if parts.empty?
          '/'
        else
          # Drop the trailing segment, keeping only the directories.
          parts.pop unless segment.to_s.empty?
          # Make sure the joined result starts with a slash.
          parts.unshift('') unless parts.first.to_s.empty?
          parts.compact.join('/')
        end
      end
    end

    # Segment if a file extension is present.
    #
    def filename
      segment.to_s[/.+\..+/]
    end

    # The file extension of the filename.
    #
    # Returned without the leading dot; nil when there is none.
    #
    def suffix
      if path
        ext = File.extname(path)
        ext[0] = '' if ext[0] == '.'
        ext.empty? ? nil : ext
      end
    end

    # Params and values as a string.
    #
    def query
      parsed_uri.query
    end

    # A hash of params and values.
    #
    def query_values
      parsed_uri.query_values.to_h
    end

    # Fragment identifier.
    #
    def fragment
      parsed_uri.fragment
    end

    # Segment, query, and fragment.
    #
    def resource
      name = [ segment, query_string, fragment_string ].compact.join
      name.empty? ? nil : name
    end

    # Directory and resource - everything after the site.
    #
    def location
      if directory == '/'
        directory + resource.to_s
      else
        result = [ directory, resource ].compact.join('/')
        result.empty? ? nil : result
      end
    end

    private

    # The query prefixed with '?', or nil when there is no query.
    def query_string
      query ? "?#{query}" : nil
    end

    # The fragment prefixed with '#', or nil when there is no fragment.
    def fragment_string
      fragment ? "##{fragment}" : nil
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module UrlParser
  # Builds an options hash from an initial hash plus an optional block that
  # switches individual flags on by calling the bang methods below.
  class OptionSetter

    attr_reader :blk

    attr_accessor :options

    # options - the hash the flags are written into.
    # blk     - optional block; it receives this object each time #to_hash
    #           is called.
    def initialize(options = {}, &blk)
      @blk = blk
      @options = options
    end

    # Switch on the :unescape flag.
    def unescape!
      options.store(:unescape, true)
    end

    # Switch on the :unembed flag.
    def unembed!
      options.store(:unembed, true)
    end

    # Switch on the :canonicalize flag.
    def canonicalize!
      options.store(:canonicalize, true)
    end

    # Switch on the :normalize flag.
    def normalize!
      options.store(:normalize, true)
    end

    # Switch on all four flags, in the same order as the individual calls.
    def clean!
      %w(unescape! unembed! canonicalize! normalize!).map do |step|
        public_send(step)
      end.last
    end

    # Runs the block (when one was given) against this object and returns
    # the resulting options hash.
    def to_hash
      blk.call(self) unless blk.nil?
      options
    end
    alias_method :to_h, :to_hash

    # Any other message is silently ignored and answers nil.
    def method_missing(*_ignored)
      nil
    end

  end
end
|
@@ -0,0 +1,206 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'digest/sha1'
|
3
|
+
|
4
|
+
module UrlParser
  # Turns a URI string (or Addressable::URI) into a parsed URI and offers
  # the individual clean-up steps: unescape, parse, unembed, canonicalize
  # and normalize. Each step has a non-bang form returning the result and a
  # bang form that stores the result back into #uri.
  class Parser

    class << self

      # Builds a parser for uri and returns the resulting URI.
      #
      # With a block the new parser is yielded so the caller can choose the
      # steps to run; without one only parse! is run. Returns nil for a nil
      # (or false) uri, the string form when the :raw option is set, and
      # the parser's uri otherwise.
      def call(uri, options = {}, &blk)
        return nil unless uri

        parser = new(uri, options).tap do |instance|
          if block_given?
            yield instance
          else
            instance.parse!
          end
        end

        parser.raw? ? parser.raw : parser.uri
      end
      alias_method :parse, :call

    end

    attr_reader \
      :uri,
      :domain,
      :default_scheme,
      :scheme_map,
      :options

    # uri     - the URI to work on.
    # options - hash; :base_uri, :default_scheme, :embedded_params,
    #           :scheme_map and :raw are removed from it (the passed hash
    #           itself is modified) and the remainder is kept as #options.
    #           Missing :default_scheme, :embedded_params and :scheme_map
    #           fall back to UrlParser.configuration.
    def initialize(uri, options = {})
      @uri = uri
      @domain = nil
      @base_uri = options.delete(:base_uri) { nil }
      @default_scheme = options.delete(:default_scheme) {
        UrlParser.configuration.default_scheme
      }
      @embedded_params = options.delete(:embedded_params) {
        UrlParser.configuration.embedded_params
      }
      @scheme_map = options.delete(:scheme_map) {
        UrlParser.configuration.scheme_map
      }
      @raw = options.delete(:raw) { false }
      @options = options
    end

    # The configured base URI, or the URI itself when none was given, as a
    # string.
    def base_uri
      (@base_uri ? @base_uri : uri).to_s
    end

    # The embedded params setting passed through UrlParser.wrap.
    def embedded_params
      UrlParser.wrap(@embedded_params)
    end

    # Whether results should be returned in string form.
    def raw?
      !!@raw
    end

    # Whether a default scheme is configured.
    def set_default_scheme?
      !!@default_scheme
    end

    def unescape
      UrlParser.unescape(uri)
    end

    def unescape!
      @uri = unescape
    end

    # Parses the URI into an Addressable::URI. An Addressable::URI is
    # returned as is.
    #
    # When no :host option is given, the host is taken from the parsed URI
    # or, failing that, from the first part of the path; if that yields a
    # valid domain the path and host are rewritten accordingly. Afterwards
    # the scheme is mapped through scheme_map and the default scheme is
    # applied to host-bearing URIs without one.
    def parse
      return uri if uri.is_a?(Addressable::URI)

      Addressable::URI.parse(base_uri).tap do |parsed_uri|
        parsed_uri.join!(uri) if @base_uri

        if options[:host]
          parsed_uri.host = options[:host]
        else
          parts = parsed_uri.path.to_s.split(/[\/:]/)
          hostname = parsed_uri.host || parts.first
          @domain = UrlParser::Domain.new(hostname)
          if @domain.valid?
            # Rebuild the path without the host part, keeping a trailing
            # slash when the original path had one.
            parsed_uri.path = '/' +
              parts.drop(1).join('/') +
              parsed_uri.path[/(?<=\/).*(\/)\s*$/, 1].to_s
            parsed_uri.host = @domain.name
          end
        end

        if scheme_map.has_key?(parsed_uri.scheme)
          parsed_uri.scheme = scheme_map[parsed_uri.scheme]
        end

        if parsed_uri.host && !parsed_uri.scheme
          parsed_uri.scheme = default_scheme
        end if set_default_scheme?

        # Reached with no domain only when the host came from the :host
        # option, so the domain must be built from the URI's host (the
        # local hostname above is never assigned on that path).
        if parsed_uri.host && !domain
          @domain = UrlParser::Domain.new(parsed_uri.host)
        end
      end
    end

    def parse!
      @uri = parse
    end

    # Returns the first http(s) URI with a host found among the values of
    # the embedded params in the query, run through Parser.call; returns
    # the parsed URI itself when there is none.
    def unembed
      original = parse

      candidates = original.query_values.select do |key, value|
        embedded_params.include?(key) &&
          value =~ Addressable::URI::URIREGEX
      end.values if original.query_values

      embed = candidates.find do |candidate|
        parsed = Addressable::URI.parse(candidate)
        %w(http https).include?(parsed.scheme) && parsed.host
      end if candidates

      embed ? self.class.call(embed, raw: raw?) : original
    end
    alias_method :embedded, :unembed

    def unembed!
      @uri = unembed
    end
    alias_method :embedded!, :unembed!

    # Parses the URI, squeezes repeated slashes, drops a trailing slash
    # (unless the path is a single character), clears an empty query,
    # strips the query, removes the fragment and finally applies the URI's
    # own normalize!.
    def normalize
      parse.tap do |parsed|
        parsed.path = parsed.path.squeeze('/')
        parsed.path = parsed.path.chomp('/') if parsed.path.size != 1
        parsed.query = nil if parsed.query && parsed.query.empty?
        parsed.query = parsed.query.strip if parsed.query
        parsed.fragment = nil

        parsed.normalize!
      end
    end

    def normalize!
      @uri = normalize
    end

    # Parses the URI and removes every query param listed in
    # UrlParser::DB[:global], plus those listed under a UrlParser::DB[:hosts]
    # entry whose key occurs in the URI's host.
    def canonicalize
      parse.tap do |parsed|
        # Both predicates stay procs: each query pair is handed over as a
        # two-element array and must be split into key and value.
        matches_global_param = proc do |key, _value|
          UrlParser::DB[:global].include?(key)
        end

        matches_host_based_param = proc do |key, _value|
          UrlParser::DB[:hosts].find do |host, param|
            parsed.host =~ Regexp.new(Regexp.escape(host)) && param.include?(key)
          end
        end

        parsed.query_values = parsed.query_values(Array).tap do |params|
          params.delete_if(&matches_global_param)
          params.delete_if(&matches_host_based_param)
        end if parsed.query_values
      end
    end
    alias_method :c14n, :canonicalize

    def canonicalize!
      @uri = canonicalize
    end
    alias_method :c14n!, :canonicalize!

    # The URI in string form.
    def raw
      uri.to_s
    end

    def raw!
      @uri = raw
    end

    # SHA1 hex digest of the string form.
    def sha1
      Digest::SHA1.hexdigest(raw)
    end
    # NOTE(review): this makes #hash answer a String instead of the Integer
    # Object#hash returns; kept because it is part of the public interface.
    alias_method :hash, :sha1

    # Runs every step in order; converts to a string last when raw? is set.
    def clean!
      unescape!
      parse!
      unembed!
      canonicalize!
      normalize!
      raw! if raw?
    end

    # Two parsers are equal when their fully cleaned URIs have the same
    # SHA1 digest. The other side is built from uri with this parser's
    # remaining options and raw disabled.
    def ==(uri)
      opts = options.merge(raw: false)
      one = self.dup.tap { |parser| parser.clean! }
      two = self.class.new(uri, opts).tap { |parser| parser.clean! }

      one.sha1 == two.sha1
    end

  end
end
|