url_parser 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'forwardable'
|
3
|
+
require 'public_suffix'
|
4
|
+
|
5
|
+
module UrlParser
  # Wraps a host name, normalizes it to its ASCII form and validates it
  # against the usual DNS limits (label count, label length, label
  # characters, total length) and the public suffix list.
  #
  # The suffix-derived readers (+subdomain+, +domain+, +tld+, +sld+, +trd+,
  # +to_s+) are delegated to #suffix.
  class Domain
    extend Forwardable

    # A label consists of letters, digits and hyphens and must neither start
    # nor end with a hyphen. \A/\z anchor the whole string (^/$ would only
    # anchor a single line), and the trailing check is a lookbehind so that a
    # label such as "foo-" is actually rejected.
    VALID_LABEL = /\A(?!-)[a-z0-9-]*(?<!-)\z/i

    # Values reported by the suffix readers when the name cannot be parsed.
    SUFFIX_DEFAULTS = {
      subdomain: nil,
      domain: nil,
      tld: nil,
      sld: nil,
      trd: nil,
      to_s: ''
    }.freeze

    attr_reader :original, :name

    attr_accessor :errors

    def_delegators :suffix, *SUFFIX_DEFAULTS.keys

    # name    - the host name; converted with #to_s, downcased and stripped of
    #           one trailing dot.
    # options - accepted for interface compatibility; not used here.
    def initialize(name, options = {})
      @original = name.to_s.downcase.chomp('.')
      @name = normalize
      @errors = []
      @validated = false
    end

    # The individual labels of the normalized name.
    def labels
      PublicSuffix::Domain.domain_to_labels(name)
    end

    # The parsed public-suffix representation of the name.
    #
    # When parsing raises, an error message is recorded and a stand-in object
    # answering the SUFFIX_DEFAULTS keys is returned instead. The result is
    # memoized so that repeated calls (every delegated reader goes through
    # here) neither re-parse the name nor record the same error again.
    def suffix
      @suffix ||= begin
        PublicSuffix.parse(name)
      rescue
        errors << "'#{original}' is not a valid domain"
        OpenStruct.new(SUFFIX_DEFAULTS).tap do |os|
          # Remove to_s from the stand-in's singleton class so the call is
          # routed through OpenStruct's attribute lookup instead.
          os.singleton_class.send(:undef_method, :to_s)
        end
      end
    end

    # Runs the validations once and reports whether no errors were recorded.
    def valid?
      validate unless @validated
      errors.empty?
    end

    private

    # ASCII (IDNA) form of the original name.
    def normalize
      Addressable::IDNA.to_ascii(original)
    end

    def validate
      validate_labels
      validate_label_length
      validate_label_format
      validate_total_length
      validate_suffix

      @validated = true
    end

    def validate_labels
      errors << "exceeds 127 labels" if labels.count > 127
    end

    # http://tools.ietf.org/html/rfc1034#section-3.1
    #
    def validate_label_length
      if labels.any? { |label| label.length > 63 }
        errors << "exceeds maximum label length of 63 characters"
      end
    end

    def validate_label_format
      if labels.any? { |label| label !~ VALID_LABEL }
        errors << "contains invalid characters"
      end
    end

    # https://blogs.msdn.microsoft.com/oldnewthing/20120412-00/?p=7873/
    #
    def validate_total_length
      errors << "exceeds 253 ASCII characters" if name.length > 253
    end

    # Parsing the suffix records an error as a side effect when it fails.
    def validate_suffix
      suffix
    end

  end
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
module UrlParser
  # Read-only view over a parsed URI and its parsed domain, exposing the
  # individual URI components under descriptive names.
  class Model

    attr_reader :parsed_uri, :parsed_domain

    # uri    - an Addressable::URI.
    # domain - an optional UrlParser::Domain; when omitted one is built from
    #          the URI's hostname.
    #
    # Raises RequiresAddressableURI / RequiresUrlParserDomain when an argument
    # is of the wrong type.
    def initialize(uri, domain = nil)
      unless uri.is_a?(Addressable::URI)
        raise RequiresAddressableURI,
          "#{uri} must be an Addressable::URI"
      end

      if domain && !domain.is_a?(UrlParser::Domain)
        raise RequiresUrlParserDomain,
          "#{domain} must be a UrlParser::Domain"
      end

      @parsed_uri = uri
      @parsed_domain = domain || UrlParser::Domain.new(uri.hostname)
    end

    # Top level URI naming structure / protocol.
    #
    def scheme
      parsed_uri.scheme
    end

    # Username portion of the userinfo.
    #
    def username
      parsed_uri.user
    end
    alias_method :user, :username

    # Password portion of the userinfo.
    #
    def password
      parsed_uri.password
    end

    # URI username and password for authentication.
    #
    def userinfo
      parsed_uri.userinfo
    end

    # Fully qualified domain name or IP address.
    #
    def hostname
      parsed_uri.host
    end

    # Fully qualified domain name or IP address without ww? prefix.
    #
    # The prefix is only removed when it is followed by a literal dot; the
    # prefix text is escaped before being interpolated into the pattern.
    #
    def naked_hostname
      prefix = www
      return hostname unless prefix && hostname

      hostname.sub(/\A#{Regexp.escape(prefix)}\./, '')
    end

    # Port number.
    #
    def port
      parsed_uri.port
    end

    # Hostname and port.
    #
    def host
      result = [ hostname, port ].compact.join(':')
      result.empty? ? nil : result
    end

    # The ww? portion of the subdomain.
    #
    # Only returned when the first subdomain label consists entirely of
    # "ww" or "www" optionally followed by digits (e.g. "www", "www2"); a
    # label that merely contains such a run (e.g. "wwwfoo") is not a prefix.
    #
    def www
      trd.split('.').first.to_s[/\Awww?\d*\z/] if trd
    end

    # Returns the top level domain portion, aka the extension.
    #
    def tld
      parsed_domain.tld
    end
    alias_method :top_level_domain, :tld
    alias_method :extension, :tld

    # Returns the second level domain portion, aka the domain part.
    #
    def sld
      parsed_domain.sld
    end
    alias_method :second_level_domain, :sld
    alias_method :domain_name, :sld

    # Returns the third level domain portion, aka the subdomain part.
    #
    def trd
      parsed_domain.trd
    end
    alias_method :third_level_domain, :trd
    alias_method :subdomains, :trd

    # Any non-ww? subdomains.
    #
    # Returns nil when the ww? label is the only subdomain.
    #
    def naked_trd
      prefix = www
      return trd unless trd && prefix

      trd[/(?<=\A#{Regexp.escape(prefix)}\.).+/]
    end
    alias_method :naked_subdomain, :naked_trd

    # The domain name with the tld.
    #
    def domain
      parsed_domain.domain
    end

    # All subdomains, include ww?.
    #
    def subdomain
      parsed_domain.subdomain
    end

    # Scheme and host.
    #
    # The literal string "null" reported by the URI is mapped to nil.
    #
    def origin
      original_origin = parsed_uri.origin
      original_origin == "null" ? nil : original_origin
    end

    # Userinfo and host.
    #
    def authority
      parsed_uri.authority
    end

    # Scheme, userinfo, and host.
    #
    def site
      parsed_uri.site
    end

    # Directory and segment.
    #
    def path
      parsed_uri.path
    end

    # Last portion of the path.
    #
    # nil when there is no path or the path ends with a slash.
    #
    def segment
      return unless path
      return if path =~ /\/\z/

      path.split('/').last
    end

    # Any directories following the site within the URI.
    #
    def directory
      return if path.nil? || path.empty?

      parts = path.split('/')
      return '/' if parts.empty?

      # Drop the trailing segment (if any) so only directories remain, and
      # make sure the result starts with a slash.
      parts.pop unless segment.to_s.empty?
      parts.unshift('') unless parts.first.to_s.empty?
      parts.compact.join('/')
    end

    # Segment if a file extension is present.
    #
    def filename
      segment.to_s[/.+\..+/]
    end

    # The file extension of the filename.
    #
    def suffix
      return unless path

      ext = File.extname(path).sub(/\A\./, '')
      ext.empty? ? nil : ext
    end

    # Params and values as a string.
    #
    def query
      parsed_uri.query
    end

    # A hash of params and values.
    #
    def query_values
      parsed_uri.query_values.to_h
    end

    # Fragment identifier.
    #
    def fragment
      parsed_uri.fragment
    end

    # Segment, query, and fragment.
    #
    def resource
      name = [ segment, query_string, fragment_string ].compact.join
      name.empty? ? nil : name
    end

    # Directory and resource - everything after the site.
    #
    def location
      return directory + resource.to_s if directory == '/'

      result = [ directory, resource ].compact.join('/')
      result.empty? ? nil : result
    end

    private

    # The query prefixed with "?", or nil when there is no query.
    def query_string
      query ? "?#{query}" : nil
    end

    # The fragment prefixed with "#", or nil when there is no fragment.
    def fragment_string
      fragment ? "##{fragment}" : nil
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module UrlParser
  # Collects parser options by recording which bang-methods a block invokes.
  #
  # The block is called with the setter itself when the options are
  # requested (#to_hash). The methods defined here flip the matching option
  # to true; any other message is accepted and ignored.
  class OptionSetter

    attr_reader :blk

    attr_accessor :options

    # options - the hash the flags are written into.
    # blk     - optional block, called with the setter by #to_hash.
    def initialize(options = {}, &blk)
      @options = options
      @blk = blk
    end

    def unescape!
      options[:unescape] = true
    end

    def unembed!
      options[:unembed] = true
    end

    def canonicalize!
      options[:canonicalize] = true
    end

    def normalize!
      options[:normalize] = true
    end

    # Enables all four flags above.
    def clean!
      unescape!
      unembed!
      canonicalize!
      normalize!
    end

    # Runs the block (if any) against this setter and returns the options.
    def to_hash
      blk.call(self) if blk
      options
    end
    alias_method :to_h, :to_hash

    # Every unknown message is silently accepted and returns nil.
    def method_missing(*args)
      # no-op
    end

    # Keeps respond_to? consistent with the catch-all method_missing above.
    def respond_to_missing?(name, include_private = false)
      true
    end

  end
end
|
@@ -0,0 +1,206 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'digest/sha1'
|
3
|
+
|
4
|
+
module UrlParser
  # Parses a URI and offers the individual clean-up steps (unescape, parse,
  # unembed, canonicalize, normalize), each as a non-bang method returning
  # the result and a bang method storing it back into #uri.
  class Parser

    class << self

      # Builds a parser for +uri+ and returns its result.
      #
      # With a block, the parser is yielded so the caller chooses the steps;
      # without one, only parse! is run. Returns nil when +uri+ is nil/false,
      # the raw string when the :raw option is set, and the parser's uri
      # otherwise.
      def call(uri, options = {}, &blk)
        return nil unless uri

        parser = new(uri, options)
        if block_given?
          yield parser
        else
          parser.parse!
        end

        parser.raw? ? parser.raw : parser.uri
      end
      alias_method :parse, :call

    end

    attr_reader \
      :uri,
      :domain,
      :default_scheme,
      :scheme_map,
      :options

    # uri     - the URI to work on.
    # options - :base_uri, :default_scheme, :embedded_params, :scheme_map and
    #           :raw are extracted (the middle three default to the values of
    #           UrlParser.configuration); everything else is kept in #options.
    #
    # The extraction works on a copy so the caller's hash is left untouched
    # and can be reused for further calls.
    def initialize(uri, options = {})
      options = options.dup

      @uri = uri
      @domain = nil
      @base_uri = options.delete(:base_uri) { nil }
      @default_scheme = options.delete(:default_scheme) {
        UrlParser.configuration.default_scheme
      }
      @embedded_params = options.delete(:embedded_params) {
        UrlParser.configuration.embedded_params
      }
      @scheme_map = options.delete(:scheme_map) {
        UrlParser.configuration.scheme_map
      }
      @raw = options.delete(:raw) { false }
      @options = options
    end

    # The base URI as a string; falls back to the uri itself.
    def base_uri
      (@base_uri ? @base_uri : uri).to_s
    end

    def embedded_params
      UrlParser.wrap(@embedded_params)
    end

    def raw?
      !!@raw
    end

    def set_default_scheme?
      !!@default_scheme
    end

    def unescape
      UrlParser.unescape(uri)
    end

    def unescape!
      @uri = unescape
    end

    # Returns the uri as an Addressable::URI (unchanged if it already is one).
    #
    # Without an explicit :host option the host is taken from the parsed URI
    # or, failing that, from the first path part; when that yields a valid
    # domain the path and host are rewritten accordingly. The scheme is then
    # mapped through scheme_map and defaulted if configured.
    def parse
      return uri if uri.is_a?(Addressable::URI)

      Addressable::URI.parse(base_uri).tap do |parsed_uri|
        parsed_uri.join!(uri) if @base_uri

        if options[:host]
          parsed_uri.host = options[:host]
        else
          parts = parsed_uri.path.to_s.split(/[\/:]/)
          hostname = parsed_uri.host || parts.first
          @domain = UrlParser::Domain.new(hostname)
          if @domain.valid?
            # Rebuild the path without the host part, keeping a trailing slash.
            parsed_uri.path = '/' +
              parts.drop(1).join('/') +
              parsed_uri.path[/(?<=\/).*(\/)\s*$/, 1].to_s
            parsed_uri.host = @domain.name
          end
        end

        if scheme_map.has_key?(parsed_uri.scheme)
          parsed_uri.scheme = scheme_map[parsed_uri.scheme]
        end

        if set_default_scheme? && parsed_uri.host && !parsed_uri.scheme
          parsed_uri.scheme = default_scheme
        end

        # Only reached without a domain when the host came from options[:host],
        # so the domain has to be built from the URI's host.
        if parsed_uri.host && !domain
          @domain = UrlParser::Domain.new(parsed_uri.host)
        end
      end
    end

    def parse!
      @uri = parse
    end

    # Returns the first http(s) URI with a host found in one of the
    # embedded_params query values, parsed by a new parser; otherwise the
    # parsed uri itself.
    def unembed
      original = parse

      candidates = original.query_values.select do |key, value|
        embedded_params.include?(key) &&
          value =~ Addressable::URI::URIREGEX
      end.values if original.query_values

      embed = candidates.find do |candidate|
        parsed = Addressable::URI.parse(candidate)
        %w(http https).include?(parsed.scheme) && parsed.host
      end if candidates

      embed ? self.class.call(embed, raw: raw?) : original
    end
    alias_method :embedded, :unembed

    def unembed!
      @uri = unembed
    end
    alias_method :embedded!, :unembed!

    # Squeezes repeated slashes, drops a trailing slash (unless the path is a
    # single character), removes an empty query, strips the query, clears the
    # fragment and finally applies the URI's own normalize!.
    def normalize
      parse.tap do |parsed|
        parsed.path = parsed.path.squeeze('/')
        parsed.path = parsed.path.chomp('/') if parsed.path.size != 1
        parsed.query = nil if parsed.query && parsed.query.empty?
        parsed.query = parsed.query.strip if parsed.query
        parsed.fragment = nil

        parsed.normalize!
      end
    end

    def normalize!
      @uri = normalize
    end

    # Removes query params listed in UrlParser::DB[:global], and those listed
    # in UrlParser::DB[:hosts] for an entry whose key occurs in the host.
    def canonicalize
      parse.tap do |parsed|
        matches_global_param = proc do |key, value|
          UrlParser::DB[:global].include?(key)
        end

        matches_host_based_param = proc do |key, value|
          UrlParser::DB[:hosts].find do |host, param|
            parsed.host =~ Regexp.new(Regexp.escape(host)) && param.include?(key)
          end
        end

        parsed.query_values = parsed.query_values(Array).tap do |params|
          params.delete_if(&matches_global_param)
          params.delete_if(&matches_host_based_param)
        end if parsed.query_values
      end
    end
    alias_method :c14n, :canonicalize

    def canonicalize!
      @uri = canonicalize
    end
    alias_method :c14n!, :canonicalize!

    # The uri as a string.
    def raw
      uri.to_s
    end

    def raw!
      @uri = raw
    end

    # Hex SHA1 digest of the raw uri.
    def sha1
      Digest::SHA1.hexdigest(raw)
    end
    # NOTE(review): this replaces Object#hash with a method returning a hex
    # String; kept because it is part of the public interface.
    alias_method :hash, :sha1

    # Runs every clean-up step in order.
    def clean!
      unescape!
      parse!
      unembed!
      canonicalize!
      normalize!
      raw! if raw?
    end

    # Two parsers are equal when their fully cleaned uris have the same SHA1.
    def ==(uri)
      opts = options.merge(raw: false)
      one = dup.tap { |parser| parser.clean! }
      two = self.class.new(uri, opts).tap { |parser| parser.clean! }

      one.sha1 == two.sha1
    end

  end
end
|