url_parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+ require 'ostruct'
2
+ require 'forwardable'
3
+ require 'public_suffix'
4
+
5
module UrlParser
  # A host name together with its public-suffix breakdown and a set of
  # syntactic validations.
  #
  # The lower-cased input (minus one trailing dot) is kept in +original+ and
  # its IDNA/ASCII form in +name+. The suffix parts (+tld+, +sld+, +trd+,
  # +domain+, +subdomain+, +to_s+) are delegated to the object returned by
  # #suffix. Validation messages are collected in +errors+.
  class Domain
    extend Forwardable

    # One host-name label: ASCII letters, digits and hyphens, neither
    # starting nor ending with a hyphen.
    #
    # The trailing-hyphen check must be a look-behind: a look-ahead placed
    # right before the end anchor can never fail. \A and \z anchor the whole
    # string, so a multi-line value cannot pass on its first line alone.
    VALID_LABEL = /\A(?!-)[a-z0-9-]*(?<!-)\z/i

    # Values answered by the fallback suffix object when the name cannot be
    # parsed; the keys double as the list of delegated methods.
    SUFFIX_DEFAULTS = {
      subdomain: nil,
      domain:    nil,
      tld:       nil,
      sld:       nil,
      trd:       nil,
      to_s:      ''
    }.freeze

    attr_reader :original, :name

    attr_accessor :errors

    def_delegators :suffix, *SUFFIX_DEFAULTS.keys

    # name    - Anything responding to #to_s; it is lower-cased and one
    #           trailing '.' is removed.
    # options - Accepted for interface compatibility; not read here.
    def initialize(name, options = {})
      @original  = name.to_s.downcase.chomp('.')
      @name      = normalize
      @errors    = []
      @validated = false
      @suffix    = nil
    end

    # Labels of +name+ as returned by PublicSuffix::Domain.domain_to_labels.
    def labels
      PublicSuffix::Domain.domain_to_labels(name)
    end

    # The parsed public-suffix object for +name+.
    #
    # The result is memoized, so a name that fails to parse records its error
    # exactly once no matter how many delegated readers are called. On failure
    # an OpenStruct answering SUFFIX_DEFAULTS is returned instead.
    def suffix
      @suffix ||= begin
        PublicSuffix.parse(name)
      rescue StandardError
        errors << "'#{original}' is not a valid domain"
        OpenStruct.new(SUFFIX_DEFAULTS).tap do |os|
          # Remove the singleton-level to_s, exactly as the original code did.
          os.instance_eval('undef to_s')
        end
      end
    end

    # Runs the validations on first use and reports whether no errors were
    # recorded.
    def valid?
      validate unless @validated
      errors.empty?
    end

    private

    # ASCII (punycode) form of the original name.
    def normalize
      Addressable::IDNA.to_ascii(original)
    end

    # Runs every check once; each check appends to +errors+ on failure.
    def validate
      validate_labels
      validate_label_length
      validate_label_format
      validate_total_length
      validate_suffix

      @validated = true
    end

    def validate_labels
      errors << "exceeds 127 labels" if labels.count > 127
    end

    # http://tools.ietf.org/html/rfc1034#section-3.1
    #
    def validate_label_length
      return unless labels.any? { |label| label.length > 63 }

      errors << "exceeds maximum label length of 63 characters"
    end

    def validate_label_format
      return unless labels.any? { |label| label !~ VALID_LABEL }

      errors << "contains invalid characters"
    end

    # https://blogs.msdn.microsoft.com/oldnewthing/20120412-00/?p=7873/
    #
    def validate_total_length
      errors << "exceeds 253 ASCII characters" if name.length > 253
    end

    # Forces the suffix parse so that a parse failure is recorded in +errors+.
    def validate_suffix
      suffix
    end

  end
end
@@ -0,0 +1,233 @@
1
module UrlParser
  # Read-only view over a parsed URI and its parsed domain.
  #
  # Most readers forward straight to the wrapped Addressable::URI
  # (+parsed_uri+) or UrlParser::Domain (+parsed_domain+); the rest derive
  # path and sub-domain pieces from those values.
  class Model

    attr_reader :parsed_uri, :parsed_domain

    # uri    - An Addressable::URI.
    # domain - Optional UrlParser::Domain; when omitted one is built from
    #          uri.hostname.
    #
    # Raises RequiresAddressableURI or RequiresUrlParserDomain when an
    # argument has the wrong type.
    def initialize(uri, domain = nil)
      unless uri.is_a?(Addressable::URI)
        raise RequiresAddressableURI,
          "#{uri} must be an Addressable::URI"
      end

      if domain && !domain.is_a?(UrlParser::Domain)
        raise RequiresUrlParserDomain,
          "#{domain} must be a UrlParser::Domain"
      end

      @parsed_uri    = uri
      @parsed_domain = domain || UrlParser::Domain.new(uri.hostname)
    end

    # Top level URI naming structure / protocol.
    #
    def scheme
      parsed_uri.scheme
    end

    # Username portion of the userinfo.
    #
    def username
      parsed_uri.user
    end
    alias_method :user, :username

    # Password portion of the userinfo.
    #
    def password
      parsed_uri.password
    end

    # URI username and password for authentication.
    #
    def userinfo
      parsed_uri.userinfo
    end

    # Fully qualified domain name or IP address.
    #
    def hostname
      parsed_uri.host
    end

    # Fully qualified domain name or IP address without ww? prefix.
    #
    # The prefix and the dot after it are matched literally, so only a
    # complete leading "www."-style label is stripped.
    #
    def naked_hostname
      prefix = www
      return hostname unless prefix && hostname

      hostname.sub(/\A#{Regexp.escape(prefix)}\./, '')
    end

    # Port number.
    #
    def port
      parsed_uri.port
    end

    # Hostname and port.
    #
    def host
      joined = [ hostname, port ].compact.join(':')
      joined.empty? ? nil : joined
    end

    # The ww? portion of the subdomain.
    #
    # Returns the first sub-domain label only when the whole label is "ww" or
    # "www" optionally followed by digits (e.g. "www", "ww2"); nil otherwise.
    #
    def www
      return unless trd

      trd.split('.').first.to_s[/\Awww?\d*\z/]
    end

    # Returns the top level domain portion, aka the extension.
    #
    def tld
      parsed_domain.tld
    end
    alias_method :top_level_domain, :tld
    alias_method :extension, :tld

    # Returns the second level domain portion, aka the domain part.
    #
    def sld
      parsed_domain.sld
    end
    alias_method :second_level_domain, :sld
    alias_method :domain_name, :sld

    # Returns the third level domain portion, aka the subdomain part.
    #
    def trd
      parsed_domain.trd
    end
    alias_method :third_level_domain, :trd
    alias_method :subdomains, :trd

    # Any non-ww? subdomains.
    #
    # Returns nil when the ww? label is the only sub-domain label.
    #
    def naked_trd
      prefix = www
      return trd unless prefix

      trd[/(?<=\A#{Regexp.escape(prefix)}\.).+/]
    end
    alias_method :naked_subdomain, :naked_trd

    # The domain name with the tld.
    #
    def domain
      parsed_domain.domain
    end

    # All subdomains, include ww?.
    #
    def subdomain
      parsed_domain.subdomain
    end

    # Scheme and host.
    #
    # The wrapped URI's "null" origin is reported as nil.
    #
    def origin
      value = parsed_uri.origin
      value == "null" ? nil : value
    end

    # Userinfo and host.
    #
    def authority
      parsed_uri.authority
    end

    # Scheme, userinfo, and host.
    #
    def site
      parsed_uri.site
    end

    # Directory and segment.
    #
    def path
      parsed_uri.path
    end

    # Last portion of the path.
    #
    # nil when there is no path or when the path ends with a slash.
    #
    def segment
      return unless path
      return if path.end_with?('/')

      path.split('/').last
    end

    # Any directories following the site within the URI.
    #
    def directory
      return if path.nil? || path.empty?

      parts = path.split('/')
      # A path made only of slashes splits into nothing.
      return '/' if parts.empty?

      # Drop the trailing segment, then make sure the result starts with '/'.
      parts.pop unless segment.to_s.empty?
      parts.unshift('') unless parts.first.to_s.empty?
      parts.compact.join('/')
    end

    # Segment if a file extension is present.
    #
    def filename
      segment.to_s[/.+\..+/]
    end

    # The file extension of the filename.
    #
    def suffix
      return unless path

      ext = File.extname(path).sub(/\A\./, '')
      ext.empty? ? nil : ext
    end

    # Params and values as a string.
    #
    def query
      parsed_uri.query
    end

    # A hash of params and values.
    #
    # An absent query yields an empty hash.
    #
    def query_values
      parsed_uri.query_values.to_h
    end

    # Fragment identifier.
    #
    def fragment
      parsed_uri.fragment
    end

    # Path, query, and fragment.
    #
    def resource
      name = [ segment, query_string, fragment_string ].compact.join
      name.empty? ? nil : name
    end

    # Directory and resource - everything after the site.
    #
    def location
      return directory + resource.to_s if directory == '/'

      joined = [ directory, resource ].compact.join('/')
      joined.empty? ? nil : joined
    end

    private

    # Query prefixed with '?', or nil when there is no query.
    def query_string
      query ? "?#{query}" : nil
    end

    # Fragment prefixed with '#', or nil when there is no fragment.
    def fragment_string
      fragment ? "##{fragment}" : nil
    end

  end
end
@@ -0,0 +1,47 @@
1
module UrlParser
  # Builds an options hash by letting a block switch cleaning flags on.
  #
  # The hash passed to the constructor is stored as-is and updated in place.
  # Any message the object does not understand is silently ignored.
  class OptionSetter

    # Flags that have a bang-setter; #clean! turns them on in this order.
    FLAGS = [ :unescape, :unembed, :canonicalize, :normalize ]

    attr_reader :blk

    attr_accessor :options

    # options - Hash that receives the flags.
    # blk     - Optional block, called with this object by #to_hash.
    def initialize(options = {}, &blk)
      @blk     = blk
      @options = options
    end

    # Defines unescape!, unembed!, canonicalize! and normalize!; each sets its
    # flag to true in +options+.
    FLAGS.each do |flag|
      define_method(:"#{flag}!") { options[flag] = true }
    end

    # Turns every flag on and returns the result of the last setter.
    def clean!
      FLAGS.map { |flag| public_send(:"#{flag}!") }.last
    end

    # Runs the stored block (if any) against this object, then returns the
    # options hash.
    def to_hash
      blk.call(self) unless blk.nil?
      options
    end
    alias_method :to_h, :to_hash

    # Unknown messages are accepted and ignored.
    def method_missing(*args)
      nil
    end

  end
end
@@ -0,0 +1,206 @@
1
+ require 'addressable/uri'
2
+ require 'digest/sha1'
3
+
4
module UrlParser
  # Turns a URI string into an Addressable::URI and applies the optional
  # clean-up steps (unescape, unembed, canonicalize, normalize).
  #
  # Each step has a non-bang form returning the result and a bang form that
  # stores the result back into +uri+.
  class Parser

    class << self

      # Builds a parser for +uri+ and returns its result.
      #
      # uri     - The URI to parse; nil short-circuits to nil.
      # options - See #initialize.
      #
      # With a block the new parser is yielded so the caller chooses the
      # steps; without one #parse! is run. Returns the raw string when the
      # :raw option is set, otherwise the parser's +uri+.
      def call(uri, options = {}, &blk)
        return nil unless uri

        parser = new(uri, options)
        if block_given?
          yield parser
        else
          parser.parse!
        end

        parser.raw? ? parser.raw : parser.uri
      end
      alias_method :parse, :call

    end

    attr_reader :uri, :domain, :default_scheme, :scheme_map, :options

    # uri     - The URI (String or Addressable::URI) to work on.
    # options - Hash; the keys below are consumed here, everything else is
    #           kept in +options+:
    #           :base_uri        - URI that +uri+ is joined onto (default nil).
    #           :default_scheme  - scheme applied when none is present.
    #           :embedded_params - query keys that may hold an embedded URI.
    #           :scheme_map      - Hash mapping a scheme to its replacement.
    #           :raw             - when truthy, results are Strings.
    #           Missing :default_scheme, :embedded_params and :scheme_map fall
    #           back to UrlParser.configuration.
    #
    # The caller's hash is copied first, so it is never modified.
    def initialize(uri, options = {})
      options = options.dup

      @uri             = uri
      @domain          = nil
      @base_uri        = options.delete(:base_uri) { nil }
      @default_scheme  = options.delete(:default_scheme) {
        UrlParser.configuration.default_scheme
      }
      @embedded_params = options.delete(:embedded_params) {
        UrlParser.configuration.embedded_params
      }
      @scheme_map      = options.delete(:scheme_map) {
        UrlParser.configuration.scheme_map
      }
      @raw             = options.delete(:raw) { false }
      @options         = options
    end

    # The configured base URI, or +uri+ itself when none was given, as a
    # String.
    def base_uri
      (@base_uri ? @base_uri : uri).to_s
    end

    # The embedded-param setting passed through UrlParser.wrap.
    def embedded_params
      UrlParser.wrap(@embedded_params)
    end

    # Whether results should be returned as raw strings.
    def raw?
      !!@raw
    end

    # Whether a default scheme is configured.
    def set_default_scheme?
      !!@default_scheme
    end

    # +uri+ passed through UrlParser.unescape.
    def unescape
      UrlParser.unescape(uri)
    end

    def unescape!
      @uri = unescape
    end

    # Parses +uri+ into an Addressable::URI.
    #
    # An Addressable::URI is returned untouched. Otherwise the base URI is
    # parsed (and +uri+ joined onto it when a base was configured), the host
    # is taken from options[:host] or recovered from the URI, the scheme map
    # and default scheme are applied, and +domain+ is populated.
    def parse
      return uri if uri.is_a?(Addressable::URI)

      Addressable::URI.parse(base_uri).tap do |parsed_uri|
        parsed_uri.join!(uri) if @base_uri

        if options[:host]
          parsed_uri.host = options[:host]
        else
          # With no scheme the host shows up as the first path part.
          parts = parsed_uri.path.to_s.split(/[\/:]/)
          hostname = parsed_uri.host || parts.first
          @domain = UrlParser::Domain.new(hostname)
          if @domain.valid?
            # Rebuild the path without the host part, keeping a trailing
            # slash when the original path had one.
            parsed_uri.path = '/' +
              parts.drop(1).join('/') +
              parsed_uri.path[/(?<=\/).*(\/)\s*$/, 1].to_s
            parsed_uri.host = @domain.name
          end
        end

        if scheme_map.has_key?(parsed_uri.scheme)
          parsed_uri.scheme = scheme_map[parsed_uri.scheme]
        end

        if set_default_scheme? && parsed_uri.host && !parsed_uri.scheme
          parsed_uri.scheme = default_scheme
        end

        # Reached with no domain only when options[:host] supplied the host,
        # so the domain has to be built from the URI's host.
        if parsed_uri.host && !domain
          @domain = UrlParser::Domain.new(parsed_uri.host)
        end
      end
    end

    def parse!
      @uri = parse
    end

    # Returns the first http(s) URI found in one of the embedded-param query
    # values, parsed through Parser.call; the parsed +uri+ when there is none.
    def unembed
      original = parse
      params = original.query_values
      return original unless params

      candidates = params.select do |key, value|
        embedded_params.include?(key) &&
          value =~ Addressable::URI::URIREGEX
      end.values

      embed = candidates.find do |candidate|
        parsed = Addressable::URI.parse(candidate)
        %w(http https).include?(parsed.scheme) && parsed.host
      end

      embed ? self.class.call(embed, raw: raw?) : original
    end
    alias_method :embedded, :unembed

    def unembed!
      @uri = unembed
    end
    alias_method :embedded!, :unembed!

    # Parsed +uri+ with repeated slashes squeezed, a trailing slash removed
    # (unless the path is a single character), an empty query dropped, the
    # query stripped, the fragment removed, and Addressable's own normalize!
    # applied.
    def normalize
      parse.tap do |parsed|
        parsed.path = parsed.path.squeeze('/')
        parsed.path = parsed.path.chomp('/') if parsed.path.size != 1
        parsed.query = nil if parsed.query && parsed.query.empty?
        parsed.query = parsed.query.strip if parsed.query
        parsed.fragment = nil

        parsed.normalize!
      end
    end

    def normalize!
      @uri = normalize
    end

    # Parsed +uri+ with query params removed when their key is listed in
    # UrlParser::DB[:global], or in a UrlParser::DB[:hosts] entry whose host
    # string occurs in the URI's host.
    def canonicalize
      parse.tap do |parsed|
        matches_global_param = proc do |key, _value|
          UrlParser::DB[:global].include?(key)
        end

        matches_host_based_param = proc do |key, _value|
          UrlParser::DB[:hosts].find do |host, param|
            parsed.host =~ Regexp.new(Regexp.escape(host)) && param.include?(key)
          end
        end

        parsed.query_values = parsed.query_values(Array).tap do |params|
          params.delete_if(&matches_global_param)
          params.delete_if(&matches_host_based_param)
        end if parsed.query_values
      end
    end
    alias_method :c14n, :canonicalize

    def canonicalize!
      @uri = canonicalize
    end
    alias_method :c14n!, :canonicalize!

    # +uri+ as a String.
    def raw
      uri.to_s
    end

    def raw!
      @uri = raw
    end

    # SHA1 hex digest of the raw URI string.
    #
    # NOTE(review): the +hash+ alias makes #hash return a String, which
    # overrides Object#hash; kept because callers may depend on it.
    def sha1
      Digest::SHA1.hexdigest(raw)
    end
    alias_method :hash, :sha1

    # Runs every clean-up step in order, storing each result in +uri+.
    def clean!
      unescape!
      parse!
      unembed!
      canonicalize!
      normalize!
      raw! if raw?
    end

    # Two URIs are equal when their cleaned forms have the same SHA1.
    def ==(uri)
      opts = options.merge(raw: false)
      one = self.dup.tap { |parser| parser.clean! }
      two = self.class.new(uri, opts).tap { |parser| parser.clean! }

      one.sha1 == two.sha1
    end

  end
end