url_parser 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,102 @@
1
+ require 'ostruct'
2
+ require 'forwardable'
3
+ require 'public_suffix'
4
+
5
module UrlParser
  # Wraps a host name, normalizes it to its ASCII form and exposes both its
  # public-suffix breakdown (tld, sld, trd, ...) and a set of structural
  # validations whose failures are collected in #errors.
  class Domain
    extend Forwardable

    # A label may only contain letters, digits and hyphens and may neither
    # start nor end with a hyphen. \A and \z anchor the whole string so an
    # embedded newline cannot smuggle an invalid label past the check, and
    # the trailing look-behind is what actually rejects a final hyphen.
    VALID_LABEL = /\A(?!-)[a-z0-9\-]*(?<!-)\z/i

    # Values answered by the placeholder suffix when the name cannot be
    # parsed; the keys double as the list of methods delegated to #suffix.
    SUFFIX_DEFAULTS = {
      subdomain: nil,
      domain:    nil,
      tld:       nil,
      sld:       nil,
      trd:       nil,
      to_s:      ''
    }.freeze

    attr_reader :original, :name

    attr_accessor :errors

    def_delegators :suffix, *SUFFIX_DEFAULTS.keys

    # name    - Anything responding to #to_s; it is downcased and a single
    #           trailing dot is removed before normalization.
    # options - Accepted for interface compatibility; not read here.
    #
    def initialize(name, options = {})
      @original  = name.to_s.downcase.chomp('.')
      @name      = normalize
      @errors    = []
      @validated = false
    end

    # The labels of the normalized name as returned by
    # PublicSuffix::Domain.domain_to_labels.
    #
    def labels
      PublicSuffix::Domain.domain_to_labels(name)
    end

    # The parsed public-suffix representation of the name.
    #
    # The result is memoized so that repeated delegated calls neither parse
    # the name again nor record the same parse failure more than once. When
    # parsing raises, one error is recorded and a placeholder answering
    # SUFFIX_DEFAULTS is returned instead.
    #
    def suffix
      @suffix ||= begin
        PublicSuffix.parse(name)
      rescue
        errors << "'#{original}' is not a valid domain"
        OpenStruct.new(SUFFIX_DEFAULTS).tap do |placeholder|
          # Undefine to_s on this one object so the call is no longer
          # answered by the inherited implementation.
          placeholder.singleton_class.send(:undef_method, :to_s)
        end
      end
    end

    # Runs the validations once and reports whether no errors were recorded.
    #
    def valid?
      validate unless @validated
      errors.empty?
    end

    private

    # ASCII form of the original name.
    #
    def normalize
      Addressable::IDNA.to_ascii(original)
    end

    def validate
      validate_labels
      validate_label_length
      validate_label_format
      validate_total_length
      validate_suffix

      @validated = true
    end

    def validate_labels
      errors << "exceeds 127 labels" if labels.count > 127
    end

    # http://tools.ietf.org/html/rfc1034#section-3.1
    #
    def validate_label_length
      if labels.any? { |label| label.length > 63 }
        errors << "exceeds maximum label length of 63 characters"
      end
    end

    def validate_label_format
      unless labels.all? { |label| label =~ VALID_LABEL }
        errors << "contains invalid characters"
      end
    end

    # https://blogs.msdn.microsoft.com/oldnewthing/20120412-00/?p=7873/
    #
    def validate_total_length
      errors << "exceeds 253 ASCII characters" if name.length > 253
    end

    # Forces the suffix to be parsed so a parse failure is recorded.
    #
    def validate_suffix
      suffix
    end

  end
end
@@ -0,0 +1,233 @@
1
module UrlParser
  # Read-only view over a parsed URI and its parsed domain, exposing each
  # URI component (and a few derived ones) under descriptive names.
  class Model

    attr_reader :parsed_uri, :parsed_domain

    # uri    - An Addressable::URI; anything else raises
    #          RequiresAddressableURI.
    # domain - Optional UrlParser::Domain; when given but of another type
    #          RequiresUrlParserDomain is raised. When omitted, a domain is
    #          built from uri.hostname.
    #
    def initialize(uri, domain = nil)
      unless uri.is_a?(Addressable::URI)
        raise RequiresAddressableURI,
          "#{uri} must be an Addressable::URI"
      end

      if domain && !domain.is_a?(UrlParser::Domain)
        raise RequiresUrlParserDomain,
          "#{domain} must be a UrlParser::Domain"
      end

      @parsed_uri    = uri
      @parsed_domain = domain || UrlParser::Domain.new(uri.hostname)
    end

    # Top level URI naming structure / protocol.
    #
    def scheme
      parsed_uri.scheme
    end

    # Username portion of the userinfo.
    #
    def username
      parsed_uri.user
    end
    alias_method :user, :username

    # Password portion of the userinfo.
    #
    def password
      parsed_uri.password
    end

    # URI username and password for authentication.
    #
    def userinfo
      parsed_uri.userinfo
    end

    # Fully qualified domain name or IP address.
    #
    def hostname
      parsed_uri.host
    end

    # Fully qualified domain name or IP address without ww? prefix.
    #
    # The prefix is escaped and followed by a literal dot so that only the
    # exact leading label is removed.
    #
    def naked_hostname
      prefix = www
      return hostname unless prefix

      hostname.sub(/\A#{Regexp.escape(prefix)}\./, '')
    end

    # Port number.
    #
    def port
      parsed_uri.port
    end

    # Hostname and port, or nil when both are missing.
    #
    def host
      result = [ hostname, port ].compact.join(':')
      result.empty? ? nil : result
    end

    # The ww? portion of the subdomain.
    #
    # Only a first label that consists entirely of "ww" or "www" plus
    # optional digits counts; a label that merely contains "www" (for
    # example "wwwfoo") yields nil.
    #
    def www
      return unless trd

      trd.split('.').first.to_s[/\Awww?\d*\z/]
    end

    # Returns the top level domain portion, aka the extension.
    #
    def tld
      parsed_domain.tld
    end
    alias_method :top_level_domain, :tld
    alias_method :extension, :tld

    # Returns the second level domain portion, aka the domain part.
    #
    def sld
      parsed_domain.sld
    end
    alias_method :second_level_domain, :sld
    alias_method :domain_name, :sld

    # Returns the third level domain portion, aka the subdomain part.
    #
    def trd
      parsed_domain.trd
    end
    alias_method :third_level_domain, :trd
    alias_method :subdomains, :trd

    # Any non-ww? subdomains.
    #
    # With a ww? prefix present this is whatever follows "<prefix>." (nil
    # when nothing follows); without one it is the trd itself.
    #
    def naked_trd
      prefix = www
      return trd unless trd && prefix

      trd[/\A#{Regexp.escape(prefix)}\.(.+)/, 1]
    end
    alias_method :naked_subdomain, :naked_trd

    # The domain name with the tld.
    #
    def domain
      parsed_domain.domain
    end

    # All subdomains, include ww?.
    #
    def subdomain
      parsed_domain.subdomain
    end

    # Scheme and host; nil when the URI reports the string "null".
    #
    def origin
      original_origin = parsed_uri.origin
      original_origin == "null" ? nil : original_origin
    end

    # Userinfo and host.
    #
    def authority
      parsed_uri.authority
    end

    # Scheme, userinfo, and host.
    #
    def site
      parsed_uri.site
    end

    # Directory and segment.
    #
    def path
      parsed_uri.path
    end

    # Last portion of the path; nil when the path ends with a slash.
    #
    def segment
      return unless path

      path.end_with?('/') ? nil : path.split('/').last
    end

    # Any directories following the site within the URI.
    #
    def directory
      return if path.nil? || path.empty?

      parts = path.split('/')
      # A path made only of slashes splits into nothing.
      return '/' if parts.empty?

      parts.pop unless segment.to_s.empty?
      # Make sure the joined result starts with a slash.
      parts.unshift('') unless parts.first.to_s.empty?
      parts.compact.join('/')
    end

    # Segment if a file extension is present.
    #
    def filename
      segment.to_s[/.+\..+/]
    end

    # The file extension of the filename, without the leading dot.
    #
    def suffix
      return unless path

      ext = File.extname(path).sub(/\A\./, '')
      ext.empty? ? nil : ext
    end

    # Params and values as a string.
    #
    def query
      parsed_uri.query
    end

    # A hash of params and values.
    #
    def query_values
      parsed_uri.query_values.to_h
    end

    # Fragment identifier.
    #
    def fragment
      parsed_uri.fragment
    end

    # Segment, query, and fragment; nil when all three are missing.
    #
    def resource
      name = [ segment, query_string, fragment_string ].compact.join
      name.empty? ? nil : name
    end

    # Directory and resource - everything after the site.
    #
    def location
      if directory == '/'
        directory + resource.to_s
      else
        result = [ directory, resource ].compact.join('/')
        result.empty? ? nil : result
      end
    end

    private

    # The query prefixed with "?", or nil without a query.
    #
    def query_string
      query ? "?#{query}" : nil
    end

    # The fragment prefixed with "#", or nil without a fragment.
    #
    def fragment_string
      fragment ? "##{fragment}" : nil
    end

  end
end
@@ -0,0 +1,47 @@
1
module UrlParser
  # Accumulates boolean parser options. A block handed to the constructor
  # is invoked with the setter itself whenever the options are exported,
  # so it can switch options on through the bang methods below.
  class OptionSetter

    attr_accessor :options

    attr_reader :blk

    # options - Hash the flags are written into.
    # blk     - Optional block, stored and run by #to_hash.
    #
    def initialize(options = {}, &blk)
      @blk     = blk
      @options = options
    end

    # Defines unescape!, unembed!, canonicalize! and normalize!; each one
    # sets the option of the same name to true.
    #
    %i[unescape unembed canonicalize normalize].each do |flag|
      define_method(:"#{flag}!") { options[flag] = true }
    end

    # Switches on every option defined above.
    #
    def clean!
      unescape!
      unembed!
      canonicalize!
      normalize!
    end

    # Runs the stored block (if any) against this setter, then returns the
    # options.
    #
    def to_hash
      blk.call(self) unless blk.nil?
      options
    end
    alias_method :to_h, :to_hash

    # Calls to methods this class does not define are ignored and answer
    # nil.
    #
    def method_missing(*)
      nil
    end

  end
end
@@ -0,0 +1,206 @@
1
+ require 'addressable/uri'
2
+ require 'digest/sha1'
3
+
4
module UrlParser
  # Turns a URI-like input into an Addressable::URI and offers a series of
  # clean-up steps (unescape, unembed, canonicalize, normalize). The plain
  # methods return the transformed value; their bang counterparts store it
  # back into #uri.
  class Parser

    class << self

      # Builds a parser for uri and returns its result.
      #
      # uri     - The input to parse; nil short-circuits to nil.
      # options - Hash forwarded to Parser.new.
      #
      # With a block the parser is yielded so the caller decides which
      # steps to run; without one the input is simply parsed. Returns the
      # string form when the :raw option is set, the parser's uri otherwise.
      #
      def call(uri, options = {}, &blk)
        return nil unless uri

        parser = new(uri, options)
        if block_given?
          yield parser
        else
          parser.parse!
        end

        parser.raw? ? parser.raw : parser.uri
      end
      alias_method :parse, :call

    end

    attr_reader :uri, :domain, :default_scheme, :scheme_map, :options

    # uri     - The input to work on.
    # options - Hash; :base_uri, :default_scheme, :embedded_params,
    #           :scheme_map and :raw are extracted (falling back to
    #           UrlParser.configuration where shown below), the rest is
    #           kept in #options.
    #
    def initialize(uri, options = {})
      # Work on a copy: the keys are removed below and the caller's hash
      # must survive that so it can be reused for another parser.
      options = options.dup

      @uri             = uri
      @domain          = nil
      @base_uri        = options.delete(:base_uri) { nil }
      @default_scheme  = options.delete(:default_scheme) {
        UrlParser.configuration.default_scheme
      }
      @embedded_params = options.delete(:embedded_params) {
        UrlParser.configuration.embedded_params
      }
      @scheme_map      = options.delete(:scheme_map) {
        UrlParser.configuration.scheme_map
      }
      @raw             = options.delete(:raw) { false }
      @options         = options
    end

    # String form of the configured base URI, or of the input itself when
    # no base URI was given.
    #
    def base_uri
      (@base_uri || uri).to_s
    end

    def embedded_params
      UrlParser.wrap(@embedded_params)
    end

    def raw?
      !!@raw
    end

    def set_default_scheme?
      !!@default_scheme
    end

    def unescape
      UrlParser.unescape(uri)
    end

    def unescape!
      @uri = unescape
    end

    # Parses the input into an Addressable::URI. An input that already is
    # one is returned untouched.
    #
    def parse
      return uri if uri.is_a?(Addressable::URI)

      Addressable::URI.parse(base_uri).tap do |parsed_uri|
        parsed_uri.join!(uri) if @base_uri

        if options[:host]
          parsed_uri.host = options[:host]
        else
          # Without an explicit host the first path part is tried as one.
          parts    = parsed_uri.path.to_s.split(/[\/:]/)
          hostname = parsed_uri.host || parts.first
          @domain  = UrlParser::Domain.new(hostname)
          if @domain.valid?
            # Rebuild the path from the remaining parts; the trailing
            # capture re-appends a final slash when the path had one.
            parsed_uri.path = '/' +
              parts.drop(1).join('/') +
              parsed_uri.path[/(?<=\/).*(\/)\s*$/, 1].to_s
            parsed_uri.host = @domain.name
          end
        end

        if scheme_map.has_key?(parsed_uri.scheme)
          parsed_uri.scheme = scheme_map[parsed_uri.scheme]
        end

        if set_default_scheme? && parsed_uri.host && !parsed_uri.scheme
          parsed_uri.scheme = default_scheme
        end

        # Reached with no domain only when the host came from
        # options[:host]; build the domain from the host actually set.
        if parsed_uri.host && !domain
          @domain = UrlParser::Domain.new(parsed_uri.host)
        end
      end
    end

    def parse!
      @uri = parse
    end

    # Looks through the query for one of the embedded params holding an
    # http(s) URI with a host. Returns that URI run through Parser.call,
    # or the parsed input when there is none.
    #
    def unembed
      original = parse
      params   = original.query_values

      candidates = params.select do |key, value|
        embedded_params.include?(key) &&
          value =~ Addressable::URI::URIREGEX
      end.values if params

      embed = candidates.find do |candidate|
        parsed = Addressable::URI.parse(candidate)
        %w(http https).include?(parsed.scheme) && parsed.host
      end if candidates

      embed ? self.class.call(embed, raw: raw?) : original
    end
    alias_method :embedded, :unembed

    def unembed!
      @uri = unembed
    end
    alias_method :embedded!, :unembed!

    # Parses, then squeezes repeated slashes, drops a trailing slash
    # (unless the path is a single character), an empty query and the
    # fragment, and finally calls normalize! on the URI.
    #
    def normalize
      parse.tap do |parsed_uri|
        parsed_uri.path     = parsed_uri.path.squeeze('/')
        parsed_uri.path     = parsed_uri.path.chomp('/') if parsed_uri.path.size != 1
        parsed_uri.query    = nil if parsed_uri.query && parsed_uri.query.empty?
        parsed_uri.query    = parsed_uri.query.strip if parsed_uri.query
        parsed_uri.fragment = nil

        parsed_uri.normalize!
      end
    end

    def normalize!
      @uri = normalize
    end

    # Parses, then removes every query param listed in
    # UrlParser::DB[:global] or listed in UrlParser::DB[:hosts] under a
    # key contained in the URI's host.
    #
    def canonicalize
      parse.tap do |parsed_uri|
        matches_global_param = proc do |key, _value|
          UrlParser::DB[:global].include?(key)
        end

        matches_host_based_param = proc do |key, _value|
          UrlParser::DB[:hosts].find do |host, param|
            parsed_uri.host =~ Regexp.new(Regexp.escape(host)) &&
              param.include?(key)
          end
        end

        parsed_uri.query_values = parsed_uri.query_values(Array).tap do |params|
          params.delete_if(&matches_global_param)
          params.delete_if(&matches_host_based_param)
        end if parsed_uri.query_values
      end
    end
    alias_method :c14n, :canonicalize

    def canonicalize!
      @uri = canonicalize
    end
    alias_method :c14n!, :canonicalize!

    # String form of the current uri.
    #
    def raw
      uri.to_s
    end

    def raw!
      @uri = raw
    end

    # SHA1 hex digest of the raw form.
    #
    # NOTE(review): the alias below makes #hash return a String instead of
    # an Integer; confirm parsers are never used as Hash keys.
    #
    def sha1
      Digest::SHA1.hexdigest(raw)
    end
    alias_method :hash, :sha1

    # Runs every clean-up step in order, converting back to a string at
    # the end when the :raw option is set.
    #
    def clean!
      unescape!
      parse!
      unembed!
      canonicalize!
      normalize!
      raw! if raw?
    end

    # Two parsers are equal when their cleaned forms share a digest. The
    # comparison runs on a copy of this parser and a fresh parser for the
    # other side.
    #
    def ==(uri)
      opts = options.merge(raw: false)
      one  = self.dup.tap { |parser| parser.clean! }
      two  = self.class.new(uri, opts).tap { |parser| parser.clean! }

      one.sha1 == two.sha1
    end

  end
end