domain_extractor 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'auth'
4
+ require_relative 'query_params'
5
+ require_relative 'uri_helpers'
6
+
3
7
  module DomainExtractor
4
8
  # ParsedURL wraps the parsing result and provides convenient accessor methods
5
9
  # with support for bang (!) and question mark (?) variants.
@@ -15,16 +19,26 @@ module DomainExtractor
15
19
  # parsed.host # => nil
16
20
  # parsed.host? # => false
17
21
  # parsed.host! # raises InvalidURLError
22
+ # rubocop:disable Metrics/ClassLength
18
23
  class ParsedURL
24
+ EMPTY_STRING = ''
25
+
19
26
  # Expose the underlying hash for backward compatibility
20
27
  attr_reader :result
21
28
 
29
+ # Store the original URI object for advanced operations
30
+ attr_reader :uri
31
+
22
32
  # List of valid result keys that should have method accessors
23
- RESULT_KEYS = %i[subdomain domain tld root_domain host path query_params].freeze
33
+ RESULT_KEYS = %i[
34
+ subdomain domain tld root_domain host path query_params
35
+ scheme port fragment user password userinfo decoded_user decoded_password
36
+ ].freeze
24
37
 
25
- def initialize(result)
26
- @result = result || {}
27
- freeze
38
+ def initialize(result, uri = nil)
39
+ @result = (result || {}).dup
40
+ @uri = uri
41
+ sync_uri_state!
28
42
  end
29
43
 
30
44
  # Hash-style access for backward compatibility
@@ -88,7 +102,9 @@ module DomainExtractor
88
102
  end
89
103
 
90
104
  def to_s
91
- @result.to_s
105
+ return EMPTY_STRING unless valid? && @uri
106
+
107
+ @uri.to_s
92
108
  end
93
109
 
94
110
  # Allow to_h conversion for hash compatibility
@@ -99,8 +115,222 @@ module DomainExtractor
99
115
  # Allow to_hash as well for better Ruby compatibility
100
116
  alias to_hash to_h
101
117
 
118
+ # Alias for URI compatibility
119
+ alias to_str to_s
120
+
121
+ def scheme
122
+ @result[:scheme]
123
+ end
124
+
125
+ def host
126
+ @result[:host]
127
+ end
128
+
129
+ def port
130
+ @result[:port]
131
+ end
132
+
133
+ def path
134
+ @result[:path]
135
+ end
136
+
137
+ def fragment
138
+ @result[:fragment]
139
+ end
140
+
141
+ def user
142
+ @result[:user]
143
+ end
144
+
145
+ def password
146
+ @result[:password]
147
+ end
148
+
149
+ def userinfo
150
+ @result[:userinfo]
151
+ end
152
+
153
+ # hostname returns host without IPv6 brackets (URI compatibility)
154
# Returns the host without surrounding IPv6 brackets (URI compatibility).
#
# The hostname reported by the wrapped URI object is preferred; when it is
# unavailable the value of +host+ is used with one leading "[" and one
# trailing "]" removed.
#
# @return [String, nil] the bare hostname, or nil when no host is known
def hostname
  from_uri = @uri&.hostname
  return from_uri if from_uri

  raw_host = host
  # Without this guard a URI that has no host would produce "" (nil.to_s)
  # instead of nil.
  return nil if raw_host.nil?

  # Literal prefix/suffix removal instead of a regexp with the line-based
  # ^ / $ anchors, which also match around embedded newlines.
  raw_host.to_s.delete_prefix('[').delete_suffix(']')
end
159
+
160
+ # query returns the query string (not parsed params)
161
# Raw (unparsed) query string of the wrapped URI.
#
# @return [String, nil] the query string, or nil when there is no URI
#   object or the URI carries no query
def query
  @uri ? @uri.query : nil
end
166
+
167
+ # Setter methods for URI compatibility
168
+ def scheme=(value)
169
+ mutate_uri! { @uri.scheme = normalize_scheme(value) }
170
+ end
171
+
172
+ def host=(value)
173
+ mutate_uri! { replace_host(value) }
174
+ end
175
+
176
+ def hostname=(value)
177
+ self.host = value
178
+ end
179
+
180
+ def port=(value)
181
+ mutate_uri! { @uri.port = value }
182
+ end
183
+
184
+ def path=(value)
185
+ mutate_uri! { @uri.path = value.to_s }
186
+ end
187
+
188
+ def query=(value)
189
+ mutate_uri! { @uri.query = value }
190
+ end
191
+
192
+ def fragment=(value)
193
+ mutate_uri! { @uri.fragment = value }
194
+ end
195
+
196
+ def user=(value)
197
+ mutate_uri! { @uri.user = value }
198
+ end
199
+
200
+ def password=(value)
201
+ mutate_uri! { @uri.password = value }
202
+ end
203
+
204
+ def userinfo=(value)
205
+ mutate_uri! { @uri.userinfo = value }
206
+ end
207
+
208
+ # Advanced URI methods
209
+
210
+ # Generate Basic Authentication header from current credentials
211
+ # @return [String, nil] Authorization header value or nil if no credentials
212
+ def basic_auth_header
213
+ return nil if user.nil? || password.nil?
214
+
215
+ URIHelpers.basic_auth_header(decoded_user || user, decoded_password || password)
216
+ end
217
+
218
+ # Generate Bearer token header
219
+ # @param token [String] The bearer token
220
+ # @return [String] Authorization header value
221
+ def bearer_auth_header(token)
222
+ URIHelpers.bearer_auth_header(token)
223
+ end
224
+
225
+ # Find proxy for this URL
226
+ # @return [URI::Generic, nil] Proxy URI or nil
227
+ def find_proxy
228
+ return nil unless @uri
229
+
230
+ URIHelpers.find_proxy(@uri)
231
+ end
232
+
233
+ # Merge with a relative URI
234
+ # @param relative [String, URI::Generic] The relative URI
235
+ # @return [ParsedURL] New ParsedURL with merged URI
236
+ def merge(relative)
237
+ return self unless @uri
238
+
239
+ merged_uri = URIHelpers.merge_uri(@uri, relative)
240
+ DomainExtractor.parse(merged_uri.to_s)
241
+ end
242
+
243
+ # Normalize the URI (lowercase scheme/host, remove default ports)
244
+ # @return [ParsedURL] New ParsedURL with normalized URI
245
+ def normalize
246
+ return self unless @uri
247
+
248
+ normalized_uri = URIHelpers.normalize_uri(@uri)
249
+ DomainExtractor.parse(normalized_uri.to_s)
250
+ end
251
+
252
+ # Check if this is an absolute URI
253
+ # @return [Boolean] True if absolute
254
+ def absolute?
255
+ !@result[:scheme].nil?
256
+ end
257
+
258
+ # Check if this is a relative URI
259
+ # @return [Boolean] True if relative
260
+ def relative?
261
+ @result[:scheme].nil?
262
+ end
263
+
264
+ # Get the default port for the scheme
265
+ # @return [Integer, nil] Default port or nil
266
+ def default_port
267
+ URIHelpers.default_port_for(@uri || scheme)
268
+ end
269
+
270
+ # Build a complete URL string from components
271
+ # @return [String] The complete URL
272
+ def build_url
273
+ to_s
274
+ end
275
+
102
276
  private
103
277
 
278
+ def mutate_uri!
279
+ return unless @uri
280
+
281
+ yield
282
+ sync_from_uri!
283
+ end
284
+
285
+ def sync_uri_state!
286
+ return unless @uri && valid?
287
+
288
+ current_userinfo = @uri.userinfo
289
+ @uri.scheme = normalize_scheme(@result[:scheme]) if @result[:scheme]
290
+ if @result[:host]
291
+ @uri.host = normalize_host(@result[:host])
292
+ @uri.userinfo = current_userinfo if current_userinfo
293
+ end
294
+ sync_from_uri!
295
+ end
296
+
297
+ def sync_from_uri!
298
+ attributes = DomainExtractor::Parser.host_attributes(@uri.host)
299
+
300
+ unless attributes
301
+ @result.clear
302
+ return
303
+ end
304
+
305
+ @result.replace(
306
+ attributes.merge(
307
+ path: @uri.path || EMPTY_STRING,
308
+ query_params: QueryParams.call(@uri.query),
309
+ scheme: normalize_scheme(@uri.scheme),
310
+ port: @uri.port,
311
+ fragment: @uri.fragment
312
+ ).merge(Auth.extract(@uri))
313
+ )
314
+ end
315
+
316
# Lower-cases a host value after converting it to a String.
#
# @param value [#to_s, nil] host to normalize
# @return [String, nil] lower-cased host, or nil when +value+ is nil
def normalize_host(value)
  value&.to_s&.downcase
end
321
+
322
# Lower-cases a scheme value after converting it to a String.
#
# @param value [#to_s, nil] scheme to normalize
# @return [String, nil] lower-cased scheme, or nil when +value+ is nil
def normalize_scheme(value)
  value&.to_s&.downcase
end
327
+
328
+ def replace_host(value)
329
+ current_userinfo = @uri.userinfo
330
+ @uri.host = normalize_host(value)
331
+ @uri.userinfo = current_userinfo if current_userinfo
332
+ end
333
+
104
334
  # Handle bang methods that raise errors for missing values
105
335
  def handle_bang_method(method_str)
106
336
  key = method_str[0...-1].to_sym
@@ -128,4 +358,5 @@ module DomainExtractor
128
358
  true
129
359
  end
130
360
  end
361
+ # rubocop:enable Metrics/ClassLength
131
362
  end
@@ -7,18 +7,29 @@ require_relative 'normalizer'
7
7
  require_relative 'result'
8
8
  require_relative 'validators'
9
9
  require_relative 'parsed_url'
10
+ require_relative 'auth'
11
+
12
+ # Register custom URI schemes for database and other protocols
13
+ # This allows URI.parse to handle redis://, mysql://, postgresql://, etc.
14
%w[redis rediss mysql postgresql mongodb sftp ftps].each do |scheme|
  scheme_key = scheme.upcase
  # Skip schemes that are already known so nothing is redefined.
  next if URI.scheme_list.key?(scheme_key)

  if URI.respond_to?(:register_scheme)
    # Newer uri versions build URI.scheme_list on every call, so writing
    # into the returned hash registers nothing; register_scheme is the
    # supported entry point there.
    URI.register_scheme(scheme_key, URI::Generic)
  else
    # Older uri versions expose the registry hash itself.
    URI.scheme_list[scheme_key] = URI::Generic
  end
rescue StandardError
  # Registration is best-effort; a failure here must not prevent loading.
end
10
19
 
11
20
  module DomainExtractor
12
21
  # Parser orchestrates the pipeline for url normalization, validation, and domain extraction.
13
22
  module Parser
23
+ SCHEME_PATTERN = %r{\A([a-z][a-z0-9+.-]*)://}i
24
+ RETRYABLE_URI_MESSAGES = ['bad URI', 'is not URI'].freeze
25
+
14
26
  module_function
15
27
 
16
28
  def call(raw_url)
17
- components = extract_components(raw_url)
18
- return ParsedURL.new(nil) unless components
29
+ uri, host_attributes = extract_components(raw_url)
30
+ return ParsedURL.new(nil) unless uri && host_attributes
19
31
 
20
- uri, domain, host = components
21
- build_result(domain: domain, host: host, uri: uri)
32
+ build_result(host_attributes: host_attributes, uri: uri)
22
33
  rescue ::URI::InvalidURIError, ::PublicSuffix::Error
23
34
  ParsedURL.new(nil)
24
35
  end
@@ -29,16 +40,29 @@ module DomainExtractor
29
40
  false
30
41
  end
31
42
 
32
- def build_uri(raw_url)
43
+ def host_attributes(host)
44
+ return if invalid_host?(host)
45
+
46
+ normalized_host = host.downcase
47
+ domain = parse_domain(normalized_host)
48
+
49
+ return domain_attributes(domain, normalized_host) if domain
50
+
51
+ hostname_attributes(normalized_host) if Validators.valid_hostname?(normalized_host)
52
+ end
53
+
54
+ def build_uri(raw_url, retry_count = 0)
33
55
  normalized = Normalizer.call(raw_url)
34
56
  return unless normalized
35
57
 
36
58
  ::URI.parse(normalized)
59
+ rescue ::URI::InvalidURIError => e
60
+ retry_parse_with_registered_scheme(e, normalized, raw_url, retry_count)
37
61
  end
38
62
  private_class_method :build_uri
39
63
 
40
64
  def invalid_host?(host)
41
- host.nil? || Validators.ip_address?(host) || !::PublicSuffix.valid?(host)
65
+ host.nil? || Validators.ip_address?(host)
42
66
  end
43
67
  private_class_method :invalid_host?
44
68
 
@@ -46,23 +70,76 @@ module DomainExtractor
46
70
  uri = build_uri(raw_url)
47
71
  return unless uri
48
72
 
49
- host = uri.host&.downcase
50
- return if invalid_host?(host)
73
+ attributes = host_attributes(uri.host)
74
+ return unless attributes
51
75
 
52
- domain = ::PublicSuffix.parse(host)
53
- [uri, domain, host]
76
+ [uri, attributes]
54
77
  end
55
78
  private_class_method :extract_components
56
79
 
57
- def build_result(domain:, host:, uri:)
58
- Result.build(
80
+ def parse_domain(host)
81
+ ::PublicSuffix.parse(host)
82
+ rescue ::PublicSuffix::Error
83
+ nil
84
+ end
85
+ private_class_method :parse_domain
86
+
87
+ def domain_attributes(domain, host)
88
+ {
59
89
  subdomain: domain.trd,
60
90
  root_domain: domain.domain,
61
91
  domain: domain.sld,
62
92
  tld: domain.tld,
63
- host: host,
93
+ host: host
94
+ }
95
+ end
96
+ private_class_method :domain_attributes
97
+
98
# Attribute hash for a bare hostname: the host itself is used as both
# :domain and :root_domain, and there is no subdomain or TLD.
#
# @param host [String] the hostname
# @return [Hash{Symbol => String, nil}] host attribute hash
def hostname_attributes(host)
  { subdomain: nil, root_domain: host, domain: host, tld: nil, host: host }
end
107
+ private_class_method :hostname_attributes
108
+
109
+ def retry_parse_with_registered_scheme(error, normalized, raw_url, retry_count)
110
+ return nil unless retryable_scheme_registration?(error.message, normalized, retry_count)
111
+
112
+ register_scheme(normalized[SCHEME_PATTERN, 1])
113
+ build_uri(raw_url, 1)
114
+ rescue StandardError
115
+ nil
116
+ end
117
+ private_class_method :retry_parse_with_registered_scheme
118
+
119
+ def retryable_scheme_registration?(message, normalized, retry_count)
120
+ retry_count.zero? &&
121
+ RETRYABLE_URI_MESSAGES.any? { |fragment| message.include?(fragment) } &&
122
+ normalized.match?(SCHEME_PATTERN)
123
+ end
124
+ private_class_method :retryable_scheme_registration?
125
+
126
# Registers +scheme+ with the URI library, mapped to URI::Generic.
#
# Newer uri versions build URI.scheme_list on every call, so assigning
# into the returned hash has no effect there; URI.register_scheme is used
# when it exists and the registry hash is written to otherwise.
#
# @param scheme [String] scheme name in any letter case
def register_scheme(scheme)
  scheme_key = scheme.upcase

  if URI.respond_to?(:register_scheme)
    URI.register_scheme(scheme_key, URI::Generic)
  else
    URI.scheme_list[scheme_key] = URI::Generic
  end
end
129
+ private_class_method :register_scheme
130
+
131
+ def build_result(host_attributes:, uri:)
132
+ auth_components = Auth.extract(uri)
133
+
134
+ Result.build(
135
+ **host_attributes,
64
136
  path: uri.path,
65
- query: uri.query
137
+ query: uri.query,
138
+ scheme: uri.scheme,
139
+ port: uri.port,
140
+ fragment: uri.fragment,
141
+ **auth_components,
142
+ uri: uri
66
143
  )
67
144
  end
68
145
  private_class_method :build_result
@@ -12,22 +12,53 @@ module DomainExtractor
12
12
  module_function
13
13
 
14
14
  def build(**attributes)
15
- hash = {
15
+ ParsedURL.new(result_hash(attributes), attributes[:uri])
16
+ end
17
+
18
# Collapses a missing or empty subdomain to nil.
#
# @param value [String, nil] subdomain candidate
# @return [String, nil] +value+ unchanged, or nil when it is nil or empty
def normalize_subdomain(value)
  return nil if value.nil? || value.empty?

  value
end
21
+ private_class_method :normalize_subdomain
22
+
23
+ def result_hash(attributes)
24
+ domain_attributes(attributes)
25
+ .merge(uri_attributes(attributes))
26
+ .merge(auth_attributes(attributes))
27
+ .freeze
28
+ end
29
+ private_class_method :result_hash
30
+
31
+ def domain_attributes(attributes)
32
+ {
16
33
  subdomain: normalize_subdomain(attributes[:subdomain]),
17
34
  root_domain: attributes[:root_domain],
18
35
  domain: attributes[:domain],
19
36
  tld: attributes[:tld],
20
- host: attributes[:host],
21
- path: attributes[:path] || EMPTY_PATH,
22
- query_params: QueryParams.call(attributes[:query])
23
- }.freeze
37
+ host: attributes[:host]
38
+ }
39
+ end
40
+ private_class_method :domain_attributes
24
41
 
25
- ParsedURL.new(hash)
42
+ def uri_attributes(attributes)
43
+ {
44
+ path: attributes[:path] || EMPTY_PATH,
45
+ query_params: QueryParams.call(attributes[:query]),
46
+ scheme: attributes[:scheme],
47
+ port: attributes[:port],
48
+ fragment: attributes[:fragment]
49
+ }
26
50
  end
51
+ private_class_method :uri_attributes
27
52
 
28
- def normalize_subdomain(value)
29
- value.nil? || value.empty? ? nil : value
53
# Picks the credential-related entries out of +attributes+.
#
# @param attributes [Hash{Symbol => Object}] attributes passed to build
# @return [Hash{Symbol => Object}] hash with the keys :user, :password,
#   :userinfo, :decoded_user and :decoded_password (in that order)
def auth_attributes(attributes)
  %i[user password userinfo decoded_user decoded_password].each_with_object({}) do |key, picked|
    picked[key] = attributes[key]
  end
end
31
- private_class_method :normalize_subdomain
62
+ private_class_method :auth_attributes
32
63
  end
33
64
  end
@@ -0,0 +1,168 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'base64'
4
+ require 'uri'
5
+
6
module DomainExtractor
  # URIHelpers provides advanced URI manipulation methods:
  # merge, normalize, authentication headers, credential encoding and
  # environment-based proxy detection.
  # rubocop:disable Metrics/ModuleLength
  module URIHelpers
    CREDENTIAL_ESCAPE_PATTERN = /[^A-Za-z0-9\-._~]/
    DEFAULT_PORTS = {
      'ftp' => 21,
      'ftps' => 990,
      'http' => 80,
      'https' => 443,
      'mongodb' => 27_017,
      'mysql' => 3306,
      'postgresql' => 5432,
      'redis' => 6379,
      'rediss' => 6380,
      'sftp' => 22,
      'ssh' => 22
    }.freeze
    HTTP_PROXY_KEYS = %w[http_proxy HTTP_PROXY].freeze
    ALL_PROXY_KEYS = %w[all_proxy ALL_PROXY].freeze
    NO_PROXY_KEYS = %w[no_proxy NO_PROXY].freeze
    # Under CGI every request header is exported as HTTP_<NAME>, so a
    # client-sent "Proxy:" header shows up as HTTP_PROXY.
    CGI_UNSAFE_PROXY_KEY = 'HTTP_PROXY'
    # Environment variable whose presence is treated as "running under CGI".
    CGI_MARKER_KEY = 'REQUEST_METHOD'

    module_function

    # Generate Basic Authentication header
    # @param username [String] The username
    # @param password [String] The password
    # @return [String] The Authorization header value
    def basic_auth_header(username, password)
      credentials = "#{username}:#{password}"
      encoded = Base64.strict_encode64(credentials)
      "Basic #{encoded}"
    end

    # Generate Bearer token header
    # @param token [String] The bearer token
    # @return [String] The Authorization header value
    def bearer_auth_header(token)
      "Bearer #{token}"
    end

    # Encode credentials for URL (percent-encoding)
    # @param value [String] The value to encode
    # @return [String] Percent-encoded value
    def encode_credential(value)
      URI::DEFAULT_PARSER.escape(value.to_s, CREDENTIAL_ESCAPE_PATTERN)
    end

    # Decode percent-encoded credential
    # @param value [String] The encoded value
    # @return [String] Decoded value; +value+ itself when decoding raises
    def decode_credential(value)
      URI::DEFAULT_PARSER.unescape(value.to_s)
    rescue StandardError
      value
    end

    # Find proxy from environment variables
    # Checks <scheme>_proxy, http_proxy and all_proxy (lower- and upper-case
    # variants) and honours no_proxy / NO_PROXY.
    # @param uri [URI::Generic, String] The URI to check
    # @return [URI::Generic, nil] The proxy URI, or nil when no proxy
    #   applies or a URI involved cannot be parsed
    def find_proxy(uri)
      uri_obj = coerce_uri(uri)
      return nil unless uri_obj
      return nil if should_bypass_proxy?(uri_obj)

      proxy_url = proxy_url_for(uri_obj.scheme)
      return nil unless proxy_url

      URI.parse(proxy_url)
    rescue URI::InvalidURIError
      nil
    end

    # Check if URI should bypass proxy based on no_proxy
    # @param uri [URI::Generic] The URI to check
    # @return [Boolean] True if should bypass proxy
    def should_bypass_proxy?(uri)
      # Read through env_value so that an empty no_proxy does not mask a
      # populated NO_PROXY.
      no_proxy = NO_PROXY_KEYS.map { |key| env_value(key) }.compact.first
      return false unless no_proxy

      host = proxy_host(uri)
      return false unless host

      no_proxy
        .split(',')
        .map(&:strip)
        .reject(&:empty?)
        .any? { |pattern| proxy_pattern_match?(host, pattern) }
    end
    private_class_method :should_bypass_proxy?

    # Parses String input into a URI; other objects are returned unchanged.
    def coerce_uri(uri)
      uri.is_a?(String) ? URI.parse(uri) : uri
    end
    private_class_method :coerce_uri

    # First non-empty proxy URL among the candidate environment keys.
    def proxy_url_for(scheme)
      proxy_env_keys(scheme).each do |key|
        value = env_value(key)
        return value if value
      end

      nil
    end
    private_class_method :proxy_url_for

    # Environment value for +key+; nil when it is unset or empty.
    def env_value(key)
      value = ENV.fetch(key, nil)
      value unless value.nil? || value.empty?
    end
    private_class_method :env_value

    # Host used for no_proxy matching (bracket-free hostname preferred).
    def proxy_host(uri)
      uri.hostname || uri.host
    end
    private_class_method :proxy_host

    # True when +host+ equals +pattern+ or is a subdomain of it. A leading
    # dot on the pattern is ignored and "*" matches every host. Host names
    # are case-insensitive, so both sides are compared lower-cased.
    def proxy_pattern_match?(host, pattern)
      return true if pattern == '*'

      normalized_pattern = pattern.delete_prefix('.').downcase
      # A bare "." would otherwise turn into the suffix ".", matching any
      # host written with a trailing dot.
      return false if normalized_pattern.empty?

      normalized_host = host.downcase
      normalized_host == normalized_pattern || normalized_host.end_with?(".#{normalized_pattern}")
    end
    private_class_method :proxy_pattern_match?

    # Candidate environment keys in lookup order: scheme-specific keys,
    # then the http_proxy keys, then the all_proxy keys.
    def proxy_env_keys(scheme)
      scheme_keys = if scheme && !scheme.empty?
                      ["#{scheme.downcase}_proxy", "#{scheme.upcase}_PROXY"]
                    else
                      []
                    end

      keys = (scheme_keys + HTTP_PROXY_KEYS + ALL_PROXY_KEYS).uniq
      # HTTP_PROXY may be controlled by the remote client when running as a
      # CGI program, so it is not consulted there.
      cgi_environment? ? keys - [CGI_UNSAFE_PROXY_KEY] : keys
    end
    private_class_method :proxy_env_keys

    # True when REQUEST_METHOD is set to a non-empty value.
    def cgi_environment?
      !env_value(CGI_MARKER_KEY).nil?
    end
    private_class_method :cgi_environment?

    # Normalize a URI by delegating to URI#normalize
    # @param uri [URI::Generic] The URI to normalize
    # @return [URI::Generic] Normalized URI
    def normalize_uri(uri)
      uri.normalize
    end

    # Merge a relative URI with a base URI
    # @param base [URI::Generic] The base URI
    # @param relative [String, URI::Generic] The relative URI
    # @return [URI::Generic] The merged URI
    def merge_uri(base, relative)
      base.merge(relative)
    end

    # Default port for a URI object or a scheme name.
    # For URI objects the URI's own default port wins; DEFAULT_PORTS is the
    # fallback. Scheme names are looked up case-insensitively.
    # @param uri_or_scheme [URI::Generic, #to_s] URI object or scheme name
    # @return [Integer, nil] Default port, or nil when unknown
    def default_port_for(uri_or_scheme)
      case uri_or_scheme
      when URI::Generic
        uri_or_scheme.default_port || DEFAULT_PORTS[uri_or_scheme.scheme.to_s.downcase]
      else
        DEFAULT_PORTS[uri_or_scheme.to_s.downcase]
      end
    end
  end
  # rubocop:enable Metrics/ModuleLength
end
@@ -8,6 +8,11 @@ module DomainExtractor
8
8
  IPV4_REGEX = /\A#{IPV4_SEGMENT}(?:\.#{IPV4_SEGMENT}){3}\z/
9
9
  IPV6_REGEX = /\A\[?[0-9a-fA-F:]+\]?\z/
10
10
 
11
+ # Valid hostname pattern (RFC 1123)
12
+ # Allows: letters, numbers, hyphens, dots
13
+ # Must start with alphanumeric, can contain hyphens, must end with alphanumeric
14
+ HOSTNAME_REGEX = /\A[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?)*\z/i
15
+
11
16
  # Frozen string constants
12
17
  DOT = '.'
13
18
  COLON = ':'
@@ -27,5 +32,15 @@ module DomainExtractor
27
32
  false
28
33
  end
29
34
  end
35
+
36
+ # Check if a string is a valid hostname
37
+ # @param host [String] The hostname to validate
38
+ # @return [Boolean] True if valid hostname
39
# Check if a string is a valid hostname
#
# A candidate must be present, non-empty, at most 253 characters long and
# match HOSTNAME_REGEX.
#
# @param host [String, nil] The hostname to validate
# @return [Boolean] True if valid hostname
def valid_hostname?(host)
  !host.nil? && !host.empty? && host.length <= 253 && HOSTNAME_REGEX.match?(host)
end
30
45
  end
31
46
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DomainExtractor
4
- VERSION = '0.2.7'
4
+ VERSION = '0.2.9'
5
5
  end