tldts 7.1.0 → 7.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cjs/index.js CHANGED
@@ -93,130 +93,294 @@ function getDomainWithoutSuffix$1(domain, suffix) {
93
93
  }
94
94
 
95
95
  /**
96
- * @param url - URL we want to extract a hostname from.
97
- * @param urlIsValidHostname - hint from caller; true if `url` is already a valid hostname.
96
+ * Matches an ASCII tab (U+0009) or newline (U+000A / U+000D). The WHATWG URL
97
+ * parser strips these before parsing; we only allocate a cleaned copy (and
98
+ * re-parse) on the rare input that actually contains one.
99
+ */
100
+ const CONTROL_CHARS = /[\t\n\r]/g; // `g` so replace() removes every occurrence; in the code shown here it is only passed to String.prototype.replace (a /g regex is stateful with test()/exec())
101
+ /**
102
+ * Classify the scheme `url.slice(schemeStart, colonIndex)` as a WHATWG special
103
+ * scheme by comparing char codes in place, so no substring is allocated.
104
+ * `code | 32` folds 'A'-'Z' onto 'a'-'z', making the match ASCII case-insensitive; every code compared below can only come from that letter or its uppercase form.
105
+ * Special schemes: ftp, file, http, https, ws, wss (https://url.spec.whatwg.org/#special-scheme).
106
+ * Parameters: `url` is the string being scanned; `schemeStart` (inclusive) and `colonIndex` (exclusive) bound the scheme within it.
107
+ * @returns 0 = not special, 1 = special, 2 = file (its host sits only between
108
+ * "//" and the next slash).
109
+ */
110
+ function getSpecialScheme(url, schemeStart, colonIndex) {
111
+ const length = colonIndex - schemeStart; // number of chars in the scheme; branch on it so only same-length candidates are compared
112
+ const c0 = url.charCodeAt(schemeStart) | 32; // first scheme char, folded to lowercase
113
+ if (length === 2) {
114
+ return c0 === 119 && (url.charCodeAt(schemeStart + 1) | 32) === 115 ? 1 : 0; // ws (119 = 'w', 115 = 's')
115
+ }
116
+ else if (length === 3) {
117
+ const c1 = url.charCodeAt(schemeStart + 1) | 32;
118
+ const c2 = url.charCodeAt(schemeStart + 2) | 32;
119
+ if (c0 === 119 && c1 === 115 && c2 === 115)
120
+ return 1; // wss (119 = 'w', 115 = 's', 115 = 's')
121
+ if (c0 === 102 && c1 === 116 && c2 === 112)
122
+ return 1; // ftp (102 = 'f', 116 = 't', 112 = 'p')
123
+ return 0;
124
+ }
125
+ else if (length === 4) {
126
+ const c1 = url.charCodeAt(schemeStart + 1) | 32;
127
+ const c2 = url.charCodeAt(schemeStart + 2) | 32;
128
+ const c3 = url.charCodeAt(schemeStart + 3) | 32;
129
+ if (c0 === 104 && c1 === 116 && c2 === 116 && c3 === 112)
130
+ return 1; // http (104 = 'h', 116 = 't', 116 = 't', 112 = 'p')
131
+ if (c0 === 102 && c1 === 105 && c2 === 108 && c3 === 101)
132
+ return 2; // file (102 = 'f', 105 = 'i', 108 = 'l', 101 = 'e'); the only scheme reported with the distinct code 2
133
+ return 0;
134
+ }
135
+ else if (length === 5) {
136
+ return c0 === 104 &&
137
+ (url.charCodeAt(schemeStart + 1) | 32) === 116 &&
138
+ (url.charCodeAt(schemeStart + 2) | 32) === 116 &&
139
+ (url.charCodeAt(schemeStart + 3) | 32) === 112 &&
140
+ (url.charCodeAt(schemeStart + 4) | 32) === 115
141
+ ? 1
142
+ : 0; // the whole conditional tests for "https" (104, 116, 116, 112, 115)
143
+ }
144
+ return 0; // any other length (including 0) matches none of the six special schemes
145
+ }
146
+ /**
147
+ * Extract a hostname from `url`, matching a WHATWG URL parser's host-boundary
148
+ * behaviour (https://url.spec.whatwg.org/#concept-basic-url-parser) for tldts'
149
+ * scope. It deliberately does NOT normalise the host (no IDNA/punycode or IPv4
150
+ * canonicalisation; IPv6 brackets are stripped, not compressed), strips trailing
151
+ * dots, and stays lenient where a strict parser rejects (bare host:port,
152
+ * out-of-range port, user@host) — all documented deviations.
153
+ *
154
+ * @param urlIsValidHostname - when true, `url` is already a valid hostname and is
155
+ * returned by the same reference (factory.ts skips re-validation on that
156
+ * identity), keeping the common path allocation-free.
98
157
  */
99
158
  function extractHostname(url, urlIsValidHostname) {
100
159
  let start = 0;
101
160
  let end = url.length;
102
161
  let hasUpper = false;
103
- // If url is not already a valid hostname, then try to extract hostname.
162
+ let isSpecial = false;
104
163
  if (!urlIsValidHostname) {
105
- // Special handling of data URLs
164
+ // Data URLs never carry a host (and may be huge — short-circuit them).
106
165
  if (url.startsWith('data:')) {
107
166
  return null;
108
167
  }
109
- // Trim leading spaces
168
+ // WHATWG step 1: trim leading/trailing C0 control or space (<= U+0020).
169
+ // Tab/newline elsewhere are handled lazily below.
110
170
  while (start < url.length && url.charCodeAt(start) <= 32) {
111
171
  start += 1;
112
172
  }
113
- // Trim trailing spaces
114
173
  while (end > start + 1 && url.charCodeAt(end - 1) <= 32) {
115
174
  end -= 1;
116
175
  }
117
- // Skip scheme.
118
176
  if (url.charCodeAt(start) === 47 /* '/' */ &&
119
177
  url.charCodeAt(start + 1) === 47 /* '/' */) {
178
+ // Scheme-relative reference ("//host/path").
120
179
  start += 2;
121
180
  }
122
181
  else {
123
182
  const indexOfProtocol = url.indexOf(':/', start);
124
183
  if (indexOfProtocol !== -1) {
125
- // Implement fast-path for common protocols. We expect most protocols
126
- // should be one of these 4 and thus we will not need to perform the
127
- // more expansive validity check most of the time.
128
- const protocolSize = indexOfProtocol - start;
129
- const c0 = url.charCodeAt(start);
130
- const c1 = url.charCodeAt(start + 1);
131
- const c2 = url.charCodeAt(start + 2);
132
- const c3 = url.charCodeAt(start + 3);
133
- const c4 = url.charCodeAt(start + 4);
134
- if (protocolSize === 5 &&
135
- c0 === 104 /* 'h' */ &&
136
- c1 === 116 /* 't' */ &&
137
- c2 === 116 /* 't' */ &&
138
- c3 === 112 /* 'p' */ &&
139
- c4 === 115 /* 's' */) ;
140
- else if (protocolSize === 4 &&
141
- c0 === 104 /* 'h' */ &&
142
- c1 === 116 /* 't' */ &&
143
- c2 === 116 /* 't' */ &&
144
- c3 === 112 /* 'p' */) ;
145
- else if (protocolSize === 3 &&
146
- c0 === 119 /* 'w' */ &&
147
- c1 === 115 /* 's' */ &&
148
- c2 === 115 /* 's' */) ;
149
- else if (protocolSize === 2 &&
150
- c0 === 119 /* 'w' */ &&
151
- c1 === 115 /* 's' */) ;
184
+ // "scheme://…". Classify the scheme, then position `start` at the host.
185
+ const special = getSpecialScheme(url, start, indexOfProtocol);
186
+ if (special === 1) {
187
+ // Special scheme: skip the run of '/' and '\' after it
188
+ // (special-authority-(ignore-)slashes states; '\' acts as '/').
189
+ isSpecial = true;
190
+ start = indexOfProtocol + 2;
191
+ while (url.charCodeAt(start) === 47 /* '/' */ ||
192
+ url.charCodeAt(start) === 92 /* '\' */) {
193
+ start += 1;
194
+ }
195
+ }
196
+ else if (special === 2) {
197
+ // file: the host is only what sits between "//" and the next slash, so
198
+ // "file://h/x" => "h" but "file:///x" / "file:/x" => no host.
199
+ isSpecial = true;
200
+ start = indexOfProtocol + 1;
201
+ let slashes = 0;
202
+ while ((url.charCodeAt(start) === 47 || url.charCodeAt(start) === 92) &&
203
+ slashes < 2) {
204
+ start += 1;
205
+ slashes += 1;
206
+ }
207
+ if (slashes < 2) {
208
+ return null;
209
+ }
210
+ }
152
211
  else {
153
- // Check that scheme is valid
212
+ // Unknown scheme: validate the WHATWG scheme grammar [A-Za-z0-9+.-];
213
+ // a control char means it was split by a tab/newline (strip + re-parse).
154
214
  for (let i = start; i < indexOfProtocol; i += 1) {
155
- const lowerCaseCode = url.charCodeAt(i) | 32;
156
- if (!(((lowerCaseCode >= 97 && lowerCaseCode <= 122) || // [a, z]
157
- (lowerCaseCode >= 48 && lowerCaseCode <= 57) || // [0, 9]
158
- lowerCaseCode === 46 || // '.'
159
- lowerCaseCode === 45 || // '-'
160
- lowerCaseCode === 43) // '+'
215
+ const code = url.charCodeAt(i) | 32;
216
+ if (!(((code >= 97 && code <= 122) || // [a, z]
217
+ (code >= 48 && code <= 57) || // [0, 9]
218
+ code === 46 || // '.'
219
+ code === 45 || // '-'
220
+ code === 43) // '+'
161
221
  )) {
222
+ const raw = url.charCodeAt(i);
223
+ if (raw === 9 || raw === 10 || raw === 13) {
224
+ return extractHostname(url.replace(CONTROL_CHARS, ''), urlIsValidHostname);
225
+ }
162
226
  return null;
163
227
  }
164
228
  }
229
+ // A non-special scheme has an authority only after "//" (else it is an
230
+ // opaque path with no host). `indexOf(':/')` already gave the first '/'.
231
+ if (url.charCodeAt(indexOfProtocol + 2) === 47 /* '/' */) {
232
+ start = indexOfProtocol + 3;
233
+ }
234
+ else {
235
+ return null;
236
+ }
237
+ }
238
+ }
239
+ else if (url.charCodeAt(start) !== 91 /* '[' */) {
240
+ // Cold path: no scheme "://", and not a bare IPv6 literal (whose first
241
+ // ':' would otherwise look like a scheme separator; "[…]" falls through
242
+ // to the ipv6 handling below). May be a bare host, a host:port, a
243
+ // user@host, a slash-less special scheme ("https:host"), or an opaque
244
+ // URI ("mailto:", "tel:", "urn:…").
245
+ let indexOfColon = -1;
246
+ for (let i = start; i < end; i += 1) {
247
+ const code = url.charCodeAt(i);
248
+ if (code === 9 || code === 10 || code === 13) {
249
+ return extractHostname(url.replace(CONTROL_CHARS, ''), urlIsValidHostname);
250
+ }
251
+ if (code === 58 /* ':' */) {
252
+ indexOfColon = i;
253
+ break;
254
+ }
255
+ if (code === 47 || code === 92 || code === 63 || code === 35) {
256
+ break;
257
+ }
165
258
  }
166
- // Skip 0, 1 or more '/' after ':/'
167
- start = indexOfProtocol + 2;
168
- while (url.charCodeAt(start) === 47 /* '/' */) {
169
- start += 1;
259
+ if (indexOfColon !== -1) {
260
+ // An '@' before the next delimiter => the ':' is userinfo, not a
261
+ // scheme ("user:pass@host", "mailto:a@b"): keep the whole authority.
262
+ let hasIdentifier = false;
263
+ for (let i = indexOfColon + 1; i < end; i += 1) {
264
+ const code = url.charCodeAt(i);
265
+ if (code === 47 || code === 92 || code === 63 || code === 35) {
266
+ break;
267
+ }
268
+ if (code === 64 /* '@' */) {
269
+ hasIdentifier = true;
270
+ break;
271
+ }
272
+ }
273
+ if (!hasIdentifier) {
274
+ // All-digits after ':' => a bare "host:port" (tldts accepts
275
+ // hostnames too); keep `start` and let the port handling trim it.
276
+ let allDigits = true;
277
+ let i = indexOfColon + 1;
278
+ for (; i < end; i += 1) {
279
+ const code = url.charCodeAt(i);
280
+ if (code === 47 || code === 92 || code === 63 || code === 35) {
281
+ break;
282
+ }
283
+ if (code < 48 /* '0' */ || code > 57 /* '9' */) {
284
+ allDigits = false;
285
+ break;
286
+ }
287
+ }
288
+ if (i === indexOfColon + 1) {
289
+ allDigits = false; // nothing after ':' => not a port
290
+ }
291
+ if (!allDigits) {
292
+ const special = getSpecialScheme(url, start, indexOfColon);
293
+ if (special === 0) {
294
+ // No "://" anywhere on the cold path, so a non-special scheme has
295
+ // no authority: opaque path, no host ("mailto:x", "foo:bar").
296
+ return null;
297
+ }
298
+ isSpecial = true;
299
+ start = indexOfColon + 1;
300
+ if (special === 2) {
301
+ // file (e.g. "file:\\host"): host only between "//" and next slash.
302
+ let slashes = 0;
303
+ while ((url.charCodeAt(start) === 47 ||
304
+ url.charCodeAt(start) === 92) &&
305
+ slashes < 2) {
306
+ start += 1;
307
+ slashes += 1;
308
+ }
309
+ if (slashes < 2) {
310
+ return null;
311
+ }
312
+ }
313
+ else {
314
+ while (url.charCodeAt(start) === 47 ||
315
+ url.charCodeAt(start) === 92) {
316
+ start += 1;
317
+ }
318
+ }
319
+ }
320
+ }
170
321
  }
171
322
  }
172
323
  }
173
- // Detect first occurrence of '/', '?' or '#'. We also keep track of the
174
- // last occurrence of '@', ']' or ':' to speed-up subsequent parsing of
175
- // (respectively), identifier, ipv6 or port.
324
+ // Find the host's end: first '/', '?' or '#' (and '\' for special URLs,
325
+ // which WHATWG treats like '/'). Track the last '@', ']' and ':' for
326
+ // userinfo, ipv6 and port; flag uppercase and a stray tab/newline. The loop
327
+ // is split on `code < 64` so common host characters take fewer comparisons.
176
328
  let indexOfIdentifier = -1;
177
329
  let indexOfClosingBracket = -1;
178
330
  let indexOfPort = -1;
331
+ let hasControl = false;
179
332
  for (let i = start; i < end; i += 1) {
180
333
  const code = url.charCodeAt(i);
181
- if (code === 35 || // '#'
182
- code === 47 || // '/'
183
- code === 63 // '?'
184
- ) {
334
+ if (code < 64) {
335
+ if (code === 47 || code === 35 || code === 63) {
336
+ end = i;
337
+ break;
338
+ }
339
+ else if (code === 58 /* ':' */) {
340
+ indexOfPort = i;
341
+ }
342
+ else if (code === 9 || code === 10 || code === 13) {
343
+ hasControl = true;
344
+ }
345
+ }
346
+ else if (isSpecial && code === 92 /* '\' */) {
185
347
  end = i;
186
348
  break;
187
349
  }
188
- else if (code === 64) {
189
- // '@'
350
+ else if (code === 64 /* '@' */) {
190
351
  indexOfIdentifier = i;
191
352
  }
192
- else if (code === 93) {
193
- // ']'
353
+ else if (code === 93 /* ']' */) {
194
354
  indexOfClosingBracket = i;
195
355
  }
196
- else if (code === 58) {
197
- // ':'
198
- indexOfPort = i;
199
- }
200
356
  else if (code >= 65 && code <= 90) {
201
357
  hasUpper = true;
202
358
  }
203
359
  }
204
- // Detect identifier: '@'
360
+ // A tab/newline inside the authority: strip everything and re-parse (rare).
361
+ if (hasControl) {
362
+ return extractHostname(url.replace(CONTROL_CHARS, ''), urlIsValidHostname);
363
+ }
364
+ // Skip userinfo. '>= start' so an empty userinfo ("http://@host") works too.
205
365
  if (indexOfIdentifier !== -1 &&
206
- indexOfIdentifier > start &&
366
+ indexOfIdentifier >= start &&
207
367
  indexOfIdentifier < end) {
208
368
  start = indexOfIdentifier + 1;
209
369
  }
210
- // Handle ipv6 addresses
211
370
  if (url.charCodeAt(start) === 91 /* '[' */) {
371
+ // ipv6 address: return what is between the brackets, or null if unclosed.
212
372
  if (indexOfClosingBracket !== -1) {
213
373
  return url.slice(start + 1, indexOfClosingBracket).toLowerCase();
214
374
  }
215
375
  return null;
216
376
  }
217
377
  else if (indexOfPort !== -1 && indexOfPort > start && indexOfPort < end) {
218
- // Detect port: ':'
219
- end = indexOfPort;
378
+ end = indexOfPort; // trim ':port'
379
+ }
380
+ // Empty authority ("http://", "file:///path", "//"); only reachable here via
381
+ // extraction — a bare valid hostname never lands here.
382
+ if (start >= end) {
383
+ return null;
220
384
  }
221
385
  }
222
386
  // Trim trailing dots
@@ -283,7 +447,7 @@ function isProbablyIpv6(hostname) {
283
447
  }
284
448
  else if (!(((code >= 48 && code <= 57) || // 0-9
285
449
  (code >= 97 && code <= 102) || // a-f
286
- (code >= 65 && code <= 90)) // A-F
450
+ (code >= 65 && code <= 70)) // A-F (RFC 4291 §2.2: an IPv6 hextet is hex digits only)
287
451
  )) {
288
452
  return false;
289
453
  }