tldts-core 7.2.0 → 7.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,37 @@
5
5
  */
6
6
  const CONTROL_CHARS = /[\t\n\r]/g;
7
7
 
8
+ // Set by `extractHostname` (a module-scope flag, read synchronously by
9
+ // `parseImpl` right after the call — same pattern as the reused RESULT object).
10
+ // `true` ONLY when extraction validated the returned host inline (a confirmed-
11
+ // valid, "simple" authority) so `parseImpl` can skip the separate
12
+ // `isValidHostname` pass. `false` in every other case (validation disabled, a
13
+ // complex authority — userinfo/port/brackets/trailing-dot/control — an invalid
14
+ // host, or a non-main return path); `parseImpl` then validates as usual. The
15
+ // fast path can only ever SKIP a redundant scan for hosts already known valid,
16
+ // never accept an invalid one.
17
+ export let extractedHostnameValidated = false;
18
+
19
/**
 * Per-character hostname check used while scanning an authority.
 *
 * Accepts the same characters as `isValidAscii` in `is-valid.ts` (a-z, 0-9 and
 * anything above U+007F) and, in addition:
 *   - A-Z, because the host is lowercased before validation, so an uppercase
 *     letter stands for a valid lowercase one;
 *   - '-' and '_', which are valid inside a label.
 *
 * KEEP IN SYNC with `is-valid.ts`: the accepted character set is duplicated
 * here on purpose so validation can happen during extraction. Any change on
 * one side must be mirrored on the other.
 *
 * Written as a single return expression; call sites mark it with
 * `@__INLINE__` (presumably a minifier inlining hint — confirm before
 * restructuring into multiple statements).
 *
 * @param code - UTF-16 code unit, as returned by `String.prototype.charCodeAt`.
 * @returns `true` when `code` may appear in a hostname label.
 */
function isValidHostnameChar(code: number): boolean {
  // Range dispatch from the top of the code space down; a-z is decided first.
  return code >= 97
    ? code <= 122 || code > 127 // a-z, or non-ASCII (accepted, not punycode-checked)
    : code >= 65
      ? code <= 90 || code === 95 // A-Z (valid once lowercased), or '_'
      : code >= 48
        ? code <= 57 // 0-9
        : code === 45; // '-'
}
38
+
8
39
  /**
9
40
  * Classify scheme `url.slice(schemeStart, colonIndex)` as a WHATWG special
10
41
  * scheme without allocating a substring (case-insensitive via `| 32`).
@@ -59,15 +90,20 @@ function getSpecialScheme(
59
90
  * @param urlIsValidHostname - when true, `url` is already a valid hostname and is
60
91
  * returned by the same reference (factory.ts skips re-validation on that
61
92
  * identity), keeping the common path allocation-free.
93
+ * @param validate - when true, validate the host inline during the authority
94
+ * scan and publish the verdict via `extractedHostnameValidated` so `parseImpl`
95
+ * can skip the redundant `isValidHostname` pass for simple authorities.
62
96
  */
63
97
  export default function extractHostname(
64
98
  url: string,
65
99
  urlIsValidHostname: boolean,
100
+ validate = false,
66
101
  ): string | null {
67
102
  let start = 0;
68
103
  let end: number = url.length;
69
104
  let hasUpper = false;
70
105
  let isSpecial = false;
106
+ extractedHostnameValidated = false;
71
107
 
72
108
  if (!urlIsValidHostname) {
73
109
  // Data URLs never carry a host (and may be huge — short-circuit them).
@@ -143,6 +179,7 @@ export default function extractHostname(
143
179
  return extractHostname(
144
180
  url.replace(CONTROL_CHARS, ''),
145
181
  urlIsValidHostname,
182
+ validate,
146
183
  );
147
184
  }
148
185
  return null;
@@ -169,6 +206,7 @@ export default function extractHostname(
169
206
  return extractHostname(
170
207
  url.replace(CONTROL_CHARS, ''),
171
208
  urlIsValidHostname,
209
+ validate,
172
210
  );
173
211
  }
174
212
  if (code === 58 /* ':' */) {
@@ -217,32 +255,54 @@ export default function extractHostname(
217
255
  if (!allDigits) {
218
256
  const special = getSpecialScheme(url, start, indexOfColon);
219
257
  if (special === 0) {
220
- // No "://" anywhere on the cold path, so a non-special scheme has
221
- // no authority: opaque path, no host ("mailto:x", "foo:bar").
222
- return null;
223
- }
224
- isSpecial = true;
225
- start = indexOfColon + 1;
226
- if (special === 2) {
227
- // file (e.g. "file:\\host"): host only between "//" and next slash.
228
- let slashes = 0;
229
- while (
230
- (url.charCodeAt(start) === 47 ||
231
- url.charCodeAt(start) === 92) &&
232
- slashes < 2
233
- ) {
234
- start += 1;
235
- slashes += 1;
258
+ // No "://" anywhere on the cold path and not a special scheme.
259
+ // A second ':' before the host's end marks a bare, unbracketed
260
+ // IPv6 literal ("2a01:e35::1"): fall through and let the host
261
+ // loop + isIp classify it. Without one this is an opaque path
262
+ // with no host ("mailto:x", "foo:bar").
263
+ let isBareIpv6 = false;
264
+ for (let j = indexOfColon + 1; j < end; j += 1) {
265
+ const code = url.charCodeAt(j);
266
+ if (
267
+ code === 47 ||
268
+ code === 92 ||
269
+ code === 63 ||
270
+ code === 35
271
+ ) {
272
+ break;
273
+ }
274
+ if (code === 58 /* ':' */) {
275
+ isBareIpv6 = true;
276
+ break;
277
+ }
236
278
  }
237
- if (slashes < 2) {
279
+ if (!isBareIpv6) {
238
280
  return null;
239
281
  }
240
282
  } else {
241
- while (
242
- url.charCodeAt(start) === 47 ||
243
- url.charCodeAt(start) === 92
244
- ) {
245
- start += 1;
283
+ isSpecial = true;
284
+ start = indexOfColon + 1;
285
+ if (special === 2) {
286
+ // file (e.g. "file:\\host"): host only between "//" and next slash.
287
+ let slashes = 0;
288
+ while (
289
+ (url.charCodeAt(start) === 47 ||
290
+ url.charCodeAt(start) === 92) &&
291
+ slashes < 2
292
+ ) {
293
+ start += 1;
294
+ slashes += 1;
295
+ }
296
+ if (slashes < 2) {
297
+ return null;
298
+ }
299
+ } else {
300
+ while (
301
+ url.charCodeAt(start) === 47 ||
302
+ url.charCodeAt(start) === 92
303
+ ) {
304
+ start += 1;
305
+ }
246
306
  }
247
307
  }
248
308
  }
@@ -253,12 +313,40 @@ export default function extractHostname(
253
313
 
254
314
  // Find the host's end: first '/', '?' or '#' (and '\' for special URLs,
255
315
  // which WHATWG treats like '/'). Track the last '@', ']' and ':' for
256
- // userinfo, ipv6 and port; flag uppercase and a stray tab/newline. The loop
257
- // is split on `code < 64` so common host characters take fewer comparisons.
316
+ // userinfo, ipv6 and port, plus the first ':' of the host (reset at each
317
+ // '@') to tell a bare IPv6 (>= 2 colons) from a host:port (exactly one);
318
+ // flag uppercase and a stray tab/newline. The loop is split on `code < 64`
319
+ // so common host characters take fewer comparisons.
320
+ //
321
+ // When `validate`, also accumulate `is-valid.ts`'s checks over the scanned
322
+ // run so a simple authority's host can be validated in this single pass.
323
+ // `vValid` only stays meaningful for a "simple" authority (no userinfo, port,
324
+ // brackets, control or trailing dot); those cases clear it / are rejected by
325
+ // the guard below, falling back to `isValidHostname`.
258
326
  let indexOfIdentifier = -1;
259
327
  let indexOfClosingBracket = -1;
260
328
  let indexOfPort = -1;
329
+ let indexOfFirstColon = -1;
261
330
  let hasControl = false;
331
+ let vValid = validate; // seeded true when validating; cleared on the first invalid char
332
+ let vLastDot = start - 1; // mirrors is-valid.ts `lastDotIndex = -1` at host start
333
+ let vLastCode = -1;
334
+ if (validate && start < end) {
335
+ // First-char rule: must be a valid host char, '.', or '_' (NOT '-').
336
+ const c0 = url.charCodeAt(start);
337
+ if (
338
+ !(
339
+ /*@__INLINE__*/ (
340
+ isValidHostnameChar(c0) ||
341
+ c0 === 46 /* '.' */ ||
342
+ c0 === 95 /* '_' */
343
+ )
344
+ ) ||
345
+ c0 === 45 /* '-' (isValidHostnameChar allows it mid-label, not first) */
346
+ ) {
347
+ vValid = false;
348
+ }
349
+ }
262
350
  for (let i = start; i < end; i += 1) {
263
351
  const code: number = url.charCodeAt(i);
264
352
  if (code < 64) {
@@ -266,19 +354,42 @@ export default function extractHostname(
266
354
  end = i;
267
355
  break;
268
356
  } else if (code === 58 /* ':' */) {
357
+ if (indexOfFirstColon === -1) {
358
+ indexOfFirstColon = i;
359
+ }
269
360
  indexOfPort = i;
270
361
  } else if (code === 9 || code === 10 || code === 13) {
271
362
  hasControl = true;
363
+ } else if (validate) {
364
+ if (code === 46 /* '.' */) {
365
+ if (i - vLastDot > 64 || vLastCode === 46 || vLastCode === 45) {
366
+ vValid = false;
367
+ }
368
+ vLastDot = i;
369
+ } else if (code < 48 || code > 57) {
370
+ // < 64 and not a delimiter/dot/digit => only '-' (45) is a valid
371
+ // host char here; everything else (space, %, !, etc.) is invalid.
372
+ if (code !== 45) {
373
+ vValid = false;
374
+ }
375
+ }
272
376
  }
273
377
  } else if (isSpecial && code === 92 /* '\' */) {
274
378
  end = i;
275
379
  break;
276
380
  } else if (code === 64 /* '@' */) {
277
381
  indexOfIdentifier = i;
382
+ indexOfFirstColon = -1; // colons before '@' are userinfo, not the host
278
383
  } else if (code === 93 /* ']' */) {
279
384
  indexOfClosingBracket = i;
280
385
  } else if (code >= 65 && code <= 90) {
281
386
  hasUpper = true;
387
+ } else if (validate && !(/*@__INLINE__*/ isValidHostnameChar(code))) {
388
+ // >= 64, not '@'/']'/upper: valid only if a-z, '_', or non-ASCII.
389
+ vValid = false;
390
+ }
391
+ if (validate) {
392
+ vLastCode = code;
282
393
  }
283
394
  }
284
395
 
@@ -287,6 +398,7 @@ export default function extractHostname(
287
398
  return extractHostname(
288
399
  url.replace(CONTROL_CHARS, ''),
289
400
  urlIsValidHostname,
401
+ validate,
290
402
  );
291
403
  }
292
404
 
@@ -305,7 +417,15 @@ export default function extractHostname(
305
417
  return url.slice(start + 1, indexOfClosingBracket).toLowerCase();
306
418
  }
307
419
  return null;
308
- } else if (indexOfPort !== -1 && indexOfPort > start && indexOfPort < end) {
420
+ } else if (
421
+ indexOfPort !== -1 &&
422
+ indexOfPort > start &&
423
+ indexOfPort < end &&
424
+ // A host:port has exactly one ':' in the host (so its first ':' is its
425
+ // last); a bare, unbracketed IPv6 literal ("2a01:e35::1") has >= 2, so
426
+ // its first ':' precedes the last. Only the former has a ':port' to trim.
427
+ indexOfFirstColon === indexOfPort
428
+ ) {
309
429
  end = indexOfPort; // trim ':port'
310
430
  }
311
431
 
@@ -314,6 +434,31 @@ export default function extractHostname(
314
434
  if (start >= end) {
315
435
  return null;
316
436
  }
437
+
438
+ // Publish the inline-validation verdict — but only for a "simple" authority,
439
+ // where the scanned run equals the final host: no userinfo skip, no port
440
+ // trim, no brackets, no trailing dot (trimmed below), and length within RFC
441
+ // limits. Anything else leaves it `false` so `parseImpl` re-validates.
442
+ //
443
+ // Every clause below is load-bearing for CORRECTNESS, not just speed: the
444
+ // loop accumulates `vValid` over the whole scanned run (it does not stop at
445
+ // ':' or '@', so any port/userinfo bytes are included), so the verdict is
446
+ // only sound when that run equals the final host. Do not drop a clause as
447
+ // "redundant" — e.g. without `indexOfPort === -1`, `host-:8080` would be
448
+ // wrongly accepted (`vLastCode` would be the port's '0', not the host's '-').
449
+ if (
450
+ validate &&
451
+ vValid &&
452
+ indexOfIdentifier === -1 &&
453
+ indexOfPort === -1 &&
454
+ indexOfClosingBracket === -1 &&
455
+ url.charCodeAt(end - 1) !== 46 /* no trailing dot */ &&
456
+ end - start <= 255 && // total length
457
+ end - vLastDot - 1 <= 63 && // last label length
458
+ vLastCode !== 45 /* last char not '-' */
459
+ ) {
460
+ extractedHostnameValidated = true;
461
+ }
317
462
  }
318
463
 
319
464
  // Trim trailing dots
package/src/factory.ts CHANGED
@@ -6,7 +6,9 @@
6
6
 
7
7
  import getDomain from './domain';
8
8
  import getDomainWithoutSuffix from './domain-without-suffix';
9
- import extractHostname from './extract-hostname';
9
+ import extractHostname, {
10
+ extractedHostnameValidated,
11
+ } from './extract-hostname';
10
12
  import isIp from './is-ip';
11
13
  import isSpecialUse from './is-special-use';
12
14
  import isValidHostname from './is-valid';
@@ -16,9 +18,10 @@ import getSubdomain from './subdomain';
16
18
 
17
19
  export interface IResult {
18
20
  // `hostname` is either a registered name (including but not limited to a
19
- // hostname), or an IP address. IPv4 addresses must be in dot-decimal
20
- // notation, and IPv6 addresses must be enclosed in brackets ([]). This is
21
- // directly extracted from the input URL.
21
+ // hostname), or an IP address, directly extracted from the input URL. IPv4
22
+ // addresses are in dot-decimal notation. IPv6 is returned without its
23
+ // surrounding brackets; both bracketed (in URLs, e.g. `http://[::1]/`) and
24
+ // bare unbracketed (e.g. `2a01:e35::1`) IPv6 literals are accepted.
22
25
  hostname: string | null;
23
26
 
24
27
  // Is `hostname` an IP? (IPv4 or IPv6)
@@ -119,9 +122,13 @@ export function parseImpl(
119
122
  result.hostname = url;
120
123
  } else if (options.mixedInputs) {
121
124
  urlIsValid = isValidHostname(url);
122
- result.hostname = extractHostname(url, urlIsValid);
125
+ result.hostname = extractHostname(
126
+ url,
127
+ urlIsValid,
128
+ options.validateHostname,
129
+ );
123
130
  } else {
124
- result.hostname = extractHostname(url, false);
131
+ result.hostname = extractHostname(url, false, options.validateHostname);
125
132
  }
126
133
 
127
134
  // Check if `hostname` is a valid ip address
@@ -143,6 +150,9 @@ export function parseImpl(
143
150
  // Skip the re-scan when `url` was already validated and extractHostname
144
151
  // returned it unchanged (same reference => identical string, still valid).
145
152
  !(urlIsValid && result.hostname === url) &&
153
+ // Skip the re-scan when extractHostname already validated the host inline
154
+ // (a confirmed-valid simple authority — see extract-hostname.ts).
155
+ !extractedHostnameValidated &&
146
156
  !isValidHostname(result.hostname)
147
157
  ) {
148
158
  result.hostname = null;
package/src/is-valid.ts CHANGED
@@ -7,6 +7,11 @@
7
7
  * If you need stricter validation, consider using an external library.
8
8
  */
9
9
 
10
+ // KEEP IN SYNC with `extract-hostname.ts` `isValidHostnameChar` + its inline
11
+ // scan/verdict, which duplicate these structural rules to validate during
12
+ // extraction (a perf fusion). That copy additionally accepts A-Z (the host is
13
+ // not yet lowercased there) and folds in '-' / '_'. Any change to the accepted
14
+ // character set or the label/length rules here must be mirrored there.
10
15
  function isValidAscii(code: number): boolean {
11
16
  return (
12
17
  (code >= 97 && code <= 122) || (code >= 48 && code <= 57) || code > 127