tldts-core 7.2.1 → 7.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,37 @@
5
5
  */
6
6
  const CONTROL_CHARS = /[\t\n\r]/g;
7
7
 
8
+ // Set by `extractHostname` (a module-scope flag, read synchronously by
9
+ // `parseImpl` right after the call — same pattern as the reused RESULT object).
10
+ // `true` ONLY when extraction validated the returned host inline (a confirmed-
11
+ // valid, "simple" authority) so `parseImpl` can skip the separate
12
+ // `isValidHostname` pass. `false` in every other case (validation disabled, a
13
+ // complex authority — userinfo/port/brackets/trailing-dot/control — an invalid
14
+ // host, or a non-main return path); `parseImpl` then validates as usual. The
15
+ // fast path can only ever SKIP a redundant scan for hosts already known valid,
16
+ // never accept an invalid one.
17
+ export let extractedHostnameValidated = false;
18
+
19
+ /**
20
+ * True if char `code` is a valid hostname character. This is the per-char half
21
+ * of `is-valid.ts`'s `isValidAscii` (a-z, 0-9, > U+007F) PLUS three additions:
22
+ * A-Z (the host is lowercased before validation, so uppercase ≡ a valid
23
+ * lowercase letter) and '-' / '_' (valid inside a label). KEEP IN SYNC with
24
+ * `is-valid.ts`: these rules are deliberately duplicated to validate during
25
+ * extraction, so any change to the accepted character set there must be
26
+ * mirrored here (and vice-versa).
27
+ */
28
+ function isValidHostnameChar(code: number): boolean {
29
+ return (
30
+ (code >= 97 && code <= 122) || // a-z
31
+ (code >= 48 && code <= 57) || // 0-9
32
+ code > 127 || // non-ASCII (accepted, not punycode-checked)
33
+ (code >= 65 && code <= 90) || // A-Z (becomes valid once lowercased)
34
+ code === 45 || // '-'
35
+ code === 95 // '_'
36
+ );
37
+ }
38
+
8
39
  /**
9
40
  * Classify scheme `url.slice(schemeStart, colonIndex)` as a WHATWG special
10
41
  * scheme without allocating a substring (case-insensitive via `| 32`).
@@ -59,15 +90,20 @@ function getSpecialScheme(
59
90
  * @param urlIsValidHostname - when true, `url` is already a valid hostname and is
60
91
  * returned by the same reference (factory.ts skips re-validation on that
61
92
  * identity), keeping the common path allocation-free.
93
+ * @param validate - when true, validate the host inline during the authority
94
+ * scan and publish the verdict via `extractedHostnameValidated` so `parseImpl`
95
+ * can skip the redundant `isValidHostname` pass for simple authorities.
62
96
  */
63
97
  export default function extractHostname(
64
98
  url: string,
65
99
  urlIsValidHostname: boolean,
100
+ validate = false,
66
101
  ): string | null {
67
102
  let start = 0;
68
103
  let end: number = url.length;
69
104
  let hasUpper = false;
70
105
  let isSpecial = false;
106
+ extractedHostnameValidated = false;
71
107
 
72
108
  if (!urlIsValidHostname) {
73
109
  // Data URLs never carry a host (and may be huge — short-circuit them).
@@ -143,6 +179,7 @@ export default function extractHostname(
143
179
  return extractHostname(
144
180
  url.replace(CONTROL_CHARS, ''),
145
181
  urlIsValidHostname,
182
+ validate,
146
183
  );
147
184
  }
148
185
  return null;
@@ -169,6 +206,7 @@ export default function extractHostname(
169
206
  return extractHostname(
170
207
  url.replace(CONTROL_CHARS, ''),
171
208
  urlIsValidHostname,
209
+ validate,
172
210
  );
173
211
  }
174
212
  if (code === 58 /* ':' */) {
@@ -279,11 +317,36 @@ export default function extractHostname(
279
317
  // '@') to tell a bare IPv6 (>= 2 colons) from a host:port (exactly one);
280
318
  // flag uppercase and a stray tab/newline. The loop is split on `code < 64`
281
319
  // so common host characters take fewer comparisons.
320
+ //
321
+ // When `validate`, also accumulate `is-valid.ts`'s checks over the scanned
322
+ // run so a simple authority's host can be validated in this single pass.
323
+ // `vValid` only stays meaningful for a "simple" authority (no userinfo, port,
324
+ // brackets, control or trailing dot); those cases clear it / are rejected by
325
+ // the guard below, falling back to `isValidHostname`.
282
326
  let indexOfIdentifier = -1;
283
327
  let indexOfClosingBracket = -1;
284
328
  let indexOfPort = -1;
285
329
  let indexOfFirstColon = -1;
286
330
  let hasControl = false;
331
+ let vValid = validate; // seeded true when validating; cleared on the first invalid char
332
+ let vLastDot = start - 1; // mirrors is-valid.ts `lastDotIndex = -1` at host start
333
+ let vLastCode = -1;
334
+ if (validate && start < end) {
335
+ // First-char rule: must be a valid host char, '.', or '_' (NOT '-').
336
+ const c0 = url.charCodeAt(start);
337
+ if (
338
+ !(
339
+ /*@__INLINE__*/ (
340
+ isValidHostnameChar(c0) ||
341
+ c0 === 46 /* '.' */ ||
342
+ c0 === 95 /* '_' */
343
+ )
344
+ ) ||
345
+ c0 === 45 /* '-' (isValidHostnameChar allows it mid-label, not first) */
346
+ ) {
347
+ vValid = false;
348
+ }
349
+ }
287
350
  for (let i = start; i < end; i += 1) {
288
351
  const code: number = url.charCodeAt(i);
289
352
  if (code < 64) {
@@ -297,6 +360,22 @@ export default function extractHostname(
297
360
  indexOfPort = i;
298
361
  } else if (code === 9 || code === 10 || code === 13) {
299
362
  hasControl = true;
363
+ } else if (validate) {
364
+ if (code === 46 /* '.' */) {
365
+ if (i - vLastDot > 64 || vLastCode === 46 || vLastCode === 45) {
366
+ vValid = false;
367
+ }
368
+ vLastDot = i;
369
+ } else if (code < 48 || code > 57) {
370
+ // < 64 and not a delimiter/dot/digit => only '-' (45) is a valid
371
+ // host char here; everything else (space, %, !, etc.) is invalid.
372
+ // A '-' must also not START a label (the byte right after a '.') —
373
+ // mirrors is-valid.ts; the first label is covered by the first-char
374
+ // rule above. (RFC 1034 §3.5 / RFC 1035 §2.3.1 LDH.)
375
+ if (code !== 45 || vLastCode === 46 /* label-leading '-' */) {
376
+ vValid = false;
377
+ }
378
+ }
300
379
  }
301
380
  } else if (isSpecial && code === 92 /* '\' */) {
302
381
  end = i;
@@ -308,6 +387,12 @@ export default function extractHostname(
308
387
  indexOfClosingBracket = i;
309
388
  } else if (code >= 65 && code <= 90) {
310
389
  hasUpper = true;
390
+ } else if (validate && !(/*@__INLINE__*/ isValidHostnameChar(code))) {
391
+ // >= 64, not '@'/']'/upper: valid only if a-z, '_', or non-ASCII.
392
+ vValid = false;
393
+ }
394
+ if (validate) {
395
+ vLastCode = code;
311
396
  }
312
397
  }
313
398
 
@@ -316,6 +401,7 @@ export default function extractHostname(
316
401
  return extractHostname(
317
402
  url.replace(CONTROL_CHARS, ''),
318
403
  urlIsValidHostname,
404
+ validate,
319
405
  );
320
406
  }
321
407
 
@@ -351,6 +437,31 @@ export default function extractHostname(
351
437
  if (start >= end) {
352
438
  return null;
353
439
  }
440
+
441
+ // Publish the inline-validation verdict — but only for a "simple" authority,
442
+ // where the scanned run equals the final host: no userinfo skip, no port
443
+ // trim, no brackets, no trailing dot (trimmed below), and length within RFC
444
+ // limits. Anything else leaves it `false` so `parseImpl` re-validates.
445
+ //
446
+ // Every clause below is load-bearing for CORRECTNESS, not just speed: the
447
+ // loop accumulates `vValid` over the whole scanned run (it does not stop at
448
+ // ':' or '@', so any port/userinfo bytes are included), so the verdict is
449
+ // only sound when that run equals the final host. Do not drop a clause as
450
+ // "redundant" — e.g. without `indexOfPort === -1`, `host:8080` would be
451
+ // wrongly accepted.
452
+ if (
453
+ validate &&
454
+ vValid &&
455
+ indexOfIdentifier === -1 &&
456
+ indexOfPort === -1 &&
457
+ indexOfClosingBracket === -1 &&
458
+ url.charCodeAt(end - 1) !== 46 /* no trailing dot */ &&
459
+ end - start <= 255 && // total length
460
+ end - vLastDot - 1 <= 63 && // last label length
461
+ vLastCode !== 45 /* last char not '-' */
462
+ ) {
463
+ extractedHostnameValidated = true;
464
+ }
354
465
  }
355
466
 
356
467
  // Trim trailing dots
package/src/factory.ts CHANGED
@@ -6,7 +6,9 @@
6
6
 
7
7
  import getDomain from './domain';
8
8
  import getDomainWithoutSuffix from './domain-without-suffix';
9
- import extractHostname from './extract-hostname';
9
+ import extractHostname, {
10
+ extractedHostnameValidated,
11
+ } from './extract-hostname';
10
12
  import isIp from './is-ip';
11
13
  import isSpecialUse from './is-special-use';
12
14
  import isValidHostname from './is-valid';
@@ -120,9 +122,13 @@ export function parseImpl(
120
122
  result.hostname = url;
121
123
  } else if (options.mixedInputs) {
122
124
  urlIsValid = isValidHostname(url);
123
- result.hostname = extractHostname(url, urlIsValid);
125
+ result.hostname = extractHostname(
126
+ url,
127
+ urlIsValid,
128
+ options.validateHostname,
129
+ );
124
130
  } else {
125
- result.hostname = extractHostname(url, false);
131
+ result.hostname = extractHostname(url, false, options.validateHostname);
126
132
  }
127
133
 
128
134
  // Check if `hostname` is a valid ip address
@@ -144,6 +150,9 @@ export function parseImpl(
144
150
  // Skip the re-scan when `url` was already validated and extractHostname
145
151
  // returned it unchanged (same reference => identical string, still valid).
146
152
  !(urlIsValid && result.hostname === url) &&
153
+ // Skip the re-scan when extractHostname already validated the host inline
154
+ // (a confirmed-valid simple authority — see extract-hostname.ts).
155
+ !extractedHostnameValidated &&
147
156
  !isValidHostname(result.hostname)
148
157
  ) {
149
158
  result.hostname = null;
package/src/is-valid.ts CHANGED
@@ -7,6 +7,11 @@
7
7
  * If you need stricter validation, consider using an external library.
8
8
  */
9
9
 
10
+ // KEEP IN SYNC with `extract-hostname.ts` `isValidHostnameChar` + its inline
11
+ // scan/verdict, which duplicate these structural rules to validate during
12
+ // extraction (a perf fusion). That copy additionally accepts A-Z (the host is
13
+ // not yet lowercased there) and folds in '-' / '_'. Any change to the accepted
14
+ // character set or the label/length rules here must be mirrored there.
10
15
  function isValidAscii(code: number): boolean {
11
16
  return (
12
17
  (code >= 97 && code <= 122) || (code >= 48 && code <= 57) || code > 127
@@ -59,9 +64,14 @@ export default function (hostname: string): boolean {
59
64
 
60
65
  lastDotIndex = i;
61
66
  } else if (
62
- !(/*@__INLINE__*/ (isValidAscii(code) || code === 45 || code === 95))
67
+ // A forbidden character in the label...
68
+ !(/*@__INLINE__*/ (isValidAscii(code) || code === 45 || code === 95)) ||
69
+ // ...or a '-' starting a label (the byte right after a '.'). A label must
70
+ // not begin with a hyphen (RFC 1034 §3.5 / RFC 1035 §2.3.1 LDH, as amended
71
+ // by RFC 1123 §2.1; cf. UTS #46 CheckHyphens). The first label is covered by
72
+ // the leading-character guard above; mirrors the trailing-'-' rule below.
73
+ (code === 45 && lastCharCode === 46)
63
74
  ) {
64
- // Check if there is a forbidden character in the label
65
75
  return false;
66
76
  }
67
77