tldts-core 7.1.0 → 7.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,64 @@
1
1
  /**
2
- * @param url - URL we want to extract a hostname from.
3
- * @param urlIsValidHostname - hint from caller; true if `url` is already a valid hostname.
2
+ * Matches an ASCII tab (U+0009) or newline (U+000A / U+000D). The WHATWG URL
3
+ * parser strips these before parsing; we only allocate a cleaned copy (and
4
+ * re-parse) on the rare input that actually contains one.
5
+ */
6
+ const CONTROL_CHARS = /[\t\n\r]/g;
7
+
8
/**
 * Tell whether the scheme stored in `url` between `schemeStart` (inclusive)
 * and `colonIndex` (exclusive) is one of the WHATWG "special" schemes:
 * ftp, file, http, https, ws, wss
 * (https://url.spec.whatwg.org/#special-scheme).
 *
 * The scheme is compared char code by char code, so no substring is created.
 * Every code is OR-ed with 32 before the comparison, which folds the ASCII
 * uppercase form of each expected letter onto its lowercase form and makes
 * the match case-insensitive.
 *
 * @param url - string that contains the scheme.
 * @param schemeStart - index of the first character of the scheme.
 * @param colonIndex - index right after the last character of the scheme.
 * @returns 0 when the scheme is not special, 2 when it is `file`, and 1 for
 * every other special scheme.
 */
function getSpecialScheme(
  url: string,
  schemeStart: number,
  colonIndex: number,
): number {
  const first = url.charCodeAt(schemeStart) | 32;

  // Dispatch on the scheme length: only lengths 2 to 5 can be special.
  switch (colonIndex - schemeStart) {
    case 2: {
      // ws
      const second = url.charCodeAt(schemeStart + 1) | 32;
      return first === 119 /* 'w' */ && second === 115 /* 's' */ ? 1 : 0;
    }

    case 3: {
      const second = url.charCodeAt(schemeStart + 1) | 32;
      const third = url.charCodeAt(schemeStart + 2) | 32;
      const isWss = first === 119 && second === 115 && third === 115;
      const isFtp = first === 102 && second === 116 && third === 112;
      return isWss || isFtp ? 1 : 0;
    }

    case 4: {
      const second = url.charCodeAt(schemeStart + 1) | 32;
      const third = url.charCodeAt(schemeStart + 2) | 32;
      const fourth = url.charCodeAt(schemeStart + 3) | 32;
      // http
      if (first === 104 && second === 116 && third === 116 && fourth === 112) {
        return 1;
      }
      // file gets its own code because its host rules differ.
      if (first === 102 && second === 105 && third === 108 && fourth === 101) {
        return 2;
      }
      return 0;
    }

    case 5: {
      // https: bail out on the first character that does not match.
      if (first !== 104) return 0;
      if ((url.charCodeAt(schemeStart + 1) | 32) !== 116) return 0;
      if ((url.charCodeAt(schemeStart + 2) | 32) !== 116) return 0;
      if ((url.charCodeAt(schemeStart + 3) | 32) !== 112) return 0;
      if ((url.charCodeAt(schemeStart + 4) | 32) !== 115) return 0;
      return 1;
    }

    default:
      return 0;
  }
}
50
+
51
+ /**
52
+ * Extract a hostname from `url`, matching a WHATWG URL parser's host-boundary
53
+ * behaviour (https://url.spec.whatwg.org/#concept-basic-url-parser) for tldts'
54
+ * scope. It deliberately does NOT normalise the host (no IDNA/punycode or IPv4
55
+ * canonicalisation; IPv6 brackets are stripped, not compressed), strips trailing
56
+ * dots, and stays lenient where a strict parser rejects (bare host:port,
57
+ * out-of-range port, user@host) — all documented deviations.
58
+ *
59
+ * @param urlIsValidHostname - when true, `url` is already a valid hostname and is
60
+ * returned by the same reference (factory.ts skips re-validation on that
61
+ * identity), keeping the common path allocation-free.
4
62
  */
5
63
  export default function extractHostname(
6
64
  url: string,
@@ -9,148 +67,252 @@ export default function extractHostname(
9
67
  let start = 0;
10
68
  let end: number = url.length;
11
69
  let hasUpper = false;
70
+ let isSpecial = false;
12
71
 
13
- // If url is not already a valid hostname, then try to extract hostname.
14
72
  if (!urlIsValidHostname) {
15
- // Special handling of data URLs
73
+ // Data URLs never carry a host (and may be huge — short-circuit them).
16
74
  if (url.startsWith('data:')) {
17
75
  return null;
18
76
  }
19
77
 
20
- // Trim leading spaces
78
+ // WHATWG step 1: trim leading/trailing C0 control or space (<= U+0020).
79
+ // Tab/newline elsewhere are handled lazily below.
21
80
  while (start < url.length && url.charCodeAt(start) <= 32) {
22
81
  start += 1;
23
82
  }
24
-
25
- // Trim trailing spaces
26
83
  while (end > start + 1 && url.charCodeAt(end - 1) <= 32) {
27
84
  end -= 1;
28
85
  }
29
86
 
30
- // Skip scheme.
31
87
  if (
32
88
  url.charCodeAt(start) === 47 /* '/' */ &&
33
89
  url.charCodeAt(start + 1) === 47 /* '/' */
34
90
  ) {
91
+ // Scheme-relative reference ("//host/path").
35
92
  start += 2;
36
93
  } else {
37
94
  const indexOfProtocol = url.indexOf(':/', start);
38
95
  if (indexOfProtocol !== -1) {
39
- // Implement fast-path for common protocols. We expect most protocols
40
- // should be one of these 4 and thus we will not need to perform the
41
- // more expansive validity check most of the time.
42
- const protocolSize = indexOfProtocol - start;
43
- const c0 = url.charCodeAt(start);
44
- const c1 = url.charCodeAt(start + 1);
45
- const c2 = url.charCodeAt(start + 2);
46
- const c3 = url.charCodeAt(start + 3);
47
- const c4 = url.charCodeAt(start + 4);
48
-
49
- if (
50
- protocolSize === 5 &&
51
- c0 === 104 /* 'h' */ &&
52
- c1 === 116 /* 't' */ &&
53
- c2 === 116 /* 't' */ &&
54
- c3 === 112 /* 'p' */ &&
55
- c4 === 115 /* 's' */
56
- ) {
57
- // https
58
- } else if (
59
- protocolSize === 4 &&
60
- c0 === 104 /* 'h' */ &&
61
- c1 === 116 /* 't' */ &&
62
- c2 === 116 /* 't' */ &&
63
- c3 === 112 /* 'p' */
64
- ) {
65
- // http
66
- } else if (
67
- protocolSize === 3 &&
68
- c0 === 119 /* 'w' */ &&
69
- c1 === 115 /* 's' */ &&
70
- c2 === 115 /* 's' */
71
- ) {
72
- // wss
73
- } else if (
74
- protocolSize === 2 &&
75
- c0 === 119 /* 'w' */ &&
76
- c1 === 115 /* 's' */
77
- ) {
78
- // ws
96
+ // "scheme://…". Classify the scheme, then position `start` at the host.
97
+ const special = getSpecialScheme(url, start, indexOfProtocol);
98
+ if (special === 1) {
99
+ // Special scheme: skip the run of '/' and '\' after it
100
+ // (special-authority-(ignore-)slashes states; '\' acts as '/').
101
+ isSpecial = true;
102
+ start = indexOfProtocol + 2;
103
+ while (
104
+ url.charCodeAt(start) === 47 /* '/' */ ||
105
+ url.charCodeAt(start) === 92 /* '\' */
106
+ ) {
107
+ start += 1;
108
+ }
109
+ } else if (special === 2) {
110
+ // file: the host is only what sits between "//" and the next slash, so
111
+ // "file://h/x" => "h" but "file:///x" / "file:/x" => no host.
112
+ isSpecial = true;
113
+ start = indexOfProtocol + 1;
114
+ let slashes = 0;
115
+ while (
116
+ (url.charCodeAt(start) === 47 || url.charCodeAt(start) === 92) &&
117
+ slashes < 2
118
+ ) {
119
+ start += 1;
120
+ slashes += 1;
121
+ }
122
+ if (slashes < 2) {
123
+ return null;
124
+ }
79
125
  } else {
80
- // Check that scheme is valid
126
+ // Unknown scheme: validate the WHATWG scheme grammar [A-Za-z0-9+.-];
127
+ // a control char means it was split by a tab/newline (strip + re-parse).
81
128
  for (let i = start; i < indexOfProtocol; i += 1) {
82
- const lowerCaseCode = url.charCodeAt(i) | 32;
129
+ const code = url.charCodeAt(i) | 32;
83
130
  if (
84
131
  !(
85
132
  (
86
- (lowerCaseCode >= 97 && lowerCaseCode <= 122) || // [a, z]
87
- (lowerCaseCode >= 48 && lowerCaseCode <= 57) || // [0, 9]
88
- lowerCaseCode === 46 || // '.'
89
- lowerCaseCode === 45 || // '-'
90
- lowerCaseCode === 43
133
+ (code >= 97 && code <= 122) || // [a, z]
134
+ (code >= 48 && code <= 57) || // [0, 9]
135
+ code === 46 || // '.'
136
+ code === 45 || // '-'
137
+ code === 43
91
138
  ) // '+'
92
139
  )
93
140
  ) {
141
+ const raw = url.charCodeAt(i);
142
+ if (raw === 9 || raw === 10 || raw === 13) {
143
+ return extractHostname(
144
+ url.replace(CONTROL_CHARS, ''),
145
+ urlIsValidHostname,
146
+ );
147
+ }
94
148
  return null;
95
149
  }
96
150
  }
151
+ // A non-special scheme has an authority only after "//" (else it is an
152
+ // opaque path with no host). `indexOf(':/')` already gave the first '/'.
153
+ if (url.charCodeAt(indexOfProtocol + 2) === 47 /* '/' */) {
154
+ start = indexOfProtocol + 3;
155
+ } else {
156
+ return null;
157
+ }
97
158
  }
159
+ } else if (url.charCodeAt(start) !== 91 /* '[' */) {
160
+ // Cold path: no scheme "://", and not a bare IPv6 literal (whose first
161
+ // ':' would otherwise look like a scheme separator; "[…]" falls through
162
+ // to the ipv6 handling below). May be a bare host, a host:port, a
163
+ // user@host, a slash-less special scheme ("https:host"), or an opaque
164
+ // URI ("mailto:", "tel:", "urn:…").
165
+ let indexOfColon = -1;
166
+ for (let i = start; i < end; i += 1) {
167
+ const code = url.charCodeAt(i);
168
+ if (code === 9 || code === 10 || code === 13) {
169
+ return extractHostname(
170
+ url.replace(CONTROL_CHARS, ''),
171
+ urlIsValidHostname,
172
+ );
173
+ }
174
+ if (code === 58 /* ':' */) {
175
+ indexOfColon = i;
176
+ break;
177
+ }
178
+ if (code === 47 || code === 92 || code === 63 || code === 35) {
179
+ break;
180
+ }
181
+ }
182
+
183
+ if (indexOfColon !== -1) {
184
+ // An '@' before the next delimiter => the ':' is userinfo, not a
185
+ // scheme ("user:pass@host", "mailto:a@b"): keep the whole authority.
186
+ let hasIdentifier = false;
187
+ for (let i = indexOfColon + 1; i < end; i += 1) {
188
+ const code = url.charCodeAt(i);
189
+ if (code === 47 || code === 92 || code === 63 || code === 35) {
190
+ break;
191
+ }
192
+ if (code === 64 /* '@' */) {
193
+ hasIdentifier = true;
194
+ break;
195
+ }
196
+ }
197
+
198
+ if (!hasIdentifier) {
199
+ // All-digits after ':' => a bare "host:port" (tldts accepts
200
+ // hostnames too); keep `start` and let the port handling trim it.
201
+ let allDigits = true;
202
+ let i = indexOfColon + 1;
203
+ for (; i < end; i += 1) {
204
+ const code = url.charCodeAt(i);
205
+ if (code === 47 || code === 92 || code === 63 || code === 35) {
206
+ break;
207
+ }
208
+ if (code < 48 /* '0' */ || code > 57 /* '9' */) {
209
+ allDigits = false;
210
+ break;
211
+ }
212
+ }
213
+ if (i === indexOfColon + 1) {
214
+ allDigits = false; // nothing after ':' => not a port
215
+ }
98
216
 
99
- // Skip 0, 1 or more '/' after ':/'
100
- start = indexOfProtocol + 2;
101
- while (url.charCodeAt(start) === 47 /* '/' */) {
102
- start += 1;
217
+ if (!allDigits) {
218
+ const special = getSpecialScheme(url, start, indexOfColon);
219
+ if (special === 0) {
220
+ // No "://" anywhere on the cold path, so a non-special scheme has
221
+ // no authority: opaque path, no host ("mailto:x", "foo:bar").
222
+ return null;
223
+ }
224
+ isSpecial = true;
225
+ start = indexOfColon + 1;
226
+ if (special === 2) {
227
+ // file (e.g. "file:\\host"): host only between "//" and next slash.
228
+ let slashes = 0;
229
+ while (
230
+ (url.charCodeAt(start) === 47 ||
231
+ url.charCodeAt(start) === 92) &&
232
+ slashes < 2
233
+ ) {
234
+ start += 1;
235
+ slashes += 1;
236
+ }
237
+ if (slashes < 2) {
238
+ return null;
239
+ }
240
+ } else {
241
+ while (
242
+ url.charCodeAt(start) === 47 ||
243
+ url.charCodeAt(start) === 92
244
+ ) {
245
+ start += 1;
246
+ }
247
+ }
248
+ }
249
+ }
103
250
  }
104
251
  }
105
252
  }
106
253
 
107
- // Detect first occurrence of '/', '?' or '#'. We also keep track of the
108
- // last occurrence of '@', ']' or ':' to speed-up subsequent parsing of
109
- // (respectively), identifier, ipv6 or port.
254
+ // Find the host's end: first '/', '?' or '#' (and '\' for special URLs,
255
+ // which WHATWG treats like '/'). Track the last '@', ']' and ':' for
256
+ // userinfo, ipv6 and port; flag uppercase and a stray tab/newline. The loop
257
+ // is split on `code < 64` so common host characters take fewer comparisons.
110
258
  let indexOfIdentifier = -1;
111
259
  let indexOfClosingBracket = -1;
112
260
  let indexOfPort = -1;
261
+ let hasControl = false;
113
262
  for (let i = start; i < end; i += 1) {
114
263
  const code: number = url.charCodeAt(i);
115
- if (
116
- code === 35 || // '#'
117
- code === 47 || // '/'
118
- code === 63 // '?'
119
- ) {
264
+ if (code < 64) {
265
+ if (code === 47 || code === 35 || code === 63) {
266
+ end = i;
267
+ break;
268
+ } else if (code === 58 /* ':' */) {
269
+ indexOfPort = i;
270
+ } else if (code === 9 || code === 10 || code === 13) {
271
+ hasControl = true;
272
+ }
273
+ } else if (isSpecial && code === 92 /* '\' */) {
120
274
  end = i;
121
275
  break;
122
- } else if (code === 64) {
123
- // '@'
276
+ } else if (code === 64 /* '@' */) {
124
277
  indexOfIdentifier = i;
125
- } else if (code === 93) {
126
- // ']'
278
+ } else if (code === 93 /* ']' */) {
127
279
  indexOfClosingBracket = i;
128
- } else if (code === 58) {
129
- // ':'
130
- indexOfPort = i;
131
280
  } else if (code >= 65 && code <= 90) {
132
281
  hasUpper = true;
133
282
  }
134
283
  }
135
284
 
136
- // Detect identifier: '@'
285
+ // A tab/newline inside the authority: strip everything and re-parse (rare).
286
+ if (hasControl) {
287
+ return extractHostname(
288
+ url.replace(CONTROL_CHARS, ''),
289
+ urlIsValidHostname,
290
+ );
291
+ }
292
+
293
+ // Skip userinfo. '>= start' so an empty userinfo ("http://@host") works too.
137
294
  if (
138
295
  indexOfIdentifier !== -1 &&
139
- indexOfIdentifier > start &&
296
+ indexOfIdentifier >= start &&
140
297
  indexOfIdentifier < end
141
298
  ) {
142
299
  start = indexOfIdentifier + 1;
143
300
  }
144
301
 
145
- // Handle ipv6 addresses
146
302
  if (url.charCodeAt(start) === 91 /* '[' */) {
303
+ // ipv6 address: return what is between the brackets, or null if unclosed.
147
304
  if (indexOfClosingBracket !== -1) {
148
305
  return url.slice(start + 1, indexOfClosingBracket).toLowerCase();
149
306
  }
150
307
  return null;
151
308
  } else if (indexOfPort !== -1 && indexOfPort > start && indexOfPort < end) {
152
- // Detect port: ':'
153
- end = indexOfPort;
309
+ end = indexOfPort; // trim ':port'
310
+ }
311
+
312
+ // Empty authority ("http://", "file:///path", "//"); only reachable here via
313
+ // extraction — a bare valid hostname never lands here.
314
+ if (start >= end) {
315
+ return null;
154
316
  }
155
317
  }
156
318
 
package/src/is-ip.ts CHANGED
@@ -66,8 +66,8 @@ function isProbablyIpv6(hostname: string): boolean {
66
66
  (
67
67
  (code >= 48 && code <= 57) || // 0-9
68
68
  (code >= 97 && code <= 102) || // a-f
69
- (code >= 65 && code <= 90)
70
- ) // A-F
69
+ (code >= 65 && code <= 70)
70
+ ) // A-F (RFC 4291 §2.2: an IPv6 hextet is hex digits only)
71
71
  )
72
72
  ) {
73
73
  return false;