tldts 7.0.32 → 7.1.0

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,60 +1,174 @@
1
+ // NOTE: kept (intentionally) near-identical to packages/tldts-icann/src/suffix-trie.ts.
2
+ // They are separate copies rather than a shared helper because the lookup is
3
+ // only fast when the typed arrays are module-scope monomorphic globals —
4
+ // closing over them (a shared factory) measured ~20% slower. The ICANN build
5
+ // also specializes (constant mask, no isIcann/isPrivate). Keep the two in sync.
1
6
  import {
2
7
  fastPathLookup,
3
8
  IPublicSuffix,
4
9
  ISuffixLookupOptions,
5
10
  } from 'tldts-core';
6
- import { exceptions, ITrie, rules } from './data/trie';
11
+ import {
12
+ edgeChild,
13
+ edgeLength,
14
+ edgeStart,
15
+ exceptionsRoot,
16
+ labelText,
17
+ nodeFlags,
18
+ rulesRoot,
19
+ } from './data/trie';
7
20
 
8
- // Flags used to know if a rule is ICANN or Private
9
21
  const enum RULE_TYPE {
10
22
  ICANN = 1,
11
23
  PRIVATE = 2,
12
24
  }
13
25
 
14
- interface IMatch {
15
- index: number;
16
- isIcann: boolean;
17
- isPrivate: boolean;
26
+ // `edgeOffset` (where each label starts in `labelText`), `edgeHash` (XOR-variant djb2 of
27
+ // each label, hashed last character first) and `wildcardEdge` (each node's '*' edge, or -1) are derived once
28
+ // at load instead of being shipped: the bundle then carries only the
29
+ // compressible `labelText` + structure, while the lookup binary-searches
30
+ // integer hashes. The cost is a single ~1ms pass at first import — cheaper than
31
+ // the object trie it replaces. Kept at module scope (not captured in a closure)
32
+ // so V8 treats the typed arrays as fast monomorphic globals.
33
+ const numberOfNodes = nodeFlags.length;
34
+ const numberOfEdges = edgeLength.length;
35
+ const edgeOffset = new Uint32Array(numberOfEdges);
36
+ const edgeHash = new Uint32Array(numberOfEdges);
37
+ const wildcardEdge = new Int32Array(numberOfNodes).fill(-1);
38
+ for (let node = 0, offset = 0; node < numberOfNodes; node += 1) {
39
+ for (let edge = edgeStart[node]!; edge < edgeStart[node + 1]!; edge += 1) {
40
+ edgeOffset[edge] = offset;
41
+ const end = offset + edgeLength[edge]!;
42
+ let hash = 5381;
43
+ for (let i = end - 1; i >= offset; i -= 1) {
44
+ hash = (hash * 33) ^ labelText.charCodeAt(i);
45
+ }
46
+ edgeHash[edge] = hash >>> 0;
47
+ if (
48
+ edgeLength[edge] === 1 &&
49
+ labelText.charCodeAt(offset) === 42 /* '*' */
50
+ ) {
51
+ wildcardEdge[node] = edge;
52
+ }
53
+ offset = end;
54
+ }
18
55
  }
19
56
 
57
+ // Result of the last `walk`, kept in module scope to avoid allocating a match
58
+ // object. Safe because lookups are synchronous and read right after `walk`.
59
+ let matchNode = -1;
60
+ let matchStart = 0;
61
+ let matchEnd = 0;
62
+
20
63
  /**
21
- * Lookup parts of domain in Trie
64
+ * True if edge `edge`'s label equals `hostname[start, start + length)`.
22
65
  */
23
- function lookupInTrie(
24
- parts: string[],
25
- trie: ITrie,
26
- index: number,
27
- allowedMask: number,
28
- ): IMatch | null {
29
- let result: IMatch | null = null;
30
- let node: ITrie | undefined = trie;
31
- while (node !== undefined) {
32
- // We have a match!
33
- if ((node[0] & allowedMask) !== 0) {
34
- result = {
35
- index: index + 1,
36
- isIcann: (node[0] & RULE_TYPE.ICANN) !== 0,
37
- isPrivate: (node[0] & RULE_TYPE.PRIVATE) !== 0,
38
- };
66
+ function labelEquals(
67
+ edge: number,
68
+ hostname: string,
69
+ start: number,
70
+ length: number,
71
+ ): boolean {
72
+ if (edgeLength[edge] !== length) {
73
+ return false;
74
+ }
75
+ const offset = edgeOffset[edge]!;
76
+ for (let i = 0; i < length; i += 1) {
77
+ if (labelText.charCodeAt(offset + i) !== hostname.charCodeAt(start + i)) {
78
+ return false;
39
79
  }
80
+ }
81
+ return true;
82
+ }
40
83
 
41
- // No more `parts` to look for
42
- if (index === -1) {
43
- break;
84
+ /**
85
+ * Find the child edge of `node` whose label is `hostname[start, start + length)`.
86
+ * Edges are sorted by hash, so binary-search the hash then verify the label
87
+ * (scanning the rare run of equal hashes). Returns the edge index or -1.
88
+ */
89
+ function findEdge(
90
+ node: number,
91
+ hash: number,
92
+ hostname: string,
93
+ start: number,
94
+ length: number,
95
+ ): number {
96
+ let lo = edgeStart[node]!;
97
+ let hi = edgeStart[node + 1]!;
98
+ while (lo < hi) {
99
+ const mid = (lo + hi) >>> 1;
100
+ const value = edgeHash[mid]!;
101
+ if (value < hash) {
102
+ lo = mid + 1;
103
+ } else if (value > hash) {
104
+ hi = mid;
105
+ } else {
106
+ for (let e = mid; e >= lo && edgeHash[e] === hash; e -= 1) {
107
+ if (labelEquals(e, hostname, start, length)) return e;
108
+ }
109
+ for (let e = mid + 1; e < hi && edgeHash[e] === hash; e += 1) {
110
+ if (labelEquals(e, hostname, start, length)) return e;
111
+ }
112
+ return -1;
44
113
  }
114
+ }
115
+ return -1;
116
+ }
45
117
 
46
- const succ: Record<string, ITrie> = node[1];
47
- node = Object.prototype.hasOwnProperty.call(succ, parts[index]!)
48
- ? succ[parts[index]!]
49
- : succ['*'];
50
- index -= 1;
118
+ /**
119
+ * Walk `hostname`'s labels right-to-left from `root`, recording the deepest
120
+ * node whose flag passes `allowedMask` (with the label boundaries of that match
121
+ * in `matchStart`/`matchEnd`). Returns whether any match was found.
122
+ */
123
+ function walk(hostname: string, root: number, allowedMask: number): boolean {
124
+ let node = root;
125
+ let end = hostname.length;
126
+ let hash = 5381;
127
+ matchNode = -1;
128
+ for (let i = hostname.length - 1; i >= 0; i -= 1) {
129
+ const code = hostname.charCodeAt(i);
130
+ if (code === 46 /* '.' */) {
131
+ const start = i + 1;
132
+ let edge = findEdge(node, hash >>> 0, hostname, start, end - start);
133
+ if (edge === -1) {
134
+ edge = wildcardEdge[node]!;
135
+ }
136
+ if (edge === -1) {
137
+ return matchNode !== -1;
138
+ }
139
+ node = edgeChild[edge]!;
140
+ if ((nodeFlags[node]! & allowedMask) !== 0) {
141
+ matchNode = node;
142
+ matchStart = start;
143
+ matchEnd = end;
144
+ }
145
+ end = i;
146
+ hash = 5381;
147
+ } else {
148
+ hash = (hash * 33) ^ code;
149
+ }
51
150
  }
52
151
 
53
- return result;
152
+ // Left-most label: hostname[0, end). Same find/descend/record as the loop —
153
+ // duplicated rather than folded into the loop (via `i >= -1`) because that
154
+ // extra per-character branch measured slightly slower on the hot path.
155
+ let edge = findEdge(node, hash >>> 0, hostname, 0, end);
156
+ if (edge === -1) {
157
+ edge = wildcardEdge[node]!;
158
+ }
159
+ if (edge !== -1) {
160
+ node = edgeChild[edge]!;
161
+ if ((nodeFlags[node]! & allowedMask) !== 0) {
162
+ matchNode = node;
163
+ matchStart = 0;
164
+ matchEnd = end;
165
+ }
166
+ }
167
+ return matchNode !== -1;
54
168
  }
55
169
 
56
170
  /**
57
- * Check if `hostname` has a valid public suffix in `trie`.
171
+ * Check if `hostname` has a valid public suffix in the trie.
58
172
  */
59
173
  export default function suffixLookup(
60
174
  hostname: string,
@@ -65,46 +179,29 @@ export default function suffixLookup(
65
179
  return;
66
180
  }
67
181
 
68
- const hostnameParts = hostname.split('.');
69
-
70
182
  const allowedMask =
71
183
  (options.allowPrivateDomains ? RULE_TYPE.PRIVATE : 0) |
72
184
  (options.allowIcannDomains ? RULE_TYPE.ICANN : 0);
73
185
 
74
- // Look for exceptions
75
- const exceptionMatch = lookupInTrie(
76
- hostnameParts,
77
- exceptions,
78
- hostnameParts.length - 1,
79
- allowedMask,
80
- );
81
-
82
- if (exceptionMatch !== null) {
83
- out.isIcann = exceptionMatch.isIcann;
84
- out.isPrivate = exceptionMatch.isPrivate;
85
- out.publicSuffix = hostnameParts.slice(exceptionMatch.index + 1).join('.');
186
+ // Exceptions have priority and strip their own left-most label (e.g. the
187
+ // rule '!www.ck' makes the suffix of 'www.ck' be 'ck').
188
+ if (walk(hostname, exceptionsRoot, allowedMask)) {
189
+ out.isIcann = (nodeFlags[matchNode]! & RULE_TYPE.ICANN) !== 0;
190
+ out.isPrivate = (nodeFlags[matchNode]! & RULE_TYPE.PRIVATE) !== 0;
191
+ out.publicSuffix = hostname.slice(matchEnd + 1);
86
192
  return;
87
193
  }
88
194
 
89
- // Look for a match in rules
90
- const rulesMatch = lookupInTrie(
91
- hostnameParts,
92
- rules,
93
- hostnameParts.length - 1,
94
- allowedMask,
95
- );
96
-
97
- if (rulesMatch !== null) {
98
- out.isIcann = rulesMatch.isIcann;
99
- out.isPrivate = rulesMatch.isPrivate;
100
- out.publicSuffix = hostnameParts.slice(rulesMatch.index).join('.');
195
+ if (walk(hostname, rulesRoot, allowedMask)) {
196
+ out.isIcann = (nodeFlags[matchNode]! & RULE_TYPE.ICANN) !== 0;
197
+ out.isPrivate = (nodeFlags[matchNode]! & RULE_TYPE.PRIVATE) !== 0;
198
+ out.publicSuffix = hostname.slice(matchStart);
101
199
  return;
102
200
  }
103
201
 
104
- // No match found...
105
- // Prevailing rule is '*' so we consider the top-level domain to be the
106
- // public suffix of `hostname` (e.g.: 'example.org' => 'org').
202
+ // No match: the prevailing '*' rule makes the right-most label the suffix.
107
203
  out.isIcann = false;
108
204
  out.isPrivate = false;
109
- out.publicSuffix = hostnameParts[hostnameParts.length - 1] ?? null;
205
+ const lastDot = hostname.lastIndexOf('.');
206
+ out.publicSuffix = lastDot === -1 ? hostname : hostname.slice(lastDot + 1);
110
207
  }