@mochi.js/core 0.1.2 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Closed-shadow piercing locator.
3
+ *
4
+ * Walks a tree returned by `DOM.getDocument({ depth: -1, pierce: true })` and
5
+ * yields `backendNodeId`s for every element that matches a parsed CSS
6
+ * selector — including elements inside **closed** shadow roots, which
7
+ * `DOM.querySelector(..., pierce: true)` does NOT traverse from the parent
8
+ * document. Patchright solves the same problem in `_customFindElementsByParsed`
9
+ * (`framesPatch.ts:868-1012`); this is mochi's port — we kept the recursive-walk
10
+ * shape but simplified the selector subset (CSS only — no XPath; see task
11
+ * 0253 brief for the rationale).
12
+ *
13
+ * The walker recurses through:
14
+ * - `node.children[]` (regular DOM descendants)
15
+ * - `node.shadowRoots[]` (BOTH `shadowRootType:"open"` and `"closed"` — the
16
+ * pierce flag yields both; the matcher just doesn't care which kind it is)
17
+ * - `node.contentDocument` (iframes — same-origin only; OOPIF subframes
18
+ * surface as separate targets and are out of scope here)
19
+ * - `node.templateContent` (template fragment, rare but cheap to walk)
20
+ *
21
+ * It deliberately does NOT recurse into:
22
+ * - `pseudoElements` — `::before` / `::after` aren't real DOM nodes for
23
+ * selector matching purposes; CDP yields them but they'd produce
24
+ * spurious matches on `*` selectors.
25
+ *
26
+ * The walker keeps a *flat* ancestor chain across shadow boundaries so the
27
+ * descendant-combinator matcher can reason about "div .btn" correctly even
28
+ * when the `.btn` is inside a closed shadow rooted at `<div>`. This mirrors
29
+ * how DOM's regular ancestor walk behaves under `composedPath` semantics —
30
+ * patchright does the same.
31
+ *
32
+ * Performance: O(N) in DOM size per call. Acceptable for v0.2 (per task
33
+ * brief — a per-page cache layer is a v0.3+ concern).
34
+ *
35
+ * @see PLAN.md §8.2 — `DOM.getDocument` / `DOM.resolveNode` are not forbidden
36
+ * @see tasks/0253-closed-shadow-piercing-locator.md
37
+ */
38
+
39
+ import type { PierceDomNode } from "../cdp/types";
40
+ import { matchSelector, type ParsedSelector } from "./selector";
41
+
42
/** One element found by the piercing walk. */
export interface PierceMatch {
  /** CDP `backendNodeId` of the matched element; this id stays stable across DOM mutations. */
  backendNodeId: number;
  /** CDP `nodeId` of the matched element; tied to one DOM session instance, so less stable than `backendNodeId`. */
  nodeId: number;
  /** The tree node that matched, retained for diagnostics and tests. */
  node: PierceDomNode;
}
50
+
51
/**
 * Collect every element under `root` that satisfies `selector`.
 *
 * Results come back in depth-first pre-order (a parent is reported before
 * anything beneath it), i.e. the order callers are used to from
 * `querySelectorAll`.
 *
 * @param root - Root of the pre-fetched tree to search.
 * @param selector - Parsed selector to test each element against.
 * @param limit - Optional cap on the number of matches. The walk stops as
 *   soon as this many have been collected; `Page.querySelectorPiercing`
 *   passes `1`, `querySelectorAllPiercing` leaves it undefined.
 * @returns The matches found, in traversal order.
 */
export function findPiercingMatches(
  root: PierceDomNode,
  selector: ParsedSelector,
  limit?: number,
): PierceMatch[] {
  const matches: PierceMatch[] = [];
  // The walk starts with an empty ancestor chain: `root` has no parents.
  const ancestorChain: PierceDomNode[] = [];
  walk(root, selector, ancestorChain, matches, limit);
  return matches;
}
69
+
70
/**
 * Lazily enumerate the nodes the walker descends into from `node`, in visit
 * order: regular children, then shadow roots (open and closed alike), then an
 * iframe's `contentDocument`, then a `<template>`'s `templateContent`.
 * `pseudoElements` are intentionally not enumerated.
 */
function* walkTargets(node: PierceDomNode): Generator<PierceDomNode> {
  if (node.children !== undefined) yield* node.children;
  if (node.shadowRoots !== undefined) yield* node.shadowRoots;
  if (node.contentDocument !== undefined) yield node.contentDocument;
  if (node.templateContent !== undefined) yield node.templateContent;
}

/**
 * Recursive depth-first, pre-order visit of `node` and everything below it.
 *
 * @param node - Node currently being visited.
 * @param selector - Parsed selector each element node is tested against.
 * @param ancestors - Shared stack of enclosing *element* nodes. It is pushed
 *   and popped in place, and left as it was found when the call returns.
 * @param out - Accumulator that matches are appended to.
 * @param limit - Optional cap on `out.length`.
 * @returns `true` when the cap has been reached and the caller should stop
 *   walking, `false` when this subtree was exhausted.
 */
function walk(
  node: PierceDomNode,
  selector: ParsedSelector,
  ancestors: PierceDomNode[],
  out: PierceMatch[],
  limit: number | undefined,
): boolean {
  const capReached = (): boolean => limit !== undefined && out.length >= limit;
  if (capReached()) return true;

  // Only element nodes (nodeType 1) are match candidates and ancestors.
  // Documents and fragments are still descended into below.
  const isElement = node.nodeType === 1;
  if (isElement && matchSelector(selector, node, ancestors)) {
    out.push({ backendNodeId: node.backendNodeId, nodeId: node.nodeId, node });
    if (capReached()) return true;
  }

  if (isElement) ancestors.push(node);
  let stopped = false;
  for (const next of walkTargets(node)) {
    if (walk(next, selector, ancestors, out, limit)) {
      stopped = true;
      break;
    }
  }
  // Undo our own push whether or not the walk was cut short.
  if (isElement) ancestors.pop();
  return stopped;
}
@@ -0,0 +1,423 @@
1
+ /**
2
+ * Tiny host-side CSS selector engine for the closed-shadow piercing locator
3
+ * (`Page.querySelectorPiercing`). Parses a CSS selector into a sequence of
4
+ * **compound** parts joined by descendant combinators, then matches a
5
+ * pre-walked `PierceDomNode` against that compound chain.
6
+ *
7
+ * Why we don't `DOM.querySelector` per shadow root: that CDP method does NOT
8
+ * pierce closed shadows even when its parent `DOM.getDocument` was called
9
+ * with `pierce: true`. Patchright's `_customFindElementsByParsed`
10
+ * (`framesPatch.ts:868-1012`) parses the selector itself and walks the tree
11
+ * manually for exactly this reason. We port the algorithm — *not* the surface
12
+ * area: only the CSS-selector subset listed in `tasks/0253` lands here.
13
+ *
14
+ * **Supported subset (CSS Selectors level 4 — strict subset):**
15
+ * - Tag selectors: `div`, `iframe`, `*`
16
+ * - ID: `#main`
17
+ * - Class: `.btn`, `.btn.primary`
18
+ * - Attribute: `[src]`, `[name="x"]`, `[href*="foo"]`, `[role^="b"]`,
19
+ * `[data-x$="y"]`, `[data-x~="z"]`, `[data-x|="en"]`. Quotes are optional
20
+ * when the value contains no whitespace or `]` (e.g. `[name=x]`).
21
+ * - Descendant combinator: `div .btn` (whitespace).
22
+ * - Comma-separated selector lists: `a, button` — match if ANY branch matches.
23
+ *
24
+ * **NOT supported (intentionally — see Out of scope in 0253):**
25
+ * - `>`, `+`, `~` combinators
26
+ * - `:pseudo-classes` (`:hover`, `:nth-child`, `:has`, `:not`)
27
+ * - `::pseudo-elements`
28
+ * - XPath (deferred — STRETCH per task brief; document as TODO if it lands).
29
+ * - Namespaces.
30
+ *
31
+ * Throws `SelectorParseError` on syntactically invalid input. The matcher
32
+ * itself never throws — unsupported nodes just don't match.
33
+ *
34
+ * @see tasks/0253-closed-shadow-piercing-locator.md
35
+ * @see PLAN.md §8.2 (forbidden CDP — neither `DOM.getDocument` nor
36
+ * `DOM.resolveNode` is forbidden; both fine).
37
+ */
38
+
39
+ import type { PierceDomNode } from "../cdp/types";
40
+
41
/**
 * Error raised when a selector string has a syntax problem the parser cannot
 * recover from. The message is prefixed with `[mochi] invalid selector` and
 * quotes the offending selector.
 */
export class SelectorParseError extends Error {
  /** The selector text this error was raised for. */
  readonly selector: string;

  /**
   * @param selector - Selector text being reported.
   * @param message - Short description of what was wrong with it.
   */
  constructor(selector: string, message: string) {
    super(`[mochi] invalid selector "${selector}": ${message}`);
    this.name = "SelectorParseError";
    this.selector = selector;
  }
}
50
+
51
/** One `[...]` attribute constraint belonging to a compound part. */
export interface AttrFilter {
  /** Attribute name the constraint applies to. */
  name: string;
  /**
   * How the attribute is tested:
   * - `"exists"` — the attribute only has to be present; its value is ignored
   * - `"="` — value equals the target exactly
   * - `"~="` — target is one of the whitespace-separated words in the value
   * - `"|="` — value equals the target, or starts with the target plus `-`
   * - `"^="` — value starts with the target
   * - `"$="` — value ends with the target
   * - `"*="` — value contains the target
   */
  op: "exists" | "=" | "~=" | "|=" | "^=" | "$=" | "*=";
  /** Target value; set for every operator other than `"exists"`. */
  value?: string;
}
68
+
69
/**
 * A compound selector: the whitespace-free run of constraints that a single
 * element has to satisfy.
 */
export interface CompoundPart {
  /** Tag name in lower case; `"*"` stands for the universal selector. */
  tag: string;
  /** Required `id`, when the compound contains `#id`. */
  id?: string;
  /** Class names from `.cls` parts; the element must carry all of them. */
  classes: string[];
  /** Attribute constraints from `[...]` parts; all of them must hold. */
  attrs: AttrFilter[];
}
77
+
78
/**
 * One branch of a selector list: compound parts joined by descendant
 * combinators, stored left to right.
 *
 * Matching reads the chain from the right: the last part has to match the
 * candidate element, and every part before it needs a matching ancestor.
 * Ancestors are followed across shadow boundaries (see `matchSelector`).
 */
export interface CompoundChain {
  /** The compound parts, outermost ancestor first and candidate last. */
  parts: CompoundPart[];
}
87
+
88
/** The result of parsing a selector: its comma-separated branches. */
export interface ParsedSelector {
  /** One chain per `,`-separated branch of the selector list. */
  chains: CompoundChain[];
}
92
+
93
+ // ---- parser ----------------------------------------------------------------
94
+
95
/**
 * Turn a CSS selector string into a {@link ParsedSelector}.
 *
 * The selector is cut at top-level commas into branches, and each branch is
 * cut at top-level whitespace into compound parts. Both cuts use a
 * hand-written scanner rather than a regex, so quoted attribute values
 * containing `[`, `,` or whitespace stay in one piece.
 *
 * @param input - Selector text.
 * @returns The parsed selector list.
 * @throws {@link SelectorParseError} when `input` is not a string, is blank,
 *   contains an empty branch, or a compound part fails to parse.
 */
export function parseSelector(input: string): ParsedSelector {
  if (typeof input !== "string") {
    throw new SelectorParseError(String(input), "selector must be a string");
  }
  const body = input.trim();
  if (body.length === 0) {
    throw new SelectorParseError(input, "selector must not be empty");
  }
  const chains = splitTopLevel(body, ",").map((branch): CompoundChain => {
    // Runs of whitespace produce empty pieces; drop them before parsing.
    const compounds = splitTopLevel(branch.trim(), " ").filter((piece) => piece.length > 0);
    if (compounds.length === 0) {
      throw new SelectorParseError(input, "empty selector branch");
    }
    return { parts: compounds.map((piece) => parseCompound(piece, input)) };
  });
  return { chains };
}
122
+
123
/**
 * Cut `input` at every top-level occurrence of `sep`.
 *
 * A separator is "top-level" when it sits outside `[...]` brackets and
 * outside a single- or double-quoted string. Inside a quoted string a
 * backslash keeps the following character from ending the string. When `sep`
 * is a space, any whitespace character acts as a separator.
 *
 * @param input - Text to split.
 * @param sep - Single-character separator.
 * @returns The pieces in order. Adjacent separators produce empty pieces, and
 *   the text after the last separator is always included (possibly empty).
 */
function splitTopLevel(input: string, sep: string): string[] {
  const pieces: string[] = [];
  const anyWhitespaceSplits = sep === " ";
  let current = "";
  let bracketDepth = 0;
  let openQuote = ""; // "" means "not inside a quoted string"
  let pos = 0;

  while (pos < input.length) {
    const ch = input.charAt(pos);
    pos += 1;

    if (openQuote !== "") {
      current += ch;
      if (ch === "\\" && pos < input.length) {
        // Copy the escaped character verbatim so it cannot close the string.
        current += input.charAt(pos);
        pos += 1;
      } else if (ch === openQuote) {
        openQuote = "";
      }
      continue;
    }

    if (ch === '"' || ch === "'") {
      openQuote = ch;
    } else if (ch === "[") {
      bracketDepth += 1;
    } else if (ch === "]") {
      // A stray `]` never takes the depth below zero.
      bracketDepth = Math.max(0, bracketDepth - 1);
    } else if (bracketDepth === 0 && (ch === sep || (anyWhitespaceSplits && /\s/.test(ch)))) {
      pieces.push(current);
      current = "";
      continue;
    }
    current += ch;
  }

  pieces.push(current);
  return pieces;
}
175
+
176
/**
 * Parse one compound selector: an optional tag (or `*`) followed by any mix
 * of `#id`, `.class` and `[attr]` parts, with no whitespace.
 *
 * Two cases are handled explicitly:
 * - `*` is only accepted on its own as the universal selector. Input such as
 *   `*div` is rejected rather than stored as a tag that can never match.
 * - Every `#id` in the compound constrains the element. The first one is
 *   stored in `id`; each later, different one is stored as an exact-value
 *   `id` attribute filter, so `#a#b` matches nothing instead of silently
 *   meaning `#b`. A repeated identical id adds nothing.
 *
 * @param input - The compound text (one whitespace-separated piece).
 * @param original - The full selector, used only for error messages.
 * @returns The parsed constraints; `tag` defaults to `"*"`.
 * @throws {@link SelectorParseError} on a malformed tag, an unexpected
 *   character, or a malformed `#`, `.` or `[...]` part.
 */
function parseCompound(input: string, original: string): CompoundPart {
  const part: CompoundPart = { tag: "*", classes: [], attrs: [] };

  // Everything before the first `#`, `.` or `[` is the tag. Unsupported
  // syntax such as `>` or `:hover` ends up here and fails the tag check.
  let i = 0;
  while (i < input.length) {
    const ch = input.charAt(i);
    if (ch === "#" || ch === "." || ch === "[") break;
    i++;
  }
  const tagText = input.slice(0, i);
  if (tagText.length > 0) {
    if (!/^(?:\*|[a-zA-Z][a-zA-Z0-9-]*)$/.test(tagText)) {
      throw new SelectorParseError(original, `bad tag "${tagText}"`);
    }
    part.tag = tagText.toLowerCase();
  }

  while (i < input.length) {
    const ch = input.charAt(i);
    if (ch === "#") {
      const id = readIdent(input, i + 1, original);
      if (part.id === undefined) {
        part.id = id.value;
      } else if (part.id !== id.value) {
        // A second, different id: keep both constraints (see doc comment).
        part.attrs.push({ name: "id", op: "=", value: id.value });
      }
      i = id.next;
      continue;
    }
    if (ch === ".") {
      const cls = readIdent(input, i + 1, original);
      part.classes.push(cls.value);
      i = cls.next;
      continue;
    }
    if (ch === "[") {
      const attr = readAttr(input, i + 1, original);
      part.attrs.push(attr.filter);
      i = attr.next;
      continue;
    }
    throw new SelectorParseError(original, `unexpected "${ch}" in compound "${input}"`);
  }
  return part;
}
221
+
222
/**
 * Read the identifier that begins at index `i` of `input`.
 *
 * An identifier is the longest run of ASCII letters, digits, `_` and `-`.
 *
 * @param input - Text being scanned.
 * @param i - Index of the identifier's first character.
 * @param original - The full selector, used only for error messages.
 * @returns The identifier text and the index just past it.
 * @throws {@link SelectorParseError} when no identifier character is found
 *   at `i`.
 */
function readIdent(input: string, i: number, original: string): { value: string; next: number } {
  // A sticky regex anchors the match exactly at `i` instead of searching ahead.
  const scanner = /[a-zA-Z0-9_-]*/y;
  scanner.lastIndex = i;
  const value = scanner.exec(input)?.[0] ?? "";
  if (value.length === 0) {
    throw new SelectorParseError(original, `expected identifier at position ${i}`);
  }
  return { value, next: i + value.length };
}
236
+
237
/**
 * Parse the inside of an attribute selector, starting just past the `[`.
 *
 * Accepted forms are `name]` (presence test) and `name OP value]`, where `OP`
 * is one of `=`, `~=`, `|=`, `^=`, `$=`, `*=`. The value is either a single-
 * or double-quoted string, in which a backslash takes the next character
 * literally, or a bare word running up to whitespace or `]`. Whitespace is
 * allowed around the operator and before the closing `]`.
 *
 * Beyond the basic grammar, two malformed inputs are rejected explicitly:
 * - an empty bare value (`[a=]`), which would otherwise be read as `""`;
 * - a quoted value with no closing quote, reported as such.
 * A quoted empty value (`[a=""]`) remains valid.
 *
 * @param input - The compound text being parsed.
 * @param i - Index of the first character after `[`.
 * @param original - The full selector, used only for error messages.
 * @returns The parsed filter (name lower-cased) and the index just past `]`.
 * @throws {@link SelectorParseError} on any syntax error.
 */
function readAttr(
  input: string,
  i: number,
  original: string,
): { filter: AttrFilter; next: number } {
  // Created once per call rather than once per scanned character.
  const whitespace = /\s/;
  const nameChar = /[a-zA-Z0-9_:-]/;
  const skipSpaces = (from: number): number => {
    let at = from;
    while (at < input.length && whitespace.test(input.charAt(at))) at++;
    return at;
  };

  // Attribute name, stored lower-case.
  const nameStart = i;
  while (i < input.length && nameChar.test(input.charAt(i))) i++;
  const name = input.slice(nameStart, i).toLowerCase();
  if (name.length === 0) {
    throw new SelectorParseError(original, `expected attribute name at position ${nameStart}`);
  }
  i = skipSpaces(i);
  if (i >= input.length) {
    throw new SelectorParseError(original, `unterminated [...] in selector`);
  }
  if (input.charAt(i) === "]") {
    return { filter: { name, op: "exists" }, next: i + 1 };
  }

  // Operator. Two-character operators are listed before the bare `=`.
  const operators = ["~=", "|=", "^=", "$=", "*=", "="] as const;
  const opStart = i;
  const op = operators.find((candidate) => input.startsWith(candidate, opStart));
  if (op === undefined) {
    throw new SelectorParseError(original, `expected operator at position ${i}`);
  }
  i = skipSpaces(i + op.length);

  // Value: quoted string or bare word.
  let value: string;
  const opener = input.charAt(i);
  if (opener === '"' || opener === "'") {
    i++;
    let text = "";
    let closed = false;
    while (i < input.length) {
      const ch = input.charAt(i);
      if (ch === "\\" && i + 1 < input.length) {
        text += input.charAt(i + 1);
        i += 2;
        continue;
      }
      i++;
      if (ch === opener) {
        closed = true;
        break;
      }
      text += ch;
    }
    if (!closed) {
      throw new SelectorParseError(original, `unterminated quoted value in [...]`);
    }
    value = text;
  } else {
    const valueStart = i;
    while (i < input.length) {
      const ch = input.charAt(i);
      if (ch === "]" || whitespace.test(ch)) break;
      i++;
    }
    value = input.slice(valueStart, i);
    if (value.length === 0) {
      throw new SelectorParseError(original, `expected attribute value at position ${valueStart}`);
    }
  }

  i = skipSpaces(i);
  if (i >= input.length || input.charAt(i) !== "]") {
    throw new SelectorParseError(original, `expected ']' at position ${i}`);
  }
  return { filter: { name, op, value }, next: i + 1 };
}
312
+
313
+ // ---- matcher ---------------------------------------------------------------
314
+
315
/**
 * Decide whether `node` satisfies a parsed selector list, i.e. whether at
 * least one of its branches matches.
 *
 * @param parsed - The parsed selector list.
 * @param node - The candidate node (already reached by the tree walk).
 * @param ancestors - Parent element nodes of `node`, ordered from the
 *   document root down to, but not including, `node`. The chain is flat:
 *   ancestors on the far side of a shadow boundary are included.
 * @returns `true` when any branch matches, otherwise `false`.
 */
export function matchSelector(
  parsed: ParsedSelector,
  node: PierceDomNode,
  ancestors: PierceDomNode[],
): boolean {
  return parsed.chains.some((branch) => matchChain(branch, node, ancestors));
}
332
+
333
/**
 * Test one descendant chain against `node`.
 *
 * The rightmost part must match `node` itself. The remaining parts are then
 * matched right to left against `ancestors`, scanning from the nearest
 * ancestor outwards. Each part takes the nearest ancestor that matches it,
 * and the search for the next part continues strictly above that ancestor,
 * which keeps the parts in their left-to-right nesting order.
 *
 * @param chain - The chain to test.
 * @param node - The candidate node.
 * @param ancestors - Enclosing elements of `node`, outermost first.
 * @returns `true` when every part found its element, otherwise `false`; an
 *   empty chain never matches.
 */
function matchChain(
  chain: CompoundChain,
  node: PierceDomNode,
  ancestors: PierceDomNode[],
): boolean {
  const required = chain.parts;
  if (required.length === 0) return false;
  if (!matchCompound(required[required.length - 1] as CompoundPart, node)) return false;

  // `pending` is the index of the part still waiting for an ancestor.
  let pending = required.length - 2;
  for (let depth = ancestors.length - 1; depth >= 0 && pending >= 0; depth--) {
    const candidate = ancestors[depth] as PierceDomNode;
    if (matchCompound(required[pending] as CompoundPart, candidate)) pending--;
  }
  return pending < 0;
}
362
+
363
/**
 * Check one compound part against one node.
 *
 * The node must be an element (`nodeType` 1) and satisfy every constraint of
 * the part: tag (unless `"*"`), exact `id`, all listed classes, and all
 * attribute filters. The element name is `localName` when present, otherwise
 * `nodeName`, compared in lower case.
 *
 * @param part - Constraints to test.
 * @param node - Node to test them against.
 * @returns `true` when every constraint holds, otherwise `false`.
 */
export function matchCompound(part: CompoundPart, node: PierceDomNode): boolean {
  if (node.nodeType !== 1) return false;

  const elementName = (node.localName ?? node.nodeName.toLowerCase()).toLowerCase();
  const tagAccepted = part.tag === "*" || part.tag === elementName;
  if (!tagAccepted) return false;

  if (part.id !== undefined && readAttribute(node, "id") !== part.id) return false;

  if (part.classes.length > 0) {
    // The class attribute is a whitespace-separated token list.
    const classAttr = readAttribute(node, "class") ?? "";
    const present = new Set(classAttr.split(/\s+/).filter((token) => token.length > 0));
    if (!part.classes.every((wanted) => present.has(wanted))) return false;
  }

  return part.attrs.every((filter) => matchAttr(filter, node));
}
385
+
386
/**
 * Evaluate one attribute filter against a node.
 *
 * `"exists"` only asks whether the attribute is present. Every other operator
 * fails when the attribute is absent. For the prefix, suffix and substring
 * operators (`^=`, `$=`, `*=`) an empty target never matches.
 *
 * @param f - The filter to evaluate; a missing `value` is treated as `""`.
 * @param node - Node whose attributes are inspected.
 * @returns `true` when the filter is satisfied.
 */
function matchAttr(f: AttrFilter, node: PierceDomNode): boolean {
  const actual = readAttribute(node, f.name);
  if (f.op === "exists") return actual !== undefined;
  if (actual === undefined) return false;

  const wanted = f.value ?? "";
  if (f.op === "=") return actual === wanted;
  if (f.op === "|=") return actual === wanted || actual.startsWith(`${wanted}-`);
  if (f.op === "~=") {
    // Word match: the target must equal one whitespace-separated token.
    return actual.split(/\s+/).some((token) => token.length > 0 && token === wanted);
  }

  // Remaining operators are the substring family, which rejects an empty target.
  if (wanted.length === 0) return false;
  if (f.op === "^=") return actual.startsWith(wanted);
  if (f.op === "$=") return actual.endsWith(wanted);
  return actual.indexOf(wanted) >= 0;
}
409
+
410
/**
 * Look up an attribute value on a `PierceDomNode`.
 *
 * CDP delivers attributes as one flat array of alternating entries,
 * `[name, value, name, value, ...]`. Names are compared case-insensitively,
 * and a trailing name without a value is ignored.
 *
 * @param node - Node to read from.
 * @param name - Attribute name to look for.
 * @returns The value of the first matching attribute, or `undefined` when the
 *   node has no attribute list or no such attribute.
 */
export function readAttribute(node: PierceDomNode, name: string): string | undefined {
  const flat = node.attributes;
  if (flat === undefined) return undefined;
  const wanted = name.toLowerCase();
  // Names sit at even indices; only pairs that have a value slot are considered.
  const nameAt = flat.findIndex(
    (entry, idx) => idx % 2 === 0 && idx + 1 < flat.length && entry.toLowerCase() === wanted,
  );
  return nameAt === -1 ? undefined : flat[nameAt + 1];
}