barebrowse 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ /**
2
+ * interact.js — Click, type, scroll, and press keys via CDP Input/DOM domains.
3
+ *
4
+ * All functions take a session-scoped CDP handle (from cdp.session()).
5
+ * Coordinates come from DOM.getBoxModel which returns viewport-relative quads.
6
+ */
7
+
8
/**
 * Definitions for the named keys accepted by press().
 *
 * Each entry supplies the `key` and `code` values plus a `keyCode`, which
 * press() sends as `windowsVirtualKeyCode`. The optional `text` is forwarded
 * with the key events when present. Space carries `text: ' '` (just as Enter
 * and Tab carry theirs) so the key press includes the character it types.
 */
const KEY_MAP = {
  Enter: { key: 'Enter', code: 'Enter', keyCode: 13, text: '\r' },
  Tab: { key: 'Tab', code: 'Tab', keyCode: 9, text: '\t' },
  Escape: { key: 'Escape', code: 'Escape', keyCode: 27 },
  Backspace: { key: 'Backspace', code: 'Backspace', keyCode: 8 },
  Delete: { key: 'Delete', code: 'Delete', keyCode: 46 },
  ArrowUp: { key: 'ArrowUp', code: 'ArrowUp', keyCode: 38 },
  ArrowDown: { key: 'ArrowDown', code: 'ArrowDown', keyCode: 40 },
  ArrowLeft: { key: 'ArrowLeft', code: 'ArrowLeft', keyCode: 37 },
  ArrowRight: { key: 'ArrowRight', code: 'ArrowRight', keyCode: 39 },
  Home: { key: 'Home', code: 'Home', keyCode: 36 },
  End: { key: 'End', code: 'End', keyCode: 35 },
  PageUp: { key: 'PageUp', code: 'PageUp', keyCode: 33 },
  PageDown: { key: 'PageDown', code: 'PageDown', keyCode: 34 },
  Space: { key: ' ', code: 'Space', keyCode: 32, text: ' ' },
};
25
+
26
/**
 * Compute the midpoint of a node's content box.
 *
 * The node is first scrolled into view with DOM.scrollIntoViewIfNeeded, then
 * measured with DOM.getBoxModel. The midpoint is taken between the first and
 * third corners of the returned content quad.
 *
 * @param {object} session - Session-scoped CDP handle exposing send(method, params)
 * @param {number} backendNodeId - Backend DOM node ID
 * @returns {Promise<{x: number, y: number}>} Center point of the content quad
 */
async function getCenter(session, backendNodeId) {
  await session.send('DOM.scrollIntoViewIfNeeded', { backendNodeId });
  const boxModel = await session.send('DOM.getBoxModel', { backendNodeId });

  // Content quad layout: [x1,y1, x2,y2, x3,y3, x4,y4]; corners 1 and 3 are opposite.
  const quad = boxModel.model.content;
  const firstCorner = { x: quad[0], y: quad[1] };
  const oppositeCorner = { x: quad[4], y: quad[5] };

  return {
    x: (firstCorner.x + oppositeCorner.x) / 2,
    y: (firstCorner.y + oppositeCorner.y) / 2,
  };
}
40
+
41
/**
 * Left-click an element identified by its backend DOM node ID.
 *
 * Resolves the element's center through getCenter(), then sends a
 * mousePressed event followed by a mouseReleased event at that point.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID
 * @returns {Promise<void>}
 */
export async function click(session, backendNodeId) {
  const point = await getCenter(session, backendNodeId);

  // Press, then release, at the same coordinates.
  for (const phase of ['mousePressed', 'mouseReleased']) {
    await session.send('Input.dispatchMouseEvent', {
      type: phase,
      x: point.x,
      y: point.y,
      button: 'left',
      clickCount: 1,
    });
  }
}
57
+
58
/**
 * Select all content of the focused element and delete it.
 *
 * Sends Ctrl+A followed by Backspace. The Ctrl+A keyDown also carries the
 * `selectAll` editing command so the selection does not depend on the host
 * platform's select-all shortcut.
 *
 * @param {object} session - Session-scoped CDP handle
 * @returns {Promise<void>}
 */
async function clearFocusedField(session) {
  const selectAllKey = { key: 'a', code: 'KeyA', windowsVirtualKeyCode: 65, modifiers: 2 }; // 2 = Ctrl
  const backspaceKey = { key: 'Backspace', code: 'Backspace', windowsVirtualKeyCode: 8 };

  await session.send('Input.dispatchKeyEvent', {
    type: 'keyDown', ...selectAllKey, commands: ['selectAll'],
  });
  await session.send('Input.dispatchKeyEvent', { type: 'keyUp', ...selectAllKey });
  await session.send('Input.dispatchKeyEvent', { type: 'keyDown', ...backspaceKey });
  await session.send('Input.dispatchKeyEvent', { type: 'keyUp', ...backspaceKey });
}

/**
 * Type text into an element identified by its backend DOM node ID.
 *
 * The element is focused with DOM.focus first.
 * Default: the whole text is inserted with one Input.insertText call.
 * With { keyEvents: true }: a keyDown/keyUp pair is dispatched per character.
 * With { clear: true }: existing content is selected and deleted before typing.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID
 * @param {string} text - Text to type
 * @param {object} [opts] - Options; null is treated like an empty object
 * @param {boolean} [opts.keyEvents=false] - Use char-by-char key events
 * @param {boolean} [opts.clear=false] - Clear existing content before typing
 * @returns {Promise<void>}
 */
export async function type(session, backendNodeId, text, opts = {}) {
  // The parameter default only covers undefined; guard against an explicit null too.
  const { keyEvents = false, clear = false } = opts ?? {};

  await session.send('DOM.focus', { backendNodeId });

  if (clear) {
    await clearFocusedField(session);
  }

  if (!keyEvents) {
    await session.send('Input.insertText', { text });
    return;
  }

  for (const char of text) {
    await session.send('Input.dispatchKeyEvent', { type: 'keyDown', text: char });
    await session.send('Input.dispatchKeyEvent', { type: 'keyUp', text: char });
  }
}
103
+
104
/**
 * Press a special key (Enter, Tab, Escape, etc.).
 * Dispatches keyDown followed by keyUp for the named key.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {string} key - Key name as listed in KEY_MAP (e.g. 'Enter', 'Tab', 'Escape', 'ArrowDown')
 * @returns {Promise<void>}
 * @throws {Error} When `key` is not one of KEY_MAP's own entries
 */
export async function press(session, key) {
  // Own-property check: a plain lookup would also resolve inherited names such
  // as 'constructor' or 'toString' and slip past the unknown-key guard.
  const isKnownKey = Object.prototype.hasOwnProperty.call(KEY_MAP, key);
  if (!isKnownKey) {
    throw new Error(`Unknown key: "${key}". Valid keys: ${Object.keys(KEY_MAP).join(', ')}`);
  }

  const def = KEY_MAP[key];
  const base = { key: def.key, code: def.code, windowsVirtualKeyCode: def.keyCode };
  if (def.text) base.text = def.text;

  await session.send('Input.dispatchKeyEvent', { type: 'keyDown', ...base });
  await session.send('Input.dispatchKeyEvent', { type: 'keyUp', ...base });
}
119
+
120
/**
 * Scroll by dispatching a single mouseWheel event.
 *
 * The event is sent at (x, y), which defaults to the fixed point (400, 300).
 * Horizontal delta is always 0.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} deltaY - Vertical wheel delta (positive = down, negative = up)
 * @param {number} [x=400] - X coordinate for the wheel event
 * @param {number} [y=300] - Y coordinate for the wheel event
 * @returns {Promise<void>}
 */
export async function scroll(session, deltaY, x = 400, y = 300) {
  const wheelEvent = {
    type: 'mouseWheel',
    x,
    y,
    deltaX: 0,
    deltaY,
  };
  await session.send('Input.dispatchMouseEvent', wheelEvent);
}
134
+
135
/**
 * Move the mouse pointer over an element identified by its backend DOM node ID.
 *
 * Resolves the element's center through getCenter() and dispatches one
 * mouseMoved event there.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID
 * @returns {Promise<void>}
 */
export async function hover(session, backendNodeId) {
  const center = await getCenter(session, backendNodeId);
  const moveEvent = { type: 'mouseMoved', x: center.x, y: center.y };
  await session.send('Input.dispatchMouseEvent', moveEvent);
}
148
+
149
/**
 * Select a value in a <select> element or custom dropdown.
 *
 * Strategy 1: Native <select> — pick the option whose value equals `value`,
 * falling back to the option whose trimmed text equals `value`; set .value
 * and dispatch a bubbling 'change' event.
 * Strategy 2: Custom dropdown — click to open, wait 300 ms, then click the
 * first [role="option"] / [role="menuitem"] element whose trimmed text equals
 * `value`.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID of the select/combobox
 * @param {string} value - Value or visible text to select
 * @returns {Promise<boolean>} true when a matching option was found and chosen
 */
export async function select(session, backendNodeId, value) {
  // Resolve to a JS object so we can check tagName and set value.
  const { object } = await session.send('DOM.resolveNode', { backendNodeId });
  const { objectId } = object;

  try {
    const { result: tagResult } = await session.send('Runtime.callFunctionOn', {
      objectId,
      functionDeclaration: 'function() { return this.tagName; }',
      returnByValue: true,
    });

    if (tagResult.value === 'SELECT') {
      const { result: picked } = await session.send('Runtime.callFunctionOn', {
        objectId,
        functionDeclaration: `function(v) {
          const options = Array.from(this.options);
          // Value match wins; visible text is only a fallback.
          const opt = options.find(o => o.value === v)
            || options.find(o => o.textContent.trim() === v);
          if (!opt) return false;
          this.value = opt.value;
          this.dispatchEvent(new Event('change', { bubbles: true }));
          return true;
        }`,
        arguments: [{ value }],
        returnByValue: true,
      });
      return picked?.value === true;
    }
  } finally {
    // The remote object handle is only needed for the calls above. Release it
    // so it does not accumulate; a failed release must not mask the outcome.
    try {
      await session.send('Runtime.releaseObject', { objectId });
    } catch {
      // Best-effort cleanup only.
    }
  }

  // Custom dropdown: click to open, then find and click the matching option.
  await click(session, backendNodeId);
  await new Promise((r) => setTimeout(r, 300)); // wait for dropdown to open

  const { result: found } = await session.send('Runtime.evaluate', {
    expression: `(() => {
      const options = document.querySelectorAll('[role="option"], [role="menuitem"], li[role="option"]');
      for (const opt of options) {
        if (opt.textContent.trim() === ${JSON.stringify(value)}) {
          opt.click();
          return true;
        }
      }
      return false;
    })()`,
    returnByValue: true,
  });
  return found?.value === true;
}
package/src/prune.js ADDED
@@ -0,0 +1,472 @@
1
+ /**
2
+ * prune.js — ARIA tree pruning for agent consumption.
3
+ *
4
+ * Ported from mcprune. Pure function: tree in, pruned tree out.
5
+ * Zero deps, zero I/O.
6
+ *
7
+ * Node shape (from CDP Accessibility.getFullAXTree, after buildTree):
8
+ * { nodeId, role, name, properties: { level, checked, ... }, ignored, children }
9
+ *
10
+ * We adapt mcprune's logic which used:
11
+ * { role, name, ref, states: { level, checked }, text, children }
12
+ *
13
+ * The mapping:
14
+ * mcprune.ref → nodeId
15
+ * mcprune.states → properties
16
+ * mcprune.text → StaticText child's name (CDP has no inline text)
17
+ */
18
+
19
+ // --- Role taxonomy (from mcprune/roles.js) ---
20
+
21
/** Build a role lookup set from the listed role names. */
const roleSet = (...roles) => new Set(roles);

// Landmark roles considered during region extraction.
const LANDMARKS = roleSet(
  'banner',
  'main',
  'contentinfo',
  'navigation',
  'complementary',
  'search',
  'form',
  'region',
);

// Roles for elements that are treated as interactive.
const INTERACTIVE = roleSet(
  'button',
  'link',
  'textbox',
  'searchbox',
  'checkbox',
  'radio',
  'combobox',
  'listbox',
  'menuitem',
  'menuitemcheckbox',
  'menuitemradio',
  'option',
  'slider',
  'spinbutton',
  'switch',
  'tab',
  'treeitem',
);

// Container roles that group interactive widgets.
const GROUPS = roleSet(
  'radiogroup',
  'tablist',
  'menu',
  'menubar',
  'toolbar',
  'listbox',
  'tree',
  'treegrid',
  'grid',
);

// Wrapper roles that are candidates for collapsing.
const STRUCTURAL = roleSet(
  'generic',
  'group',
  'list',
  'table',
  'row',
  'rowgroup',
  'cell',
  'directory',
  'document',
  'application',
  'presentation',
  'none',
  'separator',
  // CDP-specific roles that map to structural
  'LayoutTable',
  'LayoutTableRow',
  'LayoutTableCell',
);

// Landmark regions retained for each pruning mode.
const MODE_REGIONS = {
  act: roleSet('main'),
  browse: roleSet('main'),
  navigate: roleSet('main', 'banner', 'navigation', 'search'),
  full: roleSet('main', 'banner', 'navigation', 'contentinfo', 'complementary', 'search'),
};

// Roles that are rendering noise — skipped entirely.
const SKIP_ROLES = roleSet('InlineTextBox', 'LineBreak', 'superscript');
55
+
56
+ // --- Main export ---
57
+
58
/**
 * Prune an ARIA tree for agent consumption.
 *
 * Runs the pipeline: region extraction → node pruning → structural collapse →
 * post-clean, then (in every mode except 'browse') link de-duplication, noise
 * button removal, footer truncation and filter-group removal.
 *
 * @param {object} tree - Root node from buildTree() (CDP format)
 * @param {object} [options] - Options; null is treated like an empty object
 * @param {'act'|'browse'|'navigate'|'full'} [options.mode='act'] - Pruning mode.
 *   Any value that is not a key of MODE_REGIONS is treated as 'act' for the
 *   whole pipeline.
 * @param {string} [options.context=''] - Search context for relevance filtering;
 *   split on whitespace into lower-cased keywords longer than one character
 * @returns {object|null} The single surviving node, a synthetic 'root' node
 *   wrapping several survivors, or null when nothing survives
 */
export function prune(tree, options = {}) {
  const { mode: requestedMode = 'act', context = '' } = options ?? {};

  // Normalise the mode once so region selection and node pruning always agree.
  // The own-property check keeps inherited names such as 'constructor' from
  // resolving to something that is not a Set.
  const isKnownMode = Object.prototype.hasOwnProperty.call(MODE_REGIONS, requestedMode);
  const mode = isKnownMode ? requestedMode : 'act';
  const allowedRegions = MODE_REGIONS[mode];
  const isBrowse = mode === 'browse';

  const keywords = context
    ? String(context).toLowerCase().split(/\s+/).filter((w) => w.length > 1)
    : [];

  // Wrap as array for the pipeline.
  let nodes = tree ? [tree] : [];

  // Step 1: Extract landmark regions
  nodes = extractRegions(nodes, allowedRegions);

  // Step 2: Prune nodes
  const ctx = { mode, parentRole: null, keywords };
  nodes = nodes.map((n) => pruneNode(n, ctx)).filter(Boolean);

  // Step 3: Collapse structural wrappers
  nodes = nodes.map((n) => collapse(n)).filter(Boolean);

  // Step 4: Post-clean (combobox trim, orphaned headings)
  nodes = nodes.map((n) => postClean(n, isBrowse)).filter(Boolean);

  // Steps 5-8: E-commerce noise removal (skipped in browse mode)
  if (!isBrowse) {
    nodes = dedupLinks(nodes);
    nodes = nodes.map((n) => dropNoiseButtons(n)).filter(Boolean);
    nodes = truncateAfterFooter(nodes);
    nodes = nodes.map((n) => dropFilterGroups(n)).filter(Boolean);
  }

  // Return the single root, or wrap multiple survivors.
  if (nodes.length === 0) return null;
  if (nodes.length === 1) return nodes[0];
  return { nodeId: '', role: 'root', name: '', properties: {}, ignored: false, children: nodes };
}
104
+
105
+ // --- Step 1: Region extraction ---
106
+
107
/**
 * Select the top-level nodes that belong to the allowed landmark regions.
 *
 * A lone RootWebArea/WebArea node is unwrapped to its children first (a root
 * without a children array is treated as empty). Landmark nodes are kept when
 * isRegionAllowed() accepts them. Non-landmark nodes are kept when:
 *  - landmarks exist and `main` has interactive content or a heading, and
 *    'navigation' is an allowed region;
 *  - landmarks exist but `main` is missing or empty, and the node itself has
 *    interactive content or a heading;
 *  - no landmarks exist at all.
 *
 * @param {object[]} nodes - Candidate root nodes
 * @param {Set<string>} allowedRegions - Landmark roles to keep
 * @returns {object[]} Nodes retained for further pruning
 */
function extractRegions(nodes, allowedRegions) {
  // Unwrap RootWebArea
  const isWrappedRoot = nodes.length === 1
    && (nodes[0].role === 'RootWebArea' || nodes[0].role === 'WebArea');
  const candidates = isWrappedRoot ? (nodes[0].children ?? []) : nodes;

  const hasLandmarks = candidates.some((n) => LANDMARKS.has(n.role));
  const mainNode = candidates.find((n) => n.role === 'main');
  const hasMain = mainNode ? (hasInteractive(mainNode) || hasHeading(mainNode)) : false;

  const results = [];
  for (const node of candidates) {
    if (LANDMARKS.has(node.role)) {
      if (isRegionAllowed(node, allowedRegions)) results.push(node);
    } else if (hasLandmarks && hasMain) {
      if (allowedRegions.has('navigation')) results.push(node);
    } else if (hasLandmarks && !hasMain) {
      if (hasInteractive(node) || hasHeading(node)) results.push(node);
    } else {
      results.push(node);
    }
  }
  return results;
}
131
+
132
/**
 * Decide whether a landmark node belongs to the allowed regions.
 *
 * A node passes when its role is directly allowed. A generic `region` also
 * passes when 'main' is allowed, unless its name matches one of the auxiliary
 * patterns (images, reviews, recommendations, cookies, ...).
 *
 * @param {object} node - Landmark node
 * @param {Set<string>} allowedRegions - Landmark roles to keep
 * @returns {boolean}
 */
function isRegionAllowed(node, allowedRegions) {
  const { role, name } = node;

  if (allowedRegions.has(role)) return true;
  if (role !== 'region' || !allowedRegions.has('main')) return false;

  const isAuxiliary = Boolean(name)
    && /image|review|recommend|related|similar|also viewed|cookie/i.test(name);
  return !isAuxiliary;
}
141
+
142
+ // --- Step 2: Node pruning ---
143
+
144
/**
 * Prune a single node (and, where applicable, its subtree).
 *
 * The checks run top to bottom as a list of guard clauses; the first one that
 * applies decides the outcome. Nodes that match no specific rule are treated
 * as structural: their children are pruned and the node survives only if some
 * child survives.
 *
 * @param {object|null} node - Node to prune
 * @param {{mode: string, parentRole: (string|null), keywords: string[]}} ctx - Pruning context
 * @returns {object|null} The pruned node, or null when it is dropped
 */
function pruneNode(node, ctx) {
  if (!node) return null;

  const { role } = node;

  // Rendering noise is skipped outright.
  if (SKIP_ROLES.has(role)) return null;

  const { mode } = ctx;
  const inAct = mode === 'act';
  const inBrowse = mode === 'browse';
  const headingLevel = node.properties?.level;

  // Keep the node and prune its children with the unchanged context.
  const withPrunedChildren = () => ({ ...node, children: pruneChildren(node.children, ctx) });
  // Keep the node itself but discard its children.
  const asLeaf = () => ({ ...node, children: [] });

  // Links directly inside paragraphs are dropped in act mode.
  if (inAct && role === 'link' && ctx.parentRole === 'paragraph') return null;

  // Paragraphs: dropped in act mode, kept otherwise.
  if (role === 'paragraph') return inAct ? null : withPrunedChildren();

  // Navigation is dropped in browse mode (page chrome).
  if (inBrowse && role === 'navigation') return null;

  // Code blocks are returned untouched.
  if (role === 'code') return node;

  // Term/definition: keep and recurse.
  if (role === 'term' || role === 'definition') return withPrunedChildren();

  // Inline emphasis and quotes are kept in browse mode.
  if (inBrowse && ['strong', 'emphasis', 'blockquote'].includes(role)) {
    return withPrunedChildren();
  }

  // Figures in browse mode become a caption text node when they are named.
  if (inBrowse && role === 'figure') {
    return node.name
      ? { ...node, role: 'StaticText', name: `[Figure: ${node.name}]`, children: [] }
      : null;
  }

  // Interactive elements are always kept.
  if (INTERACTIVE.has(role)) return withPrunedChildren();

  // Context-aware: condense list items whose text matches none of the keywords.
  if (!inBrowse && ctx.keywords.length > 0 && role === 'listitem' && hasInteractive(node)) {
    const haystack = extractText(node).toLowerCase();
    const isRelevant = ctx.keywords.some((kw) => haystack.includes(kw));
    if (!isRelevant) return condenseCard(node);
  }

  // Named widget groups are kept.
  if (GROUPS.has(role) && node.name) return withPrunedChildren();

  // Named generic groups: colour pickers are summarised outside browse mode.
  if (role === 'group' && node.name) {
    const isColorPicker = !inBrowse && /kleuren|colors?|couleurs?|farben/i.test(node.name);
    return isColorPicker ? collapseColors(node) : withPrunedChildren();
  }

  // Headings: outside browse mode, descriptive sub-headings are dropped.
  if (role === 'heading') {
    const isSubHeading = headingLevel !== '1' && headingLevel !== 1;
    if (
      !inBrowse && isSubHeading && node.name
      && /about this|description|detail|feature|specification|overview/i.test(node.name)
    ) {
      return null;
    }
    return asLeaf();
  }

  // StaticText — CDP equivalent of mcprune's "text" nodes.
  if (role === 'StaticText') return keepText(node, mode) ? node : null;

  // Images: only named ones survive, and only in browse mode.
  if (role === 'img' || role === 'image') return inBrowse && node.name ? asLeaf() : null;

  // Separators are dropped.
  if (role === 'separator') return null;

  // Complementary content is kept only in browse mode.
  if (role === 'complementary') return inBrowse ? withPrunedChildren() : null;

  // Auxiliary regions are dropped outside browse mode.
  if (
    role === 'region' && !inBrowse && node.name
    && /image|review|recommend|related|similar|also viewed/i.test(node.name)
  ) {
    return null;
  }

  // Notes and status messages are kept in browse mode.
  if (inBrowse && (role === 'note' || role === 'status')) return withPrunedChildren();

  // Structural fallback: recurse with this node recorded as the parent role.
  const keptChildren = pruneChildren(node.children, { ...ctx, parentRole: role });

  // Outside browse mode, lists and list items without interactive content go.
  if (!inBrowse) {
    if (role === 'list' && !keptChildren.some((c) => hasInteractive(c))) return null;
    if (role === 'listitem' && !hasInteractive(node)) return null;
  }

  return keptChildren.length > 0 ? { ...node, children: keptChildren } : null;
}
267
+
268
/**
 * Prune every child with pruneNode() and keep the survivors.
 *
 * @param {object[]|null|undefined} children - Child nodes; a falsy value yields []
 * @param {object} ctx - Pruning context passed through to pruneNode()
 * @returns {object[]} Children that were not dropped
 */
function pruneChildren(children, ctx) {
  const pruneOne = (child) => pruneNode(child, ctx);
  return children ? children.map(pruneOne).filter(Boolean) : [];
}
272
+
273
/**
 * Decide whether a StaticText node is worth keeping.
 *
 * Browse mode keeps every non-empty text except a lone separator glyph.
 * Every other mode keeps prices, stock/shipping phrases, short labels ending
 * in a colon (under 40 characters) and any text under 30 characters.
 *
 * @param {object} node - StaticText node; its `name` holds the text
 * @param {string} mode - Pruning mode
 * @returns {boolean}
 */
function keepText(node, mode) {
  const label = node.name || '';
  if (label === '') return false;

  if (mode === 'browse') {
    // Keep everything except separator noise.
    const isSeparatorGlyph = label.length <= 2 && /^[|»·•→←>\-]$/.test(label.trim());
    return !isSeparatorGlyph;
  }

  // Act-style modes: prices, stock, shipping, short labels.
  const keepRules = [
    (s) => /\$[\d,]+\.?\d*|€[\d,]+/.test(s),
    (s) => /in stock|out of stock|unavailable|available/i.test(s),
    (s) => /delivery|shipping|free/i.test(s),
    (s) => s.length < 40 && s.endsWith(':'),
    (s) => s.length < 30,
  ];
  return keepRules.some((rule) => rule(label));
}
291
+
292
+ // --- Step 3: Collapse structural wrappers ---
293
+
294
/**
 * Collapse unnamed structural wrappers and table-layout nodes.
 *
 * Children are collapsed first. A wrapper (an unnamed STRUCTURAL role, or any
 * LayoutTable*, row, cell or rowgroup node) is then:
 *  - replaced by its only child when it has exactly one,
 *  - re-labelled with the role '_promote' when it has several,
 *  - dropped when it has none.
 * Every other node is returned with its collapsed children.
 *
 * A node without a `children` array is treated as having no children, so the
 * returned node always carries a `children` array.
 *
 * @param {object|null} node - Node to collapse
 * @returns {object|null} Collapsed node, or null when it is dropped
 */
function collapse(node) {
  if (!node) return null;

  const children = (node.children ?? []).map((c) => collapse(c)).filter(Boolean);
  const collapsed = { ...node, children };

  const isTableLayout = /^LayoutTable/.test(collapsed.role) ||
    collapsed.role === 'row' || collapsed.role === 'cell' || collapsed.role === 'rowgroup';
  const isWrapper = (STRUCTURAL.has(collapsed.role) && !collapsed.name) || isTableLayout;

  if (!isWrapper) return collapsed;
  if (children.length === 1) return children[0];
  if (children.length > 0) return { ...collapsed, role: '_promote' };
  return null;
}
312
+
313
+ // --- Step 4: Post-clean ---
314
+
315
/**
 * Post-clean a collapsed tree.
 *
 * Comboboxes and listboxes are reduced to a childless node named after their
 * selected child (or their own name when none is selected). All other nodes
 * are cleaned recursively, and outside browse mode their orphaned headings
 * are removed via dropOrphanedHeadings().
 *
 * @param {object|null} node - Node to clean
 * @param {boolean} isBrowse - Whether browse mode is active
 * @returns {object|null}
 */
function postClean(node, isBrowse) {
  if (!node) return null;

  const isChoiceWidget = node.role === 'combobox' || node.role === 'listbox';
  if (isChoiceWidget) {
    const selectedChild = node.children.find((c) => c.properties?.selected);
    const name = selectedChild?.name || node.name;
    return { ...node, name, children: [] };
  }

  const cleaned = node.children
    .map((child) => postClean(child, isBrowse))
    .filter(Boolean);

  return {
    ...node,
    children: isBrowse ? cleaned : dropOrphanedHeadings(cleaned),
  };
}
331
+
332
/**
 * Remove sub-headings that introduce no interactive content.
 *
 * A heading whose level is not 1 is kept only when some later sibling —
 * before the next heading — contains interactive content. Level-1 headings
 * and non-heading nodes are always kept.
 *
 * @param {object[]} children - Sibling nodes
 * @returns {object[]} Siblings without orphaned headings
 */
function dropOrphanedHeadings(children) {
  return children.filter((child, index) => {
    if (child.role !== 'heading') return true;

    const level = child.properties?.level;
    if (level === '1' || level === 1) return true;

    // Scan forward until the next heading, looking for interactive content.
    for (const sibling of children.slice(index + 1)) {
      if (sibling.role === 'heading') return false;
      if (hasInteractive(sibling)) return true;
    }
    return false;
  });
}
349
+
350
+ // --- Steps 5-8: E-commerce noise ---
351
+
352
/**
 * Remove repeated links across a list of trees.
 *
 * One shared Map of seen link names is handed to dedupLinksIn() for every
 * tree, so a name seen in an earlier tree counts as a duplicate in later ones.
 *
 * @param {object[]} nodes - Root nodes
 * @returns {object[]} Roots that survived de-duplication
 */
function dedupLinks(nodes) {
  const seenNames = new Map();
  const survivors = [];
  for (const root of nodes) {
    const deduped = dedupLinksIn(root, seenNames);
    if (deduped) survivors.push(deduped);
  }
  return survivors;
}
356
+
357
/**
 * Remove links whose name was already seen, recursively.
 *
 * A named link is dropped when its name is present in `seen`; otherwise the
 * name is recorded. The children of a `listitem` are de-duplicated against a
 * fresh Map local to that item, and a list item left without children is
 * dropped. All other nodes pass `seen` down to their children.
 *
 * @param {object|null} node - Node to process
 * @param {Map<string, boolean>} seen - Link names seen so far in this scope
 * @returns {object|null}
 */
function dedupLinksIn(node, seen) {
  if (!node) return null;

  const isNamedLink = node.role === 'link' && node.name;
  if (isNamedLink) {
    if (seen.has(node.name)) return null;
    seen.set(node.name, true);
  }

  const isListItem = node.role === 'listitem';
  // List items start their own de-duplication scope.
  const scope = isListItem ? new Map() : seen;
  const children = node.children
    .map((child) => dedupLinksIn(child, scope))
    .filter(Boolean);

  if (isListItem && children.length === 0) return null;
  return { ...node, children };
}
371
+
372
// Button names treated as noise (energy labels, sponsored markers, rating details).
const NOISE_BUTTONS = /energieklasse|energy\s*class|productinformatieblad|product\s*information\s*sheet|gesponsorde|sponsored|ad\s*feedback|sterren.*details.*beoordeling|stars.*rating\s*detail/i;
// Link names that only offer to view options.
const NOISE_LINKS = /^opties bekijken$|^view options$|^see options$|^voir les options$/i;
// Link names typical of footer/legal sections.
const FOOTER_LINKS = /gebruiks.*voorwaarden|conditions.*use|privacy|cookie|contactgegevens|contact\s*info|advertenties|interest.*ads|lees\s*meer\s*over\s*deze\s*resultaten/i;

/**
 * Remove noise buttons and noise/footer links from a tree.
 *
 * A named button matching NOISE_BUTTONS, or a named link matching NOISE_LINKS
 * or FOOTER_LINKS, is dropped together with its subtree. Every other node is
 * kept with its children filtered recursively.
 *
 * @param {object|null} node - Node to filter
 * @returns {object|null}
 */
function dropNoiseButtons(node) {
  if (!node) return null;

  const { role, name } = node;
  if (name) {
    if (role === 'button' && NOISE_BUTTONS.test(name)) return null;
    if (role === 'link' && (NOISE_LINKS.test(name) || FOOTER_LINKS.test(name))) return null;
  }

  const children = [];
  for (const child of node.children) {
    const kept = dropNoiseButtons(child);
    if (kept) children.push(kept);
  }
  return { ...node, children };
}
383
+
384
/**
 * Cut a sibling list at the first footer marker.
 *
 * Iteration stops at the first sibling for which isFooterMarker() is true;
 * siblings for which isSkippable() is true are omitted. Nodes with children
 * are truncated recursively, and a STRUCTURAL node left with no children is
 * dropped.
 *
 * @param {object[]} nodes - Sibling nodes
 * @returns {object[]} Siblings preceding the footer marker
 */
function truncateAfterFooter(nodes) {
  const kept = [];

  for (const current of nodes) {
    if (isFooterMarker(current)) break;
    if (isSkippable(current)) continue;

    const hasChildren = current.children?.length > 0;
    if (!hasChildren) {
      kept.push(current);
      continue;
    }

    const children = truncateAfterFooter(current.children);
    const isEmptiedWrapper = children.length === 0 && STRUCTURAL.has(current.role);
    if (!isEmptiedWrapper) kept.push({ ...current, children });
  }

  return kept;
}
400
+
401
/**
 * Tell whether a node marks the start of footer content.
 *
 * Markers are: a "back to top" button, any level-6 heading, and headings
 * named like related-search or help sections.
 *
 * @param {object} node - Node to inspect
 * @returns {boolean}
 */
function isFooterMarker(node) {
  const { role, name } = node;

  if (role === 'button') {
    return Boolean(name) && /terug naar boven|back to top/i.test(name);
  }

  if (role === 'heading') {
    const level = node.properties?.level;
    if (level === '6' || level === 6) return true;
    return Boolean(name) && /gerelateerde zoek|related search|hulp nodig|need help/i.test(name);
  }

  return false;
}
408
+
409
/**
 * Tell whether a node is a dialog whose name mentions "filter".
 *
 * The result is meant for truthiness checks: for a dialog without a name the
 * falsy name itself is returned rather than `false`.
 *
 * @param {object} node - Node to inspect
 * @returns {boolean|*} Truthy when the node should be skipped
 */
function isSkippable(node) {
  if (node.role !== 'dialog') return false;
  if (!node.name) return node.name;
  return /filter/i.test(node.name);
}
412
+
413
// Phrases that identify a search-filter group.
const FILTER_GROUP = /toepassen om de resultaten|filter.*to narrow|apply.*filter|refine by/i;

/**
 * Remove filter groups and the empty wrappers they leave behind.
 *
 * A named `group` whose combined text (via extractText) matches FILTER_GROUP
 * is dropped. Other nodes are filtered recursively; an unnamed STRUCTURAL
 * node that ends up with no children is dropped as well.
 *
 * @param {object|null} node - Node to filter
 * @returns {object|null}
 */
function dropFilterGroups(node) {
  if (!node) return null;

  const isNamedGroup = node.role === 'group' && node.name;
  if (isNamedGroup && FILTER_GROUP.test(extractText(node))) return null;

  const filtered = {
    ...node,
    children: node.children.map((child) => dropFilterGroups(child)).filter(Boolean),
  };

  const isEmptyWrapper = STRUCTURAL.has(filtered.role)
    && !filtered.name
    && filtered.children.length === 0;
  return isEmptyWrapper ? null : filtered;
}
422
+
423
+ // --- Helpers ---
424
+
425
/**
 * Tell whether a node or any of its descendants has a role listed in
 * INTERACTIVE or GROUPS.
 *
 * @param {object} node - Node to inspect
 * @returns {boolean}
 */
function hasInteractive(node) {
  const { role, children } = node;
  if (INTERACTIVE.has(role)) return true;
  if (GROUPS.has(role)) return true;
  if (children == null) return false;
  return children.some((child) => hasInteractive(child));
}
429
+
430
/**
 * Tell whether a node is a heading or contains one anywhere in its subtree.
 *
 * @param {object} node - Node to inspect
 * @returns {boolean}
 */
function hasHeading(node) {
  if (node.role === 'heading') return true;
  const { children } = node;
  if (children == null) return false;
  return children.some((child) => hasHeading(child));
}
434
+
435
/**
 * Concatenate the names of a node and all its descendants, depth-first.
 *
 * Each child's text is appended after a single space, so nodes without a name
 * still contribute their separator.
 *
 * @param {object} node - Root of the subtree
 * @returns {string} Combined text
 */
function extractText(node) {
  const ownText = node.name || '';
  const children = node.children || [];
  return children.reduce(
    (combined, child) => combined + ' ' + extractText(child),
    ownText,
  );
}
440
+
441
/**
 * Flatten a forest into a single array in pre-order (each node before its
 * children, siblings in their original order).
 *
 * Uses an explicit stack rather than recursion plus `push(...spread)`, so the
 * result does not depend on call-stack or argument-count limits for very
 * wide or very deep trees.
 *
 * @param {object[]} nodes - Root nodes
 * @returns {object[]} Every node of the forest, parents first
 */
function flatten(nodes) {
  const result = [];
  const pending = [];

  // Push in reverse so nodes pop off in their original left-to-right order.
  for (let i = nodes.length - 1; i >= 0; i--) pending.push(nodes[i]);

  while (pending.length > 0) {
    const current = pending.pop();
    result.push(current);

    const { children } = current;
    if (children) {
      for (let i = children.length - 1; i >= 0; i--) pending.push(children[i]);
    }
  }

  return result;
}
449
+
450
/**
 * Reduce a card to a bare list item holding only its first named link.
 *
 * @param {object} node - Card node to condense
 * @returns {object|null} A `listitem` wrapping a childless copy of the first
 *   named link in the subtree, or null when the subtree has no named link
 */
function condenseCard(node) {
  const firstNamedLink = flatten([node]).find((n) => n.role === 'link' && n.name);
  if (!firstNamedLink) return null;

  const { nodeId } = node;
  const strippedLink = { ...firstNamedLink, children: [] };
  return {
    nodeId,
    role: 'listitem',
    name: '',
    properties: {},
    ignored: false,
    children: [strippedLink],
  };
}
459
+
460
/**
 * Summarise a colour-picker group as one text node.
 *
 * Named links whose name does not start with '+' are treated as colours and
 * listed in a `colors(N): a, b, ...` StaticText node. When no such link
 * exists, a childless copy of the first named link is returned instead, or
 * null when the subtree has no named link at all.
 *
 * @param {object} node - Colour group node
 * @returns {object|null}
 */
function collapseColors(node) {
  const namedLinks = flatten([node]).filter((n) => n.role === 'link' && n.name);
  const colorNames = namedLinks
    .filter((link) => !link.name.startsWith('+'))
    .map((link) => link.name);

  if (colorNames.length > 0) {
    return {
      nodeId: node.nodeId,
      role: 'StaticText',
      name: `colors(${colorNames.length}): ${colorNames.join(', ')}`,
      properties: {},
      ignored: false,
      children: [],
    };
  }

  // Only "+N more"-style links (or nothing) remain: fall back to the first one.
  const [fallback] = namedLinks;
  return fallback ? { ...fallback, children: [] } : null;
}