barebrowse 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.mcp.json +8 -0
- package/CHANGELOG.md +100 -0
- package/CLAUDE.md +22 -0
- package/README.md +123 -43
- package/barebrowse.context.md +261 -0
- package/cli.js +156 -0
- package/docs/blueprint.md +361 -0
- package/docs/testing.md +202 -0
- package/mcp-server.js +216 -0
- package/package.json +22 -9
- package/src/aria.js +69 -0
- package/src/auth.js +279 -0
- package/src/bareagent.js +161 -0
- package/src/cdp.js +148 -0
- package/src/chromium.js +148 -0
- package/src/consent.js +210 -0
- package/src/index.js +186 -10
- package/src/interact.js +208 -0
- package/src/prune.js +472 -0
- package/src/stealth.js +51 -0
package/src/interact.js
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* interact.js — Click, type, scroll, and press keys via CDP Input/DOM domains.
|
|
3
|
+
*
|
|
4
|
+
* All functions take a session-scoped CDP handle (from cdp.session()).
|
|
5
|
+
* Coordinates come from DOM.getBoxModel which returns viewport-relative quads.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Definitions for the named keys accepted by press().
 *
 * Each entry carries the DOM `key` and `code` values plus `keyCode`, which
 * press() forwards as CDP's `windowsVirtualKeyCode`. Entries with a `text`
 * field also produce that character when pressed; entries without it are
 * dispatched as pure key events.
 */
const KEY_MAP = {
  Enter: { key: 'Enter', code: 'Enter', keyCode: 13, text: '\r' },
  Tab: { key: 'Tab', code: 'Tab', keyCode: 9, text: '\t' },
  Escape: { key: 'Escape', code: 'Escape', keyCode: 27 },
  Backspace: { key: 'Backspace', code: 'Backspace', keyCode: 8 },
  Delete: { key: 'Delete', code: 'Delete', keyCode: 46 },
  ArrowUp: { key: 'ArrowUp', code: 'ArrowUp', keyCode: 38 },
  ArrowDown: { key: 'ArrowDown', code: 'ArrowDown', keyCode: 40 },
  ArrowLeft: { key: 'ArrowLeft', code: 'ArrowLeft', keyCode: 37 },
  ArrowRight: { key: 'ArrowRight', code: 'ArrowRight', keyCode: 39 },
  Home: { key: 'Home', code: 'Home', keyCode: 36 },
  End: { key: 'End', code: 'End', keyCode: 35 },
  PageUp: { key: 'PageUp', code: 'PageUp', keyCode: 33 },
  PageDown: { key: 'PageDown', code: 'PageDown', keyCode: 34 },
  // Space is a printable key: without `text` the key event carries no
  // character, so nothing would be typed into a focused field.
  Space: { key: ' ', code: 'Space', keyCode: 32, text: ' ' },
};
|
|
25
|
+
|
|
26
|
+
/**
 * Compute the midpoint of a DOM node's content box.
 *
 * The node is first scrolled into view (DOM.scrollIntoViewIfNeeded) and its
 * box model is then read (DOM.getBoxModel). The midpoint is the average of
 * the first and third corners of the content quad, i.e. of two opposite
 * corners.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID from the ARIA tree
 * @returns {Promise<{x: number, y: number}>} Center point of the content quad
 */
async function getCenter(session, backendNodeId) {
  await session.send('DOM.scrollIntoViewIfNeeded', { backendNodeId });
  const { model } = await session.send('DOM.getBoxModel', { backendNodeId });

  // Quad layout: [x1,y1, x2,y2, x3,y3, x4,y4]; corners 1 and 3 are diagonal.
  const [firstX, firstY, , , thirdX, thirdY] = model.content;
  const x = (firstX + thirdX) / 2;
  const y = (firstY + thirdY) / 2;
  return { x, y };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Left-click the element identified by a backend DOM node ID.
 *
 * The click point comes from getCenter(), which scrolls the element into
 * view first. A single click is then emitted as a mousePressed event
 * followed by a mouseReleased event at that point.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID
 * @returns {Promise<void>}
 */
export async function click(session, backendNodeId) {
  const point = await getCenter(session, backendNodeId);

  for (const phase of ['mousePressed', 'mouseReleased']) {
    await session.send('Input.dispatchMouseEvent', {
      type: phase,
      x: point.x,
      y: point.y,
      button: 'left',
      clickCount: 1,
    });
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Enter text into the element identified by a backend DOM node ID.
 *
 * The element is focused with DOM.focus. By default the whole string is then
 * inserted with one Input.insertText call (no key events). With
 * `opts.keyEvents` a keyDown/keyUp pair carrying the character as `text` is
 * dispatched for every character instead. With `opts.clear` a Ctrl+A
 * keystroke followed by Backspace is dispatched before any text is entered.
 *
 * NOTE(review): clearing relies on Ctrl+A (modifiers: 2) selecting all text;
 * presumably this does not hold on macOS, confirm before relying on it there.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID
 * @param {string} text - Text to type
 * @param {object} [opts]
 * @param {boolean} [opts.keyEvents=false] - Dispatch per-character key events
 * @param {boolean} [opts.clear=false] - Remove existing content first
 * @returns {Promise<void>}
 */
export async function type(session, backendNodeId, text, opts = {}) {
  const sendKey = (params) => session.send('Input.dispatchKeyEvent', params);

  await session.send('DOM.focus', { backendNodeId });

  if (opts.clear) {
    // modifiers: 2 is the Ctrl bit
    const selectAll = { key: 'a', code: 'KeyA', windowsVirtualKeyCode: 65, modifiers: 2 };
    const erase = { key: 'Backspace', code: 'Backspace', windowsVirtualKeyCode: 8 };
    for (const stroke of [selectAll, erase]) {
      await sendKey({ type: 'keyDown', ...stroke });
      await sendKey({ type: 'keyUp', ...stroke });
    }
  }

  if (!opts.keyEvents) {
    await session.send('Input.insertText', { text });
    return;
  }

  for (const char of text) {
    await sendKey({ type: 'keyDown', text: char });
    await sendKey({ type: 'keyUp', text: char });
  }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Press one of the named special keys defined in KEY_MAP.
 *
 * Dispatches a keyDown followed by a keyUp for the key. When the key's
 * definition has a `text` value, it is attached to both events.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {string} key - Key name (e.g. 'Enter', 'Tab', 'Escape', 'ArrowDown')
 * @returns {Promise<void>}
 * @throws {Error} If `key` is not one of KEY_MAP's own keys
 */
export async function press(session, key) {
  // Own-property lookup: a plain `KEY_MAP[key]` would also resolve inherited
  // names such as 'constructor' or 'toString' and slip past the check below.
  const def = Object.prototype.hasOwnProperty.call(KEY_MAP, key) ? KEY_MAP[key] : undefined;
  if (!def) throw new Error(`Unknown key: "${key}". Valid keys: ${Object.keys(KEY_MAP).join(', ')}`);

  const base = { key: def.key, code: def.code, windowsVirtualKeyCode: def.keyCode };
  if (def.text) base.text = def.text;

  await session.send('Input.dispatchKeyEvent', { type: 'keyDown', ...base });
  await session.send('Input.dispatchKeyEvent', { type: 'keyUp', ...base });
}
|
|
119
|
+
|
|
120
|
+
/**
 * Scroll by dispatching a single mouseWheel event.
 *
 * The event is sent at the given coordinates, which default to (400, 300).
 * Only vertical scrolling is supported: deltaX is always 0.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} deltaY - Pixels to scroll (positive = down, negative = up)
 * @param {number} [x=400] - X coordinate for the wheel event
 * @param {number} [y=300] - Y coordinate for the wheel event
 * @returns {Promise<void>}
 */
export async function scroll(session, deltaY, x = 400, y = 300) {
  const wheelEvent = { type: 'mouseWheel', x, y, deltaX: 0, deltaY };
  await session.send('Input.dispatchMouseEvent', wheelEvent);
}
|
|
134
|
+
|
|
135
|
+
/**
 * Move the mouse pointer over the element identified by a backend DOM node ID.
 *
 * The target point comes from getCenter(), which scrolls the element into
 * view first; a single mouseMoved event is then dispatched at that point.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID
 * @returns {Promise<void>}
 */
export async function hover(session, backendNodeId) {
  const center = await getCenter(session, backendNodeId);
  const moveEvent = { type: 'mouseMoved', x: center.x, y: center.y };
  await session.send('Input.dispatchMouseEvent', moveEvent);
}
|
|
148
|
+
|
|
149
|
+
/**
 * Select a value in a native <select> element or a custom dropdown.
 *
 * Strategy 1 (node's tagName is 'SELECT'): find the option whose `value` or
 * trimmed text equals `value`, assign it, and dispatch bubbling 'input' and
 * 'change' events, the same pair a user-driven selection produces.
 *
 * Strategy 2 (anything else): click the element to open it, wait 300 ms, then
 * click the first element with role "option" or "menuitem" whose trimmed text
 * equals `value`.
 *
 * @param {object} session - Session-scoped CDP handle
 * @param {number} backendNodeId - Backend DOM node ID of the select/combobox
 * @param {string} value - Option value or visible text to select
 * @returns {Promise<boolean>} true when a matching option was found and
 *   selected, false when nothing matched
 */
export async function select(session, backendNodeId, value) {
  // Resolve to a JS object so the page-side functions can run with it as `this`
  const { object } = await session.send('DOM.resolveNode', { backendNodeId });

  const { result: tagResult } = await session.send('Runtime.callFunctionOn', {
    objectId: object.objectId,
    functionDeclaration: 'function() { return this.tagName; }',
    returnByValue: true,
  });

  if (tagResult.value === 'SELECT') {
    const response = await session.send('Runtime.callFunctionOn', {
      objectId: object.objectId,
      functionDeclaration: `function(v) {
        // Try by value first, then by visible text
        const opt = Array.from(this.options).find(o => o.value === v || o.textContent.trim() === v);
        if (!opt) return false;
        this.value = opt.value;
        this.dispatchEvent(new Event('input', { bubbles: true }));
        this.dispatchEvent(new Event('change', { bubbles: true }));
        return true;
      }`,
      arguments: [{ value }],
      returnByValue: true,
    });
    // Surface the page-side outcome instead of discarding it
    return response?.result?.value === true;
  }

  // Custom dropdown: open it, then give its options time to render
  await click(session, backendNodeId);
  await new Promise((r) => setTimeout(r, 300));

  const response = await session.send('Runtime.evaluate', {
    expression: `(() => {
      const options = document.querySelectorAll('[role="option"], [role="menuitem"], li[role="option"]');
      for (const opt of options) {
        if (opt.textContent.trim() === ${JSON.stringify(value)}) {
          opt.click();
          return true;
        }
      }
      return false;
    })()`,
    returnByValue: true,
  });
  return response?.result?.value === true;
}
|
package/src/prune.js
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* prune.js — ARIA tree pruning for agent consumption.
|
|
3
|
+
*
|
|
4
|
+
* Ported from mcprune. Pure function: tree in, pruned tree out.
|
|
5
|
+
* Zero deps, zero I/O.
|
|
6
|
+
*
|
|
7
|
+
* Node shape (from CDP Accessibility.getFullAXTree, after buildTree):
|
|
8
|
+
* { nodeId, role, name, properties: { level, checked, ... }, ignored, children }
|
|
9
|
+
*
|
|
10
|
+
* We adapt mcprune's logic which used:
|
|
11
|
+
* { role, name, ref, states: { level, checked }, text, children }
|
|
12
|
+
*
|
|
13
|
+
* The mapping:
|
|
14
|
+
* mcprune.ref → nodeId
|
|
15
|
+
* mcprune.states → properties
|
|
16
|
+
* mcprune.text → StaticText child's name (CDP has no inline text)
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
// --- Role taxonomy (from mcprune/roles.js) ---
|
|
20
|
+
|
|
21
|
+
/** Landmark roles; extractRegions() applies region filtering to nodes with these roles. */
const LANDMARKS = new Set([
  'banner',
  'main',
  'contentinfo',
  'navigation',
  'complementary',
  'search',
  'form',
  'region',
]);

/** Roles of controls an agent can act on; pruneNode() always keeps these. */
const INTERACTIVE = new Set([
  'button',
  'link',
  'textbox',
  'searchbox',
  'checkbox',
  'radio',
  'combobox',
  'listbox',
  'menuitem',
  'menuitemcheckbox',
  'menuitemradio',
  'option',
  'slider',
  'spinbutton',
  'switch',
  'tab',
  'treeitem',
]);

/** Container roles for sets of controls; kept by pruneNode() when they are named. */
const GROUPS = new Set([
  'radiogroup',
  'tablist',
  'menu',
  'menubar',
  'toolbar',
  'listbox',
  'tree',
  'treegrid',
  'grid',
]);

/** Wrapper roles that collapse() may remove or flatten when they carry no name. */
const STRUCTURAL = new Set([
  'generic',
  'group',
  'list',
  'table',
  'row',
  'rowgroup',
  'cell',
  'directory',
  'document',
  'application',
  'presentation',
  'none',
  'separator',
  // CDP-specific roles that map to structural
  'LayoutTable',
  'LayoutTableRow',
  'LayoutTableCell',
]);

/** Landmark roles retained per pruning mode; prune() falls back to `act` for unknown modes. */
const MODE_REGIONS = {
  act: new Set(['main']),
  browse: new Set(['main']),
  navigate: new Set(['main', 'banner', 'navigation', 'search']),
  full: new Set(['main', 'banner', 'navigation', 'contentinfo', 'complementary', 'search']),
};

/** Roles that are rendering noise; pruneNode() drops them outright. */
const SKIP_ROLES = new Set([
  'InlineTextBox',
  'LineBreak',
  'superscript',
]);
|
|
55
|
+
|
|
56
|
+
// --- Main export ---
|
|
57
|
+
|
|
58
|
+
/**
 * Prune an ARIA tree for agent consumption.
 *
 * Runs the tree through a fixed pipeline: landmark-region extraction,
 * per-node pruning, structural-wrapper collapsing and post-cleaning. Outside
 * browse mode four further noise-removal passes run (link dedup, noise
 * buttons, footer truncation, filter groups).
 *
 * @param {object} tree - Root node from buildTree() (CDP format)
 * @param {object} [options]
 * @param {'act'|'browse'|'navigate'|'full'} [options.mode='act'] - Pruning
 *   mode; unrecognised values use the `act` region set
 * @param {string} [options.context=''] - Search context; its whitespace-
 *   separated words longer than one character become lower-cased keywords
 * @returns {object|null} null when nothing survives, the surviving node when
 *   there is exactly one, otherwise a synthetic `root` node wrapping them
 */
export function prune(tree, options = {}) {
  const { mode = 'act', context = '' } = options;
  const allowedRegions = MODE_REGIONS[mode] || MODE_REGIONS.act;
  const browsing = mode === 'browse';

  let keywords = [];
  if (context) {
    keywords = context.toLowerCase().split(/\s+/).filter((word) => word.length > 1);
  }

  // Apply a per-node pass and discard nodes the pass rejected
  const applyPass = (list, pass) => list.map((item) => pass(item)).filter(Boolean);

  // Step 1: Extract landmark regions (tree is wrapped as an array first)
  let nodes = extractRegions(tree ? [tree] : [], allowedRegions);

  // Step 2: Prune nodes
  const ctx = { mode, parentRole: null, keywords };
  nodes = applyPass(nodes, (item) => pruneNode(item, ctx));

  // Step 3: Collapse structural wrappers
  nodes = applyPass(nodes, (item) => collapse(item));

  // Step 4: Post-clean (combobox trim, orphaned headings)
  nodes = applyPass(nodes, (item) => postClean(item, browsing));

  // Steps 5-8: E-commerce noise removal (skipped in browse mode)
  if (!browsing) {
    nodes = dedupLinks(nodes);
    nodes = applyPass(nodes, (item) => dropNoiseButtons(item));
    nodes = truncateAfterFooter(nodes);
    nodes = applyPass(nodes, (item) => dropFilterGroups(item));
  }

  switch (nodes.length) {
    case 0:
      return null;
    case 1:
      return nodes[0];
    default:
      return { nodeId: '', role: 'root', name: '', properties: {}, ignored: false, children: nodes };
  }
}
|
|
104
|
+
|
|
105
|
+
// --- Step 1: Region extraction ---
|
|
106
|
+
|
|
107
|
+
/**
 * Select the top-level nodes to keep for the current mode.
 *
 * A lone RootWebArea/WebArea node is unwrapped to its children first. Then:
 *  - landmark nodes are kept when isRegionAllowed() accepts them;
 *  - if the page has no landmarks at all, every node is kept;
 *  - if a `main` landmark with interactive or heading content exists,
 *    non-landmark nodes are kept only when 'navigation' is an allowed region;
 *  - otherwise non-landmark nodes are kept when they contain interactive or
 *    heading content themselves.
 *
 * @param {object[]} nodes - Top-level nodes (normally a single root)
 * @param {Set<string>} allowedRegions - Landmark roles allowed for the mode
 * @returns {object[]} The retained nodes, in original order
 */
function extractRegions(nodes, allowedRegions) {
  const isSingleRoot = nodes.length === 1 &&
    (nodes[0].role === 'RootWebArea' || nodes[0].role === 'WebArea');
  const topLevel = isSingleRoot ? nodes[0].children : nodes;

  const anyLandmark = topLevel.some((n) => LANDMARKS.has(n.role));
  const main = topLevel.find((n) => n.role === 'main');
  const mainHasContent = Boolean(main) && (hasInteractive(main) || hasHeading(main));

  return topLevel.filter((node) => {
    if (LANDMARKS.has(node.role)) return isRegionAllowed(node, allowedRegions);
    if (!anyLandmark) return true;
    if (mainHasContent) return allowedRegions.has('navigation');
    return hasInteractive(node) || hasHeading(node);
  });
}
|
|
131
|
+
|
|
132
|
+
/**
 * Decide whether a landmark node is kept for the current mode.
 *
 * A node is allowed when its role is in `allowedRegions`. A `region` node is
 * additionally allowed whenever 'main' is allowed, unless its name marks it
 * as auxiliary content (images, reviews, recommendations, related/similar
 * items, "also viewed", cookie notices).
 *
 * @param {object} node - Landmark node to test
 * @param {Set<string>} allowedRegions - Landmark roles allowed for the mode
 * @returns {boolean}
 */
function isRegionAllowed(node, allowedRegions) {
  if (allowedRegions.has(node.role)) return true;

  const countsAsMain = node.role === 'region' && allowedRegions.has('main');
  if (!countsAsMain) return false;

  const isAuxiliary = Boolean(node.name) &&
    /image|review|recommend|related|similar|also viewed|cookie/i.test(node.name);
  return !isAuxiliary;
}
|
|
141
|
+
|
|
142
|
+
// --- Step 2: Node pruning ---
|
|
143
|
+
|
|
144
|
+
/**
 * Decide what to keep of one node (and, recursively, of its subtree).
 *
 * Rules are evaluated top to bottom and the first match wins, so their order
 * is significant. Nodes that match no rule are treated as structural: they
 * survive only if at least one child survives.
 *
 * @param {object|null} node - Node to prune
 * @param {object} ctx - Pruning context
 * @param {string} ctx.mode - Active pruning mode
 * @param {string|null} ctx.parentRole - Role of the enclosing structural node
 * @param {string[]} ctx.keywords - Lower-cased relevance keywords
 * @returns {object|null} The pruned node (usually a shallow copy), or null to drop it
 */
function pruneNode(node, ctx) {
  // Missing nodes and rendering noise are dropped outright
  if (!node || SKIP_ROLES.has(node.role)) return null;

  const { role, name } = node;
  const inBrowse = ctx.mode === 'browse';
  const inAct = ctx.mode === 'act';
  const headingLevel = node.properties?.level;
  // Keep this node and prune its children with the unchanged context
  const keepWithChildren = () => ({ ...node, children: pruneChildren(node.children, ctx) });

  // Links directly inside paragraphs: dropped in act mode.
  // NOTE(review): parentRole is only updated by the structural fallback
  // below, so this appears unreachable for paragraph parents; confirm.
  if (inAct && role === 'link' && ctx.parentRole === 'paragraph') return null;

  // Paragraphs: dropped in act mode, kept otherwise
  if (role === 'paragraph') return inAct ? null : keepWithChildren();

  // Navigation nested in content: page chrome in browse mode
  if (inBrowse && role === 'navigation') return null;

  // Code blocks are returned untouched, children included
  if (role === 'code') return node;

  if (role === 'term' || role === 'definition') return keepWithChildren();

  if (inBrowse && (role === 'strong' || role === 'emphasis' || role === 'blockquote')) {
    return keepWithChildren();
  }

  // Figures in browse mode are reduced to a caption line
  if (inBrowse && role === 'figure') {
    return name
      ? { ...node, role: 'StaticText', name: `[Figure: ${node.name}]`, children: [] }
      : null;
  }

  // Interactive elements are always kept
  if (INTERACTIVE.has(role)) return keepWithChildren();

  // Context-aware: list items with controls that mention no keyword are condensed
  if (!inBrowse && ctx.keywords.length > 0 && role === 'listitem' && hasInteractive(node)) {
    const haystack = extractText(node).toLowerCase();
    const mentionsKeyword = ctx.keywords.some((kw) => haystack.includes(kw));
    if (!mentionsKeyword) return condenseCard(node);
  }

  // Named control groups are kept
  if (GROUPS.has(role) && name) return keepWithChildren();

  if (role === 'group' && name) {
    // Colour pickers (nl/en/fr/de names) collapse to a summary outside browse mode
    const isColorGroup = !inBrowse && /kleuren|colors?|couleurs?|farben/i.test(name);
    return isColorGroup ? collapseColors(node) : keepWithChildren();
  }

  if (role === 'heading') {
    const isTopLevel = headingLevel === '1' || headingLevel === 1;
    const isDescriptive = Boolean(name) &&
      /about this|description|detail|feature|specification|overview/i.test(name);
    // Outside browse mode, non-h1 headings that introduce descriptive prose are dropped
    if (!inBrowse && !isTopLevel && isDescriptive) return null;
    return { ...node, children: [] };
  }

  // StaticText is CDP's equivalent of mcprune's "text" nodes
  if (role === 'StaticText') return keepText(node, ctx.mode) ? node : null;

  // Images survive only in browse mode, and only when named
  if (role === 'img' || role === 'image') {
    return inBrowse && name ? { ...node, children: [] } : null;
  }

  if (role === 'separator') return null;

  if (role === 'complementary') return inBrowse ? keepWithChildren() : null;

  // Auxiliary regions are dropped outside browse mode
  if (role === 'region' && !inBrowse && name &&
      /image|review|recommend|related|similar|also viewed/i.test(name)) {
    return null;
  }

  if (inBrowse && (role === 'note' || role === 'status')) return keepWithChildren();

  // Structural fallback: children are pruned with this node recorded as parent
  const survivors = pruneChildren(node.children, { ...ctx, parentRole: role });

  if (!inBrowse) {
    // Lists whose surviving children hold no controls, and list items with none, go
    if (role === 'list' && !survivors.some((child) => hasInteractive(child))) return null;
    if (role === 'listitem' && !hasInteractive(node)) return null;
  }

  return survivors.length > 0 ? { ...node, children: survivors } : null;
}
|
|
267
|
+
|
|
268
|
+
/**
 * Prune each child with pruneNode() and keep only the survivors.
 *
 * @param {object[]|undefined|null} children - Child nodes; a missing list is treated as empty
 * @param {object} ctx - Pruning context passed through to pruneNode()
 * @returns {object[]} Pruned children, in original order
 */
function pruneChildren(children, ctx) {
  const survivors = [];
  for (const child of children || []) {
    const pruned = pruneNode(child, ctx);
    if (pruned) survivors.push(pruned);
  }
  return survivors;
}
|
|
272
|
+
|
|
273
|
+
/**
 * Decide whether a StaticText node is worth keeping.
 *
 * Browse mode keeps every non-empty text except lone separator glyphs.
 * All other modes keep only text that looks actionable: prices, stock
 * status, delivery/shipping info, short labels ending in ':' (under 40
 * characters) and any text under 30 characters.
 *
 * @param {object} node - StaticText node; its `name` holds the text
 * @param {string} mode - Active pruning mode
 * @returns {boolean}
 */
function keepText(node, mode) {
  const text = node.name || '';
  if (!text) return false;

  if (mode === 'browse') {
    const isSeparatorGlyph = text.length <= 2 && /^[|»·•→←>\-]$/.test(text.trim());
    return !isSeparatorGlyph;
  }

  const signals = [
    /\$[\d,]+\.?\d*|€[\d,]+/, // prices
    /in stock|out of stock|unavailable|available/i, // stock status
    /delivery|shipping|free/i, // fulfilment
  ];
  if (signals.some((pattern) => pattern.test(text))) return true;

  const isShortLabel = text.length < 40 && text.endsWith(':');
  return isShortLabel || text.length < 30;
}
|
|
291
|
+
|
|
292
|
+
// --- Step 3: Collapse structural wrappers ---
|
|
293
|
+
|
|
294
|
+
/**
 * Remove structural wrapper nodes, bottom-up.
 *
 * Children are collapsed first. A node counts as a wrapper when it is an
 * unnamed STRUCTURAL role or any table-layout role (LayoutTable*, row, cell,
 * rowgroup; named or not). A wrapper with no children is dropped, one with a
 * single child is replaced by that child, and one with several children is
 * kept but re-tagged with the role `_promote`.
 *
 * @param {object|null} node - Node whose `children` is an array
 * @returns {object|null} Collapsed node, or null if it was an empty wrapper
 */
function collapse(node) {
  if (!node) return null;

  const children = node.children.map((child) => collapse(child)).filter(Boolean);
  const { role } = node;

  const isTableLayout = role === 'row' || role === 'cell' || role === 'rowgroup' ||
    /^LayoutTable/.test(role);
  const isWrapper = isTableLayout || (STRUCTURAL.has(role) && !node.name);

  if (!isWrapper) return { ...node, children };
  if (children.length === 0) return null;
  if (children.length === 1) return children[0];
  return { ...node, role: '_promote', children };
}
|
|
312
|
+
|
|
313
|
+
// --- Step 4: Post-clean ---
|
|
314
|
+
|
|
315
|
+
/**
 * Final tidy-up pass over a collapsed tree.
 *
 * combobox/listbox nodes lose all children and take the name of their first
 * child flagged `properties.selected` (falling back to their own name). Every
 * other node is cleaned recursively and, outside browse mode, has orphaned
 * headings removed from its children via dropOrphanedHeadings().
 *
 * @param {object|null} node - Node whose `children` is an array
 * @param {boolean} isBrowse - True when pruning in browse mode
 * @returns {object|null} Cleaned copy of the node, or null for a missing node
 */
function postClean(node, isBrowse) {
  if (!node) return null;

  const isChoiceList = node.role === 'combobox' || node.role === 'listbox';
  if (isChoiceList) {
    const current = node.children.find((option) => option.properties?.selected);
    return { ...node, name: current?.name || node.name, children: [] };
  }

  let children = node.children.map((child) => postClean(child, isBrowse)).filter(Boolean);
  if (!isBrowse) children = dropOrphanedHeadings(children);
  return { ...node, children };
}
|
|
331
|
+
|
|
332
|
+
/**
 * Remove headings that introduce no interactive content.
 *
 * A heading other than level 1 is kept only if, among the siblings that
 * follow it and precede the next heading, at least one contains something
 * interactive. Level-1 headings and non-heading nodes are always kept.
 *
 * @param {object[]} children - Sibling nodes
 * @returns {object[]} Siblings without the orphaned headings
 */
function dropOrphanedHeadings(children) {
  return children.filter((child, index) => {
    if (child.role !== 'heading') return true;

    const level = child.properties?.level;
    if (level === '1' || level === 1) return true;

    // Scan the section this heading opens: stop at the next heading
    for (const sibling of children.slice(index + 1)) {
      if (sibling.role === 'heading') return false;
      if (hasInteractive(sibling)) return true;
    }
    return false;
  });
}
|
|
349
|
+
|
|
350
|
+
// --- Steps 5-8: E-commerce noise ---
|
|
351
|
+
|
|
352
|
+
/**
 * Remove repeated links across a list of top-level nodes.
 *
 * One "seen" registry is shared across all of the given nodes, so a link
 * name already met in an earlier node is treated as a repeat in later ones.
 *
 * @param {object[]} nodes - Top-level nodes
 * @returns {object[]} Nodes that survived dedupLinksIn()
 */
function dedupLinks(nodes) {
  const seenNames = new Map();
  const survivors = [];
  for (const node of nodes) {
    const deduped = dedupLinksIn(node, seenNames);
    if (deduped) survivors.push(deduped);
  }
  return survivors;
}
|
|
356
|
+
|
|
357
|
+
/**
 * Drop named links whose name was already seen, recursively.
 *
 * A named link is removed if `seen` already holds its name; otherwise the
 * name is recorded. Each `listitem` deduplicates its subtree against a fresh
 * registry of its own, and is removed if that leaves it without children.
 *
 * @param {object|null} node - Node whose `children` is an array
 * @param {Map<string, boolean>} seen - Link names met so far in this scope
 * @returns {object|null} Deduplicated copy of the node, or null if dropped
 */
function dedupLinksIn(node, seen) {
  if (!node) return null;

  if (node.role === 'link' && node.name) {
    if (seen.has(node.name)) return null;
    seen.set(node.name, true);
  }

  const isListItem = node.role === 'listitem';
  // List items start a new scope, so their links do not clash with outer ones
  const scope = isListItem ? new Map() : seen;
  const children = node.children.map((child) => dedupLinksIn(child, scope)).filter(Boolean);

  if (isListItem && children.length === 0) return null;
  return { ...node, children };
}
|
|
371
|
+
|
|
372
|
+
// Button names treated as noise by dropNoiseButtons(): energy-class labels,
// product information sheets, sponsored/ad-feedback controls and star-rating
// detail buttons (Dutch and English wording).
const NOISE_BUTTONS = /energieklasse|energy\s*class|productinformatieblad|product\s*information\s*sheet|gesponsorde|sponsored|ad\s*feedback|sterren.*details.*beoordeling|stars.*rating\s*detail/i;
// Link names treated as noise by dropNoiseButtons(); must match the whole
// name ("view options" in Dutch, English and French).
const NOISE_LINKS = /^opties bekijken$|^view options$|^see options$|^voir les options$/i;
// Link names typical of page footers (terms of use, privacy, cookies, contact
// details, ads); links matching anywhere in the name are dropped by
// dropNoiseButtons().
const FOOTER_LINKS = /gebruiks.*voorwaarden|conditions.*use|privacy|cookie|contactgegevens|contact\s*info|advertenties|interest.*ads|lees\s*meer\s*over\s*deze\s*resultaten/i;
|
|
375
|
+
|
|
376
|
+
/**
 * Remove noise buttons and links from a subtree.
 *
 * Named buttons matching NOISE_BUTTONS, and named links matching NOISE_LINKS
 * or FOOTER_LINKS, are dropped together with their subtrees. Everything else
 * is copied with its children filtered recursively.
 *
 * @param {object|null} node - Node whose `children` is an array
 * @returns {object|null} Filtered copy of the node, or null if dropped
 */
function dropNoiseButtons(node) {
  if (!node) return null;

  const { role, name } = node;
  if (name) {
    if (role === 'button' && NOISE_BUTTONS.test(name)) return null;
    if (role === 'link' && (NOISE_LINKS.test(name) || FOOTER_LINKS.test(name))) return null;
  }

  const children = [];
  for (const child of node.children) {
    const kept = dropNoiseButtons(child);
    if (kept) children.push(kept);
  }
  return { ...node, children };
}
|
|
383
|
+
|
|
384
|
+
/**
 * Cut a sibling list at the first footer marker, recursively.
 *
 * Within each list of siblings, the first node recognised by isFooterMarker()
 * and everything after it are discarded. Nodes recognised by isSkippable()
 * are omitted. Nodes with children are truncated recursively; a STRUCTURAL
 * node left with no children by that truncation is omitted as well.
 *
 * @param {object[]} nodes - Sibling nodes
 * @returns {object[]} The siblings that precede the footer marker
 */
function truncateAfterFooter(nodes) {
  const kept = [];

  for (const node of nodes) {
    if (isFooterMarker(node)) break;
    if (isSkippable(node)) continue;

    const hasChildren = node.children?.length > 0;
    if (!hasChildren) {
      kept.push(node);
      continue;
    }

    const children = truncateAfterFooter(node.children);
    const isEmptiedWrapper = children.length === 0 && STRUCTURAL.has(node.role);
    if (!isEmptiedWrapper) kept.push({ ...node, children });
  }

  return kept;
}
|
|
400
|
+
|
|
401
|
+
/**
 * Recognise nodes that mark the start of footer content.
 *
 * Markers are: a "back to top" button (Dutch or English), any level-6
 * heading, and headings about related searches or help (Dutch or English).
 *
 * @param {object} node - Node to test
 * @returns {boolean}
 */
function isFooterMarker(node) {
  const { role, name } = node;

  if (role === 'button') {
    return Boolean(name) && /terug naar boven|back to top/i.test(name);
  }
  if (role !== 'heading') return false;

  const level = node.properties?.level;
  if (level === '6' || level === 6) return true;

  return Boolean(name) && /gerelateerde zoek|related search|hulp nodig|need help/i.test(name);
}
|
|
408
|
+
|
|
409
|
+
/**
 * Recognise filter dialogs, which truncateAfterFooter() omits.
 *
 * @param {object} node - Node to test
 * @returns {boolean|string|undefined|null} Truthy only for a `dialog` whose
 *   name contains "filter" (case-insensitive). For a dialog with a falsy
 *   name, that falsy name itself is returned rather than `false`.
 */
function isSkippable(node) {
  if (node.role !== 'dialog') return false;
  if (!node.name) return node.name;
  return /filter/i.test(node.name);
}
|
|
412
|
+
|
|
413
|
+
// Wording that identifies a search-filter group (Dutch and English);
// dropFilterGroups() tests it against the full text of named `group` nodes.
const FILTER_GROUP = /toepassen om de resultaten|filter.*to narrow|apply.*filter|refine by/i;
|
|
414
|
+
|
|
415
|
+
/**
 * Remove search-filter groups from a subtree.
 *
 * A named `group` whose full text (own name plus all descendant names, via
 * extractText()) matches FILTER_GROUP is dropped. Other nodes are copied with
 * their children filtered recursively; an unnamed STRUCTURAL node that ends
 * up without children is dropped too.
 *
 * @param {object|null} node - Node whose `children` is an array
 * @returns {object|null} Filtered copy of the node, or null if dropped
 */
function dropFilterGroups(node) {
  if (!node) return null;

  const isNamedGroup = node.role === 'group' && node.name;
  if (isNamedGroup && FILTER_GROUP.test(extractText(node))) return null;

  const children = [];
  for (const child of node.children) {
    const kept = dropFilterGroups(child);
    if (kept) children.push(kept);
  }

  const isEmptyWrapper = STRUCTURAL.has(node.role) && !node.name && children.length === 0;
  return isEmptyWrapper ? null : { ...node, children };
}
|
|
422
|
+
|
|
423
|
+
// --- Helpers ---
|
|
424
|
+
|
|
425
|
+
/**
 * Report whether a subtree contains anything an agent can interact with.
 *
 * A node qualifies when its own role is in INTERACTIVE or GROUPS, or when any
 * descendant qualifies.
 *
 * @param {object} node - Root of the subtree; `children` may be absent
 * @returns {boolean}
 */
function hasInteractive(node) {
  if (INTERACTIVE.has(node.role)) return true;
  if (GROUPS.has(node.role)) return true;

  for (const child of node.children ?? []) {
    if (hasInteractive(child)) return true;
  }
  return false;
}
|
|
429
|
+
|
|
430
|
+
/**
 * Report whether a subtree contains a heading.
 *
 * @param {object} node - Root of the subtree; `children` may be absent
 * @returns {boolean} True when the node or any descendant has role `heading`
 */
function hasHeading(node) {
  if (node.role === 'heading') return true;

  for (const child of node.children ?? []) {
    if (hasHeading(child)) return true;
  }
  return false;
}
|
|
434
|
+
|
|
435
|
+
/**
 * Concatenate the names found in a subtree.
 *
 * Starts from the node's own name (empty string if falsy) and appends, for
 * every child in order, a single space followed by that child's extracted
 * text. Children without a name therefore still contribute a space.
 *
 * @param {object} node - Root of the subtree; `children` may be absent
 * @returns {string} Space-separated names in depth-first order
 */
function extractText(node) {
  const ownText = node.name || '';
  return (node.children || []).reduce(
    (soFar, child) => `${soFar} ${extractText(child)}`,
    ownText,
  );
}
|
|
440
|
+
|
|
441
|
+
/**
 * List every node of a forest in depth-first pre-order.
 *
 * All nodes are appended to one shared accumulator. Spreading each subtree's
 * result into `push(...)` would pass one call argument per descendant, which
 * can exceed the engine's argument limit on very wide nodes and re-copies
 * every descendant once per ancestor level.
 *
 * @param {object[]} nodes - Root nodes; `children` may be absent on any node
 * @returns {object[]} The nodes themselves (not copies): parent first, then
 *   its descendants, then the next sibling
 */
function flatten(nodes) {
  const result = [];

  const visit = (list) => {
    for (const n of list) {
      result.push(n);
      if (n.children) visit(n.children);
    }
  };

  visit(nodes);
  return result;
}
|
|
449
|
+
|
|
450
|
+
/**
 * Reduce a card-like node to its first named link.
 *
 * @param {object} node - Card node (pruneNode() passes list items)
 * @returns {object|null} A fresh unnamed `listitem` carrying the original
 *   nodeId and a single child (the first named link in depth-first order,
 *   stripped of its own children), or null when the card has no named link
 */
function condenseCard(node) {
  const firstNamedLink = flatten([node]).find((n) => n.role === 'link' && n.name);
  if (!firstNamedLink) return null;

  const bareLink = { ...firstNamedLink, children: [] };
  return {
    nodeId: node.nodeId,
    role: 'listitem',
    name: '',
    properties: {},
    ignored: false,
    children: [bareLink],
  };
}
|
|
459
|
+
|
|
460
|
+
/**
 * Summarise a colour-picker group as a single text node.
 *
 * Collects the names of all named links in the subtree, ignoring those whose
 * name starts with '+'. If any remain, a StaticText node named
 * `colors(N): a, b, ...` replaces the group. If none remain, the first named
 * link (necessarily a '+...' one) is returned without children, or null when
 * the group has no named links at all.
 *
 * @param {object} node - Colour group node
 * @returns {object|null} Summary node, fallback link, or null
 */
function collapseColors(node) {
  const namedLinks = flatten([node]).filter((n) => n.role === 'link' && n.name);
  const colorNames = namedLinks
    .filter((link) => !link.name.startsWith('+'))
    .map((link) => link.name);

  if (colorNames.length > 0) {
    return {
      nodeId: node.nodeId,
      role: 'StaticText',
      name: `colors(${colorNames.length}): ${colorNames.join(', ')}`,
      properties: {},
      ignored: false,
      children: [],
    };
  }

  const [overflowLink] = namedLinks;
  return overflowLink ? { ...overflowLink, children: [] } : null;
}
|