@jackwener/opencli 1.7.5 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/README.zh-CN.md +5 -2
- package/cli-manifest.json +77 -1
- package/clis/bilibili/video.js +61 -0
- package/clis/bilibili/video.test.js +81 -0
- package/clis/deepseek/ask.js +21 -1
- package/clis/deepseek/ask.test.js +73 -0
- package/clis/deepseek/utils.js +84 -1
- package/clis/deepseek/utils.test.js +37 -0
- package/clis/jianyu/search.js +139 -3
- package/clis/jianyu/search.test.js +25 -0
- package/clis/jianyu/shared/procurement-detail.js +15 -0
- package/clis/jianyu/shared/procurement-detail.test.js +12 -0
- package/clis/twitter/shared.js +7 -2
- package/clis/twitter/tweets.js +218 -0
- package/clis/twitter/tweets.test.js +125 -0
- package/clis/youtube/channel.js +35 -0
- package/dist/src/browser/base-page.d.ts +13 -3
- package/dist/src/browser/base-page.js +35 -25
- package/dist/src/browser/cdp.d.ts +1 -0
- package/dist/src/browser/cdp.js +12 -3
- package/dist/src/browser/compound.d.ts +59 -0
- package/dist/src/browser/compound.js +112 -0
- package/dist/src/browser/compound.test.d.ts +1 -0
- package/dist/src/browser/compound.test.js +175 -0
- package/dist/src/browser/dom-snapshot.d.ts +7 -0
- package/dist/src/browser/dom-snapshot.js +76 -3
- package/dist/src/browser/dom-snapshot.test.js +65 -0
- package/dist/src/browser/extract.d.ts +69 -0
- package/dist/src/browser/extract.js +132 -0
- package/dist/src/browser/extract.test.d.ts +1 -0
- package/dist/src/browser/extract.test.js +129 -0
- package/dist/src/browser/find.d.ts +76 -0
- package/dist/src/browser/find.js +179 -0
- package/dist/src/browser/find.test.d.ts +1 -0
- package/dist/src/browser/find.test.js +120 -0
- package/dist/src/browser/html-tree.d.ts +75 -0
- package/dist/src/browser/html-tree.js +112 -0
- package/dist/src/browser/html-tree.test.d.ts +1 -0
- package/dist/src/browser/html-tree.test.js +181 -0
- package/dist/src/browser/network-cache.d.ts +48 -0
- package/dist/src/browser/network-cache.js +66 -0
- package/dist/src/browser/network-cache.test.d.ts +1 -0
- package/dist/src/browser/network-cache.test.js +58 -0
- package/dist/src/browser/network-key.d.ts +22 -0
- package/dist/src/browser/network-key.js +66 -0
- package/dist/src/browser/network-key.test.d.ts +1 -0
- package/dist/src/browser/network-key.test.js +49 -0
- package/dist/src/browser/shape-filter.d.ts +52 -0
- package/dist/src/browser/shape-filter.js +101 -0
- package/dist/src/browser/shape-filter.test.d.ts +1 -0
- package/dist/src/browser/shape-filter.test.js +101 -0
- package/dist/src/browser/shape.d.ts +23 -0
- package/dist/src/browser/shape.js +95 -0
- package/dist/src/browser/shape.test.d.ts +1 -0
- package/dist/src/browser/shape.test.js +82 -0
- package/dist/src/browser/target-errors.d.ts +14 -1
- package/dist/src/browser/target-errors.js +13 -0
- package/dist/src/browser/target-errors.test.js +39 -6
- package/dist/src/browser/target-resolver.d.ts +57 -10
- package/dist/src/browser/target-resolver.js +195 -75
- package/dist/src/browser/target-resolver.test.js +80 -5
- package/dist/src/cli.js +630 -125
- package/dist/src/cli.test.js +794 -0
- package/dist/src/execution.js +7 -2
- package/dist/src/execution.test.js +54 -0
- package/dist/src/main.js +16 -0
- package/dist/src/types.d.ts +18 -3
- package/package.json +1 -1
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `browser find --css <sel>` — structured CSS query.
|
|
3
|
+
*
|
|
4
|
+
* Returns every match of a selector as a JSON envelope agents can read
|
|
5
|
+
* without parsing free-text snapshot output. Each entry carries two
|
|
6
|
+
* identifiers — a numeric `ref` (matching the snapshot contract) and a
|
|
7
|
+
* stable 0-based `nth` — so the agent can act on a specific result via
|
|
8
|
+
* either path:
|
|
9
|
+
*
|
|
10
|
+
* browser click <ref> // when ref is numeric
|
|
11
|
+
* browser click "<sel>" --nth <n> // always works
|
|
12
|
+
*
|
|
13
|
+
* Refs are *allocated on the spot* for matched elements that were not
|
|
14
|
+
* tagged by a prior snapshot: `data-opencli-ref` is set on the element
|
|
15
|
+
* and a fingerprint is written into `window.__opencli_ref_identity`
|
|
16
|
+
* (same shape the snapshot uses). That makes `find` a first-class entry
|
|
17
|
+
* point to the ref system — agents can skip running `browser state`
|
|
18
|
+
* when they already know the selector.
|
|
19
|
+
*
|
|
20
|
+
* Attributes are whitelisted to keep output small and high-signal.
|
|
21
|
+
* Invisible elements are still returned so agents can reason about
|
|
22
|
+
* offscreen vs truly-missing targets.
|
|
23
|
+
*
|
|
24
|
+
* When a matched element is a compound form control (date-like input,
|
|
25
|
+
* select, file input), the entry gains a `compound` field with the
|
|
26
|
+
* rich view from `compound.ts`. This is what kills the three biggest
|
|
27
|
+
* agent-fail modes on form pages (wrong date format, guessed options,
|
|
28
|
+
* re-uploaded files) without forcing agents to probe further.
|
|
29
|
+
*/
|
|
30
|
+
import { COMPOUND_INFO_JS } from './compound.js';
|
|
31
|
+
/**
 * Whitelist of attributes surfaced per entry. Keep small; agents do not need
 * full DOM dumps.
 *
 * The array is frozen: it is serialized into every script produced by
 * `buildFindJs`, so an accidental `push`/`splice` by any importer would
 * silently change what all later `find` calls report. Freezing turns such a
 * mutation into an immediate TypeError instead.
 */
export const FIND_ATTR_WHITELIST = Object.freeze([
    'id',
    'class',
    'name',
    'type',
    'placeholder',
    'aria-label',
    'title',
    'href',
    'value',
    'role',
    'data-testid',
]);
|
|
45
|
+
/**
 * Build the browser-side JS that performs the CSS query and emits the
 * FindResult (or FindError) envelope. Evaluated inside `page.evaluate`.
 *
 * The returned source is a single IIFE expression. When it runs in a page it:
 *   1. calls `document.querySelectorAll(selector)` and returns an
 *      `{ error: { code, message, hint } }` envelope when the selector cannot
 *      be parsed (`invalid_selector`) or matches nothing (`selector_not_found`);
 *   2. gives every emitted element a numeric `data-opencli-ref` (reusing an
 *      existing one, otherwise allocating the next free number) and records
 *      its fingerprint in `window.__opencli_ref_identity`;
 *   3. returns `{ matches_n, entries }`, where `matches_n` is the total number
 *      of matches and `entries` holds at most `limit` of them.
 *
 * @param {string} selector CSS selector. Embedded via `JSON.stringify`, so it
 *   cannot terminate its own string literal in the generated script.
 * @param {{ limit?: number, textMax?: number }} [opts]
 *   `limit`: max entries emitted (default 50). `textMax`: max characters of
 *   `text` per entry (default 120). Anything that is not a non-negative number
 *   (or a numeric string) is replaced by the default.
 * @returns {string} JavaScript source suitable for `page.evaluate`.
 */
export function buildFindJs(selector, opts = {}) {
    // Budgets are spliced into the script as bare literals, so they must be
    // real non-negative numbers: NaN would make the entry loop emit nothing,
    // a negative textMax would chop text, and an arbitrary string would be
    // executed as code in the page.
    const asBudget = (value, fallback) => {
        const candidate = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
        const usable = typeof candidate === 'number' && !Number.isNaN(candidate) && candidate >= 0;
        return usable ? candidate : fallback;
    };
    const safeSel = JSON.stringify(selector);
    const limit = asBudget(opts.limit, 50);
    const textMax = asBudget(opts.textMax, 120);
    const whitelist = JSON.stringify(FIND_ATTR_WHITELIST);
    return `
    (() => {
      const sel = ${safeSel};
      const LIMIT = ${limit};
      const TEXT_MAX = ${textMax};
      const ATTR_WHITELIST = ${whitelist};

      ${COMPOUND_INFO_JS}

      let matches;
      try {
        matches = document.querySelectorAll(sel);
      } catch (e) {
        return {
          error: {
            code: 'invalid_selector',
            message: 'Invalid CSS selector: ' + sel + ' (' + ((e && e.message) || String(e)) + ')',
            hint: 'Check the selector syntax.',
          },
        };
      }

      if (matches.length === 0) {
        return {
          error: {
            code: 'selector_not_found',
            message: 'CSS selector ' + sel + ' matched 0 elements',
            hint: 'Use browser state to inspect the page, or try a less specific selector.',
          },
        };
      }

      function pickAttrs(el) {
        const out = {};
        for (const key of ATTR_WHITELIST) {
          const v = el.getAttribute(key);
          if (v != null && v !== '') out[key] = v;
        }
        return out;
      }

      function isVisible(el) {
        const rect = el.getBoundingClientRect();
        if (rect.width === 0 && rect.height === 0) return false;
        try {
          const style = getComputedStyle(el);
          if (style.display === 'none' || style.visibility === 'hidden') return false;
          if (parseFloat(style.opacity || '1') === 0) return false;
        } catch (_) {}
        return true;
      }

      // Ref allocation: reuse \`window.__opencli_ref_identity\` (the same map
      // snapshot populates) as the source of truth. For matched elements that
      // don't already carry a \`data-opencli-ref\`, assign the next free numeric
      // ref and write the fingerprint so the target resolver can verify it on
      // downstream click/type/get calls.
      const identity = (window.__opencli_ref_identity = window.__opencli_ref_identity || {});
      let maxRef = 0;
      for (const k in identity) {
        const n = parseInt(k, 10);
        if (!isNaN(n) && n > maxRef) maxRef = n;
      }
      // Also walk any \`data-opencli-ref\` already in the DOM in case the identity
      // map was cleared but annotations remain (e.g. soft navigation without a
      // fresh snapshot). Guarantees allocated refs don't collide.
      try {
        const tagged = document.querySelectorAll('[data-opencli-ref]');
        for (let t = 0; t < tagged.length; t++) {
          const v = tagged[t].getAttribute('data-opencli-ref');
          const n = v != null && /^\\d+$/.test(v) ? parseInt(v, 10) : NaN;
          if (!isNaN(n) && n > maxRef) maxRef = n;
        }
      } catch (_) {}

      function fingerprintOf(el) {
        return {
          tag: el.tagName.toLowerCase(),
          role: el.getAttribute('role') || '',
          text: (el.textContent || '').trim().slice(0, 30),
          ariaLabel: el.getAttribute('aria-label') || '',
          id: el.id || '',
          testId: el.getAttribute('data-testid') || el.getAttribute('data-test') || '',
        };
      }

      const take = Math.min(matches.length, LIMIT);
      const entries = [];
      for (let i = 0; i < take; i++) {
        const el = matches[i];
        const refAttr = el.getAttribute('data-opencli-ref');
        let refNum = refAttr != null && /^\\d+$/.test(refAttr) ? parseInt(refAttr, 10) : null;
        if (refNum === null) {
          refNum = ++maxRef;
          try { el.setAttribute('data-opencli-ref', '' + refNum); } catch (_) {}
          identity['' + refNum] = fingerprintOf(el);
        } else if (!identity['' + refNum]) {
          // Ref annotation survived but identity map was cleared — repopulate so the
          // target resolver's fingerprint check passes on downstream calls.
          identity['' + refNum] = fingerprintOf(el);
        }
        const text = (el.textContent || '').trim();
        const entry = {
          nth: i,
          ref: refNum,
          tag: el.tagName.toLowerCase(),
          role: el.getAttribute('role') || '',
          text: text.length > TEXT_MAX ? text.slice(0, TEXT_MAX) : text,
          attrs: pickAttrs(el),
          visible: isVisible(el),
        };
        const compound = compoundInfoOf(el);
        if (compound) entry.compound = compound;
        entries.push(entry);
      }

      return {
        matches_n: matches.length,
        entries,
      };
    })()
  `;
}
|
|
177
|
+
/**
 * Tell an error envelope apart from a successful find result.
 *
 * @param {unknown} result Value to inspect.
 * @returns {boolean} `true` only when `result` is a non-null object that has
 *   an `error` key (own or inherited); `false` for everything else.
 */
export function isFindError(result) {
    // `typeof null` is also 'object', so null needs its own guard.
    if (result === null || typeof result !== 'object') {
        return false;
    }
    return 'error' in result;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import { buildFindJs, FIND_ATTR_WHITELIST, isFindError } from './find.js';
|
|
3
|
+
/**
 * String-level checks on the script returned by `buildFindJs`: its shape and
 * how the options are embedded (no DOM available in the default vitest unit
 * env). Runtime behavior of the generated JS against a real DOM is covered by
 * the browser e2e suite.
 */
describe('buildFindJs', () => {
    /** Assert that every fragment occurs somewhere in the generated source. */
    const expectFragments = (source, fragments) => {
        fragments.forEach((fragment) => {
            expect(source).toContain(fragment);
        });
    };

    it('produces syntactically valid JS that can be parsed', () => {
        const compile = () => new Function(`return (${buildFindJs('.btn')});`);
        expect(compile).not.toThrow();
    });

    it('embeds the selector via JSON.stringify (injection-safe)', () => {
        // Selector with a bare double quote in the middle of the attribute value.
        const trickySelector = '[data-x="a"b"]';
        const source = buildFindJs(trickySelector);
        // Unescaped literal break-out must not appear
        expect(source).not.toContain(trickySelector);
        // The JSON-encoded form (with escaped quotes) should
        expect(source).toContain(JSON.stringify(trickySelector));
    });

    it('emits invalid_selector + selector_not_found branches', () => {
        expectFragments(buildFindJs('.btn'), [
            "code: 'invalid_selector'",
            "code: 'selector_not_found'",
        ]);
    });

    it('emits matches_n + entries + per-entry shape', () => {
        expectFragments(buildFindJs('.btn'), [
            'matches_n: matches.length',
            'entries.push(',
            // Per-entry keys reviewers signed off on: nth, ref, tag, role, text, attrs, visible
            'nth: i',
            'ref: refNum',
            'tag: el.tagName.toLowerCase()',
            "el.getAttribute('role')",
            'visible: isVisible(el)',
        ]);
    });

    it('allocates fresh refs for untagged matches (write attribute + identity map)', () => {
        expectFragments(buildFindJs('.btn'), [
            // On the just-annotated branch we must flip the attribute on the element
            // so downstream `browser click <ref>` works off the find output.
            "el.setAttribute('data-opencli-ref'",
            // The fingerprint must also land in the shared identity map so the
            // target resolver's stale-ref check has data to verify against.
            '__opencli_ref_identity',
            "identity['' + refNum] = fingerprintOf(el)",
            // Allocation walks both the identity map and any existing data-opencli-ref
            // annotations — guards against collisions after a soft nav.
            "document.querySelectorAll('[data-opencli-ref]')",
        ]);
    });

    it('fingerprint shape matches the snapshot / resolver contract', () => {
        // The six fields resolveTargetJs verifies in its stale_ref check.
        const fingerprintKeys = ['tag:', 'role:', 'text:', 'ariaLabel:', 'id:', 'testId:'];
        expectFragments(buildFindJs('.btn'), fingerprintKeys);
    });

    it('embeds defaults for limit and textMax', () => {
        expectFragments(buildFindJs('.btn'), ['LIMIT = 50', 'TEXT_MAX = 120']);
    });

    it('overrides limit and textMax when requested', () => {
        const source = buildFindJs('.btn', { limit: 3, textMax: 20 });
        expectFragments(source, ['LIMIT = 3', 'TEXT_MAX = 20']);
    });

    it('embeds the attribute whitelist verbatim (no style/onclick leaking)', () => {
        // Whitelist fields appear inside the generated JS
        const quotedKeys = FIND_ATTR_WHITELIST.map((key) => `"${key}"`);
        expectFragments(buildFindJs('.btn'), quotedKeys);
        // Sensitive / high-noise attrs must stay out of the whitelist
        ['style', 'onclick', 'onload'].forEach((banned) => {
            expect(FIND_ATTR_WHITELIST).not.toContain(banned);
        });
    });

    it('inlines compoundInfoOf and attaches compound field per entry', () => {
        expectFragments(buildFindJs('input, select'), [
            // Helper definition is inlined so each matched element can be classified.
            'function compoundInfoOf(el)',
            // The emitted entry opts in only when compound data is present — no noisy
            // compound: null on every non-form element.
            'const compound = compoundInfoOf(el);',
            'if (compound) entry.compound = compound;',
            // Spot-check all three compound families are covered in the inlined helper.
            "'YYYY-MM-DD'",
            "control: 'file'",
            "control: 'select'",
        ]);
    });

    it('keeps the whitelist small and explicit (guardrail against silent expansion)', () => {
        const approved = [
            'id',
            'class',
            'name',
            'type',
            'placeholder',
            'aria-label',
            'title',
            'href',
            'value',
            'role',
            'data-testid',
        ];
        expect(FIND_ATTR_WHITELIST).toEqual(approved);
    });
});
|
|
105
|
+
// Checks that `isFindError` accepts error envelopes and rejects everything else.
describe('isFindError', () => {
    it('narrows { error: ... } as FindError', () => {
        const envelope = { error: { code: 'invalid_selector', message: 'x' } };
        const recognised = isFindError(envelope);
        expect(recognised).toBe(true);
        // Second call mirrors how callers use the guard to branch before reading `.error`.
        if (isFindError(envelope)) {
            const { error } = envelope;
            expect(error.code).toBe('invalid_selector');
        }
    });

    it('rejects successful envelopes', () => {
        const nonErrors = [{ matches_n: 0, entries: [] }, null, undefined, 'string'];
        for (const candidate of nonErrors) {
            expect(isFindError(candidate)).toBe(false);
        }
    });
});
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Client-side HTML → structured tree serializer.
|
|
3
|
+
*
|
|
4
|
+
* Returned as a JS string that gets passed to `page.evaluate`. The expression
|
|
5
|
+
* walks the DOM subtree rooted at the first selector match (or documentElement
|
|
6
|
+
* when no selector is given) and emits a compact `{tag, attrs, text, children}`
|
|
7
|
+
* tree for agents to consume instead of re-parsing raw HTML.
|
|
8
|
+
*
|
|
9
|
+
* Text handling: `text` is the concatenated text of direct text children only,
|
|
10
|
+
* whitespace-collapsed. Nested element text is left inside `children[].text`.
|
|
11
|
+
* Ordering between text and elements is not preserved — agents that need it
|
|
12
|
+
* should fall back to raw HTML mode.
|
|
13
|
+
*
|
|
14
|
+
* Budget knobs let the caller bound the output on large pages — previously an
|
|
15
|
+
* unscoped `get html --as json` could return a giant tree. Callers set any
|
|
16
|
+
* combination of `depth` / `childrenMax` / `textMax`; each hit is reported in
|
|
17
|
+
* the `truncated` envelope so agents know to narrow their selector or raise
|
|
18
|
+
* the budget.
|
|
19
|
+
*
|
|
20
|
+
* Compound controls (date / time / datetime-local / month / week / select /
|
|
21
|
+
* file) gain a `compound` field so agents inspecting the JSON tree see the
|
|
22
|
+
* full contract — date format, full option list (up to cap) with selections
|
|
23
|
+
* preserved for options beyond the cap, file `accept` and `multiple`. Without
|
|
24
|
+
* this wiring agents repeatedly guess values on these controls from the raw
|
|
25
|
+
* attributes, which is the failure mode compound.ts was built to eliminate.
|
|
26
|
+
*/
|
|
27
|
+
import { type CompoundInfo } from './compound.js';
|
|
28
|
+
/**
 * Options for `buildHtmlTreeJs`. Every field is optional; `null` and an
 * omitted field mean the same thing (no scoping / no budget).
 */
export interface BuildHtmlTreeJsOptions {
    /** CSS selector to scope the tree; unscoped = documentElement */
    selector?: string | null;
    /** Max depth below the root (0 = root only, no children). Omit = unlimited. */
    depth?: number | null;
    /** Max element children per node before the rest get dropped. Omit = unlimited. */
    childrenMax?: number | null;
    /** Max chars of direct text per node before truncation. Omit = unlimited. */
    textMax?: number | null;
}
|
|
38
|
+
/**
 * Returns a JS expression string. When evaluated in a page context the
 * expression resolves to either
 *   `{selector, matched, tree, truncated}` on success, or
 *   `{selector, invalidSelector: true, reason}` when `querySelectorAll`
 *   throws a `SyntaxError` for an unparseable selector.
 *
 * Callers must branch on `invalidSelector` to convert it into the CLI's
 * `invalid_selector` structured error; otherwise the browser-level exception
 * would bubble out of `page.evaluate` and bypass the structured-error
 * contract that agents rely on.
 *
 * @param opts Optional scoping selector and output budgets.
 * @returns Source text of the expression to pass to `page.evaluate`.
 */
export declare function buildHtmlTreeJs(opts?: BuildHtmlTreeJsOptions): string;
|
|
51
|
+
/** One serialized element in the tree produced by the `buildHtmlTreeJs` script. */
export interface HtmlNode {
    /** Element tag name, lower-cased. */
    tag: string;
    /** The element's attributes, keyed by attribute name. */
    attrs: Record<string, string>;
    /** Text of the element's direct text children only, whitespace-collapsed. */
    text: string;
    /** Serialized element children; empty when there are none or a budget cut them. */
    children: HtmlNode[];
    /**
     * Rich view for date/select/file controls. Omitted for non-compound elements
     * so agents can rely on `compound != null` as a signal.
     */
    compound?: CompoundInfo;
}
|
|
62
|
+
/**
 * Report of which output budgets were hit while serializing. A field is
 * present only when its budget actually dropped or shortened something.
 */
export interface HtmlTreeTruncationInfo {
    /** At least one element child was dropped because depth budget was hit. */
    depth?: true;
    /** Count of element children dropped across the tree due to `childrenMax`. */
    children_dropped?: number;
    /** Count of nodes whose `text` was cut to `textMax`. */
    text_truncated?: number;
}
|
|
70
|
+
/** Success envelope the `buildHtmlTreeJs` expression resolves to. */
export interface HtmlTreeResult {
    /** The selector the tree was scoped to, or `null` when unscoped. */
    selector: string | null;
    /** Number of elements matched; only the first one is serialized. */
    matched: number;
    /** Serialized first match, or `null` when nothing matched. */
    tree: HtmlNode | null;
    /** Present only when at least one budget was hit. */
    truncated?: HtmlTreeTruncationInfo;
}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Client-side HTML → structured tree serializer.
|
|
3
|
+
*
|
|
4
|
+
* Returned as a JS string that gets passed to `page.evaluate`. The expression
|
|
5
|
+
* walks the DOM subtree rooted at the first selector match (or documentElement
|
|
6
|
+
* when no selector is given) and emits a compact `{tag, attrs, text, children}`
|
|
7
|
+
* tree for agents to consume instead of re-parsing raw HTML.
|
|
8
|
+
*
|
|
9
|
+
* Text handling: `text` is the concatenated text of direct text children only,
|
|
10
|
+
* whitespace-collapsed. Nested element text is left inside `children[].text`.
|
|
11
|
+
* Ordering between text and elements is not preserved — agents that need it
|
|
12
|
+
* should fall back to raw HTML mode.
|
|
13
|
+
*
|
|
14
|
+
* Budget knobs let the caller bound the output on large pages — previously an
|
|
15
|
+
* unscoped `get html --as json` could return a giant tree. Callers set any
|
|
16
|
+
* combination of `depth` / `childrenMax` / `textMax`; each hit is reported in
|
|
17
|
+
* the `truncated` envelope so agents know to narrow their selector or raise
|
|
18
|
+
* the budget.
|
|
19
|
+
*
|
|
20
|
+
* Compound controls (date / time / datetime-local / month / week / select /
|
|
21
|
+
* file) gain a `compound` field so agents inspecting the JSON tree see the
|
|
22
|
+
* full contract — date format, full option list (up to cap) with selections
|
|
23
|
+
* preserved for options beyond the cap, file `accept` and `multiple`. Without
|
|
24
|
+
* this wiring agents repeatedly guess values on these controls from the raw
|
|
25
|
+
* attributes, which is the failure mode compound.ts was built to eliminate.
|
|
26
|
+
*/
|
|
27
|
+
import { COMPOUND_INFO_JS } from './compound.js';
|
|
28
|
+
/**
 * Returns a JS expression string. When evaluated in a page context the
 * expression resolves to either
 *   `{selector, matched, tree, truncated}` on success, or
 *   `{selector, invalidSelector: true, reason}` when `querySelectorAll`
 *   throws a `SyntaxError` for an unparseable selector.
 *
 * Callers must branch on `invalidSelector` to convert it into the CLI's
 * `invalid_selector` structured error; otherwise the browser-level exception
 * would bubble out of `page.evaluate` and bypass the structured-error
 * contract that agents rely on.
 *
 * @param {{ selector?: string|null, depth?: number|null, childrenMax?: number|null, textMax?: number|null }} [opts]
 *   `selector` scopes the tree to its first match (a falsy selector means
 *   `document.documentElement`). `depth`, `childrenMax` and `textMax` are
 *   output budgets; a budget that is not a finite, non-negative number is
 *   treated as unlimited.
 * @returns {string} Source of an IIFE expression for `page.evaluate`.
 */
export function buildHtmlTreeJs(opts = {}) {
    // Budgets are embedded as bare literals; anything unusable becomes the
    // literal `null`, which the generated script reads as "no limit".
    const budgetLiteral = (value) => (Number.isFinite(value) && value >= 0 ? String(value) : 'null');
    const selectorLiteral = opts.selector ? JSON.stringify(opts.selector) : 'null';
    const depthLiteral = budgetLiteral(opts.depth);
    const childrenMaxLiteral = budgetLiteral(opts.childrenMax);
    const textMaxLiteral = budgetLiteral(opts.textMax);
    return `(() => {
        ${COMPOUND_INFO_JS}
        const selector = ${selectorLiteral};
        const maxDepth = ${depthLiteral};
        const maxChildren = ${childrenMaxLiteral};
        const maxText = ${textMaxLiteral};
        let matches;
        if (selector) {
            try { matches = document.querySelectorAll(selector); }
            catch (e) {
                return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) };
            }
        } else {
            matches = [document.documentElement];
        }
        const matched = matches.length;
        const root = matches[0] || null;
        const trunc = { depth: false, children_dropped: 0, text_truncated: 0 };
        function serialize(el, depth) {
            if (!el || el.nodeType !== 1) return null;
            // Object.fromEntries defines every key as an own data property, so an
            // attribute literally named "__proto__" is kept. A plain
            // attrs[name] = value assignment would route it to the
            // Object.prototype.__proto__ setter, which ignores string values.
            const attrs = Object.fromEntries(Array.from(el.attributes, (a) => [a.name, a.value]));
            let text = '';
            for (const n of el.childNodes) {
                if (n.nodeType === 3) text += n.nodeValue;
            }
            text = text.replace(/\\s+/g, ' ').trim();
            if (maxText !== null && text.length > maxText) {
                text = text.slice(0, maxText);
                trunc.text_truncated++;
            }
            const children = [];
            if (maxDepth === null || depth < maxDepth) {
                const childEls = [];
                for (const n of el.childNodes) if (n.nodeType === 1) childEls.push(n);
                const keep = maxChildren === null ? childEls.length : Math.min(childEls.length, maxChildren);
                for (let i = 0; i < keep; i++) {
                    const child = serialize(childEls[i], depth + 1);
                    if (child) children.push(child);
                }
                if (maxChildren !== null && childEls.length > maxChildren) {
                    trunc.children_dropped += childEls.length - maxChildren;
                }
            } else {
                // Budget hit: we're at max depth. Count any element children we would have visited.
                for (const n of el.childNodes) if (n.nodeType === 1) { trunc.depth = true; break; }
            }
            const node = { tag: el.tagName.toLowerCase(), attrs, text, children };
            const compound = compoundInfoOf(el);
            if (compound) node.compound = compound;
            return node;
        }
        const tree = root ? serialize(root, 0) : null;
        const truncatedOut = {};
        if (trunc.depth) truncatedOut.depth = true;
        if (trunc.children_dropped > 0) truncatedOut.children_dropped = trunc.children_dropped;
        if (trunc.text_truncated > 0) truncatedOut.text_truncated = trunc.text_truncated;
        const envelope = { selector: selector, matched: matched, tree: tree };
        if (Object.keys(truncatedOut).length > 0) envelope.truncated = truncatedOut;
        return envelope;
    })()`;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import { buildHtmlTreeJs } from './html-tree.js';
|
|
3
|
+
/**
 * Evaluate the script from `buildHtmlTreeJs` without a browser.
 *
 * The serializer runs in a page context via `page.evaluate`. In unit tests we
 * substitute `document` with a minimal stub that mirrors the DOM surface used
 * by the expression, then Function-eval the returned JS.
 *
 * @param root Value exposed as `document.documentElement`.
 * @param selectorMatches Value every `document.querySelectorAll` call returns.
 * @param selector Selector forwarded to `buildHtmlTreeJs`.
 * @param budgets Extra options spread after `selector`.
 * @returns Whatever the generated expression evaluates to.
 */
function runTreeJs(root, selectorMatches, selector, budgets = {}) {
    const options = { selector, ...budgets };
    const source = buildHtmlTreeJs(options);
    const evaluate = new Function('document', `return ${source};`);
    return evaluate({
        querySelectorAll: () => selectorMatches,
        documentElement: root,
    });
}
|
|
17
|
+
/**
 * Evaluate the `buildHtmlTreeJs` script against a stub `document` whose
 * `querySelectorAll` always throws an Error named `SyntaxError` carrying
 * `errorMessage`; `documentElement` is `null`.
 */
function runTreeJsInvalid(selector, errorMessage) {
    const throwSyntaxError = () => {
        throw Object.assign(new Error(errorMessage), { name: 'SyntaxError' });
    };
    const source = buildHtmlTreeJs({ selector });
    const evaluate = new Function('document', `return ${source};`);
    return evaluate({ querySelectorAll: throwSyntaxError, documentElement: null });
}
|
|
26
|
+
/**
 * Build a minimal fake element node (`nodeType` 1) for the serializer tests.
 *
 * @param tag Tag name; exposed upper-cased as `tagName`.
 * @param attrs Plain object of attribute name → value.
 * @param children Array used as-is for `childNodes`.
 * @param extras Optional `value` / `multiple` / `files` / `options` properties
 *   copied onto the node (left `undefined` when absent).
 */
function el(tag, attrs, children, extras = {}) {
    const { value, multiple, files, options } = extras;
    const attributes = Object.keys(attrs).map((name) => ({ name, value: attrs[name] }));
    const getAttribute = (name) => {
        if (name in attrs) {
            return attrs[name];
        }
        return null;
    };
    return {
        nodeType: 1,
        tagName: tag.toUpperCase(),
        attributes,
        childNodes: children,
        getAttribute,
        value,
        multiple,
        files,
        options,
    };
}
|
|
39
|
+
/** Build a minimal fake text node (`nodeType` 3) whose `nodeValue` is `value`. */
function txt(value) {
    const textNode = { nodeType: 3, nodeValue: value };
    return textNode;
}
|
|
40
|
+
describe('buildHtmlTreeJs', () => {
|
|
41
|
+
it('serializes a simple element into {tag, attrs, text, children}', () => {
|
|
42
|
+
const root = el('div', { class: 'hero', id: 'x' }, [txt('Hello')]);
|
|
43
|
+
const result = runTreeJs(root, [root], null);
|
|
44
|
+
expect(result.selector).toBeNull();
|
|
45
|
+
expect(result.matched).toBe(1);
|
|
46
|
+
expect(result.tree).toEqual({
|
|
47
|
+
tag: 'div',
|
|
48
|
+
attrs: { class: 'hero', id: 'x' },
|
|
49
|
+
text: 'Hello',
|
|
50
|
+
children: [],
|
|
51
|
+
});
|
|
52
|
+
});
|
|
53
|
+
it('collapses whitespace in direct text content only', () => {
|
|
54
|
+
const root = el('p', {}, [
|
|
55
|
+
txt(' line \n one '),
|
|
56
|
+
el('span', {}, [txt('inner text')]),
|
|
57
|
+
txt('\tline two\t'),
|
|
58
|
+
]);
|
|
59
|
+
const result = runTreeJs(root, [root], null);
|
|
60
|
+
expect(result.tree?.text).toBe('line one line two');
|
|
61
|
+
expect(result.tree?.children[0].text).toBe('inner text');
|
|
62
|
+
});
|
|
63
|
+
it('recurses into element children and preserves their attrs', () => {
|
|
64
|
+
const root = el('ul', { role: 'list' }, [
|
|
65
|
+
el('li', { 'data-id': '1' }, [txt('first')]),
|
|
66
|
+
el('li', { 'data-id': '2' }, [txt('second')]),
|
|
67
|
+
]);
|
|
68
|
+
const result = runTreeJs(root, [root], null);
|
|
69
|
+
expect(result.tree?.children).toHaveLength(2);
|
|
70
|
+
expect(result.tree?.children[0]).toEqual({
|
|
71
|
+
tag: 'li',
|
|
72
|
+
attrs: { 'data-id': '1' },
|
|
73
|
+
text: 'first',
|
|
74
|
+
children: [],
|
|
75
|
+
});
|
|
76
|
+
});
|
|
77
|
+
it('returns matched=N and serializes only the first match', () => {
|
|
78
|
+
const first = el('article', { id: 'a' }, [txt('first')]);
|
|
79
|
+
const second = el('article', { id: 'b' }, [txt('second')]);
|
|
80
|
+
const result = runTreeJs(null, [first, second], 'article');
|
|
81
|
+
expect(result.matched).toBe(2);
|
|
82
|
+
expect(result.tree?.attrs.id).toBe('a');
|
|
83
|
+
});
|
|
84
|
+
it('returns tree=null and matched=0 when selector matches nothing', () => {
|
|
85
|
+
const result = runTreeJs(null, [], '.nothing');
|
|
86
|
+
expect(result.matched).toBe(0);
|
|
87
|
+
expect(result.tree).toBeNull();
|
|
88
|
+
});
|
|
89
|
+
it('catches SyntaxError from querySelectorAll and returns {invalidSelector:true, reason}', () => {
    // The selector is passed together with the error message to be reported.
    const badSelector = '##$@@';
    const outcome = runTreeJsInvalid(badSelector, "'##$@@' is not a valid selector");

    // The result is expected to flag the failure, echo the selector, and carry the reason.
    expect(outcome.invalidSelector).toBe(true);
    expect(outcome.selector).toBe(badSelector);
    expect(outcome.reason).toContain('not a valid selector');
});
|
|
95
|
+
it('omits `truncated` when no budget is hit', () => {
    // Every limit is larger than this two-level tree requires.
    const roomyBudget = { depth: 5, childrenMax: 10, textMax: 100 };
    const span = el('span', {}, [txt('ok')]);
    const container = el('div', {}, [span]);

    const { truncated } = runTreeJs(container, [container], null, roomyBudget);

    expect(truncated).toBeUndefined();
});
|
|
100
|
+
});
|
|
101
|
+
describe('buildHtmlTreeJs budget knobs', () => {
    it('caps tree at `depth` and reports truncated.depth', () => {
        // A four-level chain a > b > c > d, assembled innermost-first.
        const levelD = el('d', {}, [txt('deep')]);
        const levelC = el('c', {}, [levelD]);
        const levelB = el('b', {}, [levelC]);
        const chain = el('a', {}, [levelB]);

        // depth=1 keeps the root plus one level of children; grandchildren are dropped.
        const { tree, truncated } = runTreeJs(chain, [chain], null, { depth: 1 });

        expect(tree?.tag).toBe('a');
        expect(tree?.children).toHaveLength(1);
        expect(tree?.children[0].tag).toBe('b');
        // "b" has an element child, but the depth budget is exhausted before
        // recursing into it, so its children array is empty and the cut is flagged.
        expect(tree?.children[0].children).toEqual([]);
        expect(truncated?.depth).toBe(true);
    });

    it('depth=0 keeps only the root', () => {
        const listItems = ['a', 'b'].map((label) => el('li', {}, [txt(label)]));
        const list = el('ul', {}, listItems);

        const { tree, truncated } = runTreeJs(list, [list], null, { depth: 0 });

        expect(tree?.children).toEqual([]);
        expect(truncated?.depth).toBe(true);
    });

    it('caps children per node at `childrenMax` and reports children_dropped count', () => {
        // Five list items against a per-node limit of two.
        const listItems = ['1', '2', '3', '4', '5'].map((label) => el('li', {}, [txt(label)]));
        const list = el('ul', {}, listItems);

        const { tree, truncated } = runTreeJs(list, [list], null, { childrenMax: 2 });

        expect(tree?.children).toHaveLength(2);
        expect(truncated?.children_dropped).toBe(3);
    });

    it('caps direct text per node at `textMax` and reports text_truncated count', () => {
        // Both the paragraph and its span hold 50 characters of direct text.
        const ownText = txt('a'.repeat(50));
        const span = el('span', {}, [txt('b'.repeat(50))]);
        const paragraph = el('p', {}, [ownText, span]);

        const { tree, truncated } = runTreeJs(paragraph, [paragraph], null, { textMax: 10 });

        expect(tree?.text).toHaveLength(10);
        expect(tree?.children[0].text).toHaveLength(10);
        expect(truncated?.text_truncated).toBe(2);
    });

    // Regression guard for "Blocker B": the compound contract has to travel
    // with the json tree, so that `browser get html --as json` exposes the
    // whole contract to agents without a second round-trip.
    it('attaches compound info to date/file/select nodes and omits it elsewhere', () => {
        const dateInput = el('input', { type: 'date', min: '2026-01-01' }, [], { value: '2026-04-21' });
        const fileInput = el('input', { type: 'file', accept: 'image/*' }, [], {
            multiple: true,
            files: [{ name: 'a.png' }],
        });
        const countrySelect = el('select', { name: 'country' }, [], {
            options: [
                { value: 'us', label: 'United States', selected: true },
                { value: 'ca', label: 'Canada' },
            ],
        });
        const textInput = el('input', { type: 'text' }, [], { value: 'hi' });
        const form = el('form', {}, [dateInput, fileInput, countrySelect, textInput]);

        const { tree } = runTreeJs(form, [form], null);

        expect(tree?.children[0].compound).toMatchObject({
            control: 'date',
            format: 'YYYY-MM-DD',
            current: '2026-04-21',
            min: '2026-01-01',
        });
        expect(tree?.children[1].compound).toMatchObject({
            control: 'file',
            multiple: true,
            current: ['a.png'],
            accept: 'image/*',
        });
        expect(tree?.children[2].compound).toMatchObject({
            control: 'select',
            multiple: false,
            current: 'United States',
        });
        // A plain text input gets no compound entry.
        expect(tree?.children[3].compound).toBeUndefined();
    });

    it('combines budgets and reports every hit', () => {
        // The first item has 20 characters of text and a nested <em>; the other two are empty.
        const busyItem = el('li', {}, [txt('x'.repeat(20)), el('em', {}, [txt('y')])]);
        const emptyItemA = el('li', {}, []);
        const emptyItemB = el('li', {}, []);
        const list = el('ul', {}, [busyItem, emptyItemA, emptyItemB]);
        const tightBudget = { depth: 1, childrenMax: 2, textMax: 5 };

        const { tree, truncated } = runTreeJs(list, [list], null, tightBudget);

        expect(tree?.children).toHaveLength(2);
        expect(truncated?.children_dropped).toBe(1);
        expect(truncated?.text_truncated).toBe(1);
        expect(truncated?.depth).toBe(true);
    });
});
|