@jackwener/opencli 1.7.5 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/README.md +5 -2
  2. package/README.zh-CN.md +5 -2
  3. package/cli-manifest.json +77 -1
  4. package/clis/bilibili/video.js +61 -0
  5. package/clis/bilibili/video.test.js +81 -0
  6. package/clis/deepseek/ask.js +21 -1
  7. package/clis/deepseek/ask.test.js +73 -0
  8. package/clis/deepseek/utils.js +84 -1
  9. package/clis/deepseek/utils.test.js +37 -0
  10. package/clis/jianyu/search.js +139 -3
  11. package/clis/jianyu/search.test.js +25 -0
  12. package/clis/jianyu/shared/procurement-detail.js +15 -0
  13. package/clis/jianyu/shared/procurement-detail.test.js +12 -0
  14. package/clis/twitter/shared.js +7 -2
  15. package/clis/twitter/tweets.js +218 -0
  16. package/clis/twitter/tweets.test.js +125 -0
  17. package/clis/youtube/channel.js +35 -0
  18. package/dist/src/browser/base-page.d.ts +13 -3
  19. package/dist/src/browser/base-page.js +35 -25
  20. package/dist/src/browser/cdp.d.ts +1 -0
  21. package/dist/src/browser/cdp.js +12 -3
  22. package/dist/src/browser/compound.d.ts +59 -0
  23. package/dist/src/browser/compound.js +112 -0
  24. package/dist/src/browser/compound.test.d.ts +1 -0
  25. package/dist/src/browser/compound.test.js +175 -0
  26. package/dist/src/browser/dom-snapshot.d.ts +7 -0
  27. package/dist/src/browser/dom-snapshot.js +76 -3
  28. package/dist/src/browser/dom-snapshot.test.js +65 -0
  29. package/dist/src/browser/extract.d.ts +69 -0
  30. package/dist/src/browser/extract.js +132 -0
  31. package/dist/src/browser/extract.test.d.ts +1 -0
  32. package/dist/src/browser/extract.test.js +129 -0
  33. package/dist/src/browser/find.d.ts +76 -0
  34. package/dist/src/browser/find.js +179 -0
  35. package/dist/src/browser/find.test.d.ts +1 -0
  36. package/dist/src/browser/find.test.js +120 -0
  37. package/dist/src/browser/html-tree.d.ts +75 -0
  38. package/dist/src/browser/html-tree.js +112 -0
  39. package/dist/src/browser/html-tree.test.d.ts +1 -0
  40. package/dist/src/browser/html-tree.test.js +181 -0
  41. package/dist/src/browser/network-cache.d.ts +48 -0
  42. package/dist/src/browser/network-cache.js +66 -0
  43. package/dist/src/browser/network-cache.test.d.ts +1 -0
  44. package/dist/src/browser/network-cache.test.js +58 -0
  45. package/dist/src/browser/network-key.d.ts +22 -0
  46. package/dist/src/browser/network-key.js +66 -0
  47. package/dist/src/browser/network-key.test.d.ts +1 -0
  48. package/dist/src/browser/network-key.test.js +49 -0
  49. package/dist/src/browser/shape-filter.d.ts +52 -0
  50. package/dist/src/browser/shape-filter.js +101 -0
  51. package/dist/src/browser/shape-filter.test.d.ts +1 -0
  52. package/dist/src/browser/shape-filter.test.js +101 -0
  53. package/dist/src/browser/shape.d.ts +23 -0
  54. package/dist/src/browser/shape.js +95 -0
  55. package/dist/src/browser/shape.test.d.ts +1 -0
  56. package/dist/src/browser/shape.test.js +82 -0
  57. package/dist/src/browser/target-errors.d.ts +14 -1
  58. package/dist/src/browser/target-errors.js +13 -0
  59. package/dist/src/browser/target-errors.test.js +39 -6
  60. package/dist/src/browser/target-resolver.d.ts +57 -10
  61. package/dist/src/browser/target-resolver.js +195 -75
  62. package/dist/src/browser/target-resolver.test.js +80 -5
  63. package/dist/src/cli.js +630 -125
  64. package/dist/src/cli.test.js +794 -0
  65. package/dist/src/execution.js +7 -2
  66. package/dist/src/execution.test.js +54 -0
  67. package/dist/src/main.js +16 -0
  68. package/dist/src/types.d.ts +18 -3
  69. package/package.json +1 -1
@@ -0,0 +1,179 @@
1
+ /**
2
+ * `browser find --css <sel>` — structured CSS query.
3
+ *
4
+ * Returns every match of a selector as a JSON envelope agents can read
5
+ * without parsing free-text snapshot output. Each entry carries two
6
+ * identifiers — a numeric `ref` (matching the snapshot contract) and a
7
+ * stable 0-based `nth` — so the agent can act on a specific result via
8
+ * either path:
9
+ *
10
+ * browser click <ref> // when ref is numeric
11
+ * browser click "<sel>" --nth <n> // always works
12
+ *
13
+ * Refs are *allocated on the spot* for matched elements that were not
14
+ * tagged by a prior snapshot: `data-opencli-ref` is set on the element
15
+ * and a fingerprint is written into `window.__opencli_ref_identity`
16
+ * (same shape the snapshot uses). That makes `find` a first-class entry
17
+ * point to the ref system — agents can skip running `browser state`
18
+ * when they already know the selector.
19
+ *
20
+ * Attributes are whitelisted to keep output small and high-signal.
21
+ * Invisible elements are still returned so agents can reason about
22
+ * offscreen vs truly-missing targets.
23
+ *
24
+ * When a matched element is a compound form control (date-like input,
25
+ * select, file input), the entry gains a `compound` field with the
26
+ * rich view from `compound.ts`. This is what kills the three biggest
27
+ * agent-fail modes on form pages (wrong date format, guessed options,
28
+ * re-uploaded files) without forcing agents to probe further.
29
+ */
30
+ import { COMPOUND_INFO_JS } from './compound.js';
31
+ /** Whitelist of attributes surfaced per entry. Keep small; agents do not need full DOM dumps. */
32
+ export const FIND_ATTR_WHITELIST = [
33
+ 'id',
34
+ 'class',
35
+ 'name',
36
+ 'type',
37
+ 'placeholder',
38
+ 'aria-label',
39
+ 'title',
40
+ 'href',
41
+ 'value',
42
+ 'role',
43
+ 'data-testid',
44
+ ];
45
+ /**
46
+ * Build the browser-side JS that performs the CSS query and emits the
47
+ * FindResult (or FindError) envelope. Evaluated inside `page.evaluate`.
48
+ */
49
+ export function buildFindJs(selector, opts = {}) {
50
+ const safeSel = JSON.stringify(selector);
51
+ const limit = opts.limit ?? 50;
52
+ const textMax = opts.textMax ?? 120;
53
+ const whitelist = JSON.stringify(FIND_ATTR_WHITELIST);
54
+ return `
55
+ (() => {
56
+ const sel = ${safeSel};
57
+ const LIMIT = ${limit};
58
+ const TEXT_MAX = ${textMax};
59
+ const ATTR_WHITELIST = ${whitelist};
60
+
61
+ ${COMPOUND_INFO_JS}
62
+
63
+ let matches;
64
+ try {
65
+ matches = document.querySelectorAll(sel);
66
+ } catch (e) {
67
+ return {
68
+ error: {
69
+ code: 'invalid_selector',
70
+ message: 'Invalid CSS selector: ' + sel + ' (' + ((e && e.message) || String(e)) + ')',
71
+ hint: 'Check the selector syntax.',
72
+ },
73
+ };
74
+ }
75
+
76
+ if (matches.length === 0) {
77
+ return {
78
+ error: {
79
+ code: 'selector_not_found',
80
+ message: 'CSS selector ' + sel + ' matched 0 elements',
81
+ hint: 'Use browser state to inspect the page, or try a less specific selector.',
82
+ },
83
+ };
84
+ }
85
+
86
+ function pickAttrs(el) {
87
+ const out = {};
88
+ for (const key of ATTR_WHITELIST) {
89
+ const v = el.getAttribute(key);
90
+ if (v != null && v !== '') out[key] = v;
91
+ }
92
+ return out;
93
+ }
94
+
95
+ function isVisible(el) {
96
+ const rect = el.getBoundingClientRect();
97
+ if (rect.width === 0 && rect.height === 0) return false;
98
+ try {
99
+ const style = getComputedStyle(el);
100
+ if (style.display === 'none' || style.visibility === 'hidden') return false;
101
+ if (parseFloat(style.opacity || '1') === 0) return false;
102
+ } catch (_) {}
103
+ return true;
104
+ }
105
+
106
+ // Ref allocation: reuse \`window.__opencli_ref_identity\` (the same map
107
+ // snapshot populates) as the source of truth. For matched elements that
108
+ // don't already carry a \`data-opencli-ref\`, assign the next free numeric
109
+ // ref and write the fingerprint so the target resolver can verify it on
110
+ // downstream click/type/get calls.
111
+ const identity = (window.__opencli_ref_identity = window.__opencli_ref_identity || {});
112
+ let maxRef = 0;
113
+ for (const k in identity) {
114
+ const n = parseInt(k, 10);
115
+ if (!isNaN(n) && n > maxRef) maxRef = n;
116
+ }
117
+ // Also walk any \`data-opencli-ref\` already in the DOM in case the identity
118
+ // map was cleared but annotations remain (e.g. soft navigation without a
119
+ // fresh snapshot). Guarantees allocated refs don't collide.
120
+ try {
121
+ const tagged = document.querySelectorAll('[data-opencli-ref]');
122
+ for (let t = 0; t < tagged.length; t++) {
123
+ const v = tagged[t].getAttribute('data-opencli-ref');
124
+ const n = v != null && /^\\d+$/.test(v) ? parseInt(v, 10) : NaN;
125
+ if (!isNaN(n) && n > maxRef) maxRef = n;
126
+ }
127
+ } catch (_) {}
128
+
129
+ function fingerprintOf(el) {
130
+ return {
131
+ tag: el.tagName.toLowerCase(),
132
+ role: el.getAttribute('role') || '',
133
+ text: (el.textContent || '').trim().slice(0, 30),
134
+ ariaLabel: el.getAttribute('aria-label') || '',
135
+ id: el.id || '',
136
+ testId: el.getAttribute('data-testid') || el.getAttribute('data-test') || '',
137
+ };
138
+ }
139
+
140
+ const take = Math.min(matches.length, LIMIT);
141
+ const entries = [];
142
+ for (let i = 0; i < take; i++) {
143
+ const el = matches[i];
144
+ const refAttr = el.getAttribute('data-opencli-ref');
145
+ let refNum = refAttr != null && /^\\d+$/.test(refAttr) ? parseInt(refAttr, 10) : null;
146
+ if (refNum === null) {
147
+ refNum = ++maxRef;
148
+ try { el.setAttribute('data-opencli-ref', '' + refNum); } catch (_) {}
149
+ identity['' + refNum] = fingerprintOf(el);
150
+ } else if (!identity['' + refNum]) {
151
+ // Ref annotation survived but identity map was cleared — repopulate so the
152
+ // target resolver's fingerprint check passes on downstream calls.
153
+ identity['' + refNum] = fingerprintOf(el);
154
+ }
155
+ const text = (el.textContent || '').trim();
156
+ const entry = {
157
+ nth: i,
158
+ ref: refNum,
159
+ tag: el.tagName.toLowerCase(),
160
+ role: el.getAttribute('role') || '',
161
+ text: text.length > TEXT_MAX ? text.slice(0, TEXT_MAX) : text,
162
+ attrs: pickAttrs(el),
163
+ visible: isVisible(el),
164
+ };
165
+ const compound = compoundInfoOf(el);
166
+ if (compound) entry.compound = compound;
167
+ entries.push(entry);
168
+ }
169
+
170
+ return {
171
+ matches_n: matches.length,
172
+ entries,
173
+ };
174
+ })()
175
+ `;
176
+ }
177
/**
 * Tell a FindError envelope apart from a successful FindResult.
 *
 * @param {unknown} result - Value returned by evaluating the `buildFindJs` script.
 * @returns {boolean} `true` only for a non-null object that has an `error` key.
 */
export function isFindError(result) {
    // Primitives, functions, null and undefined can never be an error envelope.
    if (result === null || typeof result !== 'object') {
        return false;
    }
    return 'error' in result;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,120 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { buildFindJs, FIND_ATTR_WHITELIST, isFindError } from './find.js';
3
+ /**
4
+ * These tests validate the shape and options of the generated JS string
5
+ * (no DOM available in the default vitest unit env). Runtime behavior of
6
+ * the generated JS against a real DOM is covered by the browser e2e suite.
7
+ */
8
+ describe('buildFindJs', () => {
9
+ it('produces syntactically valid JS that can be parsed', () => {
10
+ expect(() => new Function(`return (${buildFindJs('.btn')});`)).not.toThrow();
11
+ });
12
+ it('embeds the selector via JSON.stringify (injection-safe)', () => {
13
+ const js = buildFindJs('[data-x="a\"b"]');
14
+ // Unescaped literal break-out must not appear
15
+ expect(js).not.toContain('[data-x="a"b"]');
16
+ // The JSON-encoded form (with escaped quotes) should
17
+ expect(js).toContain(JSON.stringify('[data-x="a\"b"]'));
18
+ });
19
+ it('emits invalid_selector + selector_not_found branches', () => {
20
+ const js = buildFindJs('.btn');
21
+ expect(js).toContain("code: 'invalid_selector'");
22
+ expect(js).toContain("code: 'selector_not_found'");
23
+ });
24
+ it('emits matches_n + entries + per-entry shape', () => {
25
+ const js = buildFindJs('.btn');
26
+ expect(js).toContain('matches_n: matches.length');
27
+ expect(js).toContain('entries.push(');
28
+ // Per-entry keys reviewers signed off on: nth, ref, tag, role, text, attrs, visible
29
+ expect(js).toContain('nth: i');
30
+ expect(js).toContain('ref: refNum');
31
+ expect(js).toContain('tag: el.tagName.toLowerCase()');
32
+ expect(js).toContain("el.getAttribute('role')");
33
+ expect(js).toContain('visible: isVisible(el)');
34
+ });
35
+ it('allocates fresh refs for untagged matches (write attribute + identity map)', () => {
36
+ const js = buildFindJs('.btn');
37
+ // On the just-annotated branch we must flip the attribute on the element
38
+ // so downstream `browser click <ref>` works off the find output.
39
+ expect(js).toContain("el.setAttribute('data-opencli-ref'");
40
+ // The fingerprint must also land in the shared identity map so the
41
+ // target resolver's stale-ref check has data to verify against.
42
+ expect(js).toContain('__opencli_ref_identity');
43
+ expect(js).toContain("identity['' + refNum] = fingerprintOf(el)");
44
+ // Allocation walks both the identity map and any existing data-opencli-ref
45
+ // annotations — guards against collisions after a soft nav.
46
+ expect(js).toContain("document.querySelectorAll('[data-opencli-ref]')");
47
+ });
48
+ it('fingerprint shape matches the snapshot / resolver contract', () => {
49
+ const js = buildFindJs('.btn');
50
+ // The six fields resolveTargetJs verifies in its stale_ref check.
51
+ for (const field of ['tag:', 'role:', 'text:', 'ariaLabel:', 'id:', 'testId:']) {
52
+ expect(js).toContain(field);
53
+ }
54
+ });
55
+ it('embeds defaults for limit and textMax', () => {
56
+ const js = buildFindJs('.btn');
57
+ expect(js).toContain('LIMIT = 50');
58
+ expect(js).toContain('TEXT_MAX = 120');
59
+ });
60
+ it('overrides limit and textMax when requested', () => {
61
+ const js = buildFindJs('.btn', { limit: 3, textMax: 20 });
62
+ expect(js).toContain('LIMIT = 3');
63
+ expect(js).toContain('TEXT_MAX = 20');
64
+ });
65
+ it('embeds the attribute whitelist verbatim (no style/onclick leaking)', () => {
66
+ const js = buildFindJs('.btn');
67
+ // Whitelist fields appear inside the generated JS
68
+ for (const key of FIND_ATTR_WHITELIST) {
69
+ expect(js).toContain(`"${key}"`);
70
+ }
71
+ // Sensitive / high-noise attrs must stay out of the whitelist
72
+ expect(FIND_ATTR_WHITELIST).not.toContain('style');
73
+ expect(FIND_ATTR_WHITELIST).not.toContain('onclick');
74
+ expect(FIND_ATTR_WHITELIST).not.toContain('onload');
75
+ });
76
+ it('inlines compoundInfoOf and attaches compound field per entry', () => {
77
+ const js = buildFindJs('input, select');
78
+ // Helper definition is inlined so each matched element can be classified.
79
+ expect(js).toContain('function compoundInfoOf(el)');
80
+ // The emitted entry opts in only when compound data is present — no noisy
81
+ // compound: null on every non-form element.
82
+ expect(js).toContain('const compound = compoundInfoOf(el);');
83
+ expect(js).toContain('if (compound) entry.compound = compound;');
84
+ // Spot-check all three compound families are covered in the inlined helper.
85
+ expect(js).toContain("'YYYY-MM-DD'");
86
+ expect(js).toContain("control: 'file'");
87
+ expect(js).toContain("control: 'select'");
88
+ });
89
+ it('keeps the whitelist small and explicit (guardrail against silent expansion)', () => {
90
+ expect(FIND_ATTR_WHITELIST).toEqual([
91
+ 'id',
92
+ 'class',
93
+ 'name',
94
+ 'type',
95
+ 'placeholder',
96
+ 'aria-label',
97
+ 'title',
98
+ 'href',
99
+ 'value',
100
+ 'role',
101
+ 'data-testid',
102
+ ]);
103
+ });
104
+ });
105
+ describe('isFindError', () => {
106
+ it('narrows { error: ... } as FindError', () => {
107
+ const payload = { error: { code: 'invalid_selector', message: 'x' } };
108
+ expect(isFindError(payload)).toBe(true);
109
+ if (isFindError(payload)) {
110
+ const err = payload;
111
+ expect(err.error.code).toBe('invalid_selector');
112
+ }
113
+ });
114
+ it('rejects successful envelopes', () => {
115
+ expect(isFindError({ matches_n: 0, entries: [] })).toBe(false);
116
+ expect(isFindError(null)).toBe(false);
117
+ expect(isFindError(undefined)).toBe(false);
118
+ expect(isFindError('string')).toBe(false);
119
+ });
120
+ });
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Client-side HTML → structured tree serializer.
3
+ *
4
+ * Returned as a JS string that gets passed to `page.evaluate`. The expression
5
+ * walks the DOM subtree rooted at the first selector match (or documentElement
6
+ * when no selector is given) and emits a compact `{tag, attrs, text, children}`
7
+ * tree for agents to consume instead of re-parsing raw HTML.
8
+ *
9
+ * Text handling: `text` is the concatenated text of direct text children only,
10
+ * whitespace-collapsed. Nested element text is left inside `children[].text`.
11
+ * Ordering between text and elements is not preserved — agents that need it
12
+ * should fall back to raw HTML mode.
13
+ *
14
+ * Budget knobs let the caller bound the output on large pages — previously an
15
+ * unscoped `get html --as json` could return a giant tree. Callers set any
16
+ * combination of `depth` / `childrenMax` / `textMax`; each hit is reported in
17
+ * the `truncated` envelope so agents know to narrow their selector or raise
18
+ * the budget.
19
+ *
20
+ * Compound controls (date / time / datetime-local / month / week / select /
21
+ * file) gain a `compound` field so agents inspecting the JSON tree see the
22
+ * full contract — date format, full option list (up to cap) with selections
23
+ * preserved for options beyond the cap, file `accept` and `multiple`. Without
24
+ * this wiring agents repeatedly guess values on these controls from the raw
25
+ * attributes, which is the failure mode compound.ts was built to eliminate.
26
+ */
27
+ import { type CompoundInfo } from './compound.js';
28
+ export interface BuildHtmlTreeJsOptions {
29
+ /** CSS selector to scope the tree; unscoped = documentElement */
30
+ selector?: string | null;
31
+ /** Max depth below the root (0 = root only, no children). Omit = unlimited. */
32
+ depth?: number | null;
33
+ /** Max element children per node before the rest get dropped. Omit = unlimited. */
34
+ childrenMax?: number | null;
35
+ /** Max chars of direct text per node before truncation. Omit = unlimited. */
36
+ textMax?: number | null;
37
+ }
38
+ /**
39
+ * Returns a JS expression string. When evaluated in a page context the
40
+ * expression resolves to either
41
+ * `{selector, matched, tree, truncated}` on success, or
42
+ * `{selector, invalidSelector: true, reason}` when `querySelectorAll`
43
+ * throws a `SyntaxError` for an unparseable selector.
44
+ *
45
+ * Callers must branch on `invalidSelector` to convert it into the CLI's
46
+ * `invalid_selector` structured error; otherwise the browser-level exception
47
+ * would bubble out of `page.evaluate` and bypass the structured-error
48
+ * contract that agents rely on.
49
+ */
50
+ export declare function buildHtmlTreeJs(opts?: BuildHtmlTreeJsOptions): string;
51
+ export interface HtmlNode {
52
+ tag: string;
53
+ attrs: Record<string, string>;
54
+ text: string;
55
+ children: HtmlNode[];
56
+ /**
57
+ * Rich view for date/select/file controls. Omitted for non-compound elements
58
+ * so agents can rely on `compound != null` as a signal.
59
+ */
60
+ compound?: CompoundInfo;
61
+ }
62
+ export interface HtmlTreeTruncationInfo {
63
+ /** At least one element child was dropped because depth budget was hit. */
64
+ depth?: true;
65
+ /** Count of element children dropped across the tree due to `childrenMax`. */
66
+ children_dropped?: number;
67
+ /** Count of nodes whose `text` was cut to `textMax`. */
68
+ text_truncated?: number;
69
+ }
70
+ export interface HtmlTreeResult {
71
+ selector: string | null;
72
+ matched: number;
73
+ tree: HtmlNode | null;
74
+ truncated?: HtmlTreeTruncationInfo;
75
+ }
@@ -0,0 +1,112 @@
1
+ /**
2
+ * Client-side HTML → structured tree serializer.
3
+ *
4
+ * Returned as a JS string that gets passed to `page.evaluate`. The expression
5
+ * walks the DOM subtree rooted at the first selector match (or documentElement
6
+ * when no selector is given) and emits a compact `{tag, attrs, text, children}`
7
+ * tree for agents to consume instead of re-parsing raw HTML.
8
+ *
9
+ * Text handling: `text` is the concatenated text of direct text children only,
10
+ * whitespace-collapsed. Nested element text is left inside `children[].text`.
11
+ * Ordering between text and elements is not preserved — agents that need it
12
+ * should fall back to raw HTML mode.
13
+ *
14
+ * Budget knobs let the caller bound the output on large pages — previously an
15
+ * unscoped `get html --as json` could return a giant tree. Callers set any
16
+ * combination of `depth` / `childrenMax` / `textMax`; each hit is reported in
17
+ * the `truncated` envelope so agents know to narrow their selector or raise
18
+ * the budget.
19
+ *
20
+ * Compound controls (date / time / datetime-local / month / week / select /
21
+ * file) gain a `compound` field so agents inspecting the JSON tree see the
22
+ * full contract — date format, full option list (up to cap) with selections
23
+ * preserved for options beyond the cap, file `accept` and `multiple`. Without
24
+ * this wiring agents repeatedly guess values on these controls from the raw
25
+ * attributes, which is the failure mode compound.ts was built to eliminate.
26
+ */
27
+ import { COMPOUND_INFO_JS } from './compound.js';
28
/**
 * Produce the JS expression string for the HTML → tree serializer.
 *
 * Evaluated in a page context, the expression yields either
 *   `{selector, matched, tree, truncated}` on success, or
 *   `{selector, invalidSelector: true, reason}` when `querySelectorAll`
 *   throws for an unparseable selector.
 *
 * Callers must branch on `invalidSelector` and map it to the CLI's
 * `invalid_selector` structured error; the script catches the browser
 * exception precisely so it does not escape `page.evaluate` and bypass the
 * structured-error contract that agents rely on.
 *
 * @param {{ selector?: string|null, depth?: number|null, childrenMax?: number|null, textMax?: number|null }} [opts]
 *   Scope selector plus optional output budgets. A budget that is not a
 *   finite, non-negative number is emitted as `null` (unlimited).
 * @returns {string} JS expression string (an IIFE).
 */
export function buildHtmlTreeJs(opts = {}) {
    // One rule for all three budgets: usable numbers are embedded as-is, the rest become `null`.
    const asBudgetLiteral = (budget) => {
        if (Number.isFinite(budget) && budget >= 0) {
            return String(budget);
        }
        return 'null';
    };
    const literals = {
        selector: opts.selector ? JSON.stringify(opts.selector) : 'null',
        depth: asBudgetLiteral(opts.depth),
        childrenMax: asBudgetLiteral(opts.childrenMax),
        textMax: asBudgetLiteral(opts.textMax),
    };
    return `(() => {
  ${COMPOUND_INFO_JS}
  const selector = ${literals.selector};
  const maxDepth = ${literals.depth};
  const maxChildren = ${literals.childrenMax};
  const maxText = ${literals.textMax};
  let matches;
  if (selector) {
    try { matches = document.querySelectorAll(selector); }
    catch (e) {
      return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) };
    }
  } else {
    matches = [document.documentElement];
  }
  const matched = matches.length;
  const root = matches[0] || null;
  const trunc = { depth: false, children_dropped: 0, text_truncated: 0 };
  function serialize(el, depth) {
    if (!el || el.nodeType !== 1) return null;
    const attrs = {};
    for (const a of el.attributes) attrs[a.name] = a.value;
    let text = '';
    for (const n of el.childNodes) {
      if (n.nodeType === 3) text += n.nodeValue;
    }
    text = text.replace(/\\s+/g, ' ').trim();
    if (maxText !== null && text.length > maxText) {
      text = text.slice(0, maxText);
      trunc.text_truncated++;
    }
    const children = [];
    if (maxDepth === null || depth < maxDepth) {
      const childEls = [];
      for (const n of el.childNodes) if (n.nodeType === 1) childEls.push(n);
      const keep = maxChildren === null ? childEls.length : Math.min(childEls.length, maxChildren);
      for (let i = 0; i < keep; i++) {
        const child = serialize(childEls[i], depth + 1);
        if (child) children.push(child);
      }
      if (maxChildren !== null && childEls.length > maxChildren) {
        trunc.children_dropped += childEls.length - maxChildren;
      }
    } else {
      // Budget hit: we're at max depth. Count any element children we would have visited.
      for (const n of el.childNodes) if (n.nodeType === 1) { trunc.depth = true; break; }
    }
    const node = { tag: el.tagName.toLowerCase(), attrs, text, children };
    const compound = compoundInfoOf(el);
    if (compound) node.compound = compound;
    return node;
  }
  const tree = root ? serialize(root, 0) : null;
  const truncatedOut = {};
  if (trunc.depth) truncatedOut.depth = true;
  if (trunc.children_dropped > 0) truncatedOut.children_dropped = trunc.children_dropped;
  if (trunc.text_truncated > 0) truncatedOut.text_truncated = trunc.text_truncated;
  const envelope = { selector: selector, matched: matched, tree: tree };
  if (Object.keys(truncatedOut).length > 0) envelope.truncated = truncatedOut;
  return envelope;
})()`;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,181 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { buildHtmlTreeJs } from './html-tree.js';
3
+ /**
4
+ * The serializer runs in a page context via `page.evaluate`. In unit tests we
5
+ * substitute `document` with a minimal stub that mirrors the DOM surface used
6
+ * by the expression, then Function-eval the returned JS.
7
+ */
8
/**
 * Evaluate the serializer expression against a stubbed `document`.
 *
 * @param {object|null} root - Node exposed as `document.documentElement`.
 * @param {Array} selectorMatches - What the stubbed `querySelectorAll` returns.
 * @param {string|null} selector - Selector passed through to `buildHtmlTreeJs`.
 * @param {object} [budgets] - Extra `buildHtmlTreeJs` options (depth / childrenMax / textMax).
 * @returns {*} Whatever the generated expression evaluates to.
 */
function runTreeJs(root, selectorMatches, selector, budgets = {}) {
    const options = Object.assign({ selector }, budgets);
    const expression = buildHtmlTreeJs(options);
    const documentStub = {
        querySelectorAll() {
            return selectorMatches;
        },
        documentElement: root,
    };
    // `document` is injected as a parameter so the expression never touches a real DOM.
    const evaluate = new Function('document', `return ${expression};`);
    return evaluate(documentStub);
}
17
/**
 * Evaluate the serializer expression against a `document` whose
 * `querySelectorAll` always throws an error named `SyntaxError`.
 *
 * @param {string} selector - Selector passed through to `buildHtmlTreeJs`.
 * @param {string} errorMessage - Message carried by the thrown error.
 * @returns {*} Whatever the generated expression evaluates to.
 */
function runTreeJsInvalid(selector, errorMessage) {
    const expression = buildHtmlTreeJs({ selector });
    const documentStub = {
        querySelectorAll() {
            // A plain Error renamed to SyntaxError stands in for the browser's failure.
            throw Object.assign(new Error(errorMessage), { name: 'SyntaxError' });
        },
        documentElement: null,
    };
    const evaluate = new Function('document', `return ${expression};`);
    return evaluate(documentStub);
}
26
/**
 * Build a minimal element stub exposing only the DOM surface the serializer reads.
 *
 * @param {string} tag - Tag name; stored upper-cased in `tagName`.
 * @param {Object<string, string>} attrs - Attribute name → value map.
 * @param {Array} children - Child node stubs, used as `childNodes`.
 * @param {{ value?: *, multiple?: *, files?: *, options?: * }} [extras] - Form-control properties copied onto the stub.
 * @returns {object} Element-like stub with `nodeType` 1.
 */
function el(tag, attrs, children, extras = {}) {
    const attributes = [];
    for (const name of Object.keys(attrs)) {
        attributes.push({ name, value: attrs[name] });
    }
    const { value, multiple, files, options } = extras;
    return {
        nodeType: 1,
        tagName: tag.toUpperCase(),
        attributes,
        childNodes: children,
        getAttribute: (name) => {
            if (name in attrs) {
                return attrs[name];
            }
            return null;
        },
        value,
        multiple,
        files,
        options,
    };
}
39
/**
 * Build a text-node stub (`nodeType` 3) carrying `value` as its `nodeValue`.
 *
 * @param {string} value - Text content of the node.
 * @returns {{ nodeType: number, nodeValue: string }} Text-node-like stub.
 */
function txt(value) {
    const textNode = { nodeType: 3, nodeValue: value };
    return textNode;
}
40
+ describe('buildHtmlTreeJs', () => {
41
+ it('serializes a simple element into {tag, attrs, text, children}', () => {
42
+ const root = el('div', { class: 'hero', id: 'x' }, [txt('Hello')]);
43
+ const result = runTreeJs(root, [root], null);
44
+ expect(result.selector).toBeNull();
45
+ expect(result.matched).toBe(1);
46
+ expect(result.tree).toEqual({
47
+ tag: 'div',
48
+ attrs: { class: 'hero', id: 'x' },
49
+ text: 'Hello',
50
+ children: [],
51
+ });
52
+ });
53
+ it('collapses whitespace in direct text content only', () => {
54
+ const root = el('p', {}, [
55
+ txt(' line \n one '),
56
+ el('span', {}, [txt('inner text')]),
57
+ txt('\tline two\t'),
58
+ ]);
59
+ const result = runTreeJs(root, [root], null);
60
+ expect(result.tree?.text).toBe('line one line two');
61
+ expect(result.tree?.children[0].text).toBe('inner text');
62
+ });
63
+ it('recurses into element children and preserves their attrs', () => {
64
+ const root = el('ul', { role: 'list' }, [
65
+ el('li', { 'data-id': '1' }, [txt('first')]),
66
+ el('li', { 'data-id': '2' }, [txt('second')]),
67
+ ]);
68
+ const result = runTreeJs(root, [root], null);
69
+ expect(result.tree?.children).toHaveLength(2);
70
+ expect(result.tree?.children[0]).toEqual({
71
+ tag: 'li',
72
+ attrs: { 'data-id': '1' },
73
+ text: 'first',
74
+ children: [],
75
+ });
76
+ });
77
+ it('returns matched=N and serializes only the first match', () => {
78
+ const first = el('article', { id: 'a' }, [txt('first')]);
79
+ const second = el('article', { id: 'b' }, [txt('second')]);
80
+ const result = runTreeJs(null, [first, second], 'article');
81
+ expect(result.matched).toBe(2);
82
+ expect(result.tree?.attrs.id).toBe('a');
83
+ });
84
+ it('returns tree=null and matched=0 when selector matches nothing', () => {
85
+ const result = runTreeJs(null, [], '.nothing');
86
+ expect(result.matched).toBe(0);
87
+ expect(result.tree).toBeNull();
88
+ });
89
+ it('catches SyntaxError from querySelectorAll and returns {invalidSelector:true, reason}', () => {
90
+ const result = runTreeJsInvalid('##$@@', "'##$@@' is not a valid selector");
91
+ expect(result.invalidSelector).toBe(true);
92
+ expect(result.selector).toBe('##$@@');
93
+ expect(result.reason).toContain('not a valid selector');
94
+ });
95
+ it('omits `truncated` when no budget is hit', () => {
96
+ const root = el('div', {}, [el('span', {}, [txt('ok')])]);
97
+ const result = runTreeJs(root, [root], null, { depth: 5, childrenMax: 10, textMax: 100 });
98
+ expect(result.truncated).toBeUndefined();
99
+ });
100
+ });
101
+ describe('buildHtmlTreeJs budget knobs', () => {
102
+ it('caps tree at `depth` and reports truncated.depth', () => {
103
+ const deep = el('a', {}, [
104
+ el('b', {}, [
105
+ el('c', {}, [el('d', {}, [txt('deep')])]),
106
+ ]),
107
+ ]);
108
+ // depth=1 → root + one level of children; grandchildren should be dropped.
109
+ const result = runTreeJs(deep, [deep], null, { depth: 1 });
110
+ expect(result.tree?.tag).toBe('a');
111
+ expect(result.tree?.children).toHaveLength(1);
112
+ expect(result.tree?.children[0].tag).toBe('b');
113
+ // The "b" node had element children but we hit the depth budget before
114
+ // recursing into them — children array is empty, truncated.depth is true.
115
+ expect(result.tree?.children[0].children).toEqual([]);
116
+ expect(result.truncated?.depth).toBe(true);
117
+ });
118
+ it('depth=0 keeps only the root', () => {
119
+ const root = el('ul', {}, [
120
+ el('li', {}, [txt('a')]),
121
+ el('li', {}, [txt('b')]),
122
+ ]);
123
+ const result = runTreeJs(root, [root], null, { depth: 0 });
124
+ expect(result.tree?.children).toEqual([]);
125
+ expect(result.truncated?.depth).toBe(true);
126
+ });
127
+ it('caps children per node at `childrenMax` and reports children_dropped count', () => {
128
+ const root = el('ul', {}, [
129
+ el('li', {}, [txt('1')]),
130
+ el('li', {}, [txt('2')]),
131
+ el('li', {}, [txt('3')]),
132
+ el('li', {}, [txt('4')]),
133
+ el('li', {}, [txt('5')]),
134
+ ]);
135
+ const result = runTreeJs(root, [root], null, { childrenMax: 2 });
136
+ expect(result.tree?.children).toHaveLength(2);
137
+ expect(result.truncated?.children_dropped).toBe(3);
138
+ });
139
+ it('caps direct text per node at `textMax` and reports text_truncated count', () => {
140
+ const root = el('p', {}, [
141
+ txt('a'.repeat(50)),
142
+ el('span', {}, [txt('b'.repeat(50))]),
143
+ ]);
144
+ const result = runTreeJs(root, [root], null, { textMax: 10 });
145
+ expect(result.tree?.text).toHaveLength(10);
146
+ expect(result.tree?.children[0].text).toHaveLength(10);
147
+ expect(result.truncated?.text_truncated).toBe(2);
148
+ });
149
+ // Blocker B regression: compound contract must ride along with the
150
+ // json tree so `browser get html --as json` surfaces the full contract
151
+ // to agents without an extra round-trip.
152
+ it('attaches compound info to date/file/select nodes and omits it elsewhere', () => {
153
+ const date = el('input', { type: 'date', min: '2026-01-01' }, [], { value: '2026-04-21' });
154
+ const file = el('input', { type: 'file', accept: 'image/*' }, [], { multiple: true, files: [{ name: 'a.png' }] });
155
+ const sel = el('select', { name: 'country' }, [], {
156
+ options: [
157
+ { value: 'us', label: 'United States', selected: true },
158
+ { value: 'ca', label: 'Canada' },
159
+ ],
160
+ });
161
+ const plain = el('input', { type: 'text' }, [], { value: 'hi' });
162
+ const root = el('form', {}, [date, file, sel, plain]);
163
+ const result = runTreeJs(root, [root], null);
164
+ expect(result.tree?.children[0].compound).toMatchObject({ control: 'date', format: 'YYYY-MM-DD', current: '2026-04-21', min: '2026-01-01' });
165
+ expect(result.tree?.children[1].compound).toMatchObject({ control: 'file', multiple: true, current: ['a.png'], accept: 'image/*' });
166
+ expect(result.tree?.children[2].compound).toMatchObject({ control: 'select', multiple: false, current: 'United States' });
167
+ expect(result.tree?.children[3].compound).toBeUndefined();
168
+ });
169
+ it('combines budgets and reports every hit', () => {
170
+ const root = el('ul', {}, [
171
+ el('li', {}, [txt('x'.repeat(20)), el('em', {}, [txt('y')])]),
172
+ el('li', {}, []),
173
+ el('li', {}, []),
174
+ ]);
175
+ const result = runTreeJs(root, [root], null, { depth: 1, childrenMax: 2, textMax: 5 });
176
+ expect(result.tree?.children).toHaveLength(2);
177
+ expect(result.truncated?.children_dropped).toBe(1);
178
+ expect(result.truncated?.text_truncated).toBe(1);
179
+ expect(result.truncated?.depth).toBe(true);
180
+ });
181
+ });