@jackwener/opencli 1.7.5 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +5 -2
  2. package/README.zh-CN.md +5 -2
  3. package/cli-manifest.json +77 -1
  4. package/clis/bilibili/video.js +61 -0
  5. package/clis/bilibili/video.test.js +81 -0
  6. package/clis/deepseek/ask.js +21 -1
  7. package/clis/deepseek/ask.test.js +73 -0
  8. package/clis/deepseek/utils.js +84 -1
  9. package/clis/deepseek/utils.test.js +37 -0
  10. package/clis/jianyu/search.js +139 -3
  11. package/clis/jianyu/search.test.js +25 -0
  12. package/clis/jianyu/shared/procurement-detail.js +15 -0
  13. package/clis/jianyu/shared/procurement-detail.test.js +12 -0
  14. package/clis/twitter/shared.js +7 -2
  15. package/clis/twitter/tweets.js +218 -0
  16. package/clis/twitter/tweets.test.js +125 -0
  17. package/clis/youtube/channel.js +35 -0
  18. package/dist/src/browser/base-page.d.ts +13 -3
  19. package/dist/src/browser/base-page.js +35 -25
  20. package/dist/src/browser/cdp.d.ts +1 -0
  21. package/dist/src/browser/cdp.js +12 -3
  22. package/dist/src/browser/compound.d.ts +59 -0
  23. package/dist/src/browser/compound.js +112 -0
  24. package/dist/src/browser/compound.test.d.ts +1 -0
  25. package/dist/src/browser/compound.test.js +175 -0
  26. package/dist/src/browser/dom-snapshot.d.ts +7 -0
  27. package/dist/src/browser/dom-snapshot.js +76 -3
  28. package/dist/src/browser/dom-snapshot.test.js +65 -0
  29. package/dist/src/browser/extract.d.ts +69 -0
  30. package/dist/src/browser/extract.js +132 -0
  31. package/dist/src/browser/extract.test.d.ts +1 -0
  32. package/dist/src/browser/extract.test.js +129 -0
  33. package/dist/src/browser/find.d.ts +76 -0
  34. package/dist/src/browser/find.js +179 -0
  35. package/dist/src/browser/find.test.d.ts +1 -0
  36. package/dist/src/browser/find.test.js +120 -0
  37. package/dist/src/browser/html-tree.d.ts +75 -0
  38. package/dist/src/browser/html-tree.js +112 -0
  39. package/dist/src/browser/html-tree.test.d.ts +1 -0
  40. package/dist/src/browser/html-tree.test.js +181 -0
  41. package/dist/src/browser/network-cache.d.ts +48 -0
  42. package/dist/src/browser/network-cache.js +66 -0
  43. package/dist/src/browser/network-cache.test.d.ts +1 -0
  44. package/dist/src/browser/network-cache.test.js +58 -0
  45. package/dist/src/browser/network-key.d.ts +22 -0
  46. package/dist/src/browser/network-key.js +66 -0
  47. package/dist/src/browser/network-key.test.d.ts +1 -0
  48. package/dist/src/browser/network-key.test.js +49 -0
  49. package/dist/src/browser/shape-filter.d.ts +52 -0
  50. package/dist/src/browser/shape-filter.js +101 -0
  51. package/dist/src/browser/shape-filter.test.d.ts +1 -0
  52. package/dist/src/browser/shape-filter.test.js +101 -0
  53. package/dist/src/browser/shape.d.ts +23 -0
  54. package/dist/src/browser/shape.js +95 -0
  55. package/dist/src/browser/shape.test.d.ts +1 -0
  56. package/dist/src/browser/shape.test.js +82 -0
  57. package/dist/src/browser/target-errors.d.ts +14 -1
  58. package/dist/src/browser/target-errors.js +13 -0
  59. package/dist/src/browser/target-errors.test.js +39 -6
  60. package/dist/src/browser/target-resolver.d.ts +57 -10
  61. package/dist/src/browser/target-resolver.js +195 -75
  62. package/dist/src/browser/target-resolver.test.js +80 -5
  63. package/dist/src/cli.js +630 -125
  64. package/dist/src/cli.test.js +794 -0
  65. package/dist/src/execution.js +7 -2
  66. package/dist/src/execution.test.js +54 -0
  67. package/dist/src/main.js +16 -0
  68. package/dist/src/types.d.ts +18 -3
  69. package/package.json +1 -1
@@ -22,7 +22,15 @@
22
22
  * Additional tools:
23
23
  * - scrollToRefJs(ref) — scroll to a data-opencli-ref element
24
24
  * - getFormStateJs() — extract all form fields as structured JSON
25
+ *
26
+ * Compound sidecar:
27
+ * After the tree, a `compounds:` section lists rich JSON for every
28
+ * date/select/file ref — format, full option list (up to cap) with
29
+ * `options_total` reflecting the true count, file `accept` + `multiple`.
30
+ * This is what the snapshot's inline attr dump cannot express and what
31
+ * agents kept blowing turns on.
25
32
  */
33
+ import { COMPOUND_INFO_JS } from './compound.js';
26
34
  // ─── Utility JS Generators ───────────────────────────────────────────
27
35
  /**
28
36
  * Generate JS to scroll to an element identified by data-opencli-ref.
@@ -155,6 +163,8 @@ export function generateSnapshotJs(opts = {}) {
155
163
  (() => {
156
164
  'use strict';
157
165
 
166
+ ${COMPOUND_INFO_JS}
167
+
158
168
  // ── Config ─────────────────────────────────────────────────────────
159
169
  const VIEWPORT_EXPAND = ${viewportExpand};
160
170
  const MAX_DEPTH = ${maxDepth};
@@ -222,6 +232,38 @@ export function generateSnapshotJs(opts = {}) {
222
232
 
223
233
  const PROPAGATING_TAGS = new Set(['a', 'button']);
224
234
 
235
+ // Roles whose element wraps its own interactive descendants (icon spans
236
+ // inside a role=button, chevron inside role=link). When we see one of these,
237
+ // we propagate its bbox to children so we can suppress duplicate refs on
238
+ // undistinctive descendants that are ≥99% contained.
239
+ const PROPAGATING_ROLES = new Set(['button', 'link', 'menuitem', 'tab', 'option']);
240
+
241
+ function isBboxPropagator(el, tag) {
242
+ if (PROPAGATING_TAGS.has(tag)) return true;
243
+ const role = el.getAttribute('role');
244
+ return !!(role && PROPAGATING_ROLES.has(role));
245
+ }
246
+
247
+ // True when an interactive element still deserves its own [N] ref even
248
+ // though it's visually subsumed by a propagating ancestor. Anything with
249
+ // an aria-label, aria-labelledby, id, test id, name, or its own form
250
+ // semantics is treated as distinctive — everything else (naked spans /
251
+ // divs / svgs that merely inherit click from the parent button) gets
252
+ // folded into the parent so the snapshot doesn't ship [1]<button>[2]<svg>.
253
+ function isDistinctivelyInteractive(el) {
254
+ if (el.hasAttribute('aria-label')) return true;
255
+ if (el.hasAttribute('aria-labelledby')) return true;
256
+ if (el.id) return true;
257
+ if (el.getAttribute('data-testid') || el.getAttribute('data-test')) return true;
258
+ if (el.hasAttribute('name')) return true;
259
+ const tag = el.tagName.toLowerCase();
260
+ // Real form controls always stand on their own, even when nested in a label/button
261
+ if (tag === 'input' || tag === 'select' || tag === 'textarea') return true;
262
+ // Anchors with their own href are distinct targets
263
+ if (tag === 'a' && el.hasAttribute('href')) return true;
264
+ return false;
265
+ }
266
+
225
267
  const AD_PATTERNS = [
226
268
  'googleadservices.com', 'doubleclick.net', 'googlesyndication.com',
227
269
  'facebook.com/tr', 'analytics.google.com', 'connect.facebook.net',
@@ -576,6 +618,7 @@ export function generateSnapshotJs(opts = {}) {
576
618
  const hiddenInteractives = [];
577
619
  const currentHashes = [];
578
620
  const refIdentity = {};
621
+ const compoundInfos = {};
579
622
  let iframeCount = 0;
580
623
  let crossOriginIndex = 0;
581
624
 
@@ -627,7 +670,9 @@ export function generateSnapshotJs(opts = {}) {
627
670
  if (!(tag === 'input' && el.type === 'file')) return false;
628
671
  }
629
672
 
630
- const interactive = isInteractive(el);
673
+ // \`interactive\` gets demoted below if bbox containment folds this node
674
+ // into a propagating ancestor — using \`let\` so the dedup pass can mutate it.
675
+ let interactive = isInteractive(el);
631
676
 
632
677
  // Viewport threshold pruning
633
678
  if (hasArea && !isInExpandedViewport(rect)) {
@@ -648,7 +693,7 @@ export function generateSnapshotJs(opts = {}) {
648
693
  const scrollInfo = getScrollInfo(el);
649
694
  const isScrollable = scrollInfo !== null;
650
695
 
651
- // BBox dedup
696
+ // BBox dedup — tier 1 (non-interactive descendants, 0.95 threshold)
652
697
  let excludedByParent = false;
653
698
  if (BBOX_DEDUP && parentPropagatingRect && !interactive) {
654
699
  if (hasArea && isContainedBy(rect, parentPropagatingRect, 0.95)) {
@@ -660,8 +705,19 @@ export function generateSnapshotJs(opts = {}) {
660
705
  }
661
706
  }
662
707
 
708
+ // BBox dedup — tier 2 (interactive descendants, 0.99 threshold, browser-use style).
709
+ // This kills the "[1]<button> [2]<svg> [3]<span>" noise on icon-buttons by
710
+ // folding the icon / chevron into the button's ref. The 0.99 threshold + the
711
+ // isDistinctivelyInteractive gate together ensure we only drop nodes that
712
+ // add no new actionable surface — a nested <input> or <a href> stays.
713
+ if (BBOX_DEDUP && parentPropagatingRect && interactive && hasArea) {
714
+ if (isContainedBy(rect, parentPropagatingRect, 0.99) && !isDistinctivelyInteractive(el)) {
715
+ interactive = false;
716
+ }
717
+ }
718
+
663
719
  let propagateRect = parentPropagatingRect;
664
- if (BBOX_DEDUP && PROPAGATING_TAGS.has(tag) && hasArea) propagateRect = rect;
720
+ if (BBOX_DEDUP && hasArea && isBboxPropagator(el, tag)) propagateRect = rect;
665
721
 
666
722
  // Process children
667
723
  const origLen = lines.length;
@@ -725,6 +781,10 @@ export function generateSnapshotJs(opts = {}) {
725
781
  id: el.id || '',
726
782
  testId: el.getAttribute('data-testid') || el.getAttribute('data-test') || '',
727
783
  };
784
+ // Compound contract for date/select/file — captured per-ref so the
785
+ // sidecar maps one-to-one with the [N] tokens in the tree.
786
+ const compound = compoundInfoOf(el);
787
+ if (compound) compoundInfos['' + interactiveIndex] = compound;
728
788
  }
729
789
 
730
790
  // Tag + attributes
@@ -806,6 +866,19 @@ export function generateSnapshotJs(opts = {}) {
806
866
  if (hiddenInteractives.length > 10) lines.push(' …' + (hiddenInteractives.length - 10) + ' more');
807
867
  }
808
868
 
869
+ // Compound sidecar — rich JSON for date/select/file refs. Keys align with [N] tokens in the tree.
870
+ const compoundRefs = Object.keys(compoundInfos);
871
+ if (compoundRefs.length > 0) {
872
+ lines.push('---');
873
+ lines.push('compounds (' + compoundRefs.length + '):');
874
+ compoundRefs.sort(function (a, b) { return parseInt(a, 10) - parseInt(b, 10); });
875
+ for (const ref of compoundRefs) {
876
+ try {
877
+ lines.push(' [' + ref + '] ' + JSON.stringify(compoundInfos[ref]));
878
+ } catch {}
879
+ }
880
+ }
881
+
809
882
  // Footer
810
883
  lines.push('---');
811
884
  lines.push('interactive: ' + interactiveIndex + ' | iframes: ' + iframeCount);
@@ -102,6 +102,9 @@ describe('generateSnapshotJs', () => {
102
102
  // BBox dedup
103
103
  expect(js).toContain('isContainedBy');
104
104
  expect(js).toContain('PROPAGATING_TAGS');
105
+ expect(js).toContain('PROPAGATING_ROLES');
106
+ expect(js).toContain('isBboxPropagator');
107
+ expect(js).toContain('isDistinctivelyInteractive');
105
108
  // Shadow DOM
106
109
  expect(js).toContain('shadowRoot');
107
110
  expect(js).toContain('|shadow|');
@@ -151,6 +154,55 @@ describe('generateSnapshotJs', () => {
151
154
  expect(js).toContain('page_scroll');
152
155
  });
153
156
  });
157
+ describe('BBox 99% containment filter', () => {
158
+ it('propagates bbox for both PROPAGATING_TAGS and PROPAGATING_ROLES', () => {
159
+ const js = generateSnapshotJs();
160
+ // Role-based propagator list covers the common wrapper-as-control patterns
161
+ // that show up as <div role=button><svg/><span/></div> on modern SPAs.
162
+ for (const role of ['button', 'link', 'menuitem', 'tab', 'option']) {
163
+ expect(js).toContain(`'${role}'`);
164
+ }
165
+ // propagate site uses the unified helper, not only the tag set
166
+ expect(js).toContain('isBboxPropagator(el, tag)');
167
+ });
168
+ it('suppresses interactive descendants at 0.99 containment when they are not distinctive', () => {
169
+ const js = generateSnapshotJs();
170
+ expect(js).toContain('isContainedBy(rect, parentPropagatingRect, 0.99)');
171
+ expect(js).toContain('!isDistinctivelyInteractive(el)');
172
+ // The suppression path flips the local interactive flag so the node is
173
+ // still emitted (for text / shape) but does not get its own [N] ref.
174
+ expect(js).toContain('interactive = false');
175
+ });
176
+ it('does not suppress inputs / href-bearing anchors even when fully contained', () => {
177
+ const js = generateSnapshotJs();
178
+ // Guards inside isDistinctivelyInteractive
179
+ expect(js).toContain("tag === 'input'");
180
+ expect(js).toContain("tag === 'select'");
181
+ expect(js).toContain("tag === 'textarea'");
182
+ expect(js).toContain("tag === 'a'");
183
+ expect(js).toContain("el.hasAttribute('href')");
184
+ // aria-label / aria-labelledby / id / test-id / name preserve distinctness
185
+ expect(js).toContain("el.hasAttribute('aria-label')");
186
+ expect(js).toContain("el.hasAttribute('aria-labelledby')");
187
+ expect(js).toContain("el.id");
188
+ expect(js).toContain("el.getAttribute('data-testid')");
189
+ expect(js).toContain("el.hasAttribute('name')");
190
+ });
191
+ it('keeps the existing 0.95 non-interactive dedup tier in place', () => {
192
+ const js = generateSnapshotJs();
193
+ // The original non-interactive bbox filter is still present alongside the
194
+ // new interactive tier — two complementary thresholds, not a replacement.
195
+ expect(js).toContain('isContainedBy(rect, parentPropagatingRect, 0.95)');
196
+ });
197
+ it('bbox containment branches are gated on BBOX_DEDUP flag', () => {
198
+ const off = generateSnapshotJs({ bboxDedup: false });
199
+ // When the option is off, the filter becomes inert (BBOX_DEDUP = false)
200
+ // but the inlined helpers still ship — we only guard at the call sites.
201
+ expect(off).toContain('BBOX_DEDUP = false');
202
+ expect(off).toContain('isBboxPropagator');
203
+ expect(off).toContain('isDistinctivelyInteractive');
204
+ });
205
+ });
154
206
  describe('scrollToRefJs', () => {
155
207
  it('generates valid JS', () => {
156
208
  const js = scrollToRefJs('42');
@@ -245,4 +297,17 @@ describe('Search Element Detection', () => {
245
297
  const js = generateSnapshotJs();
246
298
  expect(js).toContain('isSearchElement(el)');
247
299
  });
300
+ // Blocker B regression: compound contract must be emitted by `browser state`,
301
+ // not only by `browser find --css`. Otherwise agents inspecting the default
302
+ // snapshot still have to round-trip `find` on every date/select/file control.
303
+ it('inlines compoundInfoOf() and attaches compound info to each interactive ref', () => {
304
+ const js = generateSnapshotJs();
305
+ expect(js).toContain('function compoundInfoOf(el)');
306
+ // Wiring: the walk body should call compoundInfoOf on every interactive node
307
+ expect(js).toContain('compoundInfoOf(el)');
308
+ // And collect them into a per-ref map keyed by the same [N] index as the tree
309
+ expect(js).toContain('compoundInfos');
310
+ // And emit a sidecar section after the tree so agents can find the JSON
311
+ expect(js).toContain("'compounds ('");
312
+ });
248
313
  });
@@ -0,0 +1,69 @@
1
+ /**
2
+ * `browser extract` — agent-native article/content reading channel.
3
+ *
4
+ * Pipeline (from first principles — agents want the *content*, not the DOM):
5
+ * 1. Scope: select `--selector` (default: document.body or <main>/<article>)
6
+ * 2. Denoise: strip script/style/nav/header/footer/aside/iframe/svg/form, inline noise
7
+ * 3. Convert: HTML → Markdown via shared `htmlToMarkdown` (turndown)
8
+ * 4. Chunk: paragraph-boundary-aware slicing with `next_start_char` cursor
9
+ *
10
+ * Why a separate command:
11
+ * - `get html --as json` returns tree structure; useless for "read the article".
12
+ * - `get text` flattens everything; loses headings, lists, links.
13
+ * - Markdown is the agent-readable middle ground: structure preserved, noise gone.
14
+ *
15
+ * Continuation contract: the envelope always carries `start`, `end`,
16
+ * `total_chars`, and `next_start_char` (null when the last chunk was emitted).
17
+ * Agents pass `--start <next>` to continue. No session state required.
18
+ */
19
+ /**
20
+ * Returns the JS expression string used with `page.evaluate` to produce the
21
+ * cleaned HTML subtree that we then hand to `htmlToMarkdown`. We do the
22
+ * denoise/clone inside the page so we can use DOM APIs (querySelectorAll,
23
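+ * cloneNode) rather than regex on serialized HTML. The expression evaluates to `{ ok, url, title, html }` on success, `{ notFound: true }` when no root element is found, or `{ invalidSelector: true, reason }` when `querySelector` throws.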
24
+ */
25
+ export declare function buildExtractHtmlJs(selector: string | null): string;
26
+ export interface ExtractChunkOptions {
27
+ content: string;
28
+ start: number;
29
+ chunkSize: number;
30
+ }
31
+ export interface ExtractChunkResult {
32
+ content: string;
33
+ start: number;
34
+ end: number;
35
+ nextStartChar: number | null;
36
+ }
37
+ /**
38
+ * Slice `content` into one chunk starting at `start` with target size
39
+ * `chunkSize`. When the chunk would land mid-paragraph, we pull the break
40
+ * back to the nearest `\n\n` (or `\n`) within a small window to keep the
41
+ * output readable. If no boundary is found, we hard-cut at `start+chunkSize`.
42
+ */
43
+ export declare function chunkMarkdown(opts: ExtractChunkOptions): ExtractChunkResult;
44
+ export interface RunExtractOptions {
45
+ html: string;
46
+ url: string;
47
+ title: string;
48
+ selector: string | null;
49
+ start: number;
50
+ chunkSize: number;
51
+ }
52
+ export interface RunExtractResult {
53
+ url: string;
54
+ title: string;
55
+ selector: string | null;
56
+ total_chars: number;
57
+ chunk_size: number;
58
+ start: number;
59
+ end: number;
60
+ next_start_char: number | null;
61
+ content: string;
62
+ }
63
+ /** End-to-end host-side pipeline: HTML → markdown → chunked envelope. */
64
+ export declare function runExtractFromHtml(opts: RunExtractOptions): RunExtractResult;
65
+ export declare const __extractInternals: {
66
+ DEFAULT_CHUNK_SIZE: number;
67
+ MIN_CHUNK_SIZE: number;
68
+ MAX_CHUNK_SIZE: number;
69
+ };
@@ -0,0 +1,132 @@
1
+ /**
2
+ * `browser extract` — agent-native article/content reading channel.
3
+ *
4
+ * Pipeline (from first principles — agents want the *content*, not the DOM):
5
+ * 1. Scope: select `--selector` (default: document.body or <main>/<article>)
6
+ * 2. Denoise: strip script/style/nav/header/footer/aside/iframe/svg/form, inline noise
7
+ * 3. Convert: HTML → Markdown via shared `htmlToMarkdown` (turndown)
8
+ * 4. Chunk: paragraph-boundary-aware slicing with `next_start_char` cursor
9
+ *
10
+ * Why a separate command:
11
+ * - `get html --as json` returns tree structure; useless for "read the article".
12
+ * - `get text` flattens everything; loses headings, lists, links.
13
+ * - Markdown is the agent-readable middle ground: structure preserved, noise gone.
14
+ *
15
+ * Continuation contract: the envelope always carries `start`, `end`,
16
+ * `total_chars`, and `next_start_char` (null when the last chunk was emitted).
17
+ * Agents pass `--start <next>` to continue. No session state required.
18
+ */
19
+ import { htmlToMarkdown } from '../utils.js';
20
+ const DEFAULT_CHUNK_SIZE = 20000;
21
+ const MIN_CHUNK_SIZE = 100;
22
+ const MAX_CHUNK_SIZE = 200000;
23
+ const BOUNDARY_WINDOW_RATIO = 0.15;
24
+ /**
25
+ * Returns the JS expression string used with `page.evaluate` to produce the
26
+ * cleaned HTML subtree that we then hand to `htmlToMarkdown`. We do the
27
+ * denoise/clone inside the page so we can use DOM APIs (querySelectorAll,
28
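+ * cloneNode) rather than regex on serialized HTML. The expression evaluates to `{ ok, url, title, html }` on success, `{ notFound: true }` when no root element is found, or `{ invalidSelector: true, reason }` when `querySelector` throws.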
29
+ */
30
+ export function buildExtractHtmlJs(selector) {
31
+ const selectorLiteral = selector ? JSON.stringify(selector) : 'null';
32
+ return `(() => {
33
+ const sel = ${selectorLiteral};
34
+ let root = null;
35
+ if (sel) {
36
+ try { root = document.querySelector(sel); }
37
+ catch (e) {
38
+ return { invalidSelector: true, reason: (e && e.message) || String(e) };
39
+ }
40
+ if (!root) return { notFound: true };
41
+ } else {
42
+ root = document.querySelector('main') || document.querySelector('article') || document.body || document.documentElement;
43
+ }
44
+ if (!root) return { notFound: true };
45
+ const clone = root.cloneNode(true);
46
+ const drop = [
47
+ 'script', 'style', 'noscript', 'template',
48
+ 'nav', 'header', 'footer', 'aside',
49
+ 'iframe', 'svg', 'canvas',
50
+ 'form', 'button', 'input', 'select', 'textarea',
51
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]', '[role="complementary"]',
52
+ '[aria-hidden="true"]',
53
+ ];
54
+ for (const q of drop) {
55
+ for (const n of clone.querySelectorAll(q)) n.remove();
56
+ }
57
+ // Also strip event-handler and style attributes that bloat markdown output.
58
+ const walker = document.createTreeWalker(clone, NodeFilter.SHOW_ELEMENT);
59
+ let n = walker.currentNode;
60
+ while (n) {
61
+ if (n.nodeType === 1) {
62
+ const el = n;
63
+ for (const a of [...el.attributes]) {
64
+ if (a.name.startsWith('on') || a.name === 'style' || a.name.startsWith('data-')) el.removeAttribute(a.name);
65
+ }
66
+ }
67
+ n = walker.nextNode();
68
+ }
69
+ return { ok: true, url: location.href, title: document.title || '', html: clone.outerHTML || '' };
70
+ })()`;
71
+ }
72
/**
 * Slice `content` into one chunk starting at `start` with target size
 * `chunkSize`. When the chunk would land mid-paragraph, the break is pulled
 * back to the last `\n\n` (or, failing that, the last `\n`) inside a small
 * window at the end of the chunk to keep the output readable. If no boundary
 * is found, the chunk is hard-cut at `start + chunkSize`.
 *
 * Input normalisation (so a bad cursor can never produce a NaN envelope):
 *   - `start`: NaN / undefined → 0; negative → 0; fractional → truncated.
 *   - `chunkSize`: NaN / undefined → DEFAULT_CHUNK_SIZE; then truncated and
 *     clamped to [MIN_CHUNK_SIZE, MAX_CHUNK_SIZE].
 *
 * @param opts.content   Full markdown text to slice.
 * @param opts.start     UTF-16 offset at which this chunk begins.
 * @param opts.chunkSize Target chunk length in UTF-16 code units.
 * @returns `{ content, start, end, nextStartChar }` where `start` is the
 *          normalised cursor, `end` is exclusive, and `nextStartChar` is
 *          `null` once the end of `content` has been reached.
 */
export function chunkMarkdown(opts) {
    const { content } = opts;
    // `Math.max`/`Math.min` propagate NaN, so sanitise before any arithmetic.
    const rawStart = Number(opts.start);
    const start = Number.isNaN(rawStart) ? 0 : Math.max(0, Math.trunc(rawStart));
    const rawSize = Number(opts.chunkSize);
    const requestedSize = Number.isNaN(rawSize) ? DEFAULT_CHUNK_SIZE : Math.trunc(rawSize);
    const chunkSize = Math.max(MIN_CHUNK_SIZE, Math.min(MAX_CHUNK_SIZE, requestedSize));
    // Cursor at or past the end: emit an empty terminal chunk.
    if (start >= content.length) {
        return { content: '', start, end: start, nextStartChar: null };
    }
    const hardEnd = Math.min(content.length, start + chunkSize);
    // Everything that remains fits: this is the last chunk.
    if (hardEnd === content.length) {
        return { content: content.slice(start, hardEnd), start, end: hardEnd, nextStartChar: null };
    }
    // Look for a natural break only in the tail of the chunk. `start + 1`
    // guarantees the cut always lies strictly after `start` (forward progress).
    const windowSize = Math.max(1, Math.floor(chunkSize * BOUNDARY_WINDOW_RATIO));
    const windowStart = Math.max(start + 1, hardEnd - windowSize);
    const slice = content.slice(windowStart, hardEnd);
    let cut = hardEnd;
    const paraBreak = slice.lastIndexOf('\n\n');
    if (paraBreak >= 0) {
        // Keep the blank line with the chunk that precedes it.
        cut = windowStart + paraBreak + 2;
    }
    else {
        const lineBreak = slice.lastIndexOf('\n');
        if (lineBreak >= 0)
            cut = windowStart + lineBreak + 1;
    }
    // Never split a UTF-16 surrogate pair: if the cut falls between a high
    // and a low surrogate, move it one unit back so the whole character
    // starts the next chunk. (`cut < content.length` holds here.)
    if (cut - 1 > start) {
        const before = content.charCodeAt(cut - 1);
        const after = content.charCodeAt(cut);
        const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        if (splitsPair)
            cut -= 1;
    }
    return {
        content: content.slice(start, cut),
        start,
        end: cut,
        nextStartChar: cut,
    };
}
108
+ /** End-to-end host-side pipeline: HTML → markdown → chunked envelope. */
109
+ export function runExtractFromHtml(opts) {
110
+ const md = htmlToMarkdown(opts.html);
111
+ const chunk = chunkMarkdown({
112
+ content: md,
113
+ start: Math.max(0, opts.start),
114
+ chunkSize: opts.chunkSize || DEFAULT_CHUNK_SIZE,
115
+ });
116
+ return {
117
+ url: opts.url,
118
+ title: opts.title,
119
+ selector: opts.selector,
120
+ total_chars: md.length,
121
+ chunk_size: chunk.end - chunk.start,
122
+ start: chunk.start,
123
+ end: chunk.end,
124
+ next_start_char: chunk.nextStartChar,
125
+ content: chunk.content,
126
+ };
127
+ }
128
+ export const __extractInternals = {
129
+ DEFAULT_CHUNK_SIZE,
130
+ MIN_CHUNK_SIZE,
131
+ MAX_CHUNK_SIZE,
132
+ };
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,129 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { buildExtractHtmlJs, chunkMarkdown, runExtractFromHtml } from './extract.js';
3
+ describe('chunkMarkdown', () => {
4
+ it('returns the full content when it fits in one chunk', () => {
5
+ const content = 'short body';
6
+ const r = chunkMarkdown({ content, start: 0, chunkSize: 20000 });
7
+ expect(r.content).toBe(content);
8
+ expect(r.start).toBe(0);
9
+ expect(r.end).toBe(content.length);
10
+ expect(r.nextStartChar).toBeNull();
11
+ });
12
+ it('emits next_start_char when more content remains', () => {
13
+ // Build content long enough that chunkSize cuts it mid-stream.
14
+ const para = 'p'.repeat(400);
15
+ const content = [para, para, para].join('\n\n');
16
+ const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
17
+ expect(r.nextStartChar).not.toBeNull();
18
+ expect(r.nextStartChar).toBeGreaterThan(0);
19
+ expect(r.nextStartChar).toBeLessThan(content.length);
20
+ });
21
+ it('prefers to break at a paragraph boundary inside the boundary window', () => {
22
+ // chunkSize=500, window=15% → [425, 500). Place `\n\n` at 450 so it lands
23
+ // inside the window; the chunker should snap the cut back to it.
24
+ const a = 'a'.repeat(450);
25
+ const b = 'b'.repeat(400);
26
+ const content = `${a}\n\n${b}`;
27
+ const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
28
+ expect(r.content.endsWith('\n\n')).toBe(true);
29
+ expect(r.nextStartChar).toBe(r.end);
30
+ expect(content.slice(r.end).startsWith('b')).toBe(true);
31
+ });
32
+ it('falls back to a single newline when no paragraph boundary is in window', () => {
33
+ // 6 lines × 90 chars joined by `\n` → `\n` at 90, 181, 272, 363, 454.
34
+ // chunkSize=500 with window [425, 500) catches the `\n` at 454.
35
+ const line = 'l'.repeat(90);
36
+ const content = Array.from({ length: 6 }, () => line).join('\n');
37
+ const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
38
+ expect(r.content.endsWith('\n')).toBe(true);
39
+ expect(content.slice(r.end).startsWith('l')).toBe(true);
40
+ });
41
+ it('hard-cuts when no boundary is found within the window', () => {
42
+ const content = 'x'.repeat(5000);
43
+ const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
44
+ expect(r.end).toBe(500);
45
+ expect(r.content).toHaveLength(500);
46
+ expect(r.nextStartChar).toBe(500);
47
+ });
48
+ it('handles start >= content.length with an empty final chunk', () => {
49
+ const content = 'hello';
50
+ const r = chunkMarkdown({ content, start: 5, chunkSize: 100 });
51
+ expect(r.content).toBe('');
52
+ expect(r.nextStartChar).toBeNull();
53
+ });
54
+ it('resumes from a provided start cursor until the stream terminates', () => {
55
+ const content = `${'a'.repeat(100)}\n\n${'b'.repeat(100)}\n\n${'c'.repeat(100)}`;
56
+ const first = chunkMarkdown({ content, start: 0, chunkSize: 110 });
57
+ expect(first.nextStartChar).not.toBeNull();
58
+ const second = chunkMarkdown({ content, start: first.nextStartChar, chunkSize: 110 });
59
+ expect(second.start).toBe(first.nextStartChar);
60
+ expect(second.content.length).toBeGreaterThan(0);
61
+ let cursor = second.nextStartChar;
62
+ let safety = 20;
63
+ while (cursor !== null && safety-- > 0) {
64
+ const step = chunkMarkdown({ content, start: cursor, chunkSize: 110 });
65
+ cursor = step.nextStartChar;
66
+ }
67
+ expect(cursor).toBeNull();
68
+ });
69
+ it('clamps chunk size to the configured minimum', () => {
70
+ const content = 'a'.repeat(2000);
71
+ const r = chunkMarkdown({ content, start: 0, chunkSize: 1 });
72
+ // MIN_CHUNK_SIZE is 100 — requesting 1 should still produce >= 100 chars.
73
+ expect(r.end).toBeGreaterThanOrEqual(100);
74
+ });
75
+ });
76
+ describe('runExtractFromHtml', () => {
77
+ it('converts HTML to markdown and wraps it in the chunking envelope', () => {
78
+ const html = '<article><h1>Title</h1><p>Hello <strong>world</strong>.</p></article>';
79
+ const r = runExtractFromHtml({
80
+ html,
81
+ url: 'https://example.com/a',
82
+ title: 'Example',
83
+ selector: 'article',
84
+ start: 0,
85
+ chunkSize: 20000,
86
+ });
87
+ expect(r.url).toBe('https://example.com/a');
88
+ expect(r.title).toBe('Example');
89
+ expect(r.selector).toBe('article');
90
+ expect(r.content).toContain('# Title');
91
+ expect(r.content).toContain('**world**');
92
+ expect(r.start).toBe(0);
93
+ expect(r.end).toBe(r.content.length);
94
+ expect(r.total_chars).toBe(r.content.length);
95
+ expect(r.next_start_char).toBeNull();
96
+ });
97
+ it('reports total_chars and chunk_size against the final markdown', () => {
98
+ const body = Array.from({ length: 30 }, (_, i) => `<p>paragraph ${i} ${'x'.repeat(200)}</p>`).join('');
99
+ const r = runExtractFromHtml({
100
+ html: `<main>${body}</main>`,
101
+ url: 'https://example.com/b',
102
+ title: 't',
103
+ selector: 'main',
104
+ start: 0,
105
+ chunkSize: 500,
106
+ });
107
+ expect(r.total_chars).toBeGreaterThan(r.end);
108
+ expect(r.chunk_size).toBe(r.end - r.start);
109
+ expect(r.next_start_char).toBe(r.end);
110
+ });
111
+ });
112
+ describe('buildExtractHtmlJs', () => {
113
+ it('embeds the selector as a JSON literal', () => {
114
+ const js = buildExtractHtmlJs('main.article');
115
+ expect(js).toContain('"main.article"');
116
+ });
117
+ it('uses null when no selector given', () => {
118
+ const js = buildExtractHtmlJs(null);
119
+ // With no selector, `sel` is the literal null, so the script falls through to the default root.
120
+ expect(js).toContain('const sel = null;');
121
+ });
122
+ it('includes the denoise selector list', () => {
123
+ const js = buildExtractHtmlJs(null);
124
+ expect(js).toContain("'script'");
125
+ expect(js).toContain("'nav'");
126
+ expect(js).toContain("'iframe'");
127
+ expect(js).toContain("'[aria-hidden=\"true\"]'");
128
+ });
129
+ });
@@ -0,0 +1,76 @@
1
+ /**
2
+ * `browser find --css <sel>` — structured CSS query.
3
+ *
4
+ * Returns every match of a selector as a JSON envelope agents can read
5
+ * without parsing free-text snapshot output. Each entry carries two
6
+ * identifiers — a numeric `ref` (matching the snapshot contract) and a
7
+ * stable 0-based `nth` — so the agent can act on a specific result via
8
+ * either path:
9
+ *
10
+ * browser click <ref> // when ref is numeric
11
+ * browser click "<sel>" --nth <n> // always works
12
+ *
13
+ * Refs are *allocated on the spot* for matched elements that were not
14
+ * tagged by a prior snapshot: `data-opencli-ref` is set on the element
15
+ * and a fingerprint is written into `window.__opencli_ref_identity`
16
+ * (same shape the snapshot uses). That makes `find` a first-class entry
17
+ * point to the ref system — agents can skip running `browser state`
18
+ * when they already know the selector.
19
+ *
20
+ * Attributes are whitelisted to keep output small and high-signal.
21
+ * Invisible elements are still returned so agents can reason about
22
+ * offscreen vs truly-missing targets.
23
+ *
24
+ * When a matched element is a compound form control (date-like input,
25
+ * select, file input), the entry gains a `compound` field with the
26
+ * rich view from `compound.ts`. This is what kills the three biggest
27
+ * agent-fail modes on form pages (wrong date format, guessed options,
28
+ * re-uploaded files) without forcing agents to probe further.
29
+ */
30
+ import { type CompoundInfo } from './compound.js';
31
+ /** Whitelist of attributes surfaced per entry. Keep small; agents do not need full DOM dumps. */
32
+ export declare const FIND_ATTR_WHITELIST: readonly ["id", "class", "name", "type", "placeholder", "aria-label", "title", "href", "value", "role", "data-testid"];
33
+ export interface FindEntry {
34
+ /** Zero-based position within the match set — pair with `--nth` on downstream commands. */
35
+ nth: number;
36
+ /**
37
+ * Numeric data-opencli-ref. Find assigns one if the element was not
38
+ * tagged by a prior snapshot, so downstream `browser click <ref>` works
39
+ * directly off the find output without requiring `browser state` first.
40
+ */
41
+ ref: number;
42
+ tag: string;
43
+ role: string;
44
+ text: string;
45
+ attrs: Record<string, string>;
46
+ visible: boolean;
47
+ /**
48
+ * Rich view for date / time / datetime-local / month / week / select /
49
+ * file inputs. Omitted (undefined) for all other element types. See
50
+ * `compound.ts` for the shape.
51
+ */
52
+ compound?: CompoundInfo;
53
+ }
54
+ export interface FindResult {
55
+ matches_n: number;
56
+ entries: FindEntry[];
57
+ }
58
+ export interface FindError {
59
+ error: {
60
+ code: 'invalid_selector' | 'selector_not_found';
61
+ message: string;
62
+ hint?: string;
63
+ };
64
+ }
65
+ export interface FindOptions {
66
+ /** Max entries returned. Default 50 — enough to pick from without flooding context. */
67
+ limit?: number;
68
+ /** Max chars of trimmed text per entry. Default 120. */
69
+ textMax?: number;
70
+ }
71
+ /**
72
+ * Build the browser-side JS that performs the CSS query and emits the
73
+ * FindResult (or FindError) envelope. Evaluated inside `page.evaluate`.
74
+ */
75
+ export declare function buildFindJs(selector: string, opts?: FindOptions): string;
76
+ export declare function isFindError(result: unknown): result is FindError;