npm - @oh-my-pi/pi-coding-agent - Versions diffs - 15.11.4 → 15.11.7 - Mend

@oh-my-pi/pi-coding-agent 15.11.4 → 15.11.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98)

package/CHANGELOG.md +82 -1
package/dist/cli.js +520 -451
package/dist/types/cli/bench-cli.d.ts +78 -0
package/dist/types/cli/usage-cli.d.ts +10 -1
package/dist/types/commands/bench.d.ts +29 -0
package/dist/types/commands/usage.d.ts +9 -0
package/dist/types/config/model-resolver.d.ts +3 -2
package/dist/types/config/settings-schema.d.ts +125 -3
package/dist/types/edit/renderer.d.ts +1 -0
package/dist/types/modes/components/oauth-selector.d.ts +10 -1
package/dist/types/modes/components/reset-usage-selector.d.ts +12 -0
package/dist/types/modes/components/session-selector.d.ts +1 -1
package/dist/types/modes/components/settings-selector.d.ts +8 -1
package/dist/types/modes/components/snapcompact-shape-preview.d.ts +31 -0
package/dist/types/modes/components/tool-execution.d.ts +18 -0
package/dist/types/modes/controllers/selector-controller.d.ts +1 -0
package/dist/types/modes/interactive-mode.d.ts +10 -0
package/dist/types/modes/session-observer-registry.d.ts +2 -0
package/dist/types/modes/setup-wizard/scenes/sign-in.d.ts +3 -0
package/dist/types/modes/setup-wizard/scenes/types.d.ts +10 -1
package/dist/types/modes/setup-wizard/scenes/web-search.d.ts +3 -0
package/dist/types/modes/types.d.ts +2 -0
package/dist/types/modes/utils/context-usage.d.ts +6 -1
package/dist/types/session/agent-session.d.ts +14 -1
package/dist/types/session/auth-storage.d.ts +1 -1
package/dist/types/session/codex-auto-reset.d.ts +107 -0
package/dist/types/session/snapcompact-inline.d.ts +107 -4
package/dist/types/slash-commands/helpers/reset-usage.d.ts +27 -0
package/dist/types/task/render.d.ts +1 -0
package/dist/types/tools/bash.d.ts +2 -0
package/dist/types/tools/eval-render.d.ts +1 -0
package/dist/types/tools/renderers.d.ts +13 -0
package/dist/types/tools/ssh.d.ts +1 -0
package/dist/types/tools/todo.d.ts +0 -11
package/package.json +11 -11
package/src/cli/bench-cli.ts +437 -0
package/src/cli/usage-cli.ts +187 -16
package/src/cli-commands.ts +1 -0
package/src/commands/bench.ts +42 -0
package/src/commands/usage.ts +8 -0
package/src/config/model-registry.ts +52 -5
package/src/config/model-resolver.ts +36 -5
package/src/config/settings-schema.ts +148 -3
package/src/config/settings.ts +9 -0
package/src/edit/renderer.ts +5 -0
package/src/hindsight/client.ts +26 -1
package/src/hindsight/state.ts +6 -2
package/src/internal-urls/docs-index.generated.ts +2 -2
package/src/mcp/transports/stdio.ts +81 -7
package/src/modes/components/oauth-selector.ts +67 -7
package/src/modes/components/reset-usage-selector.ts +161 -0
package/src/modes/components/session-selector.ts +8 -2
package/src/modes/components/settings-selector.ts +89 -47
package/src/modes/components/snapcompact-shape-preview-doc.md +11 -0
package/src/modes/components/snapcompact-shape-preview.ts +192 -0
package/src/modes/components/tool-execution.ts +26 -0
package/src/modes/components/transcript-container.ts +23 -1
package/src/modes/controllers/command-controller.ts +24 -1
package/src/modes/controllers/input-controller.ts +8 -6
package/src/modes/controllers/selector-controller.ts +72 -2
package/src/modes/interactive-mode.ts +83 -0
package/src/modes/session-observer-registry.ts +61 -3
package/src/modes/setup-wizard/index.ts +1 -0
package/src/modes/setup-wizard/scenes/glyph.ts +24 -6
package/src/modes/setup-wizard/scenes/providers.ts +36 -2
package/src/modes/setup-wizard/scenes/sign-in.ts +10 -1
package/src/modes/setup-wizard/scenes/theme.ts +28 -1
package/src/modes/setup-wizard/scenes/types.ts +10 -1
package/src/modes/setup-wizard/scenes/web-search.ts +22 -6
package/src/modes/setup-wizard/wizard-overlay.ts +38 -1
package/src/modes/theme/theme.ts +2 -2
package/src/modes/types.ts +2 -0
package/src/modes/utils/context-usage.ts +75 -1
package/src/prompts/bench.md +7 -0
package/src/prompts/system/snapcompact-context-frames-note.md +1 -0
package/src/prompts/system/snapcompact-context-stub.md +1 -0
package/src/prompts/system/snapcompact-toolresult-note.md +1 -1
package/src/prompts/tools/browser.md +33 -43
package/src/prompts/tools/eval.md +27 -50
package/src/prompts/tools/irc.md +29 -31
package/src/prompts/tools/read.md +31 -37
package/src/prompts/tools/todo.md +1 -2
package/src/sdk.ts +4 -2
package/src/session/agent-session.ts +136 -6
package/src/session/auth-storage.ts +3 -0
package/src/session/codex-auto-reset.ts +190 -0
package/src/session/snapcompact-inline.ts +404 -75
package/src/slash-commands/builtin-registry.ts +145 -8
package/src/slash-commands/helpers/context-report.ts +28 -1
package/src/slash-commands/helpers/reset-usage.ts +66 -0
package/src/slash-commands/helpers/usage-report.ts +12 -0
package/src/task/index.ts +30 -7
package/src/task/render.ts +34 -19
package/src/tools/bash.ts +3 -0
package/src/tools/eval-render.ts +4 -0
package/src/tools/renderers.ts +13 -0
package/src/tools/ssh.ts +3 -0
package/src/tools/todo.ts +8 -128

package/src/modes/setup-wizard/scenes/web-search.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { type SelectItem, SelectList, truncateToWidth } from "@oh-my-pi/pi-tui";
+import { type SelectItem, SelectList, type SgrMouseEvent, truncateToWidth } from "@oh-my-pi/pi-tui";
 import { SETTINGS_SCHEMA } from "../../../config/settings-schema";
 import { getSearchProvider, setPreferredSearchProvider } from "../../../web/search/provider";
 import { isSearchProviderPreference, type SearchProviderId } from "../../../web/search/types";
@@ -31,6 +31,8 @@ export class WebSearchTab implements SetupTab {
 	#availability = new Map<SearchProviderId, Availability>();
 	#status: string[] = [];
 	#disposed = false;
+	/** Render line where the select list begins. */
+	#listRowStart = 0;
 	constructor(private readonly host: SetupSceneHost) {
 		this.#list = new SelectList(WEB_SEARCH_ITEMS, MAX_VISIBLE, getSelectListTheme());
@@ -55,6 +57,22 @@ export class WebSearchTab implements SetupTab {
 		this.#list.handleInput(data);
 	}
+	/** Wheel moves the highlight; hover lights the row under the pointer; click confirms it. */
+	routeMouse(event: SgrMouseEvent, line: number, _col: number): void {
+		if (event.wheel !== null) {
+			this.#list.handleWheel(event.wheel);
+			return;
+		}
+		const index = this.#list.hitTest(line - this.#listRowStart);
+		if (event.motion) {
+			this.#list.setHoverIndex(index ?? null);
+			return;
+		}
+		if (event.leftClick && index !== undefined) {
+			this.#list.clickItem(index);
+		}
+	}
 	invalidate(): void {
 		this.#list.invalidate();
 	}
@@ -64,11 +82,9 @@ export class WebSearchTab implements SetupTab {
 	}
 	render(width: number): readonly string[] {
-		const lines = [
-			theme.fg("muted", "Choose the provider the web_search tool should prefer."),
-			"",
-			...this.#list.render(width),
-		];
+		const lines = [theme.fg("muted", "Choose the provider the web_search tool should prefer."), ""];
+		this.#listRowStart = lines.length;
+		lines.push(...this.#list.render(width));
 		const selected = this.#list.getSelectedItem();
 		if (selected) {
 			lines.push("", ...this.#readinessLines(selected.value).map(line => truncateToWidth(line, width)));

package/src/modes/setup-wizard/wizard-overlay.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { type Component, matchesKey, padding, truncateToWidth, visibleWidth } from "@oh-my-pi/pi-tui";
+import { type Component, matchesKey, padding, parseSgrMouse, truncateToWidth, visibleWidth } from "@oh-my-pi/pi-tui";
 import { APP_NAME } from "@oh-my-pi/pi-utils";
 import { gradientLogo, PI_LOGO } from "../components/welcome";
 import { theme } from "../theme/theme";
@@ -61,6 +61,8 @@ export class SetupWizardComponent implements Component {
 	#timer: NodeJS.Timeout | undefined;
 	#done = Promise.withResolvers<void>();
 	#disposed = false;
+	/** Screen row where the active scene's body began in the last rendered frame. */
+	#bodyRowStart = 0;
 	constructor(
 		readonly ctx: InteractiveModeContext,
@@ -87,6 +89,10 @@ export class SetupWizardComponent implements Component {
 	handleInput(data: string): void {
 		if (this.#phase === "done") return;
+		if (data.startsWith("\x1b[<")) {
+			this.#handleMouse(data);
+			return;
+		}
 		if (matchesKey(data, "ctrl+c")) {
 			this.#beginOutro();
 			return;
@@ -116,6 +122,36 @@ export class SetupWizardComponent implements Component {
 		this.#activeScene?.handleInput?.(data);
 	}
+	/**
+	 * Mouse handling for the fullscreen wizard (SGR tracking is on while the
+	 * overlay holds the alternate screen). The frame paints from screen row 0,
+	 * so report coordinates index directly into the last rendered lines: scene
+	 * body rows start at #bodyRowStart, indented by SCENE_MARGIN_X. Scenes
+	 * that implement routeMouse get hit-tested events (wheel, hover, click);
+	 * for the rest a wheel notch falls back to an arrow key. A left click
+	 * advances the splash/outro like Enter. Raw reports never reach scene
+	 * keyboard input.
+	 */
+	#handleMouse(data: string): void {
+		const event = parseSgrMouse(data);
+		if (!event) return;
+		if (this.#phase === "splash" || this.#phase === "outro") {
+			if (!event.leftClick) return;
+			if (this.#phase === "splash") this.#beginScene();
+			else this.#complete();
+			return;
+		}
+		const scene = this.#activeScene;
+		if (!scene) return;
+		if (scene.routeMouse) {
+			scene.routeMouse(event, event.row - this.#bodyRowStart, event.col - SCENE_MARGIN_X);
+			return;
+		}
+		if (event.wheel !== null) {
+			scene.handleInput?.(event.wheel === -1 ? "\x1b[A" : "\x1b[B");
+		}
+	}
 	render(width: number): readonly string[] {
 		const safeWidth = Math.max(1, width);
 		const height = Math.max(1, this.ctx.ui.terminal.rows);
@@ -163,6 +199,7 @@ export class SetupWizardComponent implements Component {
 			header.push(indentLine(theme.fg("muted", subtitle), width, SCENE_MARGIN_X));
 		}
 		header.push("");
+		this.#bodyRowStart = header.length;
 		const footer = [
 			"",

package/src/modes/theme/theme.ts CHANGED Viewed

@@ -715,7 +715,7 @@ const NERD_SYMBOLS: SymbolMap = {
 	"tool.debug": "\uEAD8",
 	"tool.mcp": "\uEB2D",
 	"tool.job": "\uEBA2",
-	"tool.task": "\uEA7E",
+	"tool.task": "\uf4a0",
 	"tool.todo": "\uEAB3",
 	"tool.memory": "\uEACE",
 	"tool.ask": "\uEAC7",
@@ -2762,7 +2762,7 @@ export function getSettingsListTheme(): SettingsListTheme {
 		label: (text: string, selected: boolean, changed: boolean) =>
 			changed ? theme.fg("statusLineGitDirty", text) : selected ? theme.fg("accent", text) : text,
 		value: (text: string, selected: boolean, changed: boolean) =>
-			selected ? theme.fg("accent", text) : changed ? theme.fg("statusLineGitDirty", text) : theme.fg("muted", text),
+			changed ? theme.fg("statusLineGitDirty", text) : selected ? theme.fg("accent", text) : theme.fg("muted", text),
 		description: (text: string) => theme.fg("dim", text),
 		cursor: theme.fg("accent", `${theme.nav.cursor} `),
 		hint: (text: string) => theme.fg("dim", text),

package/src/modes/types.ts CHANGED Viewed

@@ -81,6 +81,7 @@ export interface InteractiveModeContext {
 	pendingMessagesContainer: Container;
 	statusContainer: Container;
 	todoContainer: Container;
+	subagentContainer: Container;
 	btwContainer: Container;
 	omfgContainer: Container;
 	errorBannerContainer: Container;
@@ -287,6 +288,7 @@ export interface InteractiveModeContext {
 	handleResumeSession(sessionPath: string): Promise<void>;
 	handleSessionDeleteCommand(): Promise<void>;
 	showOAuthSelector(mode: "login" | "logout", providerId?: string): Promise<void>;
+	showResetUsageSelector(): Promise<void>;
 	showProviderSetup(): Promise<void>;
 	showHookConfirm(title: string, message: string): Promise<boolean>;
 	showDebugSelector(): Promise<void>;

package/src/modes/utils/context-usage.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import { countTokens } from "@oh-my-pi/pi-natives";
 import { formatNumber } from "@oh-my-pi/pi-utils";
 import type { Skill } from "../../extensibility/skills";
 import type { AgentSession } from "../../session/agent-session";
+import { estimateInlineSavings, type SnapcompactSavingsEstimate } from "../../session/snapcompact-inline";
 import type { Tool } from "../../tools";
 import type { theme as Theme } from "../theme/theme";
@@ -36,6 +37,8 @@ export interface ContextBreakdown {
 	usedTokens: number;
 	autoCompactBufferTokens: number;
 	freeTokens: number;
+	/** Estimated snapcompact wire savings; set when requested and a snapcompact.* setting is enabled. */
+	snapcompact?: SnapcompactSavingsEstimate;
 }
 const EMPTY_STRING_PARTS: readonly string[] = [];
@@ -109,7 +112,10 @@ function computeNonMessageBreakdown(session: AgentSession): {
  * Compute a breakdown of estimated context usage by category for the active
  * session and model.
  */
-export function computeContextBreakdown(session: AgentSession): ContextBreakdown {
+export function computeContextBreakdown(
+	session: AgentSession,
+	options?: { snapcompactSavings?: boolean },
+): ContextBreakdown {
 	const model = session.model;
 	const contextWindow = model?.contextWindow ?? 0;
@@ -169,6 +175,22 @@ export function computeContextBreakdown(session: AgentSession): ContextBreakdown
 	const freeTokens = Math.max(0, contextWindow - usedTokens - autoCompactBufferTokens);
+	// Estimated wire savings from snapcompact inline imaging. Opt-in: only the
+	// /context surfaces need it; other callers skip the extra token counting.
+	let snapcompactSavings: SnapcompactSavingsEstimate | undefined;
+	if (options?.snapcompactSavings) {
+		const renderSystemPrompt = session.settings.get("snapcompact.systemPrompt");
+		const renderToolResults = session.settings.get("snapcompact.toolResults");
+		if (renderSystemPrompt !== "none" || renderToolResults) {
+			snapcompactSavings = estimateInlineSavings({
+				options: { renderSystemPrompt, renderToolResults, shape: session.settings.get("snapcompact.shape") },
+				model,
+				systemPrompt: session.systemPrompt ?? [],
+				messages: session.messages ?? [],
+			});
+		}
+	}
 	return {
 		model,
 		contextWindow,
@@ -176,6 +198,7 @@ export function computeContextBreakdown(session: AgentSession): ContextBreakdown
 		usedTokens,
 		autoCompactBufferTokens,
 		freeTokens,
+		snapcompact: snapcompactSavings,
 	};
 }
@@ -298,6 +321,57 @@ function buildLegendLines(breakdown: ContextBreakdown, theme: typeof Theme): str
 		);
 	}
+	const snap = breakdown.snapcompact;
+	if (snap) {
+		lines.push("");
+		if (!snap.visionCapable) {
+			lines.push(theme.fg("muted", "Snapcompact: inactive (model has no image input)"));
+		} else {
+			lines.push(theme.fg("muted", "Snapcompact (estimated wire savings)"));
+			if (snap.systemPrompt) {
+				const sp = snap.systemPrompt;
+				if (sp.applied) {
+					lines.push(
+						`  System prompt (${sp.scope === "agents-md" ? "AGENTS.md" : "all"}): saves ${theme.bold(`~${formatNumber(sp.savedTokens)}`)} ` +
+							theme.fg(
+								"dim",
+								`(${formatNumber(sp.textTokens)} text → ${sp.frames} frame${sp.frames === 1 ? "" : "s"} ≈ ${formatNumber(sp.imageTokens)})`,
+							),
+					);
+				} else {
+					const reason =
+						sp.reason === "budget"
+							? "image budget exhausted"
+							: sp.reason === "empty"
+								? "nothing to image"
+								: "frames would not save tokens";
+					lines.push(
+						`  System prompt (${sp.scope === "agents-md" ? "AGENTS.md" : "all"}): ${theme.fg("dim", `stays text (${reason})`)}`,
+					);
+				}
+			}
+			if (snap.toolResults) {
+				const tr = snap.toolResults;
+				if (tr.swapped > 0) {
+					lines.push(
+						`  Tool results: saves ${theme.bold(`~${formatNumber(tr.savedTokens)}`)} ` +
+							theme.fg(
+								"dim",
+								`(${tr.swapped}/${tr.total} imaged, ${formatNumber(tr.textTokens)} text → ${tr.frames} frames ≈ ${formatNumber(tr.imageTokens)})`,
+							),
+					);
+				} else {
+					lines.push(`  Tool results: ${theme.fg("dim", `none imaged (${tr.total} in history)`)}`);
+				}
+			}
+			if (snap.savedTokens > 0) {
+				lines.push(
+					`  Next request: ${theme.bold(`~${formatNumber(Math.max(0, usedTokens - snap.savedTokens))}`)} ${theme.fg("dim", "tokens on the wire")}`,
+				);
+			}
+		}
+	}
 	return lines;
 }

package/src/prompts/bench.md ADDED Viewed

@@ -0,0 +1,7 @@
+Write a continuous, plain-prose technical explanation of how a relational database executes a SQL query: lexing and parsing, semantic analysis, logical plan construction, cost-based optimization, physical operator selection, and row-by-row execution through the iterator model.
+Form:
+- Plain paragraphs only: no headings, no lists, no code fences, no preamble.
+- Do not wrap up early or summarize; keep writing until you are cut off.
+Output only the explanation.

package/src/prompts/system/snapcompact-context-frames-note.md ADDED Viewed

@@ -0,0 +1 @@
+=== CONTEXT FILE INSTRUCTIONS — read the image(s) below as the loaded context files replaced in the system prompt ===

package/src/prompts/system/snapcompact-context-stub.md ADDED Viewed

@@ -0,0 +1 @@
+Loaded context-file instructions were moved to PNG image(s) attached below at the start of the first user message. Read every frame in order where this marker appears, then apply those instructions as if the original context-file text remained here.

package/src/prompts/system/snapcompact-toolresult-note.md CHANGED Viewed

@@ -1 +1 @@
-[~~Rasterized~~]
+[The result of this tool call is in the PNG frame(s) below — read them as the output; they contain it verbatim. Delivering it as an image is deliberate harness behavior to save context, not a tool malfunction. NEVER re-run the call or report a tool issue because of it.]

package/src/prompts/tools/browser.md CHANGED Viewed

@@ -1,40 +1,39 @@
 Drives real Chromium tab; full puppeteer access via JS execution.
 <instruction>
-- For static web content (articles, docs, issues/PRs, JSON, PDFs, feeds), prefer `read` tool with URL — reader-mode text without spinning up browser. Use this tool when you need JS execution, authentication, or interactive actions.
-- Three actions only:
-  - `open` — acquire or reuse named tab. `name` defaults `"main"`. Optional `url` navigates after tab ready. Optional `viewport` sets dimensions. Optional `dialogs: "accept" | "dismiss"` auto-handles `alert`/`confirm`/`beforeunload` so navigation/clicks don't hang; by default dialogs are unhandled and the page hangs until you wire `page.on('dialog', …)`.
-  - `close` — release tab by `name`, or every tab with `all: true`. For spawned-app browsers, set `kill: true` to terminate process tree (default leaves running).
-  - `run` — execute JS against existing tab. `code` is body of async function with `page`, `browser`, `tab`, `display`, `assert`, `wait` in scope. Function's return value JSON-stringified into tool result; multiple `display(value)` calls accumulate text/images.
-- Tabs survive across `run` calls and across in-process subagents. Open once, reuse many times.
-- Browser kinds, selected by `app` field on `open`:
+- Static content (articles, docs, issues/PRs, JSON, PDFs, feeds)? Use `read` with the URL. Reach for browser only for JS execution, authentication, or interactive actions.
+- Three actions:
+  - `open` — acquire or reuse named tab (`name` defaults `"main"`). Optional `url` (navigate once ready), `viewport`, `dialogs: "accept" | "dismiss"` (auto-handle `alert`/`confirm`/`beforeunload`; unhandled dialogs hang the page until you wire `page.on('dialog', …)`).
+  - `close` — release tab by `name`, or every tab with `all: true`. `kill: true` also terminates spawned-app process trees (default leaves them running).
+  - `run` — execute JS in an existing tab. `code` is the body of an async function with `page`, `browser`, `tab`, `display`, `assert`, `wait` in scope. Return value is JSON-stringified into the result; `display(value)` calls accumulate text/images.
+- Tabs survive across `run` calls and in-process subagents — open once, reuse.
+- Browser kinds (`app` field on `open`):
   - default (no `app`) → headless Chromium with stealth patches.
-  - `app.path` → spawn absolute binary (Electron/CDP); a running instance with an open CDP port is reused. No stealth patches — NEVER tamper with real desktop app.
+  - `app.path` → spawn absolute binary (Electron/CDP); a running instance with an open CDP port is reused. No stealth patches — NEVER tamper with a real desktop app.
   - `app.cdp_url` → connect to existing CDP endpoint (e.g. `http://127.0.0.1:9222`).
-  - `app.target` (with `path`/`cdp_url`) — substring matched against url+title to pick BrowserWindow when app exposes several.
-- Inside `run`, `tab` exposes high-level helpers; reach for `page` (raw puppeteer Page) when you need anything they don't cover.
-  - `tab.goto(url, { waitUntil? })` — clears element cache and navigates.
-  - `tab.observe({ includeAll?, viewportOnly? })` — accessibility snapshot. Returns `{ url, title, viewport, scroll, elements: [{ id, role, name, value, states, … }] }`. Element ids stable until next observe/goto.
-  - `tab.id(n)` — resolves element id from most recent observe to real `ElementHandle` you can `.click()`, `.type()`, etc.
-  - `tab.click(selector)` / `tab.type(selector, text)` / `tab.fill(selector, value)` / `tab.press(key, { selector? })` / `tab.scroll(dx, dy)` — selector-based actions.
-  - `tab.waitFor(selector)` — waits until selector attached, returns resolved `ElementHandle` for chaining (e.g. `const btn = await tab.waitFor('text/Submit'); await btn.click();`).
-  - `tab.drag(from, to)` — drag from one point to another. Each endpoint either selector string (drag center-to-center) or `{ x, y }` viewport-coordinate point (for canvases, sliders).
-  - `tab.scrollIntoView(selector)` — scroll matching element to center of viewport (use before clicking off-screen elements).
-  - `tab.select(selector, …values)` — set selected option(s) on `<select>`. Returns values that ended up selected. `tab.fill` NEVER works for selects.
-  - `tab.uploadFile(selector, …filePaths)` — attach files to `<input type="file">`. Paths resolve relative to cwd.
-  - `tab.waitForUrl(pattern, { timeout? })` — pattern substring or `RegExp`. Polls `location.href` so works for SPA pushState navigations, not just real navigations. Returns matched URL.
-  - `tab.waitForResponse(pattern, { timeout? })` — pattern substring, `RegExp`, or `(response) => boolean`. Returns raw puppeteer `HTTPResponse` (call `.text()` / `.json()` / `.status()` / `.headers()` on it).
-  - `tab.evaluate(fn, …args)` — sugar for `page.evaluate` with abort signal already wired. Use this instead of dropping to `page.evaluate` for ad-hoc DOM reads.
-  - `tab.screenshot({ selector?, fullPage?, save?, silent? })` — captures a screenshot and attaches it for you to view (`silent: true` skips attaching). Pass `save` (a path) only when a later step needs the file; never just to look.
-  - `tab.extract(format = "markdown")` — returns Readability-extracted page content as a string (`"markdown"` or `"text"`). Throws if the page yields no readable content.
-- Selectors accept CSS plus puppeteer query handlers: `aria/Sign in`, `text/Continue`, `xpath/…`, `pierce/…`. Playwright-style `p-aria/[name="…"]`, `p-text/…` normalized.
-- Default `tab.observe()` over `tab.screenshot()` for page state. Screenshot only when visual appearance matters.
+  - `app.target` (with `path`/`cdp_url`) — substring matched against url+title to pick a BrowserWindow.
+- `tab` helpers; drop to raw puppeteer `page` for anything they don't cover:
+  - `tab.goto(url, { waitUntil? })` — navigate; clears element cache.
+  - `tab.observe({ includeAll?, viewportOnly? })` — accessibility snapshot: `{ url, title, viewport, scroll, elements: [{ id, role, name, value, states, … }] }`. Ids stable until next observe/goto.
+  - `tab.id(n)` — element id from last observe → `ElementHandle` (`.click()`, `.type()`, …).
+  - `tab.click(selector)` / `tab.type(selector, text)` / `tab.fill(selector, value)` / `tab.press(key, { selector? })` / `tab.scroll(dx, dy)`.
+  - `tab.waitFor(selector)` — wait until attached; returns the `ElementHandle`.
+  - `tab.drag(from, to)` — endpoints: selector (center-to-center) or `{ x, y }` viewport point (canvases, sliders).
+  - `tab.scrollIntoView(selector)` — center element in viewport; use before clicking off-screen elements.
+  - `tab.select(selector, …values)` — set `<select>` option(s); returns resulting selection. `tab.fill` NEVER works for selects.
+  - `tab.uploadFile(selector, …filePaths)` — attach files to `<input type="file">`; paths relative to cwd.
+  - `tab.waitForUrl(pattern, { timeout? })` — substring or `RegExp`; polls `location.href` (catches SPA pushState). Returns matched URL.
+  - `tab.waitForResponse(pattern, { timeout? })` — substring, `RegExp`, or `(response) => boolean`; returns puppeteer `HTTPResponse` (`.text()`/`.json()`/`.status()`/`.headers()`).
+  - `tab.evaluate(fn, …args)` — `page.evaluate` with abort signal wired; use for ad-hoc DOM reads.
+  - `tab.screenshot({ selector?, fullPage?, save?, silent? })` — capture and attach for viewing (`silent: true` skips). Pass `save` (a path) only when a later step needs the file.
+  - `tab.extract(format = "markdown")` — Readability-extracted content (`"markdown"` | `"text"`); throws when nothing readable.
+- Selectors: CSS plus puppeteer handlers `aria/Sign in`, `text/Continue`, `xpath/…`, `pierce/…`; Playwright-style `p-aria/…`, `p-text/…` normalized.
 </instruction>
 <critical>
-- MUST call `open` before `run`. `run` does not implicitly create tab.
-- NEVER screenshot just to "see what's on page" — `tab.observe()` returns structured data with element ids you can act on immediately.
-- After `tab.goto()` or any navigation, prior element ids from `tab.observe()` invalidated. Re-observe before referencing them.
+- MUST `open` before `run` — `run` never creates a tab.
+- Default to `tab.observe()` for page state — structured data with actionable element ids. Screenshot ONLY when visual appearance matters.
+- Navigation invalidates element ids — re-observe before using them.
 - `code` runs with full Node access. Treat as your code, not sandboxed code.
 </critical>
@@ -46,28 +45,19 @@ Drives real Chromium tab; full puppeteer access via JS execution.
 # Click an observed element by id
 `{"action":"run","name":"docs","code":"const obs = await tab.observe(); const link = obs.elements.find(e => e.role === 'link' && e.name === 'Sign in'); assert(link, 'Sign in link missing'); await (await tab.id(link.id)).click();"}`
-# Screenshot to look at the page — no save path
-`{"action":"run","name":"docs","code":"await tab.screenshot();"}`
-# Keep a full-page screenshot on disk for a later step
-`{"action":"run","name":"docs","code":"await tab.screenshot({ fullPage: true, save: 'screenshot.png' });"}`
 # Fill and submit a form via selectors
 `{"action":"run","name":"docs","code":"await tab.fill('input[name=email]', 'me@example.com'); await tab.click('text/Continue');"}`
+# Screenshot to look at the page — no save path
+`{"action":"run","name":"docs","code":"await tab.screenshot();"}`
 # Attach to an existing Electron app
 `{"action":"open","name":"cursor","app":{"path":"/Applications/Cursor.app/Contents/MacOS/Cursor"}}`
-# Close one tab (browser stays alive if other tabs reference it)
-`{"action":"close","name":"docs"}`
-# Close every tab; leave spawned apps running
-`{"action":"close","all":true}`
-# Close every tab and kill spawned-app processes too
+# Close every tab and kill spawned-app processes
 `{"action":"close","all":true,"kill":true}`
 </examples>
 <output>
-- Per call: any `display(value)` outputs (text/images) followed by JSON-stringified return value of `code` function. `run` always produces at least status line.
+Per call: `display(value)` outputs (text/images), then the JSON-stringified return value of `code`. `run` always produces at least a status line.
 </output>

package/src/prompts/tools/eval.md CHANGED Viewed

@@ -1,92 +1,69 @@
 Run code in a persistent kernel using a list of cells.
 <instruction>
-Each call submits one or more cells. Cells run in array order. State persists within each language — across cells, tool calls, and subagents spawned with `task`: variables a parent or subagent declares are visible to the other. Lean on this: stage helpers, loaded datasets, or live clients once, then fan out `task` subagents that use them directly. No re-importing, re-fetching, or serializing across the boundary.
+Cells run in array order. State persists per language — across cells, tool calls, and `task` subagents: variables either side defines are visible to the other. Stage helpers, datasets, or live clients once; subagents use them directly — no re-importing or serializing across the boundary.
 Cell fields:
 - `language` — {{#if py}}`"py"` for the IPython kernel{{/if}}{{#ifAll py js}}, {{/ifAll}}{{#if js}}`"js"` for the persistent JavaScript VM{{/if}}.
-- `code` — cell body, verbatim. Newlines, quotes, and indentation are JSON-encoded; no fences, no headers.
-- `title` (optional) — short label shown in the transcript (e.g. `"imports"`, `"load config"`).
-- `timeout` (optional) — per-cell wall-clock budget in seconds (1-3600). Default 30. It bounds the cell's **own** work: compute, `print`/stdout, `log()`/`phase()`, and ordinary tool calls all count. The clock pauses while an `agent()`/`parallel()`/`completion()` call is in flight, so long fanouts and slow completions never need a raised `timeout`. Raise it only for heavy local work or long non-agent tool calls.
-- `reset` (optional) — wipe this cell's language kernel before running.{{#ifAll py js}} Reset is per-language: a `py` cell's reset does not touch the JavaScript VM and vice versa.{{/ifAll}}
+- `code` — cell body, verbatim. Newlines and quotes JSON-encoded; no fences, no headers.
+- `title` (optional) — short transcript label (e.g. `"imports"`).
+- `timeout` (optional) — per-cell wall-clock budget in seconds (1-3600, default 30). Bounds the cell's own work only; the clock pauses while `agent()`/`parallel()`/`completion()` calls are in flight, so fanouts never need a raised `timeout`. Raise it only for heavy local compute or long non-agent tool calls.
+- `reset` (optional) — wipe this cell's language kernel first.{{#ifAll py js}} Per-language: a `py` reset never touches the JS VM.{{/ifAll}}
-**Work incrementally:**
-- One logical step per cell (imports, define, test, use).
-- Pass multiple small cells in one call.
-- Define small reusable functions for individual debugging.
-- Put workflow explanations in the assistant message or `title` — never inside cell code.
-{{#if py}}- Python cells run inside an IPython kernel with a live event loop. Use top-level `await` directly (e.g. `await main()`); `asyncio.run(…)` raises "cannot be called from a running event loop".{{/if}}
-**On failure:** errors identify the failing cell (e.g., "Cell 3 failed"). Resubmit only the fixed cell (or fixed cell + remaining cells).
+Work incrementally: one logical step per cell (imports, define, test, use); pass multiple small cells per call; define small reusable functions for individual debugging. Workflow explanations go in the assistant message or `title`, never inside cell code.
+{{#if py}}Python runs in IPython with a live event loop: use top-level `await` directly; `asyncio.run(…)` raises "cannot be called from a running event loop".{{/if}}
+On failure, errors name the failing cell (e.g. "Cell 3 failed") — resubmit only the fixed cell (plus any remaining cells).
 </instruction>
 <prelude>
-{{#ifAll py js}}Same helpers in both runtimes with the same positional argument order. Python: trailing options as keyword args. JavaScript: trailing options are a single trailing object literal, never positional — passing options positionally (or any extra positional arg) throws. JavaScript helpers are async and `await`able; Python helpers run synchronously.{{else}}{{#if py}}Helpers run synchronously. Trailing options are keyword arguments.{{/if}}{{#if js}}Helpers are async and `await`able. Trailing options are a single trailing object literal, never positional — passing options positionally (or any extra positional arg) throws.{{/if}}{{/ifAll}}
+{{#ifAll py js}}Same helpers in both runtimes, same positional order. Python: helpers run synchronously; trailing options are keyword args. JavaScript: helpers are async and `await`able; trailing options are ONE trailing object literal, never positional (extra positional args throw).{{else}}{{#if py}}Helpers run synchronously. Trailing options are keyword arguments.{{/if}}{{#if js}}Helpers are async and `await`able. Trailing options are ONE trailing object literal, never positional (extra positional args throw).{{/if}}{{/ifAll}}
 ```
 display(value) → None
-    Render a value in the current cell output.
+    Render value in cell output; presentable values (figures, images, dataframes) are shown natively.
 print(value, ...) → None
-    Print to the cell's text output.
+    Print to text output.
 read(path, offset?=1, limit?=None) → str
-    Read file contents as text. offset/limit are 1-indexed line bounds. Accepts `local://…` (resolved to the session-local root, same place `read local://…` reads).
+    Read file as text; offset/limit are 1-indexed lines. Accepts `local://…`.
 write(path, content) → str
-    Write content to a file (creates parent directories). Returns the resolved path. Accepts `local://…` to persist artifacts across turns / share with subagents.
+    Write file (creates parents); returns resolved path. `local://…` persists across turns / subagents.
 append(path, content) → str
-    Append content to a file. Returns the resolved path. Accepts `local://…`.
+    Append to file; returns resolved path. Accepts `local://…`.
 tree(path?=".", max_depth?=3, show_hidden?=False) → str
-    Render a directory tree.
+    Directory tree.
 diff(a, b) → str
-    Unified diff between two files.
+    Unified diff of two files.
 env(key?=None, value?=None) → str | None | dict
-    No args → full environment as dict. One arg → value of `key`. Two args → set `key=value` and return value.
+    No args → full env dict; one → value of `key`; two → set `key=value`, return value.
 output(*ids, format?="raw", query?=None, offset?=None, limit?=None) → str | dict | list[dict]
-    Read task/agent output by ID. Single id returns text/dict; multiple ids return a list.
+    Read task/agent output by id; one id → text/dict, multiple → list.
 tool.<name>(args) → unknown
-    Invoke any session tool by name. `args` is the tool's parameter object.
+    Invoke any session tool; `args` is its parameter object.
 completion(prompt, model?="default", system?=None, schema?=None) → str | dict
-    Oneshot, stateless completion (no history, no tools). `model` picks a tier: "smol" (fast), "default" (this session's model), "slow" (most capable). Pass `system` for a system prompt. Pass a JSON-Schema `schema` to force structured output and get the parsed object back; otherwise returns the completion text.
+    Oneshot stateless completion (no history, no tools). `model` tier: "smol" (fast) | "default" (session model) | "slow" (most capable). A JSON-Schema `schema` forces structured output and returns the parsed object; otherwise returns the completion text.
 {{#if spawns}}agent(prompt, agent_type?="task", model?=None, label?=None, schema?=None) → str | dict
-    Run a subagent and return its final output. Defaults to the bundled "task" agent; pass `agent_type`/`agentType` for another discovered agent. Pass a JSON-Schema `schema` to force structured output and get the parsed object back. Share background by writing a `local://` file and referencing it in the prompt.
-{{#if js}}    In JS, pass options as one trailing object — never positional: agent(prompt, { agentType, schema }).
+    Run a subagent, return its final output. `agent_type`/`agentType` picks another discovered agent; `schema` as in completion(). Share background via `local://` files referenced in the prompt.
+{{#if js}}    JS: options are ONE trailing object — agent(prompt, { agentType, schema }).
 {{/if}}
 {{/if}}
 parallel(thunks) → list
-    Run thunks (callables) through a bounded pool, preserving input order. The pool is as wide as a `task` tool batch, so fan out as wide as the work divides — don't pre-shrink it. Barrier: returns once all finish; a thunk that throws propagates.
+    Run thunks through a bounded pool (as wide as a `task` batch — don't pre-shrink), preserving input order. Barrier: returns when all finish; a throwing thunk propagates.
 pipeline(items, ...stages) → list
-    Map each item through stages left-to-right; a barrier runs between stages (every item clears stage N before stage N+1). Each stage is a one-arg callable: stage 1 gets the original item, later stages get the previous result. Same pool width as parallel().
+    Map items through one-arg stages left-to-right, barrier between stages; stage 1 gets the item, later stages the previous result. Same pool width as parallel().
 log(message) → None
-    Emit a progress line above the status tree.
+    Progress line above the status tree.
 phase(title) → None
-    Start a phase; the status lines that follow group under it.
+    Start a phase grouping subsequent status lines.
 budget → per-turn token budget
-    {{#if py}}`budget.total` (ceiling or None), `budget.spent()` (output tokens this turn), `budget.remaining()` (math.inf when no ceiling), `budget.hard` (bool).{{/if}}{{#if js}}`await budget.total()` (ceiling or null), `await budget.spent()`, `await budget.remaining()` (Infinity when no ceiling), `await budget.hard()`.{{/if}} A ceiling is set by a `+Nk` message directive (advisory) or `+Nk!`/Goal Mode (hard — `agent()` refuses to spawn past it); otherwise total is None/null and spend is still tracked across the turn (main loop + eval subagents).
+    {{#if py}}`budget.total` (ceiling or None), `budget.spent()`, `budget.remaining()` (math.inf when no ceiling), `budget.hard` (bool).{{/if}}{{#if js}}`await budget.total()` (ceiling or null), `await budget.spent()`, `await budget.remaining()` (Infinity when no ceiling), `await budget.hard()`.{{/if}} Ceiling comes from a `+Nk` directive (advisory) or `+Nk!`/Goal Mode (hard — `agent()` refuses to spawn past it); otherwise total is None/null, and spend is still tracked across the turn.
 ```
 </prelude>
-<output>
-Cells render like a Jupyter notebook. `display(value)` renders non-presentable data as an interactive JSON tree. Presentable values (figures, images, dataframes, etc.) use their native representation.
-</output>
-<caution>
-{{#if js}}- **js**: the VM exposes a selective `process` subset, Web APIs, `Buffer`, `fs/promises`, and the `Bun` global.
-{{/if}}</caution>
 <example>
-{{#if py}}```json
 {
   "cells": [
     { "language": "py", "title": "imports", "timeout": 10, "code": "import json\nfrom pathlib import Path" },
     { "language": "py", "title": "load config", "code": "data = json.loads(read('package.json'))\ndisplay(data)" }
   ]
 }
-```{{/if}}{{#ifAll py js}}
-{{/ifAll}}{{#if js}}```json
-{
-  "cells": [
-    { "language": "js", "title": "summary", "reset": true, "code": "const data = JSON.parse(await read('package.json'));\ndisplay(data);\nreturn data.name;" }
-  ]
-}
-```{{/if}}
 </example>