npm - pi-cursor-sdk - Versions diffs - 0.1.16 → 0.1.18 - Mend

pi-cursor-sdk 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/CHANGELOG.md +53 -1
package/README.md +2 -2
package/docs/cursor-live-smoke-checklist.md +54 -41
package/docs/cursor-model-ux-spec.md +4 -3
package/docs/cursor-testing-lessons.md +199 -0
package/package.json +14 -5
package/scripts/isolated-cursor-smoke.sh +226 -0
package/scripts/steering-rpc-smoke.mjs +238 -0
package/scripts/tmux-live-smoke.sh +418 -0
package/scripts/validate-smoke-jsonl.mjs +207 -0
package/src/cursor-context-tools.ts +6 -0
package/src/cursor-display-text.ts +10 -0
package/src/cursor-edit-diff.ts +11 -0
package/src/cursor-env-boolean.ts +22 -0
package/src/cursor-live-run-coordinator.ts +483 -0
package/src/cursor-native-replay-routing.ts +48 -0
package/src/cursor-native-replay-trace.ts +29 -0
package/src/cursor-native-tool-display-registration.ts +103 -0
package/src/cursor-native-tool-display-replay.ts +465 -0
package/src/cursor-native-tool-display-state.ts +78 -0
package/src/cursor-native-tool-display-tools.ts +102 -0
package/src/cursor-native-tool-display.ts +10 -648
package/src/cursor-partial-content-emitter.ts +121 -0
package/src/cursor-pi-tool-bridge-abort.ts +133 -0
package/src/cursor-pi-tool-bridge-diagnostics.ts +179 -0
package/src/cursor-pi-tool-bridge-mcp.ts +118 -0
package/src/cursor-pi-tool-bridge-run.ts +384 -0
package/src/cursor-pi-tool-bridge-server.ts +182 -0
package/src/cursor-pi-tool-bridge-snapshot.ts +88 -0
package/src/cursor-pi-tool-bridge-types.ts +80 -0
package/src/cursor-pi-tool-bridge.ts +42 -1104
package/src/cursor-provider-live-run-drain.ts +405 -0
package/src/cursor-provider-turn-coordinator.ts +460 -0
package/src/cursor-provider.ts +77 -1103
package/src/cursor-question-tool.ts +9 -1
package/src/cursor-record-utils.ts +26 -0
package/src/cursor-sdk-output-filter.ts +100 -0
package/src/cursor-sensitive-text.ts +37 -0
package/src/cursor-tool-transcript.ts +28 -1229
package/src/cursor-transcript-tool-formatters.ts +641 -0
package/src/cursor-transcript-tool-specs.ts +441 -0
package/src/cursor-transcript-utils.ts +276 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,6 +1,58 @@
 # Changelog
-## Unreleased
+## 0.1.18 - 2026-05-23
+### Added
+- Add `scripts/isolated-cursor-smoke.sh` and `npm run smoke:isolated` for packed `/tmp` install smoke with seeded `auth.json`, plan-strip shim, and JSONL replay-error scans.
+- Add `scripts/fixtures/plan-strip-shim/` to simulate plan-mode execute stripping active tools to `read`, `bash`, `edit`, and `write`.
+- Extend `scripts/validate-smoke-jsonl.mjs` with `--replay-errors` and `--replay-errors-only` to fail on persisted `Tool grep/cursor/find/ls not found` entries.
+- Add [Cursor testing lessons](docs/cursor-testing-lessons.md) documenting auth.json seeding, isolated harness layout, JSONL replay scans, and the plan-mode replay regression chain.
+- Add regression coverage in `test/cursor-native-replay-stress.test.ts`, `test/cursor-native-replay-trace.test.ts`, `test/cursor-native-replay-routing.test.ts`, and expanded live-run / extension lifecycle tests.
+### Changed
+- Centralize native replay routing in `src/cursor-native-replay-routing.ts` (`resolveNativeReplayDisposition`, shared context-tool partitioning) for turn coordinator and live-run drain.
+- Unify 240-character display truncation in `src/cursor-display-text.ts` and share `getActiveContextToolNames()` via `src/cursor-context-tools.ts`.
+- Unify inactive native replay trace formatting through `src/cursor-native-replay-trace.ts` (`title: summary`) for both live-run drain and turn-coordinator paths.
+- On non-Cursor model switch, strip all registered native replay wrappers except core pi tools (`read`, `bash`, `edit`, `write`), not only `cursor`.
+- Document `auth.json` as the primary live-smoke auth source in the live smoke checklist, README maintainer gate, and UX spec.
+### Fixed
+- Fix `Tool grep not found` and related native replay failures after plan-mode execute resets active tools by re-syncing registered Cursor replay wrappers on `before_agent_start` and `turn_start`.
+- Skip native replay `toolUse` when a replay tool is inactive in `context.tools`; emit scrubbed thinking trace instead of a broken pi tool call.
+- Partition live-run drain replay emission so inactive queued native tools fall back to trace output instead of invalid `toolUse` turns.
+## 0.1.17 - 2026-05-23
+### Added
+- Surface in-progress Cursor SDK `task` activity in the TUI from SDK-provided `args.description`, with one deduped line such as `Cursor task: Explore AI/automation projects` and no generic heartbeat or per-tool start spam.
+### Changed
+- Bump pi dev dependency baseline to `0.75.5` for read-tool collapsed-card rendering, package update fixes, and other upstream pi changes. Cursor edit replay remains display-only via `diffString`; pi's new SDK `details.patch` field is not required because Cursor agents do not execute pi's edit tool.
+- Rework live-run internals into dedicated coordination/drain/turn/partial-content modules (`cursor-live-run-coordinator.ts`, `cursor-provider-live-run-drain.ts`, `cursor-provider-turn-coordinator.ts`, `cursor-partial-content-emitter.ts`) while preserving the provider's external contract.
+- Complete phase-2 remediation for #23/#24/#25 by splitting bridge ownership across snapshot/server/run/abort/diagnostics/MCP/types modules, splitting native replay ownership across state/registration/replay/tools modules, and unifying tool completion routing through `resolveToolCompletion`.
+- Replace monolithic provider test coverage with focused stream/bridge/replay/live-run suites plus shared harness helpers.
+- Promote smoke automation into packaged entrypoints (`npm run smoke:live`, `npm run smoke:steering`, `npm run smoke:jsonl`) and make helper retry/polling behavior explicit (TUI answer/footer polling plus deterministic tmux cleanup).
+- Document the hard maintainer rule that Cursor SDK behavior must be verified against the installed `@cursor/sdk` package and/or official TypeScript SDK docs before implementation or release claims.
+- Bump package metadata to `0.1.17` so the dry-run tarball no longer collides with the existing `v0.1.16` tag.
+### Fixed
+- Resolve startup noise issue #17 by extending Cursor SDK bootstrap filtering to late hook compatibility warnings and ripgrep/ignore-mapping output while preserving non-startup logs.
+- Fix steering/follow-up delivery for active pooled Cursor runs by resuming/waiting on the in-flight run and sending incremental follow-up text after pending tool/result flow completes instead of issuing a second concurrent `Agent.send()`; additional stale tool batches from the old run are cancelled so the new user input is not lost.
+- Resolve issue #19 with a canonical edit-diff fallback resolver (`diffString → diff → unifiedDiff → patch`) shared by replay and transcript formatting paths.
+- Resolve issue #20 by updating the token-tracking investigation note to mark the `0.75.3` observation as point-in-time and call out the current `0.75.5` development baseline.
+- Resolve issue #21 by decomposing prior 1k+ provider/transcript/bridge/test monoliths into ownership-scoped modules.
+- Harden bridge diagnostics and secret scrubbing so debug JSONL stays run-safe and allowlisted without endpoint path material, raw args/results, or credential payloads.
+- Make Cursor SDK output filtering safe for overlapping provider streams by restoring the global stdout/stderr/console patch only after the last active install.
+- Reject bridge MCP calls cleanly when tool-dispatch handlers throw, and avoid suppressing unrelated MCP replay solely because an external payload reuses a known bridge request ID.
+- Bound native replay diff/write previews by both lines and characters, summarize non-text MCP content without dumping raw payload JSON, and make expanded-diff truncation copy truthful.
+- Change smoke forbidden-material scans to report only matching file names, not secret-bearing matched lines.
+- Harden live-smoke direct-output checks so a step logs `PASS` only after both command exit and expected stdout assertion succeed, with the basic prompt retrying once on empty output even when the first command exits zero.
 ## 0.1.16 - 2026-05-22

package/README.md CHANGED Viewed

@@ -232,7 +232,7 @@ PI_CURSOR_PI_TOOL_BRIDGE_DEBUG=1 pi --model cursor/composer-2.5
 ### Maintainer live smoke release gate
-For Cursor provider/runtime changes, follow the manual [Cursor live smoke checklist](docs/cursor-live-smoke-checklist.md) before release. Assume every runtime surface is in scope. The checklist uses real `pi -e . --cursor-no-fast --model cursor/composer-2.5` runs with temporary session dirs and requires the visible TUI/output, scrubbed diagnostics, and persisted JSONL to agree. Do not mark a release ready with optional, deferred, mostly-passing, or unobserved smoke checks outstanding.
+For Cursor provider/runtime changes, follow the manual [Cursor live smoke checklist](docs/cursor-live-smoke-checklist.md) before release. See [Cursor testing lessons](docs/cursor-testing-lessons.md) for auth.json seeding, isolated `/tmp` harness layout, JSONL replay-error scans, and other regression traps. Assume every runtime surface is in scope. The checklist uses real `pi -e . --cursor-no-fast --model cursor/composer-2.5` runs with temporary session dirs and requires the visible TUI/output, scrubbed diagnostics, and persisted JSONL to agree. Do not mark a release ready with optional, deferred, mostly-passing, or unobserved smoke checks outstanding.
 ## Fallback models
@@ -248,7 +248,7 @@ Actual Cursor runs still need a key from `/login`, `CURSOR_API_KEY`, or `--api-k
 - **The pi tool bridge is local and MCP-backed.** Bridgeable active pi tools are exposed to local Cursor agents through a tokenized `127.0.0.1` MCP endpoint; internal Cursor replay activity names are excluded, and overlapping built-in pi tools are hidden by default. Set `PI_CURSOR_PI_TOOL_BRIDGE=0` to disable it or `PI_CURSOR_EXPOSE_BUILTIN_TOOLS=1` to expose overlapping built-ins too.
 - **Cursor native tool replay is display-only.** Replay renders recorded Cursor SDK activity and never re-runs Cursor-side commands, reapplies Cursor edits, calls MCP servers, or mutates pi state. Workflow tools such as Cursor `SwitchMode` and Cursor todo state are not pi workflow controls. See [Cursor native tool replay](docs/cursor-native-tool-replay.md) for supported replay cards, ordering, conflict handling, and opt-out flags.
 - **Cursor run state can span tool-use turns.** Within a pi session, the extension reuses one Cursor SDK agent across compatible follow-up turns and sends incremental prompts when context still matches. It recreates the agent when context diverges, after compaction or `/tree` navigation, on API key changes, after send errors, or on session shutdown. For bridged pi tools, the matching pi `toolResult` resolves into the same live Cursor SDK run without creating a new `Agent`, unless the run was disposed, aborted, or cancelled. Replay can also split one live Cursor SDK run across pi `toolUse` turns for display.
-- **Cursor setting sources default to all.** The extension passes `local.settingSources: ["all"]` by default so configured Cursor MCP servers, plugin tools, project/user settings, and related Cursor-native capabilities are available like they are in Cursor. To narrow loading, set a comma-separated list such as `PI_CURSOR_SETTING_SOURCES=project,user,plugins`. To disable ambient setting sources, set `PI_CURSOR_SETTING_SOURCES=none`. Direct Cursor SDK startup logs are suppressed so setting/skill loading messages do not pollute the TUI.
+- **Cursor setting sources default to all.** The extension passes `local.settingSources: ["all"]` by default so configured Cursor MCP servers, plugin tools, project/user settings, and related Cursor-native capabilities are available like they are in Cursor. To narrow loading, set a comma-separated list such as `PI_CURSOR_SETTING_SOURCES=project,user,plugins`. To disable ambient setting sources, set `PI_CURSOR_SETTING_SOURCES=none`. Direct Cursor SDK bootstrap logs (settings, skills, hook-load compatibility warnings, and similar) are suppressed so they do not pollute the TUI.
 - **Max Mode is not a manual pi variant.** Cursor's SDK may enable Max Mode automatically for models that require it. This extension only advertises exact context-window variants that the SDK catalog exposes and otherwise uses conservative SDK-derived default/non-Max context windows.
 - **Output token limits are conservative.** Cursor SDK model metadata does not currently expose output token limits directly.
 - **Token usage is approximate in pi.** Cursor SDK usage events include cumulative internal agent/tool/cache work, so raw Cursor SDK counters are not copied into pi usage. The extension reports approximate pi session activity in `input`/`output`, including split-run tool calls and consumed tool results, while `totalTokens` tracks the replayable Cursor prompt/context estimate used for context display and compaction.

package/docs/cursor-live-smoke-checklist.md CHANGED Viewed

@@ -2,7 +2,7 @@
 ## Purpose
-Use this manual checklist before releasing Cursor provider/runtime changes. Unit tests and mocks are necessary, but they are not enough for this extension. Always assume every runtime surface is in scope. A release is not ready until every live check below has been observed with `cursor/composer-2.5` through the local working tree.
+Use this manual checklist before releasing Cursor provider/runtime changes. Unit tests and mocks are necessary, but they are not enough for this extension. See [Cursor testing lessons](./cursor-testing-lessons.md) for auth/isolated-harness pitfalls and the plan-mode replay regression that motivated recent hardening. Always assume every runtime surface is in scope. A release is not ready until every live check below has been observed with `cursor/composer-2.5` through the local working tree.
 ## Release rule
@@ -22,11 +22,36 @@ mkdir -p "$SMOKE_DIR"
 pi -e . --list-models cursor
 ```
+Live pi runs resolve provider auth from **`~/.pi/agent/auth.json`**, not only from the shell environment. The isolated smoke helper copies that file into a clean temporary `HOME`. Ensure `auth.json` includes a `cursor` provider entry, or export `CURSOR_API_KEY` as a fallback.
+The repo also ships partial automation covering a subset of the checks: prerequisite, basic, default-settings, non-interactive math, TUI output polling, steering, diagnostic, and JSONL:
+```bash
+npm run smoke:live
+```
+For native replay regression checks (packed install, plan-strip resync, JSONL replay-error scan), use the isolated helper:
+```bash
+npm run smoke:isolated
+# unit tests + pack only (no live Cursor):
+SKIP_LIVE=1 npm run smoke:isolated
+```
+Scan persisted sessions for native replay tool failures:
+```bash
+node scripts/validate-smoke-jsonl.mjs --replay-errors "$SMOKE_DIR"
+node scripts/validate-smoke-jsonl.mjs --replay-errors-only "$SMOKE_DIR/session-subdir"
+```
+The `npm run smoke:live` script is a helper only; it polls the section 3 TUI for answer/footer evidence and then cleans up the tmux session, but it does not replace manual visual review of the full TUI checklist. Release readiness still requires the manual checks below for detailed TUI behavior, bridge, standalone native replay, abort/cancel, packaging, cleanup, and any touched runtime surface not covered by the helper.
 Pass criteria:
 - `cursor/composer-2.5` appears in the model list.
 - No Cursor key or auth token is printed.
-- If `CURSOR_API_KEY` is unavailable and `/login` is not configured, stop and report the live smoke as blocked.
+- If neither `~/.pi/agent/auth.json` cursor auth nor `CURSOR_API_KEY` is available, stop and report the live smoke as blocked.
 ## 1. Basic provider reality check
@@ -155,13 +180,25 @@ Forbidden fields:
 Run a forbidden-material scan over smoke stderr/captures:
 ```bash
-find "$SMOKE_DIR" -type f \( -name '*stderr.txt' -o -name 'capture*.txt' \) -print0 |
-  xargs -0 grep -E 'CURSOR_API_KEY|Bearer [A-Za-z0-9._-]+|/cursor-pi-tool-bridge/[^ ]+/mcp|127\.0\.0\.1:[0-9]+/cursor-pi-tool-bridge|apiKey|cookie|session-cookie|secret-token'
+forbidden_files="$(find "$SMOKE_DIR" -type f \( -name '*stderr.txt' -o -name '*capture*.txt' \) -print0 |
+  xargs -0 grep -IlE 'CURSOR_API_KEY|Bearer [A-Za-z0-9._-]+|/cursor-pi-tool-bridge/[^ ]+/mcp|127\.0\.0\.1:[0-9]+/cursor-pi-tool-bridge|apiKey|cookie|session-cookie|secret-token' || true)"
+if [[ -n "$forbidden_files" ]]; then
+  printf 'Forbidden material matched in smoke files; inspect locally without pasting matched lines.\n' >&2
+  while IFS= read -r file; do
+    [[ -z "$file" ]] && continue
+    if [[ "$file" == "$SMOKE_DIR/"* ]]; then
+      printf '  %s\n' "${file#"$SMOKE_DIR/"}" >&2
+    else
+      printf '  %s\n' "$file" >&2
+    fi
+  done <<<"$forbidden_files"
+  exit 1
+fi
 ```
 Pass criteria:
-- The grep returns no matches except deliberately planted test strings that are asserted not to appear in serialized diagnostics.
+- The scan reports no matching files, except for files that match only because of deliberately planted test strings that are asserted not to appear in serialized diagnostics, and it never prints the matched secret-bearing lines.
 - If tool names themselves are considered sensitive for a release target, do not enable `PI_CURSOR_PI_TOOL_BRIDGE_DEBUG=1` for shared logs. The diagnostics contract intentionally allows tool names.
 ## 7. Long-running bridge and abort/cancel
@@ -190,45 +227,21 @@ Pass criteria:
 After all live runs, scan JSONL structurally instead of reading raw content into a report:
 ```bash
-node <<'NODE'
-const fs = require('fs');
-const path = require('path');
-const root = process.env.SMOKE_DIR;
-const files = [];
-function walk(dir) {
-  for (const name of fs.readdirSync(dir)) {
-    const p = path.join(dir, name);
-    const st = fs.statSync(p);
-    if (st.isDirectory()) walk(p);
-    else if (p.endsWith('.jsonl')) files.push(p);
-  }
-}
-walk(root);
-let failures = 0;
-for (const file of files.sort()) {
-  const records = fs.readFileSync(file, 'utf8').trim().split(/\n+/).filter(Boolean).map(JSON.parse);
-  const messages = records.filter((record) => record.type === 'message').map((record) => record.message);
-  const assistants = messages.filter((message) => message.role === 'assistant');
-  const usage = assistants.map((message) => message.usage).filter(Boolean);
-  const badUsage = usage.filter((u) =>
-    typeof u.input !== 'number' || u.input < 0 ||
-    typeof u.output !== 'number' || u.output < 0 ||
-    typeof u.totalTokens !== 'number' || u.totalTokens < 0 ||
-    u.cacheRead !== 0 || u.cacheWrite !== 0
-  );
-  if (usage.length !== assistants.length || badUsage.length > 0) failures += 1;
-  console.log(JSON.stringify({ file: path.relative(root, file), assistantCount: assistants.length, usageCount: usage.length, badUsageCount: badUsage.length }));
-}
-process.exit(failures === 0 ? 0 : 1);
-NODE
+node scripts/validate-smoke-jsonl.mjs "$SMOKE_DIR"
 ```
-Pass criteria:
+Script-enforced pass criteria:
+- Every scanned JSONL file is parseable and non-empty.
+- Every scanned JSONL file contains at least one assistant message.
+- Every assistant message has usage metadata.
+- Assistant usage `input`, `output`, and `totalTokens` are non-negative numbers.
+- Assistant usage `cacheRead` and `cacheWrite` are exactly `0`.
+Additional manual usage checks for provider/accounting changes:
-- Every assistant message has valid usage.
-- Cache fields remain `0`.
-- Tool-heavy runs show nonzero output for visible assistant/tool-call activity.
-- Split runs count consumed tool-result input once on the following assistant turn.
+- Tool-heavy runs should show nonzero output for visible assistant/tool-call activity.
+- Split runs should count consumed tool-result input once on the following assistant turn.
 ## 9. Standard local gates

package/docs/cursor-model-ux-spec.md CHANGED Viewed

@@ -16,7 +16,7 @@ Current implementation notes:
 - Image payload forwarding sends images only from the latest user message. If the latest user turn is plain text after an earlier image turn, the transcript keeps an `[image omitted from transcript]` placeholder but no image bytes are sent to Cursor. The prompt explicitly tells Cursor that prior image bytes are unavailable and to ask the user to reattach or describe a prior image when needed. Carrying images forward across turns remains a future product decision because it affects token cost, privacy, stale visual context, and expected multimodal follow-up behavior.
 - `@cursor/sdk` is a package dependency of this extension; users should not need a global SDK install.
 - Cursor auth uses pi-native API-key resolution for provider `cursor`: CLI `--api-key`, stored `~/.pi/agent/auth.json` API key from `/login`, then `CURSOR_API_KEY`. The extension config file stores only non-secret Cursor-only state such as fast defaults.
-- Local agents pass `settingSources: ["all"]` by default so Cursor MCP servers, plugin tools, project/user settings, and related Cursor-native capabilities are available. Users can narrow loading with a comma-separated list such as `PI_CURSOR_SETTING_SOURCES=project,user,plugins`, or disable ambient setting sources with `PI_CURSOR_SETTING_SOURCES=none`. The provider suppresses direct Cursor SDK startup writes around agent creation so setting/skill loading logs do not pollute pi's TUI.
+- Local agents pass `settingSources: ["all"]` by default so Cursor MCP servers, plugin tools, project/user settings, and related Cursor-native capabilities are available. Users can narrow loading with a comma-separated list such as `PI_CURSOR_SETTING_SOURCES=project,user,plugins`, or disable ambient setting sources with `PI_CURSOR_SETTING_SOURCES=none`. The provider suppresses direct Cursor SDK bootstrap stdout/stderr/console noise (including late first-send workspace loading such as hook compatibility warnings) so it does not pollute pi's TUI.
 - Cursor SDK models are treated as thinking-capable even when pi reports `thinking=no`; that pi column only means the SDK did not expose a pi-controllable thinking parameter for that model.
 - Cursor-side thinking remains visible through pi's native thinking rendering when the Cursor SDK emits thinking or summary deltas.
 - Local Cursor agents get two tool surfaces. First, Cursor keeps the Cursor SDK local-agent tool surface plus configured Cursor settings, plugins, and Cursor MCP servers. Second, pi-cursor-sdk exposes active pi tools through a default-on, tokenized loopback MCP bridge when bridgeable tools exist.
@@ -27,16 +27,17 @@ Current implementation notes:
 - Cursor SDK MCP tool calls use a guarded timeout override because installed `@cursor/sdk` 1.0.13 has a 60-second MCP request default with no public per-server timeout option. The extension extends that Cursor SDK MCP `callTool` timeout path to 3600 seconds by default. Users can override it with `PI_CURSOR_MCP_TOOL_TIMEOUT_MS` or `PI_CURSOR_MCP_TOOL_TIMEOUT_SECONDS`.
 - Bridge diagnostics are opt-in only: `PI_CURSOR_PI_TOOL_BRIDGE_DEBUG=1` writes typed, allowlisted, scrubbed single-line JSONL records to `process.stderr` with prefix `[pi-cursor-sdk:bridge]`. Diagnostics are scrubbed operational logs, not anonymous telemetry. They intentionally include tool names, safe correlation IDs, run lifecycle, exposed pi↔MCP name pairs, queued requests, result resolution, rejection, cancellation, and pending counts. Correlation IDs are generated independently from the tokenized endpoint path, and Cursor MCP call IDs are hashed before serialization. Diagnostics must not include endpoint paths/URLs/path components/tokens, API keys, bearer tokens, cookies, session credentials, raw args/results, stdout/stderr payloads, file contents, Cursor settings output, or local private session paths in tracked docs, and they must not call pi UI status, notification, or footer APIs. If tool names themselves are unacceptable for a release target, bridge debug diagnostics are not safe for shared logs under the current contract.
 - This repo does not provide a generic desktop-automation, browser-driver, or CDP recipe. Provider docs should describe pi-cursor-sdk's Cursor provider/bridge contract only.
-- Cursor internal tool activity is recorded from SDK events and scrubbed. In interactive TTY sessions, supported completed `read`, `bash`, `grep`, `find`, `ls`, `edit`, `write`, diagnostics, delete, todo/plan, task, image generation, and MCP activity is replayed through pi's native tool-call rendering path with recorded Cursor results, so the TUI can show native-looking cards without rerunning Cursor's reads/shell commands/file edits. Cursor `glob` activity is replayed through native `find` cards. Cursor write activity is replayed through native-looking `write` cards, and Cursor StrReplace/edit activity uses native-looking `edit` only when recorded arguments truthfully satisfy pi's `edit` schema; path-only Cursor edit and notebook edit replay falls back to neutral Cursor activity before pi validation. Diagnostics, delete, todos/plans, task, image, and MCP activity use neutral Cursor activity cards with pi's default success/error shell. Neutral Cursor activity calls include `activityTitle` and, when available, `activitySummary` so partial/collapsed cards preserve identity such as `Cursor plan`, `Cursor todos`, `Cursor MCP`, or `Cursor edit`. Replay-only tools display recorded Cursor results, normalize workspace-local paths/diff headers for display, use pi diff colors for edit previews and path-inferred syntax highlighting for write previews, and fail closed if called without a recorded result. Native replay wrappers are registered only for tool names not already owned by another extension; conflicting tools use the bounded scrubbed transcript fallback. Cursor workflow tools such as `SwitchMode` and Cursor todo state are not pi workflow controls; reported todo/plan events are displayed as Cursor activity only. Plan/todo replay cards can be followed by Cursor's final plan text, selected from `run.wait().result` when Cursor provides one and trimmed against already-emitted text. 
Started Cursor SDK tool calls that never receive a completion event are discarded without synthetic replay errors; explicit failures remain visible when Cursor reports them through completed tool calls or step results. `PI_CURSOR_NATIVE_TOOL_DISPLAY=0` disables native replay, and `PI_CURSOR_REGISTER_NATIVE_TOOLS=0` is a registration-only opt-out that keeps the transcript fallback without shadowing pi tool names. When bridge or native replay cards are emitted, the provider mirrors Codex's turn shape as Cursor SDK activity arrives: assistant `toolUse`, pi `toolResult`s, live post-tool Cursor thinking/text, any later tool batches as further `toolUse` turns, then Cursor's final assistant answer. For shell replay, completed `stdout` / `stderr` are primary; unambiguous `shell-output-delta` data is used only as display-only fallback for empty successful shell completions, and overlapping shell calls drop ambiguous deltas instead of guessing. Non-interactive runs keep bounded scrubbed transcript output instead, preserving `pi -p` assistant text output. Cursor text deltas stream live when no live-run turn split is active.
+- Cursor internal tool activity is recorded from SDK events and scrubbed. In interactive TTY sessions, supported completed `read`, `bash`, `grep`, `find`, `ls`, `edit`, `write`, diagnostics, delete, todo/plan, task, image generation, and MCP activity is replayed through pi's native tool-call rendering path with recorded Cursor results, so the TUI can show native-looking cards without rerunning Cursor's reads/shell commands/file edits. Cursor `glob` activity is replayed through native `find` cards. Cursor write activity is replayed through native-looking `write` cards, and Cursor StrReplace/edit activity uses native-looking `edit` only when recorded arguments truthfully satisfy pi's `edit` schema; path-only Cursor edit and notebook edit replay falls back to neutral Cursor activity before pi validation. Diagnostics, delete, todos/plans, task, image, and MCP activity use neutral Cursor activity cards with pi's default success/error shell. Neutral Cursor activity calls include `activityTitle` and, when available, `activitySummary` so partial/collapsed cards preserve identity such as `Cursor plan`, `Cursor todos`, `Cursor MCP`, or `Cursor edit`. When the Cursor SDK emits a running `task` tool call with a description, the provider surfaces one low-noise in-progress line such as `Cursor task: Explore AI/automation projects` from SDK args only; it does not emit generic heartbeat text or per-tool start cards for ordinary `read`, `bash`, or `grep` activity. Replay-only tools display recorded Cursor results, normalize workspace-local paths/diff headers for display, use pi diff colors for edit previews and path-inferred syntax highlighting for write previews, and fail closed if called without a recorded result. Native replay wrappers are registered only for tool names not already owned by another extension; conflicting tools use the bounded scrubbed transcript fallback. 
Cursor workflow tools such as `SwitchMode` and Cursor todo state are not pi workflow controls; reported todo/plan events are displayed as Cursor activity only. Plan/todo replay cards can be followed by Cursor's final plan text, selected from `run.wait().result` when Cursor provides one and trimmed against already-emitted text. Started Cursor SDK tool calls that never receive a completion event are discarded without synthetic replay errors; explicit failures remain visible when Cursor reports them through completed tool calls or step results. `PI_CURSOR_NATIVE_TOOL_DISPLAY=0` disables native replay, and `PI_CURSOR_REGISTER_NATIVE_TOOLS=0` is a registration-only opt-out that keeps the transcript fallback without shadowing pi tool names. When bridge or native replay cards are emitted, the provider mirrors Codex's turn shape as Cursor SDK activity arrives: assistant `toolUse`, pi `toolResult`s, live post-tool Cursor thinking/text, any later tool batches as further `toolUse` turns, then Cursor's final assistant answer. For shell replay, completed `stdout` / `stderr` are primary; unambiguous `shell-output-delta` data is used only as display-only fallback for empty successful shell completions, and overlapping shell calls drop ambiguous deltas instead of guessing. Non-interactive runs keep bounded scrubbed transcript output instead, preserving `pi -p` assistant text output. Cursor text deltas stream live when no live-run turn split is active.
 - Synthetic replay names are internal compatibility details. New model-facing prompt text and user-visible cards use native tool names when renderer-compatible, or neutral Cursor activity labels when not. Legacy sessions containing old internal replay names are sanitized before prompt/display. Bridge MCP names such as `pi__sem_reindex` are MCP-only; pi session output uses real pi tool names.
 - Cursor SDK usage events report cumulative internal agent/tool/cache work, not the replayable pi prompt context. The extension does not copy raw Cursor SDK usage into pi usage or compaction. For Cursor assistant messages, `usage.input`/`usage.output` are approximate pi session activity components: initial Cursor prompt input is counted once, consumed split-run tool results are counted as deduped input on the following assistant turn, and assistant output includes visible text/thinking/tool-call content. `usage.totalTokens` is the replayable Cursor prompt/context estimate derived from the same `buildCursorPrompt()` path used for `Agent.send`; it may differ from `input + output` and is the context-safe value for display/compaction. `src/cursor-usage-accounting.ts` owns this usage policy, and `src/cursor-live-run-accounting.ts` owns prompt-once and consumed-tool-result accounting so provider usage and bridge result resolution share the same matched tool-result boundary.
 - Audit observation, 2026-05-19, superseded by the 2026-05-21 replay pass: a missing-file read with Composer 2.5 emitted `tool-call-started` for Cursor `read`, then streamed final text `Error: File not found`, but did not emit `tool-call-completed` or an `onStep` `toolCall` error result. Leftover started calls are now discarded at run completion instead of becoming synthetic replay errors. Cursor-reported completed/step errors remain visible.
 - Maintainer visual verification for replay-card changes should follow [Cursor Native Tool Visual Audit Workflow](./cursor-native-tool-visual-audit.md): offscreen PTY-driven pi run, xterm.js/Playwright screenshot rendering, and JSONL inspection before accepting commits or PRs.
-- Cursor provider/runtime releases should follow [Cursor Live Smoke Checklist](./cursor-live-smoke-checklist.md) with real `pi -e . --cursor-no-fast --model cursor/composer-2.5` invocations, manual observation, temporary session dirs, diagnostics scans, and persisted JSONL inspection. Assume every runtime surface is in scope. A release is not ready when any live check is optional, deferred, mostly passing, or unobserved.
+- Cursor provider/runtime releases should follow [Cursor Live Smoke Checklist](./cursor-live-smoke-checklist.md) with real `pi -e . --cursor-no-fast --model cursor/composer-2.5` invocations, manual observation, temporary session dirs, diagnostics scans, and persisted JSONL inspection. See [Cursor testing lessons](./cursor-testing-lessons.md) for auth.json seeding, isolated smoke harnesses, and replay JSONL scans. Assume every runtime surface is in scope. A release is not ready when any live check is optional, deferred, mostly passing, or unobserved.
 - For models without a catalog `context` parameter, context windows are not hardcoded. The extension ships a bundled SDK-derived default/non-Max cache generated from `createAgentPlatform().checkpointStore.loadLatest(agentId).tokenDetails.maxTokens`. Successful runs can update a local override cache, but model discovery does not probe models at startup.
 - Max Mode context windows are distinct from default/non-Max context windows. `@cursor/sdk` 1.0.13 documentation says the SDK may enable Max Mode automatically when a selected model requires it, but the public local-agent `ModelSelection` path still does not expose a manual Max Mode selector. Do not advertise Max Mode context windows unless the SDK catalog exposes an exact parameter/variant or the SDK public API adds a Max Mode selector that the extension actually sends.
 - `@cursor/sdk` 1.0.13 adds latest-style `ModelListItem.aliases`. The extension registers only unambiguous aliases as pi model IDs (with the same context suffixes when applicable) and sends the alias back in `ModelSelection.id`, while sharing Cursor-only state such as fast defaults with the underlying catalog `id`. Aliases shared by multiple base models, such as generic family aliases, are skipped because the pi row metadata would otherwise imply one base model while Cursor may resolve the alias to another.
 - Session-scoped Cursor SDK agent pooling reuses one live `@cursor/sdk` agent across compatible follow-up turns within the same pi session scope. `computeCursorContextFingerprint()` and `shouldBootstrapCursorSend()` decide whether the next turn sends a full bootstrap prompt or an incremental follow-up. The pool recreates the agent when context diverges, when branch or compaction summaries appear after `/tree` navigation or compaction, when the API key identity changes, after send errors, on `session_shutdown`, and when `session_before_tree` / `session_tree` invalidate the active branch. Incremental sends omit the full Cursor SDK tool boundary block because the session agent retains prior bootstrap context.
+- Pi steering/follow-up delivery can arrive while a split live Cursor SDK run is still active. The provider resolves pending live runs by scanning trailing `toolResult` messages while skipping trailing `user` messages, tracks the active live run per session scope, and resumes the in-flight run instead of calling `Agent.send()` again. When the context ends with steering user text after tool results, the provider releases the prior live run and chains an incremental `Agent.send()` for the latest user message in the same provider turn; if the prior run emits more text or tool requests after steering arrives, that stale activity is cancelled instead of surfacing another old-run tool turn and losing the new user input. A pre-send guard waits for or resumes any still-active scoped live run before starting a fresh send so `@cursor/sdk` `AgentBusyError` (`already has active run`) does not surface to pi users.
 ## Goal

package/docs/cursor-testing-lessons.md ADDED Viewed

@@ -0,0 +1,199 @@
+# Cursor Testing Lessons
+## Purpose
+This document records maintainer testing lessons for `pi-cursor-sdk`. It complements unit tests and the [Cursor live smoke checklist](./cursor-live-smoke-checklist.md). Use it when adding regression coverage, debugging false-green releases, or building isolated smoke harnesses.
+## Core lesson: integration-shaped bugs beat unit mocks
+The native replay `Tool grep not found` failure was integration-shaped, not unit-shaped:
+1. **Plan mode** calls `setActiveTools(["read", "bash", "edit", "write"])` when execution starts.
+2. **pi-cursor-sdk** only re-synced native replay wrappers on `session_start` / `model_select`, not every turn.
+3. **The provider** still emitted native replay `toolUse` for `grep` / `cursor`.
+4. **pi's agent loop** looked up tools in `context.tools` and failed with `Tool grep not found`.
+Passing hundreds of unit tests did not prove that chain was safe. Regression coverage now includes:
+- `test/index.test.ts` — `before_agent_start` and `turn_start` resync after plan-style strip
+- `test/cursor-native-replay-stress.test.ts` — plan strip → resync → grep replay; inactive-tool trace fallbacks
+- `test/cursor-provider-replay-live-run.test.ts` — inactive replay tools emit trace instead of broken `toolUse`
+- `test/cursor-native-replay-trace.test.ts` — shared inactive replay trace formatting
+- `test/cursor-native-replay-routing.test.ts` — `resolveNativeReplayDisposition` and `partitionNativeToolsByActiveContext`
+When changing provider/runtime behavior, ask whether the bug spans **pi extension lifecycle**, **active tool state**, **provider streaming**, and **persisted JSONL**. If yes, add an integration-style unit test or live smoke coverage for that chain.
+## Dual-check invariant: `context.tools` vs pi active tools
+Native replay routing intentionally uses two layers:
+1. **Extension resync** (`before_agent_start`, `turn_start`) updates pi's active tool set via `syncRegisteredNativeCursorToolsForModel`. This fixes the common case where plan-mode execute strips `grep`/`find`/`cursor` before the next turn.
+2. **Provider routing** uses the **`context.tools` snapshot** captured when `streamCursor()` starts (`getActiveContextToolNames` in `src/cursor-context-tools.ts`). It does not read live `pi.getActiveTools()` mid-stream.
+`src/cursor-native-replay-routing.ts` centralizes provider-side routing against the same `context.tools` snapshot:
+- **Turn coordinator** calls `resolveNativeReplayDisposition()` per completed SDK tool → `queue_replay` (queue native `toolUse`), `inactive_trace` (`formatInactiveCursorReplayTrace()`), or `transcript_trace`.
+- **Live-run drain** calls `partitionNativeToolsByActiveContext()` on already-queued native tool batches → active tools become `toolUse`; inactive tools get trace only and the batch returns `"handled"` without `toolUse`.
+Disposition outcomes:
+- `queue_replay` — tool is in `context.tools` and a live run exists
+- `inactive_trace` — native replay tool missing from `context.tools`
+- `transcript_trace` — native replay off or non-native tool
+If resync runs but `context.tools` is still stale (e.g. only `read` listed), the provider must **not** emit `toolUse` for inactive tools. `test/cursor-native-replay-stress.test.ts` covers that stale-snapshot path.
+## Auth: use `auth.json`, not only env
+pi resolves Cursor auth in this order:
+1. pi `--api-key`
+2. stored `cursor` key in `~/.pi/agent/auth.json` from `/login`
+3. `CURSOR_API_KEY`
+For live smoke and isolated harnesses:
+- **Do not assume** `CURSOR_API_KEY` or `~/.secrets` alone is enough.
+- **Do assume** pi reads auth from the active `HOME`, usually `~/.pi/agent/auth.json`.
+- Isolated runs with `env -i HOME=/tmp/...` must **copy** `auth.json` into that temporary home before calling `pi`.
+Example seed step used by `scripts/isolated-cursor-smoke.sh`:
+```bash
+mkdir -p "$HOME/.pi/agent"
+cp "$REAL_HOME/.pi/agent/auth.json" "$HOME/.pi/agent/auth.json"
+chmod 600 "$HOME/.pi/agent/auth.json"
+```
+Fallback when `auth.json` lacks a `cursor` provider entry:
+```bash
+export CURSOR_API_KEY="your-key"
+```
+Never commit, log, or paste `auth.json` contents, API keys, or session JSONL with secrets.
+## Isolated directories: why and how
+Use isolated `/tmp` trees when validating:
+- packed tarball install (`npm pack` → extract → `pi install -l`)
+- clean `HOME` with no inherited shell profile state
+- plan-mode-style tool stripping via a shim extension
+- JSONL replay-error scans independent of stdout
+Recommended layout:
+```text
+/tmp/pi-cursor-sdk-isolated-<timestamp>/
+  home/                 # seeded ~/.pi/agent/auth.json
+  pack/                 # npm pack output (*.tgz)
+  extract/package/      # unpacked extension
+  project/              # empty pi project for install -l
+  sessions/
+    basic/
+    native-replay/
+    plan-strip/
+```
+Commands:
+```bash
+# full isolated smoke (unit preflight + pack + live pi)
+npm run smoke:isolated
+# pack/unit only, no live Cursor calls
+SKIP_LIVE=1 npm run smoke:isolated
+# custom artifact root
+ISOLATED=/tmp/pi-cursor-sdk-isolated-manual npm run smoke:isolated
+```
+Every live check should use its own `--session-dir` under the isolated tree. Do not reuse session dirs across scenarios.
+## Harness traps we hit repeatedly
+| Trap | What went wrong | Fix |
+| --- | --- | --- |
+| Clean `HOME` without auth | `pi` could not authenticate Cursor in isolated runs | Copy `~/.pi/agent/auth.json` into isolated `HOME` |
+| `npm pack \| tail -1` | Captured npm notice text, not tarball path | Use `ls -t "$PACK_DIR"/*.tgz \| head -1` |
+| Packed extension, no install | Provider never loaded in isolated project | Run `npm install --omit=dev` inside extracted package |
+| Inherited shell env | mise/profile hooks hung or polluted runs | Use `env -i ... MISE_DISABLE=1` for isolated pi calls |
+| No per-check timeout | One stuck prompt blocked entire harness | Wrap each live check with timeout/watchdog |
+| stdout-only assertions | Missed replay failures persisted only in JSONL | Scan JSONL for `Tool grep/cursor/find/ls not found` |
+| Plan strip only on first turn | Under-tested multi-turn resync | Shim strips on every `turn_start`; stress multi-turn separately |
+| Assuming env auth equals pi auth | False "blocked" or false "pass" in CI-like shells | Check `auth.json` provider keys explicitly when needed |
+## JSONL is the source of truth for replay regressions
+Stdout can look fine while persisted tool results contain errors. Prefer structural JSONL scans over grepping terminal output.
+Replay failure scan:
+```bash
+node scripts/validate-smoke-jsonl.mjs --replay-errors-only "$SESSION_DIR"
+```
+Combined usage + replay scan after broader smoke:
+```bash
+node scripts/validate-smoke-jsonl.mjs --replay-errors "$SMOKE_DIR"
+```
+The replay scan fails on records containing:
+- `Tool grep not found`
+- `Tool cursor not found`
+- `Tool find not found`
+- `Tool ls not found`
+## Plan-mode regression scenario
+Simulate plan-mode execute stripping with the repo fixture:
+- `scripts/fixtures/plan-strip-shim/index.ts`
+It sets active tools to `read`, `bash`, `edit`, `write` on each `turn_start`. Run pi with:
+```bash
+pi -e scripts/fixtures/plan-strip-shim --cursor-no-fast --model cursor/composer-2.5 \
+  --session-dir "$SMOKE_DIR/plan-strip" \
+  -p 'After reset, read README.md and answer PLAN_STRIP_OK=yes.'
+```
+Pass criteria:
+- No replay `Tool * not found` entries in JSONL
+- Native replay tools (`grep`, `find`, `read`, etc.) succeed after `turn_start` resync
+- After a switch to a non-Cursor model, native replay wrappers are removed, except for core pi tools
+## Local validation ladder
+Run in order before claiming release-ready for provider/runtime changes:
+```bash
+npm test
+npm run typecheck
+npm pack --dry-run
+SKIP_LIVE=1 npm run smoke:isolated
+npm run smoke:isolated            # requires auth.json or CURSOR_API_KEY
+npm run smoke:live                # partial tmux checklist subset
+```
+Then follow the full manual [Cursor live smoke checklist](./cursor-live-smoke-checklist.md) for surfaces the scripts do not cover (bridge MCP, abort/cancel, full TUI observation, packaging review, cleanup).
+## What belongs in CI vs manual smoke
+- **CI / default `npm test`:** mocked provider tests, extension lifecycle tests, JSONL validator tests, script syntax/help checks. No live Cursor calls.
+- **Manual / pre-release:** `npm run smoke:isolated`, `npm run smoke:live`, and the full checklist. Requires real Cursor auth and observes TUI/runtime behavior that mocks cannot reproduce.
+If live smoke auth is unavailable, report the release as **blocked**; do not treat skipped live checks as release-ready.
+## Related docs and scripts
+- [Cursor live smoke checklist](./cursor-live-smoke-checklist.md)
+- [Cursor native tool replay](./cursor-native-tool-replay.md)
+- `scripts/isolated-cursor-smoke.sh`
+- `scripts/tmux-live-smoke.sh`
+- `scripts/validate-smoke-jsonl.mjs`
+- `test/helpers/cursor-provider-harness.ts` — controllable native replay pi mock (`createNativeToolDisplayPiForTest`)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "pi-cursor-sdk",
-	"version": "0.1.16",
+	"version": "0.1.18",
 	"description": "pi provider extension backed by @cursor/sdk local agents",
 	"author": "Mitch Fultz (https://github.com/fitchmultz)",
 	"license": "MIT",
@@ -24,9 +24,14 @@
 	"files": [
 		"src",
 		"scripts/refresh-cursor-model-snapshots.mjs",
+		"scripts/steering-rpc-smoke.mjs",
+		"scripts/tmux-live-smoke.sh",
+		"scripts/isolated-cursor-smoke.sh",
+		"scripts/validate-smoke-jsonl.mjs",
 		"README.md",
 		"docs/cursor-model-ux-spec.md",
 		"docs/cursor-live-smoke-checklist.md",
+		"docs/cursor-testing-lessons.md",
 		"docs/cursor-native-tool-replay.md",
 		"docs/cursor-native-tool-visual-audit.md",
 		"LICENSE",
@@ -40,7 +45,11 @@
 		"typecheck": "tsc --noEmit",
 		"test": "vitest run",
 		"test:watch": "vitest",
-		"refresh:cursor-snapshots": "node scripts/refresh-cursor-model-snapshots.mjs"
+		"refresh:cursor-snapshots": "node scripts/refresh-cursor-model-snapshots.mjs",
+		"smoke:live": "scripts/tmux-live-smoke.sh",
+		"smoke:isolated": "scripts/isolated-cursor-smoke.sh",
+		"smoke:steering": "node scripts/steering-rpc-smoke.mjs",
+		"smoke:jsonl": "node scripts/validate-smoke-jsonl.mjs"
 	},
 	"dependencies": {
 		"@cursor/sdk": "^1.0.13",
@@ -53,9 +62,9 @@
 		"typebox": "*"
 	},
 	"devDependencies": {
-		"@earendil-works/pi-ai": "^0.75.3",
-		"@earendil-works/pi-coding-agent": "^0.75.3",
-		"@earendil-works/pi-tui": "^0.75.3",
+		"@earendil-works/pi-ai": "^0.75.5",
+		"@earendil-works/pi-coding-agent": "^0.75.5",
+		"@earendil-works/pi-tui": "^0.75.5",
 		"typebox": "^1.1.38",
 		"typescript": "^6.0.3",
 		"vitest": "^4.1.6"