npm - patchwork-os - Versions diffs - 0.2.0-beta.5.canary.94 → 0.2.0-beta.6 - Mend

patchwork-os 0.2.0-beta.5.canary.94 → 0.2.0-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139)

package/dist/ajv2020.d.ts +25 -0
package/dist/ajv2020.js +33 -0
package/dist/ajv2020.js.map +1 -0
package/dist/approvalQueue.d.ts +17 -0
package/dist/approvalQueue.js.map +1 -1
package/dist/bridge.js +16 -0
package/dist/bridge.js.map +1 -1
package/dist/commands/recipeInstall.js +5 -1
package/dist/commands/recipeInstall.js.map +1 -1
package/dist/commands/tools.d.ts +20 -1
package/dist/commands/tools.js +112 -3
package/dist/commands/tools.js.map +1 -1
package/dist/haltPushDispatch.d.ts +33 -0
package/dist/haltPushDispatch.js +103 -0
package/dist/haltPushDispatch.js.map +1 -0
package/dist/inboxRoutes.d.ts +22 -0
package/dist/inboxRoutes.js +61 -1
package/dist/inboxRoutes.js.map +1 -1
package/dist/index.js +8 -0
package/dist/index.js.map +1 -1
package/dist/oauthRoutes.d.ts +1 -1
package/dist/oauthRoutes.js +2 -2
package/dist/recipeRoutes.js +133 -65
package/dist/recipeRoutes.js.map +1 -1
package/dist/recipes/githubInstallSource.d.ts +66 -0
package/dist/recipes/githubInstallSource.js +85 -4
package/dist/recipes/githubInstallSource.js.map +1 -1
package/dist/recipes/haltCategory.d.ts +4 -0
package/dist/recipes/haltCategory.js +6 -0
package/dist/recipes/haltCategory.js.map +1 -1
package/dist/recipes/names.d.ts +20 -0
package/dist/recipes/names.js +25 -0
package/dist/recipes/names.js.map +1 -1
package/dist/recipes/parser.js +7 -2
package/dist/recipes/parser.js.map +1 -1
package/dist/recipes/stepObservation.js +9 -0
package/dist/recipes/stepObservation.js.map +1 -1
package/dist/recipes/tools/fanOut.d.ts +20 -0
package/dist/recipes/tools/fanOut.js +199 -0
package/dist/recipes/tools/fanOut.js.map +1 -0
package/dist/recipes/tools/index.d.ts +1 -0
package/dist/recipes/tools/index.js +1 -0
package/dist/recipes/tools/index.js.map +1 -1
package/dist/recipes/tools/slack.js +1 -1
package/dist/recipes/validation.js +2 -2
package/dist/recipes/validation.js.map +1 -1
package/dist/recipes/workspaceRoot.d.ts +37 -0
package/dist/recipes/workspaceRoot.js +73 -0
package/dist/recipes/workspaceRoot.js.map +1 -0
package/dist/recipes/yamlRunner.d.ts +72 -0
package/dist/recipes/yamlRunner.js +621 -295
package/dist/recipes/yamlRunner.js.map +1 -1
package/dist/runLog.d.ts +22 -0
package/dist/runLog.js +12 -1
package/dist/runLog.js.map +1 -1
package/dist/server.d.ts +14 -0
package/dist/server.js +36 -3
package/dist/server.js.map +1 -1
package/dist/tools/batchLsp.d.ts +3 -0
package/dist/tools/cancelClaudeTask.d.ts +1 -0
package/dist/tools/clipboard.d.ts +2 -0
package/dist/tools/closeTabs.d.ts +1 -0
package/dist/tools/codeLens.d.ts +1 -0
package/dist/tools/createIssueFromAIComment.d.ts +1 -0
package/dist/tools/ctxSaveTrace.d.ts +1 -0
package/dist/tools/debug.d.ts +4 -0
package/dist/tools/decorations.d.ts +2 -0
package/dist/tools/documentLinks.d.ts +1 -0
package/dist/tools/editText.d.ts +1 -0
package/dist/tools/enrichCommit.d.ts +1 -0
package/dist/tools/explainDiagnostic.d.ts +1 -0
package/dist/tools/explainSymbol.d.ts +1 -0
package/dist/tools/fileOperations.d.ts +3 -0
package/dist/tools/fileWatcher.d.ts +2 -0
package/dist/tools/findFiles.d.ts +1 -0
package/dist/tools/fixAllLintErrors.d.ts +1 -0
package/dist/tools/foldingRanges.d.ts +1 -0
package/dist/tools/formatDocument.d.ts +1 -0
package/dist/tools/generateTests.d.ts +1 -0
package/dist/tools/getAIComments.d.ts +1 -0
package/dist/tools/getBufferContent.d.ts +1 -0
package/dist/tools/getChangeImpact.d.ts +1 -0
package/dist/tools/getClaudeTaskStatus.d.ts +1 -0
package/dist/tools/getCodeCoverage.d.ts +1 -0
package/dist/tools/getCommitsForIssue.d.ts +1 -0
package/dist/tools/getDebugState.d.ts +1 -0
package/dist/tools/getDocumentSymbols.d.ts +1 -0
package/dist/tools/getGitHotspots.d.ts +1 -0
package/dist/tools/getImportedSignatures.d.ts +1 -0
package/dist/tools/getPRTemplate.d.ts +1 -0
package/dist/tools/getSymbolHistory.d.ts +1 -0
package/dist/tools/getTypeSignature.d.ts +1 -0
package/dist/tools/getWorkspaceSettings.d.ts +1 -0
package/dist/tools/gitWrite.d.ts +11 -0
package/dist/tools/github/actions.d.ts +2 -0
package/dist/tools/github/composite.d.ts +3 -0
package/dist/tools/github/issues.d.ts +4 -0
package/dist/tools/github/pr.d.ts +7 -0
package/dist/tools/handoffNote.d.ts +1 -0
package/dist/tools/hoverAtCursor.d.ts +1 -0
package/dist/tools/httpClient.d.ts +2 -0
package/dist/tools/inlayHints.d.ts +1 -0
package/dist/tools/launchQuickTask.d.ts +1 -0
package/dist/tools/listClaudeTasks.d.ts +1 -0
package/dist/tools/listTerminals.d.ts +1 -0
package/dist/tools/lsp.d.ts +15 -0
package/dist/tools/navigateToSymbolByName.d.ts +1 -0
package/dist/tools/openDiff.d.ts +1 -0
package/dist/tools/openFile.d.ts +1 -0
package/dist/tools/organizeImports.d.ts +1 -0
package/dist/tools/planPersistence.d.ts +3 -0
package/dist/tools/previewEdit.d.ts +1 -0
package/dist/tools/refactorAnalyze.d.ts +1 -0
package/dist/tools/refactorPreview.d.ts +1 -0
package/dist/tools/replaceBlock.d.ts +1 -0
package/dist/tools/resumeClaudeTask.d.ts +1 -0
package/dist/tools/runClaudeTask.d.ts +1 -0
package/dist/tools/screenshot.d.ts +1 -0
package/dist/tools/searchAndReplace.d.ts +1 -0
package/dist/tools/searchWorkspace.d.ts +1 -0
package/dist/tools/selectionRanges.d.ts +1 -0
package/dist/tools/semanticTokens.d.ts +1 -0
package/dist/tools/signatureHelp.d.ts +1 -0
package/dist/tools/terminal.d.ts +6 -0
package/dist/tools/testTraceToSource.d.ts +1 -0
package/dist/tools/transaction.d.ts +4 -0
package/dist/tools/typeHierarchy.d.ts +1 -0
package/dist/tools/utils.d.ts +18 -0
package/dist/tools/utils.js +28 -6
package/dist/tools/utils.js.map +1 -1
package/dist/tools/vscodeCommands.d.ts +2 -0
package/dist/tools/vscodeTasks.d.ts +2 -0
package/dist/tools/workspaceSettings.d.ts +1 -0
package/dist/transport.js +2 -2
package/dist/transport.js.map +1 -1
package/dist/wireHaltPushDispatch.d.ts +38 -0
package/dist/wireHaltPushDispatch.js +71 -0
package/dist/wireHaltPushDispatch.js.map +1 -0
package/package.json +1 -1

package/dist/recipes/yamlRunner.js (changed)

@@ -51,6 +51,7 @@ import { RunBudget } from "./runBudget.js";
 import { detectSilentFail } from "./stepObservation.js";
 // Import tool registry and trigger tool self-registration
 import { applyToolOutputContext, executeTool, getTool, hasTool, registerPluginTools, } from "./toolRegistry.js";
+import { resolveWorkspaceRoot } from "./workspaceRoot.js";
 import "./tools/index.js";
 /**
  * Bundled-templates directory used as a third allowed root for nested-recipe
@@ -127,7 +128,119 @@ export function evaluateExpect(result, expect) {
     }
     return failures;
 }
+/**
+ * Lazy AJV for `step.expect.schema`. Initialised on first use so recipes
+ * without schema assertions don't pay the import/compile cost.
+ *
+ * `_stepExpectAjv` is the module-level cache: it holds whatever
+ * `createAjv2020({ strict: false, allErrors: true })` returned and stays
+ * unset until `getStepExpectAjv()` has run once. While the cache is
+ * empty, `getStepExpectAjv()` dynamically imports `../ajv2020.js` and
+ * fills it; afterwards it resolves to the cached value without importing.
+ *
+ * NOTE(review): nothing serialises overlapping first calls — every caller
+ * that finds the cache empty runs the import and assigns, so the last
+ * assignment wins. Presumably harmless; confirm no caller depends on
+ * instance identity.
+ */
+let _stepExpectAjv;
+async function getStepExpectAjv() {
+    if (!_stepExpectAjv) {
+        const { createAjv2020 } = await import("../ajv2020.js");
+        _stepExpectAjv = createAjv2020({ strict: false, allErrors: true });
+    }
+    return _stepExpectAjv;
+}
+/**
+ * Stringify a step value for assertion purposes. Strings pass through
+ * unchanged; any other value goes through `JSON.stringify` so
+ * `matches`/`contains` see something stable. If `JSON.stringify` throws
+ * (e.g. circular references, BigInt), falls back to `String(value)`.
+ *
+ * NOTE(review): `JSON.stringify` returns `undefined` rather than throwing
+ * for `undefined`, functions and symbols, so the result is not always a
+ * string — a caller doing `.includes(...)` on it would throw. Confirm
+ * step values can never be `undefined` here, or coerce.
+ *
+ * @param {*} value - Step output to stringify.
+ * @returns {string} The string itself, its JSON text, or `String(value)`
+ *   (see the note above for the `undefined` edge case).
+ */
+function stringifyForAssert(value) {
+    if (typeof value === "string")
+        return value;
+    try {
+        return JSON.stringify(value);
+    }
+    catch {
+        return String(value);
+    }
+}
+/**
+ * Evaluate a per-step `expect` block against the step's output value.
+ * Returns the list of failure messages (empty = all assertions passed).
+ *
+ * Slice 2 of the agentic-workflow primitives. v1 supports
+ * schema/equals/matches/contains; `on_fail: judge` deliberately omitted —
+ * see comment on `StepExpect`.
+ *
+ * Assertions run in the order equals → contains → matches → schema, and
+ * each one is skipped when its key is `undefined`:
+ *  - `equals`: string comparison between the stringified output and the
+ *    expected value (stringified the same way unless already a string).
+ *  - `contains`: a single needle or an array of needles; every needle the
+ *    stringified output does not include adds its own failure.
+ *  - `matches`: pattern passed to `new RegExp(...)` and tested against the
+ *    stringified output.
+ *  - `schema`: string outputs are `JSON.parse`d first, other values are
+ *    validated as-is; validation errors are joined into one failure as
+ *    `<instancePath or "/"> <message>`, and anything thrown while
+ *    obtaining AJV, compiling or validating is reported as
+ *    `schema: compile error (...)` instead of propagating.
+ *
+ * Two paths return early with the failures collected so far: an invalid
+ * `matches` pattern (so `schema` is never evaluated in that case) and a
+ * string output that is not valid JSON when `schema` is set.
+ *
+ * NOTE(review): needles go straight to `String.prototype.includes`;
+ * presumably they are always strings — verify against the recipe schema.
+ * NOTE(review): `ajv.compile(expect.schema)` runs on every call; whether
+ * that is cached is up to the AJV instance — confirm.
+ *
+ * @param {object} expect - The step's `expect` block; the keys read here
+ *   are `equals`, `contains`, `matches` and `schema`.
+ * @param {*} value - The step's output value.
+ * @returns {Promise<string[]>} Failure messages, empty when all pass.
+ */
+export async function evaluateStepExpect(expect, value) {
+    const failures = [];
+    const asString = stringifyForAssert(value);
+    if (expect.equals !== undefined) {
+        const expected = expect.equals;
+        const expectedStr = typeof expected === "string" ? expected : stringifyForAssert(expected);
+        if (asString !== expectedStr) {
+            failures.push(`equals: expected ${JSON.stringify(expectedStr)}, got ${JSON.stringify(asString)}`);
+        }
+    }
+    if (expect.contains !== undefined) {
+        const needles = Array.isArray(expect.contains)
+            ? expect.contains
+            : [expect.contains];
+        for (const needle of needles) {
+            if (!asString.includes(needle)) {
+                failures.push(`contains: missing ${JSON.stringify(needle)}`);
+            }
+        }
+    }
+    if (expect.matches !== undefined) {
+        let re;
+        try {
+            re = new RegExp(expect.matches);
+        }
+        catch (err) {
+            failures.push(`matches: invalid regex ${JSON.stringify(expect.matches)} (${err instanceof Error ? err.message : String(err)})`);
+            return failures;
+        }
+        if (!re.test(asString)) {
+            failures.push(`matches: ${JSON.stringify(expect.matches)} did not match output`);
+        }
+    }
+    if (expect.schema !== undefined) {
+        let parsed;
+        try {
+            parsed = typeof value === "string" ? JSON.parse(value) : value;
+        }
+        catch {
+            failures.push(`schema: output is not valid JSON`);
+            return failures;
+        }
+        try {
+            const ajv = await getStepExpectAjv();
+            const validate = ajv.compile(expect.schema);
+            if (!validate(parsed)) {
+                const errs = (validate.errors ?? [])
+                    .map((e) => `${e.instancePath || "/"} ${e.message ?? "invalid"}`)
+                    .join("; ");
+                failures.push(`schema: ${errs || "validation failed"}`);
+            }
+        }
+        catch (err) {
+            failures.push(`schema: compile error (${err instanceof Error ? err.message : String(err)})`);
+        }
+    }
+    return failures;
+}
 // Strip tool-call narration some models (e.g. Gemini) prepend before the markdown block.
+/**
+ * Phase 0β — separator-agnostic inbox-path detector. Extracted so the
+ * Windows path-separator behaviour can be unit-tested by injecting
+ * `path.win32` / `path.posix` without booting a real recipe runner.
+ *
+ * Returns true when `candidate` resolves to a direct child of
+ * `inboxDirAbs`, isn't a dotfile, and lives in (not above) the inbox
+ * dir. Both arguments must already be platform-appropriate absolute
+ * paths (resolve them with the same path module before calling).
+ *
+ * Returns false when the relative path from `inboxDirAbs` to the
+ * resolved candidate is empty (the inbox dir itself), starts with `..`,
+ * is absolute (presumably the cross-drive case on Windows — confirm),
+ * or contains `pathMod.sep`, and when the candidate's basename starts
+ * with `.`.
+ *
+ * @param {string} candidate - Path to test; passed through
+ *   `pathMod.resolve` before comparison.
+ * @param {string} inboxDirAbs - Absolute inbox directory; used as-is.
+ * @param {object} pathMod - Path implementation providing `resolve`,
+ *   `relative`, `isAbsolute`, `basename` and `sep`.
+ * @returns {boolean} Whether `candidate` is a visible direct child of
+ *   the inbox directory.
+ */
+export function isInboxPathFor(candidate, inboxDirAbs, pathMod) {
+    const target = pathMod.resolve(candidate);
+    const rel = pathMod.relative(inboxDirAbs, target);
+    if (!rel || rel.startsWith("..") || pathMod.isAbsolute(rel))
+        return false;
+    if (pathMod.basename(target).startsWith("."))
+        return false;
+    // Only direct children — `~/.patchwork/inbox/foo.md`, not nested: a nested `rel` contains the separator.
+    return !rel.includes(pathMod.sep);
+}
 function stripLeadingNarration(text) {
     const lines = text.split("\n");
     const firstMarkdown = lines.findIndex((l) => /^(#|>|`|\||[-*+] |\d+\. |\*\*)/.test(l.trimStart()));
@@ -239,6 +352,84 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
         ...seedContext,
     };
     const stepDeps = resolveStepDeps(deps, { recipeName: recipe.name });
+    // Phase 0β — inbox provenance. When a recipe `file.write` / `file.append`
+    // step targets `~/.patchwork/inbox/`, prepend a YAML frontmatter block
+    // (first write only) recording recipe + run + trigger, and accumulate the
+    // delivered filename onto the run record's `inboxOutputs`. Old recipes /
+    // non-inbox paths pass through unchanged.
+    //
+    // Windows path-separator fix (CI repro 2026-05-20): the original
+    // implementation built the prefix as `${os.homedir()}/.patchwork/inbox/`
+    // and compared with `startsWith`, which failed on Windows where
+    // resolved absolute paths use `\` separators and `os.homedir()` returns
+    // `C:\Users\...`. Now we resolve both sides through `path.resolve()`
+    // and use `path.relative()` to detect containment so the comparison is
+    // separator-agnostic. Also case-insensitive on Win32 (NTFS).
+    const inboxDirAbs = path.resolve(path.join(os.homedir(), ".patchwork", "inbox"));
+    const inboxOutputs = [];
+    const isInboxPath = (abs) => isInboxPathFor(abs, inboxDirAbs, path);
+    const buildFrontmatter = () => {
+        const triggerKindAtWrite = yamlTriggerKind;
+        const lines = ["---", `recipe: ${recipe.name}`];
+        if (runSeq !== undefined)
+            lines.push(`runSeq: ${runSeq}`);
+        lines.push(`trigger: ${triggerKindAtWrite}`, `deliveredAt: ${new Date().toISOString()}`, "---", "", "");
+        return lines.join("\n");
+    };
+    const recordInboxDelivery = (abs) => {
+        inboxOutputs.push({
+            filename: path.basename(abs),
+            deliveredAt: Date.now(),
+        });
+    };
+    // Atomic read-or-default: a single `readFileSync` in a try/catch. No
+    // `existsSync`/`statSync` probe around the write — on Windows a stat
+    // immediately before write can race a concurrent fd holder and surface
+    // `EBUSY`/`EPERM`. The read either succeeds (file present) or throws
+    // ENOENT (treated as new file). Either way we never stat the same path
+    // we're about to write.
+    const readExistingOrEmpty = (abs) => {
+        try {
+            return readFileSync(abs, "utf-8");
+        }
+        catch {
+            return "";
+        }
+    };
+    const originalWrite = stepDeps.writeFile;
+    const originalAppend = stepDeps.appendFile;
+    stepDeps.writeFile = (p, content) => {
+        if (isInboxPath(p)) {
+            // First-write detection by content shape, not by stat. Empty string
+            // (ENOENT) and any file that does NOT already begin with `---\n`
+            // gets frontmatter; pre-frontmattered files are overwritten as-is
+            // so consumers can replay a recipe without doubling the header.
+            const existing = readExistingOrEmpty(p);
+            const hasFm = existing.startsWith("---\n");
+            const final = hasFm ? content : buildFrontmatter() + content;
+            originalWrite(p, final);
+            recordInboxDelivery(p);
+            return;
+        }
+        originalWrite(p, content);
+    };
+    stepDeps.appendFile = (p, content) => {
+        if (isInboxPath(p)) {
+            // file.append: never re-prepend. If file is brand-new, seed one
+            // frontmatter block so an append-only recipe still gets
+            // provenance. Same atomic read-or-default — no stat probe.
+            const existing = readExistingOrEmpty(p);
+            if (existing.length === 0) {
+                originalWrite(p, buildFrontmatter() + content);
+            }
+            else {
+                originalAppend(p, content);
+            }
+            recordInboxDelivery(p);
+            return;
+        }
+        originalAppend(p, content);
+    };
     // PR2b: one per-run budget shared across all agent steps. Absent
     // `recipe.budget` → no enforcement, no overhead.
     const runBudget = new RunBudget(recipe.budget);
@@ -316,348 +507,459 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
     // Track per-step start timestamps so done events carry durationMs
     // without a second roundtrip.
     const stepStartTs = new Map();
-    for (const step of recipe.steps) {
-        const stepIdForEmit = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
-        const stepTs = Date.now();
-        stepStartTs.set(stepIdForEmit, stepTs);
-        emit("recipe_step_start", {
+    // Emit recipe_step_done for the step result just pushed onto
+    // `stepResults`. Every loop branch (skip / budget / agent / tool)
+    // pushes exactly one result before it ends, so the last element is
+    // always the current step. `stepId` mirrors recipe_step_start's
+    // `stepIdForEmit` so live consumers can correlate start↔done — the
+    // pushed result's own id can diverge for agent steps without `into`.
+    const emitStepDone = (stepIdForEmit) => {
+        const justPushed = stepResults[stepResults.length - 1];
+        if (!justPushed)
+            return;
+        const haltReason = justPushed.haltReason;
+        emit("recipe_step_done", {
             runSeq,
             recipeName: recipe.name,
             stepId: stepIdForEmit,
-            tool: step.agent ? "agent" : step.tool,
-            ts: stepTs,
+            tool: justPushed.tool,
+            status: justPushed.status,
+            durationMs: justPushed.durationMs,
+            ...(justPushed.error !== undefined && { error: justPushed.error }),
+            ...(haltReason !== undefined && {
+                haltReason,
+                haltCategory: categoriseHaltReason(haltReason),
+            }),
+            ts: Date.now(),
         });
-        // Evaluate `when` guard before running anything. Mirrors
-        // chainedRunner.ts:248-266 — render the template, then truthy-check the
-        // result (empty string, "0", "false", "null", "undefined" are falsy).
-        // A falsy guard records the step as `skipped`, increments stepsRun, and
-        // continues — it is NOT a failure. Bridge-dev iMessage recipes rely on
-        // this to suppress the iMessage agent step when phone is empty.
-        if (typeof step.when === "string" && step.when.length > 0) {
-            const rendered = render(step.when, ctx).trim().toLowerCase();
-            const truthy = !!rendered &&
-                rendered !== "0" &&
-                rendered !== "false" &&
-                rendered !== "null" &&
-                rendered !== "undefined";
-            if (!truthy) {
-                const skipId = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
-                stepResults.push({
-                    id: skipId,
-                    tool: step.agent ? "agent" : step.tool,
-                    status: "skipped",
-                    durationMs: 0,
-                });
-                stepsRun++;
-                persistLiveStepResults();
-                emit("recipe_step_done", {
-                    runSeq,
-                    recipeName: recipe.name,
-                    stepId: skipId,
-                    tool: step.agent ? "agent" : step.tool,
-                    status: "skipped",
-                    durationMs: 0,
-                    ts: Date.now(),
-                });
-                continue;
-            }
-        }
-        // Handle agent steps separately
-        if (step.agent) {
-            const agentCfg = step.agent;
-            const isJudge = agentCfg.kind === "judge";
-            // PR3a: judge prompt convention. Append the structured-verdict
-            // suffix and, when `reviews: <stepId>` is set, inject the
-            // upstream step's output as an <artefact> block.
-            let renderedPrompt = render(agentCfg.prompt, ctx);
-            if (isJudge) {
-                if (agentCfg.reviews) {
-                    renderedPrompt += buildJudgeArtefactBlock(ctx[agentCfg.reviews]);
+    };
+    // The step loop is wrapped so an uncaught throw from any unguarded
+    // call site (a `when`/prompt render on a malformed step, a path-jail
+    // re-check, etc.) cannot escape `runYamlRecipe` and strand the
+    // run-log entry at "running" forever. On throw we capture the
+    // message into `runError` and fall through to the normal
+    // finalization path, which marks the run "error".
+    try {
+        for (const step of recipe.steps) {
+            const stepIdForEmit = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
+            const stepTs = Date.now();
+            stepStartTs.set(stepIdForEmit, stepTs);
+            emit("recipe_step_start", {
+                runSeq,
+                recipeName: recipe.name,
+                stepId: stepIdForEmit,
+                tool: step.agent ? "agent" : step.tool,
+                ts: stepTs,
+            });
+            // Evaluate `when` guard before running anything. Mirrors
+            // chainedRunner.ts:248-266 — render the template, then truthy-check the
+            // result (empty string, "0", "false", "null", "undefined" are falsy).
+            // A falsy guard records the step as `skipped`, increments stepsRun, and
+            // continues — it is NOT a failure. Bridge-dev iMessage recipes rely on
+            // this to suppress the iMessage agent step when phone is empty.
+            if (typeof step.when === "string" && step.when.length > 0) {
+                const rendered = render(step.when, ctx).trim().toLowerCase();
+                const truthy = !!rendered &&
+                    rendered !== "0" &&
+                    rendered !== "false" &&
+                    rendered !== "null" &&
+                    rendered !== "undefined";
+                if (!truthy) {
+                    const skipId = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
+                    stepResults.push({
+                        id: skipId,
+                        tool: step.agent ? "agent" : step.tool,
+                        status: "skipped",
+                        durationMs: 0,
+                    });
+                    stepsRun++;
+                    persistLiveStepResults();
+                    emit("recipe_step_done", {
+                        runSeq,
+                        recipeName: recipe.name,
+                        stepId: skipId,
+                        tool: step.agent ? "agent" : step.tool,
+                        status: "skipped",
+                        durationMs: 0,
+                        ts: Date.now(),
+                    });
+                    continue;
                 }
-                renderedPrompt += JUDGE_PROMPT_SUFFIX;
             }
-            const intoKey = agentCfg.into ?? "agent_output";
-            const stepId = intoKey;
-            const stepStart = Date.now();
-            let agentResult;
-            // PR2b: per-recipe token budget. Admission check before dispatch;
-            // reconcile actual consumption after. Subscription drivers
-            // (Claude CLI, provider subprocess) report `usage === undefined`
-            // — `RunBudget.reconcile` records a fail-open warning per driver
-            // per run and continues.
-            const admission = runBudget.admit();
-            if (!admission.admitted) {
-                const reason = admission.reason ??
-                    "Run exceeded its token budget — budget_exceeded.";
-                runError = runError ?? reason;
-                stepResults.push({
-                    id: stepId,
-                    tool: "agent",
-                    status: "error",
-                    error: reason,
-                    haltReason: reason,
-                    durationMs: 0,
-                });
-                stepsRun++;
-                persistLiveStepResults();
-                continue;
-            }
-            try {
-                const agentReturn = await _executeAgent({
-                    prompt: renderedPrompt,
-                    driver: agentCfg.driver === "api" ? "anthropic" : agentCfg.driver,
-                    model: agentCfg.model,
-                    ...(agentCfg.mcpAccess !== undefined && {
-                        mcpAccess: agentCfg.mcpAccess,
-                    }),
-                }, buildAgentExecutorDeps(stepDeps, deps));
-                agentResult = agentReturn.text;
-                runBudget.reconcile(agentCfg.driver === "api" ? "anthropic" : (agentCfg.driver ?? "auto"), agentReturn.usage);
-                // Catch both `[agent step failed: ...]` (existing) and the
-                // silent-fail patterns `[agent step skipped: ...]` etc. via the
-                // shared detector. Per-step opt-out via `silentFailDetection: false`.
-                const agentSilentFail = step.silentFailDetection !== false
-                    ? detectSilentFail(agentResult)
-                    : null;
-                if (agentResult.startsWith("[agent step failed:") || agentSilentFail) {
-                    const reason = agentSilentFail
-                        ? `silent-fail detected (${agentSilentFail.reason}): ${agentSilentFail.matched}`
-                        : agentResult;
+            // Handle agent steps separately
+            if (step.agent) {
+                const agentCfg = step.agent;
+                const isJudge = agentCfg.kind === "judge";
+                // PR3a: judge prompt convention. Append the structured-verdict
+                // suffix and, when `reviews: <stepId>` is set, inject the
+                // upstream step's output as an <artefact> block.
+                let renderedPrompt = render(agentCfg.prompt, ctx);
+                if (isJudge) {
+                    if (agentCfg.reviews) {
+                        renderedPrompt += buildJudgeArtefactBlock(ctx[agentCfg.reviews]);
+                    }
+                    renderedPrompt += JUDGE_PROMPT_SUFFIX;
+                }
+                const intoKey = agentCfg.into ?? "agent_output";
+                const stepId = intoKey;
+                const stepStart = Date.now();
+                let agentResult;
+                // PR2b: per-recipe token budget. Admission check before dispatch;
+                // reconcile actual consumption after. Subscription drivers
+                // (Claude CLI, provider subprocess) report `usage === undefined`
+                // — `RunBudget.reconcile` records a fail-open warning per driver
+                // per run and continues.
+                const admission = runBudget.admit();
+                if (!admission.admitted) {
+                    const reason = admission.reason ??
+                        "Run exceeded its token budget — budget_exceeded.";
                     runError = runError ?? reason;
                     stepResults.push({
                         id: stepId,
                         tool: "agent",
                         status: "error",
                         error: reason,
-                        haltReason: agentSilentFail
-                            ? `Agent step "${stepId}" returned no usable output (silent-fail: ${agentSilentFail.reason}).`
-                            : `Agent step "${stepId}" reported failure.`,
-                        durationMs: Date.now() - stepStart,
+                        haltReason: reason,
+                        durationMs: 0,
                     });
+                    stepsRun++;
+                    persistLiveStepResults();
+                    emitStepDone(stepIdForEmit);
+                    continue;
                 }
-                else {
-                    const stripped = stripLeadingNarration(agentResult);
-                    if (!stripped.trim()) {
-                        const errMsg = `[agent step failed: ${agentCfg.driver ?? "agent"} returned only narration or whitespace — no content]`;
-                        runError = runError ?? errMsg;
+                try {
+                    const agentReturn = await _executeAgent({
+                        prompt: renderedPrompt,
+                        driver: agentCfg.driver === "api" ? "anthropic" : agentCfg.driver,
+                        model: agentCfg.model,
+                        ...(agentCfg.mcpAccess !== undefined && {
+                            mcpAccess: agentCfg.mcpAccess,
+                        }),
+                    }, buildAgentExecutorDeps(stepDeps, deps));
+                    agentResult = agentReturn.text;
+                    runBudget.reconcile(agentCfg.driver === "api"
+                        ? "anthropic"
+                        : (agentCfg.driver ?? "auto"), agentReturn.usage);
+                    // Catch both `[agent step failed: ...]` (existing) and the
+                    // silent-fail patterns `[agent step skipped: ...]` etc. via the
+                    // shared detector. Per-step opt-out via `silentFailDetection: false`.
+                    const agentSilentFail = step.silentFailDetection !== false
+                        ? detectSilentFail(agentResult)
+                        : null;
+                    if (agentResult.startsWith("[agent step failed:") ||
+                        agentSilentFail) {
+                        const reason = agentSilentFail
+                            ? `silent-fail detected (${agentSilentFail.reason}): ${agentSilentFail.matched}`
+                            : agentResult;
+                        runError = runError ?? reason;
                         stepResults.push({
                             id: stepId,
                             tool: "agent",
                             status: "error",
-                            error: errMsg,
-                            haltReason: `Agent step "${stepId}" returned only narration or whitespace — no content.`,
+                            error: reason,
+                            haltReason: agentSilentFail
+                                ? `Agent step "${stepId}" returned no usable output (silent-fail: ${agentSilentFail.reason}).`
+                                : `Agent step "${stepId}" reported failure.`,
                             durationMs: Date.now() - stepStart,
                         });
                     }
                     else {
-                        // Try to parse as JSON so dot-notation ({{meeting.field}}) works
+                        const stripped = stripLeadingNarration(agentResult);
+                        if (!stripped.trim()) {
+                            const errMsg = `[agent step failed: ${agentCfg.driver ?? "agent"} returned only narration or whitespace — no content]`;
+                            runError = runError ?? errMsg;
+                            stepResults.push({
+                                id: stepId,
+                                tool: "agent",
+                                status: "error",
+                                error: errMsg,
+                                haltReason: `Agent step "${stepId}" returned only narration or whitespace — no content.`,
+                                durationMs: Date.now() - stepStart,
+                            });
+                        }
+                        else {
+                            // Try to parse as JSON so dot-notation ({{meeting.field}}) works
+                            try {
+                                const jsonMatch = /```(?:json)?\s*([\s\S]*?)```/.exec(stripped) ?? [null, stripped];
+                                const parsed = sanitizeParsed(JSON.parse((jsonMatch[1] ?? "").trim()));
+                                ctx[intoKey] = parsed;
+                            }
+                            catch {
+                                ctx[intoKey] = stripped;
+                            }
+                            outputs.push(intoKey);
+                            // PR3a: parse + stash the judge verdict on the step result.
+                            // Augment-only: a `request_changes` verdict still yields
+                            // `status: "ok"`. The verdict surfaces via the runlog +
+                            // future PR3b dashboard panel, but never gates the run.
+                            const judgeVerdict = isJudge
+                                ? parseJudgeVerdict(stripped)
+                                : undefined;
+                            stepResults.push({
+                                id: stepId,
+                                tool: "agent",
+                                status: "ok",
+                                ...(judgeVerdict !== undefined && { judgeVerdict }),
+                                durationMs: Date.now() - stepStart,
+                            });
+                            // Slice 2 — per-step expect eval. Runs on the value just
+                            // committed to ctx[intoKey]. Halt failure flips the just-pushed
+                            // result to error and rolls back the ctx commit so downstream
+                            // steps don't see a value the recipe author rejected.
+                            if (step.expect) {
+                                const failures = await evaluateStepExpect(step.expect, ctx[intoKey]);
+                                if (failures.length > 0) {
+                                    const onFail = step.expect.on_fail ?? "halt";
+                                    const last = stepResults[stepResults.length - 1];
+                                    if (last) {
+                                        if (onFail === "halt") {
+                                            last.status = "error";
+                                            last.error = `expect_failed: ${failures.join("; ")}`;
+                                            last.haltReason = `expect_failed in step "${stepId}": ${failures.join("; ")}`;
+                                            const fbk = recipe.on_error?.fallback;
+                                            const fbkOpen = fbk === "log_only" || fbk === "deliver_original";
+                                            const failOpenAgent = step.optional === true || fbkOpen;
+                                            if (!failOpenAgent) {
+                                                runError = runError ?? last.haltReason;
+                                            }
+                                            delete ctx[intoKey];
+                                        }
+                                        else {
+                                            last.expectWarnings = failures;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                catch (err) {
+                    const msg = err instanceof Error ? err.message : String(err);
+                    runError = runError ?? `agent step "${stepId}" failed: ${msg}`;
+                    stepResults.push({
+                        id: stepId,
+                        tool: "agent",
+                        status: "error",
+                        error: msg,
+                        haltReason: `Agent step "${stepId}" threw before completing: ${msg}`,
+                        durationMs: Date.now() - stepStart,
+                    });
+                }
+                stepsRun++;
+                persistLiveStepResults();
+                emitStepDone(stepIdForEmit);
+                continue;
+            }
+            const stepStart = Date.now();
+            const stepId = step.into ?? `step_${stepsRun}`;
+            // Resolve retry policy: step-level overrides recipe-level.
+            const retryCount = step.retry ?? recipe.on_error?.retry ?? 0;
+            const retryDelayMs = step.retryDelay ?? recipe.on_error?.retryDelay ?? 1000;
+            let result = null;
+            let stepError;
+            let thrownError;
+            let thrownErrorCode;
+            for (let attempt = 0; attempt <= retryCount; attempt++) {
+                if (attempt > 0) {
+                    await new Promise((r) => setTimeout(r, retryDelayMs));
+                }
+                stepError = undefined;
+                thrownError = undefined;
+                thrownErrorCode = undefined;
+                try {
+                    // Slice (sandbox-alternative): per-step wall-clock timeout via
+                    // Promise.race. The underlying tool keeps running in the
+                    // background — this is a halt signal for the runner, not a
+                    // process kill. The thrown error carries a `step_timeout`
+                    // prefix so categoriseHaltReason maps it correctly.
+                    const timeoutMs = typeof step.timeout_ms === "number" && step.timeout_ms > 0
+                        ? step.timeout_ms
+                        : 0;
+                    if (timeoutMs > 0) {
+                        let timer;
+                        const timeoutPromise = new Promise((_, reject) => {
+                            timer = setTimeout(() => {
+                                reject(new Error(`step_timeout: exceeded ${timeoutMs}ms in step "${step.into ?? step.tool ?? "?"}"`));
+                            }, timeoutMs);
+                        });
+                        try {
+                            result = await Promise.race([
+                                executeStep(step, ctx, stepDeps),
+                                timeoutPromise,
+                            ]);
+                        }
+                        finally {
+                            if (timer)
+                                clearTimeout(timer);
+                        }
+                    }
+                    else {
+                        result = await executeStep(step, ctx, stepDeps);
+                    }
+                    // Detect tool-level errors reported as JSON {ok: false, error: ...}
+                    if (result !== null) {
                         try {
-                            const jsonMatch = /```(?:json)?\s*([\s\S]*?)```/.exec(stripped) ?? [null, stripped];
-                            const parsed = sanitizeParsed(JSON.parse((jsonMatch[1] ?? "").trim()));
-                            ctx[intoKey] = parsed;
+                            const parsed = JSON.parse(result);
+                            if (parsed.ok === false && typeof parsed.error === "string") {
+                                stepError = parsed.error;
+                            }
                         }
                         catch {
-                            ctx[intoKey] = stripped;
+                            /* non-JSON result is fine */
                         }
-                        outputs.push(intoKey);
-                        // PR3a: parse + stash the judge verdict on the step result.
-                        // Augment-only: a `request_changes` verdict still yields
-                        // `status: "ok"`. The verdict surfaces via the runlog +
-                        // future PR3b dashboard panel, but never gates the run.
-                        const judgeVerdict = isJudge
-                            ? parseJudgeVerdict(stripped)
-                            : undefined;
-                        stepResults.push({
-                            id: stepId,
-                            tool: "agent",
-                            status: "ok",
-                            ...(judgeVerdict !== undefined && { judgeVerdict }),
-                            durationMs: Date.now() - stepStart,
-                        });
                     }
+                    // Silent-fail detection: tools that return string placeholders
+                    // (`(git branches unavailable)`, `[agent step skipped: ...]`)
+                    // or empty list-tool error shapes (`{count:0,error:"..."}`)
+                    // succeed with bad data — flag them as `error` so the runner
+                    // doesn't quietly hand garbage to a downstream agent. Per-step
+                    // opt-out via `silentFailDetection: false`.
+                    if (!stepError &&
+                        result !== null &&
+                        step.silentFailDetection !== false) {
+                        const detected = detectSilentFail(result);
+                        if (detected) {
+                            stepError = `silent-fail detected (${detected.reason}): ${detected.matched}`;
+                        }
+                    }
+                }
+                catch (err) {
+                    thrownError = err instanceof Error ? err.message : String(err);
+                    // Preserve structured error codes (e.g. recipe_path_jail_escape)
+                    // so callers and tests can branch on `err.code` per R2 M-4
+                    // without scraping the message string.
+                    const code = err?.code;
+                    if (typeof code === "string")
+                        thrownErrorCode = code;
+                    result = null;
                 }
+                if (!stepError && !thrownError)
+                    break;
             }
-            catch (err) {
-                const msg = err instanceof Error ? err.message : String(err);
-                runError = runError ?? `agent step "${stepId}" failed: ${msg}`;
+            // Recipe-level fallback: log_only / deliver_original treat step failure
+            // as non-fatal (fail-open) — same semantics as step-level optional: true.
+            const fallback = recipe.on_error?.fallback;
+            const fallbackFailOpen = fallback === "log_only" || fallback === "deliver_original";
+            const failOpen = step.optional === true || fallbackFailOpen;
+            if (thrownError) {
+                const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
                 stepResults.push({
                     id: stepId,
-                    tool: "agent",
+                    tool: step.tool,
                     status: "error",
-                    error: msg,
-                    haltReason: `Agent step "${stepId}" threw before completing: ${msg}`,
+                    error: thrownError,
+                    ...(thrownErrorCode ? { errorCode: thrownErrorCode } : {}),
+                    haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" threw${retryNote}: ${thrownError}`,
                     durationMs: Date.now() - stepStart,
                 });
+                if (!failOpen) {
+                    runError = runError ?? `${step.tool} failed: ${thrownError}`;
+                }
+                else if (fallbackFailOpen && !step.optional) {
+                    console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${thrownError}`);
+                }
             }
-            stepsRun++;
-            persistLiveStepResults();
-            continue;
-        }
-        const stepStart = Date.now();
-        const stepId = step.into ?? `step_${stepsRun}`;
-        // Resolve retry policy: step-level overrides recipe-level.
-        const retryCount = step.retry ?? recipe.on_error?.retry ?? 0;
-        const retryDelayMs = step.retryDelay ?? recipe.on_error?.retryDelay ?? 1000;
-        let result = null;
-        let stepError;
-        let thrownError;
-        let thrownErrorCode;
-        for (let attempt = 0; attempt <= retryCount; attempt++) {
-            if (attempt > 0) {
-                await new Promise((r) => setTimeout(r, retryDelayMs));
+            else {
+                const finalStatus = result === null ? "skipped" : stepError ? "error" : "ok";
+                const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
+                stepResults.push({
+                    id: stepId,
+                    tool: step.tool,
+                    status: finalStatus,
+                    error: stepError,
+                    ...(finalStatus === "error" && stepError
+                        ? {
+                            haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" reported an error${retryNote}: ${stepError}`,
+                        }
+                        : {}),
+                    durationMs: Date.now() - stepStart,
+                });
+                if (stepError) {
+                    if (!failOpen) {
+                        runError = runError ?? `${step.tool} failed: ${stepError}`;
+                    }
+                    else if (fallbackFailOpen && !step.optional) {
+                        console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${stepError}`);
+                    }
+                }
             }
-            stepError = undefined;
-            thrownError = undefined;
-            thrownErrorCode = undefined;
-            try {
-                result = await executeStep(step, ctx, stepDeps);
-                // Detect tool-level errors reported as JSON {ok: false, error: ...}
-                if (result !== null) {
+            stepsRun++;
+            if (result !== null) {
+                // Apply transform if present — render template with $result injected
+                if (step.transform) {
                     try {
-                        const parsed = JSON.parse(result);
-                        if (parsed.ok === false && typeof parsed.error === "string") {
-                            stepError = parsed.error;
-                        }
+                        result = render(step.transform, { ...ctx, $result: result });
                     }
-                    catch {
-                        /* non-JSON result is fine */
+                    catch (err) {
+                        // warn but fall through with original result
+                        console.warn(`transform failed for step ${step.into ?? step.tool ?? "?"}: ${err}`);
                     }
                 }
-                // Silent-fail detection: tools that return string placeholders
-                // (`(git branches unavailable)`, `[agent step skipped: ...]`)
-                // or empty list-tool error shapes (`{count:0,error:"..."}`)
-                // succeed with bad data — flag them as `error` so the runner
-                // doesn't quietly hand garbage to a downstream agent. Per-step
-                // opt-out via `silentFailDetection: false`.
-                if (!stepError &&
-                    result !== null &&
-                    step.silentFailDetection !== false) {
-                    const detected = detectSilentFail(result);
-                    if (detected) {
-                        stepError = `silent-fail detected (${detected.reason}): ${detected.matched}`;
+                // Slice 2 — per-step expect eval. Runs on the post-transform value
+                // (what would land in ctx) and only when the step otherwise succeeded.
+                // Halt failure flips the just-pushed result to error and suppresses
+                // the ctx commit by nulling `result` so the downstream `if (step.into)`
+                // block skips. Composes with `optional: true` / `on_error.fallback`.
+                if (step.expect && !thrownError && !stepError && result !== null) {
+                    const failures = await evaluateStepExpect(step.expect, result);
+                    if (failures.length > 0) {
+                        const onFail = step.expect.on_fail ?? "halt";
+                        const last = stepResults[stepResults.length - 1];
+                        if (last) {
+                            if (onFail === "halt") {
+                                last.status = "error";
+                                last.error = `expect_failed: ${failures.join("; ")}`;
+                                last.haltReason = `expect_failed in step "${stepId}": ${failures.join("; ")}`;
+                                if (!failOpen) {
+                                    runError = runError ?? last.haltReason;
+                                }
+                                result = null;
+                            }
+                            else {
+                                last.expectWarnings = failures;
+                            }
+                        }
                     }
                 }
-            }
-            catch (err) {
-                thrownError = err instanceof Error ? err.message : String(err);
-                // Preserve structured error codes (e.g. recipe_path_jail_escape)
-                // so callers and tests can branch on `err.code` per R2 M-4
-                // without scraping the message string.
-                const code = err?.code;
-                if (typeof code === "string")
-                    thrownErrorCode = code;
-                result = null;
-            }
-            if (!stepError && !thrownError)
-                break;
-        }
-        // Recipe-level fallback: log_only / deliver_original treat step failure
-        // as non-fatal (fail-open) — same semantics as step-level optional: true.
-        const fallback = recipe.on_error?.fallback;
-        const fallbackFailOpen = fallback === "log_only" || fallback === "deliver_original";
-        const failOpen = step.optional === true || fallbackFailOpen;
-        if (thrownError) {
-            const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
-            stepResults.push({
-                id: stepId,
-                tool: step.tool,
-                status: "error",
-                error: thrownError,
-                ...(thrownErrorCode ? { errorCode: thrownErrorCode } : {}),
-                haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" threw${retryNote}: ${thrownError}`,
-                durationMs: Date.now() - stepStart,
-            });
-            if (!failOpen) {
-                runError = runError ?? `${step.tool} failed: ${thrownError}`;
-            }
-            else if (fallbackFailOpen && !step.optional) {
-                console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${thrownError}`);
-            }
-        }
-        else {
-            const finalStatus = result === null ? "skipped" : stepError ? "error" : "ok";
-            const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
-            stepResults.push({
-                id: stepId,
-                tool: step.tool,
-                status: finalStatus,
-                error: stepError,
-                ...(finalStatus === "error" && stepError
-                    ? {
-                        haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" reported an error${retryNote}: ${stepError}`,
+                if (result !== null && step.into) {
+                    ctx[step.into] = result;
+                    if (step.tool) {
+                        applyToolOutputContext(step.tool, step.into, result, ctx);
                     }
-                    : {}),
-                durationMs: Date.now() - stepStart,
-            });
-            if (stepError) {
-                if (!failOpen) {
-                    runError = runError ?? `${step.tool} failed: ${stepError}`;
                 }
-                else if (fallbackFailOpen && !step.optional) {
-                    console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${stepError}`);
+                if (step.tool === "file.write" || step.tool === "file.append") {
+                    // R2 C-1 / F-02: re-validate the rendered path against the jail so a
+                    // template substitution that survived earlier checks (e.g. via a
+                    // chained sub-recipe deps override) cannot smuggle an out-of-jail
+                    // path into the run log / dashboard outputs list.
+                    const renderedPath = render(step.path, ctx);
+                    outputs.push(resolveRecipePath(renderedPath, {
+                        workspace: stepDeps.workdir,
+                        write: true,
+                    }));
                 }
             }
+            persistLiveStepResults();
+            emitStepDone(stepIdForEmit);
         }
-        stepsRun++;
-        if (result !== null) {
-            // Apply transform if present — render template with $result injected
-            if (step.transform) {
-                try {
-                    result = render(step.transform, { ...ctx, $result: result });
-                }
-                catch (err) {
-                    // warn but fall through with original result
-                    console.warn(`transform failed for step ${step.into ?? step.tool ?? "?"}: ${err}`);
-                }
-            }
-            if (step.into) {
-                ctx[step.into] = result;
-                if (step.tool) {
-                    applyToolOutputContext(step.tool, step.into, result, ctx);
-                }
-            }
-            if (step.tool === "file.write" || step.tool === "file.append") {
-                // R2 C-1 / F-02: re-validate the rendered path against the jail so a
-                // template substitution that survived earlier checks (e.g. via a
-                // chained sub-recipe deps override) cannot smuggle an out-of-jail
-                // path into the run log / dashboard outputs list.
-                const renderedPath = render(step.path, ctx);
-                outputs.push(resolveRecipePath(renderedPath, {
-                    workspace: stepDeps.workdir,
-                    write: true,
-                }));
-            }
+    }
+    catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        runError = runError ?? `recipe run aborted: ${msg}`;
+    }
+    // Evaluate expect block before persisting so failures are stored in the
+    // run log. Guarded: a throw here must not skip finalization and strand
+    // the run at "running".
+    let assertionFailures = [];
+    if (recipe.expect) {
+        try {
+            assertionFailures = evaluateExpect({ stepsRun, outputs, context: ctx, errorMessage: runError }, recipe.expect);
         }
-        persistLiveStepResults();
-        // Emit recipe_step_done for live-tail SSE. Look up the matching
-        // entry in stepResults (the loop pushed at most one with this id);
-        // payload mirrors chainedRunner's done event plus haltCategory.
-        const justPushed = stepResults
-            .slice()
-            .reverse()
-            .find((r) => r.id === stepIdForEmit);
-        if (justPushed) {
-            const haltReason = justPushed.haltReason;
-            emit("recipe_step_done", {
-                runSeq,
-                recipeName: recipe.name,
-                stepId: justPushed.id,
-                tool: justPushed.tool,
-                status: justPushed.status,
-                durationMs: justPushed.durationMs,
-                ...(justPushed.error !== undefined && { error: justPushed.error }),
-                ...(haltReason !== undefined && {
-                    haltReason,
-                    haltCategory: categoriseHaltReason(haltReason),
-                }),
-                ts: Date.now(),
-            });
+        catch (err) {
+            const msg = err instanceof Error ? err.message : String(err);
+            runError = runError ?? `expect evaluation failed: ${msg}`;
         }
     }
-    // Evaluate expect block before persisting so failures are stored in the run log
-    const assertionFailures = recipe.expect
-        ? evaluateExpect({ stepsRun, outputs, context: ctx, errorMessage: runError }, recipe.expect)
-        : [];
     // Write to RecipeRunLog so the dashboard Runs page shows this execution.
     // Bridge path: completeRun on the running entry opened above (live-tail).
     // CLI path: construct a local log + appendDirect (no live-tail).
@@ -686,6 +988,7 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
                     outputTail,
                     ...(runError !== undefined && { errorMessage: runError }),
                     ...(assertionFailures.length > 0 ? { assertionFailures } : {}),
+                    ...(inboxOutputs.length > 0 ? { inboxOutputs } : {}),
                 });
                 emit("recipe_done", {
                     runSeq,
@@ -693,6 +996,10 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
                     status: runError ? "error" : "done",
                     durationMs: doneAt - recipeStartedAt,
                     stepCount: finalStepResults.length,
+                    // A `done` run can still carry step errors — the runner
+                    // continues past a non-fatal step failure. Surface it so
+                    // live consumers can show "completed with errors".
+                    hadStepErrors: finalStepResults.some((s) => s.status === "error"),
                     ...(runError !== undefined && { errorMessage: runError }),
                     ...(assertionFailures.length > 0 && {
                         assertionFailureCount: assertionFailures.length,
@@ -718,6 +1025,7 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
                     errorMessage: runError,
                     stepResults: finalStepResults,
                     ...(assertionFailures.length > 0 ? { assertionFailures } : {}),
+                    ...(inboxOutputs.length > 0 ? { inboxOutputs } : {}),
                 });
             }
         }
@@ -772,11 +1080,19 @@ export async function executeStep(step, ctx, deps) {
     // Check if tool is registered in the new registry
     if (hasTool(toolId)) {
         const tool = getTool(toolId);
-        // Build params with template rendering for string values
+        // Build params with template rendering for string values.
+        // `do` is left raw: it carries a nested sub-step template (used by
+        // `fan_out`) whose `{{item.*}}` placeholders must be rendered per-iter
+        // with the loop variable in scope, not pre-rendered against the outer
+        // ctx (which would resolve them to empty strings).
         const params = {};
         for (const [key, value] of Object.entries(step)) {
             if (key === "tool" || key === "agent" || key === "into")
                 continue;
+            if (key === "do") {
+                params[key] = value;
+                continue;
+            }
             params[key] = deepRender(value, ctx);
         }
         // Check if mock connector is available for this tool
@@ -1096,8 +1412,16 @@ export function resolveClaudeBinary() {
     }
     return ensureCmdShim("claude");
 }
-function defaultClaudeCodeFn(prompt, _opts) {
+export function defaultClaudeCodeFn(prompt, _opts) {
     const binary = resolveClaudeBinary();
+    // Resolve a workspace cwd so the spawned `claude -p` doesn't inherit the
+    // bridge LaunchAgent's `$HOME` (P2 from the 2026-05-20 research run).
+    // When nothing resolves, surface a typed reason instead of silently
+    // shelling out from the wrong directory.
+    const workspace = resolveWorkspaceRoot();
+    if (!workspace) {
+        return Promise.resolve(`[agent step failed: recipe_no_workspace — no .git ancestor of "${process.cwd()}" and PATCHWORK_WORKSPACE not set. Set PATCHWORK_WORKSPACE in the bridge environment or add a 'workspace:' field to the recipe.]`);
+    }
     try {
         const result = spawnSync(binary, [
             "-p",
@@ -1106,6 +1430,7 @@ function defaultClaudeCodeFn(prompt, _opts) {
             "You are a helpful assistant processing a recipe task. Use ONLY the data explicitly provided in the user message — treat it as ground truth. Do not call tools to look up git history, emails, or any other information; all necessary data is already included.",
             "--no-session-persistence",
         ], {
+            cwd: workspace.path,
             encoding: "utf-8",
             timeout: 120_000,
             maxBuffer: 10 * 1024 * 1024,
@@ -1144,10 +1469,11 @@ function makeProviderDriverFn() {
             const timeoutMs = 300_000;
             const startupTimeoutMs = 30_000;
             const timeout = setTimeout(() => controller.abort(), timeoutMs);
+            const resolvedWorkspace = process.cwd();
             try {
                 const result = await driver.run({
                     prompt,
-                    workspace: process.cwd(),
+                    workspace: resolvedWorkspace,
                     timeoutMs,
                     startupTimeoutMs,
                     signal: controller.signal,