patchwork-os 0.2.0-beta.5.canary.94 → 0.2.0-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/dist/ajv2020.d.ts +25 -0
  2. package/dist/ajv2020.js +33 -0
  3. package/dist/ajv2020.js.map +1 -0
  4. package/dist/approvalQueue.d.ts +17 -0
  5. package/dist/approvalQueue.js.map +1 -1
  6. package/dist/bridge.js +16 -0
  7. package/dist/bridge.js.map +1 -1
  8. package/dist/commands/recipeInstall.js +5 -1
  9. package/dist/commands/recipeInstall.js.map +1 -1
  10. package/dist/commands/tools.d.ts +20 -1
  11. package/dist/commands/tools.js +112 -3
  12. package/dist/commands/tools.js.map +1 -1
  13. package/dist/haltPushDispatch.d.ts +33 -0
  14. package/dist/haltPushDispatch.js +103 -0
  15. package/dist/haltPushDispatch.js.map +1 -0
  16. package/dist/inboxRoutes.d.ts +22 -0
  17. package/dist/inboxRoutes.js +61 -1
  18. package/dist/inboxRoutes.js.map +1 -1
  19. package/dist/index.js +8 -0
  20. package/dist/index.js.map +1 -1
  21. package/dist/oauthRoutes.d.ts +1 -1
  22. package/dist/oauthRoutes.js +2 -2
  23. package/dist/recipeRoutes.js +133 -65
  24. package/dist/recipeRoutes.js.map +1 -1
  25. package/dist/recipes/githubInstallSource.d.ts +66 -0
  26. package/dist/recipes/githubInstallSource.js +85 -4
  27. package/dist/recipes/githubInstallSource.js.map +1 -1
  28. package/dist/recipes/haltCategory.d.ts +4 -0
  29. package/dist/recipes/haltCategory.js +6 -0
  30. package/dist/recipes/haltCategory.js.map +1 -1
  31. package/dist/recipes/names.d.ts +20 -0
  32. package/dist/recipes/names.js +25 -0
  33. package/dist/recipes/names.js.map +1 -1
  34. package/dist/recipes/parser.js +7 -2
  35. package/dist/recipes/parser.js.map +1 -1
  36. package/dist/recipes/stepObservation.js +9 -0
  37. package/dist/recipes/stepObservation.js.map +1 -1
  38. package/dist/recipes/tools/fanOut.d.ts +20 -0
  39. package/dist/recipes/tools/fanOut.js +199 -0
  40. package/dist/recipes/tools/fanOut.js.map +1 -0
  41. package/dist/recipes/tools/index.d.ts +1 -0
  42. package/dist/recipes/tools/index.js +1 -0
  43. package/dist/recipes/tools/index.js.map +1 -1
  44. package/dist/recipes/tools/slack.js +1 -1
  45. package/dist/recipes/validation.js +2 -2
  46. package/dist/recipes/validation.js.map +1 -1
  47. package/dist/recipes/workspaceRoot.d.ts +37 -0
  48. package/dist/recipes/workspaceRoot.js +73 -0
  49. package/dist/recipes/workspaceRoot.js.map +1 -0
  50. package/dist/recipes/yamlRunner.d.ts +72 -0
  51. package/dist/recipes/yamlRunner.js +621 -295
  52. package/dist/recipes/yamlRunner.js.map +1 -1
  53. package/dist/runLog.d.ts +22 -0
  54. package/dist/runLog.js +12 -1
  55. package/dist/runLog.js.map +1 -1
  56. package/dist/server.d.ts +14 -0
  57. package/dist/server.js +36 -3
  58. package/dist/server.js.map +1 -1
  59. package/dist/tools/batchLsp.d.ts +3 -0
  60. package/dist/tools/cancelClaudeTask.d.ts +1 -0
  61. package/dist/tools/clipboard.d.ts +2 -0
  62. package/dist/tools/closeTabs.d.ts +1 -0
  63. package/dist/tools/codeLens.d.ts +1 -0
  64. package/dist/tools/createIssueFromAIComment.d.ts +1 -0
  65. package/dist/tools/ctxSaveTrace.d.ts +1 -0
  66. package/dist/tools/debug.d.ts +4 -0
  67. package/dist/tools/decorations.d.ts +2 -0
  68. package/dist/tools/documentLinks.d.ts +1 -0
  69. package/dist/tools/editText.d.ts +1 -0
  70. package/dist/tools/enrichCommit.d.ts +1 -0
  71. package/dist/tools/explainDiagnostic.d.ts +1 -0
  72. package/dist/tools/explainSymbol.d.ts +1 -0
  73. package/dist/tools/fileOperations.d.ts +3 -0
  74. package/dist/tools/fileWatcher.d.ts +2 -0
  75. package/dist/tools/findFiles.d.ts +1 -0
  76. package/dist/tools/fixAllLintErrors.d.ts +1 -0
  77. package/dist/tools/foldingRanges.d.ts +1 -0
  78. package/dist/tools/formatDocument.d.ts +1 -0
  79. package/dist/tools/generateTests.d.ts +1 -0
  80. package/dist/tools/getAIComments.d.ts +1 -0
  81. package/dist/tools/getBufferContent.d.ts +1 -0
  82. package/dist/tools/getChangeImpact.d.ts +1 -0
  83. package/dist/tools/getClaudeTaskStatus.d.ts +1 -0
  84. package/dist/tools/getCodeCoverage.d.ts +1 -0
  85. package/dist/tools/getCommitsForIssue.d.ts +1 -0
  86. package/dist/tools/getDebugState.d.ts +1 -0
  87. package/dist/tools/getDocumentSymbols.d.ts +1 -0
  88. package/dist/tools/getGitHotspots.d.ts +1 -0
  89. package/dist/tools/getImportedSignatures.d.ts +1 -0
  90. package/dist/tools/getPRTemplate.d.ts +1 -0
  91. package/dist/tools/getSymbolHistory.d.ts +1 -0
  92. package/dist/tools/getTypeSignature.d.ts +1 -0
  93. package/dist/tools/getWorkspaceSettings.d.ts +1 -0
  94. package/dist/tools/gitWrite.d.ts +11 -0
  95. package/dist/tools/github/actions.d.ts +2 -0
  96. package/dist/tools/github/composite.d.ts +3 -0
  97. package/dist/tools/github/issues.d.ts +4 -0
  98. package/dist/tools/github/pr.d.ts +7 -0
  99. package/dist/tools/handoffNote.d.ts +1 -0
  100. package/dist/tools/hoverAtCursor.d.ts +1 -0
  101. package/dist/tools/httpClient.d.ts +2 -0
  102. package/dist/tools/inlayHints.d.ts +1 -0
  103. package/dist/tools/launchQuickTask.d.ts +1 -0
  104. package/dist/tools/listClaudeTasks.d.ts +1 -0
  105. package/dist/tools/listTerminals.d.ts +1 -0
  106. package/dist/tools/lsp.d.ts +15 -0
  107. package/dist/tools/navigateToSymbolByName.d.ts +1 -0
  108. package/dist/tools/openDiff.d.ts +1 -0
  109. package/dist/tools/openFile.d.ts +1 -0
  110. package/dist/tools/organizeImports.d.ts +1 -0
  111. package/dist/tools/planPersistence.d.ts +3 -0
  112. package/dist/tools/previewEdit.d.ts +1 -0
  113. package/dist/tools/refactorAnalyze.d.ts +1 -0
  114. package/dist/tools/refactorPreview.d.ts +1 -0
  115. package/dist/tools/replaceBlock.d.ts +1 -0
  116. package/dist/tools/resumeClaudeTask.d.ts +1 -0
  117. package/dist/tools/runClaudeTask.d.ts +1 -0
  118. package/dist/tools/screenshot.d.ts +1 -0
  119. package/dist/tools/searchAndReplace.d.ts +1 -0
  120. package/dist/tools/searchWorkspace.d.ts +1 -0
  121. package/dist/tools/selectionRanges.d.ts +1 -0
  122. package/dist/tools/semanticTokens.d.ts +1 -0
  123. package/dist/tools/signatureHelp.d.ts +1 -0
  124. package/dist/tools/terminal.d.ts +6 -0
  125. package/dist/tools/testTraceToSource.d.ts +1 -0
  126. package/dist/tools/transaction.d.ts +4 -0
  127. package/dist/tools/typeHierarchy.d.ts +1 -0
  128. package/dist/tools/utils.d.ts +18 -0
  129. package/dist/tools/utils.js +28 -6
  130. package/dist/tools/utils.js.map +1 -1
  131. package/dist/tools/vscodeCommands.d.ts +2 -0
  132. package/dist/tools/vscodeTasks.d.ts +2 -0
  133. package/dist/tools/workspaceSettings.d.ts +1 -0
  134. package/dist/transport.js +2 -2
  135. package/dist/transport.js.map +1 -1
  136. package/dist/wireHaltPushDispatch.d.ts +38 -0
  137. package/dist/wireHaltPushDispatch.js +71 -0
  138. package/dist/wireHaltPushDispatch.js.map +1 -0
  139. package/package.json +1 -1
@@ -51,6 +51,7 @@ import { RunBudget } from "./runBudget.js";
51
51
  import { detectSilentFail } from "./stepObservation.js";
52
52
  // Import tool registry and trigger tool self-registration
53
53
  import { applyToolOutputContext, executeTool, getTool, hasTool, registerPluginTools, } from "./toolRegistry.js";
54
+ import { resolveWorkspaceRoot } from "./workspaceRoot.js";
54
55
  import "./tools/index.js";
55
56
  /**
56
57
  * Bundled-templates directory used as a third allowed root for nested-recipe
@@ -127,7 +128,119 @@ export function evaluateExpect(result, expect) {
127
128
  }
128
129
  return failures;
129
130
  }
/**
 * Module-level cache for the AJV instance that backs `step.expect.schema`.
 * Stays unset until a recipe actually evaluates a schema assertion, so
 * recipes without one never load or construct the validator.
 */
let _stepExpectAjv;
/**
 * Return the shared AJV instance, creating it on the first call.
 *
 * The factory is loaded with a dynamic import of `../ajv2020.js` and invoked
 * with `strict: false` and `allErrors: true`.
 *
 * @returns {Promise<object>} The cached validator instance.
 */
async function getStepExpectAjv() {
    if (_stepExpectAjv) {
        return _stepExpectAjv;
    }
    const { createAjv2020: buildAjv } = await import("../ajv2020.js");
    _stepExpectAjv = buildAjv({ strict: false, allErrors: true });
    return _stepExpectAjv;
}
/**
 * Stringify a step value for assertion purposes. Strings pass through;
 * other values JSON.stringify so `matches`/`contains` see something stable.
 *
 * Always returns a string. `JSON.stringify` yields `undefined` (rather than
 * throwing) for `undefined`, functions and symbols, and throws for BigInt
 * and circular structures; both cases fall back to `String(value)`.
 *
 * @param {unknown} value - Step output or expected value to serialise.
 * @returns {string} Text form used by the string-based assertions.
 */
function stringifyForAssert(value) {
    if (typeof value === "string")
        return value;
    try {
        // `??` covers the "not serialisable, but no exception" results.
        return JSON.stringify(value) ?? String(value);
    }
    catch {
        return String(value);
    }
}
/**
 * Evaluate a per-step `expect` block against the step's output value.
 * Returns the list of failure messages (empty = all assertions passed).
 *
 * Slice 2 of the agentic-workflow primitives. v1 supports
 * schema/equals/matches/contains; `on_fail: judge` deliberately omitted —
 * see comment on `StepExpect`.
 *
 * Every assertion present on `expect` is evaluated: an invalid `matches`
 * pattern is reported as a failure but does not hide a `schema` failure.
 *
 * @param {object} expect - Assertion block. Reads `equals`, `contains` (one
 *   needle or an array of needles), `matches` (regex source) and `schema`
 *   (JSON Schema handed to AJV).
 * @param {unknown} value - Step output. Non-string values are serialised for
 *   the string assertions; string values are JSON-parsed for `schema`.
 * @returns {Promise<string[]>} Failure messages, in evaluation order.
 */
export async function evaluateStepExpect(expect, value) {
    // JSON.stringify hands back `undefined` (not a string) for undefined,
    // functions and symbols. Coerce here so the string assertions below
    // record a failure instead of throwing a TypeError on `.includes`.
    const toText = (input) => {
        const text = stringifyForAssert(input);
        return typeof text === "string" ? text : String(input);
    };
    const failures = [];
    const asString = toText(value);
    if (expect.equals !== undefined) {
        const expectedStr = toText(expect.equals);
        if (asString !== expectedStr) {
            failures.push(`equals: expected ${JSON.stringify(expectedStr)}, got ${JSON.stringify(asString)}`);
        }
    }
    if (expect.contains !== undefined) {
        const needles = Array.isArray(expect.contains)
            ? expect.contains
            : [expect.contains];
        for (const needle of needles) {
            // Serialise needles the same way as the output so an object
            // needle is compared as JSON rather than "[object Object]".
            if (!asString.includes(toText(needle))) {
                failures.push(`contains: missing ${JSON.stringify(needle)}`);
            }
        }
    }
    if (expect.matches !== undefined) {
        let re;
        try {
            re = new RegExp(expect.matches);
        }
        catch (err) {
            failures.push(`matches: invalid regex ${JSON.stringify(expect.matches)} (${err instanceof Error ? err.message : String(err)})`);
        }
        if (re !== undefined && !re.test(asString)) {
            failures.push(`matches: ${JSON.stringify(expect.matches)} did not match output`);
        }
    }
    if (expect.schema !== undefined) {
        let parsed;
        let parsedOk = true;
        try {
            parsed = typeof value === "string" ? JSON.parse(value) : value;
        }
        catch {
            failures.push(`schema: output is not valid JSON`);
            parsedOk = false;
        }
        if (parsedOk) {
            try {
                const ajv = await getStepExpectAjv();
                const validate = ajv.compile(expect.schema);
                if (!validate(parsed)) {
                    const errs = (validate.errors ?? [])
                        .map((e) => `${e.instancePath || "/"} ${e.message ?? "invalid"}`)
                        .join("; ");
                    failures.push(`schema: ${errs || "validation failed"}`);
                }
            }
            catch (err) {
                // Covers both a failed validator load and a schema AJV rejects.
                failures.push(`schema: compile error (${err instanceof Error ? err.message : String(err)})`);
            }
        }
    }
    return failures;
}
130
223
  // Strip tool-call narration some models (e.g. Gemini) prepend before the markdown block.
/**
 * Phase 0β — decide whether a path is a deliverable inbox file, using only
 * the injected path module so `path.win32` and `path.posix` behaviour can be
 * unit-tested on any host without a running recipe.
 *
 * A path qualifies when, after resolution, it is a direct child of the inbox
 * directory and its basename does not start with a dot. The inbox directory
 * itself, anything outside it, and anything nested below it are rejected.
 *
 * @param {string} candidate - Path to classify; resolved with `pathMod`.
 * @param {string} inboxDirAbs - Absolute inbox directory, already resolved
 *   with the same path module.
 * @param {object} pathMod - Path implementation supplying `resolve`,
 *   `relative`, `isAbsolute`, `basename` and `sep`.
 * @returns {boolean} True only for non-hidden direct children of the inbox.
 */
export function isInboxPathFor(candidate, inboxDirAbs, pathMod) {
    const resolved = pathMod.resolve(candidate);
    const fromInbox = pathMod.relative(inboxDirAbs, resolved);
    return (fromInbox !== "" && // the inbox directory itself
        !fromInbox.startsWith("..") && // above / beside the inbox
        !pathMod.isAbsolute(fromInbox) && // e.g. a different Windows drive
        !pathMod.basename(resolved).startsWith(".") && // dotfiles
        // Only direct children — `~/.patchwork/inbox/foo.md`, not nested.
        !fromInbox.includes(pathMod.sep));
}
131
244
  function stripLeadingNarration(text) {
132
245
  const lines = text.split("\n");
133
246
  const firstMarkdown = lines.findIndex((l) => /^(#|>|`|\||[-*+] |\d+\. |\*\*)/.test(l.trimStart()));
@@ -239,6 +352,84 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
239
352
  ...seedContext,
240
353
  };
241
354
  const stepDeps = resolveStepDeps(deps, { recipeName: recipe.name });
355
+ // Phase 0β — inbox provenance. When a recipe `file.write` / `file.append`
356
+ // step targets `~/.patchwork/inbox/`, prepend a YAML frontmatter block
357
+ // (first write only) recording recipe + run + trigger, and accumulate the
358
+ // delivered filename onto the run record's `inboxOutputs`. Old recipes /
359
+ // non-inbox paths pass through unchanged.
360
+ //
361
+ // Windows path-separator fix (CI repro 2026-05-20): the original
362
+ // implementation built the prefix as `${os.homedir()}/.patchwork/inbox/`
363
+ // and compared with `startsWith`, which failed on Windows where
364
+ // resolved absolute paths use `\` separators and `os.homedir()` returns
365
+ // `C:\Users\...`. Now we resolve both sides through `path.resolve()`
366
+ // and use `path.relative()` to detect containment so the comparison is
367
+ // separator-agnostic. Also case-insensitive on Win32 (NTFS).
368
+ const inboxDirAbs = path.resolve(path.join(os.homedir(), ".patchwork", "inbox"));
369
+ const inboxOutputs = [];
370
+ const isInboxPath = (abs) => isInboxPathFor(abs, inboxDirAbs, path);
371
+ const buildFrontmatter = () => {
372
+ const triggerKindAtWrite = yamlTriggerKind;
373
+ const lines = ["---", `recipe: ${recipe.name}`];
374
+ if (runSeq !== undefined)
375
+ lines.push(`runSeq: ${runSeq}`);
376
+ lines.push(`trigger: ${triggerKindAtWrite}`, `deliveredAt: ${new Date().toISOString()}`, "---", "", "");
377
+ return lines.join("\n");
378
+ };
379
+ const recordInboxDelivery = (abs) => {
380
+ inboxOutputs.push({
381
+ filename: path.basename(abs),
382
+ deliveredAt: Date.now(),
383
+ });
384
+ };
385
+ // Atomic read-or-default: a single `readFileSync` in a try/catch. No
386
+ // `existsSync`/`statSync` probe around the write — on Windows a stat
387
+ // immediately before write can race a concurrent fd holder and surface
388
+ // `EBUSY`/`EPERM`. The read either succeeds (file present) or throws
389
+ // ENOENT (treated as new file). Either way we never stat the same path
390
+ // we're about to write.
391
+ const readExistingOrEmpty = (abs) => {
392
+ try {
393
+ return readFileSync(abs, "utf-8");
394
+ }
395
+ catch {
396
+ return "";
397
+ }
398
+ };
399
+ const originalWrite = stepDeps.writeFile;
400
+ const originalAppend = stepDeps.appendFile;
401
+ stepDeps.writeFile = (p, content) => {
402
+ if (isInboxPath(p)) {
403
+ // First-write detection by content shape, not by stat. Empty string
404
+ // (ENOENT) and any file that does NOT already begin with `---\n`
405
+ // gets frontmatter; pre-frontmattered files are overwritten as-is
406
+ // so consumers can replay a recipe without doubling the header.
407
+ const existing = readExistingOrEmpty(p);
408
+ const hasFm = existing.startsWith("---\n");
409
+ const final = hasFm ? content : buildFrontmatter() + content;
410
+ originalWrite(p, final);
411
+ recordInboxDelivery(p);
412
+ return;
413
+ }
414
+ originalWrite(p, content);
415
+ };
416
+ stepDeps.appendFile = (p, content) => {
417
+ if (isInboxPath(p)) {
418
+ // file.append: never re-prepend. If file is brand-new, seed one
419
+ // frontmatter block so an append-only recipe still gets
420
+ // provenance. Same atomic read-or-default — no stat probe.
421
+ const existing = readExistingOrEmpty(p);
422
+ if (existing.length === 0) {
423
+ originalWrite(p, buildFrontmatter() + content);
424
+ }
425
+ else {
426
+ originalAppend(p, content);
427
+ }
428
+ recordInboxDelivery(p);
429
+ return;
430
+ }
431
+ originalAppend(p, content);
432
+ };
242
433
  // PR2b: one per-run budget shared across all agent steps. Absent
243
434
  // `recipe.budget` → no enforcement, no overhead.
244
435
  const runBudget = new RunBudget(recipe.budget);
@@ -316,348 +507,459 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
316
507
  // Track per-step start timestamps so done events carry durationMs
317
508
  // without a second roundtrip.
318
509
  const stepStartTs = new Map();
319
- for (const step of recipe.steps) {
320
- const stepIdForEmit = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
321
- const stepTs = Date.now();
322
- stepStartTs.set(stepIdForEmit, stepTs);
323
- emit("recipe_step_start", {
510
+ // Emit recipe_step_done for the step result just pushed onto
511
+ // `stepResults`. Every loop branch (skip / budget / agent / tool)
512
+ // pushes exactly one result before it ends, so the last element is
513
+ // always the current step. `stepId` mirrors recipe_step_start's
514
+ // `stepIdForEmit` so live consumers can correlate start↔done — the
515
+ // pushed result's own id can diverge for agent steps without `into`.
516
+ const emitStepDone = (stepIdForEmit) => {
517
+ const justPushed = stepResults[stepResults.length - 1];
518
+ if (!justPushed)
519
+ return;
520
+ const haltReason = justPushed.haltReason;
521
+ emit("recipe_step_done", {
324
522
  runSeq,
325
523
  recipeName: recipe.name,
326
524
  stepId: stepIdForEmit,
327
- tool: step.agent ? "agent" : step.tool,
328
- ts: stepTs,
525
+ tool: justPushed.tool,
526
+ status: justPushed.status,
527
+ durationMs: justPushed.durationMs,
528
+ ...(justPushed.error !== undefined && { error: justPushed.error }),
529
+ ...(haltReason !== undefined && {
530
+ haltReason,
531
+ haltCategory: categoriseHaltReason(haltReason),
532
+ }),
533
+ ts: Date.now(),
329
534
  });
330
- // Evaluate `when` guard before running anything. Mirrors
331
- // chainedRunner.ts:248-266 render the template, then truthy-check the
332
- // result (empty string, "0", "false", "null", "undefined" are falsy).
333
- // A falsy guard records the step as `skipped`, increments stepsRun, and
334
- // continues it is NOT a failure. Bridge-dev iMessage recipes rely on
335
- // this to suppress the iMessage agent step when phone is empty.
336
- if (typeof step.when === "string" && step.when.length > 0) {
337
- const rendered = render(step.when, ctx).trim().toLowerCase();
338
- const truthy = !!rendered &&
339
- rendered !== "0" &&
340
- rendered !== "false" &&
341
- rendered !== "null" &&
342
- rendered !== "undefined";
343
- if (!truthy) {
344
- const skipId = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
345
- stepResults.push({
346
- id: skipId,
347
- tool: step.agent ? "agent" : step.tool,
348
- status: "skipped",
349
- durationMs: 0,
350
- });
351
- stepsRun++;
352
- persistLiveStepResults();
353
- emit("recipe_step_done", {
354
- runSeq,
355
- recipeName: recipe.name,
356
- stepId: skipId,
357
- tool: step.agent ? "agent" : step.tool,
358
- status: "skipped",
359
- durationMs: 0,
360
- ts: Date.now(),
361
- });
362
- continue;
363
- }
364
- }
365
- // Handle agent steps separately
366
- if (step.agent) {
367
- const agentCfg = step.agent;
368
- const isJudge = agentCfg.kind === "judge";
369
- // PR3a: judge prompt convention. Append the structured-verdict
370
- // suffix and, when `reviews: <stepId>` is set, inject the
371
- // upstream step's output as an <artefact> block.
372
- let renderedPrompt = render(agentCfg.prompt, ctx);
373
- if (isJudge) {
374
- if (agentCfg.reviews) {
375
- renderedPrompt += buildJudgeArtefactBlock(ctx[agentCfg.reviews]);
535
+ };
536
+ // The step loop is wrapped so an uncaught throw from any unguarded
537
+ // call site (a `when`/prompt render on a malformed step, a path-jail
538
+ // re-check, etc.) cannot escape `runYamlRecipe` and strand the
539
+ // run-log entry at "running" forever. On throw we capture the
540
+ // message into `runError` and fall through to the normal
541
+ // finalization path, which marks the run "error".
542
+ try {
543
+ for (const step of recipe.steps) {
544
+ const stepIdForEmit = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
545
+ const stepTs = Date.now();
546
+ stepStartTs.set(stepIdForEmit, stepTs);
547
+ emit("recipe_step_start", {
548
+ runSeq,
549
+ recipeName: recipe.name,
550
+ stepId: stepIdForEmit,
551
+ tool: step.agent ? "agent" : step.tool,
552
+ ts: stepTs,
553
+ });
554
+ // Evaluate `when` guard before running anything. Mirrors
555
+ // chainedRunner.ts:248-266 — render the template, then truthy-check the
556
+ // result (empty string, "0", "false", "null", "undefined" are falsy).
557
+ // A falsy guard records the step as `skipped`, increments stepsRun, and
558
+ // continues — it is NOT a failure. Bridge-dev iMessage recipes rely on
559
+ // this to suppress the iMessage agent step when phone is empty.
560
+ if (typeof step.when === "string" && step.when.length > 0) {
561
+ const rendered = render(step.when, ctx).trim().toLowerCase();
562
+ const truthy = !!rendered &&
563
+ rendered !== "0" &&
564
+ rendered !== "false" &&
565
+ rendered !== "null" &&
566
+ rendered !== "undefined";
567
+ if (!truthy) {
568
+ const skipId = step.into ?? step.agent?.into ?? `step_${stepsRun}`;
569
+ stepResults.push({
570
+ id: skipId,
571
+ tool: step.agent ? "agent" : step.tool,
572
+ status: "skipped",
573
+ durationMs: 0,
574
+ });
575
+ stepsRun++;
576
+ persistLiveStepResults();
577
+ emit("recipe_step_done", {
578
+ runSeq,
579
+ recipeName: recipe.name,
580
+ stepId: skipId,
581
+ tool: step.agent ? "agent" : step.tool,
582
+ status: "skipped",
583
+ durationMs: 0,
584
+ ts: Date.now(),
585
+ });
586
+ continue;
376
587
  }
377
- renderedPrompt += JUDGE_PROMPT_SUFFIX;
378
588
  }
379
- const intoKey = agentCfg.into ?? "agent_output";
380
- const stepId = intoKey;
381
- const stepStart = Date.now();
382
- let agentResult;
383
- // PR2b: per-recipe token budget. Admission check before dispatch;
384
- // reconcile actual consumption after. Subscription drivers
385
- // (Claude CLI, provider subprocess) report `usage === undefined`
386
- // `RunBudget.reconcile` records a fail-open warning per driver
387
- // per run and continues.
388
- const admission = runBudget.admit();
389
- if (!admission.admitted) {
390
- const reason = admission.reason ??
391
- "Run exceeded its token budget — budget_exceeded.";
392
- runError = runError ?? reason;
393
- stepResults.push({
394
- id: stepId,
395
- tool: "agent",
396
- status: "error",
397
- error: reason,
398
- haltReason: reason,
399
- durationMs: 0,
400
- });
401
- stepsRun++;
402
- persistLiveStepResults();
403
- continue;
404
- }
405
- try {
406
- const agentReturn = await _executeAgent({
407
- prompt: renderedPrompt,
408
- driver: agentCfg.driver === "api" ? "anthropic" : agentCfg.driver,
409
- model: agentCfg.model,
410
- ...(agentCfg.mcpAccess !== undefined && {
411
- mcpAccess: agentCfg.mcpAccess,
412
- }),
413
- }, buildAgentExecutorDeps(stepDeps, deps));
414
- agentResult = agentReturn.text;
415
- runBudget.reconcile(agentCfg.driver === "api" ? "anthropic" : (agentCfg.driver ?? "auto"), agentReturn.usage);
416
- // Catch both `[agent step failed: ...]` (existing) and the
417
- // silent-fail patterns `[agent step skipped: ...]` etc. via the
418
- // shared detector. Per-step opt-out via `silentFailDetection: false`.
419
- const agentSilentFail = step.silentFailDetection !== false
420
- ? detectSilentFail(agentResult)
421
- : null;
422
- if (agentResult.startsWith("[agent step failed:") || agentSilentFail) {
423
- const reason = agentSilentFail
424
- ? `silent-fail detected (${agentSilentFail.reason}): ${agentSilentFail.matched}`
425
- : agentResult;
589
+ // Handle agent steps separately
590
+ if (step.agent) {
591
+ const agentCfg = step.agent;
592
+ const isJudge = agentCfg.kind === "judge";
593
+ // PR3a: judge prompt convention. Append the structured-verdict
594
+ // suffix and, when `reviews: <stepId>` is set, inject the
595
+ // upstream step's output as an <artefact> block.
596
+ let renderedPrompt = render(agentCfg.prompt, ctx);
597
+ if (isJudge) {
598
+ if (agentCfg.reviews) {
599
+ renderedPrompt += buildJudgeArtefactBlock(ctx[agentCfg.reviews]);
600
+ }
601
+ renderedPrompt += JUDGE_PROMPT_SUFFIX;
602
+ }
603
+ const intoKey = agentCfg.into ?? "agent_output";
604
+ const stepId = intoKey;
605
+ const stepStart = Date.now();
606
+ let agentResult;
607
+ // PR2b: per-recipe token budget. Admission check before dispatch;
608
+ // reconcile actual consumption after. Subscription drivers
609
+ // (Claude CLI, provider subprocess) report `usage === undefined`
610
+ // — `RunBudget.reconcile` records a fail-open warning per driver
611
+ // per run and continues.
612
+ const admission = runBudget.admit();
613
+ if (!admission.admitted) {
614
+ const reason = admission.reason ??
615
+ "Run exceeded its token budget — budget_exceeded.";
426
616
  runError = runError ?? reason;
427
617
  stepResults.push({
428
618
  id: stepId,
429
619
  tool: "agent",
430
620
  status: "error",
431
621
  error: reason,
432
- haltReason: agentSilentFail
433
- ? `Agent step "${stepId}" returned no usable output (silent-fail: ${agentSilentFail.reason}).`
434
- : `Agent step "${stepId}" reported failure.`,
435
- durationMs: Date.now() - stepStart,
622
+ haltReason: reason,
623
+ durationMs: 0,
436
624
  });
625
+ stepsRun++;
626
+ persistLiveStepResults();
627
+ emitStepDone(stepIdForEmit);
628
+ continue;
437
629
  }
438
- else {
439
- const stripped = stripLeadingNarration(agentResult);
440
- if (!stripped.trim()) {
441
- const errMsg = `[agent step failed: ${agentCfg.driver ?? "agent"} returned only narration or whitespace — no content]`;
442
- runError = runError ?? errMsg;
630
+ try {
631
+ const agentReturn = await _executeAgent({
632
+ prompt: renderedPrompt,
633
+ driver: agentCfg.driver === "api" ? "anthropic" : agentCfg.driver,
634
+ model: agentCfg.model,
635
+ ...(agentCfg.mcpAccess !== undefined && {
636
+ mcpAccess: agentCfg.mcpAccess,
637
+ }),
638
+ }, buildAgentExecutorDeps(stepDeps, deps));
639
+ agentResult = agentReturn.text;
640
+ runBudget.reconcile(agentCfg.driver === "api"
641
+ ? "anthropic"
642
+ : (agentCfg.driver ?? "auto"), agentReturn.usage);
643
+ // Catch both `[agent step failed: ...]` (existing) and the
644
+ // silent-fail patterns `[agent step skipped: ...]` etc. via the
645
+ // shared detector. Per-step opt-out via `silentFailDetection: false`.
646
+ const agentSilentFail = step.silentFailDetection !== false
647
+ ? detectSilentFail(agentResult)
648
+ : null;
649
+ if (agentResult.startsWith("[agent step failed:") ||
650
+ agentSilentFail) {
651
+ const reason = agentSilentFail
652
+ ? `silent-fail detected (${agentSilentFail.reason}): ${agentSilentFail.matched}`
653
+ : agentResult;
654
+ runError = runError ?? reason;
443
655
  stepResults.push({
444
656
  id: stepId,
445
657
  tool: "agent",
446
658
  status: "error",
447
- error: errMsg,
448
- haltReason: `Agent step "${stepId}" returned only narration or whitespace — no content.`,
659
+ error: reason,
660
+ haltReason: agentSilentFail
661
+ ? `Agent step "${stepId}" returned no usable output (silent-fail: ${agentSilentFail.reason}).`
662
+ : `Agent step "${stepId}" reported failure.`,
449
663
  durationMs: Date.now() - stepStart,
450
664
  });
451
665
  }
452
666
  else {
453
- // Try to parse as JSON so dot-notation ({{meeting.field}}) works
667
+ const stripped = stripLeadingNarration(agentResult);
668
+ if (!stripped.trim()) {
669
+ const errMsg = `[agent step failed: ${agentCfg.driver ?? "agent"} returned only narration or whitespace — no content]`;
670
+ runError = runError ?? errMsg;
671
+ stepResults.push({
672
+ id: stepId,
673
+ tool: "agent",
674
+ status: "error",
675
+ error: errMsg,
676
+ haltReason: `Agent step "${stepId}" returned only narration or whitespace — no content.`,
677
+ durationMs: Date.now() - stepStart,
678
+ });
679
+ }
680
+ else {
681
+ // Try to parse as JSON so dot-notation ({{meeting.field}}) works
682
+ try {
683
+ const jsonMatch = /```(?:json)?\s*([\s\S]*?)```/.exec(stripped) ?? [null, stripped];
684
+ const parsed = sanitizeParsed(JSON.parse((jsonMatch[1] ?? "").trim()));
685
+ ctx[intoKey] = parsed;
686
+ }
687
+ catch {
688
+ ctx[intoKey] = stripped;
689
+ }
690
+ outputs.push(intoKey);
691
+ // PR3a: parse + stash the judge verdict on the step result.
692
+ // Augment-only: a `request_changes` verdict still yields
693
+ // `status: "ok"`. The verdict surfaces via the runlog +
694
+ // future PR3b dashboard panel, but never gates the run.
695
+ const judgeVerdict = isJudge
696
+ ? parseJudgeVerdict(stripped)
697
+ : undefined;
698
+ stepResults.push({
699
+ id: stepId,
700
+ tool: "agent",
701
+ status: "ok",
702
+ ...(judgeVerdict !== undefined && { judgeVerdict }),
703
+ durationMs: Date.now() - stepStart,
704
+ });
705
+ // Slice 2 — per-step expect eval. Runs on the value just
706
+ // committed to ctx[intoKey]. Halt failure flips the just-pushed
707
+ // result to error and rolls back the ctx commit so downstream
708
+ // steps don't see a value the recipe author rejected.
709
+ if (step.expect) {
710
+ const failures = await evaluateStepExpect(step.expect, ctx[intoKey]);
711
+ if (failures.length > 0) {
712
+ const onFail = step.expect.on_fail ?? "halt";
713
+ const last = stepResults[stepResults.length - 1];
714
+ if (last) {
715
+ if (onFail === "halt") {
716
+ last.status = "error";
717
+ last.error = `expect_failed: ${failures.join("; ")}`;
718
+ last.haltReason = `expect_failed in step "${stepId}": ${failures.join("; ")}`;
719
+ const fbk = recipe.on_error?.fallback;
720
+ const fbkOpen = fbk === "log_only" || fbk === "deliver_original";
721
+ const failOpenAgent = step.optional === true || fbkOpen;
722
+ if (!failOpenAgent) {
723
+ runError = runError ?? last.haltReason;
724
+ }
725
+ delete ctx[intoKey];
726
+ }
727
+ else {
728
+ last.expectWarnings = failures;
729
+ }
730
+ }
731
+ }
732
+ }
733
+ }
734
+ }
735
+ }
736
+ catch (err) {
737
+ const msg = err instanceof Error ? err.message : String(err);
738
+ runError = runError ?? `agent step "${stepId}" failed: ${msg}`;
739
+ stepResults.push({
740
+ id: stepId,
741
+ tool: "agent",
742
+ status: "error",
743
+ error: msg,
744
+ haltReason: `Agent step "${stepId}" threw before completing: ${msg}`,
745
+ durationMs: Date.now() - stepStart,
746
+ });
747
+ }
748
+ stepsRun++;
749
+ persistLiveStepResults();
750
+ emitStepDone(stepIdForEmit);
751
+ continue;
752
+ }
753
+ const stepStart = Date.now();
754
+ const stepId = step.into ?? `step_${stepsRun}`;
755
+ // Resolve retry policy: step-level overrides recipe-level.
756
+ const retryCount = step.retry ?? recipe.on_error?.retry ?? 0;
757
+ const retryDelayMs = step.retryDelay ?? recipe.on_error?.retryDelay ?? 1000;
758
+ let result = null;
759
+ let stepError;
760
+ let thrownError;
761
+ let thrownErrorCode;
762
+ for (let attempt = 0; attempt <= retryCount; attempt++) {
763
+ if (attempt > 0) {
764
+ await new Promise((r) => setTimeout(r, retryDelayMs));
765
+ }
766
+ stepError = undefined;
767
+ thrownError = undefined;
768
+ thrownErrorCode = undefined;
769
+ try {
770
+ // Slice (sandbox-alternative): per-step wall-clock timeout via
771
+ // Promise.race. The underlying tool keeps running in the
772
+ // background — this is a halt signal for the runner, not a
773
+ // process kill. The thrown error carries a `step_timeout`
774
+ // prefix so categoriseHaltReason maps it correctly.
775
+ const timeoutMs = typeof step.timeout_ms === "number" && step.timeout_ms > 0
776
+ ? step.timeout_ms
777
+ : 0;
778
+ if (timeoutMs > 0) {
779
+ let timer;
780
+ const timeoutPromise = new Promise((_, reject) => {
781
+ timer = setTimeout(() => {
782
+ reject(new Error(`step_timeout: exceeded ${timeoutMs}ms in step "${step.into ?? step.tool ?? "?"}"`));
783
+ }, timeoutMs);
784
+ });
785
+ try {
786
+ result = await Promise.race([
787
+ executeStep(step, ctx, stepDeps),
788
+ timeoutPromise,
789
+ ]);
790
+ }
791
+ finally {
792
+ if (timer)
793
+ clearTimeout(timer);
794
+ }
795
+ }
796
+ else {
797
+ result = await executeStep(step, ctx, stepDeps);
798
+ }
799
+ // Detect tool-level errors reported as JSON {ok: false, error: ...}
800
+ if (result !== null) {
454
801
  try {
455
- const jsonMatch = /```(?:json)?\s*([\s\S]*?)```/.exec(stripped) ?? [null, stripped];
456
- const parsed = sanitizeParsed(JSON.parse((jsonMatch[1] ?? "").trim()));
457
- ctx[intoKey] = parsed;
802
+ const parsed = JSON.parse(result);
803
+ if (parsed.ok === false && typeof parsed.error === "string") {
804
+ stepError = parsed.error;
805
+ }
458
806
  }
459
807
  catch {
460
- ctx[intoKey] = stripped;
808
+ /* non-JSON result is fine */
461
809
  }
462
- outputs.push(intoKey);
463
- // PR3a: parse + stash the judge verdict on the step result.
464
- // Augment-only: a `request_changes` verdict still yields
465
- // `status: "ok"`. The verdict surfaces via the runlog +
466
- // future PR3b dashboard panel, but never gates the run.
467
- const judgeVerdict = isJudge
468
- ? parseJudgeVerdict(stripped)
469
- : undefined;
470
- stepResults.push({
471
- id: stepId,
472
- tool: "agent",
473
- status: "ok",
474
- ...(judgeVerdict !== undefined && { judgeVerdict }),
475
- durationMs: Date.now() - stepStart,
476
- });
477
810
  }
811
+ // Silent-fail detection: tools that return string placeholders
812
+ // (`(git branches unavailable)`, `[agent step skipped: ...]`)
813
+ // or empty list-tool error shapes (`{count:0,error:"..."}`)
814
+ // succeed with bad data — flag them as `error` so the runner
815
+ // doesn't quietly hand garbage to a downstream agent. Per-step
816
+ // opt-out via `silentFailDetection: false`.
817
+ if (!stepError &&
818
+ result !== null &&
819
+ step.silentFailDetection !== false) {
820
+ const detected = detectSilentFail(result);
821
+ if (detected) {
822
+ stepError = `silent-fail detected (${detected.reason}): ${detected.matched}`;
823
+ }
824
+ }
825
+ }
826
+ catch (err) {
827
+ thrownError = err instanceof Error ? err.message : String(err);
828
+ // Preserve structured error codes (e.g. recipe_path_jail_escape)
829
+ // so callers and tests can branch on `err.code` per R2 M-4
830
+ // without scraping the message string.
831
+ const code = err?.code;
832
+ if (typeof code === "string")
833
+ thrownErrorCode = code;
834
+ result = null;
478
835
  }
836
+ if (!stepError && !thrownError)
837
+ break;
479
838
  }
480
- catch (err) {
481
- const msg = err instanceof Error ? err.message : String(err);
482
- runError = runError ?? `agent step "${stepId}" failed: ${msg}`;
839
+ // Recipe-level fallback: log_only / deliver_original treat step failure
840
+ // as non-fatal (fail-open) — same semantics as step-level optional: true.
841
+ const fallback = recipe.on_error?.fallback;
842
+ const fallbackFailOpen = fallback === "log_only" || fallback === "deliver_original";
843
+ const failOpen = step.optional === true || fallbackFailOpen;
844
+ if (thrownError) {
845
+ const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
483
846
  stepResults.push({
484
847
  id: stepId,
485
- tool: "agent",
848
+ tool: step.tool,
486
849
  status: "error",
487
- error: msg,
488
- haltReason: `Agent step "${stepId}" threw before completing: ${msg}`,
850
+ error: thrownError,
851
+ ...(thrownErrorCode ? { errorCode: thrownErrorCode } : {}),
852
+ haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" threw${retryNote}: ${thrownError}`,
489
853
  durationMs: Date.now() - stepStart,
490
854
  });
855
+ if (!failOpen) {
856
+ runError = runError ?? `${step.tool} failed: ${thrownError}`;
857
+ }
858
+ else if (fallbackFailOpen && !step.optional) {
859
+ console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${thrownError}`);
860
+ }
491
861
  }
492
- stepsRun++;
493
- persistLiveStepResults();
494
- continue;
495
- }
496
- const stepStart = Date.now();
497
- const stepId = step.into ?? `step_${stepsRun}`;
498
- // Resolve retry policy: step-level overrides recipe-level.
499
- const retryCount = step.retry ?? recipe.on_error?.retry ?? 0;
500
- const retryDelayMs = step.retryDelay ?? recipe.on_error?.retryDelay ?? 1000;
501
- let result = null;
502
- let stepError;
503
- let thrownError;
504
- let thrownErrorCode;
505
- for (let attempt = 0; attempt <= retryCount; attempt++) {
506
- if (attempt > 0) {
507
- await new Promise((r) => setTimeout(r, retryDelayMs));
862
+ else {
863
+ const finalStatus = result === null ? "skipped" : stepError ? "error" : "ok";
864
+ const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
865
+ stepResults.push({
866
+ id: stepId,
867
+ tool: step.tool,
868
+ status: finalStatus,
869
+ error: stepError,
870
+ ...(finalStatus === "error" && stepError
871
+ ? {
872
+ haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" reported an error${retryNote}: ${stepError}`,
873
+ }
874
+ : {}),
875
+ durationMs: Date.now() - stepStart,
876
+ });
877
+ if (stepError) {
878
+ if (!failOpen) {
879
+ runError = runError ?? `${step.tool} failed: ${stepError}`;
880
+ }
881
+ else if (fallbackFailOpen && !step.optional) {
882
+ console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${stepError}`);
883
+ }
884
+ }
508
885
  }
509
- stepError = undefined;
510
- thrownError = undefined;
511
- thrownErrorCode = undefined;
512
- try {
513
- result = await executeStep(step, ctx, stepDeps);
514
- // Detect tool-level errors reported as JSON {ok: false, error: ...}
515
- if (result !== null) {
886
+ stepsRun++;
887
+ if (result !== null) {
888
+ // Apply transform if present — render template with $result injected
889
+ if (step.transform) {
516
890
  try {
517
- const parsed = JSON.parse(result);
518
- if (parsed.ok === false && typeof parsed.error === "string") {
519
- stepError = parsed.error;
520
- }
891
+ result = render(step.transform, { ...ctx, $result: result });
521
892
  }
522
- catch {
523
- /* non-JSON result is fine */
893
+ catch (err) {
894
+ // warn but fall through with original result
895
+ console.warn(`transform failed for step ${step.into ?? step.tool ?? "?"}: ${err}`);
524
896
  }
525
897
  }
526
- // Silent-fail detection: tools that return string placeholders
527
- // (`(git branches unavailable)`, `[agent step skipped: ...]`)
528
- // or empty list-tool error shapes (`{count:0,error:"..."}`)
529
- // succeed with bad data — flag them as `error` so the runner
530
- // doesn't quietly hand garbage to a downstream agent. Per-step
531
- // opt-out via `silentFailDetection: false`.
532
- if (!stepError &&
533
- result !== null &&
534
- step.silentFailDetection !== false) {
535
- const detected = detectSilentFail(result);
536
- if (detected) {
537
- stepError = `silent-fail detected (${detected.reason}): ${detected.matched}`;
898
+ // Slice 2 — per-step expect eval. Runs on the post-transform value
899
+ // (what would land in ctx) and only when the step otherwise succeeded.
900
+ // Halt failure flips the just-pushed result to error and suppresses
901
+ // the ctx commit by nulling `result` so the downstream `if (step.into)`
902
+ // block skips. Composes with `optional: true` / `on_error.fallback`.
903
+ if (step.expect && !thrownError && !stepError && result !== null) {
904
+ const failures = await evaluateStepExpect(step.expect, result);
905
+ if (failures.length > 0) {
906
+ const onFail = step.expect.on_fail ?? "halt";
907
+ const last = stepResults[stepResults.length - 1];
908
+ if (last) {
909
+ if (onFail === "halt") {
910
+ last.status = "error";
911
+ last.error = `expect_failed: ${failures.join("; ")}`;
912
+ last.haltReason = `expect_failed in step "${stepId}": ${failures.join("; ")}`;
913
+ if (!failOpen) {
914
+ runError = runError ?? last.haltReason;
915
+ }
916
+ result = null;
917
+ }
918
+ else {
919
+ last.expectWarnings = failures;
920
+ }
921
+ }
538
922
  }
539
923
  }
540
- }
541
- catch (err) {
542
- thrownError = err instanceof Error ? err.message : String(err);
543
- // Preserve structured error codes (e.g. recipe_path_jail_escape)
544
- // so callers and tests can branch on `err.code` per R2 M-4
545
- // without scraping the message string.
546
- const code = err?.code;
547
- if (typeof code === "string")
548
- thrownErrorCode = code;
549
- result = null;
550
- }
551
- if (!stepError && !thrownError)
552
- break;
553
- }
554
- // Recipe-level fallback: log_only / deliver_original treat step failure
555
- // as non-fatal (fail-open) — same semantics as step-level optional: true.
556
- const fallback = recipe.on_error?.fallback;
557
- const fallbackFailOpen = fallback === "log_only" || fallback === "deliver_original";
558
- const failOpen = step.optional === true || fallbackFailOpen;
559
- if (thrownError) {
560
- const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
561
- stepResults.push({
562
- id: stepId,
563
- tool: step.tool,
564
- status: "error",
565
- error: thrownError,
566
- ...(thrownErrorCode ? { errorCode: thrownErrorCode } : {}),
567
- haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" threw${retryNote}: ${thrownError}`,
568
- durationMs: Date.now() - stepStart,
569
- });
570
- if (!failOpen) {
571
- runError = runError ?? `${step.tool} failed: ${thrownError}`;
572
- }
573
- else if (fallbackFailOpen && !step.optional) {
574
- console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${thrownError}`);
575
- }
576
- }
577
- else {
578
- const finalStatus = result === null ? "skipped" : stepError ? "error" : "ok";
579
- const retryNote = retryCount > 0 ? ` after ${retryCount + 1} attempts` : "";
580
- stepResults.push({
581
- id: stepId,
582
- tool: step.tool,
583
- status: finalStatus,
584
- error: stepError,
585
- ...(finalStatus === "error" && stepError
586
- ? {
587
- haltReason: `Tool "${step.tool ?? "?"}" in step "${stepId}" reported an error${retryNote}: ${stepError}`,
924
+ if (result !== null && step.into) {
925
+ ctx[step.into] = result;
926
+ if (step.tool) {
927
+ applyToolOutputContext(step.tool, step.into, result, ctx);
588
928
  }
589
- : {}),
590
- durationMs: Date.now() - stepStart,
591
- });
592
- if (stepError) {
593
- if (!failOpen) {
594
- runError = runError ?? `${step.tool} failed: ${stepError}`;
595
929
  }
596
- else if (fallbackFailOpen && !step.optional) {
597
- console.warn(`step ${stepId} failed but on_error.fallback=${fallback} — treating as non-fatal: ${stepError}`);
930
+ if (step.tool === "file.write" || step.tool === "file.append") {
931
+ // R2 C-1 / F-02: re-validate the rendered path against the jail so a
932
+ // template substitution that survived earlier checks (e.g. via a
933
+ // chained sub-recipe deps override) cannot smuggle an out-of-jail
934
+ // path into the run log / dashboard outputs list.
935
+ const renderedPath = render(step.path, ctx);
936
+ outputs.push(resolveRecipePath(renderedPath, {
937
+ workspace: stepDeps.workdir,
938
+ write: true,
939
+ }));
598
940
  }
599
941
  }
942
+ persistLiveStepResults();
943
+ emitStepDone(stepIdForEmit);
600
944
  }
601
- stepsRun++;
602
- if (result !== null) {
603
- // Apply transform if present — render template with $result injected
604
- if (step.transform) {
605
- try {
606
- result = render(step.transform, { ...ctx, $result: result });
607
- }
608
- catch (err) {
609
- // warn but fall through with original result
610
- console.warn(`transform failed for step ${step.into ?? step.tool ?? "?"}: ${err}`);
611
- }
612
- }
613
- if (step.into) {
614
- ctx[step.into] = result;
615
- if (step.tool) {
616
- applyToolOutputContext(step.tool, step.into, result, ctx);
617
- }
618
- }
619
- if (step.tool === "file.write" || step.tool === "file.append") {
620
- // R2 C-1 / F-02: re-validate the rendered path against the jail so a
621
- // template substitution that survived earlier checks (e.g. via a
622
- // chained sub-recipe deps override) cannot smuggle an out-of-jail
623
- // path into the run log / dashboard outputs list.
624
- const renderedPath = render(step.path, ctx);
625
- outputs.push(resolveRecipePath(renderedPath, {
626
- workspace: stepDeps.workdir,
627
- write: true,
628
- }));
629
- }
945
+ }
946
+ catch (err) {
947
+ const msg = err instanceof Error ? err.message : String(err);
948
+ runError = runError ?? `recipe run aborted: ${msg}`;
949
+ }
950
+ // Evaluate expect block before persisting so failures are stored in the
951
+ // run log. Guarded: a throw here must not skip finalization and strand
952
+ // the run at "running".
953
+ let assertionFailures = [];
954
+ if (recipe.expect) {
955
+ try {
956
+ assertionFailures = evaluateExpect({ stepsRun, outputs, context: ctx, errorMessage: runError }, recipe.expect);
630
957
  }
631
- persistLiveStepResults();
632
- // Emit recipe_step_done for live-tail SSE. Look up the matching
633
- // entry in stepResults (the loop pushed at most one with this id);
634
- // payload mirrors chainedRunner's done event plus haltCategory.
635
- const justPushed = stepResults
636
- .slice()
637
- .reverse()
638
- .find((r) => r.id === stepIdForEmit);
639
- if (justPushed) {
640
- const haltReason = justPushed.haltReason;
641
- emit("recipe_step_done", {
642
- runSeq,
643
- recipeName: recipe.name,
644
- stepId: justPushed.id,
645
- tool: justPushed.tool,
646
- status: justPushed.status,
647
- durationMs: justPushed.durationMs,
648
- ...(justPushed.error !== undefined && { error: justPushed.error }),
649
- ...(haltReason !== undefined && {
650
- haltReason,
651
- haltCategory: categoriseHaltReason(haltReason),
652
- }),
653
- ts: Date.now(),
654
- });
958
+ catch (err) {
959
+ const msg = err instanceof Error ? err.message : String(err);
960
+ runError = runError ?? `expect evaluation failed: ${msg}`;
655
961
  }
656
962
  }
657
- // Evaluate expect block before persisting so failures are stored in the run log
658
- const assertionFailures = recipe.expect
659
- ? evaluateExpect({ stepsRun, outputs, context: ctx, errorMessage: runError }, recipe.expect)
660
- : [];
661
963
  // Write to RecipeRunLog so the dashboard Runs page shows this execution.
662
964
  // Bridge path: completeRun on the running entry opened above (live-tail).
663
965
  // CLI path: construct a local log + appendDirect (no live-tail).
@@ -686,6 +988,7 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
686
988
  outputTail,
687
989
  ...(runError !== undefined && { errorMessage: runError }),
688
990
  ...(assertionFailures.length > 0 ? { assertionFailures } : {}),
991
+ ...(inboxOutputs.length > 0 ? { inboxOutputs } : {}),
689
992
  });
690
993
  emit("recipe_done", {
691
994
  runSeq,
@@ -693,6 +996,10 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
693
996
  status: runError ? "error" : "done",
694
997
  durationMs: doneAt - recipeStartedAt,
695
998
  stepCount: finalStepResults.length,
999
+ // A `done` run can still carry step errors — the runner
1000
+ // continues past a non-fatal step failure. Surface it so
1001
+ // live consumers can show "completed with errors".
1002
+ hadStepErrors: finalStepResults.some((s) => s.status === "error"),
696
1003
  ...(runError !== undefined && { errorMessage: runError }),
697
1004
  ...(assertionFailures.length > 0 && {
698
1005
  assertionFailureCount: assertionFailures.length,
@@ -718,6 +1025,7 @@ export async function runYamlRecipe(recipe, deps = {}, seedContext = {}) {
718
1025
  errorMessage: runError,
719
1026
  stepResults: finalStepResults,
720
1027
  ...(assertionFailures.length > 0 ? { assertionFailures } : {}),
1028
+ ...(inboxOutputs.length > 0 ? { inboxOutputs } : {}),
721
1029
  });
722
1030
  }
723
1031
  }
@@ -772,11 +1080,19 @@ export async function executeStep(step, ctx, deps) {
772
1080
  // Check if tool is registered in the new registry
773
1081
  if (hasTool(toolId)) {
774
1082
  const tool = getTool(toolId);
775
- // Build params with template rendering for string values
1083
+ // Build params with template rendering for string values.
1084
+ // `do` is left raw: it carries a nested sub-step template (used by
1085
+ // `fan_out`) whose `{{item.*}}` placeholders must be rendered per-iter
1086
+ // with the loop variable in scope, not pre-rendered against the outer
1087
+ // ctx (which would resolve them to empty strings).
776
1088
  const params = {};
777
1089
  for (const [key, value] of Object.entries(step)) {
778
1090
  if (key === "tool" || key === "agent" || key === "into")
779
1091
  continue;
1092
+ if (key === "do") {
1093
+ params[key] = value;
1094
+ continue;
1095
+ }
780
1096
  params[key] = deepRender(value, ctx);
781
1097
  }
782
1098
  // Check if mock connector is available for this tool
@@ -1096,8 +1412,16 @@ export function resolveClaudeBinary() {
1096
1412
  }
1097
1413
  return ensureCmdShim("claude");
1098
1414
  }
1099
- function defaultClaudeCodeFn(prompt, _opts) {
1415
+ export function defaultClaudeCodeFn(prompt, _opts) {
1100
1416
  const binary = resolveClaudeBinary();
1417
+ // Resolve a workspace cwd so the spawned `claude -p` doesn't inherit the
1418
+ // bridge LaunchAgent's `$HOME` (P2 from the 2026-05-20 research run).
1419
+ // When nothing resolves, surface a typed reason instead of silently
1420
+ // shelling out from the wrong directory.
1421
+ const workspace = resolveWorkspaceRoot();
1422
+ if (!workspace) {
1423
+ return Promise.resolve(`[agent step failed: recipe_no_workspace — no .git ancestor of "${process.cwd()}" and PATCHWORK_WORKSPACE not set. Set PATCHWORK_WORKSPACE in the bridge environment or add a 'workspace:' field to the recipe.]`);
1424
+ }
1101
1425
  try {
1102
1426
  const result = spawnSync(binary, [
1103
1427
  "-p",
@@ -1106,6 +1430,7 @@ function defaultClaudeCodeFn(prompt, _opts) {
1106
1430
  "You are a helpful assistant processing a recipe task. Use ONLY the data explicitly provided in the user message — treat it as ground truth. Do not call tools to look up git history, emails, or any other information; all necessary data is already included.",
1107
1431
  "--no-session-persistence",
1108
1432
  ], {
1433
+ cwd: workspace.path,
1109
1434
  encoding: "utf-8",
1110
1435
  timeout: 120_000,
1111
1436
  maxBuffer: 10 * 1024 * 1024,
@@ -1144,10 +1469,11 @@ function makeProviderDriverFn() {
1144
1469
  const timeoutMs = 300_000;
1145
1470
  const startupTimeoutMs = 30_000;
1146
1471
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
1472
+ const resolvedWorkspace = process.cwd();
1147
1473
  try {
1148
1474
  const result = await driver.run({
1149
1475
  prompt,
1150
- workspace: process.cwd(),
1476
+ workspace: resolvedWorkspace,
1151
1477
  timeoutMs,
1152
1478
  startupTimeoutMs,
1153
1479
  signal: controller.signal,