@flumecode/runner 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -342,12 +342,97 @@ function createPlanTooling() {
342
342
  return { mcpServer, getPlans: () => renderedPlans };
343
343
  }
344
344
 
345
+ // src/report.ts
346
+ import { createSdkMcpServer as createSdkMcpServer3, tool as tool3 } from "@anthropic-ai/claude-agent-sdk";
347
+ import { z as z3 } from "zod";
348
+ var SERVER_NAME3 = "flume_report";
349
+ var SUBMIT_REPORT = "submit_report";
350
+ var REPORT_TOOL_NAME = `mcp__${SERVER_NAME3}__${SUBMIT_REPORT}`;
351
+ var STATUS_ICON = {
352
+ met: "\u2705",
353
+ not_met: "\u274C",
354
+ unclear: "\u26A0\uFE0F"
355
+ };
356
+ var evidenceSchema = z3.object({
357
+ file: z3.string().min(1).describe("Repo-relative path the hunk comes from."),
358
+ hunk: z3.string().min(1).describe(
359
+ "A unified-diff hunk body proving the criterion \u2014 the lines that matter, not the whole file. Rendered verbatim as a ```diff block."
360
+ ),
361
+ note: z3.string().optional().describe("Optional one-line explanation of why this hunk satisfies the criterion.")
362
+ });
363
+ var acVerdictSchema = z3.object({
364
+ criterion: z3.string().min(1).describe("The acceptance-criterion text, verbatim from the plan."),
365
+ status: z3.enum(["met", "not_met", "unclear"]).describe("Verdict for this criterion, verified against the actual diff."),
366
+ rationale: z3.string().min(1).describe("One or two sentences on why the verdict holds."),
367
+ evidence: z3.array(evidenceSchema).describe(
368
+ "Diff hunks proving the verdict. Include the relevant hunk(s) for a met criterion; may be empty for not_met / unclear."
369
+ )
370
+ });
371
+ var reportInputSchema = {
372
+ summary: z3.string().min(1).describe("One or two sentences on what was implemented."),
373
+ prose: z3.string().min(1).describe(
374
+ "Markdown for the remaining report sections \u2014 What changed, Files changed, Build / tests, and Caveats / follow-ups. Use ## headings. Do NOT include the acceptance-criteria section here (that goes in acceptanceCriteria) and do NOT include the PR link (the runner appends it)."
375
+ ),
376
+ acceptanceCriteria: z3.array(acVerdictSchema).min(1).describe(
377
+ "One entry per acceptance criterion from the plan, in plan order, each with a verdict and the diff evidence behind it."
378
+ )
379
+ };
380
+ var reportSchema = z3.object(reportInputSchema);
381
// Renders a submitted report object as one markdown string.
// Layout: trimmed `summary`, blank line, trimmed `prose`, blank line, then an
// "## Acceptance criteria" section holding, per criterion, a
// "### <icon> <criterion>" heading (icon looked up in STATUS_ICON by status),
// the trimmed rationale, and each evidence item as a backticked file path
// (followed by an em dash and the note when a note is set) plus a fenced diff block.
// Returns the collected lines joined with "\n".
+ function renderReport(report) {
382
+ const lines = [];
383
+ lines.push(report.summary.trim());
384
+ lines.push("");
385
+ lines.push(report.prose.trim());
386
+ lines.push("");
387
+ lines.push("## Acceptance criteria");
388
+ for (const ac of report.acceptanceCriteria) {
389
+ lines.push("");
390
+ lines.push(`### ${STATUS_ICON[ac.status]} ${ac.criterion}`);
391
+ lines.push("");
392
+ lines.push(ac.rationale.trim());
393
+ for (const ev of ac.evidence) {
394
+ lines.push("");
395
+ lines.push(ev.note ? `\`${ev.file}\` \u2014 ${ev.note}` : `\`${ev.file}\``);
396
+ lines.push("");
397
+ lines.push("```diff");
398
// Trailing newlines are stripped so the closing fence follows the hunk directly.
// NOTE(review): the hunk text is inserted verbatim; a hunk that itself contains a
// line of three backticks would end the fence early — confirm whether that matters.
+ lines.push(ev.hunk.replace(/\n+$/, ""));
399
+ lines.push("```");
400
+ }
401
+ }
402
+ return lines.join("\n");
403
+ }
404
// Builds the report-submission tooling for one agent run.
// Defines a SUBMIT_REPORT tool whose handler stores the parsed arguments in a
// closure variable, registers that tool on an SDK MCP server named SERVER_NAME3,
// and returns `{ mcpServer, getReport }`. `getReport()` yields the stored report,
// or null when the tool was never called.
+ function createReportTooling() {
405
+ let submittedReport = null;
406
+ const submitReport = tool3(
407
+ SUBMIT_REPORT,
408
+ "Submit the final implementation report as structured data. Call this exactly once, at the end of the run. `acceptanceCriteria` must contain one entry per plan criterion, each with a met / not_met / unclear verdict and the diff hunk(s) that prove it. `summary` + `prose` are markdown for the rest of the report. Do NOT include a PR link \u2014 the runner appends it.",
409
+ reportInputSchema,
410
+ async (args) => {
411
// Re-validates the arguments against the full object schema before storing them
// (presumably throws on invalid input, per zod's parse() — confirm). The variable
// is simply reassigned, so a second call replaces the earlier submission.
+ submittedReport = reportSchema.parse(args);
412
+ return {
413
+ content: [
414
+ {
415
+ type: "text",
416
+ text: "Report submitted. The runner will render and post it. End your turn now."
417
+ }
418
+ ]
419
+ };
420
+ }
421
+ );
422
+ const mcpServer = createSdkMcpServer3({
423
+ name: SERVER_NAME3,
424
+ tools: [submitReport]
425
+ });
426
+ return { mcpServer, getReport: () => submittedReport };
427
+ }
428
+
345
429
  // src/executor.ts
346
430
  var FLUME_PLUGIN_DIR = fileURLToPath2(new URL("../skills-plugin", import.meta.url));
347
431
  async function runClaudeCode(opts) {
348
432
  let finalText = "";
349
433
  const { mcpServer, collected } = createWidgetTooling();
350
434
  const { mcpServer: planServer, getPlans } = createPlanTooling();
435
+ const { mcpServer: reportServer, getReport } = createReportTooling();
351
436
  for await (const message of query({
352
437
  prompt: opts.prompt,
353
438
  options: {
@@ -368,8 +453,8 @@ async function runClaudeCode(opts) {
368
453
  // does NOT restrict anything else). Task lets the implement-plan
369
454
  // orchestrator spawn its subagents; without pre-approval the spawn could
370
455
  // stall waiting for an approval no one can give.
371
- mcpServers: { flume_widgets: mcpServer, flume_plan: planServer },
372
- allowedTools: [...WIDGET_TOOL_NAMES, PLAN_TOOL_NAME, "Task"]
456
+ mcpServers: { flume_widgets: mcpServer, flume_plan: planServer, flume_report: reportServer },
457
+ allowedTools: [...WIDGET_TOOL_NAMES, PLAN_TOOL_NAME, REPORT_TOOL_NAME, "Task"]
373
458
  }
374
459
  })) {
375
460
  if (message.type === "assistant") {
@@ -389,7 +474,7 @@ async function runClaudeCode(opts) {
389
474
  if (opts.abortController?.signal.aborted) {
390
475
  throw new Error("Run canceled by user");
391
476
  }
392
- return { text: finalText, widgets: collected, plans: getPlans() };
477
+ return { text: finalText, widgets: collected, plans: getPlans(), report: getReport() };
393
478
  }
394
479
 
395
480
  // src/health.ts
@@ -586,7 +671,7 @@ function buildRepairPrompt(ctx, hookLog) {
586
671
  ];
587
672
  return lines.join("\n");
588
673
  }
589
- function buildReleasePrompt(ctx) {
674
+ function buildReleasePrompt(ctx, baseChecks) {
590
675
  const task = `Use the \`flumecode:create-release\` skill to handle this turn. You are driving a release: first analyse commits since the last tag, propose version bumps, and ask the user to confirm via widgets (Phase 1); once the user's widget answers appear in the thread, apply the bumps to package.json files and update CHANGELOG.md (Phase 2). Do NOT commit or push \u2014 the runner handles that and opens the bump PR.`;
591
676
  const orient = `Before investigating raw source, check for a FlumeCode wiki at \`.flumecode/wiki/\`. If it exists, read \`.flumecode/wiki/README.md\` first \u2014 it is the index \u2014 and follow its links to the pages and source paths relevant to this release. If there is no wiki, work from the code directly.`;
592
677
  const widgets = `When you need the user to choose, ask it as a widget rather than writing the options as prose: call \`single_select\` for a one-of-N choice (radio buttons) or \`multi_select\` for a "select all that apply" choice (checkboxes). Don't add your own "Other" option \u2014 the UI always provides one. After calling a widget tool, end your turn \u2014 the user's answer comes back as their next message and starts a fresh run.`;
@@ -606,6 +691,23 @@ function buildReleasePrompt(ctx) {
606
691
  if (ctx.request?.body) {
607
692
  lines.push("", ctx.request.body);
608
693
  }
694
+ if (baseChecks && !baseChecks.ok) {
695
+ lines.push(
696
+ "",
697
+ "# Pre-release check status",
698
+ "",
699
+ "\u26A0\uFE0F The repository's pre-commit checks (lint / typecheck / tests) are currently FAILING on the base branch, independently of any version bump. A release must not ship a broken base:",
700
+ "",
701
+ "- **Phase 1 (propose):** tell the user, in your reply, that the base currently fails these checks and that the release will fix them as part of the bump.",
702
+ "- **Phase 2 (apply):** fix the failing code at its root so the checks pass, THEN apply the version bumps and CHANGELOG. Do NOT delete/skip tests or weaken assertions. The fixes ship in the same bump PR. Still do NOT commit or push \u2014 the runner does.",
703
+ "",
704
+ "Failing check output:",
705
+ "",
706
+ "```",
707
+ baseChecks.log,
708
+ "```"
709
+ );
710
+ }
609
711
  appendThread(lines, ctx);
610
712
  lines.push(
611
713
  "",
@@ -640,6 +742,12 @@ var MAX_BUFFER = 1 << 24;
640
742
  async function git(args) {
641
743
  return exec("git", args, { maxBuffer: MAX_BUFFER });
642
744
  }
745
+ var RUNNER_GIT_EMAIL = "runner@flumecode.local";
746
+ var RUNNER_GIT_NAME = "FlumeCode Runner";
747
// Sets the runner's commit identity in the checkout at `dir` by running
// `git -C <dir> config user.email` and `git -C <dir> config user.name` with
// RUNNER_GIT_EMAIL / RUNNER_GIT_NAME. No --global/--system flag is passed, so
// this presumably lands in the repository-local config — confirm.
+ async function ensureGitIdentity(dir) {
748
+ await git(["-C", dir, "config", "user.email", RUNNER_GIT_EMAIL]);
749
+ await git(["-C", dir, "config", "user.name", RUNNER_GIT_NAME]);
750
+ }
643
751
  function cloneUrl(ctx) {
644
752
  const { owner, name, cloneToken } = ctx.repo;
645
753
  return `https://x-access-token:${cloneToken}@github.com/${owner}/${name}.git`;
@@ -703,15 +811,20 @@ async function resetWorkspace(dir) {
703
811
  async function prepareAtSha(ctx, dir, reused) {
704
812
  if (!reused) {
705
813
  await cloneAtSha(ctx, dir);
814
+ await ensureGitIdentity(dir);
706
815
  return;
707
816
  }
708
817
  await git(["-C", dir, "remote", "set-url", "origin", cloneUrl(ctx)]);
818
+ await ensureGitIdentity(dir);
709
819
  }
710
820
  async function prepareResumingBranch(ctx, dir, reused) {
711
821
  if (!reused) {
712
- return cloneResumingBranch(ctx, dir);
822
+ const result = await cloneResumingBranch(ctx, dir);
823
+ await ensureGitIdentity(dir);
824
+ return result;
713
825
  }
714
826
  await git(["-C", dir, "remote", "set-url", "origin", cloneUrl(ctx)]);
827
+ await ensureGitIdentity(dir);
715
828
  return { resumed: true };
716
829
  }
717
830
  async function sweepWorkspaces() {
@@ -765,21 +878,25 @@ function commitFailureLog(err) {
765
878
  const parts = [e.stdout, e.stderr].map((s) => typeof s === "string" ? s.trim() : "").filter((s) => s.length > 0);
766
879
  return parts.length > 0 ? parts.join("\n") : e.message ?? String(err);
767
880
  }
881
// Returns true when an error from a git invocation looks like "this git does not
// know that subcommand" rather than a genuine failure of the command itself.
// The decision is a case-insensitive text match over the error's `stderr`
// (when it is a string) followed by its `message`.
+ function isUnsupportedGitSubcommand(err) {
882
+ const e = err;
883
+ const text = `${typeof e.stderr === "string" ? e.stderr : ""}
884
+ ${e.message ?? ""}`;
885
// Matches any of three phrases: "is not a git command", "unknown subcommand",
// or "usage: git hook".
+ return /is not a git command|unknown subcommand|usage: git hook/i.test(text);
886
+ }
887
// Runs the repository's pre-commit hook in `dir` via `git hook run pre-commit`
// and reports the outcome as `{ ok, log, skipped }`:
//   - command succeeds                      -> ok: true,  skipped: false, empty log
//   - git does not support the subcommand   -> ok: true,  skipped: true,  empty log
//   - any other failure                     -> ok: false, skipped: false,
//                                              log from commitFailureLog(err)
// Never rejects because of a failing hook; failures are returned, not thrown.
+ async function runRepoChecks(dir) {
888
+ try {
889
+ await git(["-C", dir, "hook", "run", "pre-commit"]);
890
+ return { ok: true, log: "", skipped: false };
891
+ } catch (err) {
892
// An unsupported `git hook` subcommand is treated as "checks skipped", not as a failure.
+ if (isUnsupportedGitSubcommand(err)) return { ok: true, log: "", skipped: true };
893
+ return { ok: false, log: commitFailureLog(err), skipped: false };
894
+ }
895
+ }
768
896
  async function commitChanges(ctx, dir) {
769
897
  if (!await hasChanges(dir)) return false;
770
898
  try {
771
- await git([
772
- "-C",
773
- dir,
774
- "-c",
775
- "user.email=runner@flumecode.local",
776
- "-c",
777
- "user.name=FlumeCode Runner",
778
- "commit",
779
- "--quiet",
780
- "-m",
781
- `FlumeCode: ${jobTitle(ctx)}`
782
- ]);
899
+ await git(["-C", dir, "commit", "--quiet", "-m", `FlumeCode: ${jobTitle(ctx)}`]);
783
900
  } catch (err) {
784
901
  throw new PreCommitError(commitFailureLog(err));
785
902
  }
@@ -819,17 +936,7 @@ async function mergeInMergeBranch(ctx, dir) {
819
936
  if (!mergeBranch) return { conflicted: false };
820
937
  await git(["-C", dir, "fetch", "--quiet", "origin", mergeBranch]);
821
938
  try {
822
- await git([
823
- "-C",
824
- dir,
825
- "-c",
826
- "user.email=runner@flumecode.local",
827
- "-c",
828
- "user.name=FlumeCode Runner",
829
- "merge",
830
- "--no-edit",
831
- "FETCH_HEAD"
832
- ]);
939
+ await git(["-C", dir, "merge", "--no-edit", "FETCH_HEAD"]);
833
940
  return { conflicted: false };
834
941
  } catch {
835
942
  return { conflicted: true };
@@ -890,6 +997,7 @@ var CANCEL_POLL_MS = 2500;
890
997
  var ORCHESTRATOR_MODEL = "sonnet";
891
998
  var ORCHESTRATOR_MAX_TURNS = 80;
892
999
  var MAX_COMMIT_REPAIRS = 2;
1000
+ var MAX_IMPLEMENT_RETRIES = 1;
893
1001
  var INIT_MAX_TURNS = 200;
894
1002
  var DOCUMENT_MAX_TURNS = 120;
895
1003
  var HEARTBEAT_MS = 5 * 6e4;
@@ -1082,19 +1190,36 @@ async function processChatJob(ctx, dir, abort) {
1082
1190
  reply += outcomeBanner(outcome, { branch: ctx.repo.checkoutBranch, documented, autoMerged });
1083
1191
  return { text: reply, widgets: [] };
1084
1192
  }
1193
// True when `report` is non-null/defined and at least one acceptance criterion
// is marked "met" with a non-empty evidence array — i.e. the report asserts that
// changes were made. Always returns a boolean.
+ function reportClaimsWork(report) {
1194
+ return !!report && report.acceptanceCriteria.some((ac) => ac.status === "met" && ac.evidence.length > 0);
1195
+ }
1085
1196
  async function processImplementJob(ctx, dir, resumed, abort) {
1086
1197
  console.log(`
1087
1198
  \u25B6 Implement ${ctx.jobId} \u2014 ${ctx.repo.fullName}: "${jobTitle(ctx)}"`);
1088
1199
  const installResult = await installDependencies(dir);
1089
- const result = await runClaudeCode({
1090
- cwd: dir,
1091
- prompt: buildPrompt(ctx),
1092
- permissionMode: ctx.permissionMode,
1093
- model: ORCHESTRATOR_MODEL,
1094
- maxTurns: ORCHESTRATOR_MAX_TURNS,
1095
- abortController: abort
1096
- });
1097
- let reply = result.text.trim() || "(the agent produced no report)";
1200
+ let report;
1201
+ let reply;
1202
+ for (let attempt = 0; ; attempt++) {
1203
+ const result = await runClaudeCode({
1204
+ cwd: dir,
1205
+ prompt: buildPrompt(ctx),
1206
+ permissionMode: ctx.permissionMode,
1207
+ model: ORCHESTRATOR_MODEL,
1208
+ maxTurns: ORCHESTRATOR_MAX_TURNS,
1209
+ abortController: abort
1210
+ });
1211
+ report = result.report ?? void 0;
1212
+ reply = (report ? renderReport(report) : result.text.trim()) || "(the agent produced no report)";
1213
+ if (abort.signal.aborted || !reportClaimsWork(report) || await hasChanges(dir)) break;
1214
+ if (attempt >= MAX_IMPLEMENT_RETRIES) {
1215
+ throw new Error(
1216
+ `Implementation reported completed work (acceptance criteria met with diff evidence) but the working tree is clean after ${attempt + 1} attempt(s) \u2014 no changes were persisted, so no pull request could be opened.`
1217
+ );
1218
+ }
1219
+ console.warn(
1220
+ ` implement ${ctx.jobId}: report claims changes but the working tree is clean \u2014 re-running implementation (attempt ${attempt + 2})`
1221
+ );
1222
+ }
1098
1223
  if (installResult.status === "failed") {
1099
1224
  reply += `
1100
1225
 
@@ -1118,7 +1243,12 @@ async function processImplementJob(ctx, dir, resumed, abort) {
1118
1243
  }
1119
1244
  const { outcome, autoMerged } = await pushAndOpenPr(ctx, dir, abort, { rebase: !resumed });
1120
1245
  reply += outcomeBanner(outcome, { branch: ctx.repo.checkoutBranch, documented, autoMerged });
1121
- return { text: reply, widgets: [], ...outcome.kind === "pr" ? { pr: outcome.pr } : {} };
1246
+ return {
1247
+ text: reply,
1248
+ widgets: [],
1249
+ ...report ? { report } : {},
1250
+ ...outcome.kind === "pr" ? { pr: outcome.pr } : {}
1251
+ };
1122
1252
  }
1123
1253
  async function processReviseJob(ctx, dir, resumed, abort) {
1124
1254
  console.log(`
@@ -1194,9 +1324,16 @@ async function processReleaseJob(ctx, dir, resumed, abort) {
1194
1324
  console.log(`
1195
1325
  \u25B6 Release ${ctx.jobId} \u2014 ${ctx.repo.fullName}: "${jobTitle(ctx)}"`);
1196
1326
  const installResult = await installDependencies(dir);
1327
+ const checks = await runRepoChecks(dir);
1328
+ if (checks.skipped) {
1329
+ console.log(` \u2026release ${ctx.jobId}: pre-release checks skipped (git too old for 'hook run')`);
1330
+ } else {
1331
+ console.log(` \u2026release ${ctx.jobId}: pre-release checks ${checks.ok ? "passed" : "FAILED"}`);
1332
+ }
1333
+ const baseChecks = checks.ok ? void 0 : { ok: false, log: trimHookLog(checks.log) };
1197
1334
  const result = await runClaudeCode({
1198
1335
  cwd: dir,
1199
- prompt: buildReleasePrompt(ctx),
1336
+ prompt: buildReleasePrompt(ctx, baseChecks),
1200
1337
  permissionMode: ctx.permissionMode,
1201
1338
  model: ORCHESTRATOR_MODEL,
1202
1339
  maxTurns: ORCHESTRATOR_MAX_TURNS,
@@ -1282,13 +1419,14 @@ async function pollLoop(config) {
1282
1419
  };
1283
1420
  scheduleCancelPoll();
1284
1421
  try {
1285
- const { text, widgets, pr, plans } = await processJob(ctx, abort);
1422
+ const { text, widgets, pr, plans, report } = await processJob(ctx, abort);
1286
1423
  await reportJob(config, ctx.jobId, {
1287
1424
  status: "done",
1288
1425
  text,
1289
1426
  widgets,
1290
1427
  pr,
1291
- ...plans?.length ? { plans } : {}
1428
+ ...plans?.length ? { plans } : {},
1429
+ ...report ? { report } : {}
1292
1430
  });
1293
1431
  console.log(`\u2713 Job ${ctx.jobId} done`);
1294
1432
  } catch (err) {
@@ -1300,10 +1438,12 @@ async function pollLoop(config) {
1300
1438
  console.error(` (failed to report the cancellation: ${errorMessage2(reportErr)})`);
1301
1439
  }
1302
1440
  } else {
1303
- const message = errorMessage2(err);
1304
- console.error(`\u2717 Job ${ctx.jobId} failed: ${message}`);
1441
+ console.error(`\u2717 Job ${ctx.jobId} failed: ${errorMessage2(err)}`);
1305
1442
  try {
1306
- await reportJob(config, ctx.jobId, { status: "error", error: message });
1443
+ await reportJob(config, ctx.jobId, {
1444
+ status: "error",
1445
+ error: formatJobError(ctx, err)
1446
+ });
1307
1447
  } catch (reportErr) {
1308
1448
  console.error(` (also failed to report the error: ${errorMessage2(reportErr)})`);
1309
1449
  }
@@ -1322,6 +1462,40 @@ function sleep(ms) {
1322
1462
  function errorMessage2(err) {
1323
1463
  return err instanceof Error ? err.message : String(err);
1324
1464
  }
1465
+ var MAX_HOOK_LOG_LINES = 80;
1466
+ var MAX_HOOK_LOG_CHARS = 4e3;
1467
// Shortens hook output to its tail so it fits in a prompt or error message.
// Steps: drop trailing whitespace; keep only the last MAX_HOOK_LOG_LINES lines;
// then keep only the last MAX_HOOK_LOG_CHARS characters. When either cut removed
// something, the result is prefixed with an "(earlier output trimmed)" marker line.
// The end of the log is always what survives.
+ function trimHookLog(log) {
1468
+ let trimmed = log.trimEnd();
1469
+ let elided = false;
1470
+ const lines = trimmed.split("\n");
1471
+ if (lines.length > MAX_HOOK_LOG_LINES) {
1472
+ trimmed = lines.slice(-MAX_HOOK_LOG_LINES).join("\n");
1473
+ elided = true;
1474
+ }
1475
// The character cap is applied after the line cap and slices by character
// count, so the first surviving line may be cut mid-line.
+ if (trimmed.length > MAX_HOOK_LOG_CHARS) {
1476
+ trimmed = trimmed.slice(-MAX_HOOK_LOG_CHARS);
1477
+ elided = true;
1478
+ }
1479
+ return elided ? `\u2026(earlier output trimmed)\u2026
1480
+ ${trimmed}` : trimmed;
1481
+ }
1482
// Builds the error text reported for a failed job.
// Any error that is not a PreCommitError is reduced to errorMessage2(err).
// A PreCommitError becomes a markdown message: a "blocked" headline, an
// explanation citing MAX_COMMIT_REPAIRS, the trimmed hook log in a fenced block,
// and a "Next step" paragraph.
+ function formatJobError(ctx, err) {
1483
+ if (!(err instanceof PreCommitError)) return errorMessage2(err);
1484
// Release jobs get wording about the merge branch failing independently of the
// version bump; every other job kind gets the generic repair-attempt wording.
+ const nextStep = ctx.kind === "release" ? `These checks are failing on \`${ctx.repo.mergeBranch}\` independently of the version bump, and the release couldn't fix them after ${MAX_COMMIT_REPAIRS} automatic attempts. Open a request on **${ctx.repo.fullName}** to fix the failing checks above, then start the release again once that fix has merged.` : `The agent couldn't get its change past these checks after ${MAX_COMMIT_REPAIRS} automatic repair attempts. Open a request on **${ctx.repo.fullName}** describing the failing checks above so the agent can fix them at their root, then try again.`;
1485
+ return [
1486
+ "\u274C **Blocked by failing pre-commit checks.**",
1487
+ "",
1488
+ `The repository's pre-commit hook (lint / typecheck / tests) rejected the commit after ${MAX_COMMIT_REPAIRS} automatic repair attempts, so nothing was pushed.`,
1489
+ "",
1490
+ "**What failed:**",
1491
+ "",
1492
+ "```",
1493
+ trimHookLog(err.log),
1494
+ "```",
1495
+ "",
1496
+ `**Next step:** ${nextStep}`
1497
+ ].join("\n");
1498
+ }
1325
1499
 
1326
1500
  // src/cli.ts
1327
1501
  var DEFAULT_SERVER = process.env.FLUME_SERVER || "http://localhost:3000";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flumecode/runner",
3
- "version": "0.7.0",
3
+ "version": "0.8.0",
4
4
  "type": "module",
5
5
  "description": "FlumeCode local runner — claims jobs and drives your local Claude Code against a real checkout.",
6
6
  "bin": {
@@ -171,6 +171,28 @@ version did not change.
171
171
  `apps/runner/package.json`. Leave `apps/web/package.json` unchanged.
172
172
  - **Clear Phase 1 text:** be explicit about what changed since the last tag so the
173
173
  user can confidently confirm or override your suggestions.
174
- - **Never edit** any file other than `apps/web/package.json`,
175
- `apps/runner/package.json`, and `CHANGELOG.md`.
174
+ - **Edit only version files with one exception.** Normally edit only
175
+ `apps/web/package.json`, `apps/runner/package.json`, and `CHANGELOG.md`. The sole
176
+ exception: when the prompt includes a **`# Pre-release check status`** section
177
+ reporting failing checks, you must also fix the failing code (any file needed) so
178
+ the tree is green — see "Pre-release checks" below. Never weaken or skip checks to
179
+ silence them.
176
180
  - **Never commit, push, or open a PR** — the runner does that.
181
+
182
+ ## Pre-release checks
183
+
184
+ We cannot release code with failing checks. Before this turn, the runner ran the
185
+ repository's own pre-commit hook (lint / typecheck / tests). If the prompt contains
186
+ a **`# Pre-release check status`** section, the base branch is currently broken
187
+ _independently of the version bump_:
188
+
189
+ - **Phase 1:** state plainly in your reply that the base currently fails these
190
+ checks and that the release will fix them as part of the bump, then ask the
191
+ version questions as usual.
192
+ - **Phase 2:** fix the failing code at its root **first** (so the checks pass),
193
+ **then** apply the version bumps and CHANGELOG. The fixes ship in the same bump
194
+ PR. Do not delete or skip tests, weaken assertions, or disable checks. Still do
195
+ not commit or push — the runner commits everything together.
196
+
197
+ If there is no `# Pre-release check status` section, the base is clean (or the check
198
+ was skipped); proceed normally and edit only the version files.
@@ -5,8 +5,9 @@ description: >-
5
5
  subagents instead of writing the code yourself. Use in edit-capable runs. You
6
6
  act as the orchestrator: delegate implementation, acceptance-criteria review,
7
7
  code-quality review, and report-writing to Task subagents — picking the right
8
- model for each phase — then return their report. Makes edits via subagents;
9
- never commits, pushes, or opens a PR (the runner does that).
8
+ model for each phase. The report subagent submits a structured report (with
9
+ per-criterion diff evidence) via the submit_report tool. Makes edits via
10
+ subagents; never commits, pushes, or opens a PR (the runner does that).
10
11
  ---
11
12
 
12
13
  # implement-plan
@@ -67,8 +68,19 @@ the next step.
67
68
  3. **Acceptance-criteria review** — Task, `model: "opus"`, read-only. Give the
68
69
  subagent the full AC list and tell it to verify each one against the actual
69
70
  changes (run `git --no-pager diff`, read the changed files, run tests/build if
70
- useful). It must return a per-AC verdict: **met / not met / unclear**, each
71
- with concrete evidence (file:line, test result).
71
+ useful). For **each** AC it must return: the criterion text verbatim, a verdict
72
+ (**met / not met / unclear**), a one-or-two-sentence rationale, and — this is the
73
+ evidence the report needs — the **exact diff hunk(s)** that prove it, each tagged
74
+ with its file path (the minimal lines that matter, copied verbatim from
75
+ `git --no-pager diff`; not the whole file). A _met_ AC should cite at least one
76
+ hunk; _not met_ / _unclear_ may cite none. **Ground every verdict in the actual
77
+ diff:** a criterion may be marked _met_ only if `git --no-pager diff` really
78
+ contains the change that satisfies it, and each cited hunk must be copied verbatim
79
+ from that live output — never reconstructed from the plan or from what the
80
+ implement subagent claimed. If `git --no-pager diff` is empty, the implementation
81
+ produced no changes: no criterion may be _met_, and the review must say so. Tell it
82
+ to return this as a clean, structured list so you can hand it straight to the
83
+ report step.
72
84
 
73
85
  4. **Code-quality review** — Task, `model: "opus"`, read-only. Give the subagent
74
86
  the coding guidelines (verbatim) and tell it to review the changes for
@@ -82,30 +94,45 @@ the next step.
82
94
  failed. Repeat at most **2** times. If something still fails after that, stop
83
95
  looping and record the gap honestly in the report — do not hide it.
84
96
 
85
- 6. **Report** — Task, `model: "opus"`, read-only. Give the subagent the plan, the
86
- final `git --no-pager diff` (or tell it to run it), the AC verdicts, and the
87
- quality findings, and have it write the user-facing report in the shape below.
88
-
89
- 7. **Return the report.** Your final reply **is** that report — output it verbatim
90
- as your last message, with nothing added. The runner posts it to the thread and
91
- appends the pull-request link.
92
-
93
- ## The report (what the user sees)
94
-
95
- Have the report subagent produce, in this shape:
96
-
97
- - **Summary** — one or two sentences on what was implemented.
98
- - **What changed** — the plan steps, each mapped to the concrete changes that
99
- satisfy it.
100
- - **Acceptance criteria** — a checklist, each AC marked ✅ met / ❌ not met /
101
- ⚠️ unclear, mirroring the AC review.
102
- - **Code quality** — a short note on the quality-review outcome and anything left
103
- as nice-to-have.
104
- - **Files changed** — the list from the diff.
105
- - **Build / tests** — what was run and the result, or why it wasn't run.
106
- - **Caveats / follow-ups** — anything deferred, unmet, or worth a human's eyes.
107
-
108
- Do **not** include a PR link — the runner adds it.
97
+ 6. **Report** — Task, `model: "opus"`, read-only. Give the subagent the plan, the AC
98
+ verdicts (from step 3), and the quality findings, and tell it to run
99
+ `git --no-pager diff` itself as the **single source of truth** for the report.
100
+ Every `evidence` hunk it submits must be copied verbatim from that live diff — it
101
+ must drop or correct any hunk carried over from step 3 that no longer appears in
102
+ the actual diff, and the **Files changed** list must come from
103
+ `git --no-pager diff --stat`, not from what an earlier subagent claimed. **If
104
+ `git --no-pager diff` is empty, the implementation changed nothing:** the report
105
+ must say so plainly — an honest `summary`, no AC marked `met` with evidence — and
106
+ must never describe edits that aren't in the diff. Tell it to submit the
107
+ user-facing report by calling the **`submit_report`** tool — it has that tool
108
+ available. It must call `submit_report` exactly once and must not edit any files.
109
+
110
+ 7. **Confirm and end.** Once the report subagent has called `submit_report`, you are
111
+ done — end your turn. The runner reads the submitted report, renders it, posts it
112
+ to the thread, and appends the pull-request link. (Your own final text is only a
113
+ fallback if no report was submitted, so make sure the subagent submits one.)
114
+
115
+ ## The report (what `submit_report` takes)
116
+
117
+ The report subagent calls `submit_report` with these fields:
118
+
119
+ - **`summary`** — one or two sentences on what was implemented.
120
+ - **`prose`** — markdown for the remaining sections, using `##` headings:
121
+ **What changed** (the plan steps, each mapped to the concrete changes that satisfy
122
+ it), **Code quality** (the quality-review outcome and anything left as
123
+ nice-to-have), **Files changed** (the list from the diff), **Build / tests** (what
124
+ was run and the result, or why it wasn't run), and **Caveats / follow-ups**
125
+ (anything deferred, unmet, or worth a human's eyes). Do **not** put the
126
+ acceptance-criteria section in `prose`, and do **not** include a PR link — the
127
+ runner adds it.
128
+ - **`acceptanceCriteria`** — one entry per AC from the plan, in plan order, each:
129
+ - `criterion` — the AC text verbatim.
130
+ - `status` — `"met"` / `"not_met"` / `"unclear"`, mirroring the AC review.
131
+ - `rationale` — one or two sentences on why the verdict holds.
132
+ - `evidence` — an array of `{ file, hunk, note? }`, where `hunk` is copied
133
+ verbatim from the live `git --no-pager diff` and proves the verdict (`note`
134
+ optionally explains it). Never include a hunk that isn't in the actual diff. Cite
135
+ the supporting hunk(s) for a met criterion; may be empty for not_met / unclear.
109
136
 
110
137
  ## Always
111
138
 
@@ -115,4 +142,7 @@ Do **not** include a PR link — the runner adds it.
115
142
  - Make every Task prompt self-contained — subagents see only what you give them.
116
143
  - Reviewers and the report writer never modify files.
117
144
  - Never commit, push, or open a PR.
118
- - Your final message is the report, verbatim.
145
+ - The report subagent delivers the report by calling `submit_report` (structured),
146
+ once — not as prose for you to echo. Each acceptance criterion carries the diff
147
+ hunk(s) that prove its verdict, copied verbatim from the live `git --no-pager diff`
148
+ — never fabricated. An empty diff means an honest "nothing changed" report.
@@ -77,8 +77,10 @@ Your last message **is** the comment posted to the plan thread — write it for
77
77
  user:
78
78
 
79
79
  - **Implemented:** a short report — what you changed and why, which files, and how
80
- it was verified (build/tests). The runner appends the pull-request link, so don't
81
- add one.
80
+ it was verified (build/tests). Base "what changed" and "which files" on the actual
81
+ `git --no-pager diff` (`--stat` for the file list), not on what a subagent claimed;
82
+ if the diff is empty, say nothing was changed rather than describing edits that
83
+ aren't there. The runner appends the pull-request link, so don't add one.
82
84
  - **Clarify / push back:** your question or reasoning, as prose (plus any widget).
83
85
  - **Re-plan:** you called `submit_plan`; the rendered plan is posted automatically,
84
86
  so keep any extra reply text minimal.