npm - @flumecode/runner - Versions diffs - 0.7.0 → 0.9.0 - Mend

@flumecode/runner 0.7.0 → 0.9.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6)

package/README.md +1 -1
package/dist/cli.js +218 -44
package/package.json +1 -1
package/skills-plugin/skills/create-release/SKILL.md +24 -2
package/skills-plugin/skills/implement-plan/SKILL.md +106 -45
package/skills-plugin/skills/revise-implementation/SKILL.md +13 -6

package/README.md CHANGED

@@ -63,7 +63,7 @@ skipping if that version is already on npm).
 6. Report the summary back (`POST /api/runner/jobs/:id/complete`), which fills in
    the pending agent comment in the thread.
-Jobs come in two kinds. **chat** jobs answer a request thread (the flow above).
+Jobs come in two kinds. **comment** jobs answer a request thread (the flow above).
 **init** jobs bootstrap a repository: they clone the default branch onto a fresh
 `flumecode/init-*` branch, run the `flumecode:document` skill to create the
 `.flumecode/` wiki, and open a PR. A repo must be initialized (from its dashboard

package/dist/cli.js CHANGED

@@ -342,12 +342,97 @@ function createPlanTooling() {
   return { mcpServer, getPlans: () => renderedPlans };
 }
+// src/report.ts
+import { createSdkMcpServer as createSdkMcpServer3, tool as tool3 } from "@anthropic-ai/claude-agent-sdk";
+import { z as z3 } from "zod";
+var SERVER_NAME3 = "flume_report";
+var SUBMIT_REPORT = "submit_report";
+var REPORT_TOOL_NAME = `mcp__${SERVER_NAME3}__${SUBMIT_REPORT}`;
+var STATUS_ICON = {
+  met: "\u2705",
+  not_met: "\u274C",
+  unclear: "\u26A0\uFE0F"
+};
+var evidenceSchema = z3.object({
+  file: z3.string().min(1).describe("Repo-relative path the hunk comes from."),
+  hunk: z3.string().min(1).describe(
+    "A unified-diff hunk body proving the criterion \u2014 the lines that matter, not the whole file. Rendered verbatim as a ```diff block."
+  ),
+  note: z3.string().optional().describe("Optional one-line explanation of why this hunk satisfies the criterion.")
+});
+var acVerdictSchema = z3.object({
+  criterion: z3.string().min(1).describe("The acceptance-criterion text, verbatim from the plan."),
+  status: z3.enum(["met", "not_met", "unclear"]).describe("Verdict for this criterion, verified against the actual diff."),
+  rationale: z3.string().min(1).describe("One or two sentences on why the verdict holds."),
+  evidence: z3.array(evidenceSchema).describe(
+    "Diff hunks proving the verdict. Include the relevant hunk(s) for a met criterion; may be empty for not_met / unclear."
+  )
+});
+var reportInputSchema = {
+  summary: z3.string().min(1).describe("One or two sentences on what was implemented."),
+  prose: z3.string().min(1).describe(
+    "Markdown for the remaining report sections \u2014 What changed, Files changed, Build / tests, and Caveats / follow-ups. Use ## headings. Do NOT include the acceptance-criteria section here (that goes in acceptanceCriteria) and do NOT include the PR link (the runner appends it)."
+  ),
+  acceptanceCriteria: z3.array(acVerdictSchema).min(1).describe(
+    "One entry per acceptance criterion from the plan, in plan order, each with a verdict and the diff evidence behind it."
+  )
+};
+var reportSchema = z3.object(reportInputSchema);
+function renderReport(report) {
+  const lines = [];
+  lines.push(report.summary.trim());
+  lines.push("");
+  lines.push(report.prose.trim());
+  lines.push("");
+  lines.push("## Acceptance criteria");
+  for (const ac of report.acceptanceCriteria) {
+    lines.push("");
+    lines.push(`### ${STATUS_ICON[ac.status]} ${ac.criterion}`);
+    lines.push("");
+    lines.push(ac.rationale.trim());
+    for (const ev of ac.evidence) {
+      lines.push("");
+      lines.push(ev.note ? `\`${ev.file}\` \u2014 ${ev.note}` : `\`${ev.file}\``);
+      lines.push("");
+      lines.push("```diff");
+      lines.push(ev.hunk.replace(/\n+$/, ""));
+      lines.push("```");
+    }
+  }
+  return lines.join("\n");
+}
+function createReportTooling() {
+  let submittedReport = null;
+  const submitReport = tool3(
+    SUBMIT_REPORT,
+    "Submit the final implementation report as structured data. Call this exactly once, at the end of the run. `acceptanceCriteria` must contain one entry per plan criterion, each with a met / not_met / unclear verdict and the diff hunk(s) that prove it. `summary` + `prose` are markdown for the rest of the report. Do NOT include a PR link \u2014 the runner appends it.",
+    reportInputSchema,
+    async (args) => {
+      submittedReport = reportSchema.parse(args);
+      return {
+        content: [
+          {
+            type: "text",
+            text: "Report submitted. The runner will render and post it. End your turn now."
+          }
+        ]
+      };
+    }
+  );
+  const mcpServer = createSdkMcpServer3({
+    name: SERVER_NAME3,
+    tools: [submitReport]
+  });
+  return { mcpServer, getReport: () => submittedReport };
+}
 // src/executor.ts
 var FLUME_PLUGIN_DIR = fileURLToPath2(new URL("../skills-plugin", import.meta.url));
 async function runClaudeCode(opts) {
   let finalText = "";
   const { mcpServer, collected } = createWidgetTooling();
   const { mcpServer: planServer, getPlans } = createPlanTooling();
+  const { mcpServer: reportServer, getReport } = createReportTooling();
   for await (const message of query({
     prompt: opts.prompt,
     options: {
@@ -368,8 +453,8 @@ async function runClaudeCode(opts) {
       // does NOT restrict anything else). Task lets the implement-plan
       // orchestrator spawn its subagents; without pre-approval the spawn could
       // stall waiting for an approval no one can give.
-      mcpServers: { flume_widgets: mcpServer, flume_plan: planServer },
-      allowedTools: [...WIDGET_TOOL_NAMES, PLAN_TOOL_NAME, "Task"]
+      mcpServers: { flume_widgets: mcpServer, flume_plan: planServer, flume_report: reportServer },
+      allowedTools: [...WIDGET_TOOL_NAMES, PLAN_TOOL_NAME, REPORT_TOOL_NAME, "Task"]
     }
   })) {
     if (message.type === "assistant") {
@@ -389,7 +474,7 @@ async function runClaudeCode(opts) {
   if (opts.abortController?.signal.aborted) {
     throw new Error("Run canceled by user");
   }
-  return { text: finalText, widgets: collected, plans: getPlans() };
+  return { text: finalText, widgets: collected, plans: getPlans(), report: getReport() };
 }
 // src/health.ts
@@ -586,7 +671,7 @@ function buildRepairPrompt(ctx, hookLog) {
   ];
   return lines.join("\n");
 }
-function buildReleasePrompt(ctx) {
+function buildReleasePrompt(ctx, baseChecks) {
   const task = `Use the \`flumecode:create-release\` skill to handle this turn. You are driving a release: first analyse commits since the last tag, propose version bumps, and ask the user to confirm via widgets (Phase 1); once the user's widget answers appear in the thread, apply the bumps to package.json files and update CHANGELOG.md (Phase 2). Do NOT commit or push \u2014 the runner handles that and opens the bump PR.`;
   const orient = `Before investigating raw source, check for a FlumeCode wiki at \`.flumecode/wiki/\`. If it exists, read \`.flumecode/wiki/README.md\` first \u2014 it is the index \u2014 and follow its links to the pages and source paths relevant to this release. If there is no wiki, work from the code directly.`;
   const widgets = `When you need the user to choose, ask it as a widget rather than writing the options as prose: call \`single_select\` for a one-of-N choice (radio buttons) or \`multi_select\` for a "select all that apply" choice (checkboxes). Don't add your own "Other" option \u2014 the UI always provides one. After calling a widget tool, end your turn \u2014 the user's answer comes back as their next message and starts a fresh run.`;
@@ -606,6 +691,23 @@ function buildReleasePrompt(ctx) {
   if (ctx.request?.body) {
     lines.push("", ctx.request.body);
   }
+  if (baseChecks && !baseChecks.ok) {
+    lines.push(
+      "",
+      "# Pre-release check status",
+      "",
+      "\u26A0\uFE0F The repository's pre-commit checks (lint / typecheck / tests) are currently FAILING on the base branch, independently of any version bump. A release must not ship a broken base:",
+      "",
+      "- **Phase 1 (propose):** tell the user, in your reply, that the base currently fails these checks and that the release will fix them as part of the bump.",
+      "- **Phase 2 (apply):** fix the failing code at its root so the checks pass, THEN apply the version bumps and CHANGELOG. Do NOT delete/skip tests or weaken assertions. The fixes ship in the same bump PR. Still do NOT commit or push \u2014 the runner does.",
+      "",
+      "Failing check output:",
+      "",
+      "```",
+      baseChecks.log,
+      "```"
+    );
+  }
   appendThread(lines, ctx);
   lines.push(
     "",
@@ -640,6 +742,12 @@ var MAX_BUFFER = 1 << 24;
 async function git(args) {
   return exec("git", args, { maxBuffer: MAX_BUFFER });
 }
+var RUNNER_GIT_EMAIL = "runner@flumecode.local";
+var RUNNER_GIT_NAME = "FlumeCode Runner";
+async function ensureGitIdentity(dir) {
+  await git(["-C", dir, "config", "user.email", RUNNER_GIT_EMAIL]);
+  await git(["-C", dir, "config", "user.name", RUNNER_GIT_NAME]);
+}
 function cloneUrl(ctx) {
   const { owner, name, cloneToken } = ctx.repo;
   return `https://x-access-token:${cloneToken}@github.com/${owner}/${name}.git`;
@@ -703,15 +811,20 @@ async function resetWorkspace(dir) {
 async function prepareAtSha(ctx, dir, reused) {
   if (!reused) {
     await cloneAtSha(ctx, dir);
+    await ensureGitIdentity(dir);
     return;
   }
   await git(["-C", dir, "remote", "set-url", "origin", cloneUrl(ctx)]);
+  await ensureGitIdentity(dir);
 }
 async function prepareResumingBranch(ctx, dir, reused) {
   if (!reused) {
-    return cloneResumingBranch(ctx, dir);
+    const result = await cloneResumingBranch(ctx, dir);
+    await ensureGitIdentity(dir);
+    return result;
   }
   await git(["-C", dir, "remote", "set-url", "origin", cloneUrl(ctx)]);
+  await ensureGitIdentity(dir);
   return { resumed: true };
 }
 async function sweepWorkspaces() {
@@ -765,21 +878,25 @@ function commitFailureLog(err) {
   const parts = [e.stdout, e.stderr].map((s) => typeof s === "string" ? s.trim() : "").filter((s) => s.length > 0);
   return parts.length > 0 ? parts.join("\n") : e.message ?? String(err);
 }
+function isUnsupportedGitSubcommand(err) {
+  const e = err;
+  const text = `${typeof e.stderr === "string" ? e.stderr : ""}
+${e.message ?? ""}`;
+  return /is not a git command|unknown subcommand|usage: git hook/i.test(text);
+}
+async function runRepoChecks(dir) {
+  try {
+    await git(["-C", dir, "hook", "run", "pre-commit"]);
+    return { ok: true, log: "", skipped: false };
+  } catch (err) {
+    if (isUnsupportedGitSubcommand(err)) return { ok: true, log: "", skipped: true };
+    return { ok: false, log: commitFailureLog(err), skipped: false };
+  }
+}
 async function commitChanges(ctx, dir) {
   if (!await hasChanges(dir)) return false;
   try {
-    await git([
-      "-C",
-      dir,
-      "-c",
-      "user.email=runner@flumecode.local",
-      "-c",
-      "user.name=FlumeCode Runner",
-      "commit",
-      "--quiet",
-      "-m",
-      `FlumeCode: ${jobTitle(ctx)}`
-    ]);
+    await git(["-C", dir, "commit", "--quiet", "-m", `FlumeCode: ${jobTitle(ctx)}`]);
   } catch (err) {
     throw new PreCommitError(commitFailureLog(err));
   }
@@ -819,17 +936,7 @@ async function mergeInMergeBranch(ctx, dir) {
   if (!mergeBranch) return { conflicted: false };
   await git(["-C", dir, "fetch", "--quiet", "origin", mergeBranch]);
   try {
-    await git([
-      "-C",
-      dir,
-      "-c",
-      "user.email=runner@flumecode.local",
-      "-c",
-      "user.name=FlumeCode Runner",
-      "merge",
-      "--no-edit",
-      "FETCH_HEAD"
-    ]);
+    await git(["-C", dir, "merge", "--no-edit", "FETCH_HEAD"]);
     return { conflicted: false };
   } catch {
     return { conflicted: true };
@@ -890,6 +997,7 @@ var CANCEL_POLL_MS = 2500;
 var ORCHESTRATOR_MODEL = "sonnet";
 var ORCHESTRATOR_MAX_TURNS = 80;
 var MAX_COMMIT_REPAIRS = 2;
+var MAX_IMPLEMENT_RETRIES = 1;
 var INIT_MAX_TURNS = 200;
 var DOCUMENT_MAX_TURNS = 120;
 var HEARTBEAT_MS = 5 * 6e4;
@@ -1082,19 +1190,36 @@ async function processChatJob(ctx, dir, abort) {
   reply += outcomeBanner(outcome, { branch: ctx.repo.checkoutBranch, documented, autoMerged });
   return { text: reply, widgets: [] };
 }
+function reportClaimsWork(report) {
+  return !!report && report.acceptanceCriteria.some((ac) => ac.status === "met" && ac.evidence.length > 0);
+}
 async function processImplementJob(ctx, dir, resumed, abort) {
   console.log(`
 \u25B6 Implement ${ctx.jobId} \u2014 ${ctx.repo.fullName}: "${jobTitle(ctx)}"`);
   const installResult = await installDependencies(dir);
-  const result = await runClaudeCode({
-    cwd: dir,
-    prompt: buildPrompt(ctx),
-    permissionMode: ctx.permissionMode,
-    model: ORCHESTRATOR_MODEL,
-    maxTurns: ORCHESTRATOR_MAX_TURNS,
-    abortController: abort
-  });
-  let reply = result.text.trim() || "(the agent produced no report)";
+  let report;
+  let reply;
+  for (let attempt = 0; ; attempt++) {
+    const result = await runClaudeCode({
+      cwd: dir,
+      prompt: buildPrompt(ctx),
+      permissionMode: ctx.permissionMode,
+      model: ORCHESTRATOR_MODEL,
+      maxTurns: ORCHESTRATOR_MAX_TURNS,
+      abortController: abort
+    });
+    report = result.report ?? void 0;
+    reply = (report ? renderReport(report) : result.text.trim()) || "(the agent produced no report)";
+    if (abort.signal.aborted || !reportClaimsWork(report) || await hasChanges(dir)) break;
+    if (attempt >= MAX_IMPLEMENT_RETRIES) {
+      throw new Error(
+        `Implementation reported completed work (acceptance criteria met with diff evidence) but the working tree is clean after ${attempt + 1} attempt(s) \u2014 no changes were persisted, so no pull request could be opened.`
+      );
+    }
+    console.warn(
+      `  implement ${ctx.jobId}: report claims changes but the working tree is clean \u2014 re-running implementation (attempt ${attempt + 2})`
+    );
+  }
   if (installResult.status === "failed") {
     reply += `
@@ -1118,7 +1243,12 @@ async function processImplementJob(ctx, dir, resumed, abort) {
   }
   const { outcome, autoMerged } = await pushAndOpenPr(ctx, dir, abort, { rebase: !resumed });
   reply += outcomeBanner(outcome, { branch: ctx.repo.checkoutBranch, documented, autoMerged });
-  return { text: reply, widgets: [], ...outcome.kind === "pr" ? { pr: outcome.pr } : {} };
+  return {
+    text: reply,
+    widgets: [],
+    ...report ? { report } : {},
+    ...outcome.kind === "pr" ? { pr: outcome.pr } : {}
+  };
 }
 async function processReviseJob(ctx, dir, resumed, abort) {
   console.log(`
@@ -1194,9 +1324,16 @@ async function processReleaseJob(ctx, dir, resumed, abort) {
   console.log(`
 \u25B6 Release ${ctx.jobId} \u2014 ${ctx.repo.fullName}: "${jobTitle(ctx)}"`);
   const installResult = await installDependencies(dir);
+  const checks = await runRepoChecks(dir);
+  if (checks.skipped) {
+    console.log(`  \u2026release ${ctx.jobId}: pre-release checks skipped (git too old for 'hook run')`);
+  } else {
+    console.log(`  \u2026release ${ctx.jobId}: pre-release checks ${checks.ok ? "passed" : "FAILED"}`);
+  }
+  const baseChecks = checks.ok ? void 0 : { ok: false, log: trimHookLog(checks.log) };
   const result = await runClaudeCode({
     cwd: dir,
-    prompt: buildReleasePrompt(ctx),
+    prompt: buildReleasePrompt(ctx, baseChecks),
     permissionMode: ctx.permissionMode,
     model: ORCHESTRATOR_MODEL,
     maxTurns: ORCHESTRATOR_MAX_TURNS,
@@ -1282,13 +1419,14 @@ async function pollLoop(config) {
       };
       scheduleCancelPoll();
       try {
-        const { text, widgets, pr, plans } = await processJob(ctx, abort);
+        const { text, widgets, pr, plans, report } = await processJob(ctx, abort);
         await reportJob(config, ctx.jobId, {
           status: "done",
           text,
           widgets,
           pr,
-          ...plans?.length ? { plans } : {}
+          ...plans?.length ? { plans } : {},
+          ...report ? { report } : {}
         });
         console.log(`\u2713 Job ${ctx.jobId} done`);
       } catch (err) {
@@ -1300,10 +1438,12 @@ async function pollLoop(config) {
             console.error(`  (failed to report the cancellation: ${errorMessage2(reportErr)})`);
           }
         } else {
-          const message = errorMessage2(err);
-          console.error(`\u2717 Job ${ctx.jobId} failed: ${message}`);
+          console.error(`\u2717 Job ${ctx.jobId} failed: ${errorMessage2(err)}`);
           try {
-            await reportJob(config, ctx.jobId, { status: "error", error: message });
+            await reportJob(config, ctx.jobId, {
+              status: "error",
+              error: formatJobError(ctx, err)
+            });
           } catch (reportErr) {
             console.error(`  (also failed to report the error: ${errorMessage2(reportErr)})`);
           }
@@ -1322,6 +1462,40 @@ function sleep(ms) {
 function errorMessage2(err) {
   return err instanceof Error ? err.message : String(err);
 }
+var MAX_HOOK_LOG_LINES = 80;
+var MAX_HOOK_LOG_CHARS = 4e3;
+function trimHookLog(log) {
+  let trimmed = log.trimEnd();
+  let elided = false;
+  const lines = trimmed.split("\n");
+  if (lines.length > MAX_HOOK_LOG_LINES) {
+    trimmed = lines.slice(-MAX_HOOK_LOG_LINES).join("\n");
+    elided = true;
+  }
+  if (trimmed.length > MAX_HOOK_LOG_CHARS) {
+    trimmed = trimmed.slice(-MAX_HOOK_LOG_CHARS);
+    elided = true;
+  }
+  return elided ? `\u2026(earlier output trimmed)\u2026
+${trimmed}` : trimmed;
+}
+function formatJobError(ctx, err) {
+  if (!(err instanceof PreCommitError)) return errorMessage2(err);
+  const nextStep = ctx.kind === "release" ? `These checks are failing on \`${ctx.repo.mergeBranch}\` independently of the version bump, and the release couldn't fix them after ${MAX_COMMIT_REPAIRS} automatic attempts. Open a request on **${ctx.repo.fullName}** to fix the failing checks above, then start the release again once that fix has merged.` : `The agent couldn't get its change past these checks after ${MAX_COMMIT_REPAIRS} automatic repair attempts. Open a request on **${ctx.repo.fullName}** describing the failing checks above so the agent can fix them at their root, then try again.`;
+  return [
+    "\u274C **Blocked by failing pre-commit checks.**",
+    "",
+    `The repository's pre-commit hook (lint / typecheck / tests) rejected the commit after ${MAX_COMMIT_REPAIRS} automatic repair attempts, so nothing was pushed.`,
+    "",
+    "**What failed:**",
+    "",
+    "```",
+    trimHookLog(err.log),
+    "```",
+    "",
+    `**Next step:** ${nextStep}`
+  ].join("\n");
+}
 // src/cli.ts
 var DEFAULT_SERVER = process.env.FLUME_SERVER || "http://localhost:3000";

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@flumecode/runner",
-  "version": "0.7.0",
+  "version": "0.9.0",
   "type": "module",
   "description": "FlumeCode local runner — claims jobs and drives your local Claude Code against a real checkout.",
   "bin": {

package/skills-plugin/skills/create-release/SKILL.md CHANGED

@@ -171,6 +171,28 @@ version did not change.
   `apps/runner/package.json`. Leave `apps/web/package.json` unchanged.
 - **Clear Phase 1 text:** be explicit about what changed since the last tag so the
   user can confidently confirm or override your suggestions.
-- **Never edit** any file other than `apps/web/package.json`,
-  `apps/runner/package.json`, and `CHANGELOG.md`.
+- **Edit only version files — with one exception.** Normally edit only
+  `apps/web/package.json`, `apps/runner/package.json`, and `CHANGELOG.md`. The sole
+  exception: when the prompt includes a **`# Pre-release check status`** section
+  reporting failing checks, you must also fix the failing code (any file needed) so
+  the tree is green — see "Pre-release checks" below. Never weaken or skip checks to
+  silence them.
 - **Never commit, push, or open a PR** — the runner does that.
+## Pre-release checks
+We cannot release code with failing checks. Before this turn, the runner ran the
+repository's own pre-commit hook (lint / typecheck / tests). If the prompt contains
+a **`# Pre-release check status`** section, the base branch is currently broken
+_independently of the version bump_:
+- **Phase 1:** state plainly in your reply that the base currently fails these
+  checks and that the release will fix them as part of the bump, then ask the
+  version questions as usual.
+- **Phase 2:** fix the failing code at its root **first** (so the checks pass),
+  **then** apply the version bumps and CHANGELOG. The fixes ship in the same bump
+  PR. Do not delete or skip tests, weaken assertions, or disable checks. Still do
+  not commit or push — the runner commits everything together.
+If there is no `# Pre-release check status` section, the base is clean (or the check
+was skipped); proceed normally and edit only the version files.

package/skills-plugin/skills/implement-plan/SKILL.md CHANGED

@@ -5,8 +5,9 @@ description: >-
   subagents instead of writing the code yourself. Use in edit-capable runs. You
   act as the orchestrator: delegate implementation, acceptance-criteria review,
   code-quality review, and report-writing to Task subagents — picking the right
-  model for each phase — then return their report. Makes edits via subagents;
-  never commits, pushes, or opens a PR (the runner does that).
+  model for each phase. The report subagent submits a structured report (with
+  per-criterion diff evidence) via the submit_report tool. Makes edits via
+  subagents; never commits, pushes, or opens a PR (the runner does that).
 ---
 # implement-plan
@@ -30,10 +31,11 @@ put it in the prompt, the subagent doesn't have it.
 - Spawn each phase with the **Task** tool, `subagent_type: "general-purpose"`.
 - **Model per phase** (pass it as the Task `model` argument):
-  - `"sonnet"` — implementation and fixes (the code-writing work).
+  - `"sonnet"` — implementation, fixes, and the Verify step (mechanical
+    command-running; Verify is read-only even though it uses sonnet).
   - `"opus"` — acceptance-criteria review, code-quality review, and the report.
-- **Reviewers are read-only.** Tell every review/report subagent to _inspect and
-  report only — never edit, create, or delete files_. Only implementation/fix
+- **Read-only phases.** Tell every review, Verify, and report subagent to _inspect
+  and report only — never edit, create, or delete files_. Only implementation/fix
   subagents may change the working tree.
 - **No git side effects.** Neither you nor any subagent may commit, push, or open
   a PR. Leave the changes in the working tree; the runner commits + opens the PR
@@ -60,59 +62,118 @@ the next step.
 2. **Implement** — Task, `model: "sonnet"`. Give the subagent: the plan steps, a
    pointer to the wiki/orientation, and the coding guidelines (verbatim). Tell it
-   to make all the code changes in the working tree to satisfy the plan, keep the
-   build and tests green where practical, and end by reporting which files it
-   changed and how each step was addressed. It must not commit or push.
-3. **Acceptance-criteria review** — Task, `model: "opus"`, read-only. Give the
+   to make all the code changes in the working tree to satisfy the plan, then
+   self-verify by discovering and running the project's verification commands —
+   checking these sources in order: `package.json` scripts (look for `build`,
+   `typecheck`, `lint`, `test`), `CLAUDE.md`, any `.flumecode/wiki/` page that
+   mentions commands, and `Makefile`. Use whatever is present and appropriate for
+   this repo; do not hardcode specific command strings. Run each discovered
+   command and fix any errors that the edits introduced before returning. If no
+   build/test setup exists in this repo, note that and move on — do not fail. End
+   by reporting: the verification commands it ran and their pass/fail results,
+   which files it changed, and how each plan step was addressed. It must not
+   commit or push.
+3. **Verify (build & tests)** — Task, `model: "sonnet"`, read-only. This step
+   gives the orchestrator an objective, independent build/test signal before the
+   subjective AC and quality reviews. Tell the subagent to:
+   - Discover the project's verification commands from `package.json` scripts
+     (look for `build`, `typecheck`, `lint`, `test`), `CLAUDE.md`,
+     `.flumecode/wiki/` (any page that mentions commands), and `Makefile`. Use
+     what is present; do not hardcode specific command strings.
+   - Run each discovered command and record: the exact command, whether it passed
+     or failed, and — for any failure — a short excerpt of the failing output
+     (enough to diagnose the problem).
+   - If no build/test setup exists in this repo, say so explicitly and pass the
+     gate.
+   - Return a structured per-check result: command, pass/fail, failing-output
+     excerpt (if any).
+   - Must not edit, create, or delete any files.
+4. **Acceptance-criteria review** — Task, `model: "opus"`, read-only. Give the
    subagent the full AC list and tell it to verify each one against the actual
    changes (run `git --no-pager diff`, read the changed files, run tests/build if
-   useful). It must return a per-AC verdict: **met / not met / unclear**, each
-   with concrete evidence (file:line, test result).
-4. **Code-quality review** — Task, `model: "opus"`, read-only. Give the subagent
+   useful). For **each** AC it must return: the criterion text verbatim, a verdict
+   (**met / not met / unclear**), a one-or-two-sentence rationale, and — this is the
+   evidence the report needs — the **exact diff hunk(s)** that prove it, each tagged
+   with its file path (the minimal lines that matter, copied verbatim from
+   `git --no-pager diff`; not the whole file). A _met_ AC should cite at least one
+   hunk; _not met_ / _unclear_ may cite none. **Ground every verdict in the actual
+   diff:** a criterion may be marked _met_ only if `git --no-pager diff` really
+   contains the change that satisfies it, and each cited hunk must be copied verbatim
+   from that live output — never reconstructed from the plan or from what the
+   implement subagent claimed. If `git --no-pager diff` is empty, the implementation
+   produced no changes: no criterion may be _met_, and the review must say so. Tell it
+   to return this as a clean, structured list so you can hand it straight to the
+   report step.
+5. **Code-quality review** — Task, `model: "opus"`, read-only. Give the subagent
    the coding guidelines (verbatim) and tell it to review the changes for
    violations and quality problems, returning concrete findings as
    `file:line — what — why`, each marked **must-fix** or **nice-to-have**.
-5. **Fix loop.** If the AC review reports any _not met_ AC, or the quality review
+6. **Fix loop.** If the Verify step (step 3) reports any failing check, the AC
+   review (step 4) reports any _not met_ AC, or the quality review (step 5)
    reports any _must-fix_ finding: spawn an **Implement/fix** subagent (Task,
    `model: "sonnet"`) whose prompt lists exactly those findings and tells it to
-   resolve them without regressing the rest. Then re-run only the review(s) that
-   failed. Repeat at most **2** times. If something still fails after that, stop
-   looping and record the gap honestly in the report — do not hide it.
-6. **Report** — Task, `model: "opus"`, read-only. Give the subagent the plan, the
-   final `git --no-pager diff` (or tell it to run it), the AC verdicts, and the
-   quality findings, and have it write the user-facing report in the shape below.
-7. **Return the report.** Your final reply **is** that report — output it verbatim
-   as your last message, with nothing added. The runner posts it to the thread and
-   appends the pull-request link.
-## The report (what the user sees)
-Have the report subagent produce, in this shape:
-- **Summary** — one or two sentences on what was implemented.
-- **What changed** — the plan steps, each mapped to the concrete changes that
-  satisfy it.
-- **Acceptance criteria** — a checklist, each AC marked ✅ met / ❌ not met /
-  ⚠️ unclear, mirroring the AC review.
-- **Code quality** — a short note on the quality-review outcome and anything left
-  as nice-to-have.
-- **Files changed** — the list from the diff.
-- **Build / tests** — what was run and the result, or why it wasn't run.
-- **Caveats / follow-ups** — anything deferred, unmet, or worth a human's eyes.
-Do **not** include a PR link — the runner adds it.
+   resolve them without regressing the rest. When a Verify failure triggered the
+   fix, include the failing command(s) and their error output excerpt(s) from the
+   Verify result in the fix subagent's prompt so it has the full context. After
+   each fix iteration, re-run the Verify step (step 3) in addition to any AC or
+   quality review that failed. Repeat at most **2** times. If something still
+   fails after that, stop looping and record the gap honestly in the report — do
+   not hide it.
+7. **Report** — Task, `model: "opus"`, read-only. Give the subagent the plan, the
+   Verify results (from step 3), the AC verdicts (from step 4), and the quality
+   findings, and tell it to run `git --no-pager diff` itself as the **single
+   source of truth** for the report. Every `evidence` hunk it submits must be
+   copied verbatim from that live diff — it must drop or correct any hunk carried
+   over from step 4 that no longer appears in the actual diff, and the **Files
+   changed** list must come from `git --no-pager diff --stat`, not from what an
+   earlier subagent claimed. **If `git --no-pager diff` is empty, the
+   implementation changed nothing:** the report must say so plainly — an honest
+   `summary`, no AC marked `met` with evidence — and must never describe edits
+   that aren't in the diff. Tell it to submit the user-facing report by calling
+   the **`submit_report`** tool — it has that tool available. It must call
+   `submit_report` exactly once and must not edit any files.
+8. **Confirm and end.** Once the report subagent has called `submit_report`, you are
+   done — end your turn. The runner reads the submitted report, renders it, posts it
+   to the thread, and appends the pull-request link. (Your own final text is only a
+   fallback if no report was submitted, so make sure the subagent submits one.)
+## The report (what `submit_report` takes)
+The report subagent calls `submit_report` with these fields:
+- **`summary`** — one or two sentences on what was implemented.
+- **`prose`** — markdown for the remaining sections, using `##` headings:
+  **What changed** (the plan steps, each mapped to the concrete changes that satisfy
+  it), **Code quality** (the quality-review outcome and anything left as
+  nice-to-have), **Files changed** (the list from the diff), **Build / tests** (lists
+  each verification command and its final pass/fail result, or explains that no
+  build/test setup was found), and **Caveats / follow-ups** (anything deferred,
+  unmet, or worth a human's eyes). Do **not** put the acceptance-criteria section in
+  `prose`, and do **not** include a PR link — the runner adds it.
+- **`acceptanceCriteria`** — one entry per AC from the plan, in plan order, each:
+  - `criterion` — the AC text verbatim.
+  - `status` — `"met"` / `"not_met"` / `"unclear"`, mirroring the AC review.
+  - `rationale` — one or two sentences on why the verdict holds.
+  - `evidence` — an array of `{ file, hunk, note? }`, where `hunk` is copied
+    verbatim from the live `git --no-pager diff` and proves the verdict (`note`
+    optionally explains it). Never include a hunk that isn't in the actual diff. Cite
+    the supporting hunk(s) for a met criterion; may be empty for not_met / unclear.
 ## Always
 - Delegate through Task subagents; don't implement, review, or write the report
   yourself.
-- Right model per phase: `sonnet` to implement/fix, `opus` to review/report.
+- Right model per phase: `sonnet` to implement/fix/verify (Verify is read-only), `opus` to review/report.
 - Make every Task prompt self-contained — subagents see only what you give them.
 - Reviewers and the report writer never modify files.
 - Never commit, push, or open a PR.
-- Your final message is the report, verbatim.
+- The report subagent delivers the report by calling `submit_report` (structured),
+  once — not as prose for you to echo. Each acceptance criterion carries the diff
+  hunk(s) that prove its verdict, copied verbatim from the live `git --no-pager diff`
+  — never fabricated. An empty diff means an honest "nothing changed" report.

package/skills-plugin/skills/revise-implementation/SKILL.md CHANGED

@@ -65,9 +65,12 @@ essentials:
 - **Scope the work to the request.** This is a fine-tune of an existing
   implementation, not a rebuild. Change only what the user asked for plus what that
   change strictly requires; don't regress the rest of the plan.
-- **Pipeline:** Implement (Task, `model: "sonnet"`) → acceptance/quality review of
-  the change (Task, `model: "opus"`, read-only) → fix loop if needed (≤2) → report
-  (Task, `model: "opus"`, read-only). Reviewers and the report writer never edit.
+- **Pipeline:** Implement (self-runs build/tests & fixes its own errors, Task
+  `model: "sonnet"`) → Verify (build/tests, read-only, Task `model: "sonnet"`) →
+  acceptance/quality review (Task `model: "opus"`, read-only) → fix loop if needed
+  (≤2, re-run Verify after each fix) → report (Task `model: "opus"`, read-only).
+  Detailed mechanics (command discovery, Verify step spec, fix-loop trigger
+  conditions) are in `implement-plan/SKILL.md` — read it for the full pipeline.
 - **No git side effects.** Never commit, push, or open a PR — leave the changes in
   the working tree. The runner commits them and updates the existing pull request.
@@ -76,9 +79,13 @@ essentials:
 Your last message **is** the comment posted to the plan thread — write it for the
 user:
-- **Implemented:** a short report — what you changed and why, which files, and how
-  it was verified (build/tests). The runner appends the pull-request link, so don't
-  add one.
+- **Implemented:** a short report — what you changed and why, which files, and the
+  verification results: list each build/test command that was run and its final
+  pass/fail result (or note that no build/test setup was found). Base "what changed"
+  and "which files" on the actual `git --no-pager diff` (`--stat` for the file
+  list), not on what a subagent claimed; if the diff is empty, say nothing was
+  changed rather than describing edits that aren't there. The runner appends the
+  pull-request link, so don't add one.
 - **Clarify / push back:** your question or reasoning, as prose (plus any widget).
 - **Re-plan:** you called `submit_plan`; the rendered plan is posted automatically,
   so keep any extra reply text minimal.