@flumecode/runner 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js
CHANGED
|
@@ -738,7 +738,7 @@ var planInputSchema = {
|
|
|
738
738
|
assumptions: z2.array(z2.string()).describe("Anything decided during planning, including unanswered defaults."),
|
|
739
739
|
steps: z2.array(stepSchema).min(1).describe("Ordered list of changes. Each step says what and why, with file references."),
|
|
740
740
|
acceptanceCriteria: z2.array(z2.string().min(1)).min(2).describe(
|
|
741
|
-
"
|
|
741
|
+
"Concrete, deterministically-checkable conditions that together define done. Each names a trigger/precondition and the exact observable result (run X -> output Y; file Z contains W; f(a) returns b) \u2014 no vague adjectives, not a restatement of a step. The set must collectively cover every step's change. At least 2 required."
|
|
742
742
|
),
|
|
743
743
|
risks: z2.array(z2.string()).describe("Anything that could change the approach."),
|
|
744
744
|
outOfScope: z2.array(z2.string()).describe("What is deliberately not being done.")
|
|
@@ -858,13 +858,19 @@ var acVerdictSchema = z3.object({
|
|
|
858
858
|
status: z3.enum(["met", "not_met", "unclear"]).describe("Verdict for this criterion, verified against the actual diff."),
|
|
859
859
|
rationale: z3.string().min(1).describe("One or two sentences on why the verdict holds."),
|
|
860
860
|
evidence: z3.array(evidenceSchema).describe(
|
|
861
|
-
"Diff hunks proving the verdict.
|
|
861
|
+
"Diff hunks proving the verdict, copied verbatim from git --no-pager diff. Across ALL criteria the evidence must collectively cover every hunk in the diff \u2014 each changed hunk appears under at least one criterion. Cite the relevant hunk(s) for a met criterion; may be empty for not_met / unclear."
|
|
862
862
|
)
|
|
863
863
|
});
|
|
864
864
|
var reportInputSchema = {
|
|
865
865
|
summary: z3.string().min(1).describe("One or two sentences on what was implemented."),
|
|
866
|
-
|
|
867
|
-
"Markdown
|
|
866
|
+
filesChanged: z3.string().min(1).describe(
|
|
867
|
+
"Markdown: the list of files changed (from the diff). Rendered under '## Files changed'."
|
|
868
|
+
),
|
|
869
|
+
codeQuality: z3.string().min(1).describe(
|
|
870
|
+
"Markdown: the code-quality review outcome and anything left as nice-to-have. Rendered under '## Code quality'."
|
|
871
|
+
),
|
|
872
|
+
caveats: z3.string().min(1).describe(
|
|
873
|
+
"Markdown: anything deferred, unmet, or worth a human's eyes, incl. diff hunks that map to no plan AC. Write 'None.' if nothing. Rendered under '## Caveats / follow-ups'."
|
|
868
874
|
),
|
|
869
875
|
acceptanceCriteria: z3.array(acVerdictSchema).min(1).describe(
|
|
870
876
|
"One entry per acceptance criterion from the plan, in plan order, each with a verdict and the diff evidence behind it."
|
|
@@ -874,10 +880,8 @@ var reportSchema = z3.object(reportInputSchema);
|
|
|
874
880
|
function renderReport(report) {
|
|
875
881
|
const lines2 = [];
|
|
876
882
|
lines2.push(report.summary.trim());
|
|
877
|
-
lines2.push("");
|
|
878
|
-
lines2.push(
|
|
879
|
-
lines2.push("");
|
|
880
|
-
lines2.push("## Acceptance criteria");
|
|
883
|
+
lines2.push("", "## Files changed", "", report.filesChanged.trim());
|
|
884
|
+
lines2.push("", "## Acceptance criteria");
|
|
881
885
|
for (const ac of report.acceptanceCriteria) {
|
|
882
886
|
lines2.push("");
|
|
883
887
|
lines2.push(`### ${STATUS_ICON[ac.status]} ${ac.criterion}`);
|
|
@@ -892,13 +896,15 @@ function renderReport(report) {
|
|
|
892
896
|
lines2.push("```");
|
|
893
897
|
}
|
|
894
898
|
}
|
|
899
|
+
lines2.push("", "## Code quality", "", report.codeQuality.trim());
|
|
900
|
+
lines2.push("", "## Caveats / follow-ups", "", report.caveats.trim());
|
|
895
901
|
return lines2.join("\n");
|
|
896
902
|
}
|
|
897
903
|
function createReportTooling() {
|
|
898
904
|
let submittedReport = null;
|
|
899
905
|
const submitReport = tool3(
|
|
900
906
|
SUBMIT_REPORT,
|
|
901
|
-
"Submit the final implementation report as structured data. Call this exactly once, at the end of the run. `acceptanceCriteria` must contain one entry per plan criterion, each with a met / not_met / unclear verdict and the diff hunk(s) that prove it. `summary`
|
|
907
|
+
"Submit the final implementation report as structured data. Call this exactly once, at the end of the run. `acceptanceCriteria` must contain one entry per plan criterion, each with a met / not_met / unclear verdict and the diff hunk(s) that prove it. `summary`, `filesChanged`, `codeQuality`, and `caveats` are the four named markdown sections. Do NOT include a PR link \u2014 the runner appends it.",
|
|
902
908
|
reportInputSchema,
|
|
903
909
|
async (args) => {
|
|
904
910
|
submittedReport = reportSchema.parse(args);
|
package/package.json
CHANGED
|
@@ -96,8 +96,9 @@ the next step.
|
|
|
96
96
|
useful). For **each** AC it must return: the criterion text verbatim, a verdict
|
|
97
97
|
(**met / not met / unclear**), a one-or-two-sentence rationale, and — this is the
|
|
98
98
|
evidence the report needs — the **exact diff hunk(s)** that prove it, each tagged
|
|
99
|
-
with its file path (the
|
|
100
|
-
`git --no-pager diff
|
|
99
|
+
with its file path (copied verbatim from
|
|
100
|
+
`git --no-pager diff`, such that the union of every AC's evidence covers the
|
|
101
|
+
entire diff — each changed hunk cited under at least one criterion). A _met_ AC should cite at least one
|
|
101
102
|
hunk; _not met_ / _unclear_ may cite none. **Ground every verdict in the actual
|
|
102
103
|
diff:** a criterion may be marked _met_ only if `git --no-pager diff` really
|
|
103
104
|
contains the change that satisfies it, and each cited hunk must be copied verbatim
|
|
@@ -105,7 +106,7 @@ the next step.
|
|
|
105
106
|
implement subagent claimed. If `git --no-pager diff` is empty, the implementation
|
|
106
107
|
produced no changes: no criterion may be _met_, and the review must say so. Tell it
|
|
107
108
|
to return this as a clean, structured list so you can hand it straight to the
|
|
108
|
-
report step.
|
|
109
|
+
report step. In addition to per-AC verdicts, cross-check that every hunk in `git --no-pager diff` is cited by at least one AC's evidence; report any uncovered hunk as a coverage gap (signalling a missing AC or an out-of-scope change).
|
|
109
110
|
|
|
110
111
|
5. **Code-quality review** — Task, `model: "opus"`, read-only. Give the subagent
|
|
111
112
|
the coding guidelines (verbatim) and tell it to review the changes for
|
|
@@ -131,7 +132,7 @@ the next step.
|
|
|
131
132
|
copied verbatim from that live diff — it must drop or correct any hunk carried
|
|
132
133
|
over from step 4 that no longer appears in the actual diff, and the **Files
|
|
133
134
|
changed** list must come from `git --no-pager diff --stat`, not from what an
|
|
134
|
-
earlier subagent claimed. **If `git --no-pager diff` is empty, the
|
|
135
|
+
earlier subagent claimed. Tell it to enumerate all hunks from `git --no-pager diff` and ensure each is attached to ≥1 AC's `evidence`; any hunk mapping to no plan AC goes under `## Caveats / follow-ups` as an explicit unattributed change. **If `git --no-pager diff` is empty, the
|
|
135
136
|
implementation changed nothing:** the report must say so plainly — an honest
|
|
136
137
|
`summary`, no AC marked `met` with evidence — and must never describe edits
|
|
137
138
|
that aren't in the diff. Tell it to submit the user-facing report by calling
|
|
@@ -148,14 +149,9 @@ the next step.
|
|
|
148
149
|
The report subagent calls `submit_report` with these fields:
|
|
149
150
|
|
|
150
151
|
- **`summary`** — one or two sentences on what was implemented.
|
|
151
|
-
- **`
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
nice-to-have), **Files changed** (the list from the diff), **Build / tests** (lists
|
|
155
|
-
each verification command and its final pass/fail result, or explains that no
|
|
156
|
-
build/test setup was found), and **Caveats / follow-ups** (anything deferred,
|
|
157
|
-
unmet, or worth a human's eyes). Do **not** put the acceptance-criteria section in
|
|
158
|
-
`prose`, and do **not** include a PR link — the runner adds it.
|
|
152
|
+
- **`filesChanged`** — markdown list of files changed (from the diff). Rendered under `## Files changed`.
|
|
153
|
+
- **`codeQuality`** — the code-quality review outcome and anything left as nice-to-have. Rendered under `## Code quality`.
|
|
154
|
+
- **`caveats`** — anything deferred, unmet, or worth a human's eyes, including diff hunks that map to no plan AC. Write 'None.' if nothing. Rendered under `## Caveats / follow-ups`.
|
|
159
155
|
- **`acceptanceCriteria`** — one entry per AC from the plan, in plan order, each:
|
|
160
156
|
- `criterion` — the AC text verbatim.
|
|
161
157
|
- `status` — `"met"` / `"not_met"` / `"unclear"`, mirroring the AC review.
|
|
@@ -177,3 +173,4 @@ The report subagent calls `submit_report` with these fields:
|
|
|
177
173
|
once — not as prose for you to echo. Each acceptance criterion carries the diff
|
|
178
174
|
hunk(s) that prove its verdict, copied verbatim from the live `git --no-pager diff`
|
|
179
175
|
— never fabricated. An empty diff means an honest "nothing changed" report.
|
|
176
|
+
- The report exists so the human reviewer can verify each acceptance criterion is satisfied — the ACs and their diff evidence are the primary review surface.
|
|
@@ -71,11 +71,19 @@ Field-by-field guidance:
|
|
|
71
71
|
- **`description`** — what changes and why: the concrete change being made and the rationale for it. Use concrete file references (`path/to/file.ts`) and name the functions/symbols involved.
|
|
72
72
|
- **`pseudoCode`** — an array of `{ file, pseudoCode }` entries. Provide an entry for every file the step touches **except** documentation files (SKILL.md, README.md, wiki pages, etc.). `pseudoCode` is optional in the schema but expected for all non-documentation files. Each entry names the file path and contains pseudo code that precisely describes the changes to make in that file.
|
|
73
73
|
- **`acceptanceCriteria`** — **required; at least 2 items.** Each criterion must
|
|
74
|
-
be
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
74
|
+
be a concrete, deterministically-checkable condition that a third party can verify
|
|
75
|
+
without knowing the author's intent. Write each as a trigger/precondition and the
|
|
76
|
+
exact observable result: `run X → output Y`, `file Z contains W`, `calling f(a) returns b`.
|
|
77
|
+
No vague adjectives (`robust`, `clean`, `properly`, `works correctly`). The set
|
|
78
|
+
must be **collectively exhaustive** — every step's intended change is covered by
|
|
79
|
+
at least one AC. Do **not** restate a step as a criterion.
|
|
80
|
+
|
|
81
|
+
**Good vs bad examples:**
|
|
82
|
+
- ✅ `grep -rn "What changed" apps/runner/src/report.ts` produces no matches.
|
|
83
|
+
- ❌ The report is cleaner and no longer mentions 'What changed'. _(vague, not checkable)_
|
|
84
|
+
- ✅ `pnpm test` in the repo root exits 0 and `report.test.ts` output contains no failures.
|
|
85
|
+
- ❌ Tests pass correctly. _(no trigger, no observable result)_
|
|
86
|
+
|
|
79
87
|
- **`risks`** — anything that could change the approach or surface a problem.
|
|
80
88
|
- **`outOfScope`** — what you are deliberately not doing.
|
|
81
89
|
|