@glrs-dev/cli 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +2 -0
- package/dist/vendor/harness-opencode/dist/cli.js +314 -21
- package/dist/vendor/harness-opencode/dist/index.js +1 -1
- package/dist/vendor/harness-opencode/dist/skills/code-quality/SKILL.md +45 -0
- package/dist/vendor/harness-opencode/dist/skills/code-quality/rules/building.md +125 -0
- package/dist/vendor/harness-opencode/dist/skills/code-quality/rules/gap-analysis.md +92 -0
- package/dist/vendor/harness-opencode/dist/skills/code-quality/rules/planning.md +96 -0
- package/dist/vendor/harness-opencode/dist/skills/code-quality/rules/review.md +104 -0
- package/dist/vendor/harness-opencode/dist/skills/pilot-planning/rules/self-review.md +1 -1
- package/dist/vendor/harness-opencode/dist/skills/pilot-planning/rules/verify-design.md +42 -0
- package/dist/vendor/harness-opencode/package.json +1 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1142,11 +1142,60 @@ CREATE TABLE IF NOT EXISTS events (
|
|
|
1142
1142
|
CREATE INDEX IF NOT EXISTS idx_events_run ON events(run_id, id);
|
|
1143
1143
|
CREATE INDEX IF NOT EXISTS idx_events_run_task ON events(run_id, task_id, id);
|
|
1144
1144
|
`.trim();
|
|
1145
|
+
var V2_SQL = `
|
|
1146
|
+
CREATE TABLE IF NOT EXISTS workflows (
|
|
1147
|
+
id TEXT NOT NULL PRIMARY KEY,
|
|
1148
|
+
goal TEXT NOT NULL,
|
|
1149
|
+
started_at INTEGER NOT NULL,
|
|
1150
|
+
finished_at INTEGER,
|
|
1151
|
+
status TEXT NOT NULL CHECK (status IN ('pending','running','completed','aborted','failed')),
|
|
1152
|
+
current_phase TEXT
|
|
1153
|
+
);
|
|
1154
|
+
|
|
1155
|
+
CREATE TABLE IF NOT EXISTS phases (
|
|
1156
|
+
workflow_id TEXT NOT NULL,
|
|
1157
|
+
name TEXT NOT NULL CHECK (name IN ('scope','plan','build','qa','followup')),
|
|
1158
|
+
status TEXT NOT NULL CHECK (status IN ('pending','running','completed','aborted','failed')),
|
|
1159
|
+
started_at INTEGER,
|
|
1160
|
+
finished_at INTEGER,
|
|
1161
|
+
artifact_path TEXT,
|
|
1162
|
+
PRIMARY KEY (workflow_id, name),
|
|
1163
|
+
FOREIGN KEY (workflow_id) REFERENCES workflows(id) ON DELETE CASCADE
|
|
1164
|
+
);
|
|
1165
|
+
|
|
1166
|
+
CREATE TABLE IF NOT EXISTS artifacts (
|
|
1167
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1168
|
+
workflow_id TEXT NOT NULL,
|
|
1169
|
+
phase TEXT NOT NULL,
|
|
1170
|
+
kind TEXT NOT NULL,
|
|
1171
|
+
path TEXT NOT NULL,
|
|
1172
|
+
created_at INTEGER NOT NULL,
|
|
1173
|
+
sha256 TEXT,
|
|
1174
|
+
FOREIGN KEY (workflow_id) REFERENCES workflows(id) ON DELETE CASCADE
|
|
1175
|
+
);
|
|
1176
|
+
|
|
1177
|
+
CREATE INDEX IF NOT EXISTS idx_artifacts_workflow_phase ON artifacts(workflow_id, phase);
|
|
1178
|
+
|
|
1179
|
+
ALTER TABLE events ADD COLUMN phase TEXT;
|
|
1180
|
+
|
|
1181
|
+
INSERT INTO workflows (id, goal, started_at, finished_at, status, current_phase)
|
|
1182
|
+
SELECT id, plan_slug, started_at, finished_at, status, 'build' FROM runs;
|
|
1183
|
+
|
|
1184
|
+
INSERT INTO phases (workflow_id, name, status, started_at, finished_at, artifact_path)
|
|
1185
|
+
SELECT id, 'build', status, started_at, finished_at, NULL FROM runs;
|
|
1186
|
+
|
|
1187
|
+
UPDATE events SET phase = 'build' WHERE phase IS NULL;
|
|
1188
|
+
`.trim();
|
|
1145
1189
|
var MIGRATIONS = [
|
|
1146
1190
|
{
|
|
1147
1191
|
version: 1,
|
|
1148
1192
|
description: "initial pilot schema (runs/tasks/events)",
|
|
1149
1193
|
sql: V1_SQL
|
|
1194
|
+
},
|
|
1195
|
+
{
|
|
1196
|
+
version: 2,
|
|
1197
|
+
description: "workflows/phases/artifacts tables + events.phase column",
|
|
1198
|
+
sql: V2_SQL
|
|
1150
1199
|
}
|
|
1151
1200
|
];
|
|
1152
1201
|
function applyMigrations(db) {
|
|
@@ -1279,8 +1328,8 @@ function appendEvent(db, args) {
|
|
|
1279
1328
|
});
|
|
1280
1329
|
}
|
|
1281
1330
|
db.run(
|
|
1282
|
-
`INSERT INTO events (run_id, task_id, ts, kind, payload) VALUES (?, ?, ?, ?, ?)`,
|
|
1283
|
-
[args.runId, args.taskId ?? null, ts, args.kind, payloadStr]
|
|
1331
|
+
`INSERT INTO events (run_id, task_id, ts, kind, payload, phase) VALUES (?, ?, ?, ?, ?, ?)`,
|
|
1332
|
+
[args.runId, args.taskId ?? null, ts, args.kind, payloadStr, args.phase ?? null]
|
|
1284
1333
|
);
|
|
1285
1334
|
if (eventSubscribers.length > 0) {
|
|
1286
1335
|
const snapshot = eventSubscribers.slice();
|
|
@@ -1291,6 +1340,7 @@ function appendEvent(db, args) {
|
|
|
1291
1340
|
taskId: args.taskId ?? null,
|
|
1292
1341
|
kind: args.kind,
|
|
1293
1342
|
payload: args.payload,
|
|
1343
|
+
phase: args.phase ?? null,
|
|
1294
1344
|
ts
|
|
1295
1345
|
});
|
|
1296
1346
|
} catch {
|
|
@@ -1865,25 +1915,78 @@ function fixPrompt(_task, last) {
|
|
|
1865
1915
|
return sections.join("\n");
|
|
1866
1916
|
}
|
|
1867
1917
|
|
|
1868
|
-
// src/pilot/
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
var DEFAULT_OUTPUT_CAP_BYTES = 256 * 1024;
|
|
1872
|
-
var TRUNCATION_NOTICE = "\n[pilot] verify output truncated\n";
|
|
1873
|
-
async function runVerify(commands, options) {
|
|
1918
|
+
// src/pilot/gates/composite.ts
|
|
1919
|
+
async function evalAllGate(gate, ctx) {
|
|
1920
|
+
const startedAt = Date.now();
|
|
1874
1921
|
const results = [];
|
|
1875
|
-
for (const
|
|
1876
|
-
const
|
|
1877
|
-
results.push(result);
|
|
1878
|
-
if (!
|
|
1879
|
-
|
|
1922
|
+
for (const sub of gate.gates) {
|
|
1923
|
+
const subResult = await evalGate(sub, ctx);
|
|
1924
|
+
results.push({ gate: sub, result: subResult });
|
|
1925
|
+
if (!subResult.ok) {
|
|
1926
|
+
const evidence2 = {
|
|
1927
|
+
kind: "all",
|
|
1928
|
+
results,
|
|
1929
|
+
failure: subResult
|
|
1930
|
+
};
|
|
1931
|
+
return {
|
|
1932
|
+
ok: false,
|
|
1933
|
+
reason: subResult.reason,
|
|
1934
|
+
evidence: evidence2,
|
|
1935
|
+
durationMs: Date.now() - startedAt
|
|
1936
|
+
};
|
|
1880
1937
|
}
|
|
1881
1938
|
}
|
|
1939
|
+
const evidence = { kind: "all", results };
|
|
1882
1940
|
return {
|
|
1883
1941
|
ok: true,
|
|
1884
|
-
|
|
1942
|
+
evidence,
|
|
1943
|
+
durationMs: Date.now() - startedAt
|
|
1885
1944
|
};
|
|
1886
1945
|
}
|
|
1946
|
+
async function evalAnyGate(gate, ctx) {
|
|
1947
|
+
const startedAt = Date.now();
|
|
1948
|
+
const results = [];
|
|
1949
|
+
if (gate.gates.length === 0) {
|
|
1950
|
+
const evidence2 = { kind: "any", results };
|
|
1951
|
+
return {
|
|
1952
|
+
ok: false,
|
|
1953
|
+
reason: "any-gate has no sub-gates to satisfy",
|
|
1954
|
+
evidence: evidence2,
|
|
1955
|
+
durationMs: Date.now() - startedAt
|
|
1956
|
+
};
|
|
1957
|
+
}
|
|
1958
|
+
let lastResult = null;
|
|
1959
|
+
for (const sub of gate.gates) {
|
|
1960
|
+
const subResult = await evalGate(sub, ctx);
|
|
1961
|
+
results.push({ gate: sub, result: subResult });
|
|
1962
|
+
lastResult = subResult;
|
|
1963
|
+
if (subResult.ok) {
|
|
1964
|
+
const evidence2 = { kind: "any", results };
|
|
1965
|
+
return {
|
|
1966
|
+
ok: true,
|
|
1967
|
+
evidence: evidence2,
|
|
1968
|
+
durationMs: Date.now() - startedAt
|
|
1969
|
+
};
|
|
1970
|
+
}
|
|
1971
|
+
}
|
|
1972
|
+
const evidence = {
|
|
1973
|
+
kind: "any",
|
|
1974
|
+
results,
|
|
1975
|
+
failure: lastResult ?? void 0
|
|
1976
|
+
};
|
|
1977
|
+
return {
|
|
1978
|
+
ok: false,
|
|
1979
|
+
reason: `any-gate exhausted: all ${results.length} sub-gates failed`,
|
|
1980
|
+
evidence,
|
|
1981
|
+
durationMs: Date.now() - startedAt
|
|
1982
|
+
};
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
// src/pilot/verify/spawn.ts
|
|
1986
|
+
import { spawn as spawn2 } from "child_process";
|
|
1987
|
+
var DEFAULT_TIMEOUT_MS = 5 * 60 * 1e3;
|
|
1988
|
+
var DEFAULT_OUTPUT_CAP_BYTES = 256 * 1024;
|
|
1989
|
+
var TRUNCATION_NOTICE = "\n[pilot] verify output truncated\n";
|
|
1887
1990
|
async function runOne(command10, options) {
|
|
1888
1991
|
if (typeof command10 !== "string" || command10.length === 0) {
|
|
1889
1992
|
throw new TypeError(`runOne: command must be a non-empty string`);
|
|
@@ -2020,6 +2123,147 @@ function killTree(child) {
|
|
|
2020
2123
|
}, 2e3).unref();
|
|
2021
2124
|
}
|
|
2022
2125
|
|
|
2126
|
+
// src/pilot/gates/shell.ts
|
|
2127
|
+
async function evalShellGate(gate, ctx) {
|
|
2128
|
+
const result = await runOne(gate.command, {
|
|
2129
|
+
cwd: ctx.cwd,
|
|
2130
|
+
env: ctx.env,
|
|
2131
|
+
abortSignal: ctx.abortSignal,
|
|
2132
|
+
onLine: ctx.onShellLine,
|
|
2133
|
+
timeoutMs: gate.timeoutMs,
|
|
2134
|
+
outputCapBytes: ctx.shellOutputCapBytes
|
|
2135
|
+
});
|
|
2136
|
+
return toGateResult(result);
|
|
2137
|
+
}
|
|
2138
|
+
function toGateResult(result) {
|
|
2139
|
+
if (result.ok) {
|
|
2140
|
+
return {
|
|
2141
|
+
ok: true,
|
|
2142
|
+
durationMs: result.durationMs,
|
|
2143
|
+
evidence: { kind: "shell", result }
|
|
2144
|
+
};
|
|
2145
|
+
}
|
|
2146
|
+
const reason = formatShellFailure(result);
|
|
2147
|
+
return {
|
|
2148
|
+
ok: false,
|
|
2149
|
+
reason,
|
|
2150
|
+
durationMs: result.durationMs,
|
|
2151
|
+
evidence: { kind: "shell", result }
|
|
2152
|
+
};
|
|
2153
|
+
}
|
|
2154
|
+
function formatShellFailure(result) {
|
|
2155
|
+
const flags = [];
|
|
2156
|
+
if (result.timedOut) flags.push("timed-out");
|
|
2157
|
+
if (result.aborted) flags.push("aborted");
|
|
2158
|
+
if (result.signal) flags.push(`signal=${result.signal}`);
|
|
2159
|
+
const flagSuffix = flags.length > 0 ? ` [${flags.join(",")}]` : "";
|
|
2160
|
+
return `shell gate failed: ${result.command} \u2192 exit ${result.exitCode}${flagSuffix}`;
|
|
2161
|
+
}
|
|
2162
|
+
|
|
2163
|
+
// src/pilot/gates/eval.ts
|
|
2164
|
+
async function evalGate(gate, ctx) {
|
|
2165
|
+
switch (gate.kind) {
|
|
2166
|
+
case "shell":
|
|
2167
|
+
return evalShellGate(gate, ctx);
|
|
2168
|
+
case "all":
|
|
2169
|
+
return evalAllGate(gate, ctx);
|
|
2170
|
+
case "any":
|
|
2171
|
+
return evalAnyGate(gate, ctx);
|
|
2172
|
+
default: {
|
|
2173
|
+
const _exhaustive = gate;
|
|
2174
|
+
throw new Error(
|
|
2175
|
+
`evalGate: unknown gate kind ${_exhaustive.kind}`
|
|
2176
|
+
);
|
|
2177
|
+
}
|
|
2178
|
+
}
|
|
2179
|
+
}
|
|
2180
|
+
|
|
2181
|
+
// src/pilot/gates/types.ts
|
|
2182
|
+
function asShellEvidence(evidence) {
|
|
2183
|
+
if (typeof evidence === "object" && evidence !== null && evidence.kind === "shell") {
|
|
2184
|
+
return evidence;
|
|
2185
|
+
}
|
|
2186
|
+
return null;
|
|
2187
|
+
}
|
|
2188
|
+
function asCompositeEvidence(evidence) {
|
|
2189
|
+
if (typeof evidence === "object" && evidence !== null && (evidence.kind === "all" || evidence.kind === "any")) {
|
|
2190
|
+
return evidence;
|
|
2191
|
+
}
|
|
2192
|
+
return null;
|
|
2193
|
+
}
|
|
2194
|
+
|
|
2195
|
+
// src/pilot/verify/runner.ts
|
|
2196
|
+
async function runVerify(commands, options) {
|
|
2197
|
+
if (commands.length === 0) {
|
|
2198
|
+
return { ok: true, results: [] };
|
|
2199
|
+
}
|
|
2200
|
+
const gate = {
|
|
2201
|
+
kind: "all",
|
|
2202
|
+
gates: commands.map((command10) => ({
|
|
2203
|
+
kind: "shell",
|
|
2204
|
+
command: command10,
|
|
2205
|
+
timeoutMs: options.timeoutMs
|
|
2206
|
+
}))
|
|
2207
|
+
};
|
|
2208
|
+
const ctx = {
|
|
2209
|
+
cwd: options.cwd,
|
|
2210
|
+
env: options.env,
|
|
2211
|
+
abortSignal: options.abortSignal,
|
|
2212
|
+
onShellLine: options.onLine,
|
|
2213
|
+
shellOutputCapBytes: options.outputCapBytes
|
|
2214
|
+
};
|
|
2215
|
+
const gateResult = await evalGate(gate, ctx);
|
|
2216
|
+
return toRunVerifyResult(gateResult);
|
|
2217
|
+
}
|
|
2218
|
+
function toRunVerifyResult(gateResult) {
|
|
2219
|
+
const composite = asCompositeEvidence(gateResult.evidence);
|
|
2220
|
+
if (composite === null || composite.kind !== "all") {
|
|
2221
|
+
throw new Error(
|
|
2222
|
+
`runVerify: expected composite all-gate evidence, got ${gateResultDescriptor(gateResult)}`
|
|
2223
|
+
);
|
|
2224
|
+
}
|
|
2225
|
+
const results = composite.results.map((entry) => extractCommandResult(entry));
|
|
2226
|
+
if (gateResult.ok) {
|
|
2227
|
+
return {
|
|
2228
|
+
ok: true,
|
|
2229
|
+
results
|
|
2230
|
+
};
|
|
2231
|
+
}
|
|
2232
|
+
const failingEntry = composite.results[composite.results.length - 1];
|
|
2233
|
+
if (!failingEntry || failingEntry.result.ok) {
|
|
2234
|
+
throw new Error(
|
|
2235
|
+
"runVerify: all-gate failed but no failing sub-result was recorded"
|
|
2236
|
+
);
|
|
2237
|
+
}
|
|
2238
|
+
const failureCommandResult = extractCommandResult(failingEntry);
|
|
2239
|
+
if (failureCommandResult.ok) {
|
|
2240
|
+
throw new Error(
|
|
2241
|
+
"runVerify: failing sub-gate produced a successful CommandResult"
|
|
2242
|
+
);
|
|
2243
|
+
}
|
|
2244
|
+
return {
|
|
2245
|
+
ok: false,
|
|
2246
|
+
results,
|
|
2247
|
+
failure: failureCommandResult
|
|
2248
|
+
};
|
|
2249
|
+
}
|
|
2250
|
+
function extractCommandResult(entry) {
|
|
2251
|
+
const shell = asShellEvidence(entry.result.evidence);
|
|
2252
|
+
if (shell === null) {
|
|
2253
|
+
throw new Error(
|
|
2254
|
+
`runVerify: expected shell-gate evidence in all-gate child, got ${gateResultDescriptor(entry.result)}`
|
|
2255
|
+
);
|
|
2256
|
+
}
|
|
2257
|
+
return shell.result;
|
|
2258
|
+
}
|
|
2259
|
+
function gateResultDescriptor(result) {
|
|
2260
|
+
const evidence = result.evidence;
|
|
2261
|
+
return JSON.stringify({
|
|
2262
|
+
ok: result.ok,
|
|
2263
|
+
evidenceKind: evidence?.kind ?? null
|
|
2264
|
+
});
|
|
2265
|
+
}
|
|
2266
|
+
|
|
2023
2267
|
// src/pilot/verify/touches.ts
|
|
2024
2268
|
import picomatch2 from "picomatch";
|
|
2025
2269
|
import { execFile as execFile2 } from "child_process";
|
|
@@ -2530,7 +2774,11 @@ async function runOneTaskImpl(deps, task, opts) {
|
|
|
2530
2774
|
command: f.command,
|
|
2531
2775
|
exitCode: f.exitCode,
|
|
2532
2776
|
output: f.output.slice(0, 4096),
|
|
2533
|
-
reason: reason2
|
|
2777
|
+
reason: reason2,
|
|
2778
|
+
// Step 1 of pilot redesign: gate descriptor on every
|
|
2779
|
+
// verify-derived event. Future LLM/approval gates emit
|
|
2780
|
+
// identically-shaped events with a different `gate.kind`.
|
|
2781
|
+
gate: { kind: "shell", command: f.command }
|
|
2534
2782
|
}
|
|
2535
2783
|
});
|
|
2536
2784
|
return;
|
|
@@ -2539,7 +2787,10 @@ async function runOneTaskImpl(deps, task, opts) {
|
|
|
2539
2787
|
runId: deps.runId,
|
|
2540
2788
|
taskId: task.id,
|
|
2541
2789
|
kind: "task.baseline.passed",
|
|
2542
|
-
payload: {
|
|
2790
|
+
payload: {
|
|
2791
|
+
commands: allVerify.length,
|
|
2792
|
+
gate: { kind: "all", subKind: "shell", count: baselineVerify.length }
|
|
2793
|
+
}
|
|
2543
2794
|
});
|
|
2544
2795
|
}
|
|
2545
2796
|
let lastFailure = null;
|
|
@@ -2695,7 +2946,8 @@ async function runOneTaskImpl(deps, task, opts) {
|
|
|
2695
2946
|
exitCode: lastFailure.exitCode,
|
|
2696
2947
|
timedOut: verifyResult.failure.timedOut,
|
|
2697
2948
|
aborted: verifyResult.failure.aborted,
|
|
2698
|
-
output: verifyResult.failure.output.slice(-2048)
|
|
2949
|
+
output: verifyResult.failure.output.slice(-2048),
|
|
2950
|
+
gate: { kind: "shell", command: lastFailure.command }
|
|
2699
2951
|
}
|
|
2700
2952
|
});
|
|
2701
2953
|
if (verifyResult.failure.aborted) {
|
|
@@ -2721,7 +2973,10 @@ async function runOneTaskImpl(deps, task, opts) {
|
|
|
2721
2973
|
runId: deps.runId,
|
|
2722
2974
|
taskId: task.id,
|
|
2723
2975
|
kind: "task.verify.passed",
|
|
2724
|
-
payload: {
|
|
2976
|
+
payload: {
|
|
2977
|
+
attempt,
|
|
2978
|
+
gate: { kind: "all", subKind: "shell", count: allVerify.length }
|
|
2979
|
+
}
|
|
2725
2980
|
});
|
|
2726
2981
|
const touches = await enforceTouches({
|
|
2727
2982
|
cwd,
|
|
@@ -3311,7 +3566,7 @@ function startStreamingLogger(args) {
|
|
|
3311
3566
|
const taskStart = /* @__PURE__ */ new Map();
|
|
3312
3567
|
let succeeded = 0;
|
|
3313
3568
|
let failed = 0;
|
|
3314
|
-
const INLINE_BLOCKED_CAP =
|
|
3569
|
+
const INLINE_BLOCKED_CAP = 0;
|
|
3315
3570
|
let blockedCount = 0;
|
|
3316
3571
|
let blockedInlineEmitted = 0;
|
|
3317
3572
|
let blockedOverflowEmitted = false;
|
|
@@ -3350,6 +3605,24 @@ function startStreamingLogger(args) {
|
|
|
3350
3605
|
if (id !== null) taskStart.set(id, event.ts);
|
|
3351
3606
|
write(`task.started ${id ?? "?"}`);
|
|
3352
3607
|
break;
|
|
3608
|
+
case "task.baseline.passed":
|
|
3609
|
+
break;
|
|
3610
|
+
case "task.baseline.failed": {
|
|
3611
|
+
const bp = event.payload;
|
|
3612
|
+
if (bp !== null && typeof bp === "object" && typeof bp.command === "string" && typeof bp.exitCode === "number") {
|
|
3613
|
+
write(
|
|
3614
|
+
`task.baseline.failed ${id ?? "?"} (${bp.command} \u2192 exit ${bp.exitCode})`
|
|
3615
|
+
);
|
|
3616
|
+
const output = typeof bp.output === "string" ? bp.output : null;
|
|
3617
|
+
if (output !== null && output.trim().length > 0) {
|
|
3618
|
+
const tail = output.trim().split("\n").slice(-6).map((l) => ` ${l}`).join("\n");
|
|
3619
|
+
writeRaw(tail);
|
|
3620
|
+
}
|
|
3621
|
+
} else {
|
|
3622
|
+
write(`task.baseline.failed ${id ?? "?"}`);
|
|
3623
|
+
}
|
|
3624
|
+
break;
|
|
3625
|
+
}
|
|
3353
3626
|
case "task.verify.passed":
|
|
3354
3627
|
write(`task.verify.passed ${id ?? "?"}`);
|
|
3355
3628
|
break;
|
|
@@ -3435,7 +3708,7 @@ function startStreamingLogger(args) {
|
|
|
3435
3708
|
case "task.attempt": {
|
|
3436
3709
|
const p = event.payload;
|
|
3437
3710
|
if (p !== null && typeof p === "object" && typeof p.attempt === "number" && typeof p.of === "number" && p.attempt >= 2) {
|
|
3438
|
-
|
|
3711
|
+
write(`task.retry ${id ?? "?"} attempt ${p.attempt}/${p.of}`);
|
|
3439
3712
|
}
|
|
3440
3713
|
break;
|
|
3441
3714
|
}
|
|
@@ -3561,9 +3834,17 @@ Failed tasks (${failed.length}):
|
|
|
3561
3834
|
session: ${session}
|
|
3562
3835
|
worktree: ${worktree}
|
|
3563
3836
|
elapsed: ${elapsed} attempts: ${t.attempts}
|
|
3564
|
-
|
|
3565
3837
|
`
|
|
3566
3838
|
);
|
|
3839
|
+
const baselineOutput = resolveBaselineOutput(db, runId, t.task_id);
|
|
3840
|
+
if (baselineOutput !== null) {
|
|
3841
|
+
const tail = baselineOutput.trim().split("\n").slice(-6).map((l) => ` ${l}`).join("\n");
|
|
3842
|
+
process.stdout.write(` output:
|
|
3843
|
+
${tail}
|
|
3844
|
+
`);
|
|
3845
|
+
}
|
|
3846
|
+
process.stdout.write(`
|
|
3847
|
+
`);
|
|
3567
3848
|
}
|
|
3568
3849
|
}
|
|
3569
3850
|
}
|
|
@@ -3592,6 +3873,18 @@ function resolveFailureDetail(db, runId, row) {
|
|
|
3592
3873
|
reason: row.last_error ?? "(no reason recorded)"
|
|
3593
3874
|
};
|
|
3594
3875
|
}
|
|
3876
|
+
function resolveBaselineOutput(db, runId, taskId) {
|
|
3877
|
+
const events = readEventsDecoded(db, { runId, taskId });
|
|
3878
|
+
for (let i = events.length - 1; i >= 0; i--) {
|
|
3879
|
+
const e = events[i];
|
|
3880
|
+
if (e.kind !== "task.baseline.failed") continue;
|
|
3881
|
+
const p = e.payload;
|
|
3882
|
+
if (p !== null && typeof p === "object" && typeof p.output === "string") {
|
|
3883
|
+
return p.output;
|
|
3884
|
+
}
|
|
3885
|
+
}
|
|
3886
|
+
return null;
|
|
3887
|
+
}
|
|
3595
3888
|
function truncateSummary(s, maxChars) {
|
|
3596
3889
|
if (s.length <= maxChars) return s;
|
|
3597
3890
|
return s.slice(0, maxChars - 1) + "\u2026";
|
|
@@ -1866,7 +1866,7 @@ import { join as join8 } from "path";
|
|
|
1866
1866
|
var APP_KEY = "A-US-3617699429";
|
|
1867
1867
|
var ENDPOINT = "https://us.aptabase.com/api/v0/event";
|
|
1868
1868
|
var PKG_NAME = "@glrs-dev/harness-plugin-opencode";
|
|
1869
|
-
var PKG_VERSION = true ? "1.
|
|
1869
|
+
var PKG_VERSION = true ? "1.2.0" : "dev";
|
|
1870
1870
|
var DISABLED = process.env.HARNESS_OPENCODE_TELEMETRY === "0" || process.env.HARNESS_OPENCODE_TELEMETRY === "false" || process.env.DO_NOT_TRACK === "1" || process.env.CI === "true";
|
|
1871
1871
|
var SESSION_ID = randomUUID();
|
|
1872
1872
|
function getInstallId() {
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: code-quality
|
|
3
|
+
description: Four principles for autonomous code quality — think before coding, simplicity first, surgical changes, goal-driven execution. Load this skill when planning, building, or reviewing any non-trivial change. Derived from observed patterns in AI-agent-authored PRs where review feedback clustered around wrong assumptions, overcomplication, scope creep, and missing failure-mode coverage.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Code Quality Principles
|
|
7
|
+
|
|
8
|
+
Four principles that prevent the most common classes of defects in AI-agent-authored code. Each principle applies at every pipeline phase, but the enforcement actions differ by phase. Load the rule file for your current role.
|
|
9
|
+
|
|
10
|
+
These principles are derived from empirical analysis of recurring review feedback on agent-authored PRs. The top defect categories — wrong assumptions at system boundaries, overcomplicated implementations, unplanned side-effects, and happy-path-only coverage — are all preventable by applying the right check at the right phase.
|
|
11
|
+
|
|
12
|
+
## The four principles
|
|
13
|
+
|
|
14
|
+
1. **Think Before Coding** — Don't assume. Surface ambiguity, verify cross-boundary names, present tradeoffs, stop when confused.
|
|
15
|
+
2. **Simplicity First** — Minimum code that solves the problem. No speculative features, no single-use abstractions, no "flexibility" that wasn't requested.
|
|
16
|
+
3. **Surgical Changes** — Touch only what you must. Every changed line traces to the plan. Minimize blast radius on security-sensitive files.
|
|
17
|
+
4. **Goal-Driven Execution** — Define success criteria with real verify commands. Enumerate failure modes. Test the error paths, not just the happy path.
|
|
18
|
+
|
|
19
|
+
## Phase-specific rules
|
|
20
|
+
|
|
21
|
+
Each rule file applies all four principles through the lens of a specific pipeline phase. Load the one that matches your current role:
|
|
22
|
+
|
|
23
|
+
1. [`rules/gap-analysis.md`](rules/gap-analysis.md) — For `@gap-analyzer`. Surface hidden assumptions, missing failure modes, naming mismatches, and overscoped plans before the draft is written.
|
|
24
|
+
|
|
25
|
+
2. [`rules/planning.md`](rules/planning.md) — For `@plan` and `@plan-reviewer`. Verify every cross-boundary identifier. Reject plans that exceed what the goal requires. Require failure-mode coverage in acceptance criteria.
|
|
26
|
+
|
|
27
|
+
3. [`rules/building.md`](rules/building.md) — For `@build`. Enforce surgical changes. Verify names before using them. Flag unplanned edits. Write failure-path tests before happy-path code.
|
|
28
|
+
|
|
29
|
+
4. [`rules/review.md`](rules/review.md) — For `@qa-reviewer` and `@qa-thorough`. Verify failure-path coverage in the diff. Grep-confirm cross-boundary string literals. Reject diffs with unplanned scope.
|
|
30
|
+
|
|
31
|
+
## When to load this skill
|
|
32
|
+
|
|
33
|
+
Load this skill for any non-trivial change — defined as any plan with 3+ file-level changes, or any change touching a system boundary (API contract, database schema, config/security file, cross-service integration).
|
|
34
|
+
|
|
35
|
+
Do NOT load for trivial work (typo fixes, single-file renames, doc-only changes). The overhead isn't worth it.
|
|
36
|
+
|
|
37
|
+
## Observable outcomes
|
|
38
|
+
|
|
39
|
+
These are the signals that the principles are working:
|
|
40
|
+
|
|
41
|
+
- Fewer naming mismatches at system boundaries (cross-boundary identifiers are grep-confirmed before use)
|
|
42
|
+
- Smaller, more focused PRs (plans that exceed ~15 files get split or justified)
|
|
43
|
+
- Zero unplanned changes in diffs (every changed line traces to the plan)
|
|
44
|
+
- Failure-mode coverage in acceptance criteria (negative tests exist for medium+ risk changes)
|
|
45
|
+
- Narrower security-config changes (specific paths instead of broad globs)
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Code Quality — Building Phase
|
|
2
|
+
|
|
3
|
+
You are the build agent. Your job is to execute the plan without introducing the defect classes that dominate agent-authored PRs. These four principles tell you what to enforce during execution.
|
|
4
|
+
|
|
5
|
+
## Principle 1: Think Before Coding
|
|
6
|
+
|
|
7
|
+
At the building phase, this means verifying every assumption the plan makes before writing code against it. The plan is your spec, but specs can be wrong.
|
|
8
|
+
|
|
9
|
+
### Before editing each file
|
|
10
|
+
|
|
11
|
+
- **Verify cross-boundary identifiers.** Before using any identifier from the plan that references an existing system concept (database column, enum value, API field, Temporal signal name, config key, registry target), grep the codebase for the canonical form. If the plan says `"eligibility_request"` but the codebase uses `"eligibilityRequest"`, the plan is wrong — STOP and report.
|
|
12
|
+
- **Verify behavioral assumptions.** If the plan says "this function returns X" or "this endpoint accepts Y," read the actual implementation before writing code that depends on it. Don't trust the plan's description of existing behavior — verify it.
|
|
13
|
+
- **Check for domain-specific safety constraints.** Before modifying a Temporal workflow, check whether the change requires a `patched()` guard. Before modifying a database migration, check whether a down() path is needed. Before modifying an auth flow, check whether the change affects token scoping. These constraints aren't always in the plan — they're in the codebase's conventions.
|
|
14
|
+
|
|
15
|
+
### When you find a mismatch
|
|
16
|
+
|
|
17
|
+
Don't silently work around it. STOP and report:
|
|
18
|
+
|
|
19
|
+
> Plan says `<identifier>` but codebase uses `<canonical form>`. Which is correct?
|
|
20
|
+
|
|
21
|
+
This is a design-change signal, not a cosmetic threshold. The plan needs to be updated before you proceed.
|
|
22
|
+
|
|
23
|
+
### Anti-pattern: the trusting builder
|
|
24
|
+
|
|
25
|
+
Plan says: register target as `"eligibility_request"`. Builder writes code and tests using that name. Tests pass (builder wrote the fixtures). Production breaks because the registry uses `"eligibilityRequest"`. The builder trusted the plan instead of verifying.
|
|
26
|
+
|
|
27
|
+
**Your action:** Grep for every cross-boundary identifier before first use. One grep per identifier. This takes seconds and prevents the most common class of runtime failure.
|
|
28
|
+
|
|
29
|
+
## Principle 2: Simplicity First
|
|
30
|
+
|
|
31
|
+
At the building phase, this means writing the minimum code that satisfies each plan item — not the most comprehensive code you can generate.
|
|
32
|
+
|
|
33
|
+
### During implementation
|
|
34
|
+
|
|
35
|
+
- **Fight the generation instinct.** Your training data is full of comprehensive, well-documented, heavily-abstracted code. That's not what the plan asked for. Write the specific thing the plan describes, in the fewest lines that are correct and readable.
|
|
36
|
+
- **No speculative error handling.** Handle the error cases the plan specifies. Don't add error handling for scenarios the plan doesn't mention — that's scope creep disguised as robustness.
|
|
37
|
+
- **No premature abstraction.** If the plan says "add a function that does X," write a function that does X. Don't write a class hierarchy, a factory, or a strategy pattern unless the plan explicitly calls for it.
|
|
38
|
+
- **Prefer inline over extracted.** If a helper function would be called once, inline it. If a constant would be referenced once, inline it. Extraction is warranted at 2+ call sites.
|
|
39
|
+
- **Match the plan's complexity level.** If the plan describes a 50-line change, don't produce 200 lines. If you find yourself writing significantly more code than the plan implies, that's a signal to STOP and check whether you're overcomplicating.
|
|
40
|
+
|
|
41
|
+
### Anti-pattern: the comprehensive implementation
|
|
42
|
+
|
|
43
|
+
Plan says: "add env-var toggle for mock client." Builder produces: a resolver pattern with dynamic imports, a factory function, a type-safe config schema, and conditional module loading — 200 lines for what could be a 20-line `if (process.env.USE_MOCK)` check. The extra complexity introduces a bug where mock data is unconditionally imported in production.
|
|
44
|
+
|
|
45
|
+
**Your action:** Before writing, estimate the line count the plan implies. If your implementation exceeds 2x that estimate, pause and simplify.
|
|
46
|
+
|
|
47
|
+
## Principle 3: Surgical Changes
|
|
48
|
+
|
|
49
|
+
This is your primary principle. The build agent's #1 failure mode is unplanned side-effects.
|
|
50
|
+
|
|
51
|
+
### After every file edit, check
|
|
52
|
+
|
|
53
|
+
1. **Is this file in `## File-level changes`?** If not → STOP and report. Do not silently expand scope. The only exception: you may add files to the plan yourself when the expansion is ≤2 files and directly required by a planned change.
|
|
54
|
+
|
|
55
|
+
2. **Does every changed line trace to a plan item?** Review your own diff mentally. If any line is "while I'm here" cleanup, adjacent-code improvement, or style normalization — revert it. Your diff should contain zero surprises.
|
|
56
|
+
|
|
57
|
+
3. **Did I modify a security-sensitive file?** Scanner allowlists, auth configs, CORS settings, `.env` templates, CI workflow files, permission manifests. If yes:
|
|
58
|
+
- Is the change the narrowest possible? Could I use a specific file path instead of a glob pattern?
|
|
59
|
+
- Does the plan explicitly mention this change? If not → STOP and report.
|
|
60
|
+
- Would a reviewer looking at this diff ask "why was this changed?" If yes, the change needs justification.
|
|
61
|
+
|
|
62
|
+
4. **Did I touch imports/exports in a file I'm editing?** Only remove imports YOUR changes made unused. If a pre-existing import was already unused, leave it. Only add exports the plan requires. Don't "clean up" the import block.
|
|
63
|
+
|
|
64
|
+
5. **Am I matching existing style?** Read the surrounding code before writing. Match indentation, naming conventions, comment style, error handling patterns, and test structure — even if you'd do it differently. Consistency within a file matters more than your preference.
|
|
65
|
+
|
|
66
|
+
### Security-sensitive file patterns
|
|
67
|
+
|
|
68
|
+
These files require extra scrutiny. Any change must be the narrowest possible and explicitly justified by the plan:
|
|
69
|
+
|
|
70
|
+
- `**/.*rc*`, `**/.eslintrc*`, `**/.secretlintrc*` — linter/scanner configs
|
|
71
|
+
- `**/allowlist*`, `**/whitelist*`, `**/ignore*` — exclusion lists
|
|
72
|
+
- `**/.env*`, `**/env.*.ts` — environment configs
|
|
73
|
+
- `**/auth/**`, `**/security/**`, `**/crypto/**` — auth/security modules
|
|
74
|
+
- `**/*.workflow.ts`, `**/workflows/**` — Temporal workflows (replay safety)
|
|
75
|
+
- `**/migrations/**`, `**/*.sql` — database migrations
|
|
76
|
+
- `**/.github/workflows/**` — CI pipelines
|
|
77
|
+
|
|
78
|
+
### Anti-pattern: the expedient side-effect
|
|
79
|
+
|
|
80
|
+
The builder needs mock data for tests. The PHI scanner flags the mock file. Instead of adding the specific file path (`test/mocks/mock-pms-client.ts`) to the allowlist, the builder adds `**/mock-*.ts` — disabling PHI detection for any matching file across the entire repo. The test passes. The security hole ships.
|
|
81
|
+
|
|
82
|
+
**Your action:** When you need to modify a security-sensitive file, use the most specific pattern possible. If the plan doesn't specify the exact pattern, STOP and ask — don't improvise with a broad glob.
|
|
83
|
+
|
|
84
|
+
### Anti-pattern: the stale-data forward
|
|
85
|
+
|
|
86
|
+
Plan says: "forward the RCM enabled setting to the API." Builder forwards the entire `settings.solutions` object instead of the single `rcmEnabled` field. A concurrent write to any other field in the object gets overwritten by the stale snapshot.
|
|
87
|
+
|
|
88
|
+
**Your action:** When the plan says "forward X," forward exactly X — not the parent object, not a snapshot, not a superset. Read the existing forwarding pattern in the codebase and match it.
|
|
89
|
+
|
|
90
|
+
## Principle 4: Goal-Driven Execution
|
|
91
|
+
|
|
92
|
+
At the building phase, this means working in TDD order and verifying each step — including failure paths.
|
|
93
|
+
|
|
94
|
+
### Execution order
|
|
95
|
+
|
|
96
|
+
For each acceptance criterion in the plan-state fence:
|
|
97
|
+
|
|
98
|
+
1. **Write the test(s) first.** The `tests:` field names the test cases. Write them. They should fail (the implementation doesn't exist yet).
|
|
99
|
+
2. **Write the implementation.** Make the tests pass.
|
|
100
|
+
3. **Run the verify command.** The `verify:` field is the acceptance gate. If the command exits non-zero, fix the implementation and re-run it.
|
|
101
|
+
4. **Check for failure-path coverage.** If the plan includes negative tests (it should for medium+ risk changes), write those too. If the plan doesn't include negative tests but the change has obvious failure modes, write them anyway and note the addition in your return payload.
|
|
102
|
+
|
|
103
|
+
### Cross-boundary verification
|
|
104
|
+
|
|
105
|
+
After implementing code that uses a string literal referencing a domain concept:
|
|
106
|
+
|
|
107
|
+
- **Grep for the canonical form.** `grep -r "eligibilityRequest" src/` to confirm the registry key exists.
|
|
108
|
+
- **Check casing.** If your code uses `"eligibility_request"` and the grep returns `"eligibilityRequest"`, you have a bug — even though TypeScript is happy.
|
|
109
|
+
- **Check plurality.** `"credentials"` vs `"credential"`, `"member"` vs `"members"` — these mismatches pass type checks and fail at runtime.
|
|
110
|
+
|
|
111
|
+
### Anti-pattern: the happy-path-only builder
|
|
112
|
+
|
|
113
|
+
Plan says: "add route validation for Tailscale subnet routes." Builder implements validation for `dev`, `sbx`, and `prod`. For an unknown stack value, the validation returns an empty set — which the approval logic interprets as "all routes approved." The builder didn't write a test for the unknown-stack case because the plan's acceptance criteria only covered known stacks.
|
|
114
|
+
|
|
115
|
+
**Your action:** If the plan's acceptance criteria are all positive and the change has obvious failure modes, write the negative test anyway. Note it in your return payload as a plan expansion. Better to over-test than to ship a fail-open bug.
|
|
116
|
+
|
|
117
|
+
### Temporal workflow safety (domain-specific)
|
|
118
|
+
|
|
119
|
+
If you're modifying a Temporal workflow function body:
|
|
120
|
+
|
|
121
|
+
- **Never delete a workflow branch.** Only add new ones behind `patched()` guards.
|
|
122
|
+
- **The old code path stays behind `!patched(patchId)`.** In-flight executions replay against the old history. Removing the old branch causes a determinism violation.
|
|
123
|
+
- **Test with replay fixtures.** If the plan includes workflow changes, verify that existing replay tests still pass.
|
|
124
|
+
|
|
125
|
+
This is the single highest-severity domain-specific constraint. A determinism violation breaks in-flight production workflows silently.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Code Quality — Gap Analysis Phase
|
|
2
|
+
|
|
3
|
+
You are the gap-analyzer. Your job is to find what's missing before the plan is written. These four principles tell you what to look for.
|
|
4
|
+
|
|
5
|
+
## Principle 1: Think Before Coding
|
|
6
|
+
|
|
7
|
+
This is your primary principle. The gap-analyzer exists to catch wrong assumptions before they propagate into the plan.
|
|
8
|
+
|
|
9
|
+
### What to check
|
|
10
|
+
|
|
11
|
+
- **Cross-boundary identifiers.** For every identifier the planner references — database column, enum value, API field, Temporal signal name, config key, registry target — grep the codebase for the canonical form. The #1 source of runtime failures that pass type checks is a naming mismatch at a system boundary. Snake_case vs camelCase is the most common variant.
|
|
12
|
+
- **Assumed behaviors.** When the planner says "X will call Y" or "Z returns a list of W," verify by reading the actual code. Don't trust documentation — it drifts. Read the implementation.
|
|
13
|
+
- **Silent interpretation choices.** If the user's request is ambiguous and the planner picked one interpretation without stating the alternative, surface the alternative. "The planner assumed X, but Y is also a valid reading."
|
|
14
|
+
- **Missing context.** If the planner references a system the gap-analyzer hasn't seen evidence of (a service, a table, a config file), flag it. "Planner references `eligibility_request` table but I found `eligibilityRequest` in the registry — which is canonical?"
|
|
15
|
+
|
|
16
|
+
### Anti-pattern to catch
|
|
17
|
+
|
|
18
|
+
The planner reads a doc that says "eligibility requests use snake_case keys." The planner writes a plan using snake_case. The actual runtime registry uses camelCase. If you don't catch this, the builder will write code and tests that both use the wrong name — tests pass, production breaks.
|
|
19
|
+
|
|
20
|
+
**Your action:** For every cross-boundary name in the plan draft, report whether you confirmed it or couldn't. Use `serena_find_symbol` for code symbols, `grep` for string literals and config keys.
|
|
21
|
+
|
|
22
|
+
## Principle 2: Simplicity First
|
|
23
|
+
|
|
24
|
+
Surface overscoping before the plan is written. It's cheaper to cut scope now than to review a 13,000-line PR later.
|
|
25
|
+
|
|
26
|
+
### What to check
|
|
27
|
+
|
|
28
|
+
- **Goal-to-file ratio.** If the planner's understanding implies 15+ files for a goal that could be achieved with 5, flag it. "The goal is 'add a config toggle' but the current understanding implies an admin UI, audit logging, and settings forwarding — are all of these in scope?"
|
|
29
|
+
- **Single-use abstractions.** If the planner is proposing a generic framework (registry, engine, factory) and there's only one consumer, flag it. "A generic analytics engine is proposed but only one report type exists — consider a specific implementation."
|
|
30
|
+
- **Speculative features.** If the planner's understanding includes features the user didn't ask for, flag them. "User asked for a mock client; planner's understanding includes an env-var toggle and a resolver pattern — confirm these are needed."
|
|
31
|
+
|
|
32
|
+
### Anti-pattern to catch
|
|
33
|
+
|
|
34
|
+
The planner receives "add per-org RCM toggle" and scopes it as: migration + model + API endpoint + admin UI + audit logging + settings forwarding = 16 files. The narrower scope — toggle + migration + one API field — would ship the feature with fewer defects.
|
|
35
|
+
|
|
36
|
+
**Your action:** If the scope seems wider than the goal requires, list the minimum set of changes that would satisfy the goal and ask whether the additional scope is intentional.
|
|
37
|
+
|
|
38
|
+
## Principle 3: Surgical Changes
|
|
39
|
+
|
|
40
|
+
At the gap-analysis phase, surgical changes means identifying which existing files will be affected and flagging unintended side-effects before they happen.
|
|
41
|
+
|
|
42
|
+
### What to check
|
|
43
|
+
|
|
44
|
+
- **Adjacent code impact.** For each file the planner intends to change, check what else imports from or depends on that file. If a change to `settings.ts` will affect 12 consumers, that's a gap worth surfacing.
|
|
45
|
+
- **Security-sensitive files.** If the planner's scope implies touching a scanner allowlist, auth config, CORS setting, or similar security file, flag it explicitly. "This change will require modifying the PHI scanner allowlist — ensure the plan specifies the narrowest possible pattern."
|
|
46
|
+
- **Config/schema ripple effects.** If the change adds a database column, enum value, or config key, check whether other systems read from the same source. A new column in `member` might need to be excluded from API responses, added to admin endpoints, or handled in export logic.
|
|
47
|
+
|
|
48
|
+
**Your action:** For each file in the planner's scope, report its inbound dependencies (who imports it) and outbound dependencies (what it imports). Flag any dependency that the planner hasn't accounted for.
|
|
49
|
+
|
|
50
|
+
## Principle 4: Goal-Driven Execution
|
|
51
|
+
|
|
52
|
+
At the gap-analysis phase, goal-driven execution means ensuring the plan will have testable success criteria — including failure modes.
|
|
53
|
+
|
|
54
|
+
### What to check
|
|
55
|
+
|
|
56
|
+
- **Missing failure modes.** For each file-level change the planner is considering, ask:
|
|
57
|
+
- What happens on invalid input?
|
|
58
|
+
- What happens on concurrent access?
|
|
59
|
+
- What happens when a dependency is unavailable?
|
|
60
|
+
- What happens when the input data doesn't match the expected schema/casing/format?
|
|
61
|
+
If the planner hasn't considered these, surface them as gaps.
|
|
62
|
+
- **Happy-path-only acceptance criteria.** If the planner's acceptance criteria are all positive ("X works when Y"), flag the missing negatives. "No acceptance criterion covers what happens when the stack value is unknown — this is how fail-open bugs ship."
|
|
63
|
+
- **Unverifiable criteria.** If a criterion can't be checked by running a command, it's not a real criterion. "Criterion says 'settings are persisted correctly' — what command verifies this?"
|
|
64
|
+
|
|
65
|
+
### Anti-pattern to catch
|
|
66
|
+
|
|
67
|
+
The planner writes acceptance criteria for Tailscale route auto-approval: "routes are approved for dev, sbx, and prod stacks." Missing: "unknown stack values produce an error, not an empty approval." The feature works in testing and fails open in production.
|
|
68
|
+
|
|
69
|
+
**Your action:** For every acceptance criterion, propose the corresponding negative test. "If the positive criterion is 'routes approved for known stacks,' the negative criterion should be 'unknown stacks produce an error.'"
|
|
70
|
+
|
|
71
|
+
## Output format
|
|
72
|
+
|
|
73
|
+
Your output should integrate these checks into your standard gap-analysis format:
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
## Gaps
|
|
77
|
+
|
|
78
|
+
1. <Gap from any principle>. Why it matters: <one sentence>. Suggested clarifying question: <one sentence>.
|
|
79
|
+
2. ...
|
|
80
|
+
|
|
81
|
+
## Cross-boundary name verification
|
|
82
|
+
|
|
83
|
+
| Identifier | Source (plan/doc) | Canonical form (codebase) | Match? |
|
|
84
|
+
|---|---|---|---|
|
|
85
|
+
| ... | ... | ... | ✓ / ✗ / not found |
|
|
86
|
+
|
|
87
|
+
## Confirmed assumptions
|
|
88
|
+
|
|
89
|
+
- <Things you checked that DO hold true>
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The cross-boundary name table is new — add it whenever the plan references existing system identifiers. This is the single highest-leverage check you perform.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Code Quality — Planning Phase
|
|
2
|
+
|
|
3
|
+
You are the plan agent or plan-reviewer. Your job is to produce (or validate) a plan that the builder can execute without introducing the defect classes that dominate agent-authored PRs. These four principles tell you what to enforce.
|
|
4
|
+
|
|
5
|
+
## Principle 1: Think Before Coding
|
|
6
|
+
|
|
7
|
+
At the planning phase, this means every claim in the plan is grounded in the codebase — not in assumptions, not in documentation that may have drifted, not in pattern-matching from training data.
|
|
8
|
+
|
|
9
|
+
### For the plan agent
|
|
10
|
+
|
|
11
|
+
- **Grep-confirm every cross-boundary identifier before writing it into the plan.** Database columns, enum values, API fields, Temporal signal/query names, config keys, registry targets. Use `serena_find_symbol` for code symbols, `grep` for string literals. If you can't confirm the canonical form, put it in `## Open questions` — don't guess.
|
|
12
|
+
- **Cite the source file for every behavioral assumption.** "The webhook fires after finalize" — cite the file and line where that happens. "The settings object is forwarded to the API" — cite the forwarding code. Uncited assumptions become bugs.
|
|
13
|
+
- **Name alternatives you rejected.** If you considered two approaches and picked one, state both in `## Constraints` or inline in the relevant `## File-level changes` entry. The plan-reviewer and builder need to know what you ruled out and why.
|
|
14
|
+
|
|
15
|
+
### For the plan-reviewer
|
|
16
|
+
|
|
17
|
+
- **Spot-check at least one cross-boundary identifier per plan.** Pick the identifier that crosses the most boundaries (e.g., a registry key used by both the API and the worker). Grep for it. If the plan uses a different casing or spelling than the codebase, REJECT.
|
|
18
|
+
- **Flag uncited behavioral assumptions.** If the plan says "X calls Y" without citing a file path, that's a gap. The builder will trust the plan and write code against a behavior that may not exist.
|
|
19
|
+
|
|
20
|
+
### Anti-pattern: the naming mismatch cascade
|
|
21
|
+
|
|
22
|
+
Plan says: target name is `"eligibility_request"` (snake_case, from a doc). Codebase registry uses `"eligibilityRequest"` (camelCase). Builder writes code and tests using the plan's name. Tests pass (builder wrote the fixtures too). Production breaks because the registry key doesn't match.
|
|
23
|
+
|
|
24
|
+
**Prevention:** The plan must contain the canonical form, confirmed by grep. The plan-reviewer must spot-check it.
|
|
25
|
+
|
|
26
|
+
## Principle 2: Simplicity First
|
|
27
|
+
|
|
28
|
+
At the planning phase, this means the plan's scope matches the goal — no more, no less.
|
|
29
|
+
|
|
30
|
+
### For the plan agent
|
|
31
|
+
|
|
32
|
+
- **Every file in `## File-level changes` must trace to `## Goal`.** If you can't explain why a file is there in one sentence that references the goal, it doesn't belong.
|
|
33
|
+
- **No single-use abstractions.** If the plan introduces a generic interface, base class, factory, or registry pattern, there must be 2+ concrete implementations in the plan. One implementation = write the specific thing, not the abstraction.
|
|
34
|
+
- **No speculative features.** Env-var toggles, feature flags, admin UIs, and strategy patterns are out of scope unless the goal explicitly calls for them. "While we're at it" is not a justification.
|
|
35
|
+
- **Consider splitting.** If the plan exceeds ~15 files or ~1000 lines of estimated changes, ask whether it can be two independently-shippable PRs. Each PR should leave the system in a working state.
|
|
36
|
+
- **Prefer the shorter implementation.** If 200 lines could be 50, the plan should describe the 50-line version. The agent's instinct is to generate comprehensive code — the plan should constrain that instinct.
|
|
37
|
+
|
|
38
|
+
### For the plan-reviewer
|
|
39
|
+
|
|
40
|
+
- **Count files vs. goal complexity.** An "add a config toggle" goal with 16 files is a red flag. A "build a new service" goal with 16 files may be appropriate. The ratio matters.
|
|
41
|
+
- **Flag single-use abstractions.** If `## File-level changes` introduces an interface/factory/registry and only one implementation, REJECT with: "Single-use abstraction: `<name>` has only one implementation. Write the specific thing."
|
|
42
|
+
- **Flag "while we're at it" scope.** If a file-level change says "also update X for consistency" or "clean up Y while editing," that's scope creep. REJECT unless `## Goal` explicitly includes it.
|
|
43
|
+
|
|
44
|
+
### Anti-pattern: the full vertical slice
|
|
45
|
+
|
|
46
|
+
Goal: "add per-org RCM toggle." Plan: migration + model change + API endpoint + admin UI + audit logging + settings forwarding = 16 files. The settings-forwarding logic snapshots the entire settings object instead of the single field, creating a stale-data overwrite bug. A narrower plan — toggle + migration + one API field — would have shipped the feature with fewer defects.
|
|
47
|
+
|
|
48
|
+
**Prevention:** The plan-reviewer should ask: "What is the minimum set of files that satisfies the goal?" If the plan has more, each extra file needs explicit justification.
|
|
49
|
+
|
|
50
|
+
## Principle 3: Surgical Changes
|
|
51
|
+
|
|
52
|
+
At the planning phase, surgical changes means scoping the plan tightly and flagging files that need careful handling.
|
|
53
|
+
|
|
54
|
+
### For the plan agent
|
|
55
|
+
|
|
56
|
+
- **Mark security-sensitive files explicitly.** If the plan touches a scanner allowlist, auth config, CORS setting, `.env` template, or similar security file, set `Risk: high` on that entry and add a note: "Security-sensitive file — builder must use the narrowest possible change."
|
|
57
|
+
- **Specify what NOT to change.** Use `## Non-goals` aggressively. "Do NOT modify `src/auth/session.ts`." "Do NOT refactor the existing report runner." Explicit exclusions prevent the builder from "improving" adjacent code.
|
|
58
|
+
- **Scope config changes precisely.** If the plan requires adding a path to an allowlist, specify the exact path in the plan — not "add the mock file to the allowlist" but "add `test/mocks/mock-pms-client.ts` to `.secretlintrc` allowlist." The builder should not have to decide the pattern.
|
|
59
|
+
|
|
60
|
+
### For the plan-reviewer
|
|
61
|
+
|
|
62
|
+
- **Check `## Non-goals` exists and is specific.** A plan without non-goals is a plan that hasn't thought about boundaries. REJECT if missing on any plan with 5+ file-level changes.
|
|
63
|
+
- **Flag missing `Risk:` annotations on security-sensitive files.** If the plan touches an auth, config, or security file and doesn't mark it `Risk: medium` or higher, REJECT.
|
|
64
|
+
|
|
65
|
+
### Anti-pattern: the broad allowlist
|
|
66
|
+
|
|
67
|
+
Plan says "add mock file to PHI scanner allowlist." Builder adds `**/mock-*.ts` instead of the specific file path. The broad glob disables PHI detection for any file matching that pattern across the entire repo.
|
|
68
|
+
|
|
69
|
+
**Prevention:** The plan must specify the exact allowlist entry. The plan-reviewer must verify the entry is specific, not a glob.
|
|
70
|
+
|
|
71
|
+
## Principle 4: Goal-Driven Execution
|
|
72
|
+
|
|
73
|
+
At the planning phase, goal-driven execution means writing acceptance criteria that catch failure modes — not just happy paths.
|
|
74
|
+
|
|
75
|
+
### For the plan agent
|
|
76
|
+
|
|
77
|
+
- **Every acceptance criterion needs a negative test.** For each `- [ ]` item in the plan-state fence, ask: "What's the corresponding failure case?" If the positive criterion is "routes approved for known stacks," the negative criterion should be "unknown stacks produce an error, not an empty approval."
|
|
78
|
+
- **Enumerate failure modes for `Risk: medium+` changes.** In the `## File-level changes` entry or in `## Test plan`, answer:
|
|
79
|
+
- What happens on invalid input?
|
|
80
|
+
- What happens on concurrent access?
|
|
81
|
+
- What happens when a dependency is unavailable?
|
|
82
|
+
- What happens when the input data doesn't match the expected schema/casing/format?
|
|
83
|
+
- **Verify commands must be real assertions.** Not `echo done`. Not `test -f file.ts`. A command that fails when the criterion isn't met. The plan-state fence enforces this structurally, but the plan agent must write meaningful commands.
|
|
84
|
+
- **Include cross-boundary verification.** If the plan introduces a string literal that references a domain concept (table name, enum value, signal name), add a verify step that greps for the canonical form. TypeScript catches type mismatches but not string-literal mismatches.
|
|
85
|
+
|
|
86
|
+
### For the plan-reviewer
|
|
87
|
+
|
|
88
|
+
- **Check for negative tests.** If every acceptance criterion is positive ("X works when Y") and none are negative ("X fails when Z"), REJECT. Happy-path-only criteria produce happy-path-only implementations.
|
|
89
|
+
- **Check verify commands are meaningful.** If a verify command is `echo done`, `test -f`, or `true`, REJECT. The verify must exercise behavior, not existence.
|
|
90
|
+
- **Check failure-mode coverage on `Risk: medium+` entries.** If a file-level change marked `Risk: medium` or higher has no corresponding failure-mode test in `## Test plan` or `## Acceptance criteria`, REJECT.
|
|
91
|
+
|
|
92
|
+
### Anti-pattern: the happy-path-only plan
|
|
93
|
+
|
|
94
|
+
Acceptance criteria: "Tailscale routes are approved for dev, sbx, and prod stacks." Missing: "Unknown stack values produce an error." The builder implements exactly what the plan says. The feature works in testing (which only uses known stacks) and fails open in production.
|
|
95
|
+
|
|
96
|
+
**Prevention:** The plan must include negative acceptance criteria for every medium+ risk change. The plan-reviewer must verify they exist.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Code Quality — Review Phase
|
|
2
|
+
|
|
3
|
+
You are the QA reviewer (fast or thorough variant). Your job is to catch the defect classes that survive planning and building. These four principles tell you what to look for in the diff.
|
|
4
|
+
|
|
5
|
+
## Principle 1: Think Before Coding (verify assumptions survived)
|
|
6
|
+
|
|
7
|
+
The plan made assumptions. The builder may have trusted them without verifying. Your job is to catch the ones that slipped through.
|
|
8
|
+
|
|
9
|
+
### What to check
|
|
10
|
+
|
|
11
|
+
- **Cross-boundary string literals.** For every new string literal in the diff that references a domain concept (table name, enum value, signal name, config key, registry target, Temporal workflow/signal/query name), grep the codebase for the canonical form. If the diff uses `"eligibility_request"` but the codebase uses `"eligibilityRequest"`, that's a FAIL — even if tests pass (the tests probably use the same wrong name).
|
|
12
|
+
- **Casing and plurality mismatches.** Specifically check:
|
|
13
|
+
- snake_case vs camelCase vs PascalCase
|
|
14
|
+
- Singular vs plural (`"credential"` vs `"credentials"`, `"member"` vs `"members"`)
|
|
15
|
+
- Abbreviated vs full (`"req"` vs `"request"`, `"org"` vs `"organization"`)
|
|
16
|
+
- **Behavioral assumptions in the code.** If the diff contains a comment like "// this returns X" or "// called after Y," spot-check one or two of these by reading the referenced code. If the comment is wrong, the code is probably wrong too.
|
|
17
|
+
- **Temporal workflow changes.** If the diff modifies any file matching `**/*.workflow.ts` or `**/workflows/**`:
|
|
18
|
+
- Check for `patched()` guards on any removed or modified branch.
|
|
19
|
+
- Verify the old code path is preserved behind `!patched(patchId)`.
|
|
20
|
+
- If a workflow branch was deleted without a patch guard, that's a FAIL — determinism violation.
|
|
21
|
+
|
|
22
|
+
### Output format for naming mismatches
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
FAIL
|
|
26
|
+
|
|
27
|
+
1. src/analytics/engine.ts:42 — String literal "eligibility_request" does not match canonical form "eligibilityRequest" (found in src/registry/targets.ts:15). Runtime key mismatch.
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Principle 2: Simplicity First (verify scope matches goal)
|
|
31
|
+
|
|
32
|
+
The plan may have been well-scoped, but the builder may have expanded it. Or the plan itself may have been overscoped and the plan-reviewer missed it. You're the last line of defense.
|
|
33
|
+
|
|
34
|
+
### What to check
|
|
35
|
+
|
|
36
|
+
- **File count vs. goal complexity.** Read the plan's `## Goal`. Count the files in the diff. Does the ratio make sense? An "add a config toggle" goal with 16 changed files is suspicious. A "build a new service" goal with 16 files may be appropriate.
|
|
37
|
+
- **Single-use abstractions in the diff.** If the diff introduces an interface, base class, factory, or registry pattern, check whether it has more than one implementation in the diff. If not, FAIL with: "Single-use abstraction: `<name>` has only one implementation in this diff. Simplify to the concrete implementation."
|
|
38
|
+
- **Speculative code.** If the diff contains code paths that aren't exercised by any test in the diff and aren't required by the plan, that's dead-on-arrival code. FAIL with the specific file and line.
|
|
39
|
+
- **Unnecessary complexity.** If a function in the diff could be written in significantly fewer lines without losing correctness or readability, note it. This isn't an auto-FAIL, but it's worth flagging: "src/resolver.ts:15-80 — 65-line resolver pattern could be a 10-line conditional import. Consider simplifying."
|
|
40
|
+
|
|
41
|
+
## Principle 3: Surgical Changes (verify diff discipline)
|
|
42
|
+
|
|
43
|
+
This is your primary enforcement principle. The QA reviewer exists to catch unplanned changes.
|
|
44
|
+
|
|
45
|
+
### What to check
|
|
46
|
+
|
|
47
|
+
- **Plan drift (AUTO-FAIL).** For each modified file in the diff, verify it appears in the plan's `## File-level changes`. A modified file NOT listed in the plan is AUTO-FAIL. Report as: `Plan drift: <path> modified but not in ## File-level changes`.
|
|
48
|
+
- **Scope creep (AUTO-FAIL).** For each untracked file (from `git status`) not in the plan, run `git log --oneline -- <file>` to check if it's pre-existing. No prior commits AND not in the plan → FAIL with: `Scope creep: <path> untracked and not in plan`.
|
|
49
|
+
- **Security-sensitive file changes.** If the diff modifies any of these file patterns, apply extra scrutiny:
|
|
50
|
+
- Scanner/linter configs (`.*rc*`, `allowlist*`, `ignore*`)
|
|
51
|
+
- Auth/security modules (`auth/**`, `security/**`, `crypto/**`)
|
|
52
|
+
- Environment configs (`.env*`, `env.*.ts`)
|
|
53
|
+
- CI pipelines (`.github/workflows/**`)
|
|
54
|
+
- Database migrations (`migrations/**`, `*.sql`)
|
|
55
|
+
- Temporal workflows (`*.workflow.ts`, `workflows/**`)
|
|
56
|
+
|
|
57
|
+
For each, check:
|
|
58
|
+
- Does the plan explicitly mention this file? If not → FAIL.
|
|
59
|
+
- Is the change the narrowest possible? If a glob pattern was added where a specific path would do → FAIL with: `Overly broad pattern in <file>: "<glob>" should be "<specific-path>"`.
|
|
60
|
+
- **"While I'm here" changes.** If the diff contains style fixes, import reordering, comment updates, or dead-code removal in lines adjacent to (but not part of) the planned change, FAIL with: `Unplanned adjacent change in <file>:<line> — not in plan`.
|
|
61
|
+
- **Pre-existing code modifications.** If the diff removes or modifies code that existed before this branch and the plan doesn't mention it, FAIL. The builder should only remove orphans its own changes created.
|
|
62
|
+
|
|
63
|
+
## Principle 4: Goal-Driven Execution (verify failure-path coverage)
|
|
64
|
+
|
|
65
|
+
The builder may have implemented the happy path perfectly and skipped every failure mode. Your job is to catch that.
|
|
66
|
+
|
|
67
|
+
### What to check
|
|
68
|
+
|
|
69
|
+
- **Failure-path test coverage.** For each file-level change with `Risk: medium` or higher in the plan:
|
|
70
|
+
- Does the diff include at least one test for an error/failure case? Not just "valid input produces correct output" but "invalid input produces an error."
|
|
71
|
+
- If the change adds a new API endpoint, does the diff include a test for an error response (400, 404, 500)?
|
|
72
|
+
- If the change adds validation logic, does the diff include a test for invalid input?
|
|
73
|
+
- If the change modifies a config/security file, does the diff include a test that verifies the restriction works?
|
|
74
|
+
If no failure-path tests exist for a medium+ risk change → FAIL with: `Missing failure-path test for <file> (Risk: <level>). No error/edge-case test found in diff.`
|
|
75
|
+
|
|
76
|
+
- **Fail-open patterns.** Specifically look for:
|
|
77
|
+
- Validation functions that return empty/default on unknown input instead of throwing
|
|
78
|
+
- Switch/if-else chains with no default/else that handles unexpected values
|
|
79
|
+
- Try-catch blocks that swallow errors silently (empty catch, catch that only logs)
|
|
80
|
+
- Config lookups that fall back to permissive defaults on missing keys
|
|
81
|
+
Report each as: `Potential fail-open: <file>:<line> — <description>. Unknown input falls through to <permissive behavior>.`
|
|
82
|
+
|
|
83
|
+
- **Verify command execution.** Run every verify command from the plan-state fence. Trust nothing — not the `[x]` checkboxes, not the builder's narrative. If a verify command exits non-zero → FAIL.
|
|
84
|
+
|
|
85
|
+
- **Cross-boundary contract verification.** For every new string literal in the diff that references a domain concept, grep for the canonical form. This overlaps with Principle 1's check — do it anyway. It's the single highest-leverage check and takes seconds.
|
|
86
|
+
|
|
87
|
+
### Anti-pattern: the invisible fail-open
|
|
88
|
+
|
|
89
|
+
Diff adds a function `validateStack(stack: string)` that returns `approvedRoutes` for known stacks and `[]` (empty array) for unknown stacks. The caller interprets `[]` as "no routes to reject" → approves everything. No test covers the unknown-stack case. The QA reviewer who doesn't check for fail-open patterns misses it.
|
|
90
|
+
|
|
91
|
+
**Your action:** For every validation/filtering function in the diff, trace what happens when the input doesn't match any expected value. If the result is permissive (empty set, null, undefined, default-allow), that's a fail-open candidate. FAIL unless a test explicitly covers that case.
|
|
92
|
+
|
|
93
|
+
## Summary: the six checks in execution order
|
|
94
|
+
|
|
95
|
+
Run these in order during your review:
|
|
96
|
+
|
|
97
|
+
1. **Plan drift + scope creep** (Principle 3) — fast, mechanical, AUTO-FAIL
|
|
98
|
+
2. **Security-sensitive file scrutiny** (Principle 3) — check narrowness of patterns
|
|
99
|
+
3. **Cross-boundary name verification** (Principle 1) — grep string literals against canonical forms
|
|
100
|
+
4. **Failure-path coverage** (Principle 4) — check for negative tests on medium+ risk changes
|
|
101
|
+
5. **Simplicity check** (Principle 2) — flag single-use abstractions and speculative code
|
|
102
|
+
6. **Verify command execution** (Principle 4) — run every verify command from the fence
|
|
103
|
+
|
|
104
|
+
Items 1-2 are AUTO-FAIL. Items 3-4 are FAIL if the issue is confirmed. Item 5 is advisory (flag but don't auto-fail unless egregious). Item 6 is FAIL on non-zero exit.
|
|
@@ -8,7 +8,7 @@ The validator catches schema, DAG, and glob errors. It cannot catch "this verify
|
|
|
8
8
|
|
|
9
9
|
1. **Is each task right-sized?** Reread each task's prompt. Could the pilot-builder do it in ~20 minutes with the standard `max_turns: 50`? If a task feels like 2 hours of work, split it. If it feels like 2 minutes, merge it.
|
|
10
10
|
|
|
11
|
-
2. **Does each verify command HAVE to fail before the task runs?** For each task, mentally checkout the pre-task state. Would the verify command fail there? If not, the verify isn't observing the task's effect — fix it.
|
|
11
|
+
2. **Does each verify command HAVE to fail before the task runs?** For each task, mentally checkout the pre-task state. Would the verify command fail there? If not, the verify isn't observing the task's effect — fix it. **Also check milestone and defaults verify commands:** mentally walk the DAG in order and confirm that `defaults.verify_after_each` and each milestone's `verify` pass at every task boundary — including right after scaffold tasks that create a test runner config but zero test files. If a broad `test` command would exit 1 on "no test files found", add `--passWithNoTests` (vitest/jest) or equivalent.
|
|
12
12
|
|
|
13
13
|
3. **Is each `touches:` glob the tightest fit?** For each task, list the files the agent will need to edit. Are they all matched? Are there ANY paths matched that the agent SHOULDN'T touch? If yes to either, refine.
|
|
14
14
|
|
|
@@ -59,6 +59,48 @@ This prevents the agent from wasting its 5-attempt retry budget on failures it d
|
|
|
59
59
|
|
|
60
60
|
The agent gets 5 attempts (with escalating "try a different approach" nudges) for failures it introduces AFTER the baseline passes. Pre-existing failures never reach the agent.
|
|
61
61
|
|
|
62
|
+
## Milestone and defaults verify run in the baseline too
|
|
63
|
+
|
|
64
|
+
The baseline check doesn't only run task-specific verify commands — it runs **everything except** the task's own `verify:` list. That means:
|
|
65
|
+
|
|
66
|
+
- `defaults.verify_after_each` commands
|
|
67
|
+
- The task's milestone `verify` commands
|
|
68
|
+
- `pilot.json` `baseline` and `after_each` commands
|
|
69
|
+
|
|
70
|
+
These commands run on the clean tree **before every task in their scope**. If a milestone verify is `pnpm --filter @pkg test` and the first task in that milestone scaffolds the package with a test runner config but zero test files, the *second* task's baseline fails — vitest/jest exit 1 on "no test files found", and the entire downstream DAG cascades to failure.
|
|
71
|
+
|
|
72
|
+
**The rule: every milestone and defaults verify command must pass at every point in the DAG where it applies — including immediately after scaffold tasks that create zero test files.**
|
|
73
|
+
|
|
74
|
+
### The empty-test-suite trap
|
|
75
|
+
|
|
76
|
+
Some test runners treat "no test files found" as a failure by default:
|
|
77
|
+
|
|
78
|
+
| Runner | Behavior on zero tests | Fix |
|
|
79
|
+
|---|---|---|
|
|
80
|
+
| vitest | exit 1 | `--passWithNoTests` |
|
|
81
|
+
| jest | exit 1 | `--passWithNoTests` |
|
|
82
|
+
| bun test | exit 0 (safe by default) | — |
|
|
83
|
+
|
|
84
|
+
When a plan scaffolds a new package or module, the scaffold task creates the test runner config but typically no test files — the first real task creates those. Any milestone or defaults verify that runs the package's test suite will hit the empty-suite exit code.
|
|
85
|
+
|
|
86
|
+
**Fix: always use `--passWithNoTests` (or equivalent) on milestone and defaults verify commands that run a test suite.** This is not a weakening of the verify — it's acknowledging that "zero tests, zero failures" is a valid baseline state for a package under construction.
|
|
87
|
+
|
|
88
|
+
```yaml
|
|
89
|
+
# WRONG — fails baseline after scaffold task
|
|
90
|
+
milestones:
|
|
91
|
+
- name: M1-ENGINE
|
|
92
|
+
verify:
|
|
93
|
+
- pnpm --filter @pkg test
|
|
94
|
+
|
|
95
|
+
# RIGHT — tolerates the empty state between scaffold and first real task
|
|
96
|
+
milestones:
|
|
97
|
+
- name: M1-ENGINE
|
|
98
|
+
verify:
|
|
99
|
+
- pnpm --filter @pkg test -- --passWithNoTests
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Task-specific verify does NOT need `--passWithNoTests` — it targets the exact test file the task creates, and the baseline excludes task-specific verify commands (they'd fail before the task runs by design — that's TDD).
|
|
103
|
+
|
|
62
104
|
## Two-tier verify
|
|
63
105
|
|
|
64
106
|
Use BOTH a per-task verify and `defaults.verify_after_each`:
|