@glrs-dev/cli 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # @glrs-dev/cli
2
2
 
3
+ ## 1.2.0
4
+
3
5
  ## 1.1.0
4
6
 
5
7
  ### Patch Changes
@@ -1142,11 +1142,60 @@ CREATE TABLE IF NOT EXISTS events (
1142
1142
  CREATE INDEX IF NOT EXISTS idx_events_run ON events(run_id, id);
1143
1143
  CREATE INDEX IF NOT EXISTS idx_events_run_task ON events(run_id, task_id, id);
1144
1144
  `.trim();
1145
+ var V2_SQL = `
1146
+ CREATE TABLE IF NOT EXISTS workflows (
1147
+ id TEXT NOT NULL PRIMARY KEY,
1148
+ goal TEXT NOT NULL,
1149
+ started_at INTEGER NOT NULL,
1150
+ finished_at INTEGER,
1151
+ status TEXT NOT NULL CHECK (status IN ('pending','running','completed','aborted','failed')),
1152
+ current_phase TEXT
1153
+ );
1154
+
1155
+ CREATE TABLE IF NOT EXISTS phases (
1156
+ workflow_id TEXT NOT NULL,
1157
+ name TEXT NOT NULL CHECK (name IN ('scope','plan','build','qa','followup')),
1158
+ status TEXT NOT NULL CHECK (status IN ('pending','running','completed','aborted','failed')),
1159
+ started_at INTEGER,
1160
+ finished_at INTEGER,
1161
+ artifact_path TEXT,
1162
+ PRIMARY KEY (workflow_id, name),
1163
+ FOREIGN KEY (workflow_id) REFERENCES workflows(id) ON DELETE CASCADE
1164
+ );
1165
+
1166
+ CREATE TABLE IF NOT EXISTS artifacts (
1167
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
1168
+ workflow_id TEXT NOT NULL,
1169
+ phase TEXT NOT NULL,
1170
+ kind TEXT NOT NULL,
1171
+ path TEXT NOT NULL,
1172
+ created_at INTEGER NOT NULL,
1173
+ sha256 TEXT,
1174
+ FOREIGN KEY (workflow_id) REFERENCES workflows(id) ON DELETE CASCADE
1175
+ );
1176
+
1177
+ CREATE INDEX IF NOT EXISTS idx_artifacts_workflow_phase ON artifacts(workflow_id, phase);
1178
+
1179
+ ALTER TABLE events ADD COLUMN phase TEXT;
1180
+
1181
+ INSERT INTO workflows (id, goal, started_at, finished_at, status, current_phase)
1182
+ SELECT id, plan_slug, started_at, finished_at, status, 'build' FROM runs;
1183
+
1184
+ INSERT INTO phases (workflow_id, name, status, started_at, finished_at, artifact_path)
1185
+ SELECT id, 'build', status, started_at, finished_at, NULL FROM runs;
1186
+
1187
+ UPDATE events SET phase = 'build' WHERE phase IS NULL;
1188
+ `.trim();
1145
1189
  var MIGRATIONS = [
1146
1190
  {
1147
1191
  version: 1,
1148
1192
  description: "initial pilot schema (runs/tasks/events)",
1149
1193
  sql: V1_SQL
1194
+ },
1195
+ {
1196
+ version: 2,
1197
+ description: "workflows/phases/artifacts tables + events.phase column",
1198
+ sql: V2_SQL
1150
1199
  }
1151
1200
  ];
1152
1201
  function applyMigrations(db) {
@@ -1279,8 +1328,8 @@ function appendEvent(db, args) {
1279
1328
  });
1280
1329
  }
1281
1330
  db.run(
1282
- `INSERT INTO events (run_id, task_id, ts, kind, payload) VALUES (?, ?, ?, ?, ?)`,
1283
- [args.runId, args.taskId ?? null, ts, args.kind, payloadStr]
1331
+ `INSERT INTO events (run_id, task_id, ts, kind, payload, phase) VALUES (?, ?, ?, ?, ?, ?)`,
1332
+ [args.runId, args.taskId ?? null, ts, args.kind, payloadStr, args.phase ?? null]
1284
1333
  );
1285
1334
  if (eventSubscribers.length > 0) {
1286
1335
  const snapshot = eventSubscribers.slice();
@@ -1291,6 +1340,7 @@ function appendEvent(db, args) {
1291
1340
  taskId: args.taskId ?? null,
1292
1341
  kind: args.kind,
1293
1342
  payload: args.payload,
1343
+ phase: args.phase ?? null,
1294
1344
  ts
1295
1345
  });
1296
1346
  } catch {
@@ -1865,25 +1915,78 @@ function fixPrompt(_task, last) {
1865
1915
  return sections.join("\n");
1866
1916
  }
1867
1917
 
1868
- // src/pilot/verify/runner.ts
1869
- import { spawn as spawn2 } from "child_process";
1870
- var DEFAULT_TIMEOUT_MS = 5 * 60 * 1e3;
1871
- var DEFAULT_OUTPUT_CAP_BYTES = 256 * 1024;
1872
- var TRUNCATION_NOTICE = "\n[pilot] verify output truncated\n";
1873
- async function runVerify(commands, options) {
1918
+ // src/pilot/gates/composite.ts
1919
+ async function evalAllGate(gate, ctx) {
1920
+ const startedAt = Date.now();
1874
1921
  const results = [];
1875
- for (const command10 of commands) {
1876
- const result = await runOne(command10, options);
1877
- results.push(result);
1878
- if (!result.ok) {
1879
- return { ok: false, results, failure: result };
1922
+ for (const sub of gate.gates) {
1923
+ const subResult = await evalGate(sub, ctx);
1924
+ results.push({ gate: sub, result: subResult });
1925
+ if (!subResult.ok) {
1926
+ const evidence2 = {
1927
+ kind: "all",
1928
+ results,
1929
+ failure: subResult
1930
+ };
1931
+ return {
1932
+ ok: false,
1933
+ reason: subResult.reason,
1934
+ evidence: evidence2,
1935
+ durationMs: Date.now() - startedAt
1936
+ };
1880
1937
  }
1881
1938
  }
1939
+ const evidence = { kind: "all", results };
1882
1940
  return {
1883
1941
  ok: true,
1884
- results
1942
+ evidence,
1943
+ durationMs: Date.now() - startedAt
1885
1944
  };
1886
1945
  }
1946
+ async function evalAnyGate(gate, ctx) {
1947
+ const startedAt = Date.now();
1948
+ const results = [];
1949
+ if (gate.gates.length === 0) {
1950
+ const evidence2 = { kind: "any", results };
1951
+ return {
1952
+ ok: false,
1953
+ reason: "any-gate has no sub-gates to satisfy",
1954
+ evidence: evidence2,
1955
+ durationMs: Date.now() - startedAt
1956
+ };
1957
+ }
1958
+ let lastResult = null;
1959
+ for (const sub of gate.gates) {
1960
+ const subResult = await evalGate(sub, ctx);
1961
+ results.push({ gate: sub, result: subResult });
1962
+ lastResult = subResult;
1963
+ if (subResult.ok) {
1964
+ const evidence2 = { kind: "any", results };
1965
+ return {
1966
+ ok: true,
1967
+ evidence: evidence2,
1968
+ durationMs: Date.now() - startedAt
1969
+ };
1970
+ }
1971
+ }
1972
+ const evidence = {
1973
+ kind: "any",
1974
+ results,
1975
+ failure: lastResult ?? void 0
1976
+ };
1977
+ return {
1978
+ ok: false,
1979
+ reason: `any-gate exhausted: all ${results.length} sub-gates failed`,
1980
+ evidence,
1981
+ durationMs: Date.now() - startedAt
1982
+ };
1983
+ }
1984
+
1985
+ // src/pilot/verify/spawn.ts
1986
+ import { spawn as spawn2 } from "child_process";
1987
+ var DEFAULT_TIMEOUT_MS = 5 * 60 * 1e3;
1988
+ var DEFAULT_OUTPUT_CAP_BYTES = 256 * 1024;
1989
+ var TRUNCATION_NOTICE = "\n[pilot] verify output truncated\n";
1887
1990
  async function runOne(command10, options) {
1888
1991
  if (typeof command10 !== "string" || command10.length === 0) {
1889
1992
  throw new TypeError(`runOne: command must be a non-empty string`);
@@ -2020,6 +2123,147 @@ function killTree(child) {
2020
2123
  }, 2e3).unref();
2021
2124
  }
2022
2125
 
2126
+ // src/pilot/gates/shell.ts
2127
+ async function evalShellGate(gate, ctx) {
2128
+ const result = await runOne(gate.command, {
2129
+ cwd: ctx.cwd,
2130
+ env: ctx.env,
2131
+ abortSignal: ctx.abortSignal,
2132
+ onLine: ctx.onShellLine,
2133
+ timeoutMs: gate.timeoutMs,
2134
+ outputCapBytes: ctx.shellOutputCapBytes
2135
+ });
2136
+ return toGateResult(result);
2137
+ }
2138
+ function toGateResult(result) {
2139
+ if (result.ok) {
2140
+ return {
2141
+ ok: true,
2142
+ durationMs: result.durationMs,
2143
+ evidence: { kind: "shell", result }
2144
+ };
2145
+ }
2146
+ const reason = formatShellFailure(result);
2147
+ return {
2148
+ ok: false,
2149
+ reason,
2150
+ durationMs: result.durationMs,
2151
+ evidence: { kind: "shell", result }
2152
+ };
2153
+ }
2154
+ function formatShellFailure(result) {
2155
+ const flags = [];
2156
+ if (result.timedOut) flags.push("timed-out");
2157
+ if (result.aborted) flags.push("aborted");
2158
+ if (result.signal) flags.push(`signal=${result.signal}`);
2159
+ const flagSuffix = flags.length > 0 ? ` [${flags.join(",")}]` : "";
2160
+ return `shell gate failed: ${result.command} \u2192 exit ${result.exitCode}${flagSuffix}`;
2161
+ }
2162
+
2163
+ // src/pilot/gates/eval.ts
2164
+ async function evalGate(gate, ctx) {
2165
+ switch (gate.kind) {
2166
+ case "shell":
2167
+ return evalShellGate(gate, ctx);
2168
+ case "all":
2169
+ return evalAllGate(gate, ctx);
2170
+ case "any":
2171
+ return evalAnyGate(gate, ctx);
2172
+ default: {
2173
+ const _exhaustive = gate;
2174
+ throw new Error(
2175
+ `evalGate: unknown gate kind ${_exhaustive.kind}`
2176
+ );
2177
+ }
2178
+ }
2179
+ }
2180
+
2181
+ // src/pilot/gates/types.ts
2182
+ function asShellEvidence(evidence) {
2183
+ if (typeof evidence === "object" && evidence !== null && evidence.kind === "shell") {
2184
+ return evidence;
2185
+ }
2186
+ return null;
2187
+ }
2188
+ function asCompositeEvidence(evidence) {
2189
+ if (typeof evidence === "object" && evidence !== null && (evidence.kind === "all" || evidence.kind === "any")) {
2190
+ return evidence;
2191
+ }
2192
+ return null;
2193
+ }
2194
+
2195
+ // src/pilot/verify/runner.ts
2196
+ async function runVerify(commands, options) {
2197
+ if (commands.length === 0) {
2198
+ return { ok: true, results: [] };
2199
+ }
2200
+ const gate = {
2201
+ kind: "all",
2202
+ gates: commands.map((command10) => ({
2203
+ kind: "shell",
2204
+ command: command10,
2205
+ timeoutMs: options.timeoutMs
2206
+ }))
2207
+ };
2208
+ const ctx = {
2209
+ cwd: options.cwd,
2210
+ env: options.env,
2211
+ abortSignal: options.abortSignal,
2212
+ onShellLine: options.onLine,
2213
+ shellOutputCapBytes: options.outputCapBytes
2214
+ };
2215
+ const gateResult = await evalGate(gate, ctx);
2216
+ return toRunVerifyResult(gateResult);
2217
+ }
2218
+ function toRunVerifyResult(gateResult) {
2219
+ const composite = asCompositeEvidence(gateResult.evidence);
2220
+ if (composite === null || composite.kind !== "all") {
2221
+ throw new Error(
2222
+ `runVerify: expected composite all-gate evidence, got ${gateResultDescriptor(gateResult)}`
2223
+ );
2224
+ }
2225
+ const results = composite.results.map((entry) => extractCommandResult(entry));
2226
+ if (gateResult.ok) {
2227
+ return {
2228
+ ok: true,
2229
+ results
2230
+ };
2231
+ }
2232
+ const failingEntry = composite.results[composite.results.length - 1];
2233
+ if (!failingEntry || failingEntry.result.ok) {
2234
+ throw new Error(
2235
+ "runVerify: all-gate failed but no failing sub-result was recorded"
2236
+ );
2237
+ }
2238
+ const failureCommandResult = extractCommandResult(failingEntry);
2239
+ if (failureCommandResult.ok) {
2240
+ throw new Error(
2241
+ "runVerify: failing sub-gate produced a successful CommandResult"
2242
+ );
2243
+ }
2244
+ return {
2245
+ ok: false,
2246
+ results,
2247
+ failure: failureCommandResult
2248
+ };
2249
+ }
2250
+ function extractCommandResult(entry) {
2251
+ const shell = asShellEvidence(entry.result.evidence);
2252
+ if (shell === null) {
2253
+ throw new Error(
2254
+ `runVerify: expected shell-gate evidence in all-gate child, got ${gateResultDescriptor(entry.result)}`
2255
+ );
2256
+ }
2257
+ return shell.result;
2258
+ }
2259
+ function gateResultDescriptor(result) {
2260
+ const evidence = result.evidence;
2261
+ return JSON.stringify({
2262
+ ok: result.ok,
2263
+ evidenceKind: evidence?.kind ?? null
2264
+ });
2265
+ }
2266
+
2023
2267
  // src/pilot/verify/touches.ts
2024
2268
  import picomatch2 from "picomatch";
2025
2269
  import { execFile as execFile2 } from "child_process";
@@ -2530,7 +2774,11 @@ async function runOneTaskImpl(deps, task, opts) {
2530
2774
  command: f.command,
2531
2775
  exitCode: f.exitCode,
2532
2776
  output: f.output.slice(0, 4096),
2533
- reason: reason2
2777
+ reason: reason2,
2778
+ // Step 1 of pilot redesign: gate descriptor on every
2779
+ // verify-derived event. Future LLM/approval gates emit
2780
+ // identically-shaped events with a different `gate.kind`.
2781
+ gate: { kind: "shell", command: f.command }
2534
2782
  }
2535
2783
  });
2536
2784
  return;
@@ -2539,7 +2787,10 @@ async function runOneTaskImpl(deps, task, opts) {
2539
2787
  runId: deps.runId,
2540
2788
  taskId: task.id,
2541
2789
  kind: "task.baseline.passed",
2542
- payload: { commands: allVerify.length }
2790
+ payload: {
2791
+ commands: allVerify.length,
2792
+ gate: { kind: "all", subKind: "shell", count: baselineVerify.length }
2793
+ }
2543
2794
  });
2544
2795
  }
2545
2796
  let lastFailure = null;
@@ -2695,7 +2946,8 @@ async function runOneTaskImpl(deps, task, opts) {
2695
2946
  exitCode: lastFailure.exitCode,
2696
2947
  timedOut: verifyResult.failure.timedOut,
2697
2948
  aborted: verifyResult.failure.aborted,
2698
- output: verifyResult.failure.output.slice(-2048)
2949
+ output: verifyResult.failure.output.slice(-2048),
2950
+ gate: { kind: "shell", command: lastFailure.command }
2699
2951
  }
2700
2952
  });
2701
2953
  if (verifyResult.failure.aborted) {
@@ -2721,7 +2973,10 @@ async function runOneTaskImpl(deps, task, opts) {
2721
2973
  runId: deps.runId,
2722
2974
  taskId: task.id,
2723
2975
  kind: "task.verify.passed",
2724
- payload: { attempt }
2976
+ payload: {
2977
+ attempt,
2978
+ gate: { kind: "all", subKind: "shell", count: allVerify.length }
2979
+ }
2725
2980
  });
2726
2981
  const touches = await enforceTouches({
2727
2982
  cwd,
@@ -3311,7 +3566,7 @@ function startStreamingLogger(args) {
3311
3566
  const taskStart = /* @__PURE__ */ new Map();
3312
3567
  let succeeded = 0;
3313
3568
  let failed = 0;
3314
- const INLINE_BLOCKED_CAP = 5;
3569
+ const INLINE_BLOCKED_CAP = 0;
3315
3570
  let blockedCount = 0;
3316
3571
  let blockedInlineEmitted = 0;
3317
3572
  let blockedOverflowEmitted = false;
@@ -3350,6 +3605,24 @@ function startStreamingLogger(args) {
3350
3605
  if (id !== null) taskStart.set(id, event.ts);
3351
3606
  write(`task.started ${id ?? "?"}`);
3352
3607
  break;
3608
+ case "task.baseline.passed":
3609
+ break;
3610
+ case "task.baseline.failed": {
3611
+ const bp = event.payload;
3612
+ if (bp !== null && typeof bp === "object" && typeof bp.command === "string" && typeof bp.exitCode === "number") {
3613
+ write(
3614
+ `task.baseline.failed ${id ?? "?"} (${bp.command} \u2192 exit ${bp.exitCode})`
3615
+ );
3616
+ const output = typeof bp.output === "string" ? bp.output : null;
3617
+ if (output !== null && output.trim().length > 0) {
3618
+ const tail = output.trim().split("\n").slice(-6).map((l) => ` ${l}`).join("\n");
3619
+ writeRaw(tail);
3620
+ }
3621
+ } else {
3622
+ write(`task.baseline.failed ${id ?? "?"}`);
3623
+ }
3624
+ break;
3625
+ }
3353
3626
  case "task.verify.passed":
3354
3627
  write(`task.verify.passed ${id ?? "?"}`);
3355
3628
  break;
@@ -3435,7 +3708,7 @@ function startStreamingLogger(args) {
3435
3708
  case "task.attempt": {
3436
3709
  const p = event.payload;
3437
3710
  if (p !== null && typeof p === "object" && typeof p.attempt === "number" && typeof p.of === "number" && p.attempt >= 2) {
3438
- writeRaw(` attempt ${p.attempt}/${p.of} (retry with fix prompt)`);
3711
+ write(`task.retry ${id ?? "?"} attempt ${p.attempt}/${p.of}`);
3439
3712
  }
3440
3713
  break;
3441
3714
  }
@@ -3561,9 +3834,17 @@ Failed tasks (${failed.length}):
3561
3834
  session: ${session}
3562
3835
  worktree: ${worktree}
3563
3836
  elapsed: ${elapsed} attempts: ${t.attempts}
3564
-
3565
3837
  `
3566
3838
  );
3839
+ const baselineOutput = resolveBaselineOutput(db, runId, t.task_id);
3840
+ if (baselineOutput !== null) {
3841
+ const tail = baselineOutput.trim().split("\n").slice(-6).map((l) => ` ${l}`).join("\n");
3842
+ process.stdout.write(` output:
3843
+ ${tail}
3844
+ `);
3845
+ }
3846
+ process.stdout.write(`
3847
+ `);
3567
3848
  }
3568
3849
  }
3569
3850
  }
@@ -3592,6 +3873,18 @@ function resolveFailureDetail(db, runId, row) {
3592
3873
  reason: row.last_error ?? "(no reason recorded)"
3593
3874
  };
3594
3875
  }
3876
+ function resolveBaselineOutput(db, runId, taskId) {
3877
+ const events = readEventsDecoded(db, { runId, taskId });
3878
+ for (let i = events.length - 1; i >= 0; i--) {
3879
+ const e = events[i];
3880
+ if (e.kind !== "task.baseline.failed") continue;
3881
+ const p = e.payload;
3882
+ if (p !== null && typeof p === "object" && typeof p.output === "string") {
3883
+ return p.output;
3884
+ }
3885
+ }
3886
+ return null;
3887
+ }
3595
3888
  function truncateSummary(s, maxChars) {
3596
3889
  if (s.length <= maxChars) return s;
3597
3890
  return s.slice(0, maxChars - 1) + "\u2026";
@@ -1866,7 +1866,7 @@ import { join as join8 } from "path";
1866
1866
  var APP_KEY = "A-US-3617699429";
1867
1867
  var ENDPOINT = "https://us.aptabase.com/api/v0/event";
1868
1868
  var PKG_NAME = "@glrs-dev/harness-plugin-opencode";
1869
- var PKG_VERSION = true ? "1.1.0" : "dev";
1869
+ var PKG_VERSION = true ? "1.2.0" : "dev";
1870
1870
  var DISABLED = process.env.HARNESS_OPENCODE_TELEMETRY === "0" || process.env.HARNESS_OPENCODE_TELEMETRY === "false" || process.env.DO_NOT_TRACK === "1" || process.env.CI === "true";
1871
1871
  var SESSION_ID = randomUUID();
1872
1872
  function getInstallId() {
@@ -0,0 +1,45 @@
1
+ ---
2
+ name: code-quality
3
+ description: Four principles for autonomous code quality — think before coding, simplicity first, surgical changes, goal-driven execution. Load this skill when planning, building, or reviewing any non-trivial change. Derived from observed patterns in AI-agent-authored PRs where review feedback clustered around wrong assumptions, overcomplication, scope creep, and missing failure-mode coverage.
4
+ ---
5
+
6
+ # Code Quality Principles
7
+
8
+ Four principles that prevent the most common classes of defects in AI-agent-authored code. Each principle applies at every pipeline phase, but the enforcement actions differ by phase. Load the rule file for your current role.
9
+
10
+ These principles are derived from empirical analysis of recurring review feedback on agent-authored PRs. The top defect categories — wrong assumptions at system boundaries, overcomplicated implementations, unplanned side-effects, and happy-path-only coverage — are all preventable by applying the right check at the right phase.
11
+
12
+ ## The four principles
13
+
14
+ 1. **Think Before Coding** — Don't assume. Surface ambiguity, verify cross-boundary names, present tradeoffs, stop when confused.
15
+ 2. **Simplicity First** — Minimum code that solves the problem. No speculative features, no single-use abstractions, no "flexibility" that wasn't requested.
16
+ 3. **Surgical Changes** — Touch only what you must. Every changed line traces to the plan. Minimize blast radius on security-sensitive files.
17
+ 4. **Goal-Driven Execution** — Define success criteria with real verify commands. Enumerate failure modes. Test the error paths, not just the happy path.
18
+
19
+ ## Phase-specific rules
20
+
21
+ Each rule file applies all four principles through the lens of a specific pipeline phase. Load the one that matches your current role:
22
+
23
+ 1. [`rules/gap-analysis.md`](rules/gap-analysis.md) — For `@gap-analyzer`. Surface hidden assumptions, missing failure modes, naming mismatches, and overscoped plans before the draft is written.
24
+
25
+ 2. [`rules/planning.md`](rules/planning.md) — For `@plan` and `@plan-reviewer`. Verify every cross-boundary identifier. Reject plans that exceed what the goal requires. Require failure-mode coverage in acceptance criteria.
26
+
27
+ 3. [`rules/building.md`](rules/building.md) — For `@build`. Enforce surgical changes. Verify names before using them. Flag unplanned edits. Write failure-path tests before happy-path code.
28
+
29
+ 4. [`rules/review.md`](rules/review.md) — For `@qa-reviewer` and `@qa-thorough`. Verify failure-path coverage in the diff. Grep-confirm cross-boundary string literals. Reject diffs with unplanned scope.
30
+
31
+ ## When to load this skill
32
+
33
+ Load this skill for any non-trivial change — defined as any plan with 3+ file-level changes, or any change touching a system boundary (API contract, database schema, config/security file, cross-service integration).
34
+
35
+ Do NOT load for trivial work (typo fixes, single-file renames, doc-only changes). The overhead isn't worth it.
36
+
37
+ ## Observable outcomes
38
+
39
+ These are the signals that the principles are working:
40
+
41
+ - Fewer naming mismatches at system boundaries (cross-boundary identifiers are grep-confirmed before use)
42
+ - Smaller, more focused PRs (plans that exceed ~15 files get split or justified)
43
+ - Zero unplanned changes in diffs (every changed line traces to the plan)
44
+ - Failure-mode coverage in acceptance criteria (negative tests exist for medium+ risk changes)
45
+ - Narrower security-config changes (specific paths instead of broad globs)
@@ -0,0 +1,125 @@
1
+ # Code Quality — Building Phase
2
+
3
+ You are the build agent. Your job is to execute the plan without introducing the defect classes that dominate agent-authored PRs. These four principles tell you what to enforce during execution.
4
+
5
+ ## Principle 1: Think Before Coding
6
+
7
+ At the building phase, this means verifying every assumption the plan makes before writing code against it. The plan is your spec, but specs can be wrong.
8
+
9
+ ### Before editing each file
10
+
11
+ - **Verify cross-boundary identifiers.** Before using any identifier from the plan that references an existing system concept (database column, enum value, API field, Temporal signal name, config key, registry target), grep the codebase for the canonical form. If the plan says `"eligibility_request"` but the codebase uses `"eligibilityRequest"`, the plan is wrong — STOP and report.
12
+ - **Verify behavioral assumptions.** If the plan says "this function returns X" or "this endpoint accepts Y," read the actual implementation before writing code that depends on it. Don't trust the plan's description of existing behavior — verify it.
13
+ - **Check for domain-specific safety constraints.** Before modifying a Temporal workflow, check whether the change requires a `patched()` guard. Before modifying a database migration, check whether a down() path is needed. Before modifying an auth flow, check whether the change affects token scoping. These constraints aren't always in the plan — they're in the codebase's conventions.
14
+
15
+ ### When you find a mismatch
16
+
17
+ Don't silently work around it. STOP and report:
18
+
19
+ > Plan says `<identifier>` but codebase uses `<canonical form>`. Which is correct?
20
+
21
+ This is a design-change signal, not a cosmetic threshold. The plan needs to be updated before you proceed.
22
+
23
+ ### Anti-pattern: the trusting builder
24
+
25
+ Plan says: register target as `"eligibility_request"`. Builder writes code and tests using that name. Tests pass (builder wrote the fixtures). Production breaks because the registry uses `"eligibilityRequest"`. The builder trusted the plan instead of verifying.
26
+
27
+ **Your action:** Grep for every cross-boundary identifier before first use. One grep per identifier. This takes seconds and prevents the most common class of runtime failure.
28
+
29
+ ## Principle 2: Simplicity First
30
+
31
+ At the building phase, this means writing the minimum code that satisfies each plan item — not the most comprehensive code you can generate.
32
+
33
+ ### During implementation
34
+
35
+ - **Fight the generation instinct.** Your training data is full of comprehensive, well-documented, heavily-abstracted code. That's not what the plan asked for. Write the specific thing the plan describes, in the fewest lines that are correct and readable.
36
+ - **No speculative error handling.** Handle the error cases the plan specifies. Don't add error handling for scenarios the plan doesn't mention — that's scope creep disguised as robustness.
37
+ - **No premature abstraction.** If the plan says "add a function that does X," write a function that does X. Don't write a class hierarchy, a factory, or a strategy pattern unless the plan explicitly calls for it.
38
+ - **Prefer inline over extracted.** If a helper function would be called once, inline it. If a constant would be referenced once, inline it. Extraction is warranted at 2+ call sites.
39
+ - **Match the plan's complexity level.** If the plan describes a 50-line change, don't produce 200 lines. If you find yourself writing significantly more code than the plan implies, that's a signal to STOP and check whether you're overcomplicating.
40
+
41
+ ### Anti-pattern: the comprehensive implementation
42
+
43
+ Plan says: "add env-var toggle for mock client." Builder produces: a resolver pattern with dynamic imports, a factory function, a type-safe config schema, and conditional module loading — 200 lines for what could be a 20-line `if (process.env.USE_MOCK)` check. The extra complexity introduces a bug where mock data is unconditionally imported in production.
44
+
45
+ **Your action:** Before writing, estimate the line count the plan implies. If your implementation exceeds 2x that estimate, pause and simplify.
46
+
47
+ ## Principle 3: Surgical Changes
48
+
49
+ This is your primary principle. The build agent's #1 failure mode is unplanned side-effects.
50
+
51
+ ### After every file edit, check
52
+
53
+ 1. **Is this file in `## File-level changes`?** If not → STOP and report; do not silently expand scope. The only exception: you may add files to the plan yourself when the expansion is ≤2 files and directly required by a planned change.
54
+
55
+ 2. **Does every changed line trace to a plan item?** Review your own diff mentally. If any line is "while I'm here" cleanup, adjacent-code improvement, or style normalization — revert it. Your diff should contain zero surprises.
56
+
57
+ 3. **Did I modify a security-sensitive file?** Scanner allowlists, auth configs, CORS settings, `.env` templates, CI workflow files, permission manifests. If yes:
58
+ - Is the change the narrowest possible? Could I use a specific file path instead of a glob pattern?
59
+ - Does the plan explicitly mention this change? If not → STOP and report.
60
+ - Would a reviewer looking at this diff ask "why was this changed?" If yes, the change needs justification.
61
+
62
+ 4. **Did I touch imports/exports in a file I'm editing?** Only remove imports YOUR changes made unused. If a pre-existing import was already unused, leave it. Only add exports the plan requires. Don't "clean up" the import block.
63
+
64
+ 5. **Am I matching existing style?** Read the surrounding code before writing. Match indentation, naming conventions, comment style, error handling patterns, and test structure — even if you'd do it differently. Consistency within a file matters more than your preference.
65
+
66
+ ### Security-sensitive file patterns
67
+
68
+ These files require extra scrutiny. Any change must be the narrowest possible and explicitly justified by the plan:
69
+
70
+ - `**/.*rc*`, `**/.eslintrc*`, `**/.secretlintrc*` — linter/scanner configs
71
+ - `**/allowlist*`, `**/whitelist*`, `**/ignore*` — exclusion lists
72
+ - `**/.env*`, `**/env.*.ts` — environment configs
73
+ - `**/auth/**`, `**/security/**`, `**/crypto/**` — auth/security modules
74
+ - `**/*.workflow.ts`, `**/workflows/**` — Temporal workflows (replay safety)
75
+ - `**/migrations/**`, `**/*.sql` — database migrations
76
+ - `**/.github/workflows/**` — CI pipelines
77
+
78
+ ### Anti-pattern: the expedient side-effect
79
+
80
+ The builder needs mock data for tests. The PHI scanner flags the mock file. Instead of adding the specific file path (`test/mocks/mock-pms-client.ts`) to the allowlist, the builder adds `**/mock-*.ts` — disabling PHI detection for any matching file across the entire repo. The test passes. The security hole ships.
81
+
82
+ **Your action:** When you need to modify a security-sensitive file, use the most specific pattern possible. If the plan doesn't specify the exact pattern, STOP and ask — don't improvise with a broad glob.
83
+
84
+ ### Anti-pattern: the stale-data forward
85
+
86
+ Plan says: "forward the RCM enabled setting to the API." Builder forwards the entire `settings.solutions` object instead of the single `rcmEnabled` field. A concurrent write to any other field in the object gets overwritten by the stale snapshot.
87
+
88
+ **Your action:** When the plan says "forward X," forward exactly X — not the parent object, not a snapshot, not a superset. Read the existing forwarding pattern in the codebase and match it.
89
+
90
+ ## Principle 4: Goal-Driven Execution
91
+
92
+ At the building phase, this means working in TDD order and verifying each step — including failure paths.
93
+
94
+ ### Execution order
95
+
96
+ For each acceptance criterion in the plan-state fence:
97
+
98
+ 1. **Write the test(s) first.** The `tests:` field names the test cases. Write them. They should fail (the implementation doesn't exist yet).
99
+ 2. **Write the implementation.** Make the tests pass.
100
+ 3. **Run the verify command.** The `verify:` field is the acceptance gate. If it exits non-zero, fix and re-run.
101
+ 4. **Check for failure-path coverage.** If the plan includes negative tests (it should for medium+ risk changes), write those too. If the plan doesn't include negative tests but the change has obvious failure modes, write them anyway and note the addition in your return payload.
102
+
103
+ ### Cross-boundary verification
104
+
105
+ After implementing code that uses a string literal referencing a domain concept:
106
+
107
+ - **Grep for the canonical form.** For example, run `grep -r "eligibilityRequest" src/` to confirm the registry key exists.
108
+ - **Check casing.** If your code uses `"eligibility_request"` and the grep returns `"eligibilityRequest"`, you have a bug — even though TypeScript is happy.
109
+ - **Check plurality.** `"credentials"` vs `"credential"`, `"member"` vs `"members"` — these mismatches pass type checks and fail at runtime.
110
+
111
+ ### Anti-pattern: the happy-path-only builder
112
+
113
+ Plan says: "add route validation for Tailscale subnet routes." Builder implements validation for `dev`, `sbx`, and `prod`. For an unknown stack value, the validation returns an empty set — which the approval logic interprets as "all routes approved." The builder didn't write a test for the unknown-stack case because the plan's acceptance criteria only covered known stacks.
114
+
115
+ **Your action:** If the plan's acceptance criteria are all positive and the change has obvious failure modes, write the negative test anyway. Note it in your return payload as a plan expansion. Better to over-test than to ship a fail-open bug.
116
+
117
+ ### Temporal workflow safety (domain-specific)
118
+
119
+ If you're modifying a Temporal workflow function body:
120
+
121
+ - **Never delete a workflow branch.** Only add new ones behind `patched()` guards.
122
+ - **The old code path stays behind `!patched(patchId)`.** In-flight executions replay against the old history. Removing the old branch causes a determinism violation.
123
+ - **Test with replay fixtures.** If the plan includes workflow changes, verify that existing replay tests still pass.
124
+
125
+ This is the single highest-severity domain-specific constraint. A determinism violation breaks in-flight production workflows silently.
@@ -0,0 +1,92 @@
1
+ # Code Quality — Gap Analysis Phase
2
+
3
+ You are the gap-analyzer. Your job is to find what's missing before the plan is written. These four principles tell you what to look for.
4
+
5
+ ## Principle 1: Think Before Coding
6
+
7
+ This is your primary principle. The gap-analyzer exists to catch wrong assumptions before they propagate into the plan.
8
+
9
+ ### What to check
10
+
11
+ - **Cross-boundary identifiers.** For every identifier the planner references — database column, enum value, API field, Temporal signal name, config key, registry target — grep the codebase for the canonical form. The #1 source of runtime failures that pass type checks is a naming mismatch at a system boundary. Snake_case vs camelCase is the most common variant.
12
+ - **Assumed behaviors.** When the planner says "X will call Y" or "Z returns a list of W," verify by reading the actual code. Don't trust documentation — it drifts. Read the implementation.
13
+ - **Silent interpretation choices.** If the user's request is ambiguous and the planner picked one interpretation without stating the alternative, surface the alternative. "The planner assumed X, but Y is also a valid reading."
14
+ - **Missing context.** If the planner references a system the gap-analyzer hasn't seen evidence of (a service, a table, a config file), flag it. "Planner references `eligibility_request` table but I found `eligibilityRequest` in the registry — which is canonical?"
15
+
16
+ ### Anti-pattern to catch
17
+
18
+ The planner reads a doc that says "eligibility requests use snake_case keys." The planner writes a plan using snake_case. The actual runtime registry uses camelCase. If you don't catch this, the builder will write code and tests that both use the wrong name — tests pass, production breaks.
19
+
20
+ **Your action:** For every cross-boundary name in the plan draft, report whether you confirmed it or couldn't. Use `serena_find_symbol` for code symbols, `grep` for string literals and config keys.
21
+
22
+ ## Principle 2: Simplicity First
23
+
24
+ Surface overscoping before the plan is written. It's cheaper to cut scope now than to review a 13,000-line PR later.
25
+
26
+ ### What to check
27
+
28
+ - **Goal-to-file ratio.** If the planner's understanding implies 15+ files for a goal that could be achieved with 5, flag it. "The goal is 'add a config toggle' but the current understanding implies an admin UI, audit logging, and settings forwarding — are all of these in scope?"
29
+ - **Single-use abstractions.** If the planner is proposing a generic framework (registry, engine, factory) and there's only one consumer, flag it. "A generic analytics engine is proposed but only one report type exists — consider a specific implementation."
30
+ - **Speculative features.** If the planner's understanding includes features the user didn't ask for, flag them. "User asked for a mock client; planner's understanding includes an env-var toggle and a resolver pattern — confirm these are needed."
31
+
32
+ ### Anti-pattern to catch
33
+
34
+ The planner receives "add per-org RCM toggle" and scopes it as: migration + model + API endpoint + admin UI + audit logging + settings forwarding = 16 files. The narrower scope — toggle + migration + one API field — would ship the feature with fewer defects.
35
+
36
+ **Your action:** If the scope seems wider than the goal requires, list the minimum set of changes that would satisfy the goal and ask whether the additional scope is intentional.
37
+
38
+ ## Principle 3: Surgical Changes
39
+
40
+ At the gap-analysis phase, surgical changes means identifying which existing files will be affected and flagging unintended side-effects before they happen.
41
+
42
+ ### What to check
43
+
44
+ - **Adjacent code impact.** For each file the planner intends to change, check what else imports from or depends on that file. If a change to `settings.ts` will affect 12 consumers, that's a gap worth surfacing.
45
+ - **Security-sensitive files.** If the planner's scope implies touching a scanner allowlist, auth config, CORS setting, or similar security file, flag it explicitly. "This change will require modifying the PHI scanner allowlist — ensure the plan specifies the narrowest possible pattern."
46
+ - **Config/schema ripple effects.** If the change adds a database column, enum value, or config key, check whether other systems read from the same source. A new column in `member` might need to be excluded from API responses, added to admin endpoints, or handled in export logic.
47
+
48
+ **Your action:** For each file in the planner's scope, report its inbound dependencies (who imports it) and outbound dependencies (what it imports). Flag any dependency that the planner hasn't accounted for.
49
+
50
+ ## Principle 4: Goal-Driven Execution
51
+
52
+ At the gap-analysis phase, goal-driven execution means ensuring the plan will have testable success criteria — including failure modes.
53
+
54
+ ### What to check
55
+
56
+ - **Missing failure modes.** For each file-level change the planner is considering, ask:
57
+ - What happens on invalid input?
58
+ - What happens on concurrent access?
59
+ - What happens when a dependency is unavailable?
60
+ - What happens when the input data doesn't match the expected schema/casing/format?
61
+ If the planner hasn't considered these, surface them as gaps.
62
+ - **Happy-path-only acceptance criteria.** If the planner's acceptance criteria are all positive ("X works when Y"), flag the missing negatives. "No acceptance criterion covers what happens when the stack value is unknown — this is how fail-open bugs ship."
63
+ - **Unverifiable criteria.** If a criterion can't be checked by running a command, it's not a real criterion. "Criterion says 'settings are persisted correctly' — what command verifies this?"
64
+
65
+ ### Anti-pattern to catch
66
+
67
+ The planner writes acceptance criteria for Tailscale route auto-approval: "routes are approved for dev, sbx, and prod stacks." Missing: "unknown stack values produce an error, not an empty approval." The feature works in testing and fails open in production.
68
+
69
+ **Your action:** For every acceptance criterion, propose the corresponding negative test. "If the positive criterion is 'routes approved for known stacks,' the negative criterion should be 'unknown stacks produce an error.'"
70
+
71
+ ## Output format
72
+
73
+ Your output should integrate these checks into your standard gap-analysis format:
74
+
75
+ ```
76
+ ## Gaps
77
+
78
+ 1. <Gap from any principle>. Why it matters: <one sentence>. Suggested clarifying question: <one sentence>.
79
+ 2. ...
80
+
81
+ ## Cross-boundary name verification
82
+
83
+ | Identifier | Source (plan/doc) | Canonical form (codebase) | Match? |
84
+ |---|---|---|---|
85
+ | ... | ... | ... | ✓ / ✗ / not found |
86
+
87
+ ## Confirmed assumptions
88
+
89
+ - <Things you checked that DO hold true>
90
+ ```
91
+
92
+ The cross-boundary name table is new — add it whenever the plan references existing system identifiers. This is the single highest-leverage check you perform.
@@ -0,0 +1,96 @@
1
+ # Code Quality — Planning Phase
2
+
3
+ You are the plan agent or plan-reviewer. Your job is to produce (or validate) a plan that the builder can execute without introducing the defect classes that dominate agent-authored PRs. These four principles tell you what to enforce.
4
+
5
+ ## Principle 1: Think Before Coding
6
+
7
+ At the planning phase, this means every claim in the plan is grounded in the codebase — not in assumptions, not in documentation that may have drifted, not in pattern-matching from training data.
8
+
9
+ ### For the plan agent
10
+
11
+ - **Grep-confirm every cross-boundary identifier before writing it into the plan.** Database columns, enum values, API fields, Temporal signal/query names, config keys, registry targets. Use `serena_find_symbol` for code symbols, `grep` for string literals. If you can't confirm the canonical form, put it in `## Open questions` — don't guess.
12
+ - **Cite the source file for every behavioral assumption.** "The webhook fires after finalize" — cite the file and line where that happens. "The settings object is forwarded to the API" — cite the forwarding code. Uncited assumptions become bugs.
13
+ - **Name alternatives you rejected.** If you considered two approaches and picked one, state both in `## Constraints` or inline in the relevant `## File-level changes` entry. The plan-reviewer and builder need to know what you ruled out and why.
14
+
15
+ ### For the plan-reviewer
16
+
17
+ - **Spot-check at least one cross-boundary identifier per plan.** Pick the identifier that crosses the most boundaries (e.g., a registry key used by both the API and the worker). Grep for it. If the plan uses a different casing or spelling than the codebase, REJECT.
18
+ - **Flag uncited behavioral assumptions.** If the plan says "X calls Y" without citing a file path, that's a gap. The builder will trust the plan and write code against a behavior that may not exist.
19
+
20
+ ### Anti-pattern: the naming mismatch cascade
21
+
22
+ Plan says: target name is `"eligibility_request"` (snake_case, from a doc). Codebase registry uses `"eligibilityRequest"` (camelCase). Builder writes code and tests using the plan's name. Tests pass (builder wrote the fixtures too). Production breaks because the registry key doesn't match.
23
+
24
+ **Prevention:** The plan must contain the canonical form, confirmed by grep. The plan-reviewer must spot-check it.
25
+
26
+ ## Principle 2: Simplicity First
27
+
28
+ At the planning phase, this means the plan's scope matches the goal — no more, no less.
29
+
30
+ ### For the plan agent
31
+
32
+ - **Every file in `## File-level changes` must trace to `## Goal`.** If you can't explain why a file is there in one sentence that references the goal, it doesn't belong.
33
+ - **No single-use abstractions.** If the plan introduces a generic interface, base class, factory, or registry pattern, there must be 2+ concrete implementations in the plan. One implementation = write the specific thing, not the abstraction.
34
+ - **No speculative features.** Env-var toggles, feature flags, admin UIs, and strategy patterns are scope creep unless the goal explicitly calls for them. "While we're at it" is not a justification.
35
+ - **Consider splitting.** If the plan exceeds ~15 files or ~1000 lines of estimated changes, ask whether it can be two independently-shippable PRs. Each PR should leave the system in a working state.
36
+ - **Prefer the shorter implementation.** If 200 lines could be 50, the plan should describe the 50-line version. The agent's instinct is to generate comprehensive code — the plan should constrain that instinct.
37
+
38
+ ### For the plan-reviewer
39
+
40
+ - **Count files vs. goal complexity.** An "add a config toggle" goal with 16 files is a red flag. A "build a new service" goal with 16 files may be appropriate. The ratio matters.
41
+ - **Flag single-use abstractions.** If `## File-level changes` introduces an interface/factory/registry and only one implementation, REJECT with: "Single-use abstraction: `<name>` has only one implementation. Write the specific thing."
42
+ - **Flag "while we're at it" scope.** If a file-level change says "also update X for consistency" or "clean up Y while editing," that's scope creep. REJECT unless `## Goal` explicitly includes it.
43
+
44
+ ### Anti-pattern: the full vertical slice
45
+
46
+ Goal: "add per-org RCM toggle." Plan: migration + model change + API endpoint + admin UI + audit logging + settings forwarding = 16 files. The settings-forwarding logic snapshots the entire settings object instead of the single field, creating a stale-data overwrite bug. A narrower plan — toggle + migration + one API field — would have shipped the feature with fewer defects.
47
+
48
+ **Prevention:** The plan-reviewer should ask: "What is the minimum set of files that satisfies the goal?" If the plan has more, each extra file needs explicit justification.
49
+
50
+ ## Principle 3: Surgical Changes
51
+
52
+ At the planning phase, surgical changes means scoping the plan tightly and flagging files that need careful handling.
53
+
54
+ ### For the plan agent
55
+
56
+ - **Mark security-sensitive files explicitly.** If the plan touches a scanner allowlist, auth config, CORS setting, `.env` template, or similar security file, set `Risk: high` on that entry and add a note: "Security-sensitive file — builder must use the narrowest possible change."
57
+ - **Specify what NOT to change.** Use `## Non-goals` aggressively. "Do NOT modify `src/auth/session.ts`." "Do NOT refactor the existing report runner." Explicit exclusions prevent the builder from "improving" adjacent code.
58
+ - **Scope config changes precisely.** If the plan requires adding a path to an allowlist, specify the exact path in the plan — not "add the mock file to the allowlist" but "add `test/mocks/mock-pms-client.ts` to `.secretlintrc` allowlist." The builder should not have to decide the pattern.
59
+
60
+ ### For the plan-reviewer
61
+
62
+ - **Check `## Non-goals` exists and is specific.** A plan without non-goals is a plan that hasn't thought about boundaries. REJECT if missing on any plan with 5+ file-level changes.
63
+ - **Flag missing `Risk:` annotations on security-sensitive files.** If the plan touches an auth, config, or security file and doesn't mark it `Risk: medium` or higher, REJECT.
64
+
65
+ ### Anti-pattern: the broad allowlist
66
+
67
+ Plan says "add mock file to PHI scanner allowlist." Builder adds `**/mock-*.ts` instead of the specific file path. The broad glob disables PHI detection for any file matching that pattern across the entire repo.
68
+
69
+ **Prevention:** The plan must specify the exact allowlist entry. The plan-reviewer must verify the entry is specific, not a glob.
70
+
71
+ ## Principle 4: Goal-Driven Execution
72
+
73
+ At the planning phase, goal-driven execution means writing acceptance criteria that catch failure modes — not just happy paths.
74
+
75
+ ### For the plan agent
76
+
77
+ - **Every acceptance criterion needs a negative test.** For each `- [ ]` item in the plan-state fence, ask: "What's the corresponding failure case?" If the positive criterion is "routes approved for known stacks," the negative criterion should be "unknown stacks produce an error, not an empty approval."
78
+ - **Enumerate failure modes for `Risk: medium+` changes.** In the `## File-level changes` entry or in `## Test plan`, answer:
79
+ - What happens on invalid input?
80
+ - What happens on concurrent access?
81
+ - What happens when a dependency is unavailable?
82
+ - What happens when the input data doesn't match the expected schema/casing/format?
83
+ - **Verify commands must be real assertions.** Not `echo done`. Not `test -f file.ts`. A command that fails when the criterion isn't met. The plan-state fence enforces this structurally, but the plan agent must write meaningful commands.
84
+ - **Include cross-boundary verification.** If the plan introduces a string literal that references a domain concept (table name, enum value, signal name), add a verify step that greps for the canonical form. TypeScript catches type mismatches but not string-literal mismatches.
85
+
86
+ ### For the plan-reviewer
87
+
88
+ - **Check for negative tests.** If every acceptance criterion is positive ("X works when Y") and none are negative ("X fails when Z"), REJECT. Happy-path-only criteria produce happy-path-only implementations.
89
+ - **Check verify commands are meaningful.** If a verify command is `echo done`, `test -f`, or `true`, REJECT. The verify must exercise behavior, not existence.
90
+ - **Check failure-mode coverage on `Risk: medium+` entries.** If a file-level change marked `Risk: medium` or higher has no corresponding failure-mode test in `## Test plan` or `## Acceptance criteria`, REJECT.
91
+
92
+ ### Anti-pattern: the happy-path-only plan
93
+
94
+ Acceptance criteria: "Tailscale routes are approved for dev, sbx, and prod stacks." Missing: "Unknown stack values produce an error." The builder implements exactly what the plan says. The feature works in testing (which only uses known stacks) and fails open in production.
95
+
96
+ **Prevention:** The plan must include negative acceptance criteria for every medium+ risk change. The plan-reviewer must verify they exist.
@@ -0,0 +1,104 @@
1
+ # Code Quality — Review Phase
2
+
3
+ You are the QA reviewer (fast or thorough variant). Your job is to catch the defect classes that survive planning and building. These four principles tell you what to look for in the diff.
4
+
5
+ ## Principle 1: Think Before Coding (verify assumptions survived)
6
+
7
+ The plan made assumptions. The builder may have trusted them without verifying. Your job is to catch the ones that slipped through.
8
+
9
+ ### What to check
10
+
11
+ - **Cross-boundary string literals.** For every new string literal in the diff that references a domain concept (table name, enum value, signal name, config key, registry target, Temporal workflow/signal/query name), grep the codebase for the canonical form. If the diff uses `"eligibility_request"` but the codebase uses `"eligibilityRequest"`, that's a FAIL — even if tests pass (the tests probably use the same wrong name).
12
+ - **Casing and plurality mismatches.** Specifically check:
13
+ - snake_case vs camelCase vs PascalCase
14
+ - Singular vs plural (`"credential"` vs `"credentials"`, `"member"` vs `"members"`)
15
+ - Abbreviated vs full (`"req"` vs `"request"`, `"org"` vs `"organization"`)
16
+ - **Behavioral assumptions in the code.** If the diff contains a comment like "// this returns X" or "// called after Y," spot-check one or two of these by reading the referenced code. If the comment is wrong, the code is probably wrong too.
17
+ - **Temporal workflow changes.** If the diff modifies any file matching `**/*.workflow.ts` or `**/workflows/**`:
18
+ - Check for `patched()` guards on any removed or modified branch.
19
+ - Verify the old code path is preserved behind `!patched(patchId)`.
20
+ - If a workflow branch was deleted without a patch guard, that's a FAIL — determinism violation.
21
+
22
+ ### Output format for naming mismatches
23
+
24
+ ```
25
+ FAIL
26
+
27
+ 1. src/analytics/engine.ts:42 — String literal "eligibility_request" does not match canonical form "eligibilityRequest" (found in src/registry/targets.ts:15). Runtime key mismatch.
28
+ ```
29
+
30
+ ## Principle 2: Simplicity First (verify scope matches goal)
31
+
32
+ The plan may have been well-scoped, but the builder may have expanded it. Or the plan itself may have been overscoped and the plan-reviewer missed it. You're the last line of defense.
33
+
34
+ ### What to check
35
+
36
+ - **File count vs. goal complexity.** Read the plan's `## Goal`. Count the files in the diff. Does the ratio make sense? An "add a config toggle" goal with 16 changed files is suspicious. A "build a new service" goal with 16 files may be appropriate.
37
+ - **Single-use abstractions in the diff.** If the diff introduces an interface, base class, factory, or registry pattern, check whether it has more than one implementation in the diff. If not, FAIL with: "Single-use abstraction: `<name>` has only one implementation in this diff. Simplify to the concrete implementation."
38
+ - **Speculative code.** If the diff contains code paths that aren't exercised by any test in the diff and aren't required by the plan, that's dead-on-arrival code. FAIL with the specific file and line.
39
+ - **Unnecessary complexity.** If a function in the diff could be written in significantly fewer lines without losing correctness or readability, note it. This isn't an auto-FAIL, but it's worth flagging: "src/resolver.ts:15-80 — 65-line resolver pattern could be a 10-line conditional import. Consider simplifying."
40
+
41
+ ## Principle 3: Surgical Changes (verify diff discipline)
42
+
43
+ This is your primary enforcement principle. The QA reviewer exists to catch unplanned changes.
44
+
45
+ ### What to check
46
+
47
+ - **Plan drift (AUTO-FAIL).** For each modified file in the diff, verify it appears in the plan's `## File-level changes`. A modified file NOT listed in the plan is AUTO-FAIL. Report as: `Plan drift: <path> modified but not in ## File-level changes`.
48
+ - **Scope creep (AUTO-FAIL).** For each untracked file (from `git status`) not in the plan, run `git log --oneline -- <file>` to check if it's pre-existing. No prior commits AND not in the plan → FAIL with: `Scope creep: <path> untracked and not in plan`.
49
+ - **Security-sensitive file changes.** If the diff modifies any of these file patterns, apply extra scrutiny:
50
+ - Scanner/linter configs (`.*rc*`, `allowlist*`, `ignore*`)
51
+ - Auth/security modules (`auth/**`, `security/**`, `crypto/**`)
52
+ - Environment configs (`.env*`, `env.*.ts`)
53
+ - CI pipelines (`.github/workflows/**`)
54
+ - Database migrations (`migrations/**`, `*.sql`)
55
+ - Temporal workflows (`*.workflow.ts`, `workflows/**`)
56
+
57
+ For each, check:
58
+ - Does the plan explicitly mention this file? If not → FAIL.
59
+ - Is the change the narrowest possible? If a glob pattern was added where a specific path would do → FAIL with: `Overly broad pattern in <file>: "<glob>" should be "<specific-path>"`.
60
+ - **"While I'm here" changes.** If the diff contains style fixes, import reordering, comment updates, or dead-code removal in lines adjacent to (but not part of) the planned change, FAIL with: `Unplanned adjacent change in <file>:<line> — not in plan`.
61
+ - **Pre-existing code modifications.** If the diff removes or modifies code that existed before this branch and the plan doesn't mention it, FAIL. The builder should only remove orphans its own changes created.
62
+
63
+ ## Principle 4: Goal-Driven Execution (verify failure-path coverage)
64
+
65
+ The builder may have implemented the happy path perfectly and skipped every failure mode. Your job is to catch that.
66
+
67
+ ### What to check
68
+
69
+ - **Failure-path test coverage.** For each file-level change with `Risk: medium` or higher in the plan:
70
+ - Does the diff include at least one test for an error/failure case? Not just "valid input produces correct output" but "invalid input produces an error."
71
+ - If the change adds a new API endpoint, does the diff include a test for an error response (400, 404, 500)?
72
+ - If the change adds validation logic, does the diff include a test for invalid input?
73
+ - If the change modifies a config/security file, does the diff include a test that verifies the restriction works?
74
+ If no failure-path tests exist for a medium+ risk change → FAIL with: `Missing failure-path test for <file> (Risk: <level>). No error/edge-case test found in diff.`
75
+
76
+ - **Fail-open patterns.** Specifically look for:
77
+ - Validation functions that return empty/default on unknown input instead of throwing
78
+ - Switch/if-else chains with no default/else that handles unexpected values
79
+ - Try-catch blocks that swallow errors silently (empty catch, catch that only logs)
80
+ - Config lookups that fall back to permissive defaults on missing keys
81
+ Report each as: `Potential fail-open: <file>:<line> — <description>. Unknown input falls through to <permissive behavior>.`
82
+
83
+ - **Verify command execution.** Run every verify command from the plan-state fence. Trust nothing — not the `[x]` checkboxes, not the builder's narrative. If a verify command exits non-zero → FAIL.
84
+
85
+ - **Cross-boundary contract verification.** For every new string literal in the diff that references a domain concept, grep for the canonical form. This overlaps with Principle 1's check — do it anyway. It's the single highest-leverage check and takes seconds.
86
+
87
+ ### Anti-pattern: the invisible fail-open
88
+
89
+ Diff adds a function `validateStack(stack: string)` that returns `approvedRoutes` for known stacks and `[]` (empty array) for unknown stacks. The caller interprets `[]` as "no routes to reject" → approves everything. No test covers the unknown-stack case. The QA reviewer who doesn't check for fail-open patterns misses it.
90
+
91
+ **Your action:** For every validation/filtering function in the diff, trace what happens when the input doesn't match any expected value. If the result is permissive (empty set, null, undefined, default-allow), that's a fail-open candidate. FAIL unless a test explicitly covers that case.
92
+
93
+ ## Summary: the six checks in execution order
94
+
95
+ Run these in order during your review:
96
+
97
+ 1. **Plan drift + scope creep** (Principle 3) — fast, mechanical, AUTO-FAIL
98
+ 2. **Security-sensitive file scrutiny** (Principle 3) — check narrowness of patterns
99
+ 3. **Cross-boundary name verification** (Principle 1) — grep string literals against canonical forms
100
+ 4. **Failure-path coverage** (Principle 4) — check for negative tests on medium+ risk changes
101
+ 5. **Simplicity check** (Principle 2) — flag single-use abstractions and speculative code
102
+ 6. **Verify command execution** (Principle 4) — run every verify command from the fence
103
+
104
+ Items 1-2 are AUTO-FAIL. Items 3-4 are FAIL if the issue is confirmed. Item 5 is advisory (flag but don't auto-fail unless egregious). Item 6 is FAIL on non-zero exit.
@@ -8,7 +8,7 @@ The validator catches schema, DAG, and glob errors. It cannot catch "this verify
8
8
 
9
9
  1. **Is each task right-sized?** Reread each task's prompt. Could the pilot-builder do it in ~20 minutes with the standard `max_turns: 50`? If a task feels like 2 hours of work, split it. If it feels like 2 minutes, merge it.
10
10
 
11
- 2. **Does each verify command HAVE to fail before the task runs?** For each task, mentally checkout the pre-task state. Would the verify command fail there? If not, the verify isn't observing the task's effect — fix it.
11
+ 2. **Does each verify command HAVE to fail before the task runs?** For each task, mentally checkout the pre-task state. Would the verify command fail there? If not, the verify isn't observing the task's effect — fix it. **Also check milestone and defaults verify commands:** mentally walk the DAG in order and confirm that `defaults.verify_after_each` and each milestone's `verify` pass at every task boundary — including right after scaffold tasks that create a test runner config but zero test files. If a broad `test` command would exit 1 on "no test files found", add `--passWithNoTests` (vitest/jest) or equivalent.
12
12
 
13
13
  3. **Is each `touches:` glob the tightest fit?** For each task, list the files the agent will need to edit. Are they all matched? Are there ANY paths matched that the agent SHOULDN'T touch? If yes to either, refine.
14
14
 
@@ -59,6 +59,48 @@ This prevents the agent from wasting its 5-attempt retry budget on failures it d
59
59
 
60
60
  The agent gets 5 attempts (with escalating "try a different approach" nudges) for failures it introduces AFTER the baseline passes. Pre-existing failures never reach the agent.
61
61
 
62
+ ## Milestone and defaults verify run in the baseline too
63
+
64
+ The baseline check never runs a task's own `verify:` list — it runs **every other** verify command that applies to the task. That means:
65
+
66
+ - `defaults.verify_after_each` commands
67
+ - The task's milestone `verify` commands
68
+ - `pilot.json` `baseline` and `after_each` commands
69
+
70
+ These commands run on the clean tree **before every task in their scope**. If a milestone verify is `pnpm --filter @pkg test` and the first task in that milestone scaffolds the package with a test runner config but zero test files, the *second* task's baseline fails — vitest/jest exit 1 on "no test files found", and the entire downstream DAG cascades to failure.
71
+
72
+ **The rule: every milestone and defaults verify command must pass at every point in the DAG where it applies — including immediately after scaffold tasks that create zero test files.**
73
+
74
+ ### The empty-test-suite trap
75
+
76
+ Test runners treat "no test files found" as a failure by default:
77
+
78
+ | Runner | Behavior on zero tests | Fix |
79
+ |---|---|---|
80
+ | vitest | exit 1 | `--passWithNoTests` |
81
+ | jest | exit 1 | `--passWithNoTests` |
82
+ | bun test | exit 0 (safe by default) | — |
83
+
84
+ When a plan scaffolds a new package or module, the scaffold task creates the test runner config but typically no test files — the first real task creates those. Any milestone or defaults verify that runs the package's test suite will hit the empty-suite exit code.
85
+
86
+ **Fix: always use `--passWithNoTests` (or equivalent) on milestone and defaults verify commands that run a test suite.** This is not a weakening of the verify — it's acknowledging that "zero tests, zero failures" is a valid baseline state for a package under construction.
87
+
88
+ ```yaml
89
+ # WRONG — fails baseline after scaffold task
90
+ milestones:
91
+ - name: M1-ENGINE
92
+ verify:
93
+ - pnpm --filter @pkg test
94
+
95
+ # RIGHT — tolerates the empty state between scaffold and first real task
96
+ milestones:
97
+ - name: M1-ENGINE
98
+ verify:
99
+ - pnpm --filter @pkg test -- --passWithNoTests
100
+ ```
101
+
102
+ Task-specific verify does NOT need `--passWithNoTests` — it targets the exact test file the task creates, and the baseline excludes task-specific verify commands (they'd fail before the task runs by design — that's TDD).
103
+
62
104
  ## Two-tier verify
63
105
 
64
106
  Use BOTH a per-task verify and `defaults.verify_after_each`:
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glrs-dev/harness-plugin-opencode",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "type": "module",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glrs-dev/cli",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "Unified CLI for the @glrs-dev ecosystem — OpenCode agent harness dispatch + worktree management.",
5
5
  "license": "MIT",
6
6
  "repository": {