codeharness 0.32.3 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
2895
2895
  }
2896
2896
 
2897
2897
  // src/modules/infra/init-project.ts
2898
- var HARNESS_VERSION = true ? "0.32.3" : "0.0.0-dev";
2898
+ var HARNESS_VERSION = true ? "0.33.0" : "0.0.0-dev";
2899
2899
  function failResult(opts, error) {
2900
2900
  return {
2901
2901
  status: "fail",
@@ -16,7 +16,7 @@ import {
16
16
  stopCollectorOnly,
17
17
  stopSharedStack,
18
18
  stopStack
19
- } from "./chunk-N57BYUXA.js";
19
+ } from "./chunk-4YPX74BX.js";
20
20
  export {
21
21
  checkRemoteEndpoint,
22
22
  cleanupOrphanedContainers,
package/dist/index.js CHANGED
@@ -40,7 +40,7 @@ import {
40
40
  validateDockerfile,
41
41
  warn,
42
42
  writeState
43
- } from "./chunk-N57BYUXA.js";
43
+ } from "./chunk-4YPX74BX.js";
44
44
 
45
45
  // src/index.ts
46
46
  import { Command } from "commander";
@@ -2507,7 +2507,7 @@ function resolveWorkflow(options) {
2507
2507
  }
2508
2508
 
2509
2509
  // src/lib/workflow-engine.ts
2510
- import { readFileSync as readFileSync13, existsSync as existsSync15 } from "fs";
2510
+ import { readFileSync as readFileSync13, existsSync as existsSync15, writeFileSync as writeFileSync8, mkdirSync as mkdirSync6, rmSync as rmSync2 } from "fs";
2511
2511
  import { join as join12 } from "path";
2512
2512
  import { parse as parse5 } from "yaml";
2513
2513
 
@@ -3347,7 +3347,7 @@ async function executeNullTask(task, taskName, storyKey, state, config, previous
3347
3347
  writeWorkflowState(updatedState, projectDir);
3348
3348
  return { updatedState, output: result.output ?? "", contract };
3349
3349
  }
3350
- async function dispatchTaskWithResult(task, taskName, storyKey, definition, state, config, customPrompt, previousOutputContract) {
3350
+ async function dispatchTaskWithResult(task, taskName, storyKey, definition, state, config, customPrompt, previousOutputContract, storyFiles) {
3351
3351
  const projectDir = config.projectDir ?? process.cwd();
3352
3352
  const traceId = generateTraceId(config.runId, state.iteration, taskName);
3353
3353
  const tracePrompt = formatTracePrompt(traceId);
@@ -3361,7 +3361,7 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
3361
3361
  let workspace = null;
3362
3362
  if (task.source_access === false) {
3363
3363
  try {
3364
- workspace = await createIsolatedWorkspace({ runId: config.runId, storyFiles: [] });
3364
+ workspace = await createIsolatedWorkspace({ runId: config.runId, storyFiles: storyFiles ?? [] });
3365
3365
  cwd = workspace?.toDispatchOptions()?.cwd ?? projectDir;
3366
3366
  } catch {
3367
3367
  cwd = projectDir;
@@ -4030,8 +4030,27 @@ async function executeWorkflow(config) {
4030
4030
  }
4031
4031
  const epicSentinel = `__epic_${epicId}__`;
4032
4032
  if (isTaskCompleted(state, taskName, epicSentinel)) continue;
4033
+ let guideFiles = [];
4034
+ if (task.source_access === false) {
4035
+ const guidesDir = join12(projectDir, ".codeharness", "verify-guides");
4036
+ try {
4037
+ mkdirSync6(guidesDir, { recursive: true });
4038
+ for (const item of epicItems) {
4039
+ const contractPath = join12(projectDir, ".codeharness", "contracts", `document-${item.key}.json`);
4040
+ if (existsSync15(contractPath)) {
4041
+ const contractData = JSON.parse(readFileSync13(contractPath, "utf-8"));
4042
+ if (contractData.output) {
4043
+ const guidePath = join12(guidesDir, `${item.key}-guide.md`);
4044
+ writeFileSync8(guidePath, contractData.output, "utf-8");
4045
+ guideFiles.push(guidePath);
4046
+ }
4047
+ }
4048
+ }
4049
+ } catch {
4050
+ }
4051
+ }
4033
4052
  try {
4034
- const dr = await dispatchTaskWithResult(task, taskName, epicSentinel, definition, state, config, void 0, lastOutputContract ?? void 0);
4053
+ const dr = await dispatchTaskWithResult(task, taskName, epicSentinel, definition, state, config, void 0, lastOutputContract ?? void 0, guideFiles);
4035
4054
  state = dr.updatedState;
4036
4055
  lastOutputContract = dr.contract;
4037
4056
  propagateVerifyFlags(taskName, dr.contract, projectDir);
@@ -4050,6 +4069,14 @@ async function executeWorkflow(config) {
4050
4069
  if (err instanceof DispatchError && HALT_ERROR_CODES.has(err.code)) {
4051
4070
  halted = true;
4052
4071
  }
4072
+ } finally {
4073
+ if (guideFiles.length > 0) {
4074
+ const guidesDir = join12(projectDir, ".codeharness", "verify-guides");
4075
+ try {
4076
+ rmSync2(guidesDir, { recursive: true, force: true });
4077
+ } catch {
4078
+ }
4079
+ }
4053
4080
  }
4054
4081
  }
4055
4082
  if (!halted) {
@@ -4115,7 +4142,7 @@ import { join as join14 } from "path";
4115
4142
 
4116
4143
  // src/lib/cross-worktree-validator.ts
4117
4144
  import { exec } from "child_process";
4118
- import { appendFileSync as appendFileSync2, mkdirSync as mkdirSync6 } from "fs";
4145
+ import { appendFileSync as appendFileSync2, mkdirSync as mkdirSync7 } from "fs";
4119
4146
  import { join as join13 } from "path";
4120
4147
  import { promisify } from "util";
4121
4148
  var execAsync = promisify(exec);
@@ -4151,7 +4178,7 @@ function writeMergeTelemetry(opts, result) {
4151
4178
  errors: result.valid ? [] : ["Test suite failed after merge"]
4152
4179
  };
4153
4180
  const dir = join13(opts.cwd, TELEMETRY_DIR2);
4154
- mkdirSync6(dir, { recursive: true });
4181
+ mkdirSync7(dir, { recursive: true });
4155
4182
  appendFileSync2(join13(dir, TELEMETRY_FILE2), JSON.stringify(entry) + "\n");
4156
4183
  } catch {
4157
4184
  }
@@ -5552,11 +5579,11 @@ function startRenderer(options) {
5552
5579
  let lastStoryKey = state.sprintInfo?.storyKey ?? null;
5553
5580
  const pendingStoryCosts = /* @__PURE__ */ new Map();
5554
5581
  let cleaned = false;
5582
+ process.stdout.write("\x1B[2J\x1B[H");
5555
5583
  const onQuit = options?.onQuit;
5556
5584
  const inkInstance = inkRender(/* @__PURE__ */ jsx9(App, { state, onCycleLane: () => cycleLane(), onQuit: onQuit ? () => onQuit() : void 0 }), {
5557
5585
  exitOnCtrlC: false,
5558
- patchConsole: false,
5559
- // Disable console patching to prevent flicker
5586
+ patchConsole: !options?._forceTTY,
5560
5587
  maxFps: 10
5561
5588
  });
5562
5589
  function rerender() {
@@ -6199,10 +6226,11 @@ function registerRunCommand(program) {
6199
6226
  currentTaskName = event.taskName;
6200
6227
  const inLoop = inEpicPhase && epicLoopTasks.has(event.taskName) && taskStates[event.taskName] === "done";
6201
6228
  const stateKey = inLoop ? `loop:${event.taskName}` : event.taskName;
6202
- const epicId = extractEpicId2(event.storyKey);
6229
+ const epicId = event.storyKey.startsWith("__epic_") ? event.storyKey.replace("__epic_", "").replace("__", "") : extractEpicId2(event.storyKey);
6230
+ const displayStoryKey = event.storyKey.startsWith("__epic_") ? `Epic ${epicId}` : event.storyKey;
6203
6231
  const epic = epicData[epicId];
6204
6232
  renderer.updateSprintState({
6205
- storyKey: event.storyKey,
6233
+ storyKey: displayStoryKey,
6206
6234
  phase: event.taskName,
6207
6235
  done: storiesDone,
6208
6236
  total: counts.total,
@@ -6398,22 +6426,6 @@ import { readFileSync as readFileSync24 } from "fs";
6398
6426
 
6399
6427
  // src/modules/verify/proof.ts
6400
6428
  import { existsSync as existsSync18, readFileSync as readFileSync15 } from "fs";
6401
-
6402
- // src/modules/verify/types.ts
6403
- var TIER_HIERARCHY = [
6404
- "test-provable",
6405
- "runtime-provable",
6406
- "environment-provable",
6407
- "escalate"
6408
- ];
6409
- var LEGACY_TIER_MAP = {
6410
- "cli-verifiable": "test-provable",
6411
- "integration-required": "environment-provable",
6412
- "unit-testable": "test-provable",
6413
- "black-box": "environment-provable"
6414
- };
6415
-
6416
- // src/modules/verify/proof.ts
6417
6429
  function classifyEvidenceCommands(proofContent) {
6418
6430
  const results = [];
6419
6431
  const codeBlockPattern = /```(?:bash|shell)\n([\s\S]*?)```/g;
@@ -6503,15 +6515,7 @@ function validateProofQuality(proofPath) {
6503
6515
  return emptyResult;
6504
6516
  }
6505
6517
  const content = readFileSync15(proofPath, "utf-8");
6506
- const allTierNames = [...TIER_HIERARCHY, ...Object.keys(LEGACY_TIER_MAP)];
6507
- const uniqueTierNames = [...new Set(allTierNames)];
6508
- const tierPattern = new RegExp(`\\*\\*Tier:\\*\\*\\s*(${uniqueTierNames.join("|")})`, "i");
6509
- const bbTierMatch = tierPattern.exec(content);
6510
- const rawTierValue = bbTierMatch ? bbTierMatch[1].toLowerCase() : null;
6511
- const normalizedTier = rawTierValue ? LEGACY_TIER_MAP[rawTierValue] ?? (TIER_HIERARCHY.includes(rawTierValue) ? rawTierValue : null) : null;
6512
- const skipDockerEnforcement = normalizedTier !== null && normalizedTier !== "environment-provable";
6513
- const bbRawEnforcement = checkBlackBoxEnforcement(content);
6514
- const bbEnforcement = skipDockerEnforcement ? { ...bbRawEnforcement, blackBoxPass: true } : bbRawEnforcement;
6518
+ const bbEnforcement = checkBlackBoxEnforcement(content);
6515
6519
  function buildResult(base) {
6516
6520
  const basePassed = base.pending === 0 && base.verified > 0;
6517
6521
  return {
@@ -6651,7 +6655,7 @@ function validateProofQuality(proofPath) {
6651
6655
 
6652
6656
  // src/modules/verify/orchestrator.ts
6653
6657
  import { execFileSync } from "child_process";
6654
- import { mkdirSync as mkdirSync8, writeFileSync as writeFileSync9 } from "fs";
6658
+ import { mkdirSync as mkdirSync9, writeFileSync as writeFileSync10 } from "fs";
6655
6659
  import { join as join20 } from "path";
6656
6660
 
6657
6661
  // src/lib/doc-health/types.ts
@@ -7113,10 +7117,10 @@ function checkAgentsMdLineCount(filePath, docPath, documents) {
7113
7117
  // src/lib/doc-health/report.ts
7114
7118
  import {
7115
7119
  existsSync as existsSync21,
7116
- mkdirSync as mkdirSync7,
7120
+ mkdirSync as mkdirSync8,
7117
7121
  readFileSync as readFileSync18,
7118
7122
  unlinkSync as unlinkSync2,
7119
- writeFileSync as writeFileSync8
7123
+ writeFileSync as writeFileSync9
7120
7124
  } from "fs";
7121
7125
  import { join as join19 } from "path";
7122
7126
  function printDocHealthOutput(report) {
@@ -7152,9 +7156,9 @@ function completeExecPlan(storyId, dir) {
7152
7156
  Completed: ${timestamp}`
7153
7157
  );
7154
7158
  const completedDir = join19(root, "docs", "exec-plans", "completed");
7155
- mkdirSync7(completedDir, { recursive: true });
7159
+ mkdirSync8(completedDir, { recursive: true });
7156
7160
  const completedPath = join19(completedDir, `${storyId}.md`);
7157
- writeFileSync8(completedPath, content, "utf-8");
7161
+ writeFileSync9(completedPath, content, "utf-8");
7158
7162
  try {
7159
7163
  unlinkSync2(activePath);
7160
7164
  } catch {
@@ -7196,9 +7200,9 @@ function checkPreconditions(dir, storyId) {
7196
7200
  function createProofDocument(storyId, _storyTitle, _acs, dir) {
7197
7201
  const root = dir ?? process.cwd();
7198
7202
  const verificationDir = join20(root, "verification");
7199
- mkdirSync8(verificationDir, { recursive: true });
7203
+ mkdirSync9(verificationDir, { recursive: true });
7200
7204
  const proofPath = join20(verificationDir, `${storyId}-proof.md`);
7201
- writeFileSync9(proofPath, `# ${storyId} \u2014 Proof
7205
+ writeFileSync10(proofPath, `# ${storyId} \u2014 Proof
7202
7206
 
7203
7207
  Pending: blind evaluator (Epic 6)
7204
7208
  `, "utf-8");
@@ -7261,87 +7265,8 @@ var DB_KEYWORDS = [
7261
7265
  "sql",
7262
7266
  "table"
7263
7267
  ];
7264
- var INTEGRATION_KEYWORDS = [
7265
- "external system",
7266
- "real infrastructure",
7267
- "manual verification"
7268
- ];
7269
- var ESCALATE_KEYWORDS = [
7270
- "physical hardware",
7271
- "manual human",
7272
- "visual inspection by human",
7273
- "paid external service"
7274
- ];
7275
- var RUNTIME_PROVABLE_KEYWORDS = [
7276
- "cli command",
7277
- "api endpoint",
7278
- "http",
7279
- "server",
7280
- "output shows",
7281
- "exit code",
7282
- "binary",
7283
- "runs and produces",
7284
- "cli outputs",
7285
- "when run"
7286
- ];
7287
- var ENVIRONMENT_PROVABLE_KEYWORDS = [
7288
- "docker",
7289
- "container",
7290
- "observability",
7291
- "telemetry",
7292
- "database",
7293
- "queue",
7294
- "distributed",
7295
- "multi-service",
7296
- "end-to-end",
7297
- "victorialogs"
7298
- ];
7299
- var ESCALATE_TIER_KEYWORDS = [
7300
- "physical hardware",
7301
- "human visual",
7302
- "paid service",
7303
- "gpu",
7304
- "manual inspection",
7305
- "physical display"
7306
- ];
7307
7268
 
7308
7269
  // src/modules/verify/parser.ts
7309
- function classifyVerifiability(description) {
7310
- const lower = description.toLowerCase();
7311
- for (const kw of INTEGRATION_KEYWORDS) {
7312
- if (lower.includes(kw)) return "integration-required";
7313
- }
7314
- return "cli-verifiable";
7315
- }
7316
- function classifyStrategy(description) {
7317
- const lower = description.toLowerCase();
7318
- for (const kw of ESCALATE_KEYWORDS) {
7319
- if (lower.includes(kw)) return "escalate";
7320
- }
7321
- return "docker";
7322
- }
7323
- function classifyTier(description) {
7324
- const lower = description.toLowerCase();
7325
- for (const kw of ESCALATE_TIER_KEYWORDS) {
7326
- if (lower.includes(kw)) return "escalate";
7327
- }
7328
- for (const kw of ENVIRONMENT_PROVABLE_KEYWORDS) {
7329
- if (lower.includes(kw)) return "environment-provable";
7330
- }
7331
- for (const kw of RUNTIME_PROVABLE_KEYWORDS) {
7332
- if (lower.includes(kw)) return "runtime-provable";
7333
- }
7334
- return "test-provable";
7335
- }
7336
- var VERIFICATION_TAG_PATTERN = /<!--\s*verification:\s*(cli-verifiable|integration-required|unit-testable|black-box|test-provable|runtime-provable|environment-provable|escalate)\s*-->/;
7337
- function parseVerificationTag(text) {
7338
- const match = VERIFICATION_TAG_PATTERN.exec(text);
7339
- if (!match) return null;
7340
- const raw = match[1];
7341
- const mapped = LEGACY_TIER_MAP[raw] ?? raw;
7342
- if (!TIER_HIERARCHY.includes(mapped)) return null;
7343
- return mapped;
7344
- }
7345
7270
  function classifyAC(description) {
7346
7271
  const lower = description.toLowerCase();
7347
7272
  for (const kw of UI_KEYWORDS) {
@@ -7391,17 +7316,10 @@ function parseStoryACs(storyFilePath) {
7391
7316
  if (currentId !== null && currentDesc.length > 0) {
7392
7317
  const description = currentDesc.join(" ").trim();
7393
7318
  if (description) {
7394
- const tag = parseVerificationTag(description);
7395
- const tier = tag ?? classifyTier(description);
7396
- const verifiability = classifyVerifiability(description);
7397
- const strategy = classifyStrategy(description);
7398
7319
  acs.push({
7399
7320
  id: currentId,
7400
7321
  description,
7401
- type: classifyAC(description),
7402
- verifiability,
7403
- strategy,
7404
- tier
7322
+ type: classifyAC(description)
7405
7323
  });
7406
7324
  } else {
7407
7325
  warn(`Skipping malformed AC #${currentId}: empty description`);
@@ -7579,7 +7497,7 @@ function normalizeSeverity(severity) {
7579
7497
  }
7580
7498
 
7581
7499
  // src/modules/observability/coverage.ts
7582
- import { readFileSync as readFileSync20, writeFileSync as writeFileSync10, renameSync as renameSync3, existsSync as existsSync24 } from "fs";
7500
+ import { readFileSync as readFileSync20, writeFileSync as writeFileSync11, renameSync as renameSync3, existsSync as existsSync24 } from "fs";
7583
7501
  import { join as join22 } from "path";
7584
7502
  var STATE_FILE2 = "sprint-state.json";
7585
7503
  var DEFAULT_STATIC_TARGET = 80;
@@ -7668,7 +7586,7 @@ function parseGapArray(raw) {
7668
7586
  }
7669
7587
 
7670
7588
  // src/modules/observability/runtime-coverage.ts
7671
- import { readFileSync as readFileSync21, writeFileSync as writeFileSync11, renameSync as renameSync4, existsSync as existsSync25 } from "fs";
7589
+ import { readFileSync as readFileSync21, writeFileSync as writeFileSync12, renameSync as renameSync4, existsSync as existsSync25 } from "fs";
7672
7590
  import { join as join23 } from "path";
7673
7591
 
7674
7592
  // src/modules/observability/coverage-gate.ts
@@ -8510,7 +8428,7 @@ function getACById(id) {
8510
8428
 
8511
8429
  // src/modules/verify/validation-runner.ts
8512
8430
  import { execSync as execSync5 } from "child_process";
8513
- import { writeFileSync as writeFileSync12, mkdirSync as mkdirSync9 } from "fs";
8431
+ import { writeFileSync as writeFileSync13, mkdirSync as mkdirSync10 } from "fs";
8514
8432
  import { join as join25, dirname as dirname3 } from "path";
8515
8433
  var MAX_VALIDATION_ATTEMPTS = 10;
8516
8434
  var AC_COMMAND_TIMEOUT_MS = 3e4;
@@ -8663,8 +8581,8 @@ function createFixStory(ac, error) {
8663
8581
  "Fix the root cause so the validation command passes.",
8664
8582
  ""
8665
8583
  ].join("\n");
8666
- mkdirSync9(dirname3(storyPath), { recursive: true });
8667
- writeFileSync12(storyPath, markdown, "utf-8");
8584
+ mkdirSync10(dirname3(storyPath), { recursive: true });
8585
+ writeFileSync13(storyPath, markdown, "utf-8");
8668
8586
  return ok2(storyKey);
8669
8587
  } catch (err) {
8670
8588
  const msg = err instanceof Error ? err.message : String(err);
@@ -8990,7 +8908,7 @@ function runValidationCycle() {
8990
8908
 
8991
8909
  // src/modules/verify/env.ts
8992
8910
  import { execFileSync as execFileSync5 } from "child_process";
8993
- import { existsSync as existsSync27, mkdirSync as mkdirSync10, readdirSync as readdirSync7, readFileSync as readFileSync23, writeFileSync as writeFileSync13, cpSync, rmSync as rmSync2, statSync as statSync6 } from "fs";
8911
+ import { existsSync as existsSync27, mkdirSync as mkdirSync11, readdirSync as readdirSync7, readFileSync as readFileSync23, writeFileSync as writeFileSync14, cpSync, rmSync as rmSync3, statSync as statSync6 } from "fs";
8994
8912
  import { join as join27, basename as basename2 } from "path";
8995
8913
  import { createHash } from "crypto";
8996
8914
 
@@ -9139,7 +9057,7 @@ function buildNodeImage(projectDir) {
9139
9057
  const tarballName = basename2(lastLine);
9140
9058
  const tarballPath = join27("/tmp", tarballName);
9141
9059
  const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
9142
- mkdirSync10(buildContext, { recursive: true });
9060
+ mkdirSync11(buildContext, { recursive: true });
9143
9061
  try {
9144
9062
  cpSync(tarballPath, join27(buildContext, tarballName));
9145
9063
  const dockerfile = generateVerifyDockerfile(projectDir) + `
@@ -9148,15 +9066,15 @@ ARG TARBALL=package.tgz
9148
9066
  COPY \${TARBALL} /tmp/\${TARBALL}
9149
9067
  RUN npm install -g /tmp/\${TARBALL} && rm /tmp/\${TARBALL}
9150
9068
  `;
9151
- writeFileSync13(join27(buildContext, "Dockerfile"), dockerfile);
9069
+ writeFileSync14(join27(buildContext, "Dockerfile"), dockerfile);
9152
9070
  execFileSync5("docker", ["build", "-t", IMAGE_TAG, "--build-arg", `TARBALL=${tarballName}`, "."], {
9153
9071
  cwd: buildContext,
9154
9072
  stdio: "pipe",
9155
9073
  timeout: 12e4
9156
9074
  });
9157
9075
  } finally {
9158
- rmSync2(buildContext, { recursive: true, force: true });
9159
- rmSync2(tarballPath, { force: true });
9076
+ rmSync3(buildContext, { recursive: true, force: true });
9077
+ rmSync3(tarballPath, { force: true });
9160
9078
  }
9161
9079
  }
9162
9080
  function buildPythonImage(projectDir) {
@@ -9167,7 +9085,7 @@ function buildPythonImage(projectDir) {
9167
9085
  }
9168
9086
  const distFile = distFiles.filter((f) => f.endsWith(".tar.gz"))[0] ?? distFiles[0];
9169
9087
  const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
9170
- mkdirSync10(buildContext, { recursive: true });
9088
+ mkdirSync11(buildContext, { recursive: true });
9171
9089
  try {
9172
9090
  cpSync(join27(distDir, distFile), join27(buildContext, distFile));
9173
9091
  const dockerfile = generateVerifyDockerfile(projectDir) + `
@@ -9175,14 +9093,14 @@ function buildPythonImage(projectDir) {
9175
9093
  COPY ${distFile} /tmp/${distFile}
9176
9094
  RUN pip install --break-system-packages /tmp/${distFile} && rm /tmp/${distFile}
9177
9095
  `;
9178
- writeFileSync13(join27(buildContext, "Dockerfile"), dockerfile);
9096
+ writeFileSync14(join27(buildContext, "Dockerfile"), dockerfile);
9179
9097
  execFileSync5("docker", ["build", "-t", IMAGE_TAG, "."], {
9180
9098
  cwd: buildContext,
9181
9099
  stdio: "pipe",
9182
9100
  timeout: 12e4
9183
9101
  });
9184
9102
  } finally {
9185
- rmSync2(buildContext, { recursive: true, force: true });
9103
+ rmSync3(buildContext, { recursive: true, force: true });
9186
9104
  }
9187
9105
  }
9188
9106
  function prepareVerifyWorkspace(storyKey, projectDir) {
@@ -9193,8 +9111,8 @@ function prepareVerifyWorkspace(storyKey, projectDir) {
9193
9111
  const storyFile = join27(root, STORY_DIR, `${storyKey}.md`);
9194
9112
  if (!existsSync27(storyFile)) throw new Error(`Story file not found: ${storyFile}`);
9195
9113
  const workspace = `${TEMP_PREFIX}${storyKey}`;
9196
- if (existsSync27(workspace)) rmSync2(workspace, { recursive: true, force: true });
9197
- mkdirSync10(workspace, { recursive: true });
9114
+ if (existsSync27(workspace)) rmSync3(workspace, { recursive: true, force: true });
9115
+ mkdirSync11(workspace, { recursive: true });
9198
9116
  cpSync(storyFile, join27(workspace, "story.md"));
9199
9117
  const readmePath = join27(root, "README.md");
9200
9118
  if (existsSync27(readmePath)) cpSync(readmePath, join27(workspace, "README.md"));
@@ -9202,7 +9120,7 @@ function prepareVerifyWorkspace(storyKey, projectDir) {
9202
9120
  if (existsSync27(docsDir) && statSync6(docsDir).isDirectory()) {
9203
9121
  cpSync(docsDir, join27(workspace, "docs"), { recursive: true });
9204
9122
  }
9205
- mkdirSync10(join27(workspace, "verification"), { recursive: true });
9123
+ mkdirSync11(join27(workspace, "verification"), { recursive: true });
9206
9124
  return workspace;
9207
9125
  }
9208
9126
  function checkVerifyEnv() {
@@ -9244,7 +9162,7 @@ function cleanupVerifyEnv(storyKey) {
9244
9162
  }
9245
9163
  const workspace = `${TEMP_PREFIX}${storyKey}`;
9246
9164
  const containerName = `codeharness-verify-${storyKey}`;
9247
- if (existsSync27(workspace)) rmSync2(workspace, { recursive: true, force: true });
9165
+ if (existsSync27(workspace)) rmSync3(workspace, { recursive: true, force: true });
9248
9166
  try {
9249
9167
  execFileSync5("docker", ["stop", containerName], { stdio: "pipe", timeout: 15e3 });
9250
9168
  } catch {
@@ -9256,7 +9174,7 @@ function cleanupVerifyEnv(storyKey) {
9256
9174
  }
9257
9175
  function buildPluginImage(projectDir) {
9258
9176
  const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
9259
- mkdirSync10(buildContext, { recursive: true });
9177
+ mkdirSync11(buildContext, { recursive: true });
9260
9178
  try {
9261
9179
  const pluginDir = join27(projectDir, ".claude-plugin");
9262
9180
  cpSync(pluginDir, join27(buildContext, ".claude-plugin"), { recursive: true });
@@ -9266,28 +9184,28 @@ function buildPluginImage(projectDir) {
9266
9184
  cpSync(src, join27(buildContext, dir), { recursive: true });
9267
9185
  }
9268
9186
  }
9269
- writeFileSync13(join27(buildContext, "Dockerfile"), generateVerifyDockerfile(projectDir));
9187
+ writeFileSync14(join27(buildContext, "Dockerfile"), generateVerifyDockerfile(projectDir));
9270
9188
  execFileSync5("docker", ["build", "-t", IMAGE_TAG, "."], {
9271
9189
  cwd: buildContext,
9272
9190
  stdio: "pipe",
9273
9191
  timeout: 12e4
9274
9192
  });
9275
9193
  } finally {
9276
- rmSync2(buildContext, { recursive: true, force: true });
9194
+ rmSync3(buildContext, { recursive: true, force: true });
9277
9195
  }
9278
9196
  }
9279
9197
  function buildSimpleImage(projectDir, timeout = 12e4) {
9280
9198
  const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
9281
- mkdirSync10(buildContext, { recursive: true });
9199
+ mkdirSync11(buildContext, { recursive: true });
9282
9200
  try {
9283
- writeFileSync13(join27(buildContext, "Dockerfile"), generateVerifyDockerfile(projectDir));
9201
+ writeFileSync14(join27(buildContext, "Dockerfile"), generateVerifyDockerfile(projectDir));
9284
9202
  execFileSync5("docker", ["build", "-t", IMAGE_TAG, "."], {
9285
9203
  cwd: buildContext,
9286
9204
  stdio: "pipe",
9287
9205
  timeout
9288
9206
  });
9289
9207
  } finally {
9290
- rmSync2(buildContext, { recursive: true, force: true });
9208
+ rmSync3(buildContext, { recursive: true, force: true });
9291
9209
  }
9292
9210
  }
9293
9211
  function dockerImageExists(tag) {
@@ -10885,7 +10803,7 @@ function formatAuditJson(result) {
10885
10803
  }
10886
10804
 
10887
10805
  // src/modules/audit/fix-generator.ts
10888
- import { existsSync as existsSync34, writeFileSync as writeFileSync14, mkdirSync as mkdirSync11 } from "fs";
10806
+ import { existsSync as existsSync34, writeFileSync as writeFileSync15, mkdirSync as mkdirSync12 } from "fs";
10889
10807
  import { join as join33, dirname as dirname5 } from "path";
10890
10808
  function buildStoryKey(gap2, index) {
10891
10809
  const safeDimension = gap2.dimension.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/(^-|-$)/g, "");
@@ -10940,8 +10858,8 @@ function generateFixStories(auditResult) {
10940
10858
  continue;
10941
10859
  }
10942
10860
  const markdown = buildStoryMarkdown(gap2, key);
10943
- mkdirSync11(dirname5(filePath), { recursive: true });
10944
- writeFileSync14(filePath, markdown, "utf-8");
10861
+ mkdirSync12(dirname5(filePath), { recursive: true });
10862
+ writeFileSync15(filePath, markdown, "utf-8");
10945
10863
  stories.push({ key, filePath, gap: gap2, skipped: false });
10946
10864
  created++;
10947
10865
  }
@@ -11117,7 +11035,7 @@ function registerOnboardCommand(program) {
11117
11035
  }
11118
11036
 
11119
11037
  // src/commands/teardown.ts
11120
- import { existsSync as existsSync35, unlinkSync as unlinkSync3, readFileSync as readFileSync29, writeFileSync as writeFileSync15, rmSync as rmSync3 } from "fs";
11038
+ import { existsSync as existsSync35, unlinkSync as unlinkSync3, readFileSync as readFileSync29, writeFileSync as writeFileSync16, rmSync as rmSync4 } from "fs";
11121
11039
  import { join as join34 } from "path";
11122
11040
  function buildDefaultResult() {
11123
11041
  return {
@@ -11164,7 +11082,7 @@ function registerTeardownCommand(program) {
11164
11082
  } else if (otlpMode === "remote-routed") {
11165
11083
  if (!options.keepDocker) {
11166
11084
  try {
11167
- const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-UY37PFPB.js");
11085
+ const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-Y73EO7Z4.js");
11168
11086
  stopCollectorOnly2();
11169
11087
  result.docker.stopped = true;
11170
11088
  if (!isJson) {
@@ -11196,7 +11114,7 @@ function registerTeardownCommand(program) {
11196
11114
  info("Shared stack: kept running (other projects may use it)");
11197
11115
  }
11198
11116
  } else if (isLegacyStack) {
11199
- const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-UY37PFPB.js");
11117
+ const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-Y73EO7Z4.js");
11200
11118
  let stackRunning = false;
11201
11119
  try {
11202
11120
  stackRunning = isStackRunning2(composeFile);
@@ -11266,7 +11184,7 @@ function registerTeardownCommand(program) {
11266
11184
  for (const key of keysToRemove) {
11267
11185
  delete scripts[key];
11268
11186
  }
11269
- writeFileSync15(pkgPath, JSON.stringify(pkg, null, 2) + "\n", "utf-8");
11187
+ writeFileSync16(pkgPath, JSON.stringify(pkg, null, 2) + "\n", "utf-8");
11270
11188
  result.otlp_cleaned = true;
11271
11189
  if (!isJson) {
11272
11190
  ok("OTLP: removed instrumented scripts from package.json");
@@ -11294,7 +11212,7 @@ function registerTeardownCommand(program) {
11294
11212
  }
11295
11213
  const harnessDir = join34(projectDir, ".harness");
11296
11214
  if (existsSync35(harnessDir)) {
11297
- rmSync3(harnessDir, { recursive: true, force: true });
11215
+ rmSync4(harnessDir, { recursive: true, force: true });
11298
11216
  result.removed.push(".harness/");
11299
11217
  if (!isJson) {
11300
11218
  ok("Removed: .harness/");
@@ -12100,7 +12018,7 @@ function isDuplicate(newItem, existingTitles, threshold = 0.8) {
12100
12018
  }
12101
12019
 
12102
12020
  // src/lib/issue-tracker.ts
12103
- import { existsSync as existsSync36, readFileSync as readFileSync30, writeFileSync as writeFileSync16, mkdirSync as mkdirSync12 } from "fs";
12021
+ import { existsSync as existsSync36, readFileSync as readFileSync30, writeFileSync as writeFileSync17, mkdirSync as mkdirSync13 } from "fs";
12104
12022
  import { join as join35 } from "path";
12105
12023
  import { parse as parse6, stringify as stringify3 } from "yaml";
12106
12024
  var VALID_PRIORITIES = /* @__PURE__ */ new Set([
@@ -12129,9 +12047,9 @@ function writeIssues(data, dir = process.cwd()) {
12129
12047
  const filePath = issuesPath(dir);
12130
12048
  const dirPath = join35(dir, ".codeharness");
12131
12049
  if (!existsSync36(dirPath)) {
12132
- mkdirSync12(dirPath, { recursive: true });
12050
+ mkdirSync13(dirPath, { recursive: true });
12133
12051
  }
12134
- writeFileSync16(filePath, stringify3(data, { nullStr: "" }), "utf-8");
12052
+ writeFileSync17(filePath, stringify3(data, { nullStr: "" }), "utf-8");
12135
12053
  }
12136
12054
  function nextIssueId(existing) {
12137
12055
  let max = 0;
@@ -13117,7 +13035,7 @@ function registerAuditCommand(program) {
13117
13035
  }
13118
13036
 
13119
13037
  // src/commands/stats.ts
13120
- import { existsSync as existsSync39, readdirSync as readdirSync10, readFileSync as readFileSync32, writeFileSync as writeFileSync17 } from "fs";
13038
+ import { existsSync as existsSync39, readdirSync as readdirSync10, readFileSync as readFileSync32, writeFileSync as writeFileSync18 } from "fs";
13121
13039
  import { join as join38 } from "path";
13122
13040
  var RATES = {
13123
13041
  input: 15,
@@ -13327,7 +13245,7 @@ function registerStatsCommand(program) {
13327
13245
  console.log(formatted);
13328
13246
  if (options.save) {
13329
13247
  const outPath = join38(projectDir, "_bmad-output", "implementation-artifacts", "cost-report.md");
13330
- writeFileSync17(outPath, formatted, "utf-8");
13248
+ writeFileSync18(outPath, formatted, "utf-8");
13331
13249
  ok(`Report saved to ${outPath}`);
13332
13250
  }
13333
13251
  });
@@ -14183,7 +14101,7 @@ function registerDriversCommand(program) {
14183
14101
  }
14184
14102
 
14185
14103
  // src/index.ts
14186
- var VERSION = true ? "0.32.3" : "0.0.0-dev";
14104
+ var VERSION = true ? "0.33.0" : "0.0.0-dev";
14187
14105
  function createProgram() {
14188
14106
  const program = new Command();
14189
14107
  program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codeharness",
3
- "version": "0.32.3",
3
+ "version": "0.33.0",
4
4
  "type": "module",
5
5
  "description": "CLI for codeharness — makes autonomous coding agents produce software that actually works",
6
6
  "bin": {
@@ -3,8 +3,8 @@
3
3
  Dev agents repeatedly shipped code without reading module conventions (AGENTS.md),
4
4
  skipped observability checks, and produced features that could not be verified
5
5
  from outside the source tree. This patch enforces architecture awareness,
6
- observability validation, documentation hygiene, test coverage gates, and
7
- verification tier awareness — all operational failures observed in prior sprints.
6
+ observability validation, documentation hygiene, and test coverage gates
7
+ — all operational failures observed in prior sprints.
8
8
  (FR33, FR34, NFR20)
9
9
 
10
10
  ## Codeharness Development Enforcement
@@ -35,23 +35,14 @@ After running tests, verify telemetry is flowing:
35
35
  - Coverage gate: 100% of new/changed code
36
36
  - Run `npm test` / `pytest` and verify no regressions
37
37
 
38
- ### Verification Tier Awareness
38
+ ### Verification Readiness
39
39
 
40
- Write code that can be verified at the appropriate tier. The four verification tiers determine what evidence is needed to prove an AC works:
40
+ Write code that can be verified via Docker-based blind verification. Ask yourself:
41
+ - Are my functions testable and my outputs greppable?
42
+ - Can I run the CLI/server and verify output?
43
+ - Does `docker exec` work? Are logs flowing to the observability stack?
41
44
 
42
- - **`test-provable`** Code must be testable via `npm test` / `npm run build`. Ensure functions have test coverage, outputs are greppable, and build artifacts are inspectable. No running app required.
43
- - **`runtime-provable`** — Code must be exercisable via CLI or local server. Ensure the binary/CLI produces verifiable stdout, exit codes, or HTTP responses without needing Docker.
44
- - **`environment-provable`** — Code must work in a Docker verification environment. Ensure the Dockerfile is current, services start correctly, and `docker exec` can exercise the feature. Observability queries should return expected log/trace events.
45
- - **`escalate`** — Reserved for ACs that genuinely cannot be automated (physical hardware, paid external APIs). This is rare — exhaust all automated approaches first.
46
-
47
- Ask yourself:
48
- - What tier is this story tagged with?
49
- - Does my implementation produce the evidence that tier requires?
50
- - If `test-provable`: are my functions testable and my outputs greppable?
51
- - If `runtime-provable`: can I run the CLI/server and verify output locally?
52
- - If `environment-provable`: does `docker exec` work? Are logs flowing to the observability stack?
53
-
54
- If the answer is "no", the feature has a testability gap — fix the code to be verifiable at the appropriate tier.
45
+ If the answer is "no", the feature has a testability gap — fix the code to be verifiable.
55
46
 
56
47
  ### Dockerfile Maintenance
57
48
 
@@ -20,7 +20,7 @@ quality trends, and mandatory concrete action items with owners.
20
20
 
21
21
  - Did the verifier hang on permissions? (check for `--allowedTools` issues)
22
22
  - Did stories get stuck in verify→dev loops? (check `attempts` counter)
23
- - Were stories assigned the wrong verification tier?
23
+ - Were stories assigned the wrong verification method?
24
24
  - Did the verify parser correctly detect `[FAIL]` verdicts?
25
25
 
26
26
  ### Documentation Health
@@ -1,10 +1,10 @@
1
1
  ## WHY
2
2
 
3
3
  Review agents approved stories without verifying proof documents existed or
4
- checking that evidence matched the story's verification tier. Stories passed review
5
- with fabricated output and missing coverage data. This patch enforces proof
6
- existence, tier-appropriate evidence quality, and coverage delta reporting as hard
7
- gates before a story can leave review.
4
+ checking that evidence was real. Stories passed review with fabricated output
5
+ and missing coverage data. This patch enforces proof existence, evidence
6
+ quality, and coverage delta reporting as hard gates before a story can leave
7
+ review.
8
8
  (FR33, FR34, NFR20)
9
9
 
10
10
  ## Codeharness Review Gates
@@ -18,34 +18,12 @@ gates before a story can leave review.
18
18
 
19
19
  ### Proof Quality Checks
20
20
 
21
- The proof must pass tier-appropriate evidence enforcement. The required evidence depends on the story's verification tier:
22
-
23
- #### `test-provable` stories
24
- - Evidence comes from build output, test results, and grep/read of code or generated artifacts
25
- - `npm test` / `npm run build` output is the primary evidence
26
- - Source-level assertions (grep against `src/`) are acceptable — this IS the verification method for this tier
27
- - `docker exec` evidence is NOT required
28
- - Each AC section must show actual test output or build results
29
-
30
- #### `runtime-provable` stories
31
- - Evidence comes from running the actual binary, CLI, or server
32
- - Process execution output (stdout, stderr, exit codes) is the primary evidence
33
- - HTTP responses from a locally running server are acceptable
34
- - `docker exec` evidence is NOT required
35
- - Each AC section must show actual command execution and output
36
-
37
- #### `environment-provable` stories
38
21
  - Commands run via `docker exec` (not direct host access)
39
22
  - Less than 50% of evidence commands are `grep` against `src/`
40
23
  - Each AC section has at least one `docker exec`, `docker ps/logs`, or observability query
41
24
  - `[FAIL]` verdicts outside code blocks cause the proof to fail
42
25
  - `[ESCALATE]` is acceptable only when all automated approaches are exhausted
43
26
 
44
- #### `escalate` stories
45
- - Human judgment is required — automated evidence may be partial or absent
46
- - Proof document must explain why automation is not possible
47
- - `[ESCALATE]` verdict is expected and acceptable
48
-
49
27
  ### Observability
50
28
 
51
29
  Run `semgrep scan --config patches/observability/ --config patches/error-handling/ --json` against changed files and report gaps.
@@ -1,49 +1,25 @@
1
1
  ## WHY
2
2
 
3
3
  Stories were marked "done" with no proof artifact, or with proofs that only
4
- grepped source code instead of exercising the feature at the appropriate
5
- verification tier. This patch mandates tier-appropriate proof documents,
6
- verification tags per AC, and test coverage targets — preventing regressions
7
- from being hidden behind inadequate evidence.
4
+ grepped source code instead of exercising the feature. This patch mandates
5
+ proof documents with real evidence, and test coverage targets — preventing
6
+ regressions from being hidden behind inadequate evidence.
8
7
  (FR33, FR36, NFR20)
9
8
 
10
9
  ## Verification Requirements
11
10
 
12
- Every story must produce a **proof document** with evidence appropriate to its verification tier.
11
+ Every story must produce a **proof document** with real evidence from Docker-based blind verification.
13
12
 
14
13
  ### Proof Standard
15
14
 
16
15
  - Proof document at `verification/<story-key>-proof.md`
17
- - Each AC gets a `## AC N:` section with tier-appropriate evidence and captured output
16
+ - Each AC gets a `## AC N:` section with evidence and captured output
18
17
  - `[FAIL]` = AC failed with evidence showing what went wrong
19
18
  - `[ESCALATE]` = AC genuinely cannot be automated (last resort — try everything first)
20
19
 
21
- **Tier-dependent evidence rules:**
22
-
23
- - **`test-provable`** — Evidence comes from build + test output + grep/read of code or artifacts. Run `npm test` or `npm run build`, capture results. Source-level assertions are the primary verification method. No running app or Docker required.
24
- - **`runtime-provable`** — Evidence comes from running the actual binary/server and interacting with it. Start the process, make requests or run commands, capture stdout/stderr/exit codes. No Docker stack required.
25
- - **`environment-provable`** — Evidence comes from `docker exec` commands and observability queries. Full Docker verification environment required. Each AC section needs at least one `docker exec`, `docker ps/logs`, or observability query. Evidence must come from running the installed CLI/tool in Docker, not from grepping source.
26
- - **`escalate`** — Human judgment required. Document why automation is not possible. `[ESCALATE]` verdict is expected.
27
-
28
- ### Verification Tags
29
-
30
- For each AC, append a tag indicating its verification tier:
31
- - `<!-- verification: test-provable -->` — Can be verified by building and running tests. Evidence: build output, test results, grep/read of code. No running app needed.
32
- - `<!-- verification: runtime-provable -->` — Requires running the actual binary/CLI/server. Evidence: process output, HTTP responses, exit codes. No Docker stack needed.
33
- - `<!-- verification: environment-provable -->` — Requires full Docker environment with observability. Evidence: `docker exec` commands, VictoriaLogs queries, multi-service interaction.
34
- - `<!-- verification: escalate -->` — Cannot be automated. Requires human judgment, physical hardware, or paid external services.
35
-
36
- **Decision criteria:**
37
- 1. Can you prove it with `npm test` or `npm run build` alone? → `test-provable`
38
- 2. Do you need to run the actual binary/server locally? → `runtime-provable`
39
- 3. Do you need Docker, external services, or observability? → `environment-provable`
40
- 4. Have you exhausted all automated approaches? → `escalate`
41
-
42
- **Do not over-tag.** Most stories are `test-provable` or `runtime-provable`. Only use `environment-provable` when Docker infrastructure is genuinely needed. Only use `escalate` as a last resort.
43
-
44
20
  ### Observability Evidence
45
21
 
46
- After each `docker exec` command (applicable to `environment-provable` stories), query the observability backend for log events from the last 30 seconds.
22
+ After each `docker exec` command, query the observability backend for log events from the last 30 seconds.
47
23
  Use the configured VictoriaLogs endpoint (default: `http://localhost:9428`):
48
24
 
49
25
  ```bash
@@ -0,0 +1,64 @@
1
+ name: documenter
2
+ role:
3
+ title: Verification Guide Writer
4
+ purpose: Read implementation and write Docker-executable verification guides for blind QA
5
+ persona:
6
+ identity: |
7
+ Technical writer who translates source code into executable verification steps.
8
+ Reads what was built, understands how it works, then writes guides that a blind
9
+ QA agent can follow using only Docker commands.
10
+ communication_style: "Precise, command-oriented. Every verification step is a copy-pasteable command with expected output."
11
+ principles:
12
+ - Every AC must map to a concrete docker exec or curl command
13
+ - Commands must be copy-pasteable — no pseudocode, no placeholders
14
+ - Include the Docker container name in every command
15
+ - 'Expected output must be specific — not "should work" but "prints PASS: hook registered"'
16
+ - Include a Prerequisites section with container name and required services
17
+ prompt_template: |
18
+ ## Role
19
+
20
+ You are writing a verification guide for a blind QA evaluator. The evaluator CANNOT see source code — it can only run Docker commands and observe output.
21
+
22
+ ## Process
23
+
24
+ 1. Read the story spec to understand the acceptance criteria
25
+ 2. Read the implementation source to understand what was built
26
+ 3. Discover the Docker container name: run `docker ps` or read `docker-compose.yml`
27
+ 4. For each AC, write an executable verification step
28
+
29
+ ## Guide Format
30
+
31
+ Write a markdown document with this structure:
32
+
33
+ ```
34
+ # Verification Guide: [Story Title]
35
+
36
+ ## Prerequisites
37
+ - Container: [container name from docker ps]
38
+ - Required services: [list any dependent services]
39
+ - Setup: [any one-time setup commands needed]
40
+
41
+ ## AC 1: [AC description]
42
+ ### Command
43
+ docker exec [container] python -c "from app.module import Class; obj = Class(); result = obj.method(args); assert result == expected; print('PASS: [description]')"
44
+ ### Expected Output
45
+ PASS: [description]
46
+ ### What This Proves
47
+ [One sentence: why this output satisfies the AC]
48
+
49
+ ## AC 2: [AC description]
50
+ ...
51
+ ```
52
+
53
+ ## Rules
54
+
55
+ - Every command must be copy-pasteable into a terminal
56
+ - No pseudocode — use real import paths, real class names, real method signatures
57
+ - For API features: use `curl http://localhost:PORT/endpoint` with expected response body
58
+ - For internal code: use `docker exec [container] python -c "..."` with assertion + print
59
+ - For CLI features: use `docker exec [container] command --args` with expected output
60
+ - If a feature cannot be verified via Docker (e.g., build-time only), state this explicitly with reason
61
+
62
+ ## Output
63
+
64
+ Write the complete verification guide as your response. Do not write to files — the engine captures your output.
@@ -1,7 +1,7 @@
1
1
  name: evaluator
2
2
  role:
3
3
  title: Adversarial QA Evaluator
4
- purpose: Exercise the built artifact and determine if it actually works
4
+ purpose: Exercise the built artifact via Docker and determine if it actually works
5
5
  persona:
6
6
  identity: Senior QA engineer who trusts nothing without evidence. Treats every claim as unverified until proven with concrete output. Assumes code is broken until demonstrated otherwise.
7
7
  communication_style: "Blunt, evidence-first. States what was observed, not what was expected. No softening, no encouragement, no benefit of the doubt."
@@ -22,11 +22,23 @@ disallowedTools:
22
22
  prompt_template: |
23
23
  ## Role
24
24
 
25
- You are verifying acceptance criteria for a software story. Your job is to determine whether each AC actually passes by gathering concrete evidence.
25
+ You are verifying acceptance criteria for an epic. Your job is to determine whether each AC actually passes by running commands and observing output.
26
26
 
27
27
  ## Input
28
28
 
29
- Read acceptance criteria from ./story-files/. Each file contains the ACs to verify. Parse every AC and verify each one independently.
29
+ Read verification guides from ./story-files/. Each guide explains:
30
+ - What was built
31
+ - Docker container name and prerequisites
32
+ - For each AC: an exact command to run and expected output
33
+
34
+ ## Verification Method
35
+
36
+ Use `docker exec`, `docker logs`, `curl`, and other Docker/HTTP commands as described in the guides. Every AC must be verified by:
37
+ 1. Running the exact command from the guide
38
+ 2. Capturing the actual output
39
+ 3. Comparing to expected output
40
+
41
+ You do NOT have access to source code. You verify by exercising the running system via Docker only.
30
42
 
31
43
  ## Anti-Leniency Rules
32
44
 
@@ -35,14 +47,7 @@ prompt_template: |
35
47
  - Every PASS requires commands_run evidence — if you cannot run a command to verify, score UNKNOWN.
36
48
  - UNKNOWN if unable to verify — never guess at outcomes.
37
49
  - Do not infer success from lack of errors. Silence is not evidence.
38
-
39
- ## Tool Access
40
-
41
- You have access to:
42
- - Docker commands: `docker exec`, `docker logs`, `docker ps`
43
- - Observability query endpoints
44
-
45
- You do NOT have access to source code. Do not attempt to read, edit, or write source files. Gather all evidence through runtime observation only.
50
+ - If Docker is not running or the app container is not available, report ALL ACs as UNKNOWN with reason "Docker not available".
46
51
 
47
52
  ## Evidence Requirements
48
53
 
@@ -19,6 +19,11 @@ tasks:
19
19
  session: fresh
20
20
  source_access: true
21
21
  driver: codex
22
+ document:
23
+ agent: documenter
24
+ session: fresh
25
+ source_access: true
26
+ model: claude-opus-4-6
22
27
  verify:
23
28
  agent: evaluator
24
29
  session: fresh
@@ -40,6 +45,7 @@ story_flow:
40
45
  - implement
41
46
  - check
42
47
  - review
48
+ - document
43
49
 
44
50
  epic_flow:
45
51
  - story_flow
@@ -48,5 +54,6 @@ epic_flow:
48
54
  - retry
49
55
  - check
50
56
  - review
57
+ - document
51
58
  - verify
52
59
  - retro