codeharness 0.32.2 → 0.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IICSAAF4.js → chunk-4YPX74BX.js} +1 -1
- package/dist/{docker-GLX24TXX.js → docker-Y73EO7Z4.js} +1 -1
- package/dist/index.js +102 -180
- package/package.json +1 -1
- package/patches/dev/enforcement.md +8 -17
- package/patches/retro/enforcement.md +1 -1
- package/patches/review/enforcement.md +4 -26
- package/patches/verify/story-verification.md +6 -30
- package/templates/agents/documenter.yaml +64 -0
- package/templates/agents/evaluator.yaml +16 -11
- package/templates/workflows/default.yaml +7 -0
|
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
|
|
|
2895
2895
|
}
|
|
2896
2896
|
|
|
2897
2897
|
// src/modules/infra/init-project.ts
|
|
2898
|
-
var HARNESS_VERSION = true ? "0.32.2" : "0.0.0-dev";
|
|
2898
|
+
var HARNESS_VERSION = true ? "0.33.0" : "0.0.0-dev";
|
|
2899
2899
|
function failResult(opts, error) {
|
|
2900
2900
|
return {
|
|
2901
2901
|
status: "fail",
|
package/dist/index.js
CHANGED
|
@@ -40,7 +40,7 @@ import {
|
|
|
40
40
|
validateDockerfile,
|
|
41
41
|
warn,
|
|
42
42
|
writeState
|
|
43
|
-
} from "./chunk-IICSAAF4.js";
|
|
43
|
+
} from "./chunk-4YPX74BX.js";
|
|
44
44
|
|
|
45
45
|
// src/index.ts
|
|
46
46
|
import { Command } from "commander";
|
|
@@ -2507,7 +2507,7 @@ function resolveWorkflow(options) {
|
|
|
2507
2507
|
}
|
|
2508
2508
|
|
|
2509
2509
|
// src/lib/workflow-engine.ts
|
|
2510
|
-
import { readFileSync as readFileSync13, existsSync as existsSync15 } from "fs";
|
|
2510
|
+
import { readFileSync as readFileSync13, existsSync as existsSync15, writeFileSync as writeFileSync8, mkdirSync as mkdirSync6, rmSync as rmSync2 } from "fs";
|
|
2511
2511
|
import { join as join12 } from "path";
|
|
2512
2512
|
import { parse as parse5 } from "yaml";
|
|
2513
2513
|
|
|
@@ -3347,7 +3347,7 @@ async function executeNullTask(task, taskName, storyKey, state, config, previous
|
|
|
3347
3347
|
writeWorkflowState(updatedState, projectDir);
|
|
3348
3348
|
return { updatedState, output: result.output ?? "", contract };
|
|
3349
3349
|
}
|
|
3350
|
-
async function dispatchTaskWithResult(task, taskName, storyKey, definition, state, config, customPrompt, previousOutputContract) {
|
|
3350
|
+
async function dispatchTaskWithResult(task, taskName, storyKey, definition, state, config, customPrompt, previousOutputContract, storyFiles) {
|
|
3351
3351
|
const projectDir = config.projectDir ?? process.cwd();
|
|
3352
3352
|
const traceId = generateTraceId(config.runId, state.iteration, taskName);
|
|
3353
3353
|
const tracePrompt = formatTracePrompt(traceId);
|
|
@@ -3361,7 +3361,7 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
|
|
|
3361
3361
|
let workspace = null;
|
|
3362
3362
|
if (task.source_access === false) {
|
|
3363
3363
|
try {
|
|
3364
|
-
workspace = await createIsolatedWorkspace({ runId: config.runId, storyFiles: [] });
|
|
3364
|
+
workspace = await createIsolatedWorkspace({ runId: config.runId, storyFiles: storyFiles ?? [] });
|
|
3365
3365
|
cwd = workspace?.toDispatchOptions()?.cwd ?? projectDir;
|
|
3366
3366
|
} catch {
|
|
3367
3367
|
cwd = projectDir;
|
|
@@ -3878,7 +3878,7 @@ async function executeWorkflow(config) {
|
|
|
3878
3878
|
}
|
|
3879
3879
|
if (state.phase === "error" || state.phase === "failed") {
|
|
3880
3880
|
const errorCount = state.tasks_completed.filter((t) => t.error).length;
|
|
3881
|
-
info(`Resuming from ${state.phase} state \u2014 ${errorCount} previous error(s), retrying failed tasks`);
|
|
3881
|
+
if (!config.onEvent) info(`Resuming from ${state.phase} state \u2014 ${errorCount} previous error(s), retrying failed tasks`);
|
|
3882
3882
|
}
|
|
3883
3883
|
state = {
|
|
3884
3884
|
...state,
|
|
@@ -3929,13 +3929,17 @@ async function executeWorkflow(config) {
|
|
|
3929
3929
|
for (const [epicId, epicItems] of epicGroups) {
|
|
3930
3930
|
if (halted) break;
|
|
3931
3931
|
if (config.abortSignal?.aborted) {
|
|
3932
|
-
info("Execution interrupted \u2014 saving state");
|
|
3932
|
+
if (!config.onEvent) info("Execution interrupted \u2014 saving state");
|
|
3933
3933
|
state = { ...state, phase: "interrupted" };
|
|
3934
3934
|
writeWorkflowState(state, projectDir);
|
|
3935
3935
|
halted = true;
|
|
3936
3936
|
break;
|
|
3937
3937
|
}
|
|
3938
|
-
|
|
3938
|
+
if (config.onEvent) {
|
|
3939
|
+
config.onEvent({ type: "dispatch-start", taskName: "story_flow", storyKey: `__epic_${epicId}__` });
|
|
3940
|
+
} else {
|
|
3941
|
+
info(`[epic-${epicId}] Starting epic with ${epicItems.length} stories`);
|
|
3942
|
+
}
|
|
3939
3943
|
for (const step of config.workflow.epicFlow) {
|
|
3940
3944
|
if (halted) break;
|
|
3941
3945
|
if (config.abortSignal?.aborted) {
|
|
@@ -4026,8 +4030,27 @@ async function executeWorkflow(config) {
|
|
|
4026
4030
|
}
|
|
4027
4031
|
const epicSentinel = `__epic_${epicId}__`;
|
|
4028
4032
|
if (isTaskCompleted(state, taskName, epicSentinel)) continue;
|
|
4033
|
+
let guideFiles = [];
|
|
4034
|
+
if (task.source_access === false) {
|
|
4035
|
+
const guidesDir = join12(projectDir, ".codeharness", "verify-guides");
|
|
4036
|
+
try {
|
|
4037
|
+
mkdirSync6(guidesDir, { recursive: true });
|
|
4038
|
+
for (const item of epicItems) {
|
|
4039
|
+
const contractPath = join12(projectDir, ".codeharness", "contracts", `document-${item.key}.json`);
|
|
4040
|
+
if (existsSync15(contractPath)) {
|
|
4041
|
+
const contractData = JSON.parse(readFileSync13(contractPath, "utf-8"));
|
|
4042
|
+
if (contractData.output) {
|
|
4043
|
+
const guidePath = join12(guidesDir, `${item.key}-guide.md`);
|
|
4044
|
+
writeFileSync8(guidePath, contractData.output, "utf-8");
|
|
4045
|
+
guideFiles.push(guidePath);
|
|
4046
|
+
}
|
|
4047
|
+
}
|
|
4048
|
+
}
|
|
4049
|
+
} catch {
|
|
4050
|
+
}
|
|
4051
|
+
}
|
|
4029
4052
|
try {
|
|
4030
|
-
const dr = await dispatchTaskWithResult(task, taskName, epicSentinel, definition, state, config, void 0, lastOutputContract ?? void 0);
|
|
4053
|
+
const dr = await dispatchTaskWithResult(task, taskName, epicSentinel, definition, state, config, void 0, lastOutputContract ?? void 0, guideFiles);
|
|
4031
4054
|
state = dr.updatedState;
|
|
4032
4055
|
lastOutputContract = dr.contract;
|
|
4033
4056
|
propagateVerifyFlags(taskName, dr.contract, projectDir);
|
|
@@ -4046,10 +4069,18 @@ async function executeWorkflow(config) {
|
|
|
4046
4069
|
if (err instanceof DispatchError && HALT_ERROR_CODES.has(err.code)) {
|
|
4047
4070
|
halted = true;
|
|
4048
4071
|
}
|
|
4072
|
+
} finally {
|
|
4073
|
+
if (guideFiles.length > 0) {
|
|
4074
|
+
const guidesDir = join12(projectDir, ".codeharness", "verify-guides");
|
|
4075
|
+
try {
|
|
4076
|
+
rmSync2(guidesDir, { recursive: true, force: true });
|
|
4077
|
+
} catch {
|
|
4078
|
+
}
|
|
4079
|
+
}
|
|
4049
4080
|
}
|
|
4050
4081
|
}
|
|
4051
4082
|
if (!halted) {
|
|
4052
|
-
info(`[epic-${epicId}] Epic completed`);
|
|
4083
|
+
if (!config.onEvent) info(`[epic-${epicId}] Epic completed`);
|
|
4053
4084
|
}
|
|
4054
4085
|
}
|
|
4055
4086
|
if (state.phase === "interrupted") {
|
|
@@ -4111,7 +4142,7 @@ import { join as join14 } from "path";
|
|
|
4111
4142
|
|
|
4112
4143
|
// src/lib/cross-worktree-validator.ts
|
|
4113
4144
|
import { exec } from "child_process";
|
|
4114
|
-
import { appendFileSync as appendFileSync2, mkdirSync as
|
|
4145
|
+
import { appendFileSync as appendFileSync2, mkdirSync as mkdirSync7 } from "fs";
|
|
4115
4146
|
import { join as join13 } from "path";
|
|
4116
4147
|
import { promisify } from "util";
|
|
4117
4148
|
var execAsync = promisify(exec);
|
|
@@ -4147,7 +4178,7 @@ function writeMergeTelemetry(opts, result) {
|
|
|
4147
4178
|
errors: result.valid ? [] : ["Test suite failed after merge"]
|
|
4148
4179
|
};
|
|
4149
4180
|
const dir = join13(opts.cwd, TELEMETRY_DIR2);
|
|
4150
|
-
|
|
4181
|
+
mkdirSync7(dir, { recursive: true });
|
|
4151
4182
|
appendFileSync2(join13(dir, TELEMETRY_FILE2), JSON.stringify(entry) + "\n");
|
|
4152
4183
|
} catch {
|
|
4153
4184
|
}
|
|
@@ -5548,11 +5579,11 @@ function startRenderer(options) {
|
|
|
5548
5579
|
let lastStoryKey = state.sprintInfo?.storyKey ?? null;
|
|
5549
5580
|
const pendingStoryCosts = /* @__PURE__ */ new Map();
|
|
5550
5581
|
let cleaned = false;
|
|
5582
|
+
process.stdout.write("\x1B[2J\x1B[H");
|
|
5551
5583
|
const onQuit = options?.onQuit;
|
|
5552
5584
|
const inkInstance = inkRender(/* @__PURE__ */ jsx9(App, { state, onCycleLane: () => cycleLane(), onQuit: onQuit ? () => onQuit() : void 0 }), {
|
|
5553
5585
|
exitOnCtrlC: false,
|
|
5554
|
-
patchConsole:
|
|
5555
|
-
// Disable console patching to prevent flicker
|
|
5586
|
+
patchConsole: !options?._forceTTY,
|
|
5556
5587
|
maxFps: 10
|
|
5557
5588
|
});
|
|
5558
5589
|
function rerender() {
|
|
@@ -6195,10 +6226,11 @@ function registerRunCommand(program) {
|
|
|
6195
6226
|
currentTaskName = event.taskName;
|
|
6196
6227
|
const inLoop = inEpicPhase && epicLoopTasks.has(event.taskName) && taskStates[event.taskName] === "done";
|
|
6197
6228
|
const stateKey = inLoop ? `loop:${event.taskName}` : event.taskName;
|
|
6198
|
-
const epicId = extractEpicId2(event.storyKey);
|
|
6229
|
+
const epicId = event.storyKey.startsWith("__epic_") ? event.storyKey.replace("__epic_", "").replace("__", "") : extractEpicId2(event.storyKey);
|
|
6230
|
+
const displayStoryKey = event.storyKey.startsWith("__epic_") ? `Epic ${epicId}` : event.storyKey;
|
|
6199
6231
|
const epic = epicData[epicId];
|
|
6200
6232
|
renderer.updateSprintState({
|
|
6201
|
-
storyKey:
|
|
6233
|
+
storyKey: displayStoryKey,
|
|
6202
6234
|
phase: event.taskName,
|
|
6203
6235
|
done: storiesDone,
|
|
6204
6236
|
total: counts.total,
|
|
@@ -6239,17 +6271,17 @@ function registerRunCommand(program) {
|
|
|
6239
6271
|
total: counts.total,
|
|
6240
6272
|
totalCost: totalCostUsd
|
|
6241
6273
|
});
|
|
6242
|
-
if (
|
|
6243
|
-
const
|
|
6244
|
-
|
|
6245
|
-
|
|
6246
|
-
|
|
6247
|
-
|
|
6248
|
-
|
|
6249
|
-
storyEntries[
|
|
6250
|
-
renderer.updateStories([...storyEntries]);
|
|
6274
|
+
if (event.taskName === "verify" && event.storyKey.startsWith("__epic_")) {
|
|
6275
|
+
const epicId = event.storyKey.replace("__epic_", "").replace("__", "");
|
|
6276
|
+
for (let i = 0; i < storyEntries.length; i++) {
|
|
6277
|
+
const se = storyEntries[i];
|
|
6278
|
+
if (se.status === "in-progress" && se.key.startsWith(`${epicId}-`)) {
|
|
6279
|
+
storiesDone++;
|
|
6280
|
+
updateStoryStatus2(se.key, "done");
|
|
6281
|
+
storyEntries[i] = { ...se, status: "done" };
|
|
6251
6282
|
}
|
|
6252
6283
|
}
|
|
6284
|
+
renderer.updateStories([...storyEntries]);
|
|
6253
6285
|
}
|
|
6254
6286
|
}
|
|
6255
6287
|
if (event.type === "dispatch-error") {
|
|
@@ -6394,22 +6426,6 @@ import { readFileSync as readFileSync24 } from "fs";
|
|
|
6394
6426
|
|
|
6395
6427
|
// src/modules/verify/proof.ts
|
|
6396
6428
|
import { existsSync as existsSync18, readFileSync as readFileSync15 } from "fs";
|
|
6397
|
-
|
|
6398
|
-
// src/modules/verify/types.ts
|
|
6399
|
-
var TIER_HIERARCHY = [
|
|
6400
|
-
"test-provable",
|
|
6401
|
-
"runtime-provable",
|
|
6402
|
-
"environment-provable",
|
|
6403
|
-
"escalate"
|
|
6404
|
-
];
|
|
6405
|
-
var LEGACY_TIER_MAP = {
|
|
6406
|
-
"cli-verifiable": "test-provable",
|
|
6407
|
-
"integration-required": "environment-provable",
|
|
6408
|
-
"unit-testable": "test-provable",
|
|
6409
|
-
"black-box": "environment-provable"
|
|
6410
|
-
};
|
|
6411
|
-
|
|
6412
|
-
// src/modules/verify/proof.ts
|
|
6413
6429
|
function classifyEvidenceCommands(proofContent) {
|
|
6414
6430
|
const results = [];
|
|
6415
6431
|
const codeBlockPattern = /```(?:bash|shell)\n([\s\S]*?)```/g;
|
|
@@ -6499,15 +6515,7 @@ function validateProofQuality(proofPath) {
|
|
|
6499
6515
|
return emptyResult;
|
|
6500
6516
|
}
|
|
6501
6517
|
const content = readFileSync15(proofPath, "utf-8");
|
|
6502
|
-
const
|
|
6503
|
-
const uniqueTierNames = [...new Set(allTierNames)];
|
|
6504
|
-
const tierPattern = new RegExp(`\\*\\*Tier:\\*\\*\\s*(${uniqueTierNames.join("|")})`, "i");
|
|
6505
|
-
const bbTierMatch = tierPattern.exec(content);
|
|
6506
|
-
const rawTierValue = bbTierMatch ? bbTierMatch[1].toLowerCase() : null;
|
|
6507
|
-
const normalizedTier = rawTierValue ? LEGACY_TIER_MAP[rawTierValue] ?? (TIER_HIERARCHY.includes(rawTierValue) ? rawTierValue : null) : null;
|
|
6508
|
-
const skipDockerEnforcement = normalizedTier !== null && normalizedTier !== "environment-provable";
|
|
6509
|
-
const bbRawEnforcement = checkBlackBoxEnforcement(content);
|
|
6510
|
-
const bbEnforcement = skipDockerEnforcement ? { ...bbRawEnforcement, blackBoxPass: true } : bbRawEnforcement;
|
|
6518
|
+
const bbEnforcement = checkBlackBoxEnforcement(content);
|
|
6511
6519
|
function buildResult(base) {
|
|
6512
6520
|
const basePassed = base.pending === 0 && base.verified > 0;
|
|
6513
6521
|
return {
|
|
@@ -6647,7 +6655,7 @@ function validateProofQuality(proofPath) {
|
|
|
6647
6655
|
|
|
6648
6656
|
// src/modules/verify/orchestrator.ts
|
|
6649
6657
|
import { execFileSync } from "child_process";
|
|
6650
|
-
import { mkdirSync as
|
|
6658
|
+
import { mkdirSync as mkdirSync9, writeFileSync as writeFileSync10 } from "fs";
|
|
6651
6659
|
import { join as join20 } from "path";
|
|
6652
6660
|
|
|
6653
6661
|
// src/lib/doc-health/types.ts
|
|
@@ -7109,10 +7117,10 @@ function checkAgentsMdLineCount(filePath, docPath, documents) {
|
|
|
7109
7117
|
// src/lib/doc-health/report.ts
|
|
7110
7118
|
import {
|
|
7111
7119
|
existsSync as existsSync21,
|
|
7112
|
-
mkdirSync as
|
|
7120
|
+
mkdirSync as mkdirSync8,
|
|
7113
7121
|
readFileSync as readFileSync18,
|
|
7114
7122
|
unlinkSync as unlinkSync2,
|
|
7115
|
-
writeFileSync as
|
|
7123
|
+
writeFileSync as writeFileSync9
|
|
7116
7124
|
} from "fs";
|
|
7117
7125
|
import { join as join19 } from "path";
|
|
7118
7126
|
function printDocHealthOutput(report) {
|
|
@@ -7148,9 +7156,9 @@ function completeExecPlan(storyId, dir) {
|
|
|
7148
7156
|
Completed: ${timestamp}`
|
|
7149
7157
|
);
|
|
7150
7158
|
const completedDir = join19(root, "docs", "exec-plans", "completed");
|
|
7151
|
-
|
|
7159
|
+
mkdirSync8(completedDir, { recursive: true });
|
|
7152
7160
|
const completedPath = join19(completedDir, `${storyId}.md`);
|
|
7153
|
-
|
|
7161
|
+
writeFileSync9(completedPath, content, "utf-8");
|
|
7154
7162
|
try {
|
|
7155
7163
|
unlinkSync2(activePath);
|
|
7156
7164
|
} catch {
|
|
@@ -7192,9 +7200,9 @@ function checkPreconditions(dir, storyId) {
|
|
|
7192
7200
|
function createProofDocument(storyId, _storyTitle, _acs, dir) {
|
|
7193
7201
|
const root = dir ?? process.cwd();
|
|
7194
7202
|
const verificationDir = join20(root, "verification");
|
|
7195
|
-
|
|
7203
|
+
mkdirSync9(verificationDir, { recursive: true });
|
|
7196
7204
|
const proofPath = join20(verificationDir, `${storyId}-proof.md`);
|
|
7197
|
-
|
|
7205
|
+
writeFileSync10(proofPath, `# ${storyId} \u2014 Proof
|
|
7198
7206
|
|
|
7199
7207
|
Pending: blind evaluator (Epic 6)
|
|
7200
7208
|
`, "utf-8");
|
|
@@ -7257,87 +7265,8 @@ var DB_KEYWORDS = [
|
|
|
7257
7265
|
"sql",
|
|
7258
7266
|
"table"
|
|
7259
7267
|
];
|
|
7260
|
-
var INTEGRATION_KEYWORDS = [
|
|
7261
|
-
"external system",
|
|
7262
|
-
"real infrastructure",
|
|
7263
|
-
"manual verification"
|
|
7264
|
-
];
|
|
7265
|
-
var ESCALATE_KEYWORDS = [
|
|
7266
|
-
"physical hardware",
|
|
7267
|
-
"manual human",
|
|
7268
|
-
"visual inspection by human",
|
|
7269
|
-
"paid external service"
|
|
7270
|
-
];
|
|
7271
|
-
var RUNTIME_PROVABLE_KEYWORDS = [
|
|
7272
|
-
"cli command",
|
|
7273
|
-
"api endpoint",
|
|
7274
|
-
"http",
|
|
7275
|
-
"server",
|
|
7276
|
-
"output shows",
|
|
7277
|
-
"exit code",
|
|
7278
|
-
"binary",
|
|
7279
|
-
"runs and produces",
|
|
7280
|
-
"cli outputs",
|
|
7281
|
-
"when run"
|
|
7282
|
-
];
|
|
7283
|
-
var ENVIRONMENT_PROVABLE_KEYWORDS = [
|
|
7284
|
-
"docker",
|
|
7285
|
-
"container",
|
|
7286
|
-
"observability",
|
|
7287
|
-
"telemetry",
|
|
7288
|
-
"database",
|
|
7289
|
-
"queue",
|
|
7290
|
-
"distributed",
|
|
7291
|
-
"multi-service",
|
|
7292
|
-
"end-to-end",
|
|
7293
|
-
"victorialogs"
|
|
7294
|
-
];
|
|
7295
|
-
var ESCALATE_TIER_KEYWORDS = [
|
|
7296
|
-
"physical hardware",
|
|
7297
|
-
"human visual",
|
|
7298
|
-
"paid service",
|
|
7299
|
-
"gpu",
|
|
7300
|
-
"manual inspection",
|
|
7301
|
-
"physical display"
|
|
7302
|
-
];
|
|
7303
7268
|
|
|
7304
7269
|
// src/modules/verify/parser.ts
|
|
7305
|
-
function classifyVerifiability(description) {
|
|
7306
|
-
const lower = description.toLowerCase();
|
|
7307
|
-
for (const kw of INTEGRATION_KEYWORDS) {
|
|
7308
|
-
if (lower.includes(kw)) return "integration-required";
|
|
7309
|
-
}
|
|
7310
|
-
return "cli-verifiable";
|
|
7311
|
-
}
|
|
7312
|
-
function classifyStrategy(description) {
|
|
7313
|
-
const lower = description.toLowerCase();
|
|
7314
|
-
for (const kw of ESCALATE_KEYWORDS) {
|
|
7315
|
-
if (lower.includes(kw)) return "escalate";
|
|
7316
|
-
}
|
|
7317
|
-
return "docker";
|
|
7318
|
-
}
|
|
7319
|
-
function classifyTier(description) {
|
|
7320
|
-
const lower = description.toLowerCase();
|
|
7321
|
-
for (const kw of ESCALATE_TIER_KEYWORDS) {
|
|
7322
|
-
if (lower.includes(kw)) return "escalate";
|
|
7323
|
-
}
|
|
7324
|
-
for (const kw of ENVIRONMENT_PROVABLE_KEYWORDS) {
|
|
7325
|
-
if (lower.includes(kw)) return "environment-provable";
|
|
7326
|
-
}
|
|
7327
|
-
for (const kw of RUNTIME_PROVABLE_KEYWORDS) {
|
|
7328
|
-
if (lower.includes(kw)) return "runtime-provable";
|
|
7329
|
-
}
|
|
7330
|
-
return "test-provable";
|
|
7331
|
-
}
|
|
7332
|
-
var VERIFICATION_TAG_PATTERN = /<!--\s*verification:\s*(cli-verifiable|integration-required|unit-testable|black-box|test-provable|runtime-provable|environment-provable|escalate)\s*-->/;
|
|
7333
|
-
function parseVerificationTag(text) {
|
|
7334
|
-
const match = VERIFICATION_TAG_PATTERN.exec(text);
|
|
7335
|
-
if (!match) return null;
|
|
7336
|
-
const raw = match[1];
|
|
7337
|
-
const mapped = LEGACY_TIER_MAP[raw] ?? raw;
|
|
7338
|
-
if (!TIER_HIERARCHY.includes(mapped)) return null;
|
|
7339
|
-
return mapped;
|
|
7340
|
-
}
|
|
7341
7270
|
function classifyAC(description) {
|
|
7342
7271
|
const lower = description.toLowerCase();
|
|
7343
7272
|
for (const kw of UI_KEYWORDS) {
|
|
@@ -7387,17 +7316,10 @@ function parseStoryACs(storyFilePath) {
|
|
|
7387
7316
|
if (currentId !== null && currentDesc.length > 0) {
|
|
7388
7317
|
const description = currentDesc.join(" ").trim();
|
|
7389
7318
|
if (description) {
|
|
7390
|
-
const tag = parseVerificationTag(description);
|
|
7391
|
-
const tier = tag ?? classifyTier(description);
|
|
7392
|
-
const verifiability = classifyVerifiability(description);
|
|
7393
|
-
const strategy = classifyStrategy(description);
|
|
7394
7319
|
acs.push({
|
|
7395
7320
|
id: currentId,
|
|
7396
7321
|
description,
|
|
7397
|
-
type: classifyAC(description)
|
|
7398
|
-
verifiability,
|
|
7399
|
-
strategy,
|
|
7400
|
-
tier
|
|
7322
|
+
type: classifyAC(description)
|
|
7401
7323
|
});
|
|
7402
7324
|
} else {
|
|
7403
7325
|
warn(`Skipping malformed AC #${currentId}: empty description`);
|
|
@@ -7575,7 +7497,7 @@ function normalizeSeverity(severity) {
|
|
|
7575
7497
|
}
|
|
7576
7498
|
|
|
7577
7499
|
// src/modules/observability/coverage.ts
|
|
7578
|
-
import { readFileSync as readFileSync20, writeFileSync as
|
|
7500
|
+
import { readFileSync as readFileSync20, writeFileSync as writeFileSync11, renameSync as renameSync3, existsSync as existsSync24 } from "fs";
|
|
7579
7501
|
import { join as join22 } from "path";
|
|
7580
7502
|
var STATE_FILE2 = "sprint-state.json";
|
|
7581
7503
|
var DEFAULT_STATIC_TARGET = 80;
|
|
@@ -7664,7 +7586,7 @@ function parseGapArray(raw) {
|
|
|
7664
7586
|
}
|
|
7665
7587
|
|
|
7666
7588
|
// src/modules/observability/runtime-coverage.ts
|
|
7667
|
-
import { readFileSync as readFileSync21, writeFileSync as
|
|
7589
|
+
import { readFileSync as readFileSync21, writeFileSync as writeFileSync12, renameSync as renameSync4, existsSync as existsSync25 } from "fs";
|
|
7668
7590
|
import { join as join23 } from "path";
|
|
7669
7591
|
|
|
7670
7592
|
// src/modules/observability/coverage-gate.ts
|
|
@@ -8506,7 +8428,7 @@ function getACById(id) {
|
|
|
8506
8428
|
|
|
8507
8429
|
// src/modules/verify/validation-runner.ts
|
|
8508
8430
|
import { execSync as execSync5 } from "child_process";
|
|
8509
|
-
import { writeFileSync as
|
|
8431
|
+
import { writeFileSync as writeFileSync13, mkdirSync as mkdirSync10 } from "fs";
|
|
8510
8432
|
import { join as join25, dirname as dirname3 } from "path";
|
|
8511
8433
|
var MAX_VALIDATION_ATTEMPTS = 10;
|
|
8512
8434
|
var AC_COMMAND_TIMEOUT_MS = 3e4;
|
|
@@ -8659,8 +8581,8 @@ function createFixStory(ac, error) {
|
|
|
8659
8581
|
"Fix the root cause so the validation command passes.",
|
|
8660
8582
|
""
|
|
8661
8583
|
].join("\n");
|
|
8662
|
-
|
|
8663
|
-
|
|
8584
|
+
mkdirSync10(dirname3(storyPath), { recursive: true });
|
|
8585
|
+
writeFileSync13(storyPath, markdown, "utf-8");
|
|
8664
8586
|
return ok2(storyKey);
|
|
8665
8587
|
} catch (err) {
|
|
8666
8588
|
const msg = err instanceof Error ? err.message : String(err);
|
|
@@ -8986,7 +8908,7 @@ function runValidationCycle() {
|
|
|
8986
8908
|
|
|
8987
8909
|
// src/modules/verify/env.ts
|
|
8988
8910
|
import { execFileSync as execFileSync5 } from "child_process";
|
|
8989
|
-
import { existsSync as existsSync27, mkdirSync as
|
|
8911
|
+
import { existsSync as existsSync27, mkdirSync as mkdirSync11, readdirSync as readdirSync7, readFileSync as readFileSync23, writeFileSync as writeFileSync14, cpSync, rmSync as rmSync3, statSync as statSync6 } from "fs";
|
|
8990
8912
|
import { join as join27, basename as basename2 } from "path";
|
|
8991
8913
|
import { createHash } from "crypto";
|
|
8992
8914
|
|
|
@@ -9135,7 +9057,7 @@ function buildNodeImage(projectDir) {
|
|
|
9135
9057
|
const tarballName = basename2(lastLine);
|
|
9136
9058
|
const tarballPath = join27("/tmp", tarballName);
|
|
9137
9059
|
const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
|
|
9138
|
-
|
|
9060
|
+
mkdirSync11(buildContext, { recursive: true });
|
|
9139
9061
|
try {
|
|
9140
9062
|
cpSync(tarballPath, join27(buildContext, tarballName));
|
|
9141
9063
|
const dockerfile = generateVerifyDockerfile(projectDir) + `
|
|
@@ -9144,15 +9066,15 @@ ARG TARBALL=package.tgz
|
|
|
9144
9066
|
COPY \${TARBALL} /tmp/\${TARBALL}
|
|
9145
9067
|
RUN npm install -g /tmp/\${TARBALL} && rm /tmp/\${TARBALL}
|
|
9146
9068
|
`;
|
|
9147
|
-
|
|
9069
|
+
writeFileSync14(join27(buildContext, "Dockerfile"), dockerfile);
|
|
9148
9070
|
execFileSync5("docker", ["build", "-t", IMAGE_TAG, "--build-arg", `TARBALL=${tarballName}`, "."], {
|
|
9149
9071
|
cwd: buildContext,
|
|
9150
9072
|
stdio: "pipe",
|
|
9151
9073
|
timeout: 12e4
|
|
9152
9074
|
});
|
|
9153
9075
|
} finally {
|
|
9154
|
-
|
|
9155
|
-
|
|
9076
|
+
rmSync3(buildContext, { recursive: true, force: true });
|
|
9077
|
+
rmSync3(tarballPath, { force: true });
|
|
9156
9078
|
}
|
|
9157
9079
|
}
|
|
9158
9080
|
function buildPythonImage(projectDir) {
|
|
@@ -9163,7 +9085,7 @@ function buildPythonImage(projectDir) {
|
|
|
9163
9085
|
}
|
|
9164
9086
|
const distFile = distFiles.filter((f) => f.endsWith(".tar.gz"))[0] ?? distFiles[0];
|
|
9165
9087
|
const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
|
|
9166
|
-
|
|
9088
|
+
mkdirSync11(buildContext, { recursive: true });
|
|
9167
9089
|
try {
|
|
9168
9090
|
cpSync(join27(distDir, distFile), join27(buildContext, distFile));
|
|
9169
9091
|
const dockerfile = generateVerifyDockerfile(projectDir) + `
|
|
@@ -9171,14 +9093,14 @@ function buildPythonImage(projectDir) {
|
|
|
9171
9093
|
COPY ${distFile} /tmp/${distFile}
|
|
9172
9094
|
RUN pip install --break-system-packages /tmp/${distFile} && rm /tmp/${distFile}
|
|
9173
9095
|
`;
|
|
9174
|
-
|
|
9096
|
+
writeFileSync14(join27(buildContext, "Dockerfile"), dockerfile);
|
|
9175
9097
|
execFileSync5("docker", ["build", "-t", IMAGE_TAG, "."], {
|
|
9176
9098
|
cwd: buildContext,
|
|
9177
9099
|
stdio: "pipe",
|
|
9178
9100
|
timeout: 12e4
|
|
9179
9101
|
});
|
|
9180
9102
|
} finally {
|
|
9181
|
-
|
|
9103
|
+
rmSync3(buildContext, { recursive: true, force: true });
|
|
9182
9104
|
}
|
|
9183
9105
|
}
|
|
9184
9106
|
function prepareVerifyWorkspace(storyKey, projectDir) {
|
|
@@ -9189,8 +9111,8 @@ function prepareVerifyWorkspace(storyKey, projectDir) {
|
|
|
9189
9111
|
const storyFile = join27(root, STORY_DIR, `${storyKey}.md`);
|
|
9190
9112
|
if (!existsSync27(storyFile)) throw new Error(`Story file not found: ${storyFile}`);
|
|
9191
9113
|
const workspace = `${TEMP_PREFIX}${storyKey}`;
|
|
9192
|
-
if (existsSync27(workspace))
|
|
9193
|
-
|
|
9114
|
+
if (existsSync27(workspace)) rmSync3(workspace, { recursive: true, force: true });
|
|
9115
|
+
mkdirSync11(workspace, { recursive: true });
|
|
9194
9116
|
cpSync(storyFile, join27(workspace, "story.md"));
|
|
9195
9117
|
const readmePath = join27(root, "README.md");
|
|
9196
9118
|
if (existsSync27(readmePath)) cpSync(readmePath, join27(workspace, "README.md"));
|
|
@@ -9198,7 +9120,7 @@ function prepareVerifyWorkspace(storyKey, projectDir) {
|
|
|
9198
9120
|
if (existsSync27(docsDir) && statSync6(docsDir).isDirectory()) {
|
|
9199
9121
|
cpSync(docsDir, join27(workspace, "docs"), { recursive: true });
|
|
9200
9122
|
}
|
|
9201
|
-
|
|
9123
|
+
mkdirSync11(join27(workspace, "verification"), { recursive: true });
|
|
9202
9124
|
return workspace;
|
|
9203
9125
|
}
|
|
9204
9126
|
function checkVerifyEnv() {
|
|
@@ -9240,7 +9162,7 @@ function cleanupVerifyEnv(storyKey) {
|
|
|
9240
9162
|
}
|
|
9241
9163
|
const workspace = `${TEMP_PREFIX}${storyKey}`;
|
|
9242
9164
|
const containerName = `codeharness-verify-${storyKey}`;
|
|
9243
|
-
if (existsSync27(workspace))
|
|
9165
|
+
if (existsSync27(workspace)) rmSync3(workspace, { recursive: true, force: true });
|
|
9244
9166
|
try {
|
|
9245
9167
|
execFileSync5("docker", ["stop", containerName], { stdio: "pipe", timeout: 15e3 });
|
|
9246
9168
|
} catch {
|
|
@@ -9252,7 +9174,7 @@ function cleanupVerifyEnv(storyKey) {
|
|
|
9252
9174
|
}
|
|
9253
9175
|
function buildPluginImage(projectDir) {
|
|
9254
9176
|
const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
|
|
9255
|
-
|
|
9177
|
+
mkdirSync11(buildContext, { recursive: true });
|
|
9256
9178
|
try {
|
|
9257
9179
|
const pluginDir = join27(projectDir, ".claude-plugin");
|
|
9258
9180
|
cpSync(pluginDir, join27(buildContext, ".claude-plugin"), { recursive: true });
|
|
@@ -9262,28 +9184,28 @@ function buildPluginImage(projectDir) {
|
|
|
9262
9184
|
cpSync(src, join27(buildContext, dir), { recursive: true });
|
|
9263
9185
|
}
|
|
9264
9186
|
}
|
|
9265
|
-
|
|
9187
|
+
writeFileSync14(join27(buildContext, "Dockerfile"), generateVerifyDockerfile(projectDir));
|
|
9266
9188
|
execFileSync5("docker", ["build", "-t", IMAGE_TAG, "."], {
|
|
9267
9189
|
cwd: buildContext,
|
|
9268
9190
|
stdio: "pipe",
|
|
9269
9191
|
timeout: 12e4
|
|
9270
9192
|
});
|
|
9271
9193
|
} finally {
|
|
9272
|
-
|
|
9194
|
+
rmSync3(buildContext, { recursive: true, force: true });
|
|
9273
9195
|
}
|
|
9274
9196
|
}
|
|
9275
9197
|
function buildSimpleImage(projectDir, timeout = 12e4) {
|
|
9276
9198
|
const buildContext = join27("/tmp", `codeharness-verify-build-${Date.now()}`);
|
|
9277
|
-
|
|
9199
|
+
mkdirSync11(buildContext, { recursive: true });
|
|
9278
9200
|
try {
|
|
9279
|
-
|
|
9201
|
+
writeFileSync14(join27(buildContext, "Dockerfile"), generateVerifyDockerfile(projectDir));
|
|
9280
9202
|
execFileSync5("docker", ["build", "-t", IMAGE_TAG, "."], {
|
|
9281
9203
|
cwd: buildContext,
|
|
9282
9204
|
stdio: "pipe",
|
|
9283
9205
|
timeout
|
|
9284
9206
|
});
|
|
9285
9207
|
} finally {
|
|
9286
|
-
|
|
9208
|
+
rmSync3(buildContext, { recursive: true, force: true });
|
|
9287
9209
|
}
|
|
9288
9210
|
}
|
|
9289
9211
|
function dockerImageExists(tag) {
|
|
@@ -10881,7 +10803,7 @@ function formatAuditJson(result) {
|
|
|
10881
10803
|
}
|
|
10882
10804
|
|
|
10883
10805
|
// src/modules/audit/fix-generator.ts
|
|
10884
|
-
import { existsSync as existsSync34, writeFileSync as
|
|
10806
|
+
import { existsSync as existsSync34, writeFileSync as writeFileSync15, mkdirSync as mkdirSync12 } from "fs";
|
|
10885
10807
|
import { join as join33, dirname as dirname5 } from "path";
|
|
10886
10808
|
function buildStoryKey(gap2, index) {
|
|
10887
10809
|
const safeDimension = gap2.dimension.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/(^-|-$)/g, "");
|
|
@@ -10936,8 +10858,8 @@ function generateFixStories(auditResult) {
|
|
|
10936
10858
|
continue;
|
|
10937
10859
|
}
|
|
10938
10860
|
const markdown = buildStoryMarkdown(gap2, key);
|
|
10939
|
-
|
|
10940
|
-
|
|
10861
|
+
mkdirSync12(dirname5(filePath), { recursive: true });
|
|
10862
|
+
writeFileSync15(filePath, markdown, "utf-8");
|
|
10941
10863
|
stories.push({ key, filePath, gap: gap2, skipped: false });
|
|
10942
10864
|
created++;
|
|
10943
10865
|
}
|
|
@@ -11113,7 +11035,7 @@ function registerOnboardCommand(program) {
|
|
|
11113
11035
|
}
|
|
11114
11036
|
|
|
11115
11037
|
// src/commands/teardown.ts
|
|
11116
|
-
import { existsSync as existsSync35, unlinkSync as unlinkSync3, readFileSync as readFileSync29, writeFileSync as
|
|
11038
|
+
import { existsSync as existsSync35, unlinkSync as unlinkSync3, readFileSync as readFileSync29, writeFileSync as writeFileSync16, rmSync as rmSync4 } from "fs";
|
|
11117
11039
|
import { join as join34 } from "path";
|
|
11118
11040
|
function buildDefaultResult() {
|
|
11119
11041
|
return {
|
|
@@ -11160,7 +11082,7 @@ function registerTeardownCommand(program) {
|
|
|
11160
11082
|
} else if (otlpMode === "remote-routed") {
|
|
11161
11083
|
if (!options.keepDocker) {
|
|
11162
11084
|
try {
|
|
11163
|
-
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-GLX24TXX.js");
|
|
11085
|
+
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-Y73EO7Z4.js");
|
|
11164
11086
|
stopCollectorOnly2();
|
|
11165
11087
|
result.docker.stopped = true;
|
|
11166
11088
|
if (!isJson) {
|
|
@@ -11192,7 +11114,7 @@ function registerTeardownCommand(program) {
|
|
|
11192
11114
|
info("Shared stack: kept running (other projects may use it)");
|
|
11193
11115
|
}
|
|
11194
11116
|
} else if (isLegacyStack) {
|
|
11195
|
-
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-GLX24TXX.js");
|
|
11117
|
+
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-Y73EO7Z4.js");
|
|
11196
11118
|
let stackRunning = false;
|
|
11197
11119
|
try {
|
|
11198
11120
|
stackRunning = isStackRunning2(composeFile);
|
|
@@ -11262,7 +11184,7 @@ function registerTeardownCommand(program) {
|
|
|
11262
11184
|
for (const key of keysToRemove) {
|
|
11263
11185
|
delete scripts[key];
|
|
11264
11186
|
}
|
|
11265
|
-
|
|
11187
|
+
writeFileSync16(pkgPath, JSON.stringify(pkg, null, 2) + "\n", "utf-8");
|
|
11266
11188
|
result.otlp_cleaned = true;
|
|
11267
11189
|
if (!isJson) {
|
|
11268
11190
|
ok("OTLP: removed instrumented scripts from package.json");
|
|
@@ -11290,7 +11212,7 @@ function registerTeardownCommand(program) {
|
|
|
11290
11212
|
}
|
|
11291
11213
|
const harnessDir = join34(projectDir, ".harness");
|
|
11292
11214
|
if (existsSync35(harnessDir)) {
|
|
11293
|
-
|
|
11215
|
+
rmSync4(harnessDir, { recursive: true, force: true });
|
|
11294
11216
|
result.removed.push(".harness/");
|
|
11295
11217
|
if (!isJson) {
|
|
11296
11218
|
ok("Removed: .harness/");
|
|
@@ -12096,7 +12018,7 @@ function isDuplicate(newItem, existingTitles, threshold = 0.8) {
|
|
|
12096
12018
|
}
|
|
12097
12019
|
|
|
12098
12020
|
// src/lib/issue-tracker.ts
|
|
12099
|
-
import { existsSync as existsSync36, readFileSync as readFileSync30, writeFileSync as
|
|
12021
|
+
import { existsSync as existsSync36, readFileSync as readFileSync30, writeFileSync as writeFileSync17, mkdirSync as mkdirSync13 } from "fs";
|
|
12100
12022
|
import { join as join35 } from "path";
|
|
12101
12023
|
import { parse as parse6, stringify as stringify3 } from "yaml";
|
|
12102
12024
|
var VALID_PRIORITIES = /* @__PURE__ */ new Set([
|
|
@@ -12125,9 +12047,9 @@ function writeIssues(data, dir = process.cwd()) {
|
|
|
12125
12047
|
const filePath = issuesPath(dir);
|
|
12126
12048
|
const dirPath = join35(dir, ".codeharness");
|
|
12127
12049
|
if (!existsSync36(dirPath)) {
|
|
12128
|
-
|
|
12050
|
+
mkdirSync13(dirPath, { recursive: true });
|
|
12129
12051
|
}
|
|
12130
|
-
|
|
12052
|
+
writeFileSync17(filePath, stringify3(data, { nullStr: "" }), "utf-8");
|
|
12131
12053
|
}
|
|
12132
12054
|
function nextIssueId(existing) {
|
|
12133
12055
|
let max = 0;
|
|
@@ -13113,7 +13035,7 @@ function registerAuditCommand(program) {
|
|
|
13113
13035
|
}
|
|
13114
13036
|
|
|
13115
13037
|
// src/commands/stats.ts
|
|
13116
|
-
import { existsSync as existsSync39, readdirSync as readdirSync10, readFileSync as readFileSync32, writeFileSync as
|
|
13038
|
+
import { existsSync as existsSync39, readdirSync as readdirSync10, readFileSync as readFileSync32, writeFileSync as writeFileSync18 } from "fs";
|
|
13117
13039
|
import { join as join38 } from "path";
|
|
13118
13040
|
var RATES = {
|
|
13119
13041
|
input: 15,
|
|
@@ -13323,7 +13245,7 @@ function registerStatsCommand(program) {
|
|
|
13323
13245
|
console.log(formatted);
|
|
13324
13246
|
if (options.save) {
|
|
13325
13247
|
const outPath = join38(projectDir, "_bmad-output", "implementation-artifacts", "cost-report.md");
|
|
13326
|
-
|
|
13248
|
+
writeFileSync18(outPath, formatted, "utf-8");
|
|
13327
13249
|
ok(`Report saved to ${outPath}`);
|
|
13328
13250
|
}
|
|
13329
13251
|
});
|
|
@@ -14179,7 +14101,7 @@ function registerDriversCommand(program) {
|
|
|
14179
14101
|
}
|
|
14180
14102
|
|
|
14181
14103
|
// src/index.ts
|
|
14182
|
-
var VERSION = true ? "0.32.2" : "0.0.0-dev";
|
|
14104
|
+
var VERSION = true ? "0.33.0" : "0.0.0-dev";
|
|
14183
14105
|
function createProgram() {
|
|
14184
14106
|
const program = new Command();
|
|
14185
14107
|
program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
|
package/package.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
Dev agents repeatedly shipped code without reading module conventions (AGENTS.md),
|
|
4
4
|
skipped observability checks, and produced features that could not be verified
|
|
5
5
|
from outside the source tree. This patch enforces architecture awareness,
|
|
6
|
-
observability validation, documentation hygiene, test coverage gates
|
|
7
|
-
|
|
6
|
+
observability validation, documentation hygiene, and test coverage gates
|
|
7
|
+
— all operational failures observed in prior sprints.
|
|
8
8
|
(FR33, FR34, NFR20)
|
|
9
9
|
|
|
10
10
|
## Codeharness Development Enforcement
|
|
@@ -35,23 +35,14 @@ After running tests, verify telemetry is flowing:
|
|
|
35
35
|
- Coverage gate: 100% of new/changed code
|
|
36
36
|
- Run `npm test` / `pytest` and verify no regressions
|
|
37
37
|
|
|
38
|
-
### Verification
|
|
38
|
+
### Verification Readiness
|
|
39
39
|
|
|
40
|
-
Write code that can be verified
|
|
40
|
+
Write code that can be verified via Docker-based blind verification. Ask yourself:
|
|
41
|
+
- Are my functions testable and my outputs greppable?
|
|
42
|
+
- Can I run the CLI/server and verify output?
|
|
43
|
+
- Does `docker exec` work? Are logs flowing to the observability stack?
|
|
41
44
|
|
|
42
|
-
|
|
43
|
-
- **`runtime-provable`** — Code must be exercisable via CLI or local server. Ensure the binary/CLI produces verifiable stdout, exit codes, or HTTP responses without needing Docker.
|
|
44
|
-
- **`environment-provable`** — Code must work in a Docker verification environment. Ensure the Dockerfile is current, services start correctly, and `docker exec` can exercise the feature. Observability queries should return expected log/trace events.
|
|
45
|
-
- **`escalate`** — Reserved for ACs that genuinely cannot be automated (physical hardware, paid external APIs). This is rare — exhaust all automated approaches first.
|
|
46
|
-
|
|
47
|
-
Ask yourself:
|
|
48
|
-
- What tier is this story tagged with?
|
|
49
|
-
- Does my implementation produce the evidence that tier requires?
|
|
50
|
-
- If `test-provable`: are my functions testable and my outputs greppable?
|
|
51
|
-
- If `runtime-provable`: can I run the CLI/server and verify output locally?
|
|
52
|
-
- If `environment-provable`: does `docker exec` work? Are logs flowing to the observability stack?
|
|
53
|
-
|
|
54
|
-
If the answer is "no", the feature has a testability gap — fix the code to be verifiable at the appropriate tier.
|
|
45
|
+
If the answer is "no", the feature has a testability gap — fix the code to be verifiable.
|
|
55
46
|
|
|
56
47
|
### Dockerfile Maintenance
|
|
57
48
|
|
|
@@ -20,7 +20,7 @@ quality trends, and mandatory concrete action items with owners.
|
|
|
20
20
|
|
|
21
21
|
- Did the verifier hang on permissions? (check for `--allowedTools` issues)
|
|
22
22
|
- Did stories get stuck in verify→dev loops? (check `attempts` counter)
|
|
23
|
-
- Were stories assigned the wrong verification
|
|
23
|
+
- Were stories assigned the wrong verification method?
|
|
24
24
|
- Did the verify parser correctly detect `[FAIL]` verdicts?
|
|
25
25
|
|
|
26
26
|
### Documentation Health
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
## WHY
|
|
2
2
|
|
|
3
3
|
Review agents approved stories without verifying proof documents existed or
|
|
4
|
-
checking that evidence
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
4
|
+
checking that evidence was real. Stories passed review with fabricated output
|
|
5
|
+
and missing coverage data. This patch enforces proof existence, evidence
|
|
6
|
+
quality, and coverage delta reporting as hard gates before a story can leave
|
|
7
|
+
review.
|
|
8
8
|
(FR33, FR34, NFR20)
|
|
9
9
|
|
|
10
10
|
## Codeharness Review Gates
|
|
@@ -18,34 +18,12 @@ gates before a story can leave review.
|
|
|
18
18
|
|
|
19
19
|
### Proof Quality Checks
|
|
20
20
|
|
|
21
|
-
The proof must pass tier-appropriate evidence enforcement. The required evidence depends on the story's verification tier:
|
|
22
|
-
|
|
23
|
-
#### `test-provable` stories
|
|
24
|
-
- Evidence comes from build output, test results, and grep/read of code or generated artifacts
|
|
25
|
-
- `npm test` / `npm run build` output is the primary evidence
|
|
26
|
-
- Source-level assertions (grep against `src/`) are acceptable — this IS the verification method for this tier
|
|
27
|
-
- `docker exec` evidence is NOT required
|
|
28
|
-
- Each AC section must show actual test output or build results
|
|
29
|
-
|
|
30
|
-
#### `runtime-provable` stories
|
|
31
|
-
- Evidence comes from running the actual binary, CLI, or server
|
|
32
|
-
- Process execution output (stdout, stderr, exit codes) is the primary evidence
|
|
33
|
-
- HTTP responses from a locally running server are acceptable
|
|
34
|
-
- `docker exec` evidence is NOT required
|
|
35
|
-
- Each AC section must show actual command execution and output
|
|
36
|
-
|
|
37
|
-
#### `environment-provable` stories
|
|
38
21
|
- Commands run via `docker exec` (not direct host access)
|
|
39
22
|
- Less than 50% of evidence commands are `grep` against `src/`
|
|
40
23
|
- Each AC section has at least one `docker exec`, `docker ps/logs`, or observability query
|
|
41
24
|
- `[FAIL]` verdicts outside code blocks cause the proof to fail
|
|
42
25
|
- `[ESCALATE]` is acceptable only when all automated approaches are exhausted
|
|
43
26
|
|
|
44
|
-
#### `escalate` stories
|
|
45
|
-
- Human judgment is required — automated evidence may be partial or absent
|
|
46
|
-
- Proof document must explain why automation is not possible
|
|
47
|
-
- `[ESCALATE]` verdict is expected and acceptable
|
|
48
|
-
|
|
49
27
|
### Observability
|
|
50
28
|
|
|
51
29
|
Run `semgrep scan --config patches/observability/ --config patches/error-handling/ --json` against changed files and report gaps.
|
|
@@ -1,49 +1,25 @@
|
|
|
1
1
|
## WHY
|
|
2
2
|
|
|
3
3
|
Stories were marked "done" with no proof artifact, or with proofs that only
|
|
4
|
-
grepped source code instead of exercising the feature
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
from being hidden behind inadequate evidence.
|
|
4
|
+
grepped source code instead of exercising the feature. This patch mandates
|
|
5
|
+
proof documents with real evidence, and test coverage targets — preventing
|
|
6
|
+
regressions from being hidden behind inadequate evidence.
|
|
8
7
|
(FR33, FR36, NFR20)
|
|
9
8
|
|
|
10
9
|
## Verification Requirements
|
|
11
10
|
|
|
12
|
-
Every story must produce a **proof document** with evidence
|
|
11
|
+
Every story must produce a **proof document** with real evidence from Docker-based blind verification.
|
|
13
12
|
|
|
14
13
|
### Proof Standard
|
|
15
14
|
|
|
16
15
|
- Proof document at `verification/<story-key>-proof.md`
|
|
17
|
-
- Each AC gets a `## AC N:` section with
|
|
16
|
+
- Each AC gets a `## AC N:` section with evidence and captured output
|
|
18
17
|
- `[FAIL]` = AC failed with evidence showing what went wrong
|
|
19
18
|
- `[ESCALATE]` = AC genuinely cannot be automated (last resort — try everything first)
|
|
20
19
|
|
|
21
|
-
**Tier-dependent evidence rules:**
|
|
22
|
-
|
|
23
|
-
- **`test-provable`** — Evidence comes from build + test output + grep/read of code or artifacts. Run `npm test` or `npm run build`, capture results. Source-level assertions are the primary verification method. No running app or Docker required.
|
|
24
|
-
- **`runtime-provable`** — Evidence comes from running the actual binary/server and interacting with it. Start the process, make requests or run commands, capture stdout/stderr/exit codes. No Docker stack required.
|
|
25
|
-
- **`environment-provable`** — Evidence comes from `docker exec` commands and observability queries. Full Docker verification environment required. Each AC section needs at least one `docker exec`, `docker ps/logs`, or observability query. Evidence must come from running the installed CLI/tool in Docker, not from grepping source.
|
|
26
|
-
- **`escalate`** — Human judgment required. Document why automation is not possible. `[ESCALATE]` verdict is expected.
|
|
27
|
-
|
|
28
|
-
### Verification Tags
|
|
29
|
-
|
|
30
|
-
For each AC, append a tag indicating its verification tier:
|
|
31
|
-
- `<!-- verification: test-provable -->` — Can be verified by building and running tests. Evidence: build output, test results, grep/read of code. No running app needed.
|
|
32
|
-
- `<!-- verification: runtime-provable -->` — Requires running the actual binary/CLI/server. Evidence: process output, HTTP responses, exit codes. No Docker stack needed.
|
|
33
|
-
- `<!-- verification: environment-provable -->` — Requires full Docker environment with observability. Evidence: `docker exec` commands, VictoriaLogs queries, multi-service interaction.
|
|
34
|
-
- `<!-- verification: escalate -->` — Cannot be automated. Requires human judgment, physical hardware, or paid external services.
|
|
35
|
-
|
|
36
|
-
**Decision criteria:**
|
|
37
|
-
1. Can you prove it with `npm test` or `npm run build` alone? → `test-provable`
|
|
38
|
-
2. Do you need to run the actual binary/server locally? → `runtime-provable`
|
|
39
|
-
3. Do you need Docker, external services, or observability? → `environment-provable`
|
|
40
|
-
4. Have you exhausted all automated approaches? → `escalate`
|
|
41
|
-
|
|
42
|
-
**Do not over-tag.** Most stories are `test-provable` or `runtime-provable`. Only use `environment-provable` when Docker infrastructure is genuinely needed. Only use `escalate` as a last resort.
|
|
43
|
-
|
|
44
20
|
### Observability Evidence
|
|
45
21
|
|
|
46
|
-
After each `docker exec` command
|
|
22
|
+
After each `docker exec` command, query the observability backend for log events from the last 30 seconds.
|
|
47
23
|
Use the configured VictoriaLogs endpoint (default: `http://localhost:9428`):
|
|
48
24
|
|
|
49
25
|
```bash
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
name: documenter
|
|
2
|
+
role:
|
|
3
|
+
title: Verification Guide Writer
|
|
4
|
+
purpose: Read implementation and write Docker-executable verification guides for blind QA
|
|
5
|
+
persona:
|
|
6
|
+
identity: |
|
|
7
|
+
Technical writer who translates source code into executable verification steps.
|
|
8
|
+
Reads what was built, understands how it works, then writes guides that a blind
|
|
9
|
+
QA agent can follow using only Docker commands.
|
|
10
|
+
communication_style: "Precise, command-oriented. Every verification step is a copy-pasteable command with expected output."
|
|
11
|
+
principles:
|
|
12
|
+
- Every AC must map to a concrete docker exec or curl command
|
|
13
|
+
- Commands must be copy-pasteable — no pseudocode, no placeholders
|
|
14
|
+
- Include the Docker container name in every command
|
|
15
|
+
- 'Expected output must be specific — not "should work" but "prints PASS: hook registered"'
|
|
16
|
+
- Include a Prerequisites section with container name and required services
|
|
17
|
+
prompt_template: |
|
|
18
|
+
## Role
|
|
19
|
+
|
|
20
|
+
You are writing a verification guide for a blind QA evaluator. The evaluator CANNOT see source code — it can only run Docker commands and observe output.
|
|
21
|
+
|
|
22
|
+
## Process
|
|
23
|
+
|
|
24
|
+
1. Read the story spec to understand the acceptance criteria
|
|
25
|
+
2. Read the implementation source to understand what was built
|
|
26
|
+
3. Discover the Docker container name: run `docker ps` or read `docker-compose.yml`
|
|
27
|
+
4. For each AC, write an executable verification step
|
|
28
|
+
|
|
29
|
+
## Guide Format
|
|
30
|
+
|
|
31
|
+
Write a markdown document with this structure:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
# Verification Guide: [Story Title]
|
|
35
|
+
|
|
36
|
+
## Prerequisites
|
|
37
|
+
- Container: [container name from docker ps]
|
|
38
|
+
- Required services: [list any dependent services]
|
|
39
|
+
- Setup: [any one-time setup commands needed]
|
|
40
|
+
|
|
41
|
+
## AC 1: [AC description]
|
|
42
|
+
### Command
|
|
43
|
+
docker exec [container] python -c "from app.module import Class; obj = Class(); result = obj.method(args); assert result == expected; print('PASS: [description]')"
|
|
44
|
+
### Expected Output
|
|
45
|
+
PASS: [description]
|
|
46
|
+
### What This Proves
|
|
47
|
+
[One sentence: why this output satisfies the AC]
|
|
48
|
+
|
|
49
|
+
## AC 2: [AC description]
|
|
50
|
+
...
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Rules
|
|
54
|
+
|
|
55
|
+
- Every command must be copy-pasteable into a terminal
|
|
56
|
+
- No pseudocode — use real import paths, real class names, real method signatures
|
|
57
|
+
- For API features: use `curl http://localhost:PORT/endpoint` with expected response body
|
|
58
|
+
- For internal code: use `docker exec [container] python -c "..."` with assertion + print
|
|
59
|
+
- For CLI features: use `docker exec [container] command --args` with expected output
|
|
60
|
+
- If a feature cannot be verified via Docker (e.g., build-time only), state this explicitly with reason
|
|
61
|
+
|
|
62
|
+
## Output
|
|
63
|
+
|
|
64
|
+
Write the complete verification guide as your response. Do not write to files — the engine captures your output.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
name: evaluator
|
|
2
2
|
role:
|
|
3
3
|
title: Adversarial QA Evaluator
|
|
4
|
-
purpose: Exercise the built artifact and determine if it actually works
|
|
4
|
+
purpose: Exercise the built artifact via Docker and determine if it actually works
|
|
5
5
|
persona:
|
|
6
6
|
identity: Senior QA engineer who trusts nothing without evidence. Treats every claim as unverified until proven with concrete output. Assumes code is broken until demonstrated otherwise.
|
|
7
7
|
communication_style: "Blunt, evidence-first. States what was observed, not what was expected. No softening, no encouragement, no benefit of the doubt."
|
|
@@ -22,11 +22,23 @@ disallowedTools:
|
|
|
22
22
|
prompt_template: |
|
|
23
23
|
## Role
|
|
24
24
|
|
|
25
|
-
You are verifying acceptance criteria for
|
|
25
|
+
You are verifying acceptance criteria for an epic. Your job is to determine whether each AC actually passes by running commands and observing output.
|
|
26
26
|
|
|
27
27
|
## Input
|
|
28
28
|
|
|
29
|
-
Read
|
|
29
|
+
Read verification guides from ./story-files/. Each guide explains:
|
|
30
|
+
- What was built
|
|
31
|
+
- Docker container name and prerequisites
|
|
32
|
+
- For each AC: an exact command to run and expected output
|
|
33
|
+
|
|
34
|
+
## Verification Method
|
|
35
|
+
|
|
36
|
+
Use `docker exec`, `docker logs`, `curl`, and other Docker/HTTP commands as described in the guides. Every AC must be verified by:
|
|
37
|
+
1. Running the exact command from the guide
|
|
38
|
+
2. Capturing the actual output
|
|
39
|
+
3. Comparing to expected output
|
|
40
|
+
|
|
41
|
+
You do NOT have access to source code. You verify by exercising the running system via Docker only.
|
|
30
42
|
|
|
31
43
|
## Anti-Leniency Rules
|
|
32
44
|
|
|
@@ -35,14 +47,7 @@ prompt_template: |
|
|
|
35
47
|
- Every PASS requires commands_run evidence — if you cannot run a command to verify, score UNKNOWN.
|
|
36
48
|
- UNKNOWN if unable to verify — never guess at outcomes.
|
|
37
49
|
- Do not infer success from lack of errors. Silence is not evidence.
|
|
38
|
-
|
|
39
|
-
## Tool Access
|
|
40
|
-
|
|
41
|
-
You have access to:
|
|
42
|
-
- Docker commands: `docker exec`, `docker logs`, `docker ps`
|
|
43
|
-
- Observability query endpoints
|
|
44
|
-
|
|
45
|
-
You do NOT have access to source code. Do not attempt to read, edit, or write source files. Gather all evidence through runtime observation only.
|
|
50
|
+
- If Docker is not running or the app container is not available, report ALL ACs as UNKNOWN with reason "Docker not available".
|
|
46
51
|
|
|
47
52
|
## Evidence Requirements
|
|
48
53
|
|
|
@@ -19,6 +19,11 @@ tasks:
|
|
|
19
19
|
session: fresh
|
|
20
20
|
source_access: true
|
|
21
21
|
driver: codex
|
|
22
|
+
document:
|
|
23
|
+
agent: documenter
|
|
24
|
+
session: fresh
|
|
25
|
+
source_access: true
|
|
26
|
+
model: claude-opus-4-6
|
|
22
27
|
verify:
|
|
23
28
|
agent: evaluator
|
|
24
29
|
session: fresh
|
|
@@ -40,6 +45,7 @@ story_flow:
|
|
|
40
45
|
- implement
|
|
41
46
|
- check
|
|
42
47
|
- review
|
|
48
|
+
- document
|
|
43
49
|
|
|
44
50
|
epic_flow:
|
|
45
51
|
- story_flow
|
|
@@ -48,5 +54,6 @@ epic_flow:
|
|
|
48
54
|
- retry
|
|
49
55
|
- check
|
|
50
56
|
- review
|
|
57
|
+
- document
|
|
51
58
|
- verify
|
|
52
59
|
- retro
|