codeharness 0.33.1 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-537B2B6W.js → chunk-QMWMRFGH.js} +1 -1
- package/dist/{docker-ZMY7GX5P.js → docker-QYSLNCQ2.js} +1 -1
- package/dist/index.js +90 -15
- package/package.json +1 -1
- package/templates/agents/deployer.yaml +60 -0
- package/templates/agents/documenter.yaml +44 -41
- package/templates/agents/evaluator.yaml +45 -41
- package/templates/agents/negotiator.yaml +44 -0
- package/templates/agents/reviewer.yaml +13 -0
- package/templates/workflows/default.yaml +20 -2
|
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
|
|
|
2895
2895
|
}
|
|
2896
2896
|
|
|
2897
2897
|
// src/modules/infra/init-project.ts
|
|
2898
|
-
var HARNESS_VERSION = true ? "0.33.1" : "0.0.0-dev";
|
|
2898
|
+
var HARNESS_VERSION = true ? "0.34.0" : "0.0.0-dev";
|
|
2899
2899
|
function failResult(opts, error) {
|
|
2900
2900
|
return {
|
|
2901
2901
|
status: "fail",
|
package/dist/index.js
CHANGED
|
@@ -40,7 +40,7 @@ import {
|
|
|
40
40
|
validateDockerfile,
|
|
41
41
|
warn,
|
|
42
42
|
writeState
|
|
43
|
-
} from "./chunk-537B2B6W.js";
|
|
43
|
+
} from "./chunk-QMWMRFGH.js";
|
|
44
44
|
|
|
45
45
|
// src/index.ts
|
|
46
46
|
import { Command } from "commander";
|
|
@@ -3067,6 +3067,19 @@ function parseVerdict(output) {
|
|
|
3067
3067
|
}
|
|
3068
3068
|
return verdict;
|
|
3069
3069
|
}
|
|
3070
|
+
/**
 * Extract a minimal `{"verdict": "pass" | "fail"}` object from free-form agent output.
 *
 * Only flat JSON objects (no nested braces) that contain a "verdict" key with the
 * value "pass" or "fail" are considered. Agents are instructed to emit the verdict
 * at the END of their output, so when several candidates are present the LAST one
 * that parses as valid JSON wins; an earlier echoed example must not shadow the
 * final verdict. Candidates that fail to parse are skipped.
 *
 * @param {string} output - Raw text produced by the agent.
 * @returns {{ verdict: "pass" | "fail" } | null} The verdict, or null when no
 *   parseable verdict object is found.
 */
function parseSimpleVerdict(output) {
  // Global flag so matchAll can enumerate every flat verdict-like object.
  const jsonPattern = /\{[^{}]*"verdict"\s*:\s*"(pass|fail)"[^{}]*\}/g;
  // String() mirrors the coercion RegExp.prototype.exec applies to non-string input.
  const candidates = [...String(output).matchAll(jsonPattern)];
  for (let i = candidates.length - 1; i >= 0; i--) {
    try {
      const parsed = JSON.parse(candidates[i][0]);
      if (parsed.verdict === "pass" || parsed.verdict === "fail") {
        return { verdict: parsed.verdict };
      }
    } catch {
      // Regex matched but the text is not valid JSON; fall back to an earlier candidate.
    }
  }
  return null;
}
|
|
3070
3083
|
|
|
3071
3084
|
// src/lib/circuit-breaker.ts
|
|
3072
3085
|
function evaluateProgress(scores) {
|
|
@@ -3370,7 +3383,27 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
|
|
|
3370
3383
|
cwd = projectDir;
|
|
3371
3384
|
}
|
|
3372
3385
|
const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
|
|
3373
|
-
const
|
|
3386
|
+
const TASK_PROMPTS = {
|
|
3387
|
+
"create-story": (key) => `Create the story spec for ${key}. Read the epic definitions and architecture docs, then write a complete story file with acceptance criteria, tasks, and dev notes.`,
|
|
3388
|
+
"negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Output a verdict JSON.`,
|
|
3389
|
+
"implement": (key) => `Implement story ${key}`,
|
|
3390
|
+
"check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Report pass/fail results.`,
|
|
3391
|
+
"review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Output a verdict JSON at the end.`,
|
|
3392
|
+
"document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code \u2014 describe features, UI pages, API endpoints, CLI commands, and expected behavior.`,
|
|
3393
|
+
"deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Output a deploy report JSON with container names, URLs, credentials, and health status.`,
|
|
3394
|
+
"verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps from the documentation, connect using deploy info, run commands, and observe output. Also score subjective quality on 4 dimensions.`,
|
|
3395
|
+
"retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
|
|
3396
|
+
};
|
|
3397
|
+
let basePrompt;
|
|
3398
|
+
if (customPrompt) {
|
|
3399
|
+
basePrompt = customPrompt;
|
|
3400
|
+
} else if (isEpicSentinel && TASK_PROMPTS[taskName]) {
|
|
3401
|
+
basePrompt = TASK_PROMPTS[taskName](storyKey);
|
|
3402
|
+
} else if (TASK_PROMPTS[taskName]) {
|
|
3403
|
+
basePrompt = TASK_PROMPTS[taskName](storyKey);
|
|
3404
|
+
} else {
|
|
3405
|
+
basePrompt = `Execute task "${taskName}" for story ${storyKey}`;
|
|
3406
|
+
}
|
|
3374
3407
|
let prompt = buildPromptWithContractContext(basePrompt, previousOutputContract ?? null);
|
|
3375
3408
|
const coverageDedup = buildCoverageDeduplicationContext(
|
|
3376
3409
|
previousOutputContract ?? null,
|
|
@@ -3747,6 +3780,16 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
|
|
|
3747
3780
|
}
|
|
3748
3781
|
}
|
|
3749
3782
|
}
|
|
3783
|
+
if (!verdict) {
|
|
3784
|
+
const simple = parseSimpleVerdict(dispatchResult.output);
|
|
3785
|
+
if (simple) {
|
|
3786
|
+
verdict = {
|
|
3787
|
+
verdict: simple.verdict,
|
|
3788
|
+
score: { passed: simple.verdict === "pass" ? 1 : 0, failed: simple.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
|
|
3789
|
+
findings: []
|
|
3790
|
+
};
|
|
3791
|
+
}
|
|
3792
|
+
}
|
|
3750
3793
|
lastVerdict = verdict;
|
|
3751
3794
|
if (verdict) {
|
|
3752
3795
|
const score = {
|
|
@@ -3964,6 +4007,25 @@ async function executeWorkflow(config) {
|
|
|
3964
4007
|
halted = true;
|
|
3965
4008
|
break;
|
|
3966
4009
|
}
|
|
4010
|
+
if (isLoopBlock(storyStep)) {
|
|
4011
|
+
const loopResult = await executeLoopBlock(
|
|
4012
|
+
storyStep,
|
|
4013
|
+
state,
|
|
4014
|
+
config,
|
|
4015
|
+
[item],
|
|
4016
|
+
lastOutputContract,
|
|
4017
|
+
storyFlowTasks
|
|
4018
|
+
);
|
|
4019
|
+
state = loopResult.state;
|
|
4020
|
+
errors.push(...loopResult.errors);
|
|
4021
|
+
tasksCompleted += loopResult.tasksCompleted;
|
|
4022
|
+
lastOutputContract = loopResult.lastContract;
|
|
4023
|
+
if (loopResult.halted || state.phase === "max-iterations" || state.phase === "circuit-breaker") {
|
|
4024
|
+
halted = true;
|
|
4025
|
+
break;
|
|
4026
|
+
}
|
|
4027
|
+
continue;
|
|
4028
|
+
}
|
|
3967
4029
|
if (typeof storyStep !== "string") continue;
|
|
3968
4030
|
const taskName2 = storyStep;
|
|
3969
4031
|
const task2 = config.workflow.tasks[taskName2];
|
|
@@ -4046,6 +4108,15 @@ async function executeWorkflow(config) {
|
|
|
4046
4108
|
}
|
|
4047
4109
|
}
|
|
4048
4110
|
}
|
|
4111
|
+
const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
|
|
4112
|
+
if (existsSync15(deployContractPath)) {
|
|
4113
|
+
const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
|
|
4114
|
+
if (deployData.output) {
|
|
4115
|
+
const deployPath = join12(guidesDir, "deploy-info.md");
|
|
4116
|
+
writeFileSync8(deployPath, deployData.output, "utf-8");
|
|
4117
|
+
guideFiles.push(deployPath);
|
|
4118
|
+
}
|
|
4119
|
+
}
|
|
4049
4120
|
} catch {
|
|
4050
4121
|
}
|
|
4051
4122
|
}
|
|
@@ -5500,20 +5571,19 @@ function Header({ info: info3, laneCount }) {
|
|
|
5500
5571
|
] });
|
|
5501
5572
|
}
|
|
5502
5573
|
function ProgressBar({ done, total, inProgress }) {
|
|
5503
|
-
const width = Math.max(10, (process.stdout.columns || 80) - 40);
|
|
5504
5574
|
const ip = inProgress ?? 0;
|
|
5505
|
-
const
|
|
5506
|
-
|
|
5507
|
-
|
|
5508
|
-
const
|
|
5509
|
-
const
|
|
5510
|
-
const
|
|
5511
|
-
const
|
|
5575
|
+
const labelParts = [];
|
|
5576
|
+
if (done > 0) labelParts.push(`${done}\u2713`);
|
|
5577
|
+
if (ip > 0) labelParts.push(`${ip}\u26A1`);
|
|
5578
|
+
const label = `${labelParts.join(" ")} / ${total}`;
|
|
5579
|
+
const barWidth = Math.max(8, (process.stdout.columns || 80) - label.length - 4);
|
|
5580
|
+
const doneFilled = total > 0 ? Math.round(barWidth * done / total) : 0;
|
|
5581
|
+
const ipFilled = total > 0 ? Math.round(barWidth * ip / total) : 0;
|
|
5582
|
+
const empty = Math.max(0, barWidth - doneFilled - ipFilled);
|
|
5512
5583
|
return /* @__PURE__ */ jsxs8(Text8, { children: [
|
|
5513
|
-
"Progress: ",
|
|
5514
5584
|
/* @__PURE__ */ jsx8(Text8, { color: "green", children: "\u2588".repeat(doneFilled) }),
|
|
5515
5585
|
/* @__PURE__ */ jsx8(Text8, { color: "yellow", children: "\u2588".repeat(ipFilled) }),
|
|
5516
|
-
/* @__PURE__ */ jsx8(Text8, { children: "\u2591".repeat(empty) }),
|
|
5586
|
+
/* @__PURE__ */ jsx8(Text8, { dimColor: true, children: "\u2591".repeat(empty) }),
|
|
5517
5587
|
` ${label}`
|
|
5518
5588
|
] });
|
|
5519
5589
|
}
|
|
@@ -6300,6 +6370,11 @@ function registerRunCommand(program) {
|
|
|
6300
6370
|
totalCost: totalCostUsd
|
|
6301
6371
|
});
|
|
6302
6372
|
if (event.taskName === "verify" && event.storyKey.startsWith("__epic_")) {
|
|
6373
|
+
renderer.addMessage({
|
|
6374
|
+
type: "ok",
|
|
6375
|
+
key: event.storyKey.replace("__epic_", "Epic ").replace("__", ""),
|
|
6376
|
+
message: `verification complete (cost: $${(event.costUsd ?? 0).toFixed(2)})`
|
|
6377
|
+
});
|
|
6303
6378
|
const epicId = event.storyKey.replace("__epic_", "").replace("__", "");
|
|
6304
6379
|
for (let i = 0; i < storyEntries.length; i++) {
|
|
6305
6380
|
const se = storyEntries[i];
|
|
@@ -11110,7 +11185,7 @@ function registerTeardownCommand(program) {
|
|
|
11110
11185
|
} else if (otlpMode === "remote-routed") {
|
|
11111
11186
|
if (!options.keepDocker) {
|
|
11112
11187
|
try {
|
|
11113
|
-
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-ZMY7GX5P.js");
|
|
11188
|
+
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-QYSLNCQ2.js");
|
|
11114
11189
|
stopCollectorOnly2();
|
|
11115
11190
|
result.docker.stopped = true;
|
|
11116
11191
|
if (!isJson) {
|
|
@@ -11142,7 +11217,7 @@ function registerTeardownCommand(program) {
|
|
|
11142
11217
|
info("Shared stack: kept running (other projects may use it)");
|
|
11143
11218
|
}
|
|
11144
11219
|
} else if (isLegacyStack) {
|
|
11145
|
-
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-ZMY7GX5P.js");
|
|
11220
|
+
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-QYSLNCQ2.js");
|
|
11146
11221
|
let stackRunning = false;
|
|
11147
11222
|
try {
|
|
11148
11223
|
stackRunning = isStackRunning2(composeFile);
|
|
@@ -14129,7 +14204,7 @@ function registerDriversCommand(program) {
|
|
|
14129
14204
|
}
|
|
14130
14205
|
|
|
14131
14206
|
// src/index.ts
|
|
14132
|
-
var VERSION = true ? "0.33.1" : "0.0.0-dev";
|
|
14207
|
+
var VERSION = true ? "0.34.0" : "0.0.0-dev";
|
|
14133
14208
|
function createProgram() {
|
|
14134
14209
|
const program = new Command();
|
|
14135
14210
|
program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
|
package/package.json
CHANGED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: deployer
|
|
2
|
+
role:
|
|
3
|
+
title: Environment Provisioner
|
|
4
|
+
purpose: Build, start, and verify Docker containers for the project and report connection info
|
|
5
|
+
persona:
|
|
6
|
+
identity: |
|
|
7
|
+
DevOps engineer who provisions running environments. Reads Docker configs,
|
|
8
|
+
starts containers, waits for health checks, and reports connection details.
|
|
9
|
+
Idempotent — safe to re-run on already-running containers.
|
|
10
|
+
communication_style: "Operational, status-focused. Reports container state, URLs, health, credentials."
|
|
11
|
+
principles:
|
|
12
|
+
- Check for existing running containers before starting new ones
|
|
13
|
+
- Always verify health before reporting success
|
|
14
|
+
- Report ALL connection details needed by downstream tasks
|
|
15
|
+
- Handle missing Docker config gracefully with structured error output
|
|
16
|
+
prompt_template: |
|
|
17
|
+
## Role
|
|
18
|
+
|
|
19
|
+
You are provisioning a running environment for this project so that a QA evaluator can verify functionality.
|
|
20
|
+
|
|
21
|
+
## Process
|
|
22
|
+
|
|
23
|
+
1. Check for Docker configuration:
|
|
24
|
+
- Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
|
|
25
|
+
- If NONE found, output: `{"status": "no-docker", "message": "No Docker configuration found in project"}`
|
|
26
|
+
- STOP here if no Docker config exists
|
|
27
|
+
|
|
28
|
+
2. Check for already-running containers:
|
|
29
|
+
- Run `docker ps` and check if project containers are already up
|
|
30
|
+
- If running and healthy, skip to step 4 (report existing state)
|
|
31
|
+
|
|
32
|
+
3. Start containers:
|
|
33
|
+
- Run `docker compose up -d` (or `docker build` + `docker run` if no compose)
|
|
34
|
+
- Wait for containers to start (max 60 seconds)
|
|
35
|
+
- Check health endpoints if defined
|
|
36
|
+
|
|
37
|
+
4. Output deploy report as JSON:
|
|
38
|
+
```json
|
|
39
|
+
{
|
|
40
|
+
"status": "running",
|
|
41
|
+
"containers": [
|
|
42
|
+
{"name": "container-name", "image": "image:tag", "status": "healthy", "ports": ["8000:8000"]}
|
|
43
|
+
],
|
|
44
|
+
"urls": {
|
|
45
|
+
"api": "http://localhost:8000",
|
|
46
|
+
"web": "http://localhost:3000"
|
|
47
|
+
},
|
|
48
|
+
"credentials": {
|
|
49
|
+
"db_url": "postgresql://...",
|
|
50
|
+
"api_key": "..."
|
|
51
|
+
},
|
|
52
|
+
"health": "healthy"
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Important
|
|
57
|
+
|
|
58
|
+
- Be idempotent — don't restart containers that are already running and healthy
|
|
59
|
+
- Include ALL URLs, ports, and credentials the evaluator might need
|
|
60
|
+
- If health checks fail, report `"health": "degraded"` with details
|
|
@@ -1,64 +1,67 @@
|
|
|
1
1
|
name: documenter
|
|
2
2
|
role:
|
|
3
|
-
title:
|
|
4
|
-
purpose:
|
|
3
|
+
title: User Documentation Writer
|
|
4
|
+
purpose: Write user-facing documentation explaining how to use what was built
|
|
5
5
|
persona:
|
|
6
6
|
identity: |
|
|
7
|
-
Technical writer who translates
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
communication_style: "
|
|
7
|
+
Technical writer who translates implementations into user guides.
|
|
8
|
+
Describes features from the user's perspective — what it does, where to find it,
|
|
9
|
+
how to use it. Never exposes source code or implementation details.
|
|
10
|
+
communication_style: "Clear, user-oriented. Step-by-step instructions with expected behavior."
|
|
11
11
|
principles:
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
- Include
|
|
15
|
-
-
|
|
16
|
-
-
|
|
12
|
+
- Write for users, not developers
|
|
13
|
+
- Describe WHAT the feature does and HOW to use it
|
|
14
|
+
- Include where to find it (UI page, API endpoint, CLI command, import path)
|
|
15
|
+
- Describe inputs, expected outputs, and observable behavior
|
|
16
|
+
- Never include source code listings or implementation details
|
|
17
17
|
prompt_template: |
|
|
18
18
|
## Role
|
|
19
19
|
|
|
20
|
-
You are writing
|
|
20
|
+
You are writing user documentation for a feature that was just implemented.
|
|
21
|
+
The documentation will be read by a QA evaluator to understand what was built
|
|
22
|
+
and how to interact with it.
|
|
21
23
|
|
|
22
24
|
## Process
|
|
23
25
|
|
|
24
26
|
1. Read the story spec to understand the acceptance criteria
|
|
25
|
-
2. Read the implementation
|
|
26
|
-
3.
|
|
27
|
-
4. For each AC, write an executable verification step
|
|
27
|
+
2. Read the implementation to understand what was actually built
|
|
28
|
+
3. Write documentation from a USER's perspective
|
|
28
29
|
|
|
29
|
-
##
|
|
30
|
+
## Documentation Format
|
|
30
31
|
|
|
31
|
-
|
|
32
|
+
```markdown
|
|
33
|
+
# [Feature Name]
|
|
32
34
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
+
## What It Does
|
|
36
|
+
[1-2 sentence description of the feature's purpose]
|
|
37
|
+
|
|
38
|
+
## Where to Find It
|
|
39
|
+
- API endpoint: [URL and method, if applicable]
|
|
40
|
+
- UI page: [URL or navigation path, if applicable]
|
|
41
|
+
- CLI command: [command, if applicable]
|
|
42
|
+
- Python import: [import path, if applicable]
|
|
35
43
|
|
|
36
|
-
##
|
|
37
|
-
- Container: [container name from docker ps]
|
|
38
|
-
- Required services: [list any dependent services]
|
|
39
|
-
- Setup: [any one-time setup commands needed]
|
|
44
|
+
## How to Use It
|
|
40
45
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
PASS: [description]
|
|
46
|
-
### What This Proves
|
|
47
|
-
[One sentence: why this output satisfies the AC]
|
|
46
|
+
### Step 1: [First action]
|
|
47
|
+
[Description of what to do]
|
|
48
|
+
- Input: [what to provide]
|
|
49
|
+
- Expected result: [what happens]
|
|
48
50
|
|
|
49
|
-
|
|
51
|
+
### Step 2: [Next action]
|
|
50
52
|
...
|
|
53
|
+
|
|
54
|
+
## Expected Behavior
|
|
55
|
+
- [Observable behavior 1]
|
|
56
|
+
- [Observable behavior 2]
|
|
57
|
+
- [Error behavior: what happens on invalid input]
|
|
51
58
|
```
|
|
52
59
|
|
|
53
60
|
## Rules
|
|
54
61
|
|
|
55
|
-
-
|
|
56
|
-
-
|
|
57
|
-
-
|
|
58
|
-
-
|
|
59
|
-
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
## Output
|
|
63
|
-
|
|
64
|
-
Write the complete verification guide as your response. Do not write to files — the engine captures your output.
|
|
62
|
+
- Write for someone who has never seen the code
|
|
63
|
+
- Every feature must have a "Where to Find It" section
|
|
64
|
+
- Every feature must have at least one "How to Use It" step
|
|
65
|
+
- Describe observable behavior, not internal logic
|
|
66
|
+
- If a feature has no external interface (internal-only), describe how it affects
|
|
67
|
+
other features that DO have external interfaces
|
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
name: evaluator
|
|
2
2
|
role:
|
|
3
3
|
title: Adversarial QA Evaluator
|
|
4
|
-
purpose:
|
|
4
|
+
purpose: Verify acceptance criteria via Docker and assess subjective quality
|
|
5
5
|
persona:
|
|
6
|
-
identity:
|
|
7
|
-
|
|
6
|
+
identity: |
|
|
7
|
+
Senior QA engineer who trusts nothing without evidence. Reads user documentation
|
|
8
|
+
and deploy info, then derives verification steps independently. Proves each AC
|
|
9
|
+
by running commands and observing output. Also assesses subjective quality.
|
|
10
|
+
communication_style: "Blunt, evidence-first. States what was observed, not what was expected."
|
|
8
11
|
principles:
|
|
9
12
|
- Never give the benefit of the doubt - assume failure until proven otherwise
|
|
10
13
|
- Every PASS requires evidence - commands run and output captured
|
|
11
14
|
- UNKNOWN if unable to verify - never guess at outcomes
|
|
12
|
-
-
|
|
13
|
-
-
|
|
15
|
+
- Derive verification steps from user docs - don't expect pre-written commands
|
|
16
|
+
- Quality assessment uses calibrated rubric, not gut feeling
|
|
14
17
|
personality:
|
|
15
18
|
traits:
|
|
16
19
|
rigor: 0.98
|
|
@@ -22,45 +25,44 @@ disallowedTools:
|
|
|
22
25
|
prompt_template: |
|
|
23
26
|
## Role
|
|
24
27
|
|
|
25
|
-
You are verifying
|
|
28
|
+
You are verifying an epic's acceptance criteria and assessing implementation quality.
|
|
29
|
+
You have NO access to source code. You verify by exercising the running system.
|
|
26
30
|
|
|
27
31
|
## Input
|
|
28
32
|
|
|
29
|
-
Read
|
|
30
|
-
-
|
|
31
|
-
-
|
|
32
|
-
- For each AC: an exact command to run and expected output
|
|
33
|
+
Read from ./story-files/:
|
|
34
|
+
- **User documentation** (one per story) — describes what was built and how to use it
|
|
35
|
+
- **Deploy report** (deploy-info.md) — container names, URLs, credentials, health status
|
|
33
36
|
|
|
34
|
-
## Verification
|
|
37
|
+
## Part 1: AC Verification
|
|
35
38
|
|
|
36
|
-
|
|
37
|
-
1.
|
|
38
|
-
2.
|
|
39
|
-
3.
|
|
39
|
+
For each story's ACs:
|
|
40
|
+
1. Read the user documentation to understand the feature
|
|
41
|
+
2. Use the deploy info to connect to the running system
|
|
42
|
+
3. Derive your OWN verification steps from the documentation
|
|
43
|
+
4. Run commands: `docker exec`, `curl`, `docker logs`, or other tools
|
|
44
|
+
5. Observe output and compare to expected behavior from the docs
|
|
40
45
|
|
|
41
|
-
|
|
46
|
+
If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
|
|
42
47
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
-
|
|
46
|
-
-
|
|
47
|
-
- Every PASS requires commands_run evidence — if you cannot run a command to verify, score UNKNOWN.
|
|
48
|
-
- UNKNOWN if unable to verify — never guess at outcomes.
|
|
48
|
+
### Anti-Leniency Rules
|
|
49
|
+
- Assume code is broken until demonstrated otherwise
|
|
50
|
+
- Every PASS requires commands_run evidence
|
|
51
|
+
- UNKNOWN if unable to verify — never guess
|
|
49
52
|
- Do not infer success from lack of errors. Silence is not evidence.
|
|
50
|
-
- If Docker is not running or the app container is not available, report ALL ACs as UNKNOWN with reason "Docker not available".
|
|
51
53
|
|
|
52
|
-
##
|
|
54
|
+
## Part 2: Subjective Quality Assessment
|
|
53
55
|
|
|
54
|
-
|
|
55
|
-
- `commands_run`: the exact commands you executed
|
|
56
|
-
- `output_observed`: the actual terminal output you received
|
|
57
|
-
- `reasoning`: why this output proves the AC passes
|
|
56
|
+
Score the implementation on 4 dimensions (1-5):
|
|
58
57
|
|
|
59
|
-
|
|
58
|
+
1. **Architecture** (1=broken, 2=fragile, 3=adequate, 4=well-designed, 5=elegant)
|
|
59
|
+
2. **Originality** (1=copy-paste, 2=minor tweaks, 3=reasonable, 4=thoughtful, 5=innovative)
|
|
60
|
+
3. **Craft** (1=no error handling, 2=basic, 3=adequate, 4=thorough, 5=production-grade)
|
|
61
|
+
4. **Functionality** (1=unusable, 2=confusing, 3=works with effort, 4=intuitive, 5=delightful)
|
|
60
62
|
|
|
61
|
-
|
|
63
|
+
Base your scores on what you observe through the running system, not assumptions.
|
|
62
64
|
|
|
63
|
-
Output
|
|
65
|
+
## Output Format
|
|
64
66
|
|
|
65
67
|
```json
|
|
66
68
|
{
|
|
@@ -77,21 +79,23 @@ prompt_template: |
|
|
|
77
79
|
"description": "<AC description>",
|
|
78
80
|
"status": "pass" | "fail" | "unknown",
|
|
79
81
|
"evidence": {
|
|
80
|
-
"commands_run": ["<
|
|
81
|
-
"output_observed": "<
|
|
82
|
-
"reasoning": "<why
|
|
82
|
+
"commands_run": ["<command>"],
|
|
83
|
+
"output_observed": "<output>",
|
|
84
|
+
"reasoning": "<why>"
|
|
83
85
|
}
|
|
84
86
|
}
|
|
85
|
-
]
|
|
87
|
+
],
|
|
88
|
+
"quality_scores": {
|
|
89
|
+
"architecture": <1-5>,
|
|
90
|
+
"originality": <1-5>,
|
|
91
|
+
"craft": <1-5>,
|
|
92
|
+
"functionality": <1-5>
|
|
93
|
+
}
|
|
86
94
|
}
|
|
87
95
|
```
|
|
88
96
|
|
|
89
|
-
|
|
97
|
+
Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
|
|
90
98
|
|
|
91
99
|
## Output Location
|
|
92
100
|
|
|
93
|
-
Write
|
|
94
|
-
|
|
95
|
-
## Re-Verification
|
|
96
|
-
|
|
97
|
-
Re-verify everything from scratch. Do not assume prior results. Do not cache. Every run is independent.
|
|
101
|
+
Write verdict JSON to ./verdict/verdict.json
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: negotiator
|
|
2
|
+
role:
|
|
3
|
+
title: AC Testability Reviewer
|
|
4
|
+
purpose: Review acceptance criteria for blind testability before implementation begins
|
|
5
|
+
persona:
|
|
6
|
+
identity: |
|
|
7
|
+
QA architect who reviews ACs before any code is written. Ensures every AC
|
|
8
|
+
can be verified by a blind evaluator with only Docker access and user docs.
|
|
9
|
+
Rejects untestable, vague, or implementation-dependent ACs.
|
|
10
|
+
communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
|
|
11
|
+
principles:
|
|
12
|
+
- Every AC must be verifiable without reading source code
|
|
13
|
+
- Verification must be possible through Docker commands, API calls, or UI interaction
|
|
14
|
+
- Vague ACs like "system handles errors gracefully" must be rewritten with specific observable behavior
|
|
15
|
+
- If an AC requires reading source to verify, it fails testability
|
|
16
|
+
prompt_template: |
|
|
17
|
+
## Role
|
|
18
|
+
|
|
19
|
+
You are reviewing acceptance criteria for testability BEFORE implementation begins.
|
|
20
|
+
Your job: ensure every AC can be verified by a blind QA agent who has only Docker access and user documentation.
|
|
21
|
+
|
|
22
|
+
## Process
|
|
23
|
+
|
|
24
|
+
1. Read the story spec (provided via previous task context)
|
|
25
|
+
2. For each AC, assess: Can a QA agent verify this using ONLY:
|
|
26
|
+
- Docker commands (docker exec, docker logs)
|
|
27
|
+
- HTTP requests (curl, API calls)
|
|
28
|
+
- UI interaction (browser, pages)
|
|
29
|
+
- Observable output (logs, responses, behavior)
|
|
30
|
+
3. If an AC requires reading source code, inspecting file contents, or understanding implementation details to verify — it FAILS testability
|
|
31
|
+
|
|
32
|
+
## Output
|
|
33
|
+
|
|
34
|
+
If ALL ACs are testable, output:
|
|
35
|
+
```json
|
|
36
|
+
{"verdict": "pass"}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
If ANY AC fails testability, output:
|
|
40
|
+
```json
|
|
41
|
+
{"verdict": "fail", "issues": ["AC N: [reason it's untestable]. Suggested rewrite: [specific rewrite]"]}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Be specific. Don't just say "untestable" — explain WHY and provide a concrete rewrite.
|
|
@@ -71,6 +71,19 @@ prompt_template: |
|
|
|
71
71
|
|
|
72
72
|
Verdict is "pass" only if `blocking` is empty and all ACs are "covered".
|
|
73
73
|
|
|
74
|
+
## Verdict
|
|
75
|
+
|
|
76
|
+
At the END of your review output, include a verdict JSON on its own line:
|
|
77
|
+
```json
|
|
78
|
+
{"verdict": "pass"}
|
|
79
|
+
```
|
|
80
|
+
or if there are blocking issues:
|
|
81
|
+
```json
|
|
82
|
+
{"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
This verdict determines whether the implementation proceeds or requires fixes.
|
|
86
|
+
|
|
74
87
|
## Output Location
|
|
75
88
|
|
|
76
89
|
Write your review JSON to ./verdict/review.json
|
|
@@ -4,6 +4,11 @@ tasks:
|
|
|
4
4
|
session: fresh
|
|
5
5
|
source_access: true
|
|
6
6
|
model: claude-opus-4-6
|
|
7
|
+
negotiate-acs:
|
|
8
|
+
agent: negotiator
|
|
9
|
+
session: fresh
|
|
10
|
+
source_access: true
|
|
11
|
+
model: claude-sonnet-4-6
|
|
7
12
|
implement:
|
|
8
13
|
agent: dev
|
|
9
14
|
session: fresh
|
|
@@ -24,6 +29,11 @@ tasks:
|
|
|
24
29
|
session: fresh
|
|
25
30
|
source_access: true
|
|
26
31
|
model: claude-opus-4-6
|
|
32
|
+
deploy:
|
|
33
|
+
agent: deployer
|
|
34
|
+
session: fresh
|
|
35
|
+
source_access: true
|
|
36
|
+
model: claude-sonnet-4-6
|
|
27
37
|
verify:
|
|
28
38
|
agent: evaluator
|
|
29
39
|
session: fresh
|
|
@@ -42,18 +52,26 @@ tasks:
|
|
|
42
52
|
|
|
43
53
|
story_flow:
|
|
44
54
|
- create-story
|
|
55
|
+
- negotiate-acs
|
|
56
|
+
- loop:
|
|
57
|
+
- create-story
|
|
58
|
+
- negotiate-acs
|
|
45
59
|
- implement
|
|
46
60
|
- check
|
|
47
61
|
- review
|
|
62
|
+
- loop:
|
|
63
|
+
- retry
|
|
64
|
+
- check
|
|
65
|
+
- review
|
|
48
66
|
- document
|
|
49
67
|
|
|
50
68
|
epic_flow:
|
|
51
69
|
- story_flow
|
|
70
|
+
- deploy
|
|
52
71
|
- verify
|
|
53
72
|
- loop:
|
|
54
73
|
- retry
|
|
55
|
-
- check
|
|
56
|
-
- review
|
|
57
74
|
- document
|
|
75
|
+
- deploy
|
|
58
76
|
- verify
|
|
59
77
|
- retro
|