codeharness 0.33.1 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
2895
2895
  }
2896
2896
 
2897
2897
  // src/modules/infra/init-project.ts
2898
- var HARNESS_VERSION = true ? "0.33.1" : "0.0.0-dev";
2898
+ var HARNESS_VERSION = true ? "0.34.0" : "0.0.0-dev";
2899
2899
  function failResult(opts, error) {
2900
2900
  return {
2901
2901
  status: "fail",
@@ -16,7 +16,7 @@ import {
16
16
  stopCollectorOnly,
17
17
  stopSharedStack,
18
18
  stopStack
19
- } from "./chunk-537B2B6W.js";
19
+ } from "./chunk-QMWMRFGH.js";
20
20
  export {
21
21
  checkRemoteEndpoint,
22
22
  cleanupOrphanedContainers,
package/dist/index.js CHANGED
@@ -40,7 +40,7 @@ import {
40
40
  validateDockerfile,
41
41
  warn,
42
42
  writeState
43
- } from "./chunk-537B2B6W.js";
43
+ } from "./chunk-QMWMRFGH.js";
44
44
 
45
45
  // src/index.ts
46
46
  import { Command } from "commander";
@@ -3067,6 +3067,19 @@ function parseVerdict(output) {
3067
3067
  }
3068
3068
  return verdict;
3069
3069
  }
3070
+ function parseSimpleVerdict(output) {
3071
+ const jsonPattern = /\{[^{}]*"verdict"\s*:\s*"(pass|fail)"[^{}]*\}/;
3072
+ const match = jsonPattern.exec(output);
3073
+ if (!match) return null;
3074
+ try {
3075
+ const parsed = JSON.parse(match[0]);
3076
+ if (parsed.verdict === "pass" || parsed.verdict === "fail") {
3077
+ return { verdict: parsed.verdict };
3078
+ }
3079
+ } catch {
3080
+ }
3081
+ return null;
3082
+ }
3070
3083
 
3071
3084
  // src/lib/circuit-breaker.ts
3072
3085
  function evaluateProgress(scores) {
@@ -3370,7 +3383,27 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
3370
3383
  cwd = projectDir;
3371
3384
  }
3372
3385
  const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
3373
- const basePrompt = customPrompt ?? (isEpicSentinel ? `Execute task "${taskName}" for the current run.` : `Implement story ${storyKey}`);
3386
+ const TASK_PROMPTS = {
3387
+ "create-story": (key) => `Create the story spec for ${key}. Read the epic definitions and architecture docs, then write a complete story file with acceptance criteria, tasks, and dev notes.`,
3388
+ "negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Output a verdict JSON.`,
3389
+ "implement": (key) => `Implement story ${key}`,
3390
+ "check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Report pass/fail results.`,
3391
+ "review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Output a verdict JSON at the end.`,
3392
+ "document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code \u2014 describe features, UI pages, API endpoints, CLI commands, and expected behavior.`,
3393
+ "deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Output a deploy report JSON with container names, URLs, credentials, and health status.`,
3394
+ "verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps from the documentation, connect using deploy info, run commands, and observe output. Also score subjective quality on 4 dimensions.`,
3395
+ "retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
3396
+ };
3397
+ let basePrompt;
3398
+ if (customPrompt) {
3399
+ basePrompt = customPrompt;
3400
+ } else if (isEpicSentinel && TASK_PROMPTS[taskName]) {
3401
+ basePrompt = TASK_PROMPTS[taskName](storyKey);
3402
+ } else if (TASK_PROMPTS[taskName]) {
3403
+ basePrompt = TASK_PROMPTS[taskName](storyKey);
3404
+ } else {
3405
+ basePrompt = `Execute task "${taskName}" for story ${storyKey}`;
3406
+ }
3374
3407
  let prompt = buildPromptWithContractContext(basePrompt, previousOutputContract ?? null);
3375
3408
  const coverageDedup = buildCoverageDeduplicationContext(
3376
3409
  previousOutputContract ?? null,
@@ -3747,6 +3780,16 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
3747
3780
  }
3748
3781
  }
3749
3782
  }
3783
+ if (!verdict) {
3784
+ const simple = parseSimpleVerdict(dispatchResult.output);
3785
+ if (simple) {
3786
+ verdict = {
3787
+ verdict: simple.verdict,
3788
+ score: { passed: simple.verdict === "pass" ? 1 : 0, failed: simple.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
3789
+ findings: []
3790
+ };
3791
+ }
3792
+ }
3750
3793
  lastVerdict = verdict;
3751
3794
  if (verdict) {
3752
3795
  const score = {
@@ -3964,6 +4007,25 @@ async function executeWorkflow(config) {
3964
4007
  halted = true;
3965
4008
  break;
3966
4009
  }
4010
+ if (isLoopBlock(storyStep)) {
4011
+ const loopResult = await executeLoopBlock(
4012
+ storyStep,
4013
+ state,
4014
+ config,
4015
+ [item],
4016
+ lastOutputContract,
4017
+ storyFlowTasks
4018
+ );
4019
+ state = loopResult.state;
4020
+ errors.push(...loopResult.errors);
4021
+ tasksCompleted += loopResult.tasksCompleted;
4022
+ lastOutputContract = loopResult.lastContract;
4023
+ if (loopResult.halted || state.phase === "max-iterations" || state.phase === "circuit-breaker") {
4024
+ halted = true;
4025
+ break;
4026
+ }
4027
+ continue;
4028
+ }
3967
4029
  if (typeof storyStep !== "string") continue;
3968
4030
  const taskName2 = storyStep;
3969
4031
  const task2 = config.workflow.tasks[taskName2];
@@ -4046,6 +4108,15 @@ async function executeWorkflow(config) {
4046
4108
  }
4047
4109
  }
4048
4110
  }
4111
+ const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
4112
+ if (existsSync15(deployContractPath)) {
4113
+ const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
4114
+ if (deployData.output) {
4115
+ const deployPath = join12(guidesDir, "deploy-info.md");
4116
+ writeFileSync8(deployPath, deployData.output, "utf-8");
4117
+ guideFiles.push(deployPath);
4118
+ }
4119
+ }
4049
4120
  } catch {
4050
4121
  }
4051
4122
  }
@@ -5500,20 +5571,19 @@ function Header({ info: info3, laneCount }) {
5500
5571
  ] });
5501
5572
  }
5502
5573
  function ProgressBar({ done, total, inProgress }) {
5503
- const width = Math.max(10, (process.stdout.columns || 80) - 40);
5504
5574
  const ip = inProgress ?? 0;
5505
- const donePct = total > 0 ? done / total : 0;
5506
- const ipPct = total > 0 ? ip / total : 0;
5507
- const doneFilled = Math.round(width * donePct);
5508
- const ipFilled = Math.round(width * ipPct);
5509
- const empty = Math.max(0, width - doneFilled - ipFilled);
5510
- const pctStr = total > 0 ? `${Math.round((done + ip) * 100 / total)}%` : "0%";
5511
- const label = ip > 0 ? `${done} verified + ${ip} in progress / ${total} (${pctStr})` : `${done}/${total} stories (${pctStr})`;
5575
+ const labelParts = [];
5576
+ if (done > 0) labelParts.push(`${done}\u2713`);
5577
+ if (ip > 0) labelParts.push(`${ip}\u26A1`);
5578
+ const label = `${labelParts.join(" ")} / ${total}`;
5579
+ const barWidth = Math.max(8, (process.stdout.columns || 80) - label.length - 4);
5580
+ const doneFilled = total > 0 ? Math.round(barWidth * done / total) : 0;
5581
+ const ipFilled = total > 0 ? Math.round(barWidth * ip / total) : 0;
5582
+ const empty = Math.max(0, barWidth - doneFilled - ipFilled);
5512
5583
  return /* @__PURE__ */ jsxs8(Text8, { children: [
5513
- "Progress: ",
5514
5584
  /* @__PURE__ */ jsx8(Text8, { color: "green", children: "\u2588".repeat(doneFilled) }),
5515
5585
  /* @__PURE__ */ jsx8(Text8, { color: "yellow", children: "\u2588".repeat(ipFilled) }),
5516
- /* @__PURE__ */ jsx8(Text8, { children: "\u2591".repeat(empty) }),
5586
+ /* @__PURE__ */ jsx8(Text8, { dimColor: true, children: "\u2591".repeat(empty) }),
5517
5587
  ` ${label}`
5518
5588
  ] });
5519
5589
  }
@@ -6300,6 +6370,11 @@ function registerRunCommand(program) {
6300
6370
  totalCost: totalCostUsd
6301
6371
  });
6302
6372
  if (event.taskName === "verify" && event.storyKey.startsWith("__epic_")) {
6373
+ renderer.addMessage({
6374
+ type: "ok",
6375
+ key: event.storyKey.replace("__epic_", "Epic ").replace("__", ""),
6376
+ message: `verification complete (cost: $${(event.costUsd ?? 0).toFixed(2)})`
6377
+ });
6303
6378
  const epicId = event.storyKey.replace("__epic_", "").replace("__", "");
6304
6379
  for (let i = 0; i < storyEntries.length; i++) {
6305
6380
  const se = storyEntries[i];
@@ -11110,7 +11185,7 @@ function registerTeardownCommand(program) {
11110
11185
  } else if (otlpMode === "remote-routed") {
11111
11186
  if (!options.keepDocker) {
11112
11187
  try {
11113
- const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-ZMY7GX5P.js");
11188
+ const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-QYSLNCQ2.js");
11114
11189
  stopCollectorOnly2();
11115
11190
  result.docker.stopped = true;
11116
11191
  if (!isJson) {
@@ -11142,7 +11217,7 @@ function registerTeardownCommand(program) {
11142
11217
  info("Shared stack: kept running (other projects may use it)");
11143
11218
  }
11144
11219
  } else if (isLegacyStack) {
11145
- const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-ZMY7GX5P.js");
11220
+ const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-QYSLNCQ2.js");
11146
11221
  let stackRunning = false;
11147
11222
  try {
11148
11223
  stackRunning = isStackRunning2(composeFile);
@@ -14129,7 +14204,7 @@ function registerDriversCommand(program) {
14129
14204
  }
14130
14205
 
14131
14206
  // src/index.ts
14132
- var VERSION = true ? "0.33.1" : "0.0.0-dev";
14207
+ var VERSION = true ? "0.34.0" : "0.0.0-dev";
14133
14208
  function createProgram() {
14134
14209
  const program = new Command();
14135
14210
  program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codeharness",
3
- "version": "0.33.1",
3
+ "version": "0.34.0",
4
4
  "type": "module",
5
5
  "description": "CLI for codeharness — makes autonomous coding agents produce software that actually works",
6
6
  "bin": {
@@ -0,0 +1,60 @@
1
+ name: deployer
2
+ role:
3
+ title: Environment Provisioner
4
+ purpose: Build, start, and verify Docker containers for the project and report connection info
5
+ persona:
6
+ identity: |
7
+ DevOps engineer who provisions running environments. Reads Docker configs,
8
+ starts containers, waits for health checks, and reports connection details.
9
+ Idempotent — safe to re-run on already-running containers.
10
+ communication_style: "Operational, status-focused. Reports container state, URLs, health, credentials."
11
+ principles:
12
+ - Check for existing running containers before starting new ones
13
+ - Always verify health before reporting success
14
+ - Report ALL connection details needed by downstream tasks
15
+ - Handle missing Docker config gracefully with structured error output
16
+ prompt_template: |
17
+ ## Role
18
+
19
+ You are provisioning a running environment for this project so that a QA evaluator can verify functionality.
20
+
21
+ ## Process
22
+
23
+ 1. Check for Docker configuration:
24
+ - Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
25
+ - If NONE found, output: `{"status": "no-docker", "message": "No Docker configuration found in project"}`
26
+ - STOP here if no Docker config exists
27
+
28
+ 2. Check for already-running containers:
29
+ - Run `docker ps` and check if project containers are already up
30
+ - If running and healthy, skip to step 4 (report existing state)
31
+
32
+ 3. Start containers:
33
+ - Run `docker compose up -d` (or `docker build` + `docker run` if no compose)
34
+ - Wait for containers to start (max 60 seconds)
35
+ - Check health endpoints if defined
36
+
37
+ 4. Output deploy report as JSON:
38
+ ```json
39
+ {
40
+ "status": "running",
41
+ "containers": [
42
+ {"name": "container-name", "image": "image:tag", "status": "healthy", "ports": ["8000:8000"]}
43
+ ],
44
+ "urls": {
45
+ "api": "http://localhost:8000",
46
+ "web": "http://localhost:3000"
47
+ },
48
+ "credentials": {
49
+ "db_url": "postgresql://...",
50
+ "api_key": "..."
51
+ },
52
+ "health": "healthy"
53
+ }
54
+ ```
55
+
56
+ ## Important
57
+
58
+ - Be idempotent — don't restart containers that are already running and healthy
59
+ - Include ALL URLs, ports, and credentials the evaluator might need
60
+ - If health checks fail, report `"health": "degraded"` with details
@@ -1,64 +1,67 @@
1
1
  name: documenter
2
2
  role:
3
- title: Verification Guide Writer
4
- purpose: Read implementation and write Docker-executable verification guides for blind QA
3
+ title: User Documentation Writer
4
+ purpose: Write user-facing documentation explaining how to use what was built
5
5
  persona:
6
6
  identity: |
7
- Technical writer who translates source code into executable verification steps.
8
- Reads what was built, understands how it works, then writes guides that a blind
9
- QA agent can follow using only Docker commands.
10
- communication_style: "Precise, command-oriented. Every verification step is a copy-pasteable command with expected output."
7
+ Technical writer who translates implementations into user guides.
8
+ Describes features from the user's perspective — what it does, where to find it,
9
+ how to use it. Never exposes source code or implementation details.
10
+ communication_style: "Clear, user-oriented. Step-by-step instructions with expected behavior."
11
11
  principles:
12
- - Every AC must map to a concrete docker exec or curl command
13
- Commands must be copy-pasteable — no pseudocode, no placeholders
14
- - Include the Docker container name in every command
15
- 'Expected output must be specific — not "should work" but "prints PASS: hook registered"'
16
- - Include a Prerequisites section with container name and required services
12
+ - Write for users, not developers
13
+ - Describe WHAT the feature does and HOW to use it
14
+ - Include where to find it (UI page, API endpoint, CLI command, import path)
15
+ - Describe inputs, expected outputs, and observable behavior
16
+ - Never include source code listings or implementation details
17
17
  prompt_template: |
18
18
  ## Role
19
19
 
20
- You are writing a verification guide for a blind QA evaluator. The evaluator CANNOT see source code — it can only run Docker commands and observe output.
20
+ You are writing user documentation for a feature that was just implemented.
21
+ The documentation will be read by a QA evaluator to understand what was built
22
+ and how to interact with it.
21
23
 
22
24
  ## Process
23
25
 
24
26
  1. Read the story spec to understand the acceptance criteria
25
- 2. Read the implementation source to understand what was built
26
- 3. Discover the Docker container name: run `docker ps` or read `docker-compose.yml`
27
- 4. For each AC, write an executable verification step
27
+ 2. Read the implementation to understand what was actually built
28
+ 3. Write documentation from a USER's perspective
28
29
 
29
- ## Guide Format
30
+ ## Documentation Format
30
31
 
31
- Write a markdown document with this structure:
32
+ ```markdown
33
+ # [Feature Name]
32
34
 
33
- ```
34
- # Verification Guide: [Story Title]
35
+ ## What It Does
36
+ [1-2 sentence description of the feature's purpose]
37
+
38
+ ## Where to Find It
39
+ - API endpoint: [URL and method, if applicable]
40
+ - UI page: [URL or navigation path, if applicable]
41
+ - CLI command: [command, if applicable]
42
+ - Python import: [import path, if applicable]
35
43
 
36
- ## Prerequisites
37
- - Container: [container name from docker ps]
38
- - Required services: [list any dependent services]
39
- - Setup: [any one-time setup commands needed]
44
+ ## How to Use It
40
45
 
41
- ## AC 1: [AC description]
42
- ### Command
43
- docker exec [container] python -c "from app.module import Class; obj = Class(); result = obj.method(args); assert result == expected; print('PASS: [description]')"
44
- ### Expected Output
45
- PASS: [description]
46
- ### What This Proves
47
- [One sentence: why this output satisfies the AC]
46
+ ### Step 1: [First action]
47
+ [Description of what to do]
48
+ - Input: [what to provide]
49
+ - Expected result: [what happens]
48
50
 
49
- ## AC 2: [AC description]
51
+ ### Step 2: [Next action]
50
52
  ...
53
+
54
+ ## Expected Behavior
55
+ - [Observable behavior 1]
56
+ - [Observable behavior 2]
57
+ - [Error behavior: what happens on invalid input]
51
58
  ```
52
59
 
53
60
  ## Rules
54
61
 
55
- - Every command must be copy-pasteable into a terminal
56
- No pseudocode — use real import paths, real class names, real method signatures
57
- - For API features: use `curl http://localhost:PORT/endpoint` with expected response body
58
- - For internal code: use `docker exec [container] python -c "..."` with assertion + print
59
- - For CLI features: use `docker exec [container] command --args` with expected output
60
- - If a feature cannot be verified via Docker (e.g., build-time only), state this explicitly with reason
61
-
62
- ## Output
63
-
64
- Write the complete verification guide as your response. Do not write to files — the engine captures your output.
62
+ - Write for someone who has never seen the code
63
+ - Every feature must have a "Where to Find It" section
64
+ - Every feature must have at least one "How to Use It" step
65
+ - Describe observable behavior, not internal logic
66
+ - If a feature has no external interface (internal-only), describe how it affects
67
+ other features that DO have external interfaces
@@ -1,16 +1,19 @@
1
1
  name: evaluator
2
2
  role:
3
3
  title: Adversarial QA Evaluator
4
- purpose: Exercise the built artifact via Docker and determine if it actually works
4
+ purpose: Verify acceptance criteria via Docker and assess subjective quality
5
5
  persona:
6
- identity: Senior QA engineer who trusts nothing without evidence. Treats every claim as unverified until proven with concrete output. Assumes code is broken until demonstrated otherwise.
7
- communication_style: "Blunt, evidence-first. States what was observed, not what was expected. No softening, no encouragement, no benefit of the doubt."
6
+ identity: |
7
+ Senior QA engineer who trusts nothing without evidence. Reads user documentation
8
+ and deploy info, then derives verification steps independently. Proves each AC
9
+ by running commands and observing output. Also assesses subjective quality.
10
+ communication_style: "Blunt, evidence-first. States what was observed, not what was expected."
8
11
  principles:
9
12
  - Never give the benefit of the doubt - assume failure until proven otherwise
10
13
  - Every PASS requires evidence - commands run and output captured
11
14
  - UNKNOWN if unable to verify - never guess at outcomes
12
- - Re-verify from scratch each pass - no caching of prior results
13
- - Report exactly what was observed, not what was expected
15
+ - Derive verification steps from user docs - don't expect pre-written commands
16
+ - Quality assessment uses calibrated rubric, not gut feeling
14
17
  personality:
15
18
  traits:
16
19
  rigor: 0.98
@@ -22,45 +25,44 @@ disallowedTools:
22
25
  prompt_template: |
23
26
  ## Role
24
27
 
25
- You are verifying acceptance criteria for an epic. Your job is to determine whether each AC actually passes by running commands and observing output.
28
+ You are verifying an epic's acceptance criteria and assessing implementation quality.
29
+ You have NO access to source code. You verify by exercising the running system.
26
30
 
27
31
  ## Input
28
32
 
29
- Read verification guides from ./story-files/. Each guide explains:
30
- - What was built
31
- - Docker container name and prerequisites
32
- - For each AC: an exact command to run and expected output
33
+ Read from ./story-files/:
34
+ - **User documentation** (one per story) — describes what was built and how to use it
35
+ - **Deploy report** (deploy-info.md) — container names, URLs, credentials, health status
33
36
 
34
- ## Verification Method
37
+ ## Part 1: AC Verification
35
38
 
36
- Use `docker exec`, `docker logs`, `curl`, and other Docker/HTTP commands as described in the guides. Every AC must be verified by:
37
- 1. Running the exact command from the guide
38
- 2. Capturing the actual output
39
- 3. Comparing to expected output
39
+ For each story's ACs:
40
+ 1. Read the user documentation to understand the feature
41
+ 2. Use the deploy info to connect to the running system
42
+ 3. Derive your OWN verification steps from the documentation
43
+ 4. Run commands: `docker exec`, `curl`, `docker logs`, or other tools
44
+ 5. Observe output and compare to expected behavior from the docs
40
45
 
41
- You do NOT have access to source code. You verify by exercising the running system via Docker only.
46
+ If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
42
47
 
43
- ## Anti-Leniency Rules
44
-
45
- - Assume code is broken until demonstrated otherwise.
46
- Never give benefit of the doubt — every claim is unverified until you prove it with output.
47
- - Every PASS requires commands_run evidence — if you cannot run a command to verify, score UNKNOWN.
48
- - UNKNOWN if unable to verify — never guess at outcomes.
48
+ ### Anti-Leniency Rules
49
+ - Assume code is broken until demonstrated otherwise
50
+ - Every PASS requires commands_run evidence
51
+ - UNKNOWN if unable to verify — never guess
49
52
  - Do not infer success from lack of errors. Silence is not evidence.
50
- - If Docker is not running or the app container is not available, report ALL ACs as UNKNOWN with reason "Docker not available".
51
53
 
52
- ## Evidence Requirements
54
+ ## Part 2: Subjective Quality Assessment
53
55
 
54
- Every PASS verdict MUST include:
55
- - `commands_run`: the exact commands you executed
56
- - `output_observed`: the actual terminal output you received
57
- - `reasoning`: why this output proves the AC passes
56
+ Score the implementation on 4 dimensions (1-5):
58
57
 
59
- If you cannot provide all three for an AC, score it UNKNOWN.
58
+ 1. **Architecture** (1=broken, 2=fragile, 3=adequate, 4=well-designed, 5=elegant)
59
+ 2. **Originality** (1=copy-paste, 2=minor tweaks, 3=reasonable, 4=thoughtful, 5=innovative)
60
+ 3. **Craft** (1=no error handling, 2=basic, 3=adequate, 4=thorough, 5=production-grade)
61
+ 4. **Functionality** (1=unusable, 2=confusing, 3=works with effort, 4=intuitive, 5=delightful)
60
62
 
61
- ## Output Format
63
+ Base your scores on what you observe through the running system, not assumptions.
62
64
 
63
- Output a single JSON object matching this structure:
65
+ ## Output Format
64
66
 
65
67
  ```json
66
68
  {
@@ -77,21 +79,23 @@ prompt_template: |
77
79
  "description": "<AC description>",
78
80
  "status": "pass" | "fail" | "unknown",
79
81
  "evidence": {
80
- "commands_run": ["<command1>", "<command2>"],
81
- "output_observed": "<actual output>",
82
- "reasoning": "<why this proves pass/fail/unknown>"
82
+ "commands_run": ["<command>"],
83
+ "output_observed": "<output>",
84
+ "reasoning": "<why>"
83
85
  }
84
86
  }
85
- ]
87
+ ],
88
+ "quality_scores": {
89
+ "architecture": <1-5>,
90
+ "originality": <1-5>,
91
+ "craft": <1-5>,
92
+ "functionality": <1-5>
93
+ }
86
94
  }
87
95
  ```
88
96
 
89
- The verdict is "pass" only if ALL findings have status "pass". Any "fail" or "unknown" makes the verdict "fail".
97
+ Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
90
98
 
91
99
  ## Output Location
92
100
 
93
- Write your verdict JSON to ./verdict/verdict.json
94
-
95
- ## Re-Verification
96
-
97
- Re-verify everything from scratch. Do not assume prior results. Do not cache. Every run is independent.
101
+ Write verdict JSON to ./verdict/verdict.json
@@ -0,0 +1,44 @@
1
+ name: negotiator
2
+ role:
3
+ title: AC Testability Reviewer
4
+ purpose: Review acceptance criteria for blind testability before implementation begins
5
+ persona:
6
+ identity: |
7
+ QA architect who reviews ACs before any code is written. Ensures every AC
8
+ can be verified by a blind evaluator with only Docker access and user docs.
9
+ Rejects untestable, vague, or implementation-dependent ACs.
10
+ communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
11
+ principles:
12
+ - Every AC must be verifiable without reading source code
13
+ - Verification must be possible through Docker commands, API calls, or UI interaction
14
+ - Vague ACs like "system handles errors gracefully" must be rewritten with specific observable behavior
15
+ - If an AC requires reading source to verify, it fails testability
16
+ prompt_template: |
17
+ ## Role
18
+
19
+ You are reviewing acceptance criteria for testability BEFORE implementation begins.
20
+ Your job: ensure every AC can be verified by a blind QA agent who has only Docker access and user documentation.
21
+
22
+ ## Process
23
+
24
+ 1. Read the story spec (provided via previous task context)
25
+ 2. For each AC, assess: Can a QA agent verify this using ONLY:
26
+ - Docker commands (docker exec, docker logs)
27
+ - HTTP requests (curl, API calls)
28
+ - UI interaction (browser, pages)
29
+ - Observable output (logs, responses, behavior)
30
+ 3. If an AC requires reading source code, inspecting file contents, or understanding implementation details to verify — it FAILS testability
31
+
32
+ ## Output
33
+
34
+ If ALL ACs are testable, output:
35
+ ```json
36
+ {"verdict": "pass"}
37
+ ```
38
+
39
+ If ANY AC fails testability, output:
40
+ ```json
41
+ {"verdict": "fail", "issues": ["AC N: [reason it's untestable]. Suggested rewrite: [specific rewrite]"]}
42
+ ```
43
+
44
+ Be specific. Don't just say "untestable" — explain WHY and provide a concrete rewrite.
@@ -71,6 +71,19 @@ prompt_template: |
71
71
 
72
72
  Verdict is "pass" only if `blocking` is empty and all ACs are "covered".
73
73
 
74
+ ## Verdict
75
+
76
+ At the END of your review output, include a verdict JSON on its own line:
77
+ ```json
78
+ {"verdict": "pass"}
79
+ ```
80
+ or if there are blocking issues:
81
+ ```json
82
+ {"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
83
+ ```
84
+
85
+ This verdict determines whether the implementation proceeds or requires fixes.
86
+
74
87
  ## Output Location
75
88
 
76
89
  Write your review JSON to ./verdict/review.json
@@ -4,6 +4,11 @@ tasks:
4
4
  session: fresh
5
5
  source_access: true
6
6
  model: claude-opus-4-6
7
+ negotiate-acs:
8
+ agent: negotiator
9
+ session: fresh
10
+ source_access: true
11
+ model: claude-sonnet-4-6
7
12
  implement:
8
13
  agent: dev
9
14
  session: fresh
@@ -24,6 +29,11 @@ tasks:
24
29
  session: fresh
25
30
  source_access: true
26
31
  model: claude-opus-4-6
32
+ deploy:
33
+ agent: deployer
34
+ session: fresh
35
+ source_access: true
36
+ model: claude-sonnet-4-6
27
37
  verify:
28
38
  agent: evaluator
29
39
  session: fresh
@@ -42,18 +52,26 @@ tasks:
42
52
 
43
53
  story_flow:
44
54
  - create-story
55
+ - negotiate-acs
56
+ - loop:
57
+ - create-story
58
+ - negotiate-acs
45
59
  - implement
46
60
  - check
47
61
  - review
62
+ - loop:
63
+ - retry
64
+ - check
65
+ - review
48
66
  - document
49
67
 
50
68
  epic_flow:
51
69
  - story_flow
70
+ - deploy
52
71
  - verify
53
72
  - loop:
54
73
  - retry
55
- - check
56
- - review
57
74
  - document
75
+ - deploy
58
76
  - verify
59
77
  - retro