codeharness 0.33.1 → 0.34.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
2895
2895
  }
2896
2896
 
2897
2897
  // src/modules/infra/init-project.ts
2898
- var HARNESS_VERSION = true ? "0.33.1" : "0.0.0-dev";
2898
+ var HARNESS_VERSION = true ? "0.34.1" : "0.0.0-dev";
2899
2899
  function failResult(opts, error) {
2900
2900
  return {
2901
2901
  status: "fail",
@@ -16,7 +16,7 @@ import {
16
16
  stopCollectorOnly,
17
17
  stopSharedStack,
18
18
  stopStack
19
- } from "./chunk-537B2B6W.js";
19
+ } from "./chunk-G2RR744K.js";
20
20
  export {
21
21
  checkRemoteEndpoint,
22
22
  cleanupOrphanedContainers,
package/dist/index.js CHANGED
@@ -40,7 +40,7 @@ import {
40
40
  validateDockerfile,
41
41
  warn,
42
42
  writeState
43
- } from "./chunk-537B2B6W.js";
43
+ } from "./chunk-G2RR744K.js";
44
44
 
45
45
  // src/index.ts
46
46
  import { Command } from "commander";
@@ -3067,6 +3067,23 @@ function parseVerdict(output) {
3067
3067
  }
3068
3068
  return verdict;
3069
3069
  }
3070
+ function parseSimpleVerdict(output) {
3071
+ const jsonPattern = /\{[^{}]*"verdict"\s*:\s*"(pass|fail)"[^{}]*\}/g;
3072
+ let lastMatch = null;
3073
+ let m;
3074
+ while ((m = jsonPattern.exec(output)) !== null) {
3075
+ lastMatch = m;
3076
+ }
3077
+ if (!lastMatch) return null;
3078
+ try {
3079
+ const parsed = JSON.parse(lastMatch[0]);
3080
+ if (parsed.verdict === "pass" || parsed.verdict === "fail") {
3081
+ return { verdict: parsed.verdict };
3082
+ }
3083
+ } catch {
3084
+ }
3085
+ return null;
3086
+ }
3070
3087
 
3071
3088
  // src/lib/circuit-breaker.ts
3072
3089
  function evaluateProgress(scores) {
@@ -3370,7 +3387,27 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
3370
3387
  cwd = projectDir;
3371
3388
  }
3372
3389
  const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
3373
- const basePrompt = customPrompt ?? (isEpicSentinel ? `Execute task "${taskName}" for the current run.` : `Implement story ${storyKey}`);
3390
+ const TASK_PROMPTS = {
3391
+ "create-story": (key) => `Create or revise the story spec for ${key}. Read the epic definitions and architecture docs. If previous feedback is provided (from AC negotiation or review), revise the story to address that feedback. Write a complete story file with acceptance criteria, tasks, and dev notes.`,
3392
+ "negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Your response MUST end with exactly one JSON line: {"verdict": "pass"} or {"verdict": "fail", "issues": ["..."]}`,
3393
+ "implement": (key) => `Implement story ${key}`,
3394
+ "check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Report pass/fail results.`,
3395
+ "review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Output a verdict JSON at the end.`,
3396
+ "document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code \u2014 describe features, UI pages, API endpoints, CLI commands, and expected behavior.`,
3397
+ "deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Output a deploy report JSON with container names, URLs, credentials, and health status.`,
3398
+ "verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps from the documentation, connect using deploy info, run commands, and observe output. Also score subjective quality on 4 dimensions.`,
3399
+ "retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
3400
+ };
3401
+ let basePrompt;
3402
+ if (customPrompt) {
3403
+ basePrompt = customPrompt;
3404
+ } else if (isEpicSentinel && TASK_PROMPTS[taskName]) {
3405
+ basePrompt = TASK_PROMPTS[taskName](storyKey);
3406
+ } else if (TASK_PROMPTS[taskName]) {
3407
+ basePrompt = TASK_PROMPTS[taskName](storyKey);
3408
+ } else {
3409
+ basePrompt = `Execute task "${taskName}" for story ${storyKey}`;
3410
+ }
3374
3411
  let prompt = buildPromptWithContractContext(basePrompt, previousOutputContract ?? null);
3375
3412
  const coverageDedup = buildCoverageDeduplicationContext(
3376
3413
  previousOutputContract ?? null,
@@ -3747,6 +3784,16 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
3747
3784
  }
3748
3785
  }
3749
3786
  }
3787
+ if (!verdict) {
3788
+ const simple = parseSimpleVerdict(dispatchResult.output);
3789
+ if (simple) {
3790
+ verdict = {
3791
+ verdict: simple.verdict,
3792
+ score: { passed: simple.verdict === "pass" ? 1 : 0, failed: simple.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
3793
+ findings: []
3794
+ };
3795
+ }
3796
+ }
3750
3797
  lastVerdict = verdict;
3751
3798
  if (verdict) {
3752
3799
  const score = {
@@ -3964,6 +4011,25 @@ async function executeWorkflow(config) {
3964
4011
  halted = true;
3965
4012
  break;
3966
4013
  }
4014
+ if (isLoopBlock(storyStep)) {
4015
+ const loopResult = await executeLoopBlock(
4016
+ storyStep,
4017
+ state,
4018
+ config,
4019
+ [item],
4020
+ lastOutputContract,
4021
+ storyFlowTasks
4022
+ );
4023
+ state = loopResult.state;
4024
+ errors.push(...loopResult.errors);
4025
+ tasksCompleted += loopResult.tasksCompleted;
4026
+ lastOutputContract = loopResult.lastContract;
4027
+ if (loopResult.halted || state.phase === "max-iterations" || state.phase === "circuit-breaker") {
4028
+ halted = true;
4029
+ break;
4030
+ }
4031
+ continue;
4032
+ }
3967
4033
  if (typeof storyStep !== "string") continue;
3968
4034
  const taskName2 = storyStep;
3969
4035
  const task2 = config.workflow.tasks[taskName2];
@@ -4046,6 +4112,15 @@ async function executeWorkflow(config) {
4046
4112
  }
4047
4113
  }
4048
4114
  }
4115
+ const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
4116
+ if (existsSync15(deployContractPath)) {
4117
+ const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
4118
+ if (deployData.output) {
4119
+ const deployPath = join12(guidesDir, "deploy-info.md");
4120
+ writeFileSync8(deployPath, deployData.output, "utf-8");
4121
+ guideFiles.push(deployPath);
4122
+ }
4123
+ }
4049
4124
  } catch {
4050
4125
  }
4051
4126
  }
@@ -5500,20 +5575,19 @@ function Header({ info: info3, laneCount }) {
5500
5575
  ] });
5501
5576
  }
5502
5577
  function ProgressBar({ done, total, inProgress }) {
5503
- const width = Math.max(10, (process.stdout.columns || 80) - 40);
5504
5578
  const ip = inProgress ?? 0;
5505
- const donePct = total > 0 ? done / total : 0;
5506
- const ipPct = total > 0 ? ip / total : 0;
5507
- const doneFilled = Math.round(width * donePct);
5508
- const ipFilled = Math.round(width * ipPct);
5509
- const empty = Math.max(0, width - doneFilled - ipFilled);
5510
- const pctStr = total > 0 ? `${Math.round((done + ip) * 100 / total)}%` : "0%";
5511
- const label = ip > 0 ? `${done} verified + ${ip} in progress / ${total} (${pctStr})` : `${done}/${total} stories (${pctStr})`;
5579
+ const labelParts = [];
5580
+ if (done > 0) labelParts.push(`${done}\u2713`);
5581
+ if (ip > 0) labelParts.push(`${ip}\u26A1`);
5582
+ const label = `${labelParts.join(" ")} / ${total}`;
5583
+ const barWidth = Math.max(8, (process.stdout.columns || 80) - label.length - 4);
5584
+ const doneFilled = total > 0 ? Math.round(barWidth * done / total) : 0;
5585
+ const ipFilled = total > 0 ? Math.round(barWidth * ip / total) : 0;
5586
+ const empty = Math.max(0, barWidth - doneFilled - ipFilled);
5512
5587
  return /* @__PURE__ */ jsxs8(Text8, { children: [
5513
- "Progress: ",
5514
5588
  /* @__PURE__ */ jsx8(Text8, { color: "green", children: "\u2588".repeat(doneFilled) }),
5515
5589
  /* @__PURE__ */ jsx8(Text8, { color: "yellow", children: "\u2588".repeat(ipFilled) }),
5516
- /* @__PURE__ */ jsx8(Text8, { children: "\u2591".repeat(empty) }),
5590
+ /* @__PURE__ */ jsx8(Text8, { dimColor: true, children: "\u2591".repeat(empty) }),
5517
5591
  ` ${label}`
5518
5592
  ] });
5519
5593
  }
@@ -6300,6 +6374,11 @@ function registerRunCommand(program) {
6300
6374
  totalCost: totalCostUsd
6301
6375
  });
6302
6376
  if (event.taskName === "verify" && event.storyKey.startsWith("__epic_")) {
6377
+ renderer.addMessage({
6378
+ type: "ok",
6379
+ key: event.storyKey.replace("__epic_", "Epic ").replace("__", ""),
6380
+ message: `verification complete (cost: $${(event.costUsd ?? 0).toFixed(2)})`
6381
+ });
6303
6382
  const epicId = event.storyKey.replace("__epic_", "").replace("__", "");
6304
6383
  for (let i = 0; i < storyEntries.length; i++) {
6305
6384
  const se = storyEntries[i];
@@ -11110,7 +11189,7 @@ function registerTeardownCommand(program) {
11110
11189
  } else if (otlpMode === "remote-routed") {
11111
11190
  if (!options.keepDocker) {
11112
11191
  try {
11113
- const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-ZMY7GX5P.js");
11192
+ const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-MEPWHG4P.js");
11114
11193
  stopCollectorOnly2();
11115
11194
  result.docker.stopped = true;
11116
11195
  if (!isJson) {
@@ -11142,7 +11221,7 @@ function registerTeardownCommand(program) {
11142
11221
  info("Shared stack: kept running (other projects may use it)");
11143
11222
  }
11144
11223
  } else if (isLegacyStack) {
11145
- const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-ZMY7GX5P.js");
11224
+ const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-MEPWHG4P.js");
11146
11225
  let stackRunning = false;
11147
11226
  try {
11148
11227
  stackRunning = isStackRunning2(composeFile);
@@ -14129,7 +14208,7 @@ function registerDriversCommand(program) {
14129
14208
  }
14130
14209
 
14131
14210
  // src/index.ts
14132
- var VERSION = true ? "0.33.1" : "0.0.0-dev";
14211
+ var VERSION = true ? "0.34.1" : "0.0.0-dev";
14133
14212
  function createProgram() {
14134
14213
  const program = new Command();
14135
14214
  program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codeharness",
3
- "version": "0.33.1",
3
+ "version": "0.34.1",
4
4
  "type": "module",
5
5
  "description": "CLI for codeharness — makes autonomous coding agents produce software that actually works",
6
6
  "bin": {
@@ -0,0 +1,60 @@
1
+ name: deployer
2
+ role:
3
+ title: Environment Provisioner
4
+ purpose: Build, start, and verify Docker containers for the project and report connection info
5
+ persona:
6
+ identity: |
7
+ DevOps engineer who provisions running environments. Reads Docker configs,
8
+ starts containers, waits for health checks, and reports connection details.
9
+ Idempotent — safe to re-run on already-running containers.
10
+ communication_style: "Operational, status-focused. Reports container state, URLs, health, credentials."
11
+ principles:
12
+ - Check for existing running containers before starting new ones
13
+ - Always verify health before reporting success
14
+ - Report ALL connection details needed by downstream tasks
15
+ - Handle missing Docker config gracefully with structured error output
16
+ prompt_template: |
17
+ ## Role
18
+
19
+ You are provisioning a running environment for this project so that a QA evaluator can verify functionality.
20
+
21
+ ## Process
22
+
23
+ 1. Check for Docker configuration:
24
+ - Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
25
+ - If NONE found, output: `{"status": "no-docker", "message": "No Docker configuration found in project"}`
26
+ - STOP here if no Docker config exists
27
+
28
+ 2. Check for already-running containers:
29
+ - Run `docker ps` and check if project containers are already up
30
+ - If running and healthy, skip to step 4 (report existing state)
31
+
32
+ 3. Start containers:
33
+ - Run `docker compose up -d` (or `docker build` + `docker run` if no compose)
34
+ - Wait for containers to start (max 60 seconds)
35
+ - Check health endpoints if defined
36
+
37
+ 4. Output deploy report as JSON:
38
+ ```json
39
+ {
40
+ "status": "running",
41
+ "containers": [
42
+ {"name": "container-name", "image": "image:tag", "status": "healthy", "ports": ["8000:8000"]}
43
+ ],
44
+ "urls": {
45
+ "api": "http://localhost:8000",
46
+ "web": "http://localhost:3000"
47
+ },
48
+ "credentials": {
49
+ "db_url": "postgresql://...",
50
+ "api_key": "..."
51
+ },
52
+ "health": "healthy"
53
+ }
54
+ ```
55
+
56
+ ## Important
57
+
58
+ - Be idempotent — don't restart containers that are already running and healthy
59
+ - Include ALL URLs, ports, and credentials the evaluator might need
60
+ - If health checks fail, report `"health": "degraded"` with details
@@ -1,64 +1,67 @@
1
1
  name: documenter
2
2
  role:
3
- title: Verification Guide Writer
4
- purpose: Read implementation and write Docker-executable verification guides for blind QA
3
+ title: User Documentation Writer
4
+ purpose: Write user-facing documentation explaining how to use what was built
5
5
  persona:
6
6
  identity: |
7
- Technical writer who translates source code into executable verification steps.
8
- Reads what was built, understands how it works, then writes guides that a blind
9
- QA agent can follow using only Docker commands.
10
- communication_style: "Precise, command-oriented. Every verification step is a copy-pasteable command with expected output."
7
+ Technical writer who translates implementations into user guides.
8
+ Describes features from the user's perspective — what it does, where to find it,
9
+ how to use it. Never exposes source code or implementation details.
10
+ communication_style: "Clear, user-oriented. Step-by-step instructions with expected behavior."
11
11
  principles:
12
- - Every AC must map to a concrete docker exec or curl command
13
- Commands must be copy-pasteable — no pseudocode, no placeholders
14
- - Include the Docker container name in every command
15
- 'Expected output must be specific — not "should work" but "prints PASS: hook registered"'
16
- - Include a Prerequisites section with container name and required services
12
+ - Write for users, not developers
13
+ - Describe WHAT the feature does and HOW to use it
14
+ - Include where to find it (UI page, API endpoint, CLI command, import path)
15
+ - Describe inputs, expected outputs, and observable behavior
16
+ - Never include source code listings or implementation details
17
17
  prompt_template: |
18
18
  ## Role
19
19
 
20
- You are writing a verification guide for a blind QA evaluator. The evaluator CANNOT see source code — it can only run Docker commands and observe output.
20
+ You are writing user documentation for a feature that was just implemented.
21
+ The documentation will be read by a QA evaluator to understand what was built
22
+ and how to interact with it.
21
23
 
22
24
  ## Process
23
25
 
24
26
  1. Read the story spec to understand the acceptance criteria
25
- 2. Read the implementation source to understand what was built
26
- 3. Discover the Docker container name: run `docker ps` or read `docker-compose.yml`
27
- 4. For each AC, write an executable verification step
27
+ 2. Read the implementation to understand what was actually built
28
+ 3. Write documentation from a USER's perspective
28
29
 
29
- ## Guide Format
30
+ ## Documentation Format
30
31
 
31
- Write a markdown document with this structure:
32
+ ```markdown
33
+ # [Feature Name]
32
34
 
33
- ```
34
- # Verification Guide: [Story Title]
35
+ ## What It Does
36
+ [1-2 sentence description of the feature's purpose]
37
+
38
+ ## Where to Find It
39
+ - API endpoint: [URL and method, if applicable]
40
+ - UI page: [URL or navigation path, if applicable]
41
+ - CLI command: [command, if applicable]
42
+ - Python import: [import path, if applicable]
35
43
 
36
- ## Prerequisites
37
- - Container: [container name from docker ps]
38
- - Required services: [list any dependent services]
39
- - Setup: [any one-time setup commands needed]
44
+ ## How to Use It
40
45
 
41
- ## AC 1: [AC description]
42
- ### Command
43
- docker exec [container] python -c "from app.module import Class; obj = Class(); result = obj.method(args); assert result == expected; print('PASS: [description]')"
44
- ### Expected Output
45
- PASS: [description]
46
- ### What This Proves
47
- [One sentence: why this output satisfies the AC]
46
+ ### Step 1: [First action]
47
+ [Description of what to do]
48
+ - Input: [what to provide]
49
+ - Expected result: [what happens]
48
50
 
49
- ## AC 2: [AC description]
51
+ ### Step 2: [Next action]
50
52
  ...
53
+
54
+ ## Expected Behavior
55
+ - [Observable behavior 1]
56
+ - [Observable behavior 2]
57
+ - [Error behavior: what happens on invalid input]
51
58
  ```
52
59
 
53
60
  ## Rules
54
61
 
55
- - Every command must be copy-pasteable into a terminal
56
- No pseudocode — use real import paths, real class names, real method signatures
57
- - For API features: use `curl http://localhost:PORT/endpoint` with expected response body
58
- - For internal code: use `docker exec [container] python -c "..."` with assertion + print
59
- - For CLI features: use `docker exec [container] command --args` with expected output
60
- - If a feature cannot be verified via Docker (e.g., build-time only), state this explicitly with reason
61
-
62
- ## Output
63
-
64
- Write the complete verification guide as your response. Do not write to files — the engine captures your output.
62
+ - Write for someone who has never seen the code
63
+ - Every feature must have a "Where to Find It" section
64
+ - Every feature must have at least one "How to Use It" step
65
+ - Describe observable behavior, not internal logic
66
+ - If a feature has no external interface (internal-only), describe how it affects
67
+ other features that DO have external interfaces
@@ -1,16 +1,19 @@
1
1
  name: evaluator
2
2
  role:
3
3
  title: Adversarial QA Evaluator
4
- purpose: Exercise the built artifact via Docker and determine if it actually works
4
+ purpose: Verify acceptance criteria via Docker and assess subjective quality
5
5
  persona:
6
- identity: Senior QA engineer who trusts nothing without evidence. Treats every claim as unverified until proven with concrete output. Assumes code is broken until demonstrated otherwise.
7
- communication_style: "Blunt, evidence-first. States what was observed, not what was expected. No softening, no encouragement, no benefit of the doubt."
6
+ identity: |
7
+ Senior QA engineer who trusts nothing without evidence. Reads user documentation
8
+ and deploy info, then derives verification steps independently. Proves each AC
9
+ by running commands and observing output. Also assesses subjective quality.
10
+ communication_style: "Blunt, evidence-first. States what was observed, not what was expected."
8
11
  principles:
9
12
  - Never give the benefit of the doubt - assume failure until proven otherwise
10
13
  - Every PASS requires evidence - commands run and output captured
11
14
  - UNKNOWN if unable to verify - never guess at outcomes
12
- - Re-verify from scratch each pass - no caching of prior results
13
- - Report exactly what was observed, not what was expected
15
+ - Derive verification steps from user docs - don't expect pre-written commands
16
+ - Quality assessment uses calibrated rubric, not gut feeling
14
17
  personality:
15
18
  traits:
16
19
  rigor: 0.98
@@ -22,45 +25,44 @@ disallowedTools:
22
25
  prompt_template: |
23
26
  ## Role
24
27
 
25
- You are verifying acceptance criteria for an epic. Your job is to determine whether each AC actually passes by running commands and observing output.
28
+ You are verifying an epic's acceptance criteria and assessing implementation quality.
29
+ You have NO access to source code. You verify by exercising the running system.
26
30
 
27
31
  ## Input
28
32
 
29
- Read verification guides from ./story-files/. Each guide explains:
30
- - What was built
31
- - Docker container name and prerequisites
32
- - For each AC: an exact command to run and expected output
33
+ Read from ./story-files/:
34
+ - **User documentation** (one per story) — describes what was built and how to use it
35
+ - **Deploy report** (deploy-info.md) — container names, URLs, credentials, health status
33
36
 
34
- ## Verification Method
37
+ ## Part 1: AC Verification
35
38
 
36
- Use `docker exec`, `docker logs`, `curl`, and other Docker/HTTP commands as described in the guides. Every AC must be verified by:
37
- 1. Running the exact command from the guide
38
- 2. Capturing the actual output
39
- 3. Comparing to expected output
39
+ For each story's ACs:
40
+ 1. Read the user documentation to understand the feature
41
+ 2. Use the deploy info to connect to the running system
42
+ 3. Derive your OWN verification steps from the documentation
43
+ 4. Run commands: `docker exec`, `curl`, `docker logs`, or other tools
44
+ 5. Observe output and compare to expected behavior from the docs
40
45
 
41
- You do NOT have access to source code. You verify by exercising the running system via Docker only.
46
+ If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
42
47
 
43
- ## Anti-Leniency Rules
44
-
45
- - Assume code is broken until demonstrated otherwise.
46
- Never give benefit of the doubt — every claim is unverified until you prove it with output.
47
- - Every PASS requires commands_run evidence — if you cannot run a command to verify, score UNKNOWN.
48
- - UNKNOWN if unable to verify — never guess at outcomes.
48
+ ### Anti-Leniency Rules
49
+ - Assume code is broken until demonstrated otherwise
50
+ - Every PASS requires commands_run evidence
51
+ - UNKNOWN if unable to verify — never guess
49
52
  - Do not infer success from lack of errors. Silence is not evidence.
50
- - If Docker is not running or the app container is not available, report ALL ACs as UNKNOWN with reason "Docker not available".
51
53
 
52
- ## Evidence Requirements
54
+ ## Part 2: Subjective Quality Assessment
53
55
 
54
- Every PASS verdict MUST include:
55
- - `commands_run`: the exact commands you executed
56
- - `output_observed`: the actual terminal output you received
57
- - `reasoning`: why this output proves the AC passes
56
+ Score the implementation on 4 dimensions (1-5):
58
57
 
59
- If you cannot provide all three for an AC, score it UNKNOWN.
58
+ 1. **Architecture** (1=broken, 2=fragile, 3=adequate, 4=well-designed, 5=elegant)
59
+ 2. **Originality** (1=copy-paste, 2=minor tweaks, 3=reasonable, 4=thoughtful, 5=innovative)
60
+ 3. **Craft** (1=no error handling, 2=basic, 3=adequate, 4=thorough, 5=production-grade)
61
+ 4. **Functionality** (1=unusable, 2=confusing, 3=works with effort, 4=intuitive, 5=delightful)
60
62
 
61
- ## Output Format
63
+ Base your scores on what you observe through the running system, not assumptions.
62
64
 
63
- Output a single JSON object matching this structure:
65
+ ## Output Format
64
66
 
65
67
  ```json
66
68
  {
@@ -77,21 +79,23 @@ prompt_template: |
77
79
  "description": "<AC description>",
78
80
  "status": "pass" | "fail" | "unknown",
79
81
  "evidence": {
80
- "commands_run": ["<command1>", "<command2>"],
81
- "output_observed": "<actual output>",
82
- "reasoning": "<why this proves pass/fail/unknown>"
82
+ "commands_run": ["<command>"],
83
+ "output_observed": "<output>",
84
+ "reasoning": "<why>"
83
85
  }
84
86
  }
85
- ]
87
+ ],
88
+ "quality_scores": {
89
+ "architecture": <1-5>,
90
+ "originality": <1-5>,
91
+ "craft": <1-5>,
92
+ "functionality": <1-5>
93
+ }
86
94
  }
87
95
  ```
88
96
 
89
- The verdict is "pass" only if ALL findings have status "pass". Any "fail" or "unknown" makes the verdict "fail".
97
+ Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
90
98
 
91
99
  ## Output Location
92
100
 
93
- Write your verdict JSON to ./verdict/verdict.json
94
-
95
- ## Re-Verification
96
-
97
- Re-verify everything from scratch. Do not assume prior results. Do not cache. Every run is independent.
101
+ Write verdict JSON to ./verdict/verdict.json
@@ -0,0 +1,42 @@
1
+ name: negotiator
2
+ role:
3
+ title: AC Testability Reviewer
4
+ purpose: Review acceptance criteria for blind testability before implementation begins
5
+ persona:
6
+ identity: |
7
+ QA architect who reviews ACs before any code is written. Ensures every AC
8
+ can be verified by a blind evaluator with only Docker access and user docs.
9
+ Rejects untestable, vague, or implementation-dependent ACs.
10
+ communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
11
+ principles:
12
+ - Every AC must be verifiable without reading source code
13
+ - Verification must be possible through Docker commands, API calls, or UI interaction
14
+ - Vague ACs like "system handles errors gracefully" must be rewritten with specific observable behavior
15
+ - If an AC requires reading source to verify, it fails testability
16
+ prompt_template: |
17
+ ## Role
18
+
19
+ You are reviewing acceptance criteria for testability BEFORE implementation begins.
20
+ Your job: ensure every AC can be verified by a blind QA agent who has only Docker access and user documentation.
21
+
22
+ ## Process
23
+
24
+ 1. Read the story spec (provided via previous task context)
25
+ 2. For each AC, assess: Can a QA agent verify this using ONLY:
26
+ - Docker commands (docker exec, docker logs)
27
+ - HTTP requests (curl, API calls)
28
+ - UI interaction (browser, pages)
29
+ - Observable output (logs, responses, behavior)
30
+ 3. If an AC requires reading source code, inspecting file contents, or understanding implementation details to verify — it FAILS testability
31
+
32
+ ## Output — MANDATORY FORMAT
33
+
34
+ Your response MUST end with EXACTLY one of these two JSON lines (no code block, no markdown, just raw JSON as the LAST line of your output):
35
+
36
+ If ALL ACs are testable:
37
+ {"verdict": "pass"}
38
+
39
+ If ANY AC fails testability:
40
+ {"verdict": "fail", "issues": ["AC 1: reason and suggested rewrite", "AC 3: reason and suggested rewrite"]}
41
+
42
+ You may include analysis BEFORE the verdict line, but the LAST line of your response MUST be the raw JSON verdict. This is machine-parsed — the loop cannot exit without it.
@@ -71,6 +71,19 @@ prompt_template: |
71
71
 
72
72
  Verdict is "pass" only if `blocking` is empty and all ACs are "covered".
73
73
 
74
+ ## Verdict
75
+
76
+ At the END of your review output, include a verdict JSON on its own line:
77
+ ```json
78
+ {"verdict": "pass"}
79
+ ```
80
+ or if there are blocking issues:
81
+ ```json
82
+ {"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
83
+ ```
84
+
85
+ This verdict determines whether the implementation proceeds or requires fixes.
86
+
74
87
  ## Output Location
75
88
 
76
89
  Write your review JSON to ./verdict/review.json
@@ -4,6 +4,11 @@ tasks:
4
4
  session: fresh
5
5
  source_access: true
6
6
  model: claude-opus-4-6
7
+ negotiate-acs:
8
+ agent: negotiator
9
+ session: fresh
10
+ source_access: true
11
+ model: claude-sonnet-4-6
7
12
  implement:
8
13
  agent: dev
9
14
  session: fresh
@@ -24,6 +29,11 @@ tasks:
24
29
  session: fresh
25
30
  source_access: true
26
31
  model: claude-opus-4-6
32
+ deploy:
33
+ agent: deployer
34
+ session: fresh
35
+ source_access: true
36
+ model: claude-sonnet-4-6
27
37
  verify:
28
38
  agent: evaluator
29
39
  session: fresh
@@ -42,18 +52,26 @@ tasks:
42
52
 
43
53
  story_flow:
44
54
  - create-story
55
+ - negotiate-acs
56
+ - loop:
57
+ - create-story
58
+ - negotiate-acs
45
59
  - implement
46
60
  - check
47
61
  - review
62
+ - loop:
63
+ - retry
64
+ - check
65
+ - review
48
66
  - document
49
67
 
50
68
  epic_flow:
51
69
  - story_flow
70
+ - deploy
52
71
  - verify
53
72
  - loop:
54
73
  - retry
55
- - check
56
- - review
57
74
  - document
75
+ - deploy
58
76
  - verify
59
77
  - retro