codeharness 0.34.1 → 0.35.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
2895
2895
  }
2896
2896
 
2897
2897
  // src/modules/infra/init-project.ts
2898
- var HARNESS_VERSION = true ? "0.34.1" : "0.0.0-dev";
2898
+ var HARNESS_VERSION = true ? "0.35.1" : "0.0.0-dev";
2899
2899
  function failResult(opts, error) {
2900
2900
  return {
2901
2901
  status: "fail",
@@ -16,7 +16,7 @@ import {
16
16
  stopCollectorOnly,
17
17
  stopSharedStack,
18
18
  stopStack
19
- } from "./chunk-G2RR744K.js";
19
+ } from "./chunk-AIXEFZIV.js";
20
20
  export {
21
21
  checkRemoteEndpoint,
22
22
  cleanupOrphanedContainers,
package/dist/index.js CHANGED
@@ -40,7 +40,7 @@ import {
40
40
  validateDockerfile,
41
41
  warn,
42
42
  writeState
43
- } from "./chunk-G2RR744K.js";
43
+ } from "./chunk-AIXEFZIV.js";
44
44
 
45
45
  // src/index.ts
46
46
  import { Command } from "commander";
@@ -3067,22 +3067,20 @@ function parseVerdict(output) {
3067
3067
  }
3068
3068
  return verdict;
3069
3069
  }
3070
- function parseSimpleVerdict(output) {
3071
- const jsonPattern = /\{[^{}]*"verdict"\s*:\s*"(pass|fail)"[^{}]*\}/g;
3072
- let lastMatch = null;
3073
- let m;
3074
- while ((m = jsonPattern.exec(output)) !== null) {
3075
- lastMatch = m;
3076
- }
3077
- if (!lastMatch) return null;
3078
- try {
3079
- const parsed = JSON.parse(lastMatch[0]);
3080
- if (parsed.verdict === "pass" || parsed.verdict === "fail") {
3081
- return { verdict: parsed.verdict };
3082
- }
3083
- } catch {
3084
- }
3085
- return null;
3070
/**
 * Extracts a pass/fail verdict (and optional issues text) from XML-style tags
 * embedded in free-form agent output.
 *
 * The LAST `<verdict>` / `<issues>` occurrence wins. Agents frequently restate
 * their instructions (which contain the literal text
 * `<verdict>pass</verdict> or <verdict>fail</verdict>`) before giving the real
 * answer, so taking the first match would report "pass" for any such output.
 * Whitespace inside the verdict tag is tolerated; matching is case-insensitive.
 *
 * @param {string} output - Raw text produced by the agent.
 * @returns {{verdict: "pass"|"fail", issues?: string}|null} The normalized
 *   (lower-cased) verdict, plus trimmed `issues` when an `<issues>` block is
 *   present; `null` when no valid verdict tag is found.
 */
function parseVerdictTag(output) {
  // Coerce defensively so null/undefined behave like "no tags present".
  const text = String(output ?? "");

  // matchAll works on a clone of the regex, so the /g flag carries no lastIndex state between calls.
  const verdictMatches = [...text.matchAll(/<verdict>\s*(pass|fail)\s*<\/verdict>/gi)];
  if (verdictMatches.length === 0) return null;
  const verdict = verdictMatches[verdictMatches.length - 1][1].toLowerCase();

  const issuesMatches = [...text.matchAll(/<issues>([\s\S]*?)<\/issues>/gi)];
  if (issuesMatches.length === 0) return { verdict };

  // The `issues` key is only present when an <issues> block exists (it may be empty after trimming).
  return { verdict, issues: issuesMatches[issuesMatches.length - 1][1].trim() };
}
3080
/**
 * Returns the trimmed inner text of the first `<tag>...</tag>` pair found in
 * `output`, matched case-insensitively and across newlines.
 *
 * The tag name is escaped before being placed in the pattern, so names that
 * contain regex metacharacters (`.`, `(`, `+`, ...) are matched literally
 * instead of being interpreted as regex syntax or throwing a SyntaxError.
 *
 * @param {string} output - Text to search.
 * @param {string} tag - Tag name without angle brackets, e.g. "user-docs".
 * @returns {string|null} Trimmed content between the tags, or `null` when the
 *   tag pair is not present.
 */
function extractTag(output, tag) {
  // Escape every regex metacharacter so the tag is treated as a literal string.
  const literalTag = String(tag).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(`<${literalTag}>([\\s\\S]*?)<\\/${literalTag}>`, "i");
  // Coerce so null/undefined are searched as empty text rather than "null"/"undefined".
  const match = pattern.exec(String(output ?? ""));
  return match ? match[1].trim() : null;
}
3087
3085
 
3088
3086
  // src/lib/circuit-breaker.ts
@@ -3388,14 +3386,13 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
3388
3386
  }
3389
3387
  const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
3390
3388
  const TASK_PROMPTS = {
3391
- "create-story": (key) => `Create or revise the story spec for ${key}. Read the epic definitions and architecture docs. If previous feedback is provided (from AC negotiation or review), revise the story to address that feedback. Write a complete story file with acceptance criteria, tasks, and dev notes.`,
3392
- "negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Your response MUST end with exactly one JSON line: {"verdict": "pass"} or {"verdict": "fail", "issues": ["..."]}`,
3389
+ "create-story": (key) => `Create the story spec for ${key}. Read the epic definitions and architecture docs. Write a complete story file with acceptance criteria, tasks, and dev notes. CRITICAL: Every AC must be testable by a blind QA agent using ONLY a user guide + browser/API/CLI access. No AC should reference source code, internal data structures, or implementation details like O(1) complexity. Each AC must describe observable behavior that can be verified through UI interaction (agent-browser), API calls (curl), CLI commands (docker exec), or log inspection (docker logs). Wrap output in <story-spec>...</story-spec> tags.`,
3393
3390
  "implement": (key) => `Implement story ${key}`,
3394
- "check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Report pass/fail results.`,
3395
- "review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Output a verdict JSON at the end.`,
3396
- "document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code \u2014 describe features, UI pages, API endpoints, CLI commands, and expected behavior.`,
3397
- "deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Output a deploy report JSON with container names, URLs, credentials, and health status.`,
3398
- "verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps from the documentation, connect using deploy info, run commands, and observe output. Also score subjective quality on 4 dimensions.`,
3391
+ "check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response.`,
3392
+ "review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response. If fail, include <issues>...</issues>.`,
3393
+ "document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code. Wrap documentation in <user-docs>...</user-docs> tags.`,
3394
+ "deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Wrap report in <deploy-report>...</deploy-report> tags with status, containers, URLs, credentials, health.`,
3395
+ "verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps, run commands, observe output. Include <verdict>pass</verdict> or <verdict>fail</verdict>. Include <evidence ac="N" status="pass|fail|unknown">...</evidence> per AC. Include <quality-scores>...</quality-scores>.`,
3399
3396
  "retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
3400
3397
  };
3401
3398
  let basePrompt;
@@ -3785,11 +3782,11 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
3785
3782
  }
3786
3783
  }
3787
3784
  if (!verdict) {
3788
- const simple = parseSimpleVerdict(dispatchResult.output);
3789
- if (simple) {
3785
+ const tagged = parseVerdictTag(dispatchResult.output);
3786
+ if (tagged) {
3790
3787
  verdict = {
3791
- verdict: simple.verdict,
3792
- score: { passed: simple.verdict === "pass" ? 1 : 0, failed: simple.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
3788
+ verdict: tagged.verdict,
3789
+ score: { passed: tagged.verdict === "pass" ? 1 : 0, failed: tagged.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
3793
3790
  findings: []
3794
3791
  };
3795
3792
  }
@@ -4105,9 +4102,10 @@ async function executeWorkflow(config) {
4105
4102
  const contractPath = join12(projectDir, ".codeharness", "contracts", `document-${item.key}.json`);
4106
4103
  if (existsSync15(contractPath)) {
4107
4104
  const contractData = JSON.parse(readFileSync13(contractPath, "utf-8"));
4108
- if (contractData.output) {
4105
+ const docs = contractData.output ? extractTag(contractData.output, "user-docs") ?? contractData.output : null;
4106
+ if (docs) {
4109
4107
  const guidePath = join12(guidesDir, `${item.key}-guide.md`);
4110
- writeFileSync8(guidePath, contractData.output, "utf-8");
4108
+ writeFileSync8(guidePath, docs, "utf-8");
4111
4109
  guideFiles.push(guidePath);
4112
4110
  }
4113
4111
  }
@@ -4115,9 +4113,10 @@ async function executeWorkflow(config) {
4115
4113
  const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
4116
4114
  if (existsSync15(deployContractPath)) {
4117
4115
  const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
4118
- if (deployData.output) {
4116
+ const report = deployData.output ? extractTag(deployData.output, "deploy-report") ?? deployData.output : null;
4117
+ if (report) {
4119
4118
  const deployPath = join12(guidesDir, "deploy-info.md");
4120
- writeFileSync8(deployPath, deployData.output, "utf-8");
4119
+ writeFileSync8(deployPath, report, "utf-8");
4121
4120
  guideFiles.push(deployPath);
4122
4121
  }
4123
4122
  }
@@ -11189,7 +11188,7 @@ function registerTeardownCommand(program) {
11189
11188
  } else if (otlpMode === "remote-routed") {
11190
11189
  if (!options.keepDocker) {
11191
11190
  try {
11192
- const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-MEPWHG4P.js");
11191
+ const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-GYFYNCLQ.js");
11193
11192
  stopCollectorOnly2();
11194
11193
  result.docker.stopped = true;
11195
11194
  if (!isJson) {
@@ -11221,7 +11220,7 @@ function registerTeardownCommand(program) {
11221
11220
  info("Shared stack: kept running (other projects may use it)");
11222
11221
  }
11223
11222
  } else if (isLegacyStack) {
11224
- const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-MEPWHG4P.js");
11223
+ const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-GYFYNCLQ.js");
11225
11224
  let stackRunning = false;
11226
11225
  try {
11227
11226
  stackRunning = isStackRunning2(composeFile);
@@ -14208,7 +14207,7 @@ function registerDriversCommand(program) {
14208
14207
  }
14209
14208
 
14210
14209
  // src/index.ts
14211
- var VERSION = true ? "0.34.1" : "0.0.0-dev";
14210
+ var VERSION = true ? "0.35.1" : "0.0.0-dev";
14212
14211
  function createProgram() {
14213
14212
  const program = new Command();
14214
14213
  program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codeharness",
3
- "version": "0.34.1",
3
+ "version": "0.35.1",
4
4
  "type": "module",
5
5
  "description": "CLI for codeharness — makes autonomous coding agents produce software that actually works",
6
6
  "bin": {
@@ -60,6 +60,8 @@ prompt_template: |
60
60
 
61
61
  Verdict is "pass" only if ALL checks pass.
62
62
 
63
+ Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
64
+
63
65
  ## Output Location
64
66
 
65
67
  Write results to ./verdict/check.json
@@ -22,7 +22,7 @@ prompt_template: |
22
22
 
23
23
  1. Check for Docker configuration:
24
24
  - Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
25
- - If NONE found, output: `{"status": "no-docker", "message": "No Docker configuration found in project"}`
25
+ - If NONE found, output a deploy-report with status no-docker (see Output section below)
26
26
  - STOP here if no Docker config exists
27
27
 
28
28
  2. Check for already-running containers:
@@ -34,24 +34,22 @@ prompt_template: |
34
34
  - Wait for containers to start (max 60 seconds)
35
35
  - Check health endpoints if defined
36
36
 
37
- 4. Output deploy report as JSON:
38
- ```json
39
- {
40
- "status": "running",
41
- "containers": [
42
- {"name": "container-name", "image": "image:tag", "status": "healthy", "ports": ["8000:8000"]}
43
- ],
44
- "urls": {
45
- "api": "http://localhost:8000",
46
- "web": "http://localhost:3000"
47
- },
48
- "credentials": {
49
- "db_url": "postgresql://...",
50
- "api_key": "..."
51
- },
52
- "health": "healthy"
53
- }
54
- ```
37
+ 4. Wrap your deploy report in `<deploy-report>...</deploy-report>` tags. Include: status, container names, URLs, ports, credentials, health.
38
+
39
+ Example:
40
+ <deploy-report>
41
+ status: running
42
+ containers: container-name (image:tag, healthy, 8000:8000)
43
+ urls: api=http://localhost:8000, web=http://localhost:3000
44
+ credentials: db_url=postgresql://..., api_key=...
45
+ health: healthy
46
+ </deploy-report>
47
+
48
+ If no Docker config exists, output:
49
+ <deploy-report>
50
+ status: no-docker
51
+ message: No Docker configuration found in project
52
+ </deploy-report>
55
53
 
56
54
  ## Important
57
55
 
@@ -65,3 +65,7 @@ prompt_template: |
65
65
  - Describe observable behavior, not internal logic
66
66
  - If a feature has no external interface (internal-only), describe how it affects
67
67
  other features that DO have external interfaces
68
+
69
+ ## Output — MANDATORY FORMAT
70
+
71
+ Wrap your entire documentation in `<user-docs>...</user-docs>` tags. This is machine-parsed.
@@ -40,7 +40,11 @@ prompt_template: |
40
40
  1. Read the user documentation to understand the feature
41
41
  2. Use the deploy info to connect to the running system
42
42
  3. Derive your OWN verification steps from the documentation
43
- 4. Run commands: `docker exec`, `curl`, `docker logs`, or other tools
43
+ 4. Use the appropriate verification method:
44
+ - **API**: `curl` or HTTP requests to endpoints
45
+ - **UI**: `agent-browser` to navigate pages, click elements, observe content
46
+ - **CLI**: `docker exec` to run commands inside containers
47
+ - **Logs**: `docker logs` to check for specific entries
44
48
  5. Observe output and compare to expected behavior from the docs
45
49
 
46
50
  If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
@@ -96,6 +100,16 @@ prompt_template: |
96
100
 
97
101
  Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
98
102
 
103
+ ## XML Tags — MANDATORY
104
+
105
+ In addition to the JSON file output, your response MUST include these XML tags (machine-parsed):
106
+
107
+ Include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
108
+
109
+ For each AC, include `<evidence ac="N" status="pass|fail|unknown">command, output, reasoning</evidence>`.
110
+
111
+ Include `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`.
112
+
99
113
  ## Output Location
100
114
 
101
115
  Write verdict JSON to ./verdict/verdict.json
@@ -5,38 +5,49 @@ role:
5
5
  persona:
6
6
  identity: |
7
7
  QA architect who reviews ACs before any code is written. Ensures every AC
8
- can be verified by a blind evaluator with only Docker access and user docs.
9
- Rejects untestable, vague, or implementation-dependent ACs.
8
+ can be verified by a blind evaluator with only Docker access, user docs,
9
+ and agent-browser for UI testing.
10
10
  communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
11
11
  principles:
12
12
  - Every AC must be verifiable without reading source code
13
- - Verification must be possible through Docker commands, API calls, or UI interaction
14
- - Vague ACs like "system handles errors gracefully" must be rewritten with specific observable behavior
13
+ - Verification must be possible through API calls, UI interaction, or CLI commands
14
+ - Vague ACs must be rewritten with specific observable behavior
15
15
  - If an AC requires reading source to verify, it fails testability
16
16
  prompt_template: |
17
17
  ## Role
18
18
 
19
19
  You are reviewing acceptance criteria for testability BEFORE implementation begins.
20
- Your job: ensure every AC can be verified by a blind QA agent who has only Docker access and user documentation.
20
+ Your job: ensure every AC can be verified by a blind QA agent.
21
+
22
+ ## Pass Criteria — an AC is testable if it can be verified through:
23
+
24
+ - **API**: curl/HTTP request to an endpoint, checking response body/status
25
+ - **UI**: agent-browser navigation, clicking, observing page content
26
+ - **CLI**: docker exec running a command, checking output
27
+ - **Logs**: docker logs checking for specific log entries
28
+ - **Database**: querying DB state through an exposed API or CLI tool
29
+
30
+ ## Fail Criteria — an AC is NOT testable if it requires:
31
+
32
+ - Reading source code files
33
+ - Inspecting internal data structures
34
+ - Understanding implementation details (e.g., "uses O(1) lookup" — untestable without benchmarks)
35
+ - Checking code patterns or conventions (that's the reviewer's job, not the evaluator's)
21
36
 
22
37
  ## Process
23
38
 
24
39
  1. Read the story spec (provided via previous task context)
25
- 2. For each AC, assess: Can a QA agent verify this using ONLY:
26
- - Docker commands (docker exec, docker logs)
27
- - HTTP requests (curl, API calls)
28
- - UI interaction (browser, pages)
29
- - Observable output (logs, responses, behavior)
30
- 3. If an AC requires reading source code, inspecting file contents, or understanding implementation details to verify — it FAILS testability
40
+ 2. For each AC, determine: which verification method (API/UI/CLI/Logs/DB) would prove this?
41
+ 3. If you can identify a concrete method → PASS
42
+ 4. If no external method exists → FAIL with rewrite suggestion
31
43
 
32
44
  ## Output — MANDATORY FORMAT
33
45
 
34
- Your response MUST end with EXACTLY one of these two JSON lines (no code block, no markdown, just raw JSON as the LAST line of your output):
35
-
36
- If ALL ACs are testable:
37
- {"verdict": "pass"}
46
+ Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
38
47
 
39
- If ANY AC fails testability:
40
- {"verdict": "fail", "issues": ["AC 1: reason and suggested rewrite", "AC 3: reason and suggested rewrite"]}
48
+ If fail, also include:
49
+ <issues>
50
+ AC N: [why untestable] → Suggested rewrite: [concrete rewrite with observable behavior]
51
+ </issues>
41
52
 
42
- You may include analysis BEFORE the verdict line, but the LAST line of your response MUST be the raw JSON verdict. This is machine-parsed — the loop cannot exit without it.
53
+ You may include analysis before the tags, but the XML tags are machine-parsed — the loop cannot exit without them.
@@ -73,16 +73,10 @@ prompt_template: |
73
73
 
74
74
  ## Verdict
75
75
 
76
- At the END of your review output, include a verdict JSON on its own line:
77
- ```json
78
- {"verdict": "pass"}
79
- ```
80
- or if there are blocking issues:
81
- ```json
82
- {"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
83
- ```
76
+ Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
77
+ If fail, also include `<issues>blocking issue descriptions</issues>`.
84
78
 
85
- This verdict determines whether the implementation proceeds or requires fixes.
79
+ These XML tags are machine-parsed and determine whether the implementation proceeds or requires fixes.
86
80
 
87
81
  ## Output Location
88
82
 
@@ -49,5 +49,7 @@ prompt_template: |
49
49
 
50
50
  ## Output
51
51
 
52
+ Wrap your story spec in `<story-spec>...</story-spec>` tags. This is machine-parsed.
53
+
52
54
  Write the story file to the implementation artifacts directory following the project's naming convention.
53
55
  Mark the story as `ready-for-dev` in the sprint status.
@@ -4,11 +4,6 @@ tasks:
4
4
  session: fresh
5
5
  source_access: true
6
6
  model: claude-opus-4-6
7
- negotiate-acs:
8
- agent: negotiator
9
- session: fresh
10
- source_access: true
11
- model: claude-sonnet-4-6
12
7
  implement:
13
8
  agent: dev
14
9
  session: fresh
@@ -52,10 +47,6 @@ tasks:
52
47
 
53
48
  story_flow:
54
49
  - create-story
55
- - negotiate-acs
56
- - loop:
57
- - create-story
58
- - negotiate-acs
59
50
  - implement
60
51
  - check
61
52
  - review