codeharness 0.34.1 → 0.35.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-G2RR744K.js → chunk-AIXEFZIV.js} +1 -1
- package/dist/{docker-MEPWHG4P.js → docker-GYFYNCLQ.js} +1 -1
- package/dist/index.js +34 -35
- package/package.json +1 -1
- package/templates/agents/checker.yaml +2 -0
- package/templates/agents/deployer.yaml +17 -19
- package/templates/agents/documenter.yaml +4 -0
- package/templates/agents/evaluator.yaml +15 -1
- package/templates/agents/negotiator.yaml +29 -18
- package/templates/agents/reviewer.yaml +3 -9
- package/templates/agents/story-creator.yaml +2 -0
- package/templates/workflows/default.yaml +0 -9
|
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
|
|
|
2895
2895
|
}
|
|
2896
2896
|
|
|
2897
2897
|
// src/modules/infra/init-project.ts
|
|
2898
|
-
var HARNESS_VERSION = true ? "0.34.1" : "0.0.0-dev";
|
|
2898
|
+
var HARNESS_VERSION = true ? "0.35.1" : "0.0.0-dev";
|
|
2899
2899
|
function failResult(opts, error) {
|
|
2900
2900
|
return {
|
|
2901
2901
|
status: "fail",
|
package/dist/index.js
CHANGED
|
@@ -40,7 +40,7 @@ import {
|
|
|
40
40
|
validateDockerfile,
|
|
41
41
|
warn,
|
|
42
42
|
writeState
|
|
43
|
-
} from "./chunk-G2RR744K.js";
|
|
43
|
+
} from "./chunk-AIXEFZIV.js";
|
|
44
44
|
|
|
45
45
|
// src/index.ts
|
|
46
46
|
import { Command } from "commander";
|
|
@@ -3067,22 +3067,20 @@ function parseVerdict(output) {
|
|
|
3067
3067
|
}
|
|
3068
3068
|
return verdict;
|
|
3069
3069
|
}
|
|
3070
|
-
function
|
|
3071
|
-
const
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
}
|
|
3085
|
-
return null;
|
|
3070
|
+
function parseVerdictTag(output) {
|
|
3071
|
+
const match = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
|
|
3072
|
+
if (!match) return null;
|
|
3073
|
+
const verdict = match[1].toLowerCase();
|
|
3074
|
+
const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
|
|
3075
|
+
return {
|
|
3076
|
+
verdict,
|
|
3077
|
+
...issuesMatch ? { issues: issuesMatch[1].trim() } : {}
|
|
3078
|
+
};
|
|
3079
|
+
}
|
|
3080
|
+
function extractTag(output, tag) {
|
|
3081
|
+
const pattern = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`, "i");
|
|
3082
|
+
const match = pattern.exec(output);
|
|
3083
|
+
return match ? match[1].trim() : null;
|
|
3086
3084
|
}
|
|
3087
3085
|
|
|
3088
3086
|
// src/lib/circuit-breaker.ts
|
|
@@ -3388,14 +3386,13 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
|
|
|
3388
3386
|
}
|
|
3389
3387
|
const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
|
|
3390
3388
|
const TASK_PROMPTS = {
|
|
3391
|
-
"create-story": (key) => `Create
|
|
3392
|
-
"negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Your response MUST end with exactly one JSON line: {"verdict": "pass"} or {"verdict": "fail", "issues": ["..."]}`,
|
|
3389
|
+
"create-story": (key) => `Create the story spec for ${key}. Read the epic definitions and architecture docs. Write a complete story file with acceptance criteria, tasks, and dev notes. CRITICAL: Every AC must be testable by a blind QA agent using ONLY a user guide + browser/API/CLI access. No AC should reference source code, internal data structures, or implementation details like O(1) complexity. Each AC must describe observable behavior that can be verified through UI interaction (agent-browser), API calls (curl), CLI commands (docker exec), or log inspection (docker logs). Wrap output in <story-spec>...</story-spec> tags.`,
|
|
3393
3390
|
"implement": (key) => `Implement story ${key}`,
|
|
3394
|
-
"check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool.
|
|
3395
|
-
"review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage.
|
|
3396
|
-
"document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code
|
|
3397
|
-
"deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health.
|
|
3398
|
-
"verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps
|
|
3391
|
+
"check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response.`,
|
|
3392
|
+
"review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response. If fail, include <issues>...</issues>.`,
|
|
3393
|
+
"document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code. Wrap documentation in <user-docs>...</user-docs> tags.`,
|
|
3394
|
+
"deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Wrap report in <deploy-report>...</deploy-report> tags with status, containers, URLs, credentials, health.`,
|
|
3395
|
+
"verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps, run commands, observe output. Include <verdict>pass</verdict> or <verdict>fail</verdict>. Include <evidence ac="N" status="pass|fail|unknown">...</evidence> per AC. Include <quality-scores>...</quality-scores>.`,
|
|
3399
3396
|
"retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
|
|
3400
3397
|
};
|
|
3401
3398
|
let basePrompt;
|
|
@@ -3785,11 +3782,11 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
|
|
|
3785
3782
|
}
|
|
3786
3783
|
}
|
|
3787
3784
|
if (!verdict) {
|
|
3788
|
-
const
|
|
3789
|
-
if (
|
|
3785
|
+
const tagged = parseVerdictTag(dispatchResult.output);
|
|
3786
|
+
if (tagged) {
|
|
3790
3787
|
verdict = {
|
|
3791
|
-
verdict:
|
|
3792
|
-
score: { passed:
|
|
3788
|
+
verdict: tagged.verdict,
|
|
3789
|
+
score: { passed: tagged.verdict === "pass" ? 1 : 0, failed: tagged.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
|
|
3793
3790
|
findings: []
|
|
3794
3791
|
};
|
|
3795
3792
|
}
|
|
@@ -4105,9 +4102,10 @@ async function executeWorkflow(config) {
|
|
|
4105
4102
|
const contractPath = join12(projectDir, ".codeharness", "contracts", `document-${item.key}.json`);
|
|
4106
4103
|
if (existsSync15(contractPath)) {
|
|
4107
4104
|
const contractData = JSON.parse(readFileSync13(contractPath, "utf-8"));
|
|
4108
|
-
|
|
4105
|
+
const docs = contractData.output ? extractTag(contractData.output, "user-docs") ?? contractData.output : null;
|
|
4106
|
+
if (docs) {
|
|
4109
4107
|
const guidePath = join12(guidesDir, `${item.key}-guide.md`);
|
|
4110
|
-
writeFileSync8(guidePath,
|
|
4108
|
+
writeFileSync8(guidePath, docs, "utf-8");
|
|
4111
4109
|
guideFiles.push(guidePath);
|
|
4112
4110
|
}
|
|
4113
4111
|
}
|
|
@@ -4115,9 +4113,10 @@ async function executeWorkflow(config) {
|
|
|
4115
4113
|
const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
|
|
4116
4114
|
if (existsSync15(deployContractPath)) {
|
|
4117
4115
|
const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
|
|
4118
|
-
|
|
4116
|
+
const report = deployData.output ? extractTag(deployData.output, "deploy-report") ?? deployData.output : null;
|
|
4117
|
+
if (report) {
|
|
4119
4118
|
const deployPath = join12(guidesDir, "deploy-info.md");
|
|
4120
|
-
writeFileSync8(deployPath,
|
|
4119
|
+
writeFileSync8(deployPath, report, "utf-8");
|
|
4121
4120
|
guideFiles.push(deployPath);
|
|
4122
4121
|
}
|
|
4123
4122
|
}
|
|
@@ -11189,7 +11188,7 @@ function registerTeardownCommand(program) {
|
|
|
11189
11188
|
} else if (otlpMode === "remote-routed") {
|
|
11190
11189
|
if (!options.keepDocker) {
|
|
11191
11190
|
try {
|
|
11192
|
-
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-MEPWHG4P.js");
|
|
11191
|
+
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-GYFYNCLQ.js");
|
|
11193
11192
|
stopCollectorOnly2();
|
|
11194
11193
|
result.docker.stopped = true;
|
|
11195
11194
|
if (!isJson) {
|
|
@@ -11221,7 +11220,7 @@ function registerTeardownCommand(program) {
|
|
|
11221
11220
|
info("Shared stack: kept running (other projects may use it)");
|
|
11222
11221
|
}
|
|
11223
11222
|
} else if (isLegacyStack) {
|
|
11224
|
-
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-MEPWHG4P.js");
|
|
11223
|
+
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-GYFYNCLQ.js");
|
|
11225
11224
|
let stackRunning = false;
|
|
11226
11225
|
try {
|
|
11227
11226
|
stackRunning = isStackRunning2(composeFile);
|
|
@@ -14208,7 +14207,7 @@ function registerDriversCommand(program) {
|
|
|
14208
14207
|
}
|
|
14209
14208
|
|
|
14210
14209
|
// src/index.ts
|
|
14211
|
-
var VERSION = true ? "0.34.1" : "0.0.0-dev";
|
|
14210
|
+
var VERSION = true ? "0.35.1" : "0.0.0-dev";
|
|
14212
14211
|
function createProgram() {
|
|
14213
14212
|
const program = new Command();
|
|
14214
14213
|
program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
|
package/package.json
CHANGED
|
@@ -22,7 +22,7 @@ prompt_template: |
|
|
|
22
22
|
|
|
23
23
|
1. Check for Docker configuration:
|
|
24
24
|
- Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
|
|
25
|
-
- If NONE found, output
|
|
25
|
+
- If NONE found, output a deploy-report with status no-docker (see Output section below)
|
|
26
26
|
- STOP here if no Docker config exists
|
|
27
27
|
|
|
28
28
|
2. Check for already-running containers:
|
|
@@ -34,24 +34,22 @@ prompt_template: |
|
|
|
34
34
|
- Wait for containers to start (max 60 seconds)
|
|
35
35
|
- Check health endpoints if defined
|
|
36
36
|
|
|
37
|
-
4.
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
}
|
|
54
|
-
```
|
|
37
|
+
4. Wrap your deploy report in `<deploy-report>...</deploy-report>` tags. Include: status, container names, URLs, ports, credentials, health.
|
|
38
|
+
|
|
39
|
+
Example:
|
|
40
|
+
<deploy-report>
|
|
41
|
+
status: running
|
|
42
|
+
containers: container-name (image:tag, healthy, 8000:8000)
|
|
43
|
+
urls: api=http://localhost:8000, web=http://localhost:3000
|
|
44
|
+
credentials: db_url=postgresql://..., api_key=...
|
|
45
|
+
health: healthy
|
|
46
|
+
</deploy-report>
|
|
47
|
+
|
|
48
|
+
If no Docker config exists, output:
|
|
49
|
+
<deploy-report>
|
|
50
|
+
status: no-docker
|
|
51
|
+
message: No Docker configuration found in project
|
|
52
|
+
</deploy-report>
|
|
55
53
|
|
|
56
54
|
## Important
|
|
57
55
|
|
|
@@ -65,3 +65,7 @@ prompt_template: |
|
|
|
65
65
|
- Describe observable behavior, not internal logic
|
|
66
66
|
- If a feature has no external interface (internal-only), describe how it affects
|
|
67
67
|
other features that DO have external interfaces
|
|
68
|
+
|
|
69
|
+
## Output — MANDATORY FORMAT
|
|
70
|
+
|
|
71
|
+
Wrap your entire documentation in `<user-docs>...</user-docs>` tags. This is machine-parsed.
|
|
@@ -40,7 +40,11 @@ prompt_template: |
|
|
|
40
40
|
1. Read the user documentation to understand the feature
|
|
41
41
|
2. Use the deploy info to connect to the running system
|
|
42
42
|
3. Derive your OWN verification steps from the documentation
|
|
43
|
-
4.
|
|
43
|
+
4. Use the appropriate verification method:
|
|
44
|
+
- **API**: `curl` or HTTP requests to endpoints
|
|
45
|
+
- **UI**: `agent-browser` to navigate pages, click elements, observe content
|
|
46
|
+
- **CLI**: `docker exec` to run commands inside containers
|
|
47
|
+
- **Logs**: `docker logs` to check for specific entries
|
|
44
48
|
5. Observe output and compare to expected behavior from the docs
|
|
45
49
|
|
|
46
50
|
If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
|
|
@@ -96,6 +100,16 @@ prompt_template: |
|
|
|
96
100
|
|
|
97
101
|
Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
|
|
98
102
|
|
|
103
|
+
## XML Tags — MANDATORY
|
|
104
|
+
|
|
105
|
+
In addition to the JSON file output, your response MUST include these XML tags (machine-parsed):
|
|
106
|
+
|
|
107
|
+
Include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
108
|
+
|
|
109
|
+
For each AC, include `<evidence ac="N" status="pass|fail|unknown">command, output, reasoning</evidence>`.
|
|
110
|
+
|
|
111
|
+
Include `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`.
|
|
112
|
+
|
|
99
113
|
## Output Location
|
|
100
114
|
|
|
101
115
|
Write verdict JSON to ./verdict/verdict.json
|
|
@@ -5,38 +5,49 @@ role:
|
|
|
5
5
|
persona:
|
|
6
6
|
identity: |
|
|
7
7
|
QA architect who reviews ACs before any code is written. Ensures every AC
|
|
8
|
-
can be verified by a blind evaluator with only Docker access
|
|
9
|
-
|
|
8
|
+
can be verified by a blind evaluator with only Docker access, user docs,
|
|
9
|
+
and agent-browser for UI testing.
|
|
10
10
|
communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
|
|
11
11
|
principles:
|
|
12
12
|
- Every AC must be verifiable without reading source code
|
|
13
|
-
- Verification must be possible through
|
|
14
|
-
- Vague ACs
|
|
13
|
+
- Verification must be possible through API calls, UI interaction, or CLI commands
|
|
14
|
+
- Vague ACs must be rewritten with specific observable behavior
|
|
15
15
|
- If an AC requires reading source to verify, it fails testability
|
|
16
16
|
prompt_template: |
|
|
17
17
|
## Role
|
|
18
18
|
|
|
19
19
|
You are reviewing acceptance criteria for testability BEFORE implementation begins.
|
|
20
|
-
Your job: ensure every AC can be verified by a blind QA agent
|
|
20
|
+
Your job: ensure every AC can be verified by a blind QA agent.
|
|
21
|
+
|
|
22
|
+
## Pass Criteria — an AC is testable if it can be verified through:
|
|
23
|
+
|
|
24
|
+
- **API**: curl/HTTP request to an endpoint, checking response body/status
|
|
25
|
+
- **UI**: agent-browser navigation, clicking, observing page content
|
|
26
|
+
- **CLI**: docker exec running a command, checking output
|
|
27
|
+
- **Logs**: docker logs checking for specific log entries
|
|
28
|
+
- **Database**: querying DB state through an exposed API or CLI tool
|
|
29
|
+
|
|
30
|
+
## Fail Criteria — an AC is NOT testable if it requires:
|
|
31
|
+
|
|
32
|
+
- Reading source code files
|
|
33
|
+
- Inspecting internal data structures
|
|
34
|
+
- Understanding implementation details (e.g., "uses O(1) lookup" — untestable without benchmarks)
|
|
35
|
+
- Checking code patterns or conventions (that's the reviewer's job, not the evaluator's)
|
|
21
36
|
|
|
22
37
|
## Process
|
|
23
38
|
|
|
24
39
|
1. Read the story spec (provided via previous task context)
|
|
25
|
-
2. For each AC,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
- UI interaction (browser, pages)
|
|
29
|
-
- Observable output (logs, responses, behavior)
|
|
30
|
-
3. If an AC requires reading source code, inspecting file contents, or understanding implementation details to verify — it FAILS testability
|
|
40
|
+
2. For each AC, determine: which verification method (API/UI/CLI/Logs/DB) would prove this?
|
|
41
|
+
3. If you can identify a concrete method → PASS
|
|
42
|
+
4. If no external method exists → FAIL with rewrite suggestion
|
|
31
43
|
|
|
32
44
|
## Output — MANDATORY FORMAT
|
|
33
45
|
|
|
34
|
-
Your response MUST
|
|
35
|
-
|
|
36
|
-
If ALL ACs are testable:
|
|
37
|
-
{"verdict": "pass"}
|
|
46
|
+
Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
38
47
|
|
|
39
|
-
If
|
|
40
|
-
|
|
48
|
+
If fail, also include:
|
|
49
|
+
<issues>
|
|
50
|
+
AC N: [why untestable] → Suggested rewrite: [concrete rewrite with observable behavior]
|
|
51
|
+
</issues>
|
|
41
52
|
|
|
42
|
-
You may include analysis
|
|
53
|
+
You may include analysis before the tags, but the XML tags are machine-parsed — the loop cannot exit without them.
|
|
@@ -73,16 +73,10 @@ prompt_template: |
|
|
|
73
73
|
|
|
74
74
|
## Verdict
|
|
75
75
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
{"verdict": "pass"}
|
|
79
|
-
```
|
|
80
|
-
or if there are blocking issues:
|
|
81
|
-
```json
|
|
82
|
-
{"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
|
|
83
|
-
```
|
|
76
|
+
Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
77
|
+
If fail, also include `<issues>blocking issue descriptions</issues>`.
|
|
84
78
|
|
|
85
|
-
|
|
79
|
+
These XML tags are machine-parsed and determine whether the implementation proceeds or requires fixes.
|
|
86
80
|
|
|
87
81
|
## Output Location
|
|
88
82
|
|
|
@@ -49,5 +49,7 @@ prompt_template: |
|
|
|
49
49
|
|
|
50
50
|
## Output
|
|
51
51
|
|
|
52
|
+
Wrap your story spec in `<story-spec>...</story-spec>` tags. This is machine-parsed.
|
|
53
|
+
|
|
52
54
|
Write the story file to the implementation artifacts directory following the project's naming convention.
|
|
53
55
|
Mark the story as `ready-for-dev` in the sprint status.
|
|
@@ -4,11 +4,6 @@ tasks:
|
|
|
4
4
|
session: fresh
|
|
5
5
|
source_access: true
|
|
6
6
|
model: claude-opus-4-6
|
|
7
|
-
negotiate-acs:
|
|
8
|
-
agent: negotiator
|
|
9
|
-
session: fresh
|
|
10
|
-
source_access: true
|
|
11
|
-
model: claude-sonnet-4-6
|
|
12
7
|
implement:
|
|
13
8
|
agent: dev
|
|
14
9
|
session: fresh
|
|
@@ -52,10 +47,6 @@ tasks:
|
|
|
52
47
|
|
|
53
48
|
story_flow:
|
|
54
49
|
- create-story
|
|
55
|
-
- negotiate-acs
|
|
56
|
-
- loop:
|
|
57
|
-
- create-story
|
|
58
|
-
- negotiate-acs
|
|
59
50
|
- implement
|
|
60
51
|
- check
|
|
61
52
|
- review
|