codeharness 0.34.0 → 0.35.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QMWMRFGH.js → chunk-PUZ5PWYL.js} +1 -1
- package/dist/{docker-QYSLNCQ2.js → docker-CTLGRLXP.js} +1 -1
- package/dist/index.js +34 -30
- package/package.json +1 -1
- package/templates/agents/checker.yaml +2 -0
- package/templates/agents/deployer.yaml +17 -19
- package/templates/agents/documenter.yaml +4 -0
- package/templates/agents/evaluator.yaml +15 -1
- package/templates/agents/negotiator.yaml +34 -25
- package/templates/agents/reviewer.yaml +3 -9
- package/templates/agents/story-creator.yaml +2 -0
|
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
|
|
|
2895
2895
|
}
|
|
2896
2896
|
|
|
2897
2897
|
// src/modules/infra/init-project.ts
|
|
2898
|
-
var HARNESS_VERSION = true ? "0.34.0" : "0.0.0-dev";
|
|
2898
|
+
var HARNESS_VERSION = true ? "0.35.0" : "0.0.0-dev";
|
|
2899
2899
|
function failResult(opts, error) {
|
|
2900
2900
|
return {
|
|
2901
2901
|
status: "fail",
|
package/dist/index.js
CHANGED
|
@@ -40,7 +40,7 @@ import {
|
|
|
40
40
|
validateDockerfile,
|
|
41
41
|
warn,
|
|
42
42
|
writeState
|
|
43
|
-
} from "./chunk-QMWMRFGH.js";
|
|
43
|
+
} from "./chunk-PUZ5PWYL.js";
|
|
44
44
|
|
|
45
45
|
// src/index.ts
|
|
46
46
|
import { Command } from "commander";
|
|
@@ -3067,18 +3067,20 @@ function parseVerdict(output) {
|
|
|
3067
3067
|
}
|
|
3068
3068
|
return verdict;
|
|
3069
3069
|
}
|
|
3070
|
-
function
|
|
3071
|
-
const
|
|
3072
|
-
const match = jsonPattern.exec(output);
|
|
3070
|
+
function parseVerdictTag(output) {
|
|
3071
|
+
const match = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
|
|
3073
3072
|
if (!match) return null;
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
}
|
|
3079
|
-
}
|
|
3080
|
-
|
|
3081
|
-
|
|
3073
|
+
const verdict = match[1].toLowerCase();
|
|
3074
|
+
const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
|
|
3075
|
+
return {
|
|
3076
|
+
verdict,
|
|
3077
|
+
...issuesMatch ? { issues: issuesMatch[1].trim() } : {}
|
|
3078
|
+
};
|
|
3079
|
+
}
|
|
3080
|
+
function extractTag(output, tag) {
|
|
3081
|
+
const pattern = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`, "i");
|
|
3082
|
+
const match = pattern.exec(output);
|
|
3083
|
+
return match ? match[1].trim() : null;
|
|
3082
3084
|
}
|
|
3083
3085
|
|
|
3084
3086
|
// src/lib/circuit-breaker.ts
|
|
@@ -3384,14 +3386,14 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
|
|
|
3384
3386
|
}
|
|
3385
3387
|
const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
|
|
3386
3388
|
const TASK_PROMPTS = {
|
|
3387
|
-
"create-story": (key) => `Create the story spec for ${key}. Read the epic definitions and architecture docs,
|
|
3388
|
-
"negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation?
|
|
3389
|
+
"create-story": (key) => `Create or revise the story spec for ${key}. Read the epic definitions and architecture docs. If previous feedback is provided (from AC negotiation or review), revise the story to address that feedback. Write a complete story file with acceptance criteria, tasks, and dev notes. Wrap output in <story-spec>...</story-spec> tags.`,
|
|
3390
|
+
"negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response. If fail, include <issues>...</issues> with specific feedback per AC.`,
|
|
3389
3391
|
"implement": (key) => `Implement story ${key}`,
|
|
3390
|
-
"check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool.
|
|
3391
|
-
"review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage.
|
|
3392
|
-
"document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code
|
|
3393
|
-
"deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health.
|
|
3394
|
-
"verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps
|
|
3392
|
+
"check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response.`,
|
|
3393
|
+
"review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response. If fail, include <issues>...</issues>.`,
|
|
3394
|
+
"document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code. Wrap documentation in <user-docs>...</user-docs> tags.`,
|
|
3395
|
+
"deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Wrap report in <deploy-report>...</deploy-report> tags with status, containers, URLs, credentials, health.`,
|
|
3396
|
+
"verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps, run commands, observe output. Include <verdict>pass</verdict> or <verdict>fail</verdict>. Include <evidence ac="N" status="pass|fail|unknown">...</evidence> per AC. Include <quality-scores>...</quality-scores>.`,
|
|
3395
3397
|
"retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
|
|
3396
3398
|
};
|
|
3397
3399
|
let basePrompt;
|
|
@@ -3781,11 +3783,11 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
|
|
|
3781
3783
|
}
|
|
3782
3784
|
}
|
|
3783
3785
|
if (!verdict) {
|
|
3784
|
-
const
|
|
3785
|
-
if (
|
|
3786
|
+
const tagged = parseVerdictTag(dispatchResult.output);
|
|
3787
|
+
if (tagged) {
|
|
3786
3788
|
verdict = {
|
|
3787
|
-
verdict:
|
|
3788
|
-
score: { passed:
|
|
3789
|
+
verdict: tagged.verdict,
|
|
3790
|
+
score: { passed: tagged.verdict === "pass" ? 1 : 0, failed: tagged.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
|
|
3789
3791
|
findings: []
|
|
3790
3792
|
};
|
|
3791
3793
|
}
|
|
@@ -4101,9 +4103,10 @@ async function executeWorkflow(config) {
|
|
|
4101
4103
|
const contractPath = join12(projectDir, ".codeharness", "contracts", `document-${item.key}.json`);
|
|
4102
4104
|
if (existsSync15(contractPath)) {
|
|
4103
4105
|
const contractData = JSON.parse(readFileSync13(contractPath, "utf-8"));
|
|
4104
|
-
|
|
4106
|
+
const docs = contractData.output ? extractTag(contractData.output, "user-docs") ?? contractData.output : null;
|
|
4107
|
+
if (docs) {
|
|
4105
4108
|
const guidePath = join12(guidesDir, `${item.key}-guide.md`);
|
|
4106
|
-
writeFileSync8(guidePath,
|
|
4109
|
+
writeFileSync8(guidePath, docs, "utf-8");
|
|
4107
4110
|
guideFiles.push(guidePath);
|
|
4108
4111
|
}
|
|
4109
4112
|
}
|
|
@@ -4111,9 +4114,10 @@ async function executeWorkflow(config) {
|
|
|
4111
4114
|
const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
|
|
4112
4115
|
if (existsSync15(deployContractPath)) {
|
|
4113
4116
|
const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
|
|
4114
|
-
|
|
4117
|
+
const report = deployData.output ? extractTag(deployData.output, "deploy-report") ?? deployData.output : null;
|
|
4118
|
+
if (report) {
|
|
4115
4119
|
const deployPath = join12(guidesDir, "deploy-info.md");
|
|
4116
|
-
writeFileSync8(deployPath,
|
|
4120
|
+
writeFileSync8(deployPath, report, "utf-8");
|
|
4117
4121
|
guideFiles.push(deployPath);
|
|
4118
4122
|
}
|
|
4119
4123
|
}
|
|
@@ -11185,7 +11189,7 @@ function registerTeardownCommand(program) {
|
|
|
11185
11189
|
} else if (otlpMode === "remote-routed") {
|
|
11186
11190
|
if (!options.keepDocker) {
|
|
11187
11191
|
try {
|
|
11188
|
-
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-QYSLNCQ2.js");
|
|
11192
|
+
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-CTLGRLXP.js");
|
|
11189
11193
|
stopCollectorOnly2();
|
|
11190
11194
|
result.docker.stopped = true;
|
|
11191
11195
|
if (!isJson) {
|
|
@@ -11217,7 +11221,7 @@ function registerTeardownCommand(program) {
|
|
|
11217
11221
|
info("Shared stack: kept running (other projects may use it)");
|
|
11218
11222
|
}
|
|
11219
11223
|
} else if (isLegacyStack) {
|
|
11220
|
-
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-QYSLNCQ2.js");
|
|
11224
|
+
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-CTLGRLXP.js");
|
|
11221
11225
|
let stackRunning = false;
|
|
11222
11226
|
try {
|
|
11223
11227
|
stackRunning = isStackRunning2(composeFile);
|
|
@@ -14204,7 +14208,7 @@ function registerDriversCommand(program) {
|
|
|
14204
14208
|
}
|
|
14205
14209
|
|
|
14206
14210
|
// src/index.ts
|
|
14207
|
-
var VERSION = true ? "0.34.0" : "0.0.0-dev";
|
|
14211
|
+
var VERSION = true ? "0.35.0" : "0.0.0-dev";
|
|
14208
14212
|
function createProgram() {
|
|
14209
14213
|
const program = new Command();
|
|
14210
14214
|
program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
|
package/package.json
CHANGED
|
@@ -22,7 +22,7 @@ prompt_template: |
|
|
|
22
22
|
|
|
23
23
|
1. Check for Docker configuration:
|
|
24
24
|
- Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
|
|
25
|
-
- If NONE found, output
|
|
25
|
+
- If NONE found, output a deploy-report with status no-docker (see Output section below)
|
|
26
26
|
- STOP here if no Docker config exists
|
|
27
27
|
|
|
28
28
|
2. Check for already-running containers:
|
|
@@ -34,24 +34,22 @@ prompt_template: |
|
|
|
34
34
|
- Wait for containers to start (max 60 seconds)
|
|
35
35
|
- Check health endpoints if defined
|
|
36
36
|
|
|
37
|
-
4.
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
}
|
|
54
|
-
```
|
|
37
|
+
4. Wrap your deploy report in `<deploy-report>...</deploy-report>` tags. Include: status, container names, URLs, ports, credentials, health.
|
|
38
|
+
|
|
39
|
+
Example:
|
|
40
|
+
<deploy-report>
|
|
41
|
+
status: running
|
|
42
|
+
containers: container-name (image:tag, healthy, 8000:8000)
|
|
43
|
+
urls: api=http://localhost:8000, web=http://localhost:3000
|
|
44
|
+
credentials: db_url=postgresql://..., api_key=...
|
|
45
|
+
health: healthy
|
|
46
|
+
</deploy-report>
|
|
47
|
+
|
|
48
|
+
If no Docker config exists, output:
|
|
49
|
+
<deploy-report>
|
|
50
|
+
status: no-docker
|
|
51
|
+
message: No Docker configuration found in project
|
|
52
|
+
</deploy-report>
|
|
55
53
|
|
|
56
54
|
## Important
|
|
57
55
|
|
|
@@ -65,3 +65,7 @@ prompt_template: |
|
|
|
65
65
|
- Describe observable behavior, not internal logic
|
|
66
66
|
- If a feature has no external interface (internal-only), describe how it affects
|
|
67
67
|
other features that DO have external interfaces
|
|
68
|
+
|
|
69
|
+
## Output — MANDATORY FORMAT
|
|
70
|
+
|
|
71
|
+
Wrap your entire documentation in `<user-docs>...</user-docs>` tags. This is machine-parsed.
|
|
@@ -40,7 +40,11 @@ prompt_template: |
|
|
|
40
40
|
1. Read the user documentation to understand the feature
|
|
41
41
|
2. Use the deploy info to connect to the running system
|
|
42
42
|
3. Derive your OWN verification steps from the documentation
|
|
43
|
-
4.
|
|
43
|
+
4. Use the appropriate verification method:
|
|
44
|
+
- **API**: `curl` or HTTP requests to endpoints
|
|
45
|
+
- **UI**: `agent-browser` to navigate pages, click elements, observe content
|
|
46
|
+
- **CLI**: `docker exec` to run commands inside containers
|
|
47
|
+
- **Logs**: `docker logs` to check for specific entries
|
|
44
48
|
5. Observe output and compare to expected behavior from the docs
|
|
45
49
|
|
|
46
50
|
If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
|
|
@@ -96,6 +100,16 @@ prompt_template: |
|
|
|
96
100
|
|
|
97
101
|
Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
|
|
98
102
|
|
|
103
|
+
## XML Tags — MANDATORY
|
|
104
|
+
|
|
105
|
+
In addition to the JSON file output, your response MUST include these XML tags (machine-parsed):
|
|
106
|
+
|
|
107
|
+
Include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
108
|
+
|
|
109
|
+
For each AC, include `<evidence ac="N" status="pass|fail|unknown">command, output, reasoning</evidence>`.
|
|
110
|
+
|
|
111
|
+
Include `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`.
|
|
112
|
+
|
|
99
113
|
## Output Location
|
|
100
114
|
|
|
101
115
|
Write verdict JSON to ./verdict/verdict.json
|
|
@@ -5,40 +5,49 @@ role:
|
|
|
5
5
|
persona:
|
|
6
6
|
identity: |
|
|
7
7
|
QA architect who reviews ACs before any code is written. Ensures every AC
|
|
8
|
-
can be verified by a blind evaluator with only Docker access
|
|
9
|
-
|
|
8
|
+
can be verified by a blind evaluator with only Docker access, user docs,
|
|
9
|
+
and agent-browser for UI testing.
|
|
10
10
|
communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
|
|
11
11
|
principles:
|
|
12
12
|
- Every AC must be verifiable without reading source code
|
|
13
|
-
- Verification must be possible through
|
|
14
|
-
- Vague ACs
|
|
13
|
+
- Verification must be possible through API calls, UI interaction, or CLI commands
|
|
14
|
+
- Vague ACs must be rewritten with specific observable behavior
|
|
15
15
|
- If an AC requires reading source to verify, it fails testability
|
|
16
16
|
prompt_template: |
|
|
17
17
|
## Role
|
|
18
18
|
|
|
19
19
|
You are reviewing acceptance criteria for testability BEFORE implementation begins.
|
|
20
|
-
Your job: ensure every AC can be verified by a blind QA agent
|
|
20
|
+
Your job: ensure every AC can be verified by a blind QA agent.
|
|
21
|
+
|
|
22
|
+
## Pass Criteria — an AC is testable if it can be verified through:
|
|
23
|
+
|
|
24
|
+
- **API**: curl/HTTP request to an endpoint, checking response body/status
|
|
25
|
+
- **UI**: agent-browser navigation, clicking, observing page content
|
|
26
|
+
- **CLI**: docker exec running a command, checking output
|
|
27
|
+
- **Logs**: docker logs checking for specific log entries
|
|
28
|
+
- **Database**: querying DB state through an exposed API or CLI tool
|
|
29
|
+
|
|
30
|
+
## Fail Criteria — an AC is NOT testable if it requires:
|
|
31
|
+
|
|
32
|
+
- Reading source code files
|
|
33
|
+
- Inspecting internal data structures
|
|
34
|
+
- Understanding implementation details (e.g., "uses O(1) lookup" — untestable without benchmarks)
|
|
35
|
+
- Checking code patterns or conventions (that's the reviewer's job, not the evaluator's)
|
|
21
36
|
|
|
22
37
|
## Process
|
|
23
38
|
|
|
24
39
|
1. Read the story spec (provided via previous task context)
|
|
25
|
-
2. For each AC,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
If ANY AC fails testability, output:
|
|
40
|
-
```json
|
|
41
|
-
{"verdict": "fail", "issues": ["AC N: [reason it's untestable]. Suggested rewrite: [specific rewrite]"]}
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
Be specific. Don't just say "untestable" — explain WHY and provide a concrete rewrite.
|
|
40
|
+
2. For each AC, determine: which verification method (API/UI/CLI/Logs/DB) would prove this?
|
|
41
|
+
3. If you can identify a concrete method → PASS
|
|
42
|
+
4. If no external method exists → FAIL with rewrite suggestion
|
|
43
|
+
|
|
44
|
+
## Output — MANDATORY FORMAT
|
|
45
|
+
|
|
46
|
+
Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
47
|
+
|
|
48
|
+
If fail, also include:
|
|
49
|
+
<issues>
|
|
50
|
+
AC N: [why untestable] → Suggested rewrite: [concrete rewrite with observable behavior]
|
|
51
|
+
</issues>
|
|
52
|
+
|
|
53
|
+
You may include analysis before the tags, but the XML tags are machine-parsed — the loop cannot exit without them.
|
|
@@ -73,16 +73,10 @@ prompt_template: |
|
|
|
73
73
|
|
|
74
74
|
## Verdict
|
|
75
75
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
{"verdict": "pass"}
|
|
79
|
-
```
|
|
80
|
-
or if there are blocking issues:
|
|
81
|
-
```json
|
|
82
|
-
{"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
|
|
83
|
-
```
|
|
76
|
+
Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
77
|
+
If fail, also include `<issues>blocking issue descriptions</issues>`.
|
|
84
78
|
|
|
85
|
-
|
|
79
|
+
These XML tags are machine-parsed and determine whether the implementation proceeds or requires fixes.
|
|
86
80
|
|
|
87
81
|
## Output Location
|
|
88
82
|
|
|
@@ -49,5 +49,7 @@ prompt_template: |
|
|
|
49
49
|
|
|
50
50
|
## Output
|
|
51
51
|
|
|
52
|
+
Wrap your story spec in `<story-spec>...</story-spec>` tags. This is machine-parsed.
|
|
53
|
+
|
|
52
54
|
Write the story file to the implementation artifacts directory following the project's naming convention.
|
|
53
55
|
Mark the story as `ready-for-dev` in the sprint status.
|