codeharness 0.37.0 → 0.37.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
|
|
|
2895
2895
|
}
|
|
2896
2896
|
|
|
2897
2897
|
// src/modules/infra/init-project.ts
|
|
2898
|
-
var HARNESS_VERSION = true ? "0.37.0" : "0.0.0-dev";
|
|
2898
|
+
var HARNESS_VERSION = true ? "0.37.1" : "0.0.0-dev";
|
|
2899
2899
|
function failResult(opts, error) {
|
|
2900
2900
|
return {
|
|
2901
2901
|
status: "fail",
|
package/dist/index.js
CHANGED
|
@@ -40,7 +40,7 @@ import {
|
|
|
40
40
|
validateDockerfile,
|
|
41
41
|
warn,
|
|
42
42
|
writeState
|
|
43
|
-
} from "./chunk-
|
|
43
|
+
} from "./chunk-AXFKDGFF.js";
|
|
44
44
|
|
|
45
45
|
// src/index.ts
|
|
46
46
|
import { Command } from "commander";
|
|
@@ -2648,179 +2648,59 @@ var DispatchError = class extends Error {
|
|
|
2648
2648
|
};
|
|
2649
2649
|
|
|
2650
2650
|
// src/lib/verdict-parser.ts
|
|
2651
|
-
import Ajv2 from "ajv";
|
|
2652
|
-
|
|
2653
|
-
// src/schemas/verdict.schema.json
|
|
2654
|
-
var verdict_schema_default = {
|
|
2655
|
-
$schema: "http://json-schema.org/draft-07/schema#",
|
|
2656
|
-
$id: "https://codeharness.dev/schemas/verdict.schema.json",
|
|
2657
|
-
title: "EvaluatorVerdict",
|
|
2658
|
-
description: "Schema for evaluator verdict output (AD5)",
|
|
2659
|
-
type: "object",
|
|
2660
|
-
required: ["verdict", "score", "findings"],
|
|
2661
|
-
additionalProperties: true,
|
|
2662
|
-
properties: {
|
|
2663
|
-
verdict: {
|
|
2664
|
-
type: "string",
|
|
2665
|
-
enum: ["pass", "fail"]
|
|
2666
|
-
},
|
|
2667
|
-
score: {
|
|
2668
|
-
type: "object",
|
|
2669
|
-
required: ["passed", "failed", "unknown", "total"],
|
|
2670
|
-
additionalProperties: true,
|
|
2671
|
-
properties: {
|
|
2672
|
-
passed: {
|
|
2673
|
-
type: "integer",
|
|
2674
|
-
minimum: 0
|
|
2675
|
-
},
|
|
2676
|
-
failed: {
|
|
2677
|
-
type: "integer",
|
|
2678
|
-
minimum: 0
|
|
2679
|
-
},
|
|
2680
|
-
unknown: {
|
|
2681
|
-
type: "integer",
|
|
2682
|
-
minimum: 0
|
|
2683
|
-
},
|
|
2684
|
-
total: {
|
|
2685
|
-
type: "integer",
|
|
2686
|
-
minimum: 0
|
|
2687
|
-
}
|
|
2688
|
-
}
|
|
2689
|
-
},
|
|
2690
|
-
findings: {
|
|
2691
|
-
type: "array",
|
|
2692
|
-
items: {
|
|
2693
|
-
type: "object",
|
|
2694
|
-
required: ["ac", "description", "status", "evidence"],
|
|
2695
|
-
additionalProperties: true,
|
|
2696
|
-
properties: {
|
|
2697
|
-
ac: {
|
|
2698
|
-
type: "integer"
|
|
2699
|
-
},
|
|
2700
|
-
description: {
|
|
2701
|
-
type: "string"
|
|
2702
|
-
},
|
|
2703
|
-
status: {
|
|
2704
|
-
type: "string",
|
|
2705
|
-
enum: ["pass", "fail", "unknown"]
|
|
2706
|
-
},
|
|
2707
|
-
evidence: {
|
|
2708
|
-
type: "object",
|
|
2709
|
-
required: ["commands_run", "output_observed", "reasoning"],
|
|
2710
|
-
additionalProperties: true,
|
|
2711
|
-
properties: {
|
|
2712
|
-
commands_run: {
|
|
2713
|
-
type: "array",
|
|
2714
|
-
items: {
|
|
2715
|
-
type: "string"
|
|
2716
|
-
}
|
|
2717
|
-
},
|
|
2718
|
-
output_observed: {
|
|
2719
|
-
type: "string"
|
|
2720
|
-
},
|
|
2721
|
-
reasoning: {
|
|
2722
|
-
type: "string"
|
|
2723
|
-
}
|
|
2724
|
-
}
|
|
2725
|
-
}
|
|
2726
|
-
}
|
|
2727
|
-
}
|
|
2728
|
-
},
|
|
2729
|
-
evaluator_trace_id: {
|
|
2730
|
-
type: "string"
|
|
2731
|
-
},
|
|
2732
|
-
duration_seconds: {
|
|
2733
|
-
type: "number"
|
|
2734
|
-
}
|
|
2735
|
-
}
|
|
2736
|
-
};
|
|
2737
|
-
|
|
2738
|
-
// src/lib/verdict-parser.ts
|
|
2739
|
-
var VerdictParseError = class _VerdictParseError extends Error {
|
|
2740
|
-
retryable;
|
|
2741
|
-
rawOutput;
|
|
2742
|
-
validationErrors;
|
|
2743
|
-
constructor(message, retryable, rawOutput, validationErrors) {
|
|
2744
|
-
super(message);
|
|
2745
|
-
Object.setPrototypeOf(this, _VerdictParseError.prototype);
|
|
2746
|
-
this.name = "VerdictParseError";
|
|
2747
|
-
this.retryable = retryable;
|
|
2748
|
-
this.rawOutput = rawOutput;
|
|
2749
|
-
this.validationErrors = validationErrors;
|
|
2750
|
-
}
|
|
2751
|
-
};
|
|
2752
|
-
var ajv2 = new Ajv2({ allErrors: true });
|
|
2753
|
-
var validateSchema = ajv2.compile(verdict_schema_default);
|
|
2754
|
-
function validateVerdict(data) {
|
|
2755
|
-
const valid = validateSchema(data);
|
|
2756
|
-
if (valid) {
|
|
2757
|
-
const verdict = JSON.parse(JSON.stringify(data));
|
|
2758
|
-
return { valid: true, verdict };
|
|
2759
|
-
}
|
|
2760
|
-
const errors = (validateSchema.errors ?? []).map((err) => {
|
|
2761
|
-
const path = err.instancePath || "/";
|
|
2762
|
-
return `${path}: ${err.message ?? "unknown error"}`;
|
|
2763
|
-
});
|
|
2764
|
-
return { valid: false, errors };
|
|
2765
|
-
}
|
|
2766
2651
|
function parseVerdict(output) {
|
|
2767
|
-
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
result.errors
|
|
2784
|
-
);
|
|
2652
|
+
const verdictMatch = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
|
|
2653
|
+
const verdictValue = verdictMatch ? verdictMatch[1].toLowerCase() : "fail";
|
|
2654
|
+
const findings = [];
|
|
2655
|
+
const evidenceRegex = /<evidence\s+ac="(\d+)"\s+status="(pass|fail|unknown)">([\s\S]*?)<\/evidence>/gi;
|
|
2656
|
+
let evidenceMatch;
|
|
2657
|
+
while ((evidenceMatch = evidenceRegex.exec(output)) !== null) {
|
|
2658
|
+
findings.push({
|
|
2659
|
+
ac: parseInt(evidenceMatch[1], 10),
|
|
2660
|
+
description: `AC #${evidenceMatch[1]}`,
|
|
2661
|
+
status: evidenceMatch[2].toLowerCase(),
|
|
2662
|
+
evidence: {
|
|
2663
|
+
commands_run: [],
|
|
2664
|
+
output_observed: evidenceMatch[3].trim(),
|
|
2665
|
+
reasoning: evidenceMatch[3].trim()
|
|
2666
|
+
}
|
|
2667
|
+
});
|
|
2785
2668
|
}
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2790
|
-
|
|
2791
|
-
|
|
2792
|
-
|
|
2669
|
+
if (findings.length === 0) {
|
|
2670
|
+
const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
|
|
2671
|
+
if (issuesMatch && verdictValue === "fail") {
|
|
2672
|
+
findings.push({
|
|
2673
|
+
ac: 1,
|
|
2674
|
+
description: "Issues found",
|
|
2675
|
+
status: "fail",
|
|
2676
|
+
evidence: {
|
|
2677
|
+
commands_run: [],
|
|
2678
|
+
output_observed: issuesMatch[1].trim(),
|
|
2679
|
+
reasoning: issuesMatch[1].trim()
|
|
2680
|
+
}
|
|
2681
|
+
});
|
|
2793
2682
|
}
|
|
2794
2683
|
}
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
|
|
2806
|
-
|
|
2807
|
-
|
|
2808
|
-
|
|
2809
|
-
};
|
|
2810
|
-
if (passed === 0) {
|
|
2811
|
-
verdict.verdict = "fail";
|
|
2684
|
+
let passed = 0;
|
|
2685
|
+
let failed = 0;
|
|
2686
|
+
let unknown = 0;
|
|
2687
|
+
for (const f of findings) {
|
|
2688
|
+
if (f.status === "pass") passed++;
|
|
2689
|
+
else if (f.status === "fail") failed++;
|
|
2690
|
+
else unknown++;
|
|
2691
|
+
}
|
|
2692
|
+
const total = findings.length || 1;
|
|
2693
|
+
if (findings.length === 0) {
|
|
2694
|
+
if (verdictValue === "pass") {
|
|
2695
|
+
passed = 1;
|
|
2696
|
+
} else {
|
|
2697
|
+
failed = 1;
|
|
2812
2698
|
}
|
|
2813
2699
|
}
|
|
2814
|
-
return verdict;
|
|
2815
|
-
}
|
|
2816
|
-
function parseVerdictTag(output) {
|
|
2817
|
-
const match = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
|
|
2818
|
-
if (!match) return null;
|
|
2819
|
-
const verdict = match[1].toLowerCase();
|
|
2820
|
-
const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
|
|
2821
2700
|
return {
|
|
2822
|
-
verdict,
|
|
2823
|
-
|
|
2701
|
+
verdict: verdictValue,
|
|
2702
|
+
score: { passed, failed, unknown, total },
|
|
2703
|
+
findings
|
|
2824
2704
|
};
|
|
2825
2705
|
}
|
|
2826
2706
|
function extractTag(output, tag) {
|
|
@@ -6249,15 +6129,6 @@ ${formatted}
|
|
|
6249
6129
|
|
|
6250
6130
|
Focus on fixing the failed criteria above.`;
|
|
6251
6131
|
}
|
|
6252
|
-
function buildAllUnknownVerdict(workItems, reasoning) {
|
|
6253
|
-
const findings = workItems.map((_, index) => ({
|
|
6254
|
-
ac: index + 1,
|
|
6255
|
-
description: `AC #${index + 1}`,
|
|
6256
|
-
status: "unknown",
|
|
6257
|
-
evidence: { commands_run: [], output_observed: "", reasoning }
|
|
6258
|
-
}));
|
|
6259
|
-
return { verdict: "fail", score: { passed: 0, failed: 0, unknown: findings.length, total: findings.length }, findings };
|
|
6260
|
-
}
|
|
6261
6132
|
function getFailedItems(verdict, allItems) {
|
|
6262
6133
|
if (!verdict) return allItems;
|
|
6263
6134
|
if (verdict.verdict === "pass") return [];
|
|
@@ -6358,20 +6229,7 @@ var loopIterationActor = fromPromise2(async ({ input }) => {
|
|
|
6358
6229
|
accumulatedCostUsd += dr.contract?.cost_usd ?? 0;
|
|
6359
6230
|
tasksCompleted++;
|
|
6360
6231
|
if (taskName === lastAgentTaskInLoop) {
|
|
6361
|
-
|
|
6362
|
-
const tagged = parseVerdictTag(dr.output);
|
|
6363
|
-
if (tagged) {
|
|
6364
|
-
verdict = { verdict: tagged.verdict, score: { passed: tagged.verdict === "pass" ? 1 : 0, failed: tagged.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 }, findings: [] };
|
|
6365
|
-
}
|
|
6366
|
-
if (!verdict) {
|
|
6367
|
-
try {
|
|
6368
|
-
verdict = parseVerdict(dr.output);
|
|
6369
|
-
} catch {
|
|
6370
|
-
}
|
|
6371
|
-
}
|
|
6372
|
-
if (!verdict) {
|
|
6373
|
-
verdict = buildAllUnknownVerdict(workItems, "No verdict tag or JSON found in output");
|
|
6374
|
-
}
|
|
6232
|
+
const verdict = parseVerdict(dr.output);
|
|
6375
6233
|
lastVerdict = verdict;
|
|
6376
6234
|
if (verdict) {
|
|
6377
6235
|
const score = { iteration: currentState.iteration, passed: verdict.score.passed, failed: verdict.score.failed, unknown: verdict.score.unknown, total: verdict.score.total, timestamp: (/* @__PURE__ */ new Date()).toISOString() };
|
|
@@ -11125,7 +10983,7 @@ function registerTeardownCommand(program) {
|
|
|
11125
10983
|
} else if (otlpMode === "remote-routed") {
|
|
11126
10984
|
if (!options.keepDocker) {
|
|
11127
10985
|
try {
|
|
11128
|
-
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-
|
|
10986
|
+
const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-OA5CDTQZ.js");
|
|
11129
10987
|
stopCollectorOnly2();
|
|
11130
10988
|
result.docker.stopped = true;
|
|
11131
10989
|
if (!isJson) {
|
|
@@ -11157,7 +11015,7 @@ function registerTeardownCommand(program) {
|
|
|
11157
11015
|
info("Shared stack: kept running (other projects may use it)");
|
|
11158
11016
|
}
|
|
11159
11017
|
} else if (isLegacyStack) {
|
|
11160
|
-
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-
|
|
11018
|
+
const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-OA5CDTQZ.js");
|
|
11161
11019
|
let stackRunning = false;
|
|
11162
11020
|
try {
|
|
11163
11021
|
stackRunning = isStackRunning2(composeFile);
|
|
@@ -14142,7 +14000,7 @@ function registerDriversCommand(program) {
|
|
|
14142
14000
|
}
|
|
14143
14001
|
|
|
14144
14002
|
// src/index.ts
|
|
14145
|
-
var VERSION = true ? "0.37.0" : "0.0.0-dev";
|
|
14003
|
+
var VERSION = true ? "0.37.1" : "0.0.0-dev";
|
|
14146
14004
|
function createProgram() {
|
|
14147
14005
|
const program = new Command();
|
|
14148
14006
|
program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
|
package/package.json
CHANGED
|
@@ -66,50 +66,15 @@ prompt_template: |
|
|
|
66
66
|
|
|
67
67
|
Base your scores on what you observe through the running system, not assumptions.
|
|
68
68
|
|
|
69
|
-
## Output Format
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
{
|
|
82
|
-
"ac": <number>,
|
|
83
|
-
"description": "<AC description>",
|
|
84
|
-
"status": "pass" | "fail" | "unknown",
|
|
85
|
-
"evidence": {
|
|
86
|
-
"commands_run": ["<command>"],
|
|
87
|
-
"output_observed": "<output>",
|
|
88
|
-
"reasoning": "<why>"
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
],
|
|
92
|
-
"quality_scores": {
|
|
93
|
-
"architecture": <1-5>,
|
|
94
|
-
"originality": <1-5>,
|
|
95
|
-
"craft": <1-5>,
|
|
96
|
-
"functionality": <1-5>
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
|
|
102
|
-
|
|
103
|
-
## XML Tags — MANDATORY
|
|
104
|
-
|
|
105
|
-
In addition to the JSON file output, your response MUST include these XML tags (machine-parsed):
|
|
106
|
-
|
|
107
|
-
Include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
|
|
108
|
-
|
|
109
|
-
For each AC, include `<evidence ac="N" status="pass|fail|unknown">command, output, reasoning</evidence>`.
|
|
110
|
-
|
|
111
|
-
Include `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`.
|
|
112
|
-
|
|
113
|
-
## Output Location
|
|
114
|
-
|
|
115
|
-
Write verdict JSON to ./verdict/verdict.json
|
|
69
|
+
## Output Format — XML Tags (machine-parsed)
|
|
70
|
+
|
|
71
|
+
Your response MUST include these XML tags:
|
|
72
|
+
|
|
73
|
+
`<verdict>pass</verdict>` or `<verdict>fail</verdict>`
|
|
74
|
+
Verdict is "pass" only if ALL ACs have status "pass".
|
|
75
|
+
|
|
76
|
+
For each AC:
|
|
77
|
+
`<evidence ac="N" status="pass|fail|unknown">command run, output observed, reasoning</evidence>`
|
|
78
|
+
|
|
79
|
+
Quality assessment:
|
|
80
|
+
`<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`
|