codeharness 0.37.0 → 0.37.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
2895
2895
  }
2896
2896
 
2897
2897
  // src/modules/infra/init-project.ts
2898
- var HARNESS_VERSION = true ? "0.37.0" : "0.0.0-dev";
2898
+ var HARNESS_VERSION = true ? "0.37.1" : "0.0.0-dev";
2899
2899
  function failResult(opts, error) {
2900
2900
  return {
2901
2901
  status: "fail",
@@ -16,7 +16,7 @@ import {
16
16
  stopCollectorOnly,
17
17
  stopSharedStack,
18
18
  stopStack
19
- } from "./chunk-HCAPQSAZ.js";
19
+ } from "./chunk-AXFKDGFF.js";
20
20
  export {
21
21
  checkRemoteEndpoint,
22
22
  cleanupOrphanedContainers,
package/dist/index.js CHANGED
@@ -40,7 +40,7 @@ import {
40
40
  validateDockerfile,
41
41
  warn,
42
42
  writeState
43
- } from "./chunk-HCAPQSAZ.js";
43
+ } from "./chunk-AXFKDGFF.js";
44
44
 
45
45
  // src/index.ts
46
46
  import { Command } from "commander";
@@ -2648,179 +2648,59 @@ var DispatchError = class extends Error {
2648
2648
  };
2649
2649
 
2650
2650
  // src/lib/verdict-parser.ts
2651
- import Ajv2 from "ajv";
2652
-
2653
- // src/schemas/verdict.schema.json
2654
- var verdict_schema_default = {
2655
- $schema: "http://json-schema.org/draft-07/schema#",
2656
- $id: "https://codeharness.dev/schemas/verdict.schema.json",
2657
- title: "EvaluatorVerdict",
2658
- description: "Schema for evaluator verdict output (AD5)",
2659
- type: "object",
2660
- required: ["verdict", "score", "findings"],
2661
- additionalProperties: true,
2662
- properties: {
2663
- verdict: {
2664
- type: "string",
2665
- enum: ["pass", "fail"]
2666
- },
2667
- score: {
2668
- type: "object",
2669
- required: ["passed", "failed", "unknown", "total"],
2670
- additionalProperties: true,
2671
- properties: {
2672
- passed: {
2673
- type: "integer",
2674
- minimum: 0
2675
- },
2676
- failed: {
2677
- type: "integer",
2678
- minimum: 0
2679
- },
2680
- unknown: {
2681
- type: "integer",
2682
- minimum: 0
2683
- },
2684
- total: {
2685
- type: "integer",
2686
- minimum: 0
2687
- }
2688
- }
2689
- },
2690
- findings: {
2691
- type: "array",
2692
- items: {
2693
- type: "object",
2694
- required: ["ac", "description", "status", "evidence"],
2695
- additionalProperties: true,
2696
- properties: {
2697
- ac: {
2698
- type: "integer"
2699
- },
2700
- description: {
2701
- type: "string"
2702
- },
2703
- status: {
2704
- type: "string",
2705
- enum: ["pass", "fail", "unknown"]
2706
- },
2707
- evidence: {
2708
- type: "object",
2709
- required: ["commands_run", "output_observed", "reasoning"],
2710
- additionalProperties: true,
2711
- properties: {
2712
- commands_run: {
2713
- type: "array",
2714
- items: {
2715
- type: "string"
2716
- }
2717
- },
2718
- output_observed: {
2719
- type: "string"
2720
- },
2721
- reasoning: {
2722
- type: "string"
2723
- }
2724
- }
2725
- }
2726
- }
2727
- }
2728
- },
2729
- evaluator_trace_id: {
2730
- type: "string"
2731
- },
2732
- duration_seconds: {
2733
- type: "number"
2734
- }
2735
- }
2736
- };
2737
-
2738
- // src/lib/verdict-parser.ts
2739
- var VerdictParseError = class _VerdictParseError extends Error {
2740
- retryable;
2741
- rawOutput;
2742
- validationErrors;
2743
- constructor(message, retryable, rawOutput, validationErrors) {
2744
- super(message);
2745
- Object.setPrototypeOf(this, _VerdictParseError.prototype);
2746
- this.name = "VerdictParseError";
2747
- this.retryable = retryable;
2748
- this.rawOutput = rawOutput;
2749
- this.validationErrors = validationErrors;
2750
- }
2751
- };
2752
- var ajv2 = new Ajv2({ allErrors: true });
2753
- var validateSchema = ajv2.compile(verdict_schema_default);
2754
- function validateVerdict(data) {
2755
- const valid = validateSchema(data);
2756
- if (valid) {
2757
- const verdict = JSON.parse(JSON.stringify(data));
2758
- return { valid: true, verdict };
2759
- }
2760
- const errors = (validateSchema.errors ?? []).map((err) => {
2761
- const path = err.instancePath || "/";
2762
- return `${path}: ${err.message ?? "unknown error"}`;
2763
- });
2764
- return { valid: false, errors };
2765
- }
2766
2651
  function parseVerdict(output) {
2767
- let parsed;
2768
- try {
2769
- parsed = JSON.parse(output);
2770
- } catch {
2771
- throw new VerdictParseError(
2772
- "Failed to parse verdict: invalid JSON",
2773
- true,
2774
- output
2775
- );
2776
- }
2777
- const result = validateVerdict(parsed);
2778
- if (!result.valid) {
2779
- throw new VerdictParseError(
2780
- `Failed to parse verdict: schema validation failed`,
2781
- true,
2782
- output,
2783
- result.errors
2784
- );
2652
+ const verdictMatch = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
2653
+ const verdictValue = verdictMatch ? verdictMatch[1].toLowerCase() : "fail";
2654
+ const findings = [];
2655
+ const evidenceRegex = /<evidence\s+ac="(\d+)"\s+status="(pass|fail|unknown)">([\s\S]*?)<\/evidence>/gi;
2656
+ let evidenceMatch;
2657
+ while ((evidenceMatch = evidenceRegex.exec(output)) !== null) {
2658
+ findings.push({
2659
+ ac: parseInt(evidenceMatch[1], 10),
2660
+ description: `AC #${evidenceMatch[1]}`,
2661
+ status: evidenceMatch[2].toLowerCase(),
2662
+ evidence: {
2663
+ commands_run: [],
2664
+ output_observed: evidenceMatch[3].trim(),
2665
+ reasoning: evidenceMatch[3].trim()
2666
+ }
2667
+ });
2785
2668
  }
2786
- const verdict = result.verdict;
2787
- let passDowngraded = false;
2788
- for (const finding of verdict.findings) {
2789
- if (finding.status === "pass" && (!finding.evidence.commands_run || finding.evidence.commands_run.length === 0)) {
2790
- finding.status = "unknown";
2791
- finding.evidence.reasoning += " [Downgraded from PASS: no commands_run evidence provided]";
2792
- passDowngraded = true;
2669
+ if (findings.length === 0) {
2670
+ const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
2671
+ if (issuesMatch && verdictValue === "fail") {
2672
+ findings.push({
2673
+ ac: 1,
2674
+ description: "Issues found",
2675
+ status: "fail",
2676
+ evidence: {
2677
+ commands_run: [],
2678
+ output_observed: issuesMatch[1].trim(),
2679
+ reasoning: issuesMatch[1].trim()
2680
+ }
2681
+ });
2793
2682
  }
2794
2683
  }
2795
- if (passDowngraded) {
2796
- let passed = 0;
2797
- let failed = 0;
2798
- let unknown = 0;
2799
- for (const finding of verdict.findings) {
2800
- if (finding.status === "pass") passed++;
2801
- else if (finding.status === "fail") failed++;
2802
- else unknown++;
2803
- }
2804
- verdict.score = {
2805
- passed,
2806
- failed,
2807
- unknown,
2808
- total: verdict.findings.length
2809
- };
2810
- if (passed === 0) {
2811
- verdict.verdict = "fail";
2684
+ let passed = 0;
2685
+ let failed = 0;
2686
+ let unknown = 0;
2687
+ for (const f of findings) {
2688
+ if (f.status === "pass") passed++;
2689
+ else if (f.status === "fail") failed++;
2690
+ else unknown++;
2691
+ }
2692
+ const total = findings.length || 1;
2693
+ if (findings.length === 0) {
2694
+ if (verdictValue === "pass") {
2695
+ passed = 1;
2696
+ } else {
2697
+ failed = 1;
2812
2698
  }
2813
2699
  }
2814
- return verdict;
2815
- }
2816
- function parseVerdictTag(output) {
2817
- const match = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
2818
- if (!match) return null;
2819
- const verdict = match[1].toLowerCase();
2820
- const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
2821
2700
  return {
2822
- verdict,
2823
- ...issuesMatch ? { issues: issuesMatch[1].trim() } : {}
2701
+ verdict: verdictValue,
2702
+ score: { passed, failed, unknown, total },
2703
+ findings
2824
2704
  };
2825
2705
  }
2826
2706
  function extractTag(output, tag) {
@@ -6249,15 +6129,6 @@ ${formatted}
6249
6129
 
6250
6130
  Focus on fixing the failed criteria above.`;
6251
6131
  }
6252
- function buildAllUnknownVerdict(workItems, reasoning) {
6253
- const findings = workItems.map((_, index) => ({
6254
- ac: index + 1,
6255
- description: `AC #${index + 1}`,
6256
- status: "unknown",
6257
- evidence: { commands_run: [], output_observed: "", reasoning }
6258
- }));
6259
- return { verdict: "fail", score: { passed: 0, failed: 0, unknown: findings.length, total: findings.length }, findings };
6260
- }
6261
6132
  function getFailedItems(verdict, allItems) {
6262
6133
  if (!verdict) return allItems;
6263
6134
  if (verdict.verdict === "pass") return [];
@@ -6358,20 +6229,7 @@ var loopIterationActor = fromPromise2(async ({ input }) => {
6358
6229
  accumulatedCostUsd += dr.contract?.cost_usd ?? 0;
6359
6230
  tasksCompleted++;
6360
6231
  if (taskName === lastAgentTaskInLoop) {
6361
- let verdict = null;
6362
- const tagged = parseVerdictTag(dr.output);
6363
- if (tagged) {
6364
- verdict = { verdict: tagged.verdict, score: { passed: tagged.verdict === "pass" ? 1 : 0, failed: tagged.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 }, findings: [] };
6365
- }
6366
- if (!verdict) {
6367
- try {
6368
- verdict = parseVerdict(dr.output);
6369
- } catch {
6370
- }
6371
- }
6372
- if (!verdict) {
6373
- verdict = buildAllUnknownVerdict(workItems, "No verdict tag or JSON found in output");
6374
- }
6232
+ const verdict = parseVerdict(dr.output);
6375
6233
  lastVerdict = verdict;
6376
6234
  if (verdict) {
6377
6235
  const score = { iteration: currentState.iteration, passed: verdict.score.passed, failed: verdict.score.failed, unknown: verdict.score.unknown, total: verdict.score.total, timestamp: (/* @__PURE__ */ new Date()).toISOString() };
@@ -11125,7 +10983,7 @@ function registerTeardownCommand(program) {
11125
10983
  } else if (otlpMode === "remote-routed") {
11126
10984
  if (!options.keepDocker) {
11127
10985
  try {
11128
- const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-6GFZ4B3V.js");
10986
+ const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-OA5CDTQZ.js");
11129
10987
  stopCollectorOnly2();
11130
10988
  result.docker.stopped = true;
11131
10989
  if (!isJson) {
@@ -11157,7 +11015,7 @@ function registerTeardownCommand(program) {
11157
11015
  info("Shared stack: kept running (other projects may use it)");
11158
11016
  }
11159
11017
  } else if (isLegacyStack) {
11160
- const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-6GFZ4B3V.js");
11018
+ const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-OA5CDTQZ.js");
11161
11019
  let stackRunning = false;
11162
11020
  try {
11163
11021
  stackRunning = isStackRunning2(composeFile);
@@ -14142,7 +14000,7 @@ function registerDriversCommand(program) {
14142
14000
  }
14143
14001
 
14144
14002
  // src/index.ts
14145
- var VERSION = true ? "0.37.0" : "0.0.0-dev";
14003
+ var VERSION = true ? "0.37.1" : "0.0.0-dev";
14146
14004
  function createProgram() {
14147
14005
  const program = new Command();
14148
14006
  program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codeharness",
3
- "version": "0.37.0",
3
+ "version": "0.37.1",
4
4
  "type": "module",
5
5
  "description": "CLI for codeharness — makes autonomous coding agents produce software that actually works",
6
6
  "bin": {
@@ -66,50 +66,15 @@ prompt_template: |
66
66
 
67
67
  Base your scores on what you observe through the running system, not assumptions.
68
68
 
69
- ## Output Format
70
-
71
- ```json
72
- {
73
- "verdict": "pass" | "fail",
74
- "score": {
75
- "passed": <number>,
76
- "failed": <number>,
77
- "unknown": <number>,
78
- "total": <number>
79
- },
80
- "findings": [
81
- {
82
- "ac": <number>,
83
- "description": "<AC description>",
84
- "status": "pass" | "fail" | "unknown",
85
- "evidence": {
86
- "commands_run": ["<command>"],
87
- "output_observed": "<output>",
88
- "reasoning": "<why>"
89
- }
90
- }
91
- ],
92
- "quality_scores": {
93
- "architecture": <1-5>,
94
- "originality": <1-5>,
95
- "craft": <1-5>,
96
- "functionality": <1-5>
97
- }
98
- }
99
- ```
100
-
101
- Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
102
-
103
- ## XML Tags — MANDATORY
104
-
105
- In addition to the JSON file output, your response MUST include these XML tags (machine-parsed):
106
-
107
- Include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
108
-
109
- For each AC, include `<evidence ac="N" status="pass|fail|unknown">command, output, reasoning</evidence>`.
110
-
111
- Include `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`.
112
-
113
- ## Output Location
114
-
115
- Write verdict JSON to ./verdict/verdict.json
69
+ ## Output Format — XML Tags (machine-parsed)
70
+
71
+ Your response MUST include these XML tags:
72
+
73
+ `<verdict>pass</verdict>` or `<verdict>fail</verdict>`
74
+ Verdict is "pass" only if ALL ACs have status "pass".
75
+
76
+ For each AC:
77
+ `<evidence ac="N" status="pass|fail|unknown">command run, output observed, reasoning</evidence>`
78
+
79
+ Quality assessment:
80
+ `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`