npm - codeharness - Versions diffs - 0.34.1 → 0.35.1 - Mend

codeharness 0.34.1 → 0.35.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (12)

package/dist/{chunk-G2RR744K.js → chunk-AIXEFZIV.js} +1 -1
package/dist/{docker-MEPWHG4P.js → docker-GYFYNCLQ.js} +1 -1
package/dist/index.js +34 -35
package/package.json +1 -1
package/templates/agents/checker.yaml +2 -0
package/templates/agents/deployer.yaml +17 -19
package/templates/agents/documenter.yaml +4 -0
package/templates/agents/evaluator.yaml +15 -1
package/templates/agents/negotiator.yaml +29 -18
package/templates/agents/reviewer.yaml +3 -9
package/templates/agents/story-creator.yaml +2 -0
package/templates/workflows/default.yaml +0 -9

package/dist/{chunk-G2RR744K.js → chunk-AIXEFZIV.js} RENAMED Viewed

@@ -2895,7 +2895,7 @@ function generateDockerfileTemplate(projectDir, stackOrDetections) {
 }
 // src/modules/infra/init-project.ts
-var HARNESS_VERSION = true ? "0.34.1" : "0.0.0-dev";
+var HARNESS_VERSION = true ? "0.35.1" : "0.0.0-dev";
 function failResult(opts, error) {
   return {
     status: "fail",

package/dist/{docker-MEPWHG4P.js → docker-GYFYNCLQ.js} RENAMED Viewed

@@ -16,7 +16,7 @@ import {
   stopCollectorOnly,
   stopSharedStack,
   stopStack
-} from "./chunk-G2RR744K.js";
+} from "./chunk-AIXEFZIV.js";
 export {
   checkRemoteEndpoint,
   cleanupOrphanedContainers,

package/dist/index.js CHANGED Viewed

@@ -40,7 +40,7 @@ import {
   validateDockerfile,
   warn,
   writeState
-} from "./chunk-G2RR744K.js";
+} from "./chunk-AIXEFZIV.js";
 // src/index.ts
 import { Command } from "commander";
@@ -3067,22 +3067,20 @@ function parseVerdict(output) {
   }
   return verdict;
 }
-function parseSimpleVerdict(output) {
-  const jsonPattern = /\{[^{}]*"verdict"\s*:\s*"(pass|fail)"[^{}]*\}/g;
-  let lastMatch = null;
-  let m;
-  while ((m = jsonPattern.exec(output)) !== null) {
-    lastMatch = m;
-  }
-  if (!lastMatch) return null;
-  try {
-    const parsed = JSON.parse(lastMatch[0]);
-    if (parsed.verdict === "pass" || parsed.verdict === "fail") {
-      return { verdict: parsed.verdict };
-    }
-  } catch {
-  }
-  return null;
+function parseVerdictTag(output) {
+  const match = /<verdict>(pass|fail)<\/verdict>/i.exec(output);
+  if (!match) return null;
+  const verdict = match[1].toLowerCase();
+  const issuesMatch = /<issues>([\s\S]*?)<\/issues>/i.exec(output);
+  return {
+    verdict,
+    ...issuesMatch ? { issues: issuesMatch[1].trim() } : {}
+  };
+}
+function extractTag(output, tag) {
+  const pattern = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`, "i");
+  const match = pattern.exec(output);
+  return match ? match[1].trim() : null;
 }
 // src/lib/circuit-breaker.ts
@@ -3388,14 +3386,13 @@ async function dispatchTaskWithResult(task, taskName, storyKey, definition, stat
   }
   const isEpicSentinel = storyKey.startsWith("__epic_") || storyKey === PER_RUN_SENTINEL;
   const TASK_PROMPTS = {
-    "create-story": (key) => `Create or revise the story spec for ${key}. Read the epic definitions and architecture docs. If previous feedback is provided (from AC negotiation or review), revise the story to address that feedback. Write a complete story file with acceptance criteria, tasks, and dev notes.`,
-    "negotiate-acs": (key) => `Review the ACs in story ${key} for testability. Can each AC be verified by a blind QA agent with only Docker access and user documentation? Your response MUST end with exactly one JSON line: {"verdict": "pass"} or {"verdict": "fail", "issues": ["..."]}`,
+    "create-story": (key) => `Create the story spec for ${key}. Read the epic definitions and architecture docs. Write a complete story file with acceptance criteria, tasks, and dev notes. CRITICAL: Every AC must be testable by a blind QA agent using ONLY a user guide + browser/API/CLI access. No AC should reference source code, internal data structures, or implementation details like O(1) complexity. Each AC must describe observable behavior that can be verified through UI interaction (agent-browser), API calls (curl), CLI commands (docker exec), or log inspection (docker logs). Wrap output in <story-spec>...</story-spec> tags.`,
     "implement": (key) => `Implement story ${key}`,
-    "check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Report pass/fail results.`,
-    "review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Output a verdict JSON at the end.`,
-    "document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code \u2014 describe features, UI pages, API endpoints, CLI commands, and expected behavior.`,
-    "deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Output a deploy report JSON with container names, URLs, credentials, and health status.`,
-    "verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps from the documentation, connect using deploy info, run commands, and observe output. Also score subjective quality on 4 dimensions.`,
+    "check": (key) => `Run automated checks for story ${key}. Execute the project's test suite, linter, and coverage tool. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response.`,
+    "review": (key) => `Review the implementation of story ${key}. Check for correctness, security issues, architecture violations, and AC coverage. Include <verdict>pass</verdict> or <verdict>fail</verdict> in your response. If fail, include <issues>...</issues>.`,
+    "document": (key) => `Write user documentation for story ${key}. Describe what was built and how to use it from a user's perspective. No source code. Wrap documentation in <user-docs>...</user-docs> tags.`,
+    "deploy": () => `Provision the Docker environment for this project. Check for docker-compose.yml, start containers, verify health. Wrap report in <deploy-report>...</deploy-report> tags with status, containers, URLs, credentials, health.`,
+    "verify": () => `Verify the epic's stories using the user docs and deploy info in ./story-files/. For each AC, derive verification steps, run commands, observe output. Include <verdict>pass</verdict> or <verdict>fail</verdict>. Include <evidence ac="N" status="pass|fail|unknown">...</evidence> per AC. Include <quality-scores>...</quality-scores>.`,
     "retro": () => `Run a retrospective for this epic. Analyze what worked, what failed, patterns, and action items for next epic.`
   };
   let basePrompt;
@@ -3785,11 +3782,11 @@ async function executeLoopBlock(loopBlock, state, config, workItems, initialCont
             }
           }
           if (!verdict) {
-            const simple = parseSimpleVerdict(dispatchResult.output);
-            if (simple) {
+            const tagged = parseVerdictTag(dispatchResult.output);
+            if (tagged) {
               verdict = {
-                verdict: simple.verdict,
-                score: { passed: simple.verdict === "pass" ? 1 : 0, failed: simple.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
+                verdict: tagged.verdict,
+                score: { passed: tagged.verdict === "pass" ? 1 : 0, failed: tagged.verdict === "fail" ? 1 : 0, unknown: 0, total: 1 },
                 findings: []
               };
             }
@@ -4105,9 +4102,10 @@ async function executeWorkflow(config) {
             const contractPath = join12(projectDir, ".codeharness", "contracts", `document-${item.key}.json`);
             if (existsSync15(contractPath)) {
               const contractData = JSON.parse(readFileSync13(contractPath, "utf-8"));
-              if (contractData.output) {
+              const docs = contractData.output ? extractTag(contractData.output, "user-docs") ?? contractData.output : null;
+              if (docs) {
                 const guidePath = join12(guidesDir, `${item.key}-guide.md`);
-                writeFileSync8(guidePath, contractData.output, "utf-8");
+                writeFileSync8(guidePath, docs, "utf-8");
                 guideFiles.push(guidePath);
               }
             }
@@ -4115,9 +4113,10 @@ async function executeWorkflow(config) {
           const deployContractPath = join12(projectDir, ".codeharness", "contracts", `deploy-${epicSentinel}.json`);
           if (existsSync15(deployContractPath)) {
             const deployData = JSON.parse(readFileSync13(deployContractPath, "utf-8"));
-            if (deployData.output) {
+            const report = deployData.output ? extractTag(deployData.output, "deploy-report") ?? deployData.output : null;
+            if (report) {
               const deployPath = join12(guidesDir, "deploy-info.md");
-              writeFileSync8(deployPath, deployData.output, "utf-8");
+              writeFileSync8(deployPath, report, "utf-8");
               guideFiles.push(deployPath);
             }
           }
@@ -11189,7 +11188,7 @@ function registerTeardownCommand(program) {
     } else if (otlpMode === "remote-routed") {
       if (!options.keepDocker) {
         try {
-          const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-MEPWHG4P.js");
+          const { stopCollectorOnly: stopCollectorOnly2 } = await import("./docker-GYFYNCLQ.js");
           stopCollectorOnly2();
           result.docker.stopped = true;
           if (!isJson) {
@@ -11221,7 +11220,7 @@ function registerTeardownCommand(program) {
         info("Shared stack: kept running (other projects may use it)");
       }
     } else if (isLegacyStack) {
-      const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-MEPWHG4P.js");
+      const { isStackRunning: isStackRunning2, stopStack } = await import("./docker-GYFYNCLQ.js");
       let stackRunning = false;
       try {
         stackRunning = isStackRunning2(composeFile);
@@ -14208,7 +14207,7 @@ function registerDriversCommand(program) {
 }
 // src/index.ts
-var VERSION = true ? "0.34.1" : "0.0.0-dev";
+var VERSION = true ? "0.35.1" : "0.0.0-dev";
 function createProgram() {
   const program = new Command();
   program.name("codeharness").description("Makes autonomous coding agents produce software that actually works").version(VERSION).option("--json", "Output in machine-readable JSON format");

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "codeharness",
-  "version": "0.34.1",
+  "version": "0.35.1",
   "type": "module",
   "description": "CLI for codeharness — makes autonomous coding agents produce software that actually works",
   "bin": {

package/templates/agents/checker.yaml CHANGED Viewed

@@ -60,6 +60,8 @@ prompt_template: |
   Verdict is "pass" only if ALL checks pass.
+  Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
   ## Output Location
   Write results to ./verdict/check.json

package/templates/agents/deployer.yaml CHANGED Viewed

@@ -22,7 +22,7 @@ prompt_template: |
   1. Check for Docker configuration:
      - Look for `docker-compose.yml`, `docker-compose.yaml`, `compose.yml`, or `Dockerfile`
-     - If NONE found, output: `{"status": "no-docker", "message": "No Docker configuration found in project"}`
+     - If NONE found, output a deploy-report with status no-docker (see Output section below)
      - STOP here if no Docker config exists
   2. Check for already-running containers:
@@ -34,24 +34,22 @@ prompt_template: |
      - Wait for containers to start (max 60 seconds)
      - Check health endpoints if defined
-  4. Output deploy report as JSON:
-     ```json
-     {
-       "status": "running",
-       "containers": [
-         {"name": "container-name", "image": "image:tag", "status": "healthy", "ports": ["8000:8000"]}
-       ],
-       "urls": {
-         "api": "http://localhost:8000",
-         "web": "http://localhost:3000"
-       },
-       "credentials": {
-         "db_url": "postgresql://...",
-         "api_key": "..."
-       },
-       "health": "healthy"
-     }
-     ```
+  4. Wrap your deploy report in `<deploy-report>...</deploy-report>` tags. Include: status, container names, URLs, ports, credentials, health.
+     Example:
+     <deploy-report>
+     status: running
+     containers: container-name (image:tag, healthy, 8000:8000)
+     urls: api=http://localhost:8000, web=http://localhost:3000
+     credentials: db_url=postgresql://..., api_key=...
+     health: healthy
+     </deploy-report>
+     If no Docker config exists, output:
+     <deploy-report>
+     status: no-docker
+     message: No Docker configuration found in project
+     </deploy-report>
   ## Important

package/templates/agents/documenter.yaml CHANGED Viewed

@@ -65,3 +65,7 @@ prompt_template: |
   - Describe observable behavior, not internal logic
   - If a feature has no external interface (internal-only), describe how it affects
     other features that DO have external interfaces
+  ## Output — MANDATORY FORMAT
+  Wrap your entire documentation in `<user-docs>...</user-docs>` tags. This is machine-parsed.

package/templates/agents/evaluator.yaml CHANGED Viewed

@@ -40,7 +40,11 @@ prompt_template: |
   1. Read the user documentation to understand the feature
   2. Use the deploy info to connect to the running system
   3. Derive your OWN verification steps from the documentation
-  4. Run commands: `docker exec`, `curl`, `docker logs`, or other tools
+  4. Use the appropriate verification method:
+     - **API**: `curl` or HTTP requests to endpoints
+     - **UI**: `agent-browser` to navigate pages, click elements, observe content
+     - **CLI**: `docker exec` to run commands inside containers
+     - **Logs**: `docker logs` to check for specific entries
   5. Observe output and compare to expected behavior from the docs
   If Docker is not available or containers are not running, report ALL ACs as UNKNOWN.
@@ -96,6 +100,16 @@ prompt_template: |
   Verdict is "pass" only if ALL findings have status "pass". Quality scores are informational.
+  ## XML Tags — MANDATORY
+  In addition to the JSON file output, your response MUST include these XML tags (machine-parsed):
+  Include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
+  For each AC, include `<evidence ac="N" status="pass|fail|unknown">command, output, reasoning</evidence>`.
+  Include `<quality-scores>architecture: N, originality: N, craft: N, functionality: N</quality-scores>`.
   ## Output Location
   Write verdict JSON to ./verdict/verdict.json

package/templates/agents/negotiator.yaml CHANGED Viewed

@@ -5,38 +5,49 @@ role:
 persona:
   identity: |
     QA architect who reviews ACs before any code is written. Ensures every AC
-    can be verified by a blind evaluator with only Docker access and user docs.
-    Rejects untestable, vague, or implementation-dependent ACs.
+    can be verified by a blind evaluator with only Docker access, user docs,
+    and agent-browser for UI testing.
   communication_style: "Direct, specific. Points to exact ACs that fail testability and suggests concrete rewrites."
   principles:
     - Every AC must be verifiable without reading source code
-    - Verification must be possible through Docker commands, API calls, or UI interaction
-    - Vague ACs like "system handles errors gracefully" must be rewritten with specific observable behavior
+    - Verification must be possible through API calls, UI interaction, or CLI commands
+    - Vague ACs must be rewritten with specific observable behavior
     - If an AC requires reading source to verify, it fails testability
 prompt_template: |
   ## Role
   You are reviewing acceptance criteria for testability BEFORE implementation begins.
-  Your job: ensure every AC can be verified by a blind QA agent who has only Docker access and user documentation.
+  Your job: ensure every AC can be verified by a blind QA agent.
+  ## Pass Criteria — an AC is testable if it can be verified through:
+  - **API**: curl/HTTP request to an endpoint, checking response body/status
+  - **UI**: agent-browser navigation, clicking, observing page content
+  - **CLI**: docker exec running a command, checking output
+  - **Logs**: docker logs checking for specific log entries
+  - **Database**: querying DB state through an exposed API or CLI tool
+  ## Fail Criteria — an AC is NOT testable if it requires:
+  - Reading source code files
+  - Inspecting internal data structures
+  - Understanding implementation details (e.g., "uses O(1) lookup" — untestable without benchmarks)
+  - Checking code patterns or conventions (that's the reviewer's job, not the evaluator's)
   ## Process
   1. Read the story spec (provided via previous task context)
-  2. For each AC, assess: Can a QA agent verify this using ONLY:
-     - Docker commands (docker exec, docker logs)
-     - HTTP requests (curl, API calls)
-     - UI interaction (browser, pages)
-     - Observable output (logs, responses, behavior)
-  3. If an AC requires reading source code, inspecting file contents, or understanding implementation details to verify — it FAILS testability
+  2. For each AC, determine: which verification method (API/UI/CLI/Logs/DB) would prove this?
+  3. If you can identify a concrete method → PASS
+  4. If no external method exists → FAIL with rewrite suggestion
   ## Output — MANDATORY FORMAT
-  Your response MUST end with EXACTLY one of these two JSON lines (no code block, no markdown, just raw JSON as the LAST line of your output):
-  If ALL ACs are testable:
-  {"verdict": "pass"}
+  Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
-  If ANY AC fails testability:
-  {"verdict": "fail", "issues": ["AC 1: reason and suggested rewrite", "AC 3: reason and suggested rewrite"]}
+  If fail, also include:
+  <issues>
+  AC N: [why untestable] → Suggested rewrite: [concrete rewrite with observable behavior]
+  </issues>
-  You may include analysis BEFORE the verdict line, but the LAST line of your response MUST be the raw JSON verdict. This is machine-parsed — the loop cannot exit without it.
+  You may include analysis before the tags, but the XML tags are machine-parsed — the loop cannot exit without them.

package/templates/agents/reviewer.yaml CHANGED Viewed

@@ -73,16 +73,10 @@ prompt_template: |
   ## Verdict
-  At the END of your review output, include a verdict JSON on its own line:
-  ```json
-  {"verdict": "pass"}
-  ```
-  or if there are blocking issues:
-  ```json
-  {"verdict": "fail", "issues": ["blocking issue 1", "blocking issue 2"]}
-  ```
+  Your response MUST include `<verdict>pass</verdict>` or `<verdict>fail</verdict>`.
+  If fail, also include `<issues>blocking issue descriptions</issues>`.
-  This verdict determines whether the implementation proceeds or requires fixes.
+  These XML tags are machine-parsed and determine whether the implementation proceeds or requires fixes.
   ## Output Location

package/templates/agents/story-creator.yaml CHANGED Viewed

@@ -49,5 +49,7 @@ prompt_template: |
   ## Output
+  Wrap your story spec in `<story-spec>...</story-spec>` tags. This is machine-parsed.
   Write the story file to the implementation artifacts directory following the project's naming convention.
   Mark the story as `ready-for-dev` in the sprint status.

package/templates/workflows/default.yaml CHANGED Viewed

@@ -4,11 +4,6 @@ tasks:
     session: fresh
     source_access: true
     model: claude-opus-4-6
-  negotiate-acs:
-    agent: negotiator
-    session: fresh
-    source_access: true
-    model: claude-sonnet-4-6
   implement:
     agent: dev
     session: fresh
@@ -52,10 +47,6 @@ tasks:
 story_flow:
   - create-story
-  - negotiate-acs
-  - loop:
-      - create-story
-      - negotiate-acs
   - implement
   - check
   - review