@infinitedusky/indusk-mcp 1.23.2 → 1.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ import { EvalLogWriter } from "./log-writer.js";
13
13
  import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
14
14
  import { buildEvaluatorPrompt } from "./prompt-builder.js";
15
15
  import { V1_RUBRIC } from "./rubric.js";
16
+ import { extractScorecardJson, formatParseError } from "./scorecard-extractor.js";
16
17
  function getEvalLogPath(projectRoot) {
17
18
  return join(projectRoot, ".indusk", "eval", "results.log");
18
19
  }
@@ -117,10 +118,11 @@ export function runEvaluatorBackground(opts) {
117
118
  catch {
118
119
  // stdout might be raw JSON scorecard already
119
120
  }
120
- // Extract JSON from possible markdown code fences
121
- const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
122
- if (jsonMatch?.[1]) {
123
- scorecardText = jsonMatch[1];
121
+ // Tolerantly extract scorecard JSON — handles pure JSON, fenced JSON,
122
+ // and prose-prefixed/wrapped JSON. See scorecard-extractor.ts.
123
+ const extracted = extractScorecardJson(scorecardText);
124
+ if (extracted !== null) {
125
+ scorecardText = extracted;
124
126
  }
125
127
  const scorecard = JSON.parse(scorecardText.trim());
126
128
  if (usage)
@@ -140,7 +142,7 @@ export function runEvaluatorBackground(opts) {
140
142
  mode: opts.mode,
141
143
  changeId: opts.changeId,
142
144
  error: true,
143
- message: err instanceof Error ? err.message : String(err),
145
+ message: stdout ? formatParseError(err, stdout) : (err instanceof Error ? err.message : String(err)),
144
146
  };
145
147
  await logWriter.append(errorEntry);
146
148
  }
@@ -237,9 +239,10 @@ async function runEvaluatorSyncInner(opts, projectGroup) {
237
239
  catch {
238
240
  // raw JSON
239
241
  }
240
- const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
241
- if (jsonMatch?.[1]) {
242
- scorecardText = jsonMatch[1];
242
+ // Tolerantly extract scorecard JSON — see scorecard-extractor.ts.
243
+ const extracted = extractScorecardJson(scorecardText);
244
+ if (extracted !== null) {
245
+ scorecardText = extracted;
243
246
  }
244
247
  const scorecard = JSON.parse(scorecardText.trim());
245
248
  if (syncUsage)
@@ -260,7 +263,7 @@ async function runEvaluatorSyncInner(opts, projectGroup) {
260
263
  mode: opts.mode,
261
264
  changeId: opts.changeId,
262
265
  error: true,
263
- message: err instanceof Error ? err.message : String(err),
266
+ message: stdout ? formatParseError(err, stdout) : (err instanceof Error ? err.message : String(err)),
264
267
  };
265
268
  await logWriter.append(errorEntry);
266
269
  resolve(errorEntry);
@@ -16,6 +16,7 @@ import { EvalLogWriter } from "./log-writer.js";
16
16
  import { initEvalOtel, initEvalOtelLogs, logEvalContent, shutdownEvalOtel, withSpan, } from "./otel.js";
17
17
  import { buildEvaluatorPrompt } from "./prompt-builder.js";
18
18
  import { V1_RUBRIC } from "./rubric.js";
19
+ import { extractScorecardJson, formatParseError } from "./scorecard-extractor.js";
19
20
  function getSessionPath(projectRoot) {
20
21
  return join(projectRoot, ".indusk", "eval", "evaluator-session.json");
21
22
  }
@@ -78,9 +79,13 @@ function parseClaudeOutput(stdout) {
78
79
  catch {
79
80
  // raw output
80
81
  }
81
- const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
82
- if (jsonMatch?.[1]) {
83
- scorecardText = jsonMatch[1];
82
+ // Tolerantly extract the scorecard JSON — handles pure JSON, fenced JSON,
83
+ // and prose-prefixed/wrapped JSON. Falls through to the raw text if no
84
+ // balanced object exists, letting the caller's JSON.parse surface a
85
+ // recognizable error (which the catch enriches with a stdout snippet).
86
+ const extracted = extractScorecardJson(scorecardText);
87
+ if (extracted !== null) {
88
+ scorecardText = extracted;
84
89
  }
85
90
  return { scorecardText, usage, sessionId };
86
91
  }
@@ -135,6 +140,10 @@ export async function runPersistentEval(opts) {
135
140
  const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
136
141
  const session = await withSpan(tracer, "eval.read_session", undefined, () => readSession(opts.projectRoot));
137
142
  rootSpan.setAttribute("resumed", session !== null);
143
+ // Capture raw stdout so the catch can include a snippet in the error
144
+ // message — preserves debuggability when JSON parsing fails on the
145
+ // extracted scorecard text.
146
+ let rawClaudeStdout = "";
138
147
  try {
139
148
  const { args, prompt } = await withSpan(tracer, "eval.build_prompt", { resumed: session !== null }, (span) => {
140
149
  const built = buildArgsAndPrompt();
@@ -212,6 +221,7 @@ Output ONLY the JSON scorecard as before — no commentary.`;
212
221
  });
213
222
  return spawned;
214
223
  });
224
+ rawClaudeStdout = claudeResult.stdout;
215
225
  if (claudeResult.code !== 0) {
216
226
  if (session) {
217
227
  await withSpan(tracer, "eval.clear_stale_session", undefined, () => clearSession(opts.projectRoot));
@@ -277,9 +287,12 @@ Output ONLY the JSON scorecard as before — no commentary.`;
277
287
  catch (err) {
278
288
  const msg = err instanceof Error ? err.message : String(err);
279
289
  const stack = err instanceof Error ? (err.stack ?? "") : "";
290
+ const enrichedMessage = rawClaudeStdout
291
+ ? formatParseError(err, rawClaudeStdout)
292
+ : msg;
280
293
  rootSpan.setAttribute("scorecard.status", "error");
281
294
  rootSpan.setAttribute("error.message", msg.slice(0, 500));
282
- logEvalContent("error", stack || msg, {
295
+ logEvalContent("error", stack || enrichedMessage, {
283
296
  "error.message": msg.slice(0, 500),
284
297
  });
285
298
  const errorEntry = {
@@ -288,7 +301,7 @@ Output ONLY the JSON scorecard as before — no commentary.`;
288
301
  mode: opts.mode,
289
302
  changeId: opts.changeId,
290
303
  error: true,
291
- message: msg,
304
+ message: enrichedMessage,
292
305
  };
293
306
  await logWriter.append(errorEntry);
294
307
  return errorEntry;
@@ -131,5 +131,24 @@ After completing all steps, output ONLY the following JSON object. No markdown w
131
131
  }
132
132
  \`\`\`
133
133
 
134
- This JSON is parsed programmatically. It must be valid. Do not include anything outside the JSON object.`;
134
+ This JSON is parsed programmatically. It must be valid. Do not include anything outside the JSON object.
135
+
136
+ ═══════════════════════════════════════════════════════════════════
137
+ **FINAL REMINDER — OUTPUT FORMAT**
138
+
139
+ Your final response must be a single raw JSON object. Nothing else. No prose before, no prose after, no markdown code fences. The parent process pipes your stdout directly into \`JSON.parse()\` — any character that isn't part of the JSON object will fail the parse and your scorecard will be lost.
140
+
141
+ ❌ DO NOT do this:
142
+ Now I've got everything I need. Here's the scorecard:
143
+ {"version":1,...}
144
+
145
+ ❌ DO NOT do this:
146
+ \`\`\`json
147
+ {"version":1,...}
148
+ \`\`\`
149
+
150
+ ✅ DO this — start your response with \`{\` and end with \`}\`, nothing else:
151
+ {"version":1,"timestamp":"2026-04-19T18:00:00.000Z","mode":"${opts.mode}","changeId":"${opts.changeId}","projectGroup":"${opts.projectGroup}","questions":[...],"summary":"...","graphitiWrites":3,"telemetryPosted":false}
152
+
153
+ The first character of your output must be \`{\`. The last character must be \`}\`. Begin now.`;
135
154
  }
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Scorecard extractor — pulls the scorecard JSON object out of arbitrary
3
+ * Claude-CLI output. Tolerates three output shapes the model produces in
4
+ * practice:
5
+ *
6
+ * 1. Pure JSON: `{...}`
7
+ * 2. Fenced JSON: ` ```json\n{...}\n``` ` or ` ```\n{...}\n``` `
8
+ * 3. Prose-prefixed/wrapped JSON: `Sure, here's the result: {...}` or
9
+ * `Some intro\n```json\n{...}\n```\nDone.`
10
+ *
11
+ * The third case is what bit eval-agent-mcp-access smoke 4 — see
12
+ * `.indusk/planning/eval-scorecard-format-fix/brief.md`.
13
+ */
14
+ /**
15
+ * Extract a balanced JSON object from arbitrary text. Returns the JSON
16
+ * substring (just the `{...}` part) or null if no balanced object exists.
17
+ *
18
+ * Strategy order:
19
+ * 1. If the text trims to a string starting with `{`, try parsing as-is.
20
+ * 2. If a markdown code fence wraps the JSON, extract from inside the fence.
21
+ * 3. Otherwise scan for the first `{` and find its matching `}` by
22
+ * tracking nesting depth and string-literal state (so braces inside
23
+ * string values don't fool the depth counter).
24
+ *
25
+ * The caller is responsible for `JSON.parse`-ing the returned substring.
26
+ * This function only locates the JSON; it doesn't validate it.
27
+ */
28
+ export declare function extractScorecardJson(text: string): string | null;
29
+ /**
30
+ * Build an error message for the case where scorecard parsing failed.
31
+ * Includes the underlying error and a snippet of the raw stdout so post-
32
+ * mortem debugging is possible from `results.log` alone, without re-running.
33
+ */
34
+ export declare function formatParseError(err: unknown, rawStdout: string): string;
@@ -0,0 +1,130 @@
1
+ /**
2
+ * Scorecard extractor — pulls the scorecard JSON object out of arbitrary
3
+ * Claude-CLI output. Tolerates three output shapes the model produces in
4
+ * practice:
5
+ *
6
+ * 1. Pure JSON: `{...}`
7
+ * 2. Fenced JSON: ` ```json\n{...}\n``` ` or ` ```\n{...}\n``` `
8
+ * 3. Prose-prefixed/wrapped JSON: `Sure, here's the result: {...}` or
9
+ * `Some intro\n```json\n{...}\n```\nDone.`
10
+ *
11
+ * The third case is what bit eval-agent-mcp-access smoke 4 — see
12
+ * `.indusk/planning/eval-scorecard-format-fix/brief.md`.
13
+ */
14
+ /**
15
+ * Extract a balanced JSON object from arbitrary text. Returns the JSON
16
+ * substring (just the `{...}` part) or null if no balanced object exists.
17
+ *
18
+ * Strategy order:
19
+ * 1. If the text trims to a string starting with `{`, try parsing as-is.
20
+ * 2. If a markdown code fence wraps the JSON, extract from inside the fence.
21
+ * 3. Otherwise scan for the first `{` and find its matching `}` by
22
+ * tracking nesting depth and string-literal state (so braces inside
23
+ * string values don't fool the depth counter).
24
+ *
25
+ * The caller is responsible for `JSON.parse`-ing the returned substring.
26
+ * This function only locates the JSON; it doesn't validate it.
27
+ */
28
+ export function extractScorecardJson(text) {
29
+ if (!text)
30
+ return null;
31
+ // Strategy 1: pure JSON (cleanest case)
32
+ const trimmed = text.trim();
33
+ if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
34
+ try {
35
+ JSON.parse(trimmed);
36
+ return trimmed;
37
+ }
38
+ catch {
39
+ // Fall through to other strategies — the trim-and-test was a quick check
40
+ }
41
+ }
42
+ // Strategy 2: fenced code block — ```json ... ``` or ``` ... ```
43
+ const fenceMatch = text.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
44
+ if (fenceMatch?.[1]) {
45
+ const inside = fenceMatch[1].trim();
46
+ try {
47
+ JSON.parse(inside);
48
+ return inside;
49
+ }
50
+ catch {
51
+ // Fall through — fence content wasn't valid JSON, try balanced-brace scan
52
+ }
53
+ }
54
+ // Strategy 3: balanced-brace scan
55
+ const balanced = findFirstBalancedJsonObject(text);
56
+ if (balanced) {
57
+ try {
58
+ JSON.parse(balanced);
59
+ return balanced;
60
+ }
61
+ catch {
62
+ return null;
63
+ }
64
+ }
65
+ return null;
66
+ }
67
/**
 * Locate the first balanced `{...}` group in `text`.
 *
 * Starts at the first `{` and walks forward, counting brace nesting while
 * tracking whether the cursor sits inside a double-quoted string literal
 * (and whether the previous character was an escaping backslash), so that
 * braces embedded in string values never affect the nesting count.
 *
 * @param {string} text - Arbitrary text that may contain a JSON object.
 * @returns {string | null} The substring including both outer braces, or
 *   null when the text has no `{` or the first `{` never closes.
 */
function findFirstBalancedJsonObject(text) {
    const open = text.indexOf("{");
    if (open === -1)
        return null;
    let nesting = 0;
    let insideLiteral = false;
    let skipNext = false;
    for (let pos = open; pos < text.length; pos++) {
        if (skipNext) {
            // Previous character was a backslash — this one is escaped,
            // so consume it without interpreting it.
            skipNext = false;
            continue;
        }
        const c = text[pos];
        switch (c) {
            case "\\":
                // A backslash escapes the following character. Valid JSON only
                // has this inside strings, but handle it defensively anywhere.
                skipNext = true;
                break;
            case '"':
                // Enter or leave a string literal.
                insideLiteral = !insideLiteral;
                break;
            case "{":
                if (!insideLiteral)
                    nesting += 1;
                break;
            case "}":
                if (!insideLiteral) {
                    nesting -= 1;
                    if (nesting === 0)
                        return text.slice(open, pos + 1);
                    if (nesting < 0)
                        return null; // stray closing brace — give up
                }
                break;
            default:
                break;
        }
    }
    // Ran out of text before the outermost brace closed.
    return null;
}
121
/**
 * Build an error message for a failed scorecard parse, combining the
 * underlying error with the first 500 characters of raw stdout so the
 * failure can be diagnosed from `results.log` alone, without re-running.
 *
 * @param {unknown} err - The parse error (Error instance or anything thrown).
 * @param {string} rawStdout - The unmodified CLI stdout.
 * @returns {string} Human-readable message with a stdout snippet appended.
 */
export function formatParseError(err, rawStdout) {
    const description = err instanceof Error ? err.message : String(err);
    const preview = rawStdout.slice(0, 500);
    return [description, "", "stdout snippet (first 500 chars):", preview].join("\n");
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@infinitedusky/indusk-mcp",
3
- "version": "1.23.2",
3
+ "version": "1.24.1",
4
4
  "description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
5
5
  "type": "module",
6
6
  "files": [
package/skills/planner.md CHANGED
@@ -33,12 +33,14 @@ The first argument to `/planner` can optionally be a workflow type that controls
33
33
 
34
34
  | Command | Workflow | Documents |
35
35
  |---------|----------|-----------|
36
- | `/planner bugfix auth-expiry` | bugfix | brief + impl only |
37
- | `/planner refactor extract-auth` | refactor | brief + impl (with boundary map) |
36
+ | `/planner bugfix auth-expiry` | bugfix | brief + test-plan + impl |
37
+ | `/planner refactor extract-auth` | refactor | brief + test-plan + impl (with boundary map) |
38
38
  | `/planner spike redis-options` | spike | research only |
39
- | `/planner feature payment-flow` | feature | full lifecycle (default includes test-plan between brief and ADR) |
39
+ | `/planner feature payment-flow` | feature | full lifecycle (research + brief + test-plan + adr + impl + retrospective) |
40
40
  | `/planner payment-flow` | feature | same — no type defaults to feature |
41
41
 
42
+ **Test plan is required for any workflow that ships an impl** (bugfix, refactor, feature). For a bugfix, the first behavioral assertion IS the failing test that proves the bug — you can't write a fix until you've named what should be true once it works. Spike is the only workflow that skips the test plan, because it skips the impl.
43
+
42
44
  Parse the input: if the first word is `bugfix`, `refactor`, `spike`, or `feature`, use that workflow. Otherwise, default to `feature`. The remaining words become the plan name (kebab-cased).
43
45
 
44
46
  Workflow templates are in `templates/workflows/` in the package. They describe which documents to create and provide streamlined templates for each workflow type.
@@ -84,7 +86,7 @@ Workflow templates are in `templates/workflows/` in the package. They describe w
84
86
  ```
85
87
  The working agent does not write Graphiti episodes directly. The eval agent reads unprocessed highlights (via `highlights_unprocessed`), extracts the full Problem + Proposed Direction + Scope context from the transcript, writes a structured episode into the project group, and marks the highlight processed. Skip silently if `mcp__indusk__highlight` is unavailable — highlights are best-effort and must not fail brief acceptance. See [`apps/indusk-docs/src/reference/tools/highlights.md`](../../indusk-docs/src/reference/tools/highlights.md) for the full flow.
86
88
 
87
- 5. **If brief is accepted** and the workflow includes a test plan (feature only), write the test plan. The test plan is the bridge between the brief (what we want and why) and the ADR (architectural decision). It lists the **behavioral assertions** that must be true for the feature to be working, and for each assertion names **how it will be tested** — not the test code itself, but the test mechanism (vitest unit, vitest integration, end-to-end script, manual user test, manual smoke against running stack, etc.).
89
+ 5. **If brief is accepted** and the workflow includes a test plan (bugfix, refactor, or feature — anything that ships an impl), write the test plan. The test plan is the bridge between the brief (what we want and why) and the ADR (architectural decision). It lists the **behavioral assertions** that must be true for the feature to be working, and for each assertion names **how it will be tested** — not the test code itself, but the test mechanism (vitest unit, vitest integration, end-to-end script, manual user test, manual smoke against running stack, etc.).
88
90
 
89
91
  The discipline this produces: when you walk into the ADR with a test plan in hand, the architectural decision is constrained by "what makes all these assertions true?" rather than invented from intuition. The ADR's "We decided for" / "And against" clauses gain teeth because alternatives can be rejected against specific assertions. The impl's Test Trajectory rows derive directly from the test plan's assertions — one trajectory row per assertion, with the `Writable at` / `Passes at` columns added during impl authoring.
90
92
 
package/skills/work.md CHANGED
@@ -280,10 +280,16 @@ Use the **describe-then-do** workflow from the jj skill:
280
280
 
281
281
  1. `jj new` before each logical unit of work
282
282
  2. `jj describe` to declare what you're about to do
283
- 3. Do the work, check off the item(s)
283
+ 3. Do the work, check off the item
284
284
  4. Repeat
285
285
 
286
- Commit at natural boundaries typically per checklist item or per phase gate (otel, verify, context, document). Follow the monorepo rule: if a change spans multiple apps, use `jj split` to silo commits between contexts. See the jj skill for details.
286
+ **Default: one commit per checklist item.** Each impl checklist item is a logical unit of work — give it its own commit. This keeps history granular, makes blame and bisect useful, avoids the end-of-phase `jj split` chore, and lets the eval agent score each unit while context is fresh.
287
+
288
+ Phase-close commits (one big commit for everything in a phase) are an exception, not the default. Use them ONLY when items are trivially related — e.g., a phase that's "rename X → Y in 5 files" where every commit would be the same one-line change. If items represent meaningfully different work (different concerns, different files, different intents), each item deserves its own commit.
289
+
290
+ Cost is not a reason to batch. The eval agent uses session-resume after the first commit, so subsequent commits within a session amortize the catchup cost — per-item commits are cheap.
291
+
292
+ Follow the monorepo rule: if a change spans multiple apps, use `jj split` to silo commits between contexts. See the jj skill for details.
287
293
 
288
294
  ## Cross-Plan Impact
289
295