@infinitedusky/indusk-mcp 1.23.2 → 1.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ import { EvalLogWriter } from "./log-writer.js";
13
13
  import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
14
14
  import { buildEvaluatorPrompt } from "./prompt-builder.js";
15
15
  import { V1_RUBRIC } from "./rubric.js";
16
+ import { extractScorecardJson, formatParseError } from "./scorecard-extractor.js";
16
17
  function getEvalLogPath(projectRoot) {
17
18
  return join(projectRoot, ".indusk", "eval", "results.log");
18
19
  }
@@ -117,10 +118,11 @@ export function runEvaluatorBackground(opts) {
117
118
  catch {
118
119
  // stdout might be raw JSON scorecard already
119
120
  }
120
- // Extract JSON from possible markdown code fences
121
- const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
122
- if (jsonMatch?.[1]) {
123
- scorecardText = jsonMatch[1];
121
+ // Tolerantly extract scorecard JSON — handles pure JSON, fenced JSON,
122
+ // and prose-prefixed/wrapped JSON. See scorecard-extractor.ts.
123
+ const extracted = extractScorecardJson(scorecardText);
124
+ if (extracted !== null) {
125
+ scorecardText = extracted;
124
126
  }
125
127
  const scorecard = JSON.parse(scorecardText.trim());
126
128
  if (usage)
@@ -140,7 +142,7 @@ export function runEvaluatorBackground(opts) {
140
142
  mode: opts.mode,
141
143
  changeId: opts.changeId,
142
144
  error: true,
143
- message: err instanceof Error ? err.message : String(err),
145
+ message: stdout ? formatParseError(err, stdout) : (err instanceof Error ? err.message : String(err)),
144
146
  };
145
147
  await logWriter.append(errorEntry);
146
148
  }
@@ -237,9 +239,10 @@ async function runEvaluatorSyncInner(opts, projectGroup) {
237
239
  catch {
238
240
  // raw JSON
239
241
  }
240
- const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
241
- if (jsonMatch?.[1]) {
242
- scorecardText = jsonMatch[1];
242
+ // Tolerantly extract scorecard JSON — see scorecard-extractor.ts.
243
+ const extracted = extractScorecardJson(scorecardText);
244
+ if (extracted !== null) {
245
+ scorecardText = extracted;
243
246
  }
244
247
  const scorecard = JSON.parse(scorecardText.trim());
245
248
  if (syncUsage)
@@ -260,7 +263,7 @@ async function runEvaluatorSyncInner(opts, projectGroup) {
260
263
  mode: opts.mode,
261
264
  changeId: opts.changeId,
262
265
  error: true,
263
- message: err instanceof Error ? err.message : String(err),
266
+ message: stdout ? formatParseError(err, stdout) : (err instanceof Error ? err.message : String(err)),
264
267
  };
265
268
  await logWriter.append(errorEntry);
266
269
  resolve(errorEntry);
@@ -16,6 +16,7 @@ import { EvalLogWriter } from "./log-writer.js";
16
16
  import { initEvalOtel, initEvalOtelLogs, logEvalContent, shutdownEvalOtel, withSpan, } from "./otel.js";
17
17
  import { buildEvaluatorPrompt } from "./prompt-builder.js";
18
18
  import { V1_RUBRIC } from "./rubric.js";
19
+ import { extractScorecardJson, formatParseError } from "./scorecard-extractor.js";
19
20
  function getSessionPath(projectRoot) {
20
21
  return join(projectRoot, ".indusk", "eval", "evaluator-session.json");
21
22
  }
@@ -78,9 +79,13 @@ function parseClaudeOutput(stdout) {
78
79
  catch {
79
80
  // raw output
80
81
  }
81
- const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
82
- if (jsonMatch?.[1]) {
83
- scorecardText = jsonMatch[1];
82
+ // Tolerantly extract the scorecard JSON — handles pure JSON, fenced JSON,
83
+ // and prose-prefixed/wrapped JSON. Falls through to the raw text if no
84
+ // balanced object exists, letting the caller's JSON.parse surface a
85
+ // recognizable error (which the catch enriches with a stdout snippet).
86
+ const extracted = extractScorecardJson(scorecardText);
87
+ if (extracted !== null) {
88
+ scorecardText = extracted;
84
89
  }
85
90
  return { scorecardText, usage, sessionId };
86
91
  }
@@ -135,6 +140,10 @@ export async function runPersistentEval(opts) {
135
140
  const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
136
141
  const session = await withSpan(tracer, "eval.read_session", undefined, () => readSession(opts.projectRoot));
137
142
  rootSpan.setAttribute("resumed", session !== null);
143
+ // Capture raw stdout so the catch can include a snippet in the error
144
+ // message — preserves debuggability when JSON parsing fails on the
145
+ // extracted scorecard text.
146
+ let rawClaudeStdout = "";
138
147
  try {
139
148
  const { args, prompt } = await withSpan(tracer, "eval.build_prompt", { resumed: session !== null }, (span) => {
140
149
  const built = buildArgsAndPrompt();
@@ -212,6 +221,7 @@ Output ONLY the JSON scorecard as before — no commentary.`;
212
221
  });
213
222
  return spawned;
214
223
  });
224
+ rawClaudeStdout = claudeResult.stdout;
215
225
  if (claudeResult.code !== 0) {
216
226
  if (session) {
217
227
  await withSpan(tracer, "eval.clear_stale_session", undefined, () => clearSession(opts.projectRoot));
@@ -277,9 +287,12 @@ Output ONLY the JSON scorecard as before — no commentary.`;
277
287
  catch (err) {
278
288
  const msg = err instanceof Error ? err.message : String(err);
279
289
  const stack = err instanceof Error ? (err.stack ?? "") : "";
290
+ const enrichedMessage = rawClaudeStdout
291
+ ? formatParseError(err, rawClaudeStdout)
292
+ : msg;
280
293
  rootSpan.setAttribute("scorecard.status", "error");
281
294
  rootSpan.setAttribute("error.message", msg.slice(0, 500));
282
- logEvalContent("error", stack || msg, {
295
+ logEvalContent("error", stack || enrichedMessage, {
283
296
  "error.message": msg.slice(0, 500),
284
297
  });
285
298
  const errorEntry = {
@@ -288,7 +301,7 @@ Output ONLY the JSON scorecard as before — no commentary.`;
288
301
  mode: opts.mode,
289
302
  changeId: opts.changeId,
290
303
  error: true,
291
- message: msg,
304
+ message: enrichedMessage,
292
305
  };
293
306
  await logWriter.append(errorEntry);
294
307
  return errorEntry;
@@ -131,5 +131,24 @@ After completing all steps, output ONLY the following JSON object. No markdown w
131
131
  }
132
132
  \`\`\`
133
133
 
134
- This JSON is parsed programmatically. It must be valid. Do not include anything outside the JSON object.`;
134
+ This JSON is parsed programmatically. It must be valid. Do not include anything outside the JSON object.
135
+
136
+ ═══════════════════════════════════════════════════════════════════
137
+ **FINAL REMINDER — OUTPUT FORMAT**
138
+
139
+ Your final response must be a single raw JSON object. Nothing else. No prose before, no prose after, no markdown code fences. The parent process pipes your stdout directly into \`JSON.parse()\` — any character that isn't part of the JSON object will fail the parse and your scorecard will be lost.
140
+
141
+ ❌ DO NOT do this:
142
+ Now I've got everything I need. Here's the scorecard:
143
+ {"version":1,...}
144
+
145
+ ❌ DO NOT do this:
146
+ \`\`\`json
147
+ {"version":1,...}
148
+ \`\`\`
149
+
150
+ ✅ DO this — start your response with \`{\` and end with \`}\`, nothing else:
151
+ {"version":1,"timestamp":"2026-04-19T18:00:00.000Z","mode":"${opts.mode}","changeId":"${opts.changeId}","projectGroup":"${opts.projectGroup}","questions":[...],"summary":"...","graphitiWrites":3,"telemetryPosted":false}
152
+
153
+ The first character of your output must be \`{\`. The last character must be \`}\`. Begin now.`;
135
154
  }
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Scorecard extractor — pulls the scorecard JSON object out of arbitrary
3
+ * Claude-CLI output. Tolerates three output shapes the model produces in
4
+ * practice:
5
+ *
6
+ * 1. Pure JSON: `{...}`
7
+ * 2. Fenced JSON: ` ```json\n{...}\n``` ` or ` ```\n{...}\n``` `
8
+ * 3. Prose-prefixed/wrapped JSON: `Sure, here's the result: {...}` or
9
+ * `Some intro\n```json\n{...}\n```\nDone.`
10
+ *
11
+ * The third case is what bit eval-agent-mcp-access smoke 4 — see
12
+ * `.indusk/planning/eval-scorecard-format-fix/brief.md`.
13
+ */
14
+ /**
15
+ * Extract a balanced JSON object from arbitrary text. Returns the JSON
16
+ * substring (just the `{...}` part) or null if no balanced object exists.
17
+ *
18
+ * Strategy order:
19
+ * 1. If the text trims to a string starting with `{`, try parsing as-is.
20
+ * 2. If a markdown code fence wraps the JSON, extract from inside the fence.
21
+ * 3. Otherwise scan for the first `{` and find its matching `}` by
22
+ * tracking nesting depth and string-literal state (so braces inside
23
+ * string values don't fool the depth counter).
24
+ *
25
+ * The caller is responsible for `JSON.parse`-ing the returned substring.
26
+ * This function only locates the JSON; it doesn't validate it.
27
+ */
28
+ export declare function extractScorecardJson(text: string): string | null;
29
+ /**
30
+ * Build an error message for the case where scorecard parsing failed.
31
+ * Includes the underlying error and a snippet of the raw stdout so post-
32
+ * mortem debugging is possible from `results.log` alone, without re-running.
33
+ */
34
+ export declare function formatParseError(err: unknown, rawStdout: string): string;
@@ -0,0 +1,130 @@
1
+ /**
2
+ * Scorecard extractor — pulls the scorecard JSON object out of arbitrary
3
+ * Claude-CLI output. Tolerates three output shapes the model produces in
4
+ * practice:
5
+ *
6
+ * 1. Pure JSON: `{...}`
7
+ * 2. Fenced JSON: ` ```json\n{...}\n``` ` or ` ```\n{...}\n``` `
8
+ * 3. Prose-prefixed/wrapped JSON: `Sure, here's the result: {...}` or
9
+ * `Some intro\n```json\n{...}\n```\nDone.`
10
+ *
11
+ * The third case is what bit eval-agent-mcp-access smoke 4 — see
12
+ * `.indusk/planning/eval-scorecard-format-fix/brief.md`.
13
+ */
14
+ /**
15
+ * Extract a balanced JSON object from arbitrary text. Returns the JSON
16
+ * substring (just the `{...}` part) or null if no balanced object exists.
17
+ *
18
+ * Strategy order:
19
+ * 1. If the text trims to a string starting with `{`, try parsing as-is.
20
+ * 2. If a markdown code fence wraps the JSON, extract from inside the fence.
21
+ * 3. Otherwise scan for the first `{` and find its matching `}` by
22
+ * tracking nesting depth and string-literal state (so braces inside
23
+ * string values don't fool the depth counter).
24
+ *
25
+ * The caller is responsible for `JSON.parse`-ing the returned substring.
26
+ * This function only locates the JSON; it doesn't validate it.
27
+ */
28
+ export function extractScorecardJson(text) {
29
+ if (!text)
30
+ return null;
31
+ // Strategy 1: pure JSON (cleanest case)
32
+ const trimmed = text.trim();
33
+ if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
34
+ try {
35
+ JSON.parse(trimmed);
36
+ return trimmed;
37
+ }
38
+ catch {
39
+ // Fall through to other strategies — the trim-and-test was a quick check
40
+ }
41
+ }
42
+ // Strategy 2: fenced code block — ```json ... ``` or ``` ... ```
43
+ const fenceMatch = text.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
44
+ if (fenceMatch?.[1]) {
45
+ const inside = fenceMatch[1].trim();
46
+ try {
47
+ JSON.parse(inside);
48
+ return inside;
49
+ }
50
+ catch {
51
+ // Fall through — fence content wasn't valid JSON, try balanced-brace scan
52
+ }
53
+ }
54
+ // Strategy 3: balanced-brace scan
55
+ const balanced = findFirstBalancedJsonObject(text);
56
+ if (balanced) {
57
+ try {
58
+ JSON.parse(balanced);
59
+ return balanced;
60
+ }
61
+ catch {
62
+ return null;
63
+ }
64
+ }
65
+ return null;
66
+ }
67
/**
 * Locate the first balanced `{...}` group in `text`.
 *
 * Starts at the first `{` and walks forward, counting brace nesting while
 * tracking whether the cursor sits inside a double-quoted string literal
 * (and whether the previous character was an escaping backslash), so that
 * braces embedded in string values never affect the nesting count.
 *
 * @param {string} text - Arbitrary text that may contain a JSON object.
 * @returns {string | null} The substring including both outer braces, or
 *   null when the text has no `{` or the first `{` never closes.
 */
function findFirstBalancedJsonObject(text) {
    const open = text.indexOf("{");
    if (open === -1)
        return null;
    let nesting = 0;
    let insideLiteral = false;
    let skipNext = false;
    for (let pos = open; pos < text.length; pos++) {
        if (skipNext) {
            // Previous character was a backslash — this one is escaped,
            // so consume it without interpreting it.
            skipNext = false;
            continue;
        }
        const c = text[pos];
        switch (c) {
            case "\\":
                // A backslash escapes the following character. Valid JSON only
                // has this inside strings, but handle it defensively anywhere.
                skipNext = true;
                break;
            case '"':
                // Enter or leave a string literal.
                insideLiteral = !insideLiteral;
                break;
            case "{":
                if (!insideLiteral)
                    nesting += 1;
                break;
            case "}":
                if (!insideLiteral) {
                    nesting -= 1;
                    if (nesting === 0)
                        return text.slice(open, pos + 1);
                    if (nesting < 0)
                        return null; // stray closing brace — give up
                }
                break;
            default:
                break;
        }
    }
    // Ran out of text before the outermost brace closed.
    return null;
}
121
/**
 * Build an error message for a failed scorecard parse, combining the
 * underlying error with the first 500 characters of raw stdout so the
 * failure can be diagnosed from `results.log` alone, without re-running.
 *
 * @param {unknown} err - The parse error (Error instance or anything thrown).
 * @param {string} rawStdout - The unmodified CLI stdout.
 * @returns {string} Human-readable message with a stdout snippet appended.
 */
export function formatParseError(err, rawStdout) {
    const description = err instanceof Error ? err.message : String(err);
    const preview = rawStdout.slice(0, 500);
    return [description, "", "stdout snippet (first 500 chars):", preview].join("\n");
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@infinitedusky/indusk-mcp",
3
- "version": "1.23.2",
3
+ "version": "1.24.1",
4
4
  "description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
5
5
  "type": "module",
6
6
  "files": [
package/skills/planner.md CHANGED
@@ -33,12 +33,14 @@ The first argument to `/planner` can optionally be a workflow type that controls
33
33
 
34
34
  | Command | Workflow | Documents |
35
35
  |---------|----------|-----------|
36
- | `/planner bugfix auth-expiry` | bugfix | brief + impl only |
37
- | `/planner refactor extract-auth` | refactor | brief + impl (with boundary map) |
36
+ | `/planner bugfix auth-expiry` | bugfix | brief + test-plan + impl |
37
+ | `/planner refactor extract-auth` | refactor | brief + test-plan + impl (with boundary map) |
38
38
  | `/planner spike redis-options` | spike | research only |
39
- | `/planner feature payment-flow` | feature | full lifecycle (default includes test-plan between brief and ADR) |
39
+ | `/planner feature payment-flow` | feature | full lifecycle (research + brief + test-plan + adr + impl + retrospective) |
40
40
  | `/planner payment-flow` | feature | same — no type defaults to feature |
41
41
 
42
+ **Test plan is required for any workflow that ships an impl** (bugfix, refactor, feature). For a bugfix, the first behavioral assertion IS the failing test that proves the bug — you can't write a fix until you've named what should be true once it works. Spike is the only workflow that skips the test plan, because it skips the impl.
43
+
42
44
  Parse the input: if the first word is `bugfix`, `refactor`, `spike`, or `feature`, use that workflow. Otherwise, default to `feature`. The remaining words become the plan name (kebab-cased).
43
45
 
44
46
  Workflow templates are in `templates/workflows/` in the package. They describe which documents to create and provide streamlined templates for each workflow type.
@@ -84,7 +86,7 @@ Workflow templates are in `templates/workflows/` in the package. They describe w
84
86
  ```
85
87
  The working agent does not write Graphiti episodes directly. The eval agent reads unprocessed highlights (via `highlights_unprocessed`), extracts the full Problem + Proposed Direction + Scope context from the transcript, writes a structured episode into the project group, and marks the highlight processed. Skip silently if `mcp__indusk__highlight` is unavailable — highlights are best-effort and must not fail brief acceptance. See [`apps/indusk-docs/src/reference/tools/highlights.md`](../../indusk-docs/src/reference/tools/highlights.md) for the full flow.
86
88
 
87
- 5. **If brief is accepted** and the workflow includes a test plan (feature only), write the test plan. The test plan is the bridge between the brief (what we want and why) and the ADR (architectural decision). It lists the **behavioral assertions** that must be true for the feature to be working, and for each assertion names **how it will be tested** — not the test code itself, but the test mechanism (vitest unit, vitest integration, end-to-end script, manual user test, manual smoke against running stack, etc.).
89
+ 5. **If brief is accepted** and the workflow includes a test plan (bugfix, refactor, or feature — anything that ships an impl), write the test plan. The test plan is the bridge between the brief (what we want and why) and the ADR (architectural decision). It lists the **behavioral assertions** that must be true for the feature to be working, and for each assertion names **how it will be tested** — not the test code itself, but the test mechanism (vitest unit, vitest integration, end-to-end script, manual user test, manual smoke against running stack, etc.).
88
90
 
89
91
  The discipline this produces: when you walk into the ADR with a test plan in hand, the architectural decision is constrained by "what makes all these assertions true?" rather than invented from intuition. The ADR's "We decided for" / "And against" clauses gain teeth because alternatives can be rejected against specific assertions. The impl's Test Trajectory rows derive directly from the test plan's assertions — one trajectory row per assertion, with the `Writable at` / `Passes at` columns added during impl authoring.
90
92
 
package/skills/work.md CHANGED
@@ -280,10 +280,16 @@ Use the **describe-then-do** workflow from the jj skill:
280
280
 
281
281
  1. `jj new` before each logical unit of work
282
282
  2. `jj describe` to declare what you're about to do
283
- 3. Do the work, check off the item(s)
283
+ 3. Do the work, check off the item
284
284
  4. Repeat
285
285
 
286
- Commit at natural boundaries typically per checklist item or per phase gate (otel, verify, context, document). Follow the monorepo rule: if a change spans multiple apps, use `jj split` to silo commits between contexts. See the jj skill for details.
286
+ **Default: one commit per checklist item.** Each impl checklist item is a logical unit of work — give it its own commit. This keeps history granular, makes blame and bisect useful, avoids the end-of-phase `jj split` chore, and lets the eval agent score each unit while context is fresh.
287
+
288
+ Phase-close commits (one big commit for everything in a phase) are an exception, not the default. Use them ONLY when items are trivially related — e.g., a phase that's "rename X → Y in 5 files" where every commit would be the same one-line change. If items represent meaningfully different work (different concerns, different files, different intents), each item deserves its own commit.
289
+
290
+ Cost is not a reason to batch. The eval agent uses session-resume after the first commit, so subsequent commits within a session amortize the catchup cost — per-item commits are cheap.
291
+
292
+ Follow the monorepo rule: if a change spans multiple apps, use `jj split` to silo commits between contexts. See the jj skill for details.
287
293
 
288
294
  ## Cross-Plan Impact
289
295