@infinitedusky/indusk-mcp 1.12.1 → 1.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/cli.js CHANGED
@@ -259,6 +259,28 @@ eval_
259
259
  const { evalSummary } = await import("./commands/eval.js");
260
260
  await evalSummary(process.cwd(), opts);
261
261
  });
262
// `eval findings`: list findings (unresolved only unless --all is given).
eval_
    .command("findings")
    .description("List unresolved eval findings")
    .option("--all", "Show all findings including fixed/ignored")
    .action(async (opts) => {
        const evalCommands = await import("./commands/eval.js");
        await evalCommands.evalFindings(process.cwd(), opts);
    });
// `eval fix <key>` and `eval ignore <key>` differ only in the state they record,
// so both are registered from one table (same order as before: fix, then ignore).
for (const [verb, state] of [
    ["fix", "fixed"],
    ["ignore", "ignored"],
]) {
    eval_
        .command(`${verb} <key>`)
        .description(`Mark an eval finding as ${state}`)
        .action(async (key) => {
            const evalCommands = await import("./commands/eval.js");
            await evalCommands.evalMark(process.cwd(), key, state);
        });
}
262
284
  eval_
263
285
  .command("baseline")
264
286
  .description("Run baseline evaluation with vanilla agent")
@@ -9,6 +9,10 @@ export declare function evalSummary(projectRoot: string, opts: {
9
9
  since?: string;
10
10
  json?: boolean;
11
11
  }): Promise<void>;
12
/**
 * Prints the project's eval findings to the console.
 *
 * @param projectRoot - Project directory whose findings are listed.
 * @param opts - `all`: when true, every finding is listed regardless of state;
 *   otherwise only unresolved findings are listed.
 */
export declare function evalFindings(projectRoot: string, opts: {
    all?: boolean;
}): Promise<void>;
15
/**
 * Records a new resolution state for the finding identified by `key`.
 * Prints a confirmation on success; when no finding has that key it prints an
 * error and terminates the process with exit code 1.
 *
 * @param projectRoot - Project directory whose findings are updated.
 * @param key - Key of the finding to mark.
 * @param state - Resolution state to record.
 */
export declare function evalMark(projectRoot: string, key: string, state: "fixed" | "ignored"): Promise<void>;
12
16
  export declare function evalBaseline(projectRoot: string, opts: {
13
17
  task: string;
14
18
  keep?: boolean;
@@ -6,6 +6,7 @@
6
6
  */
7
7
  import { existsSync } from "node:fs";
8
8
  import { join } from "node:path";
9
+ import { getAllFindings, getUnresolvedFindings, markFinding } from "../../lib/eval/findings.js";
9
10
  import { readAllEntries } from "../../lib/eval/log-reader.js";
10
11
  import { isScorecard } from "../../lib/eval/types.js";
11
12
  function getEvalLogPath(projectRoot) {
@@ -106,6 +107,30 @@ function computeSummary(scorecards) {
106
107
  trend,
107
108
  };
108
109
  }
110
+ export async function evalFindings(projectRoot, opts) {
111
+ const findings = opts.all ? getAllFindings(projectRoot) : getUnresolvedFindings(projectRoot);
112
+ if (findings.length === 0) {
113
+ console.info(opts.all ? "No eval findings." : "No unresolved findings.");
114
+ return;
115
+ }
116
+ console.info(`\n${opts.all ? "All" : "Unresolved"} eval findings (${findings.length}):\n`);
117
+ for (const f of findings) {
118
+ const icon = f.state === "fixed" ? "✓" : f.state === "ignored" ? "–" : "●";
119
+ console.info(` ${icon} [${f.severity}] ${f.questionId}: ${f.finding}`);
120
+ console.info(` key: ${f.key} change: ${f.changeId.slice(0, 8)} state: ${f.state}`);
121
+ }
122
+ console.info("");
123
+ }
124
+ export async function evalMark(projectRoot, key, state) {
125
+ const success = markFinding(projectRoot, key, state);
126
+ if (success) {
127
+ console.info(`Marked ${key} as ${state}`);
128
+ }
129
+ else {
130
+ console.error(`Finding not found: ${key}`);
131
+ process.exit(1);
132
+ }
133
+ }
109
134
  function computePassRates(cards) {
110
135
  const counts = {};
111
136
  for (const card of cards) {
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Tracks eval finding resolution state.
3
+ *
4
+ * Findings persist as "unresolved" until explicitly fixed or ignored.
5
+ * The eval hook surfaces unresolved findings on every jj describe.
6
+ */
7
+ import type { EvalScorecard } from "./types.js";
8
/** Resolution state of a finding; new findings start as "unresolved". */
export type FindingState = "unresolved" | "fixed" | "ignored";
/** One stored finding, created from a scorecard question that was not answered "yes". */
export interface FindingEntry {
    /** Current resolution state. */
    state: FindingState;
    /** Id of the scorecard question that produced the finding. */
    questionId: string;
    /** Severity copied from the scorecard question. */
    severity: string;
    /** Finding text copied from the scorecard question. */
    finding: string;
    /** Change id of the scorecard the finding came from. */
    changeId: string;
}
16
/**
 * Returns the stored findings whose state is "unresolved", each with its
 * storage key merged in as `key`.
 */
export declare function getUnresolvedFindings(projectRoot: string): Array<{
    key: string;
} & FindingEntry>;
/** Returns every stored finding, whatever its state, each with its storage key merged in as `key`. */
export declare function getAllFindings(projectRoot: string): Array<{
    key: string;
} & FindingEntry>;
/**
 * Sets the state of the finding stored under `key` and saves the findings file.
 *
 * @returns `true` when the finding existed and was updated, `false` when no finding has that key.
 */
export declare function markFinding(projectRoot: string, key: string, state: FindingState): boolean;
/**
 * Adds an "unresolved" finding for each scorecard question not answered "yes",
 * keyed as `<changeId>:<questionId>`; keys that already exist are left untouched.
 *
 * @returns The number of findings added.
 */
export declare function ingestScorecard(projectRoot: string, scorecard: EvalScorecard): number;
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Tracks eval finding resolution state.
3
+ *
4
+ * Findings persist as "unresolved" until explicitly fixed or ignored.
5
+ * The eval hook surfaces unresolved findings on every jj describe.
6
+ */
7
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
8
+ import { dirname, join } from "node:path";
9
/** Location of the findings store for a project: `<projectRoot>/.indusk/eval/findings.json`. */
function getFindingsPath(projectRoot) {
    const segments = [".indusk", "eval", "findings.json"];
    return join(projectRoot, ...segments);
}
12
+ function readFindings(projectRoot) {
13
+ const path = getFindingsPath(projectRoot);
14
+ if (!existsSync(path))
15
+ return {};
16
+ try {
17
+ return JSON.parse(readFileSync(path, "utf8"));
18
+ }
19
+ catch {
20
+ return {};
21
+ }
22
+ }
23
+ function writeFindings(projectRoot, findings) {
24
+ const path = getFindingsPath(projectRoot);
25
+ mkdirSync(dirname(path), { recursive: true });
26
+ writeFileSync(path, `${JSON.stringify(findings, null, 2)}\n`);
27
+ }
28
+ export function getUnresolvedFindings(projectRoot) {
29
+ const findings = readFindings(projectRoot);
30
+ return Object.entries(findings)
31
+ .filter(([, entry]) => entry.state === "unresolved")
32
+ .map(([key, entry]) => ({ key, ...entry }));
33
+ }
34
+ export function getAllFindings(projectRoot) {
35
+ const findings = readFindings(projectRoot);
36
+ return Object.entries(findings).map(([key, entry]) => ({ key, ...entry }));
37
+ }
38
+ export function markFinding(projectRoot, key, state) {
39
+ const findings = readFindings(projectRoot);
40
+ if (!findings[key])
41
+ return false;
42
+ findings[key].state = state;
43
+ writeFindings(projectRoot, findings);
44
+ return true;
45
+ }
46
+ export function ingestScorecard(projectRoot, scorecard) {
47
+ const findings = readFindings(projectRoot);
48
+ let added = 0;
49
+ for (const q of scorecard.questions) {
50
+ if (q.answer === "yes")
51
+ continue; // no finding for passing questions
52
+ const key = `${scorecard.changeId}:${q.id}`;
53
+ if (!findings[key]) {
54
+ findings[key] = {
55
+ state: "unresolved",
56
+ questionId: q.id,
57
+ severity: q.severity,
58
+ finding: q.finding,
59
+ changeId: scorecard.changeId,
60
+ };
61
+ added++;
62
+ }
63
+ }
64
+ if (added > 0) {
65
+ writeFindings(projectRoot, findings);
66
+ }
67
+ return added;
68
+ }
@@ -8,6 +8,7 @@
8
8
  import { spawn } from "node:child_process";
9
9
  import { join } from "node:path";
10
10
  import { getProjectGroupId } from "../config.js";
11
+ import { ingestScorecard } from "./findings.js";
11
12
  import { EvalLogWriter } from "./log-writer.js";
12
13
  import { buildJudgePrompt } from "./prompt-builder.js";
13
14
  import { V1_RUBRIC } from "./rubric.js";
@@ -127,6 +128,7 @@ export function runJudgeBackground(opts) {
127
128
  scorecard.telemetryPosted = true;
128
129
  }
129
130
  await logWriter.append(scorecard);
131
+ ingestScorecard(opts.projectRoot, scorecard);
130
132
  }
131
133
  catch (err) {
132
134
  const errorEntry = {
@@ -230,6 +232,7 @@ export async function runJudgeSync(opts) {
230
232
  scorecard.telemetryPosted = true;
231
233
  }
232
234
  await logWriter.append(scorecard);
235
+ ingestScorecard(opts.projectRoot, scorecard);
233
236
  resolve(scorecard);
234
237
  }
235
238
  catch (err) {
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Persistent judge session management.
3
+ *
4
+ * First eval spawns a new session with full catchup. Subsequent evals resume
5
+ * the same session — no catchup cost, just "evaluate this change."
6
+ *
7
+ * Session state stored in `.indusk/eval/judge-session.json`.
8
+ */
9
+ import type { EvalErrorEntry, EvalScorecard } from "./types.js";
10
/**
 * Run eval using a persistent session. First call does catchup + eval.
 * Subsequent calls resume the session with just the new change.
 *
 * @param opts.projectRoot - Project directory; the session file, eval log and
 *   findings live under its `.indusk/eval/` folder, and the judge runs with it as cwd.
 * @param opts.changeId - Id of the change to evaluate.
 * @param opts.transcriptPath - Passed to the judge prompt builder on a first (non-resumed) run.
 * @param opts.mode - Passed to the judge prompt builder on a first run and recorded on error entries.
 * @param opts.evalEndpoint - NOTE(review): accepted here, but its use could not be confirmed from the implementation reviewed.
 * @returns The parsed scorecard on success, or an error entry describing the failure.
 */
export declare function runPersistentEval(opts: {
    projectRoot: string;
    changeId: string;
    transcriptPath: string;
    mode: "eval" | "baseline";
    evalEndpoint?: string;
}): Promise<EvalScorecard | EvalErrorEntry>;
@@ -0,0 +1,192 @@
1
+ /**
2
+ * Persistent judge session management.
3
+ *
4
+ * First eval spawns a new session with full catchup. Subsequent evals resume
5
+ * the same session — no catchup cost, just "evaluate this change."
6
+ *
7
+ * Session state stored in `.indusk/eval/judge-session.json`.
8
+ */
9
import { spawn } from "node:child_process";
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { getProjectGroupId } from "../config.js";
import { ingestScorecard } from "./findings.js";
import { EvalLogWriter } from "./log-writer.js";
import { buildJudgePrompt } from "./prompt-builder.js";
import { V1_RUBRIC } from "./rubric.js";
17
/** Location of the persisted judge session state: `<projectRoot>/.indusk/eval/judge-session.json`. */
function getSessionPath(projectRoot) {
    const segments = [".indusk", "eval", "judge-session.json"];
    return join(projectRoot, ...segments);
}
20
/** Location of the eval results log: `<projectRoot>/.indusk/eval/results.log`. */
function getEvalLogPath(projectRoot) {
    const segments = [".indusk", "eval", "results.log"];
    return join(projectRoot, ...segments);
}
23
+ function readSession(projectRoot) {
24
+ const path = getSessionPath(projectRoot);
25
+ if (!existsSync(path))
26
+ return null;
27
+ try {
28
+ return JSON.parse(readFileSync(path, "utf8"));
29
+ }
30
+ catch {
31
+ return null;
32
+ }
33
+ }
34
+ function writeSession(projectRoot, session) {
35
+ const path = getSessionPath(projectRoot);
36
+ mkdirSync(dirname(path), { recursive: true });
37
+ writeFileSync(path, `${JSON.stringify(session, null, 2)}\n`);
38
+ }
39
+ function clearSession(projectRoot) {
40
+ const path = getSessionPath(projectRoot);
41
+ if (existsSync(path)) {
42
+ const { unlinkSync } = require("node:fs");
43
+ unlinkSync(path);
44
+ }
45
+ }
46
/**
 * Tools the judge may use. The entries are joined with commas and passed as the
 * value of the `--allowed-tools` argument on both the first-run and the resume
 * invocation of `claude`.
 * NOTE(review): the `Bash(jj:*)` / `Bash(git:*)` entries presumably limit shell
 * access to jj and git commands — confirm against the CLI's pattern syntax.
 */
const ALLOWED_TOOLS = [
    "Read",
    "Grep",
    "Glob",
    "Bash(jj:*)",
    "Bash(git:*)",
    "mcp__graphiti__*",
    "mcp__indusk__*",
    "mcp__codegraphcontext__*",
];
56
/**
 * Extracts the scorecard text, usage figures and session id from the judge's stdout.
 *
 * When stdout is a JSON envelope, the text is taken from `result`, `text` or
 * `content` (first one present), the session id from `session_id`, and usage is
 * filled in when a cost or a usage object is reported (missing numbers become 0).
 * When stdout is not JSON it is used as the text as-is. In both cases, if the
 * text contains a fenced code block, only the body of the first one is returned.
 *
 * @returns `{ scorecardText, usage, sessionId }`; `usage` and `sessionId` are
 *   `undefined` when not reported.
 */
function parseClaudeOutput(stdout) {
    let text = stdout;
    let usage;
    let sessionId;
    try {
        const envelope = JSON.parse(stdout);
        text = envelope.result ?? envelope.text ?? envelope.content ?? stdout;
        sessionId = envelope.session_id;
        const reportsUsage = envelope.total_cost_usd !== undefined || Boolean(envelope.usage);
        if (reportsUsage) {
            const tokens = envelope.usage ?? {};
            usage = {
                costUsd: envelope.total_cost_usd ?? 0,
                inputTokens: tokens.input_tokens ?? 0,
                outputTokens: tokens.output_tokens ?? 0,
                cacheCreationTokens: tokens.cache_creation_input_tokens ?? 0,
                cacheReadTokens: tokens.cache_read_input_tokens ?? 0,
                durationMs: envelope.duration_ms ?? 0,
            };
        }
    }
    catch {
        // stdout was not a JSON envelope; keep treating it as the raw text
    }
    // Prefer the body of the first fenced block (optionally tagged `json`) when present.
    const fence = text.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
    const fencedBody = fence?.[1];
    return { scorecardText: fencedBody ? fencedBody : text, usage, sessionId };
}
85
+ async function spawnClaude(args, prompt, cwd) {
86
+ return new Promise((resolve) => {
87
+ const child = spawn("claude", args, {
88
+ cwd,
89
+ stdio: ["pipe", "pipe", "pipe"],
90
+ env: { ...process.env },
91
+ });
92
+ child.stdin?.write(prompt);
93
+ child.stdin?.end();
94
+ let stdout = "";
95
+ let stderr = "";
96
+ child.stdout?.on("data", (chunk) => {
97
+ stdout += chunk.toString();
98
+ });
99
+ child.stderr?.on("data", (chunk) => {
100
+ stderr += chunk.toString();
101
+ });
102
+ child.on("close", (code) => {
103
+ resolve({ stdout, stderr, code });
104
+ });
105
+ });
106
+ }
107
/**
 * Run eval using a persistent session. First call does catchup + eval.
 * Subsequent calls resume the session with just the new change.
 *
 * Flow:
 *  1. With a stored session, `claude` is resumed with a short prompt naming the
 *     change; otherwise the full judge prompt is built and a new run is started.
 *  2. A non-zero exit while resuming clears the stored session and retries once
 *     from scratch; a non-zero exit on a fresh run is an error.
 *  3. The scorecard JSON is parsed from the output, the session state is saved,
 *     the scorecard is appended to the eval log and its findings are ingested.
 *
 * Any failure is appended to the eval log as an error entry, which is also the
 * return value in that case — the function does not throw for those failures.
 *
 * @param opts `projectRoot`, `changeId`, `transcriptPath` and `mode`, as used below.
 * @returns The parsed scorecard, or the error entry that was logged.
 */
export async function runPersistentEval(opts) {
    const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
    const session = readSession(opts.projectRoot);
    const projectGroup = getProjectGroupId(opts.projectRoot);
    try {
        let result;
        if (session) {
            // Resume existing session — cheap eval, no catchup
            const resumePrompt = `Evaluate a new commit. Change ID: ${opts.changeId}

Run \`jj diff -r ${opts.changeId}\` to see what changed. Then answer the same evaluation questions as before. Read the changed files for full context.

Output ONLY the JSON scorecard as before — no commentary.`;
            result = await spawnClaude([
                "--print",
                "--output-format",
                "json",
                "--resume",
                session.sessionId,
                "--allowed-tools",
                ALLOWED_TOOLS.join(","),
            ], resumePrompt, opts.projectRoot);
        }
        else {
            // First eval — full catchup + evaluation
            const fullPrompt = buildJudgePrompt({
                rubric: V1_RUBRIC,
                changeId: opts.changeId,
                transcriptPath: opts.transcriptPath,
                mode: opts.mode,
                projectGroup,
            });
            result = await spawnClaude([
                "--print",
                "--output-format",
                "json",
                "--model",
                "opus",
                "--permission-mode",
                "acceptEdits",
                "--allowed-tools",
                ALLOWED_TOOLS.join(","),
            ], fullPrompt, opts.projectRoot);
        }
        if (result.code !== 0) {
            // If resuming failed, clear session and retry with full catchup.
            // The retry finds no session, so it cannot recurse a second time.
            if (session) {
                clearSession(opts.projectRoot);
                return runPersistentEval(opts);
            }
            throw new Error(`claude exited with code ${result.code}: ${result.stderr.slice(0, 500)}`);
        }
        const parsed = parseClaudeOutput(result.stdout);
        const scorecard = JSON.parse(parsed.scorecardText.trim());
        if (parsed.usage)
            scorecard.usage = parsed.usage;
        scorecard.telemetryPosted = false;
        // Update session state. Prefer the id reported by this run, falling back
        // to the one that was resumed. When neither is known nothing is saved:
        // persisting a placeholder id would only make the next run attempt a
        // resume that cannot succeed before falling back to a full catchup.
        const sessionId = parsed.sessionId ?? session?.sessionId;
        if (sessionId) {
            const now = new Date().toISOString();
            const newSession = {
                sessionId,
                createdAt: session?.createdAt ?? now,
                lastEvalAt: now,
                evalCount: (session?.evalCount ?? 0) + 1,
            };
            writeSession(opts.projectRoot, newSession);
        }
        await logWriter.append(scorecard);
        ingestScorecard(opts.projectRoot, scorecard);
        return scorecard;
    }
    catch (err) {
        const errorEntry = {
            version: 1,
            timestamp: new Date().toISOString(),
            mode: opts.mode,
            changeId: opts.changeId,
            error: true,
            message: err instanceof Error ? err.message : String(err),
        };
        await logWriter.append(errorEntry);
        return errorEntry;
    }
}
@@ -5,7 +5,7 @@
5
5
  * rubric, defined in rubric.ts and answered by the judge agent.
6
6
  */
7
7
/**
 * Tells whether a log entry is a scorecard: it has no `error` property and its
 * `questions` property is an array.
 *
 * @param entry Value read from the eval log; may be anything.
 * @returns `true` for scorecards, `false` for error entries and for values that
 *   are not objects at all.
 */
export function isScorecard(entry) {
    // The `in` operator throws a TypeError for null and primitives, so rule
    // those out first.
    if (entry === null || (typeof entry !== "object" && typeof entry !== "function")) {
        return false;
    }
    return !("error" in entry) && Array.isArray(entry.questions);
}
10
10
  export function isErrorEntry(entry) {
11
11
  return "error" in entry && entry.error === true;
@@ -141,10 +141,33 @@ if (!judgeRunnerPath) {
141
141
  process.exit(0);
142
142
  }
143
143
 
144
- // Spawn a detached node process that calls runJudgeSync (which awaits completion).
144
// Surface unresolved findings from previous evals. The findings module is looked
// up next to the judge runner; when it is absent or fails to load, this step is
// skipped without output.
const findingsPath = judgeRunnerPath.replace("judge-runner.js", "findings.js");
if (existsSync(findingsPath)) {
  try {
    const findingsModule = await import(findingsPath);
    const unresolved = findingsModule.getUnresolvedFindings(projectRoot);
    if (unresolved.length > 0) {
      const header = `\n📊 Unresolved eval findings (${unresolved.length}):\n`;
      const body = unresolved
        .map((f) => ` [${f.severity}] ${f.questionId}: ${f.finding} (change ${f.changeId.slice(0, 8)})`)
        .join("\n");
      const footer = "\nUse `indusk eval fix <key>` or `indusk eval ignore <key>` to resolve.\n\n";
      process.stderr.write(`${header}${body}${footer}`);
    }
  } catch {
    // findings module not available — skip silently
  }
}

// Use persistent judge — resumes existing session if available, otherwise does full catchup.
// Falls back to the plain judge runner when the persistent judge module is not installed.
const persistentJudgePath = judgeRunnerPath.replace("judge-runner.js", "persistent-judge.js");
const [useModule, useFunction] = existsSync(persistentJudgePath)
  ? [persistentJudgePath, "runPersistentEval"]
  : [judgeRunnerPath, "runJudgeSync"];
167
+
145
168
  const judgeScript = `
146
- import("${judgeRunnerPath}")
147
- .then(m => m.runJudgeSync({
169
+ import("${useModule}")
170
+ .then(m => m.${useFunction}({
148
171
  projectRoot: ${JSON.stringify(projectRoot)},
149
172
  changeId: ${JSON.stringify(changeId)},
150
173
  transcriptPath: ${JSON.stringify(transcriptPath)},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@infinitedusky/indusk-mcp",
3
- "version": "1.12.1",
3
+ "version": "1.13.0",
4
4
  "description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
5
5
  "type": "module",
6
6
  "files": [
package/skills/planner.md CHANGED
@@ -25,7 +25,7 @@ Each document builds on the ones before it. Not every plan needs all five — us
25
25
 
26
26
  The order is always preserved — never write an ADR before the brief, or an impl before the ADR (when both exist).
27
27
 
28
- General-purpose research (insights useful across plans) also lives in `research/` at the repo root.
28
+ General-purpose research (insights useful across plans) also lives in `.indusk/research/`.
29
29
 
30
30
  ## Workflow Types
31
31
 
@@ -62,12 +62,12 @@ Workflow templates are in `templates/workflows/` in the package. They describe w
62
62
  - **refactor**: start with brief (includes boundary map)
63
63
  - **spike**: start with research (and stop there)
64
64
 
65
- **Check for existing research first.** Before writing new research, scan `research/` at the repo root for relevant standalone research docs. If one exists (e.g., `research/auth-options.md`), ask the user: "I found existing research at `research/auth-options.md`. Want to use this as the starting point?" If yes:
65
+ **Check for existing research first.** Before writing new research, scan `.indusk/research/` for relevant standalone research docs. If one exists (e.g., `.indusk/research/auth-options.md`), ask the user: "I found existing research at `.indusk/research/auth-options.md`. Want to use this as the starting point?" If yes:
66
66
  - Copy it to `.indusk/planning/{plan-name}/research.md`
67
67
  - Set the frontmatter status to `complete`
68
68
  - Move straight to the brief
69
69
 
70
- The `research/` directory is for standalone exploration that isn't tied to a plan yet. When it becomes a plan, it moves into the planning folder. The original in `research/` can be deleted or kept as a reference — user's choice.
70
+ The `.indusk/research/` directory is for standalone exploration that isn't tied to a plan yet. When it becomes a plan, it moves into the planning folder. The original in `.indusk/research/` can be deleted or kept as a reference — user's choice.
71
71
 
72
72
  For feature/spike workflows that need new research: Explore the problem space — read code, search the web, check Context7 for library docs. **Query the code graph before scoping** (see toolbelt "Before Modifying Code") — include structural findings in research.md with concrete numbers.
73
73
  Document what you find. The research doc records findings and analysis, but saves the recommendation for the brief.
@@ -336,7 +336,7 @@ date: {YYYY-MM-DD}
336
336
  - {Hindsight — decisions that could have been better, steps to skip or add}
337
337
 
338
338
  ## Insights Worth Carrying Forward
339
- {Takeaways for future plans. Save to research/ if broadly useful.}
339
+ {Takeaways for future plans. Save to .indusk/research/ if broadly useful.}
340
340
 
341
341
  ## Quality Ratchet
342
342
  {Could any mistakes in this plan have been caught automatically by a Biome rule? If yes, add the rule to biome.json and document it in biome-rationale.md. The quality ratchet only gets tighter.}
@@ -361,7 +361,7 @@ date: {YYYY-MM-DD}
361
361
  └── archive/
362
362
  └── {completed-plan}/
363
363
 
364
- research/ # Standalone insights useful across plans
364
+ .indusk/research/ # Standalone insights useful across plans
365
365
  ```
366
366
 
367
367
  - Kebab-case folder names
@@ -374,6 +374,6 @@ research/ # Standalone insights useful across plans
374
374
  - **Use the code graph for scoping.** Before writing a brief or impl, query `analyze_code_relationships` to understand what depends on what. "How many files import X?" and "What calls this function?" prevent underscoping.
375
375
  - Keep Y-statements concise but complete. Every field filled in.
376
376
  - Impl checklists: granular enough to track, not so granular they're busywork.
377
- - When research produces broadly useful insights, also save to `research/` at repo root.
377
+ - When research produces broadly useful insights, also save to `.indusk/research/`.
378
378
  - Cross-reference related plans by path whenever work overlaps between plans.
379
379
  - The user's input is: $ARGUMENTS