npm - @infinitedusky/indusk-mcp - Versions diffs - 1.12.1 → 1.13.0 - Mend

@infinitedusky/indusk-mcp 1.12.1 → 1.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/bin/cli.js +22 -0
package/dist/bin/commands/eval.d.ts +4 -0
package/dist/bin/commands/eval.js +25 -0
package/dist/lib/eval/findings.d.ts +23 -0
package/dist/lib/eval/findings.js +68 -0
package/dist/lib/eval/judge-runner.js +3 -0
package/dist/lib/eval/persistent-judge.d.ts +20 -0
package/dist/lib/eval/persistent-judge.js +192 -0
package/dist/lib/eval/types.js +1 -1
package/hooks/eval-trigger.js +26 -3
package/package.json +1 -1
package/skills/planner.md +6 -6

package/dist/bin/cli.js CHANGED Viewed

@@ -259,6 +259,28 @@ eval_
     const { evalSummary } = await import("./commands/eval.js");
     await evalSummary(process.cwd(), opts);
 });
+eval_
+    .command("findings")
+    .description("List unresolved eval findings")
+    .option("--all", "Show all findings including fixed/ignored")
+    .action(async (opts) => {
+    const { evalFindings } = await import("./commands/eval.js");
+    await evalFindings(process.cwd(), opts);
+});
+eval_
+    .command("fix <key>")
+    .description("Mark an eval finding as fixed")
+    .action(async (key) => {
+    const { evalMark } = await import("./commands/eval.js");
+    await evalMark(process.cwd(), key, "fixed");
+});
+eval_
+    .command("ignore <key>")
+    .description("Mark an eval finding as ignored")
+    .action(async (key) => {
+    const { evalMark } = await import("./commands/eval.js");
+    await evalMark(process.cwd(), key, "ignored");
+});
 eval_
     .command("baseline")
     .description("Run baseline evaluation with vanilla agent")

package/dist/bin/commands/eval.d.ts CHANGED Viewed

@@ -9,6 +9,10 @@ export declare function evalSummary(projectRoot: string, opts: {
     since?: string;
     json?: boolean;
 }): Promise<void>;
+/**
+ * Print the project's eval findings to the console.
+ *
+ * @param projectRoot - Project directory whose findings are listed.
+ * @param opts - `all`: also list fixed and ignored findings; when unset only
+ *   unresolved findings are printed.
+ */
+export declare function evalFindings(projectRoot: string, opts: {
+    all?: boolean;
+}): Promise<void>;
+/**
+ * Record a new state for one finding and report the outcome on the console.
+ * When no finding is stored under `key`, an error is printed and the process
+ * exits with status 1.
+ *
+ * @param projectRoot - Project directory that holds the findings.
+ * @param key - Key of the finding, as printed by the findings listing.
+ * @param state - State to record for the finding.
+ */
+export declare function evalMark(projectRoot: string, key: string, state: "fixed" | "ignored"): Promise<void>;
 export declare function evalBaseline(projectRoot: string, opts: {
     task: string;
     keep?: boolean;

package/dist/bin/commands/eval.js CHANGED Viewed

@@ -6,6 +6,7 @@
  */
 import { existsSync } from "node:fs";
 import { join } from "node:path";
+import { getAllFindings, getUnresolvedFindings, markFinding } from "../../lib/eval/findings.js";
 import { readAllEntries } from "../../lib/eval/log-reader.js";
 import { isScorecard } from "../../lib/eval/types.js";
 function getEvalLogPath(projectRoot) {
@@ -106,6 +107,30 @@ function computeSummary(scorecards) {
         trend,
     };
 }
+export async function evalFindings(projectRoot, opts) {
+    const findings = opts.all ? getAllFindings(projectRoot) : getUnresolvedFindings(projectRoot);
+    if (findings.length === 0) {
+        console.info(opts.all ? "No eval findings." : "No unresolved findings.");
+        return;
+    }
+    console.info(`\n${opts.all ? "All" : "Unresolved"} eval findings (${findings.length}):\n`);
+    for (const f of findings) {
+        const icon = f.state === "fixed" ? "✓" : f.state === "ignored" ? "–" : "●";
+        console.info(`  ${icon} [${f.severity}] ${f.questionId}: ${f.finding}`);
+        console.info(`    key: ${f.key}  change: ${f.changeId.slice(0, 8)}  state: ${f.state}`);
+    }
+    console.info("");
+}
+export async function evalMark(projectRoot, key, state) {
+    const success = markFinding(projectRoot, key, state);
+    if (success) {
+        console.info(`Marked ${key} as ${state}`);
+    }
+    else {
+        console.error(`Finding not found: ${key}`);
+        process.exit(1);
+    }
+}
 function computePassRates(cards) {
     const counts = {};
     for (const card of cards) {

package/dist/lib/eval/findings.d.ts ADDED Viewed

@@ -0,0 +1,23 @@
+/**
+ * Tracks eval finding resolution state.
+ *
+ * Findings persist as "unresolved" until explicitly fixed or ignored.
+ * The eval hook surfaces unresolved findings on every jj describe.
+ */
+import type { EvalScorecard } from "./types.js";
+/** Resolution state of a finding; ingested findings start as "unresolved". */
+export type FindingState = "unresolved" | "fixed" | "ignored";
+/** One stored finding, as recorded when a scorecard is ingested. */
+export interface FindingEntry {
+    /** Current resolution state. */
+    state: FindingState;
+    /** Id of the scorecard question that produced the finding. */
+    questionId: string;
+    /** Severity copied from the scorecard question. */
+    severity: string;
+    /** Finding text copied from the scorecard question. */
+    finding: string;
+    /** Change id copied from the scorecard. */
+    changeId: string;
+}
+/**
+ * List the stored findings whose state is "unresolved", each together with
+ * the key it is stored under.
+ */
+export declare function getUnresolvedFindings(projectRoot: string): Array<{
+    key: string;
+} & FindingEntry>;
+/** List every stored finding, each together with the key it is stored under. */
+export declare function getAllFindings(projectRoot: string): Array<{
+    key: string;
+} & FindingEntry>;
+/**
+ * Set the state of the finding stored under `key` and save the store.
+ *
+ * @returns `false` when no finding is stored under `key` (nothing is written),
+ *   `true` after the new state has been saved.
+ */
+export declare function markFinding(projectRoot: string, key: string, state: FindingState): boolean;
+/**
+ * Add an "unresolved" finding for each scorecard question whose answer is not
+ * "yes" and that is not stored yet. Findings are keyed by
+ * `<changeId>:<questionId>`; existing entries are left as they are.
+ *
+ * @returns The number of findings added.
+ */
+export declare function ingestScorecard(projectRoot: string, scorecard: EvalScorecard): number;

package/dist/lib/eval/findings.js ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * Tracks eval finding resolution state.
+ *
+ * Findings persist as "unresolved" until explicitly fixed or ignored.
+ * The eval hook surfaces unresolved findings on every jj describe.
+ */
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+function getFindingsPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "findings.json");
+}
+function readFindings(projectRoot) {
+    const path = getFindingsPath(projectRoot);
+    if (!existsSync(path))
+        return {};
+    try {
+        return JSON.parse(readFileSync(path, "utf8"));
+    }
+    catch {
+        return {};
+    }
+}
+function writeFindings(projectRoot, findings) {
+    const path = getFindingsPath(projectRoot);
+    mkdirSync(dirname(path), { recursive: true });
+    writeFileSync(path, `${JSON.stringify(findings, null, 2)}\n`);
+}
+/** Return the stored findings still marked "unresolved", each with its key. */
+export function getUnresolvedFindings(projectRoot) {
+    const unresolved = [];
+    for (const [key, entry] of Object.entries(readFindings(projectRoot))) {
+        if (entry.state === "unresolved") {
+            unresolved.push({ key, ...entry });
+        }
+    }
+    return unresolved;
+}
+export function getAllFindings(projectRoot) {
+    const findings = readFindings(projectRoot);
+    return Object.entries(findings).map(([key, entry]) => ({ key, ...entry }));
+}
+export function markFinding(projectRoot, key, state) {
+    const findings = readFindings(projectRoot);
+    if (!findings[key])
+        return false;
+    findings[key].state = state;
+    writeFindings(projectRoot, findings);
+    return true;
+}
+export function ingestScorecard(projectRoot, scorecard) {
+    const findings = readFindings(projectRoot);
+    let added = 0;
+    for (const q of scorecard.questions) {
+        if (q.answer === "yes")
+            continue; // no finding for passing questions
+        const key = `${scorecard.changeId}:${q.id}`;
+        if (!findings[key]) {
+            findings[key] = {
+                state: "unresolved",
+                questionId: q.id,
+                severity: q.severity,
+                finding: q.finding,
+                changeId: scorecard.changeId,
+            };
+            added++;
+        }
+    }
+    if (added > 0) {
+        writeFindings(projectRoot, findings);
+    }
+    return added;
+}

package/dist/lib/eval/judge-runner.js CHANGED Viewed

@@ -8,6 +8,7 @@
 import { spawn } from "node:child_process";
 import { join } from "node:path";
 import { getProjectGroupId } from "../config.js";
+import { ingestScorecard } from "./findings.js";
 import { EvalLogWriter } from "./log-writer.js";
 import { buildJudgePrompt } from "./prompt-builder.js";
 import { V1_RUBRIC } from "./rubric.js";
@@ -127,6 +128,7 @@ export function runJudgeBackground(opts) {
                 scorecard.telemetryPosted = true;
             }
             await logWriter.append(scorecard);
+            ingestScorecard(opts.projectRoot, scorecard);
         }
         catch (err) {
             const errorEntry = {
@@ -230,6 +232,7 @@ export async function runJudgeSync(opts) {
                     scorecard.telemetryPosted = true;
                 }
                 await logWriter.append(scorecard);
+                ingestScorecard(opts.projectRoot, scorecard);
                 resolve(scorecard);
             }
             catch (err) {

package/dist/lib/eval/persistent-judge.d.ts ADDED Viewed

@@ -0,0 +1,20 @@
+/**
+ * Persistent judge session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/judge-session.json`.
+ */
+import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+/**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ *
+ * @param opts - `projectRoot`: directory the judge runs in and where session
+ *   state and results are stored; `changeId`: change to evaluate;
+ *   `transcriptPath` and `mode`: passed to the judge prompt on a first eval.
+ * @returns The parsed scorecard, or an error entry when running the judge or
+ *   parsing its output fails — such failures are logged and returned rather
+ *   than thrown.
+ */
+export declare function runPersistentEval(opts: {
+    projectRoot: string;
+    changeId: string;
+    transcriptPath: string;
+    mode: "eval" | "baseline";
+    evalEndpoint?: string;
+}): Promise<EvalScorecard | EvalErrorEntry>;

package/dist/lib/eval/persistent-judge.js ADDED Viewed

@@ -0,0 +1,192 @@
+/**
+ * Persistent judge session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/judge-session.json`.
+ */
+import { spawn } from "node:child_process";
+import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { getProjectGroupId } from "../config.js";
+import { ingestScorecard } from "./findings.js";
+import { EvalLogWriter } from "./log-writer.js";
+import { buildJudgePrompt } from "./prompt-builder.js";
+import { V1_RUBRIC } from "./rubric.js";
+function getSessionPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "judge-session.json");
+}
+function getEvalLogPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "results.log");
+}
+function readSession(projectRoot) {
+    const path = getSessionPath(projectRoot);
+    if (!existsSync(path))
+        return null;
+    try {
+        return JSON.parse(readFileSync(path, "utf8"));
+    }
+    catch {
+        return null;
+    }
+}
+function writeSession(projectRoot, session) {
+    const path = getSessionPath(projectRoot);
+    mkdirSync(dirname(path), { recursive: true });
+    writeFileSync(path, `${JSON.stringify(session, null, 2)}\n`);
+}
+function clearSession(projectRoot) {
+    const path = getSessionPath(projectRoot);
+    if (existsSync(path)) {
+        const { unlinkSync } = require("node:fs");
+        unlinkSync(path);
+    }
+}
+// Tool patterns handed to the `claude` CLI, comma-joined, as the value of
+// its `--allowed-tools` argument.
+const ALLOWED_TOOLS = [
+    "Read",
+    "Grep",
+    "Glob",
+    "Bash(jj:*)",
+    "Bash(git:*)",
+    "mcp__graphiti__*",
+    "mcp__indusk__*",
+    "mcp__codegraphcontext__*",
+];
+function parseClaudeOutput(stdout) {
+    let scorecardText = stdout;
+    let usage;
+    let sessionId;
+    try {
+        const jsonOutput = JSON.parse(stdout);
+        scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+        sessionId = jsonOutput.session_id;
+        if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+            const u = jsonOutput.usage ?? {};
+            usage = {
+                costUsd: jsonOutput.total_cost_usd ?? 0,
+                inputTokens: u.input_tokens ?? 0,
+                outputTokens: u.output_tokens ?? 0,
+                cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+                cacheReadTokens: u.cache_read_input_tokens ?? 0,
+                durationMs: jsonOutput.duration_ms ?? 0,
+            };
+        }
+    }
+    catch {
+        // raw output
+    }
+    const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+    if (jsonMatch?.[1]) {
+        scorecardText = jsonMatch[1];
+    }
+    return { scorecardText, usage, sessionId };
+}
+async function spawnClaude(args, prompt, cwd) {
+    return new Promise((resolve) => {
+        const child = spawn("claude", args, {
+            cwd,
+            stdio: ["pipe", "pipe", "pipe"],
+            env: { ...process.env },
+        });
+        child.stdin?.write(prompt);
+        child.stdin?.end();
+        let stdout = "";
+        let stderr = "";
+        child.stdout?.on("data", (chunk) => {
+            stdout += chunk.toString();
+        });
+        child.stderr?.on("data", (chunk) => {
+            stderr += chunk.toString();
+        });
+        child.on("close", (code) => {
+            resolve({ stdout, stderr, code });
+        });
+    });
+}
+/**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+export async function runPersistentEval(opts) {
+    const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+    const session = readSession(opts.projectRoot);
+    const projectGroup = getProjectGroupId(opts.projectRoot);
+    try {
+        let result;
+        if (session) {
+            // Resume existing session — cheap eval, no catchup
+            const resumePrompt = `Evaluate a new commit. Change ID: ${opts.changeId}
+Run \`jj diff -r ${opts.changeId}\` to see what changed. Then answer the same evaluation questions as before. Read the changed files for full context.
+Output ONLY the JSON scorecard as before — no commentary.`;
+            result = await spawnClaude([
+                "--print",
+                "--output-format",
+                "json",
+                "--resume",
+                session.sessionId,
+                "--allowed-tools",
+                ALLOWED_TOOLS.join(","),
+            ], resumePrompt, opts.projectRoot);
+        }
+        else {
+            // First eval — full catchup + evaluation
+            const fullPrompt = buildJudgePrompt({
+                rubric: V1_RUBRIC,
+                changeId: opts.changeId,
+                transcriptPath: opts.transcriptPath,
+                mode: opts.mode,
+                projectGroup,
+            });
+            result = await spawnClaude([
+                "--print",
+                "--output-format",
+                "json",
+                "--model",
+                "opus",
+                "--permission-mode",
+                "acceptEdits",
+                "--allowed-tools",
+                ALLOWED_TOOLS.join(","),
+            ], fullPrompt, opts.projectRoot);
+        }
+        if (result.code !== 0) {
+            // If resuming failed, clear session and retry with full catchup
+            if (session) {
+                clearSession(opts.projectRoot);
+                return runPersistentEval(opts);
+            }
+            throw new Error(`claude exited with code ${result.code}: ${result.stderr.slice(0, 500)}`);
+        }
+        const parsed = parseClaudeOutput(result.stdout);
+        const scorecard = JSON.parse(parsed.scorecardText.trim());
+        if (parsed.usage)
+            scorecard.usage = parsed.usage;
+        scorecard.telemetryPosted = false;
+        // Update session state
+        const newSession = {
+            sessionId: parsed.sessionId ?? session?.sessionId ?? "unknown",
+            createdAt: session?.createdAt ?? new Date().toISOString(),
+            lastEvalAt: new Date().toISOString(),
+            evalCount: (session?.evalCount ?? 0) + 1,
+        };
+        writeSession(opts.projectRoot, newSession);
+        await logWriter.append(scorecard);
+        ingestScorecard(opts.projectRoot, scorecard);
+        return scorecard;
+    }
+    catch (err) {
+        const errorEntry = {
+            version: 1,
+            timestamp: new Date().toISOString(),
+            mode: opts.mode,
+            changeId: opts.changeId,
+            error: true,
+            message: err instanceof Error ? err.message : String(err),
+        };
+        await logWriter.append(errorEntry);
+        return errorEntry;
+    }
+}

package/dist/lib/eval/types.js CHANGED Viewed

@@ -5,7 +5,7 @@
  * rubric, defined in rubric.ts and answered by the judge agent.
  */
 export function isScorecard(entry) {
-    return !("error" in entry) && "questions" in entry && Array.isArray(entry.questions);
+    return (!("error" in entry) && "questions" in entry && Array.isArray(entry.questions));
 }
 export function isErrorEntry(entry) {
     return "error" in entry && entry.error === true;

package/hooks/eval-trigger.js CHANGED Viewed

@@ -141,10 +141,33 @@ if (!judgeRunnerPath) {
 	process.exit(0);
 }
-// Spawn a detached node process that calls runJudgeSync (which awaits completion).
+// Surface unresolved findings from previous evals
+const findingsPath = judgeRunnerPath.replace("judge-runner.js", "findings.js");
+if (existsSync(findingsPath)) {
+	try {
+		const { getUnresolvedFindings } = await import(findingsPath);
+		const unresolved = getUnresolvedFindings(projectRoot);
+		if (unresolved.length > 0) {
+			const lines = unresolved.map(
+				(f) => `  [${f.severity}] ${f.questionId}: ${f.finding} (change ${f.changeId.slice(0, 8)})`,
+			);
+			process.stderr.write(
+				`\n📊 Unresolved eval findings (${unresolved.length}):\n${lines.join("\n")}\nUse \`indusk eval fix <key>\` or \`indusk eval ignore <key>\` to resolve.\n\n`,
+			);
+		}
+	} catch {
+		// findings module not available — skip silently
+	}
+}
+// Use persistent judge — resumes existing session if available, otherwise does full catchup.
+const persistentJudgePath = judgeRunnerPath.replace("judge-runner.js", "persistent-judge.js");
+const useModule = existsSync(persistentJudgePath) ? persistentJudgePath : judgeRunnerPath;
+const useFunction = existsSync(persistentJudgePath) ? "runPersistentEval" : "runJudgeSync";
 const judgeScript = `
-import("${judgeRunnerPath}")
-  .then(m => m.runJudgeSync({
+import("${useModule}")
+  .then(m => m.${useFunction}({
     projectRoot: ${JSON.stringify(projectRoot)},
     changeId: ${JSON.stringify(changeId)},
     transcriptPath: ${JSON.stringify(transcriptPath)},

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@infinitedusky/indusk-mcp",
-	"version": "1.12.1",
+	"version": "1.13.0",
 	"description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
 	"type": "module",
 	"files": [

package/skills/planner.md CHANGED Viewed

@@ -25,7 +25,7 @@ Each document builds on the ones before it. Not every plan needs all five — us
 The order is always preserved — never write an ADR before the brief, or an impl before the ADR (when both exist).
-General-purpose research (insights useful across plans) also lives in `research/` at the repo root.
+General-purpose research (insights useful across plans) also lives in `.indusk/research/`.
 ## Workflow Types
@@ -62,12 +62,12 @@ Workflow templates are in `templates/workflows/` in the package. They describe w
    - **refactor**: start with brief (includes boundary map)
    - **spike**: start with research (and stop there)
-   **Check for existing research first.** Before writing new research, scan `research/` at the repo root for relevant standalone research docs. If one exists (e.g., `research/auth-options.md`), ask the user: "I found existing research at `research/auth-options.md`. Want to use this as the starting point?" If yes:
+   **Check for existing research first.** Before writing new research, scan `.indusk/research/` for relevant standalone research docs. If one exists (e.g., `.indusk/research/auth-options.md`), ask the user: "I found existing research at `.indusk/research/auth-options.md`. Want to use this as the starting point?" If yes:
    - Copy it to `.indusk/planning/{plan-name}/research.md`
    - Set the frontmatter status to `complete`
    - Move straight to the brief
-   The `research/` directory is for standalone exploration that isn't tied to a plan yet. When it becomes a plan, it moves into the planning folder. The original in `research/` can be deleted or kept as a reference — user's choice.
+   The `.indusk/research/` directory is for standalone exploration that isn't tied to a plan yet. When it becomes a plan, it moves into the planning folder. The original in `.indusk/research/` can be deleted or kept as a reference — user's choice.
    For feature/spike workflows that need new research: Explore the problem space — read code, search the web, check Context7 for library docs. **Query the code graph before scoping** (see toolbelt "Before Modifying Code") — include structural findings in research.md with concrete numbers.
    Document what you find. The research doc records findings and analysis, but saves the recommendation for the brief.
@@ -336,7 +336,7 @@ date: {YYYY-MM-DD}
 - {Hindsight — decisions that could have been better, steps to skip or add}
 ## Insights Worth Carrying Forward
-{Takeaways for future plans. Save to research/ if broadly useful.}
+{Takeaways for future plans. Save to .indusk/research/ if broadly useful.}
 ## Quality Ratchet
 {Could any mistakes in this plan have been caught automatically by a Biome rule? If yes, add the rule to biome.json and document it in biome-rationale.md. The quality ratchet only gets tighter.}
@@ -361,7 +361,7 @@ date: {YYYY-MM-DD}
 └── archive/
     └── {completed-plan}/
-research/                    # Standalone insights useful across plans
+.indusk/research/            # Standalone insights useful across plans
 ```
 - Kebab-case folder names
@@ -374,6 +374,6 @@ research/                    # Standalone insights useful across plans
 - **Use the code graph for scoping.** Before writing a brief or impl, query `analyze_code_relationships` to understand what depends on what. "How many files import X?" and "What calls this function?" prevent underscoping.
 - Keep Y-statements concise but complete. Every field filled in.
 - Impl checklists: granular enough to track, not so granular they're busywork.
-- When research produces broadly useful insights, also save to `research/` at repo root.
+- When research produces broadly useful insights, also save to `.indusk/research/`.
 - Cross-reference related plans by path whenever work overlaps between plans.
 - The user's input is: $ARGUMENTS