@infinitedusky/indusk-mcp 1.12.3 → 1.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Persistent judge session management.
|
|
3
|
+
*
|
|
4
|
+
* First eval spawns a new session with full catchup. Subsequent evals resume
|
|
5
|
+
* the same session — no catchup cost, just "evaluate this change."
|
|
6
|
+
*
|
|
7
|
+
* Session state stored in `.indusk/eval/judge-session.json`.
|
|
8
|
+
*/
|
|
9
|
+
import type { EvalErrorEntry, EvalScorecard } from "./types.js";
|
|
10
|
+
/**
|
|
11
|
+
* Run eval using a persistent session. First call does catchup + eval.
|
|
12
|
+
* Subsequent calls resume the session with just the new change.
* When the judge run fails, the promise resolves with an EvalErrorEntry instead of a scorecard.
* NOTE(review): `evalEndpoint` is accepted here, but the implementation shipped in this release never reads it.
|
|
13
|
+
*/
|
|
14
|
+
export declare function runPersistentEval(opts: {
|
|
15
|
+
projectRoot: string;
|
|
16
|
+
changeId: string;
|
|
17
|
+
transcriptPath: string;
|
|
18
|
+
mode: "eval" | "baseline";
|
|
19
|
+
evalEndpoint?: string;
|
|
20
|
+
}): Promise<EvalScorecard | EvalErrorEntry>;
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Persistent judge session management.
|
|
3
|
+
*
|
|
4
|
+
* First eval spawns a new session with full catchup. Subsequent evals resume
|
|
5
|
+
* the same session — no catchup cost, just "evaluate this change."
|
|
6
|
+
*
|
|
7
|
+
* Session state stored in `.indusk/eval/judge-session.json`.
|
|
8
|
+
*/
|
|
9
|
+
import { spawn } from "node:child_process";
|
|
10
|
+
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
11
|
+
import { dirname, join } from "node:path";
|
|
12
|
+
import { getProjectGroupId } from "../config.js";
|
|
13
|
+
import { ingestScorecard } from "./findings.js";
|
|
14
|
+
import { EvalLogWriter } from "./log-writer.js";
|
|
15
|
+
import { buildJudgePrompt } from "./prompt-builder.js";
|
|
16
|
+
import { V1_RUBRIC } from "./rubric.js";
|
|
17
|
+
/**
 * Location of the persisted judge-session file inside a project.
 *
 * @param {string} projectRoot Root directory of the project being evaluated.
 * @returns {string} `<projectRoot>/.indusk/eval/judge-session.json`
 */
function getSessionPath(projectRoot) {
  const relativeSegments = [".indusk", "eval", "judge-session.json"];
  return join(projectRoot, ...relativeSegments);
}
|
|
20
|
+
/**
 * Location of the eval results log inside a project.
 *
 * @param {string} projectRoot Root directory of the project being evaluated.
 * @returns {string} `<projectRoot>/.indusk/eval/results.log`
 */
function getEvalLogPath(projectRoot) {
  const evalDir = join(projectRoot, ".indusk", "eval");
  return join(evalDir, "results.log");
}
|
|
23
|
+
/**
 * Load the persisted judge session, if a usable one exists.
 *
 * A session is only usable when it is an object carrying a non-empty string
 * `sessionId`, because that value is handed to `claude --resume` as a CLI
 * argument. Anything else (missing file, unreadable/invalid JSON, wrong shape,
 * or the "unknown" placeholder id) is treated as "no session" so the caller
 * falls back to a fresh full-catchup run instead of attempting a resume that
 * cannot work.
 *
 * @param {string} projectRoot Root directory of the project being evaluated.
 * @returns {object | null} The stored session object, or `null` when none is usable.
 */
function readSession(projectRoot) {
  const path = getSessionPath(projectRoot);
  if (!existsSync(path)) {
    return null;
  }
  try {
    const stored = JSON.parse(readFileSync(path, "utf8"));
    const hasUsableId =
      stored !== null &&
      typeof stored === "object" &&
      typeof stored.sessionId === "string" &&
      stored.sessionId.length > 0 &&
      // "unknown" is a placeholder id, never a real resumable session.
      stored.sessionId !== "unknown";
    return hasUsableId ? stored : null;
  } catch {
    // Unreadable or corrupt file: behave as if no session was recorded.
    return null;
  }
}
|
|
34
|
+
/**
 * Persist judge-session state as pretty-printed JSON (2-space indent,
 * trailing newline), creating the containing directory when needed.
 *
 * @param {string} projectRoot Root directory of the project being evaluated.
 * @param {object} session Session state to store.
 */
function writeSession(projectRoot, session) {
  const sessionFile = getSessionPath(projectRoot);
  const parentDir = dirname(sessionFile);
  mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(session, null, 2);
  writeFileSync(sessionFile, `${serialized}\n`);
}
|
|
39
|
+
/**
 * Delete the persisted judge session so the next eval starts from scratch.
 *
 * Uses the statically imported `unlinkSync`: this file is an ES module, where
 * `require` is not defined, so a `require("node:fs")` call here would throw a
 * ReferenceError exactly when a stale session needs to be discarded.
 *
 * A missing file is not an error. Any other failure is rethrown on purpose:
 * the caller retries after clearing, and silently leaving the file in place
 * would make it resume the same broken session again.
 *
 * @param {string} projectRoot Root directory of the project being evaluated.
 */
function clearSession(projectRoot) {
  const path = getSessionPath(projectRoot);
  try {
    unlinkSync(path);
  } catch (err) {
    if (err?.code !== "ENOENT") {
      throw err;
    }
  }
}
|
|
46
|
+
// Tool patterns passed (comma-joined) to the judge's `--allowed-tools` flag.
const ALLOWED_TOOLS = [
  // file inspection
  "Read", "Grep", "Glob",
  // version-control commands via Bash
  "Bash(jj:*)", "Bash(git:*)",
  // MCP tool namespaces
  "mcp__graphiti__*", "mcp__indusk__*", "mcp__codegraphcontext__*",
];
|
|
56
|
+
/**
 * Extract the scorecard text, usage numbers and session id from the stdout of
 * a `claude --print --output-format json` run.
 *
 * - If stdout is a JSON envelope, the scorecard text is taken from its
 *   `result`, `text` or `content` field (first one present); otherwise stdout
 *   itself is used.
 * - A non-string payload is re-serialised with `JSON.stringify` so the caller
 *   can still `JSON.parse` it, instead of failing on `.match` with a TypeError.
 * - If the text contains a fenced code block (``` or ```json), only the
 *   contents of the first such block are returned.
 *
 * @param {string} stdout Raw stdout captured from the CLI.
 * @returns {{ scorecardText: string, usage: object | undefined, sessionId: string | undefined }}
 *   `usage` is set only when the envelope carries `total_cost_usd` or `usage`;
 *   `sessionId` is set only when `session_id` is a non-empty string.
 */
function parseClaudeOutput(stdout) {
  const raw = typeof stdout === "string" ? stdout : String(stdout ?? "");
  let scorecardText = raw;
  let usage;
  let sessionId;
  try {
    const jsonOutput = JSON.parse(raw);
    const payload = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? raw;
    scorecardText = typeof payload === "string" ? payload : JSON.stringify(payload);
    // The id is later used as a CLI argument, so only accept a real string.
    if (typeof jsonOutput.session_id === "string" && jsonOutput.session_id.length > 0) {
      sessionId = jsonOutput.session_id;
    }
    if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
      const u = jsonOutput.usage ?? {};
      usage = {
        costUsd: jsonOutput.total_cost_usd ?? 0,
        inputTokens: u.input_tokens ?? 0,
        outputTokens: u.output_tokens ?? 0,
        cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
        cacheReadTokens: u.cache_read_input_tokens ?? 0,
        durationMs: jsonOutput.duration_ms ?? 0,
      };
    }
  } catch {
    // Not a JSON envelope (or not an object): fall through with what we have.
  }
  // Strip a surrounding markdown code fence, if the model added one.
  const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
  if (jsonMatch?.[1]) {
    scorecardText = jsonMatch[1];
  }
  return { scorecardText, usage, sessionId };
}
|
|
85
|
+
/**
 * Run the `claude` CLI once, feeding `prompt` on stdin and buffering its
 * stdout/stderr until the process finishes.
 *
 * The returned promise resolves (it does not reject) for asynchronous process
 * failures:
 * - normal termination resolves with the exit code reported by `'close'`;
 * - a failure to start the process (e.g. binary not found, bad `cwd`) resolves
 *   with `code: -1` and the error message appended to `stderr`, instead of
 *   surfacing as an unhandled `'error'` event that would crash the process.
 *
 * @param {string[]} args Command-line arguments for `claude`.
 * @param {string} prompt Text written to the child's stdin.
 * @param {string} cwd Working directory for the child process.
 * @returns {Promise<{ stdout: string, stderr: string, code: number | null }>}
 */
async function spawnClaude(args, prompt, cwd) {
  return new Promise((resolve) => {
    let stdout = "";
    let stderr = "";
    let settled = false;
    // 'error' and 'close' can both fire for one child; report only the first.
    const finish = (code) => {
      if (settled) {
        return;
      }
      settled = true;
      resolve({ stdout, stderr, code });
    };
    const child = spawn("claude", args, {
      cwd,
      stdio: ["pipe", "pipe", "pipe"],
      env: { ...process.env },
    });
    child.on("error", (err) => {
      stderr += `Failed to start claude: ${err instanceof Error ? err.message : String(err)}`;
      finish(-1);
    });
    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted by per-chunk toString().
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("close", (code) => {
      finish(code);
    });
    // The child may exit before consuming stdin (EPIPE); the outcome is
    // already reported through 'close'/'error', so do not let this throw.
    child.stdin?.on("error", () => {});
    child.stdin?.write(prompt);
    child.stdin?.end();
  });
}
|
|
107
|
+
/**
 * Run one judge evaluation, reusing a persistent `claude` session when one is
 * recorded for the project.
 *
 * - With a stored session: resumes it and asks only about the new change.
 * - Without one: builds the full judge prompt (rubric + catchup) and starts a
 *   new session.
 * - If a resumed run exits non-zero, the stored session is cleared and the
 *   eval is retried once from scratch (the retry finds no session, so it
 *   cannot loop).
 *
 * On success the scorecard is appended to the eval log, ingested as findings,
 * and returned. Session state is only persisted when a real session id is
 * known; a placeholder id is never written, because the next eval would try
 * to resume it and fail.
 *
 * Failures inside the run are not thrown: they are logged and returned as an
 * error entry.
 *
 * @param {{ projectRoot: string, changeId: string, transcriptPath: string, mode: "eval" | "baseline", evalEndpoint?: string }} opts
 * @returns {Promise<object>} The scorecard, or an error entry (`error: true`).
 */
export async function runPersistentEval(opts) {
  const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
  const session = readSession(opts.projectRoot);
  const projectGroup = getProjectGroupId(opts.projectRoot);
  try {
    let result;
    if (session) {
      // Resume existing session — cheap eval, no catchup
      const resumePrompt = `Evaluate a new commit. Change ID: ${opts.changeId}

Run \`jj diff -r ${opts.changeId}\` to see what changed. Then answer the same evaluation questions as before. Read the changed files for full context.

Output ONLY the JSON scorecard as before — no commentary.`;
      result = await spawnClaude([
        "--print",
        "--output-format",
        "json",
        "--resume",
        session.sessionId,
        "--allowed-tools",
        ALLOWED_TOOLS.join(","),
      ], resumePrompt, opts.projectRoot);
    } else {
      // First eval — full catchup + evaluation
      const fullPrompt = buildJudgePrompt({
        rubric: V1_RUBRIC,
        changeId: opts.changeId,
        transcriptPath: opts.transcriptPath,
        mode: opts.mode,
        projectGroup,
      });
      result = await spawnClaude([
        "--print",
        "--output-format",
        "json",
        "--model",
        "opus",
        "--permission-mode",
        "acceptEdits",
        "--allowed-tools",
        ALLOWED_TOOLS.join(","),
      ], fullPrompt, opts.projectRoot);
    }
    if (result.code !== 0) {
      // If resuming failed, clear session and retry with full catchup
      if (session) {
        clearSession(opts.projectRoot);
        return runPersistentEval(opts);
      }
      throw new Error(`claude exited with code ${result.code}: ${result.stderr.slice(0, 500)}`);
    }
    const parsed = parseClaudeOutput(result.stdout);
    const scorecard = JSON.parse(parsed.scorecardText.trim());
    if (scorecard === null || typeof scorecard !== "object") {
      // Fail with a clear message rather than a TypeError on property assignment.
      throw new Error("judge output is not a JSON object scorecard");
    }
    if (parsed.usage) {
      scorecard.usage = parsed.usage;
    }
    scorecard.telemetryPosted = false;
    // Update session state — only when there is a real id to resume later.
    const sessionId = parsed.sessionId ?? session?.sessionId;
    if (sessionId) {
      const now = new Date().toISOString();
      writeSession(opts.projectRoot, {
        sessionId,
        createdAt: session?.createdAt ?? now,
        lastEvalAt: now,
        evalCount: (session?.evalCount ?? 0) + 1,
      });
    }
    await logWriter.append(scorecard);
    ingestScorecard(opts.projectRoot, scorecard);
    return scorecard;
  } catch (err) {
    const errorEntry = {
      version: 1,
      timestamp: new Date().toISOString(),
      mode: opts.mode,
      changeId: opts.changeId,
      error: true,
      message: err instanceof Error ? err.message : String(err),
    };
    await logWriter.append(errorEntry);
    return errorEntry;
  }
}
|
package/hooks/eval-trigger.js
CHANGED
|
@@ -11,10 +11,24 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
import { execSync, spawn } from "node:child_process";
|
|
14
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
14
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
|
|
15
15
|
import { dirname, resolve } from "node:path";
|
|
16
16
|
import { fileURLToPath } from "node:url";
|
|
17
17
|
|
|
18
|
+
/**
 * Best-effort debug logger for the hook.
 *
 * Appends one line, prefixed with an ISO timestamp, to
 * `<projectRoot>/.indusk/eval/hook.log`, creating the directory if needed.
 * A falsy `projectRoot` means the current directory. Every failure is
 * swallowed: logging must never break the hook.
 */
function hookLog(projectRoot, msg) {
  try {
    const baseDir = projectRoot ? projectRoot : ".";
    const evalDir = resolve(baseDir, ".indusk", "eval");
    const logFile = resolve(evalDir, "hook.log");
    mkdirSync(evalDir, { recursive: true });
    const entry = `${new Date().toISOString()} ${msg}\n`;
    appendFileSync(logFile, entry);
  } catch {
    // intentionally ignored
  }
}
|
|
31
|
+
|
|
18
32
|
// Read hook input from stdin
|
|
19
33
|
let input = "";
|
|
20
34
|
for await (const chunk of process.stdin) {
|
|
@@ -24,9 +38,13 @@ for await (const chunk of process.stdin) {
|
|
|
24
38
|
const event = JSON.parse(input);
|
|
25
39
|
const toolInput = event.tool_input ?? {};
|
|
26
40
|
const command = toolInput.command ?? "";
|
|
41
|
+
const cwd = event.cwd ?? process.cwd();
|
|
42
|
+
|
|
43
|
+
hookLog(cwd, `hook fired — tool: ${event.tool_name}, command: ${command.slice(0, 100)}`);
|
|
27
44
|
|
|
28
45
|
// Fast path: not a jj describe command
|
|
29
46
|
if (!command.includes("jj describe")) {
|
|
47
|
+
hookLog(cwd, "skip — no jj describe in command");
|
|
30
48
|
process.exit(0);
|
|
31
49
|
}
|
|
32
50
|
|
|
@@ -61,11 +79,14 @@ function readEvalConfig(projectRoot) {
|
|
|
61
79
|
}
|
|
62
80
|
}
|
|
63
81
|
|
|
64
|
-
const projectRoot = findProjectRoot(
|
|
82
|
+
const projectRoot = findProjectRoot(cwd);
|
|
65
83
|
const evalConfig = readEvalConfig(projectRoot);
|
|
66
84
|
|
|
85
|
+
hookLog(projectRoot, `projectRoot: ${projectRoot}, eval.enabled: ${evalConfig.enabled}`);
|
|
86
|
+
|
|
67
87
|
// Check if eval is disabled
|
|
68
88
|
if (!evalConfig.enabled) {
|
|
89
|
+
hookLog(projectRoot, "skip — eval disabled in config");
|
|
69
90
|
process.exit(0);
|
|
70
91
|
}
|
|
71
92
|
|
|
@@ -117,11 +138,13 @@ const candidates = [
|
|
|
117
138
|
];
|
|
118
139
|
let judgeRunnerPath = null;
|
|
119
140
|
for (const c of candidates) {
|
|
141
|
+
hookLog(projectRoot, `candidate: ${c} — ${existsSync(c) ? "found" : "missing"}`);
|
|
120
142
|
if (existsSync(c)) {
|
|
121
143
|
judgeRunnerPath = c;
|
|
122
144
|
break;
|
|
123
145
|
}
|
|
124
146
|
}
|
|
147
|
+
hookLog(projectRoot, `judgeRunnerPath: ${judgeRunnerPath ?? "NOT FOUND"}`);
|
|
125
148
|
|
|
126
149
|
if (!judgeRunnerPath) {
|
|
127
150
|
// Can't find the package — log error and exit
|
|
@@ -160,10 +183,16 @@ if (existsSync(findingsPath)) {
|
|
|
160
183
|
}
|
|
161
184
|
}
|
|
162
185
|
|
|
163
|
-
//
|
|
186
|
+
// Use persistent judge — resumes existing session if available, otherwise does full catchup.
|
|
187
|
+
const persistentJudgePath = judgeRunnerPath.replace("judge-runner.js", "persistent-judge.js");
|
|
188
|
+
const useModule = existsSync(persistentJudgePath) ? persistentJudgePath : judgeRunnerPath;
|
|
189
|
+
const useFunction = existsSync(persistentJudgePath) ? "runPersistentEval" : "runJudgeSync";
|
|
190
|
+
|
|
191
|
+
hookLog(projectRoot, `spawning judge — module: ${useModule}, function: ${useFunction}, changeId: ${changeId}`);
|
|
192
|
+
|
|
164
193
|
const judgeScript = `
|
|
165
|
-
import("${
|
|
166
|
-
.then(m => m
|
|
194
|
+
import("${useModule}")
|
|
195
|
+
.then(m => m.${useFunction}({
|
|
167
196
|
projectRoot: ${JSON.stringify(projectRoot)},
|
|
168
197
|
changeId: ${JSON.stringify(changeId)},
|
|
169
198
|
transcriptPath: ${JSON.stringify(transcriptPath)},
|