@infinitedusky/indusk-mcp 1.11.0 → 1.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* The judge is a detached child process so the calling hook can exit immediately.
|
|
6
6
|
* Results appear asynchronously in `.indusk/eval/results.log`.
|
|
7
7
|
*/
|
|
8
|
-
import {
|
|
8
|
+
import { spawn } from "node:child_process";
|
|
9
9
|
import { join } from "node:path";
|
|
10
10
|
import { getProjectGroupId } from "../config.js";
|
|
11
11
|
import { EvalLogWriter } from "./log-writer.js";
|
|
@@ -14,14 +14,6 @@ import { V1_RUBRIC } from "./rubric.js";
|
|
|
14
14
|
function getEvalLogPath(projectRoot) {
|
|
15
15
|
return join(projectRoot, ".indusk", "eval", "results.log");
|
|
16
16
|
}
|
|
17
|
-
function getDiff(changeId) {
|
|
18
|
-
try {
|
|
19
|
-
return execSync(`jj diff -r ${changeId}`, { encoding: "utf8", maxBuffer: 10 * 1024 * 1024 });
|
|
20
|
-
}
|
|
21
|
-
catch {
|
|
22
|
-
return "(diff unavailable)";
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
17
|
async function postTelemetry(endpoint, scorecard) {
|
|
26
18
|
try {
|
|
27
19
|
const controller = new AbortController();
|
|
@@ -46,13 +38,11 @@ async function postTelemetry(endpoint, scorecard) {
|
|
|
46
38
|
* If anything fails, logs an error entry instead of silently dropping.
|
|
47
39
|
*/
|
|
48
40
|
export function runJudgeBackground(opts) {
|
|
49
|
-
const diff = getDiff(opts.changeId);
|
|
50
41
|
const projectGroup = getProjectGroupId(opts.projectRoot);
|
|
51
42
|
const prompt = buildJudgePrompt({
|
|
52
43
|
rubric: V1_RUBRIC,
|
|
53
44
|
changeId: opts.changeId,
|
|
54
45
|
transcriptPath: opts.transcriptPath,
|
|
55
|
-
diff,
|
|
56
46
|
mode: opts.mode,
|
|
57
47
|
projectGroup,
|
|
58
48
|
});
|
|
@@ -75,16 +65,18 @@ export function runJudgeBackground(opts) {
|
|
|
75
65
|
"--permission-mode",
|
|
76
66
|
"acceptEdits",
|
|
77
67
|
"--allowed-tools",
|
|
78
|
-
|
|
79
|
-
prompt,
|
|
68
|
+
allowedTools.join(","),
|
|
80
69
|
];
|
|
70
|
+
// Not detached — the eval-trigger hook already spawns this in a separate
|
|
71
|
+
// node process. Detaching + unref causes the close handler to never fire.
|
|
81
72
|
const child = spawn("claude", args, {
|
|
82
73
|
cwd: opts.projectRoot,
|
|
83
|
-
stdio: ["
|
|
84
|
-
detached: true,
|
|
74
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
85
75
|
env: { ...process.env },
|
|
86
76
|
});
|
|
87
|
-
|
|
77
|
+
// Pipe the prompt via stdin (too large for CLI arg)
|
|
78
|
+
child.stdin?.write(prompt);
|
|
79
|
+
child.stdin?.end();
|
|
88
80
|
let stdout = "";
|
|
89
81
|
let stderr = "";
|
|
90
82
|
child.stdout?.on("data", (chunk) => {
|
|
@@ -140,13 +132,11 @@ export function runJudgeBackground(opts) {
|
|
|
140
132
|
* Returns the scorecard or error entry.
|
|
141
133
|
*/
|
|
142
134
|
export async function runJudgeSync(opts) {
|
|
143
|
-
const diff = getDiff(opts.changeId);
|
|
144
135
|
const projectGroup = getProjectGroupId(opts.projectRoot);
|
|
145
136
|
const prompt = buildJudgePrompt({
|
|
146
137
|
rubric: V1_RUBRIC,
|
|
147
138
|
changeId: opts.changeId,
|
|
148
139
|
transcriptPath: opts.transcriptPath,
|
|
149
|
-
diff,
|
|
150
140
|
mode: opts.mode,
|
|
151
141
|
projectGroup,
|
|
152
142
|
});
|
|
@@ -169,15 +159,16 @@ export async function runJudgeSync(opts) {
|
|
|
169
159
|
"--permission-mode",
|
|
170
160
|
"acceptEdits",
|
|
171
161
|
"--allowed-tools",
|
|
172
|
-
|
|
173
|
-
prompt,
|
|
162
|
+
allowedTools.join(","),
|
|
174
163
|
];
|
|
175
164
|
return new Promise((resolve) => {
|
|
176
165
|
const child = spawn("claude", args, {
|
|
177
166
|
cwd: opts.projectRoot,
|
|
178
|
-
stdio: ["
|
|
167
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
179
168
|
env: { ...process.env },
|
|
180
169
|
});
|
|
170
|
+
child.stdin?.write(prompt);
|
|
171
|
+
child.stdin?.end();
|
|
181
172
|
let stdout = "";
|
|
182
173
|
let stderr = "";
|
|
183
174
|
child.stdout?.on("data", (chunk) => {
|
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
* Builds the judge agent's system prompt.
|
|
3
3
|
*
|
|
4
4
|
* The prompt instructs the judge to: do catchup, read the transcript, read the
|
|
5
|
-
* diff, answer each rubric question, write findings to Graphiti
|
|
6
|
-
* only), and output a JSON scorecard.
|
|
5
|
+
* diff itself via jj, answer each rubric question, write findings to Graphiti
|
|
6
|
+
* (eval mode only), and output a JSON scorecard.
|
|
7
|
+
*
|
|
8
|
+
* The diff is NOT embedded in the prompt — the judge reads it via tool calls.
|
|
9
|
+
* This keeps the prompt small regardless of commit size.
|
|
7
10
|
*/
|
|
8
11
|
import type { RubricQuestion } from "./types.js";
|
|
9
12
|
export interface PromptBuilderOptions {
|
|
10
13
|
rubric: RubricQuestion[];
|
|
11
14
|
changeId: string;
|
|
12
15
|
transcriptPath: string;
|
|
13
|
-
diff: string;
|
|
14
16
|
mode: "eval" | "baseline";
|
|
15
17
|
projectGroup: string;
|
|
16
18
|
}
|
|
@@ -2,8 +2,11 @@
|
|
|
2
2
|
* Builds the judge agent's system prompt.
|
|
3
3
|
*
|
|
4
4
|
* The prompt instructs the judge to: do catchup, read the transcript, read the
|
|
5
|
-
* diff, answer each rubric question, write findings to Graphiti
|
|
6
|
-
* only), and output a JSON scorecard.
|
|
5
|
+
* diff itself via jj, answer each rubric question, write findings to Graphiti
|
|
6
|
+
* (eval mode only), and output a JSON scorecard.
|
|
7
|
+
*
|
|
8
|
+
* The diff is NOT embedded in the prompt — the judge reads it via tool calls.
|
|
9
|
+
* This keeps the prompt small regardless of commit size.
|
|
7
10
|
*/
|
|
8
11
|
export function buildJudgePrompt(opts) {
|
|
9
12
|
const questionsBlock = opts.rubric
|
|
@@ -54,13 +57,9 @@ This is the JSONL record of the working agent's session. Read it to understand:
|
|
|
54
57
|
|
|
55
58
|
### Step 3: Read the diff
|
|
56
59
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
\`\`\`
|
|
60
|
-
${opts.diff}
|
|
61
|
-
\`\`\`
|
|
60
|
+
Run \`jj diff -r ${opts.changeId}\` to see what was committed. This is the work being evaluated.
|
|
62
61
|
|
|
63
|
-
|
|
62
|
+
Then read the specific files that were changed to understand the full context — not just the diff lines, but the surrounding code.
|
|
64
63
|
|
|
65
64
|
### Step 4: Answer the evaluation questions
|
|
66
65
|
|
package/hooks/eval-trigger.js
CHANGED
|
@@ -90,17 +90,18 @@ const transcriptPath =
|
|
|
90
90
|
"(transcript unavailable)";
|
|
91
91
|
|
|
92
92
|
// Spawn the judge runner as a detached background process.
|
|
93
|
-
//
|
|
94
|
-
//
|
|
93
|
+
// Spawn a detached node process that calls runJudgeSync (which awaits completion).
|
|
94
|
+
// runJudgeSync keeps the process alive until claude --print finishes and logs the result.
|
|
95
95
|
const judgeScript = `
|
|
96
96
|
import("${resolve(projectRoot, "apps/indusk-mcp/dist/lib/eval/judge-runner.js")}")
|
|
97
|
-
.then(m => m.
|
|
97
|
+
.then(m => m.runJudgeSync({
|
|
98
98
|
projectRoot: ${JSON.stringify(projectRoot)},
|
|
99
99
|
changeId: ${JSON.stringify(changeId)},
|
|
100
100
|
transcriptPath: ${JSON.stringify(transcriptPath)},
|
|
101
101
|
mode: "eval",
|
|
102
102
|
evalEndpoint: ${JSON.stringify(evalConfig.endpoint)},
|
|
103
103
|
}))
|
|
104
|
+
.then(() => process.exit(0))
|
|
104
105
|
.catch(err => {
|
|
105
106
|
const fs = require("fs");
|
|
106
107
|
const path = require("path");
|
|
@@ -115,6 +116,7 @@ import("${resolve(projectRoot, "apps/indusk-mcp/dist/lib/eval/judge-runner.js")}
|
|
|
115
116
|
message: err.message || String(err),
|
|
116
117
|
});
|
|
117
118
|
fs.appendFileSync(logPath, entry + "\\n", "utf8");
|
|
119
|
+
process.exit(1);
|
|
118
120
|
});
|
|
119
121
|
`;
|
|
120
122
|
|