@infinitedusky/indusk-mcp 1.17.0 → 1.18.0
- package/dist/bin/commands/eval.js +2 -2
- package/dist/lib/eval/evaluator-runner.d.ts +28 -0
- package/dist/lib/eval/evaluator-runner.js +266 -0
- package/dist/lib/eval/otel.d.ts +61 -0
- package/dist/lib/eval/otel.js +177 -0
- package/dist/lib/eval/persistent-evaluator.d.ts +20 -0
- package/dist/lib/eval/persistent-evaluator.js +244 -0
- package/dist/lib/eval/prompt-builder.d.ts +4 -4
- package/dist/lib/eval/prompt-builder.js +5 -5
- package/dist/lib/eval/types.d.ts +1 -1
- package/dist/lib/eval/types.js +1 -1
- package/hooks/eval-trigger.js +35 -27
- package/package.json +7 -1
- package/skills/eval-review.md +7 -7
- package/skills/handoff.md +1 -1

package/dist/bin/commands/eval.js
CHANGED
@@ -212,7 +212,7 @@ export async function evalBaseline(projectRoot, opts) {
     }
     // Run the smart evaluator against the baseline
     console.info("Running smart evaluator against baseline...");
-    const {
+    const { runEvaluatorSync } = await import("../../lib/eval/evaluator-runner.js");
     let changeId;
     try {
         changeId = execSync("jj log -r @ --no-graph -T change_id", {
@@ -223,7 +223,7 @@ export async function evalBaseline(projectRoot, opts) {
     catch {
         changeId = "baseline-unknown";
     }
-    const evalResult = await
+    const evalResult = await runEvaluatorSync({
         projectRoot: worktreePath,
         changeId,
         transcriptPath: "(baseline — no transcript)",

package/dist/lib/eval/evaluator-runner.d.ts
ADDED
@@ -0,0 +1,28 @@
+/**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+export interface EvaluatorRunOptions {
+    projectRoot: string;
+    changeId: string;
+    transcriptPath: string;
+    mode: "eval" | "baseline";
+    evalEndpoint?: string;
+}
+/**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+export declare function runEvaluatorBackground(opts: EvaluatorRunOptions): void;
+/**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+export declare function runEvaluatorSync(opts: EvaluatorRunOptions): Promise<EvalScorecard | EvalErrorEntry>;

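For orientation, a minimal sketch of driving the new runner by hand, mirroring the `evalBaseline` change above. The deep import path is an assumption (the package may not expose it through its `exports` map), and the option values are placeholders:

```typescript
// Sketch only; deep import path and option values are assumptions.
import { runEvaluatorSync } from "@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js";

const result = await runEvaluatorSync({
  projectRoot: "/path/to/repo",         // must contain a .indusk/ directory
  changeId: "abc12345",                 // jj change ID to evaluate
  transcriptPath: "/tmp/session.jsonl", // session transcript for the rubric
  mode: "eval",                         // or "baseline"
});

// EvalErrorEntry carries `error: true`; scorecards do not (see isScorecard in types.js).
if ("error" in result) {
  console.error(`eval failed: ${result.message}`);
} else {
  console.log("scorecard appended to .indusk/eval/results.log");
}
```
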
package/dist/lib/eval/evaluator-runner.js
ADDED
@@ -0,0 +1,266 @@
+/**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+import { spawn } from "node:child_process";
+import { join } from "node:path";
+import { getProjectGroupId } from "../config.js";
+import { ingestScorecard } from "./findings.js";
+import { EvalLogWriter } from "./log-writer.js";
+import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+import { buildEvaluatorPrompt } from "./prompt-builder.js";
+import { V1_RUBRIC } from "./rubric.js";
+function getEvalLogPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "results.log");
+}
+async function postTelemetry(endpoint, scorecard) {
+    try {
+        const controller = new AbortController();
+        const timeout = setTimeout(() => controller.abort(), 5000);
+        await fetch(endpoint, {
+            method: "POST",
+            headers: { "Content-Type": "application/json" },
+            body: JSON.stringify(scorecard),
+            signal: controller.signal,
+        });
+        clearTimeout(timeout);
+    }
+    catch {
+        // fire-and-forget — silently ignore errors
+    }
+}
+/**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+export function runEvaluatorBackground(opts) {
+    const projectGroup = getProjectGroupId(opts.projectRoot);
+    const prompt = buildEvaluatorPrompt({
+        rubric: V1_RUBRIC,
+        changeId: opts.changeId,
+        transcriptPath: opts.transcriptPath,
+        mode: opts.mode,
+        projectGroup,
+    });
+    const allowedTools = [
+        "Read",
+        "Grep",
+        "Glob",
+        "Bash(jj:*)",
+        "Bash(git:*)",
+        "mcp__graphiti__*",
+        "mcp__indusk__*",
+        "mcp__codegraphcontext__*",
+    ];
+    const args = [
+        "--print",
+        "--output-format",
+        "json",
+        "--model",
+        "opus",
+        "--permission-mode",
+        "acceptEdits",
+        "--allowed-tools",
+        allowedTools.join(","),
+    ];
+    // Not detached — the eval-trigger hook already spawns this in a separate
+    // node process. Detaching + unref causes the close handler to never fire.
+    const child = spawn("claude", args, {
+        cwd: opts.projectRoot,
+        stdio: ["pipe", "pipe", "pipe"],
+        env: { ...process.env },
+    });
+    // Pipe the prompt via stdin (too large for CLI arg)
+    child.stdin?.write(prompt);
+    child.stdin?.end();
+    let stdout = "";
+    let stderr = "";
+    child.stdout?.on("data", (chunk) => {
+        stdout += chunk.toString();
+    });
+    child.stderr?.on("data", (chunk) => {
+        stderr += chunk.toString();
+    });
+    child.on("close", async (code) => {
+        const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+        try {
+            if (code !== 0) {
+                throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+            }
+            // --output-format json wraps the result; extract the text content and usage
+            let scorecardText = stdout;
+            let usage;
+            try {
+                const jsonOutput = JSON.parse(stdout);
+                scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+                // Capture usage data from claude --print output
+                if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+                    const u = jsonOutput.usage ?? {};
+                    usage = {
+                        costUsd: jsonOutput.total_cost_usd ?? 0,
+                        inputTokens: u.input_tokens ?? 0,
+                        outputTokens: u.output_tokens ?? 0,
+                        cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+                        cacheReadTokens: u.cache_read_input_tokens ?? 0,
+                        durationMs: jsonOutput.duration_ms ?? 0,
+                    };
+                }
+            }
+            catch {
+                // stdout might be raw JSON scorecard already
+            }
+            // Extract JSON from possible markdown code fences
+            const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+            if (jsonMatch?.[1]) {
+                scorecardText = jsonMatch[1];
+            }
+            const scorecard = JSON.parse(scorecardText.trim());
+            if (usage)
+                scorecard.usage = usage;
+            scorecard.telemetryPosted = false;
+            if (opts.evalEndpoint) {
+                await postTelemetry(opts.evalEndpoint, scorecard);
+                scorecard.telemetryPosted = true;
+            }
+            await logWriter.append(scorecard);
+            ingestScorecard(opts.projectRoot, scorecard);
+        }
+        catch (err) {
+            const errorEntry = {
+                version: 1,
+                timestamp: new Date().toISOString(),
+                mode: opts.mode,
+                changeId: opts.changeId,
+                error: true,
+                message: err instanceof Error ? err.message : String(err),
+            };
+            await logWriter.append(errorEntry);
+        }
+    });
+}
+/**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+export async function runEvaluatorSync(opts) {
+    const tracer = initEvalOtel(opts.projectRoot);
+    const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+    const projectGroup = getProjectGroupId(opts.projectRoot);
+    const result = await withSpan(tracer, "eval.run", {
+        changeId: opts.changeId,
+        source,
+        mode: opts.mode,
+        projectGroup,
+        entrypoint: "runEvaluatorSync",
+    }, () => runEvaluatorSyncInner(opts, projectGroup));
+    await shutdownEvalOtel();
+    return result;
+}
+async function runEvaluatorSyncInner(opts, projectGroup) {
+    const prompt = buildEvaluatorPrompt({
+        rubric: V1_RUBRIC,
+        changeId: opts.changeId,
+        transcriptPath: opts.transcriptPath,
+        mode: opts.mode,
+        projectGroup,
+    });
+    const allowedTools = [
+        "Read",
+        "Grep",
+        "Glob",
+        "Bash(jj:*)",
+        "Bash(git:*)",
+        "mcp__graphiti__*",
+        "mcp__indusk__*",
+        "mcp__codegraphcontext__*",
+    ];
+    const args = [
+        "--print",
+        "--output-format",
+        "json",
+        "--model",
+        "opus",
+        "--permission-mode",
+        "acceptEdits",
+        "--allowed-tools",
+        allowedTools.join(","),
+    ];
+    return new Promise((resolve) => {
+        const child = spawn("claude", args, {
+            cwd: opts.projectRoot,
+            stdio: ["pipe", "pipe", "pipe"],
+            env: { ...process.env },
+        });
+        child.stdin?.write(prompt);
+        child.stdin?.end();
+        let stdout = "";
+        let stderr = "";
+        child.stdout?.on("data", (chunk) => {
+            stdout += chunk.toString();
+        });
+        child.stderr?.on("data", (chunk) => {
+            stderr += chunk.toString();
+        });
+        child.on("close", async (code) => {
+            const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+            try {
+                if (code !== 0) {
+                    throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+                }
+                let scorecardText = stdout;
+                let syncUsage;
+                try {
+                    const jsonOutput = JSON.parse(stdout);
+                    scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+                    if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+                        const u = jsonOutput.usage ?? {};
+                        syncUsage = {
+                            costUsd: jsonOutput.total_cost_usd ?? 0,
+                            inputTokens: u.input_tokens ?? 0,
+                            outputTokens: u.output_tokens ?? 0,
+                            cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+                            cacheReadTokens: u.cache_read_input_tokens ?? 0,
+                            durationMs: jsonOutput.duration_ms ?? 0,
+                        };
+                    }
+                }
+                catch {
+                    // raw JSON
+                }
+                const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+                if (jsonMatch?.[1]) {
+                    scorecardText = jsonMatch[1];
+                }
+                const scorecard = JSON.parse(scorecardText.trim());
+                if (syncUsage)
+                    scorecard.usage = syncUsage;
+                scorecard.telemetryPosted = false;
+                if (opts.evalEndpoint) {
+                    await postTelemetry(opts.evalEndpoint, scorecard);
+                    scorecard.telemetryPosted = true;
+                }
+                await logWriter.append(scorecard);
+                ingestScorecard(opts.projectRoot, scorecard);
+                resolve(scorecard);
+            }
+            catch (err) {
+                const errorEntry = {
+                    version: 1,
+                    timestamp: new Date().toISOString(),
+                    mode: opts.mode,
+                    changeId: opts.changeId,
+                    error: true,
+                    message: err instanceof Error ? err.message : String(err),
+                };
+                await logWriter.append(errorEntry);
+                resolve(errorEntry);
+            }
+        });
+    });
+}

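Both code paths above parse the same wrapper emitted by `claude --print --output-format json`. A sketch of that shape as this file consumes it; the field list is inferred purely from the property accesses above (plus `session_id`, read by `persistent-evaluator.js` below), not from any published CLI schema:

```typescript
// Inferred from the accesses in evaluator-runner.js; an assumption, not an official schema.
interface ClaudePrintEnvelope {
  result?: string;     // evaluator output: the scorecard JSON, possibly inside a ```json fence
  text?: string;       // alternative fields the parser falls back to
  content?: string;
  total_cost_usd?: number;
  duration_ms?: number;
  session_id?: string; // consumed by persistent-evaluator.js to resume sessions
  usage?: {
    input_tokens?: number;
    output_tokens?: number;
    cache_creation_input_tokens?: number;
    cache_read_input_tokens?: number;
  };
}
```

If stdout does not parse as this envelope, the raw text is treated as the scorecard itself, and any surrounding markdown fence is stripped before `JSON.parse`.
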
package/dist/lib/eval/otel.d.ts
ADDED
@@ -0,0 +1,61 @@
+/**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+import { type Attributes, type Span, type Tracer } from "@opentelemetry/api";
+export interface EvalOtelConfig {
+    enabled: boolean;
+    endpoint: string | null;
+    dataset: string;
+}
+/**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ *   else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+export declare function isEvalOtelEnabled(projectRoot: string): EvalOtelConfig;
+/**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+export declare function initEvalOtel(projectRoot: string): Tracer;
+/**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+export declare function withSpan<T>(tracer: Tracer, name: string, attrs: Attributes | undefined, fn: (span: Span) => Promise<T> | T): Promise<T>;
+/**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+export declare function shutdownEvalOtel(): Promise<void>;
+/**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+export declare function __resetEvalOtelForTests(): void;

package/dist/lib/eval/otel.js
ADDED
@@ -0,0 +1,177 @@
+/**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { SpanStatusCode, trace } from "@opentelemetry/api";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
+import { resourceFromAttributes } from "@opentelemetry/resources";
+import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
+import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+const TRACER_NAME = "@infinitedusky/indusk-mcp/eval";
+const SERVICE_NAME = "indusk-eval-agent";
+function syslog(projectRoot, msg) {
+    try {
+        const logDir = resolve(projectRoot, ".indusk", "eval");
+        mkdirSync(logDir, { recursive: true });
+        appendFileSync(resolve(logDir, "system.log"), `${new Date().toISOString()} ${msg}\n`);
+    }
+    catch {
+        // logging should never break anything
+    }
+}
+const DEFAULT_DATASET = "agent";
+/**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ *   else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+export function isEvalOtelEnabled(projectRoot) {
+    const envFlag = process.env.INDUSK_EVAL_OTEL;
+    const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? null;
+    const envDataset = process.env.INDUSK_EVAL_OTEL_DATASET;
+    let configEnabled = false;
+    let configDataset;
+    const configPath = join(projectRoot, ".indusk", "config.json");
+    if (existsSync(configPath)) {
+        try {
+            const config = JSON.parse(readFileSync(configPath, "utf-8"));
+            configEnabled = config?.eval?.otel?.enabled === true;
+            if (typeof config?.eval?.otel?.dataset === "string") {
+                configDataset = config.eval.otel.dataset;
+            }
+        }
+        catch {
+            // malformed config — treat as disabled
+        }
+    }
+    const envForcesEnabled = envFlag !== undefined && envFlag !== "" && envFlag !== "0" && envFlag.toLowerCase() !== "false";
+    const dataset = envDataset && envDataset !== "" ? envDataset : (configDataset ?? DEFAULT_DATASET);
+    return {
+        enabled: envForcesEnabled || configEnabled,
+        endpoint,
+        dataset,
+    };
+}
+let activeProvider = null;
+/**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+export function initEvalOtel(projectRoot) {
+    const { enabled, endpoint, dataset } = isEvalOtelEnabled(projectRoot);
+    if (!enabled) {
+        return trace.getTracer(TRACER_NAME);
+    }
+    if (!endpoint) {
+        syslog(projectRoot, "eval.otel.enabled but OTEL_EXPORTER_OTLP_ENDPOINT is unset — falling back to no-op tracer");
+        return trace.getTracer(TRACER_NAME);
+    }
+    if (activeProvider) {
+        return trace.getTracer(TRACER_NAME);
+    }
+    try {
+        const exporter = new OTLPTraceExporter({
+            url: endpoint.endsWith("/v1/traces") ? endpoint : `${endpoint.replace(/\/$/, "")}/v1/traces`,
+            // Route agent spans to the Dash0 dataset named `dataset`. Default
+            // is "agent". Env-set headers (OTEL_EXPORTER_OTLP_HEADERS) take
+            // precedence — per the OTel SDK contract — so a user-provided
+            // Dash0-Dataset in env overrides this default.
+            headers: {
+                "Dash0-Dataset": dataset,
+            },
+        });
+        const provider = new NodeTracerProvider({
+            resource: resourceFromAttributes({
+                [ATTR_SERVICE_NAME]: SERVICE_NAME,
+            }),
+            spanProcessors: [new BatchSpanProcessor(exporter)],
+        });
+        provider.register();
+        activeProvider = provider;
+        syslog(projectRoot, `eval.otel initialized — endpoint: ${endpoint}, dataset: ${dataset}`);
+    }
+    catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        syslog(projectRoot, `eval.otel init failed — falling back to no-op tracer: ${message}`);
+    }
+    return trace.getTracer(TRACER_NAME);
+}
+/**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+export async function withSpan(tracer, name, attrs, fn) {
+    return tracer.startActiveSpan(name, { attributes: attrs ?? {} }, async (span) => {
+        try {
+            return await fn(span);
+        }
+        catch (err) {
+            span.recordException(err instanceof Error ? err : new Error(String(err)));
+            span.setStatus({ code: SpanStatusCode.ERROR });
+            throw err;
+        }
+        finally {
+            span.end();
+        }
+    });
+}
+/**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+export async function shutdownEvalOtel() {
+    if (!activeProvider)
+        return;
+    try {
+        await activeProvider.forceFlush();
+        await activeProvider.shutdown();
+    }
+    catch {
+        // shutdown is best-effort
+    }
+    finally {
+        activeProvider = null;
+    }
+}
+/**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+export function __resetEvalOtelForTests() {
+    // Tear down any provider left over from a previous test. This un-registers
+    // from the global OTel API, so `trace.getTracer()` falls back to the no-op
+    // tracer until a new provider is registered.
+    if (activeProvider) {
+        void activeProvider.shutdown().catch(() => { });
+    }
+    activeProvider = null;
+    trace.disable();
+}

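Taken together, a minimal sketch of turning tracing on and wrapping one step. The endpoint URL and project path are placeholders, and `doWork` is hypothetical; only the exported functions come from this file:

```typescript
import { initEvalOtel, withSpan, shutdownEvalOtel } from "./otel.js";

declare function doWork(): Promise<number>; // hypothetical workload

// Either set { "eval": { "otel": { "enabled": true, "dataset": "agent" } } }
// in .indusk/config.json, or force it per-process via the env:
process.env.INDUSK_EVAL_OTEL = "1";
process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://otlp.example.com"; // placeholder

const tracer = initEvalOtel("/path/to/repo"); // no-op tracer when disabled or misconfigured
const value = await withSpan(tracer, "eval.example_step", { changeId: "abc12345" }, async (span) => {
  span.setAttribute("items.count", 3);
  return doWork();
});
await shutdownEvalOtel(); // flush batched spans before the process exits
```

Note that `initEvalOtel` appends `/v1/traces` when the endpoint does not already end with it, so either form of `OTEL_EXPORTER_OTLP_ENDPOINT` works.
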
package/dist/lib/eval/persistent-evaluator.d.ts
ADDED
@@ -0,0 +1,20 @@
+/**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+/**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+export declare function runPersistentEval(opts: {
+    projectRoot: string;
+    changeId: string;
+    transcriptPath: string;
+    mode: "eval" | "baseline";
+    evalEndpoint?: string;
+}): Promise<EvalScorecard | EvalErrorEntry>;

package/dist/lib/eval/persistent-evaluator.js
ADDED
@@ -0,0 +1,244 @@
+/**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+import { spawn } from "node:child_process";
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { getProjectGroupId } from "../config.js";
+import { ingestScorecard } from "./findings.js";
+import { readUnprocessedHighlights } from "../highlights/highlights.js";
+import { EvalLogWriter } from "./log-writer.js";
+import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+import { buildEvaluatorPrompt } from "./prompt-builder.js";
+import { V1_RUBRIC } from "./rubric.js";
+function getSessionPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "evaluator-session.json");
+}
+function getEvalLogPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "results.log");
+}
+function readSession(projectRoot) {
+    const path = getSessionPath(projectRoot);
+    if (!existsSync(path))
+        return null;
+    try {
+        return JSON.parse(readFileSync(path, "utf8"));
+    }
+    catch {
+        return null;
+    }
+}
+function writeSession(projectRoot, session) {
+    const path = getSessionPath(projectRoot);
+    mkdirSync(dirname(path), { recursive: true });
+    writeFileSync(path, `${JSON.stringify(session, null, 2)}\n`);
+}
+function clearSession(projectRoot) {
+    const path = getSessionPath(projectRoot);
+    if (existsSync(path)) {
+        const { unlinkSync } = require("node:fs");
+        unlinkSync(path);
+    }
+}
+const ALLOWED_TOOLS = [
+    "Read",
+    "Grep",
+    "Glob",
+    "Bash(jj:*)",
+    "Bash(git:*)",
+    "mcp__graphiti__*",
+    "mcp__indusk__*",
+    "mcp__codegraphcontext__*",
+];
+function parseClaudeOutput(stdout) {
+    let scorecardText = stdout;
+    let usage;
+    let sessionId;
+    try {
+        const jsonOutput = JSON.parse(stdout);
+        scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+        sessionId = jsonOutput.session_id;
+        if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+            const u = jsonOutput.usage ?? {};
+            usage = {
+                costUsd: jsonOutput.total_cost_usd ?? 0,
+                inputTokens: u.input_tokens ?? 0,
+                outputTokens: u.output_tokens ?? 0,
+                cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+                cacheReadTokens: u.cache_read_input_tokens ?? 0,
+                durationMs: jsonOutput.duration_ms ?? 0,
+            };
+        }
+    }
+    catch {
+        // raw output
+    }
+    const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+    if (jsonMatch?.[1]) {
+        scorecardText = jsonMatch[1];
+    }
+    return { scorecardText, usage, sessionId };
+}
+async function spawnClaude(args, prompt, cwd) {
+    return new Promise((resolve) => {
+        const child = spawn("claude", args, {
+            cwd,
+            stdio: ["pipe", "pipe", "pipe"],
+            env: { ...process.env },
+        });
+        child.stdin?.write(prompt);
+        child.stdin?.end();
+        let stdout = "";
+        let stderr = "";
+        child.stdout?.on("data", (chunk) => {
+            stdout += chunk.toString();
+        });
+        child.stderr?.on("data", (chunk) => {
+            stderr += chunk.toString();
+        });
+        child.on("close", (code) => {
+            resolve({ stdout, stderr, code });
+        });
+    });
+}
+/**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+export async function runPersistentEval(opts) {
+    const tracer = initEvalOtel(opts.projectRoot);
+    const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+    const projectGroup = getProjectGroupId(opts.projectRoot);
+    // Peek at the highlights queue before spawning — gives us observability
+    // into how much work the Claude subprocess will do without having to
+    // span per-highlight (which would require Claude-Code-internal OTel).
+    let unprocessedCount = 0;
+    try {
+        unprocessedCount = readUnprocessedHighlights(opts.projectRoot).length;
+    }
+    catch {
+        // reading the queue is best-effort — never block the evaluator
+    }
+    const result = await withSpan(tracer, "eval.run", {
+        changeId: opts.changeId,
+        source,
+        mode: opts.mode,
+        projectGroup,
+        "highlights.unprocessed_count": unprocessedCount,
+    }, async (rootSpan) => {
+        const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+        const session = await withSpan(tracer, "eval.read_session", undefined, () => readSession(opts.projectRoot));
+        rootSpan.setAttribute("resumed", session !== null);
+        try {
+            const { args, prompt } = await withSpan(tracer, "eval.build_prompt", { resumed: session !== null }, () => {
+                if (session) {
+                    const resumePrompt = `Evaluate a new commit. Change ID: ${opts.changeId}
+
+Run \`jj diff -r ${opts.changeId}\` to see what changed. Then answer the same evaluation questions as before. Read the changed files for full context.
+
+Output ONLY the JSON scorecard as before — no commentary.`;
+                    return {
+                        args: [
+                            "--print",
+                            "--output-format",
+                            "json",
+                            "--resume",
+                            session.sessionId,
+                            "--allowed-tools",
+                            ALLOWED_TOOLS.join(","),
+                        ],
+                        prompt: resumePrompt,
+                    };
+                }
+                return {
+                    args: [
+                        "--print",
+                        "--output-format",
+                        "json",
+                        "--model",
+                        "opus",
+                        "--permission-mode",
+                        "acceptEdits",
+                        "--allowed-tools",
+                        ALLOWED_TOOLS.join(","),
+                    ],
+                    prompt: buildEvaluatorPrompt({
+                        rubric: V1_RUBRIC,
+                        changeId: opts.changeId,
+                        transcriptPath: opts.transcriptPath,
+                        mode: opts.mode,
+                        projectGroup,
+                    }),
+                };
+            });
+            const claudeResult = await withSpan(tracer, "eval.spawn_claude", {
+                "args.resumed": session !== null,
+                "args.model": session ? "(resumed)" : "opus",
+            }, async (span) => {
+                const spawned = await spawnClaude(args, prompt, opts.projectRoot);
+                span.setAttribute("exit.code", spawned.code ?? -1);
+                if (spawned.code !== 0) {
+                    span.setAttribute("exit.stderr_tail", spawned.stderr.slice(-500));
+                }
+                return spawned;
+            });
+            if (claudeResult.code !== 0) {
+                if (session) {
+                    await withSpan(tracer, "eval.clear_stale_session", undefined, () => clearSession(opts.projectRoot));
+                    // Recurse — the retry produces its own root span
+                    return runPersistentEval(opts);
+                }
+                throw new Error(`claude exited with code ${claudeResult.code}: ${claudeResult.stderr.slice(0, 500)}`);
+            }
+            const parsed = await withSpan(tracer, "eval.parse_output", undefined, (span) => {
+                const out = parseClaudeOutput(claudeResult.stdout);
+                if (out.sessionId)
+                    span.setAttribute("session_id", out.sessionId);
+                if (out.usage) {
+                    span.setAttribute("cost_usd", out.usage.costUsd);
+                    span.setAttribute("input_tokens", out.usage.inputTokens);
+                    span.setAttribute("output_tokens", out.usage.outputTokens);
+                }
+                return out;
+            });
+            const scorecard = JSON.parse(parsed.scorecardText.trim());
+            if (parsed.usage)
+                scorecard.usage = parsed.usage;
+            scorecard.telemetryPosted = false;
+            await withSpan(tracer, "eval.update_session", undefined, () => {
+                const newSession = {
+                    sessionId: parsed.sessionId ?? session?.sessionId ?? "unknown",
+                    createdAt: session?.createdAt ?? new Date().toISOString(),
+                    lastEvalAt: new Date().toISOString(),
+                    evalCount: (session?.evalCount ?? 0) + 1,
+                };
+                writeSession(opts.projectRoot, newSession);
+            });
+            await withSpan(tracer, "eval.write_scorecard", undefined, async () => {
+                await logWriter.append(scorecard);
+                ingestScorecard(opts.projectRoot, scorecard);
+            });
+            return scorecard;
+        }
+        catch (err) {
+            const errorEntry = {
+                version: 1,
+                timestamp: new Date().toISOString(),
+                mode: opts.mode,
+                changeId: opts.changeId,
+                error: true,
+                message: err instanceof Error ? err.message : String(err),
+            };
+            await logWriter.append(errorEntry);
+            return errorEntry;
+        }
+    });
+    // Flush OTel so batched spans ship before the detached process exits.
+    await shutdownEvalOtel();
+    return result;
+}

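For reference, the record that `writeSession` persists to `.indusk/eval/evaluator-session.json`, reconstructed from the object literal in the `eval.update_session` step above (a sketch, not a type the package exports):

```typescript
// Shape reconstructed from runPersistentEval; not exported by the package.
interface EvaluatorSession {
  sessionId: string;  // session_id captured from claude's JSON output; fed back via --resume
  createdAt: string;  // ISO timestamp of the first eval in this session
  lastEvalAt: string; // ISO timestamp of the most recent eval
  evalCount: number;  // incremented on every successful eval
}
```

Note the stale-session recovery above: on a non-zero claude exit while a session exists, the session file is cleared and `runPersistentEval` recurses once; the retry cannot loop because it runs with no stored session.
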
package/dist/lib/eval/prompt-builder.d.ts
CHANGED
@@ -1,11 +1,11 @@
 /**
- * Builds the
+ * Builds the evaluator agent's system prompt.
  *
- * The prompt instructs the
+ * The prompt instructs the evaluator to: do catchup, read the transcript, read the
  * diff itself via jj, answer each rubric question, write findings to Graphiti
  * (eval mode only), and output a JSON scorecard.
  *
- * The diff is NOT embedded in the prompt — the
+ * The diff is NOT embedded in the prompt — the evaluator reads it via tool calls.
  * This keeps the prompt small regardless of commit size.
  */
 import type { RubricQuestion } from "./types.js";
@@ -16,4 +16,4 @@ export interface PromptBuilderOptions {
     mode: "eval" | "baseline";
     projectGroup: string;
 }
-export declare function
+export declare function buildEvaluatorPrompt(opts: PromptBuilderOptions): string;

package/dist/lib/eval/prompt-builder.js
CHANGED
@@ -1,14 +1,14 @@
 /**
- * Builds the
+ * Builds the evaluator agent's system prompt.
  *
- * The prompt instructs the
+ * The prompt instructs the evaluator to: do catchup, read the transcript, read the
  * diff itself via jj, answer each rubric question, write findings to Graphiti
  * (eval mode only), and output a JSON scorecard.
  *
- * The diff is NOT embedded in the prompt — the
+ * The diff is NOT embedded in the prompt — the evaluator reads it via tool calls.
  * This keeps the prompt small regardless of commit size.
  */
-export function
+export function buildEvaluatorPrompt(opts) {
     const questionsBlock = opts.rubric
         .map((q, i) => `${i + 1}. **${q.id}**: ${q.question}\n Guidance: ${q.guidance}`)
         .join("\n\n");
@@ -62,7 +62,7 @@ If the tool is unavailable, skip silently and set graphitiWrites to 0.`
 ### Step 6: Graphiti writes
 
 Baseline mode — do NOT write to Graphiti. Set graphitiWrites to 0.`;
-    return `You are the InDusk
+    return `You are the InDusk eval agent (evaluator). Your job is to evaluate the quality of work done by an AI agent on a software project.
 
 You have full read access to the codebase, MCP tools (Graphiti, code graph, InDusk), and the session transcript. You cannot edit files.
 

package/dist/lib/eval/types.d.ts
CHANGED
@@ -2,7 +2,7 @@
  * Types for the context system evaluation.
  *
  * The scorecard is the unit of evaluation — one per commit. Questions are the
- * rubric, defined in rubric.ts and answered by the
+ * rubric, defined in rubric.ts and answered by the eval agent (evaluator).
  */
 export interface RubricQuestion {
     id: string;

package/dist/lib/eval/types.js
CHANGED
@@ -2,7 +2,7 @@
  * Types for the context system evaluation.
  *
  * The scorecard is the unit of evaluation — one per commit. Questions are the
- * rubric, defined in rubric.ts and answered by the
+ * rubric, defined in rubric.ts and answered by the eval agent (evaluator).
  */
 export function isScorecard(entry) {
     return (!("error" in entry) && "questions" in entry && Array.isArray(entry.questions));

package/hooks/eval-trigger.js
CHANGED
@@ -4,13 +4,13 @@
  * Dual-mode eval trigger.
  *
  * 1) PostToolUse hook mode (default): fires on Bash tool calls containing
- *    `jj describe`. Reads the hook event JSON from stdin. Spawns the
+ *    `jj describe`. Reads the hook event JSON from stdin. Spawns the evaluator
  *    runner as a detached background process.
  *
  * 2) CLI mode (`--source <tag>`): invoked manually by skills (e.g., handoff)
  *    at session end. No stdin read, no `jj describe` filter. Uses the current
- *    @ change and passes the source tag to the
- *    The
+ *    @ change and passes the source tag to the evaluator via INDUSK_EVAL_SOURCE.
+ *    The evaluator may skip diff-based scoring when source != "commit" but still
  *    processes the highlights queue.
  *
  * Exit 0 always — this is advisory, not blocking.
@@ -142,9 +142,12 @@ const transcriptPath =
 const hookDir = dirname(fileURLToPath(import.meta.url));
 const candidates = [
     // Source repo (apps/indusk-mcp/hooks/ → apps/indusk-mcp/dist/)
-    resolve(hookDir, "../dist/lib/eval/
+    resolve(hookDir, "../dist/lib/eval/evaluator-runner.js"),
     // Installed package (hooks/ → dist/)
-    resolve(
+    resolve(
+        hookDir,
+        "../../node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js",
+    ),
     // Global npx cache
     ...(() => {
         try {
@@ -153,24 +156,24 @@ const candidates = [
             return [
                 resolve(
                     dirname(which),
-                    "../lib/node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/
+                    "../lib/node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js",
                 ),
             ];
         } catch {}
         return [];
     })(),
 ];
-let
+let evaluatorRunnerPath = null;
 for (const c of candidates) {
     syslog(projectRoot, `candidate: ${c} — ${existsSync(c) ? "found" : "missing"}`);
     if (existsSync(c)) {
-
+        evaluatorRunnerPath = c;
         break;
     }
 }
-syslog(projectRoot, `
+syslog(projectRoot, `evaluatorRunnerPath: ${evaluatorRunnerPath ?? "NOT FOUND"}`);
 
-if (!
+if (!evaluatorRunnerPath) {
     // Can't find the package — log error and exit
     const { mkdirSync, appendFileSync } = await import("node:fs");
     const logPath = resolve(projectRoot, ".indusk", "eval", "results.log");
@@ -182,14 +185,14 @@ if (!judgeRunnerPath) {
         changeId,
         error: true,
         message:
-            "Could not find @infinitedusky/indusk-mcp package — eval
+            "Could not find @infinitedusky/indusk-mcp package — eval evaluator not available. Run: npm i -g @infinitedusky/indusk-mcp",
     });
     appendFileSync(logPath, `${entry}\n`, "utf8");
     process.exit(0);
 }
 
 // Surface unresolved findings from previous evals
-const findingsPath =
+const findingsPath = evaluatorRunnerPath.replace("evaluator-runner.js", "findings.js");
 if (existsSync(findingsPath)) {
     try {
         const { getUnresolvedFindings } = await import(findingsPath);
@@ -207,18 +210,23 @@ if (existsSync(findingsPath)) {
     }
 }
 
-// Use persistent
-const
-
-
+// Use persistent evaluator — resumes existing session if available, otherwise does full catchup.
+const persistentEvaluatorPath = evaluatorRunnerPath.replace(
+    "evaluator-runner.js",
+    "persistent-evaluator.js",
+);
+const useModule = existsSync(persistentEvaluatorPath)
+    ? persistentEvaluatorPath
+    : evaluatorRunnerPath;
+const useFunction = existsSync(persistentEvaluatorPath) ? "runPersistentEval" : "runEvaluatorSync";
 
 syslog(
     projectRoot,
-    `spawning
+    `spawning evaluator — module: ${useModule}, function: ${useFunction}, changeId: ${changeId}`,
 );
 
 const syslogPath = resolve(projectRoot, ".indusk", "eval", "system.log");
-const
+const evaluatorScript = `
 const fs = require("fs");
 const path = require("path");
 function syslog(msg) {
@@ -227,10 +235,10 @@ function syslog(msg) {
     fs.appendFileSync("${syslogPath}", new Date().toISOString() + " " + msg + "\\n");
   } catch {}
 }
-syslog("
+syslog("evaluator process started — changeId: ${changeId}");
 import("${useModule}")
   .then(m => {
-    syslog("
+    syslog("evaluator module loaded — calling ${useFunction}");
     return m.${useFunction}({
       projectRoot: ${JSON.stringify(projectRoot)},
       changeId: ${JSON.stringify(changeId)},
@@ -241,11 +249,11 @@ import("${useModule}")
   })
   .then((result) => {
     const hasError = result && result.error;
-    syslog("
+    syslog("evaluator completed — " + (hasError ? "error: " + result.message : "scorecard written"));
     process.exit(0);
   })
   .catch(err => {
-    syslog("
+    syslog("evaluator crashed — " + (err.message || String(err)));
     const logPath = path.join(${JSON.stringify(projectRoot)}, ".indusk", "eval", "results.log");
     fs.mkdirSync(path.dirname(logPath), { recursive: true });
     const entry = JSON.stringify({
@@ -261,7 +269,7 @@ import("${useModule}")
 });
 `;
 
-const child = spawn("node", ["--input-type=module", "-e",
+const child = spawn("node", ["--input-type=module", "-e", evaluatorScript], {
     cwd: projectRoot,
     stdio: "ignore",
    detached: true,
@@ -270,24 +278,24 @@ const child = spawn("node", ["--input-type=module", "-e", judgeScript], {
 
 child.unref();
 
-syslog(projectRoot, `
+syslog(projectRoot, `evaluator spawned — source: ${source}, pid: ${child.pid}`);
 
 if (cliSource !== null) {
     // CLI mode — write a brief notice to stderr and exit
     process.stderr.write(
-        `📊 Eval
+        `📊 Eval evaluator spawned (source=${source}) for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
     );
 } else {
     // Hook mode — output structured hook response
     const output = JSON.stringify({
         hookSpecificOutput: {
             hookEventName: "PostToolUse",
-            message: `Eval
+            message: `Eval evaluator spawned for change ${changeId.slice(0, 8)}`,
         },
     });
     process.stdout.write(output);
     process.stderr.write(
-        `📊 Eval
+        `📊 Eval evaluator spawned in background for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
    );
 }
 

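To make the hook-mode contract concrete, a sketch of the stdin filtering the doc comment describes. The event field names (`tool_name`, `tool_input.command`) are assumptions about the PostToolUse payload; this diff never shows them:

```typescript
// Hypothetical sketch; field names are assumed, not taken from this diff.
let raw = "";
process.stdin.on("data", (chunk) => {
  raw += chunk.toString();
});
process.stdin.on("end", () => {
  try {
    const event = JSON.parse(raw);
    const command: string = event?.tool_input?.command ?? "";
    if (event?.tool_name === "Bash" && command.includes("jj describe")) {
      // ...resolve the runner module and spawn the detached evaluator,
      // then emit the hookSpecificOutput JSON, as eval-trigger.js does above.
    }
  } catch {
    // malformed event; fall through, since the hook is advisory
  }
  process.exit(0); // exit 0 always, matching "advisory, not blocking"
});
```
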
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@infinitedusky/indusk-mcp",
-  "version": "1.17.0",
+  "version": "1.18.0",
   "description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
   "type": "module",
   "files": [
@@ -28,6 +28,12 @@
   },
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.12.1",
+    "@opentelemetry/api": "^1.9.0",
+    "@opentelemetry/exporter-trace-otlp-http": "^0.214.0",
+    "@opentelemetry/resources": "^2.6.0",
+    "@opentelemetry/sdk-trace-base": "^2.6.0",
+    "@opentelemetry/sdk-trace-node": "^2.6.0",
+    "@opentelemetry/semantic-conventions": "^1.40.0",
     "commander": "^13.0.0",
     "falkordb": "^6.6.2",
     "glob": "^11.0.0",

package/skills/eval-review.md
CHANGED
@@ -2,19 +2,19 @@ You can evaluate the current session's work quality on demand.
 
 ## When to Use
 
-- `/eval review` — run the eval
+- `/eval review` — run the eval evaluator against the current working copy
 - Mid-session quality check before committing
 - When you want to see how the work scores against the rubric
 
 ## What It Does
 
-Runs the same
+Runs the same evaluator process as the automatic eval hook, but against uncommitted changes instead of a committed change. Uses `jj diff` for the current working copy diff and the current session's transcript.
 
 ## Process
 
 1. Get the current diff: `jj diff`
-2. Build the
-3. Run the
+2. Build the evaluator prompt with the v1 rubric
+3. Run the evaluator (uses `runEvaluatorSync` from `apps/indusk-mcp/src/lib/eval/evaluator-runner.ts`)
 4. Display the scorecard inline
 5. Append results to `.indusk/eval/results.log`
 
@@ -23,7 +23,7 @@ Runs the same judge process as the automatic eval hook, but against uncommitted
 When the user says `/eval review` or asks for a quality check:
 
 1. Get the current change ID: `jj log -r @ --no-graph -T change_id`
-2. Call `
+2. Call `runEvaluatorSync` with mode `"eval"` and the current transcript path
 3. Present the scorecard to the user:
    - Overall summary
    - Per-question results with evidence
@@ -32,6 +32,6 @@ When the user says `/eval review` or asks for a quality check:
 ## Important
 
 - This is a quality check, not a blocker — findings are informational
-- The
+- The evaluator has full MCP access and does a real catchup
 - Results are logged to the same eval log as automatic evaluations
-- If the
+- If the evaluator fails, show the error — don't silently skip

package/skills/handoff.md
CHANGED
@@ -62,7 +62,7 @@ Run this from the project root:
 node .claude/hooks/eval-trigger.js --source handoff
 ```
 
-The trigger spawns the
+The trigger spawns the evaluator in the background and returns immediately — it never blocks handoff. The evaluator processes the highlights queue and, because `INDUSK_EVAL_SOURCE=handoff` is set in the environment, may skip diff-based rubric scoring (there's no new commit). Highlights still get materialized into Graphiti episodes.
 
 If the hook isn't installed or Node isn't on PATH, the handoff still succeeds — the highlights remain queued for the next `jj describe` in a future session.