@infinitedusky/indusk-mcp 1.17.0 → 1.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -212,7 +212,7 @@ export async function evalBaseline(projectRoot, opts) {
  }
  // Run the smart evaluator against the baseline
  console.info("Running smart evaluator against baseline...");
- const { runJudgeSync } = await import("../../lib/eval/judge-runner.js");
+ const { runEvaluatorSync } = await import("../../lib/eval/evaluator-runner.js");
  let changeId;
  try {
  changeId = execSync("jj log -r @ --no-graph -T change_id", {
@@ -223,7 +223,7 @@ export async function evalBaseline(projectRoot, opts) {
  catch {
  changeId = "baseline-unknown";
  }
- const evalResult = await runJudgeSync({
+ const evalResult = await runEvaluatorSync({
  projectRoot: worktreePath,
  changeId,
  transcriptPath: "(baseline — no transcript)",
@@ -0,0 +1,28 @@
+ /**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+ import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+ export interface EvaluatorRunOptions {
+ projectRoot: string;
+ changeId: string;
+ transcriptPath: string;
+ mode: "eval" | "baseline";
+ evalEndpoint?: string;
+ }
+ /**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+ export declare function runEvaluatorBackground(opts: EvaluatorRunOptions): void;
+ /**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+ export declare function runEvaluatorSync(opts: EvaluatorRunOptions): Promise<EvalScorecard | EvalErrorEntry>;
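The new `evaluator-runner.d.ts` surface is small; as a rough orientation, a caller might drive it like the sketch below. The relative import path, literal values, and logging are illustrative assumptions, not part of the published package.

```ts
// Hypothetical caller of the synchronous evaluator entry point declared above.
import { runEvaluatorSync } from "./evaluator-runner.js";

const result = await runEvaluatorSync({
  projectRoot: process.cwd(),
  changeId: "abc12345", // e.g. output of `jj log -r @ --no-graph -T change_id`
  transcriptPath: "/tmp/session-transcript.jsonl", // hypothetical path
  mode: "eval",
});

// EvalErrorEntry carries `error: true` and `message`; a scorecard does not.
if ("error" in result) {
  console.error(`eval failed: ${result.message}`);
} else {
  console.info("scorecard appended to .indusk/eval/results.log");
}
```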
@@ -0,0 +1,266 @@
+ /**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+ import { spawn } from "node:child_process";
+ import { join } from "node:path";
+ import { getProjectGroupId } from "../config.js";
+ import { ingestScorecard } from "./findings.js";
+ import { EvalLogWriter } from "./log-writer.js";
+ import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+ import { buildEvaluatorPrompt } from "./prompt-builder.js";
+ import { V1_RUBRIC } from "./rubric.js";
+ function getEvalLogPath(projectRoot) {
+ return join(projectRoot, ".indusk", "eval", "results.log");
+ }
+ async function postTelemetry(endpoint, scorecard) {
+ try {
+ const controller = new AbortController();
+ const timeout = setTimeout(() => controller.abort(), 5000);
+ await fetch(endpoint, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(scorecard),
+ signal: controller.signal,
+ });
+ clearTimeout(timeout);
+ }
+ catch {
+ // fire-and-forget — silently ignore errors
+ }
+ }
+ /**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+ export function runEvaluatorBackground(opts) {
+ const projectGroup = getProjectGroupId(opts.projectRoot);
+ const prompt = buildEvaluatorPrompt({
+ rubric: V1_RUBRIC,
+ changeId: opts.changeId,
+ transcriptPath: opts.transcriptPath,
+ mode: opts.mode,
+ projectGroup,
+ });
+ const allowedTools = [
+ "Read",
+ "Grep",
+ "Glob",
+ "Bash(jj:*)",
+ "Bash(git:*)",
+ "mcp__graphiti__*",
+ "mcp__indusk__*",
+ "mcp__codegraphcontext__*",
+ ];
+ const args = [
+ "--print",
+ "--output-format",
+ "json",
+ "--model",
+ "opus",
+ "--permission-mode",
+ "acceptEdits",
+ "--allowed-tools",
+ allowedTools.join(","),
+ ];
+ // Not detached — the eval-trigger hook already spawns this in a separate
+ // node process. Detaching + unref causes the close handler to never fire.
+ const child = spawn("claude", args, {
+ cwd: opts.projectRoot,
+ stdio: ["pipe", "pipe", "pipe"],
+ env: { ...process.env },
+ });
+ // Pipe the prompt via stdin (too large for CLI arg)
+ child.stdin?.write(prompt);
+ child.stdin?.end();
+ let stdout = "";
+ let stderr = "";
+ child.stdout?.on("data", (chunk) => {
+ stdout += chunk.toString();
+ });
+ child.stderr?.on("data", (chunk) => {
+ stderr += chunk.toString();
+ });
+ child.on("close", async (code) => {
+ const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+ try {
+ if (code !== 0) {
+ throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+ }
+ // --output-format json wraps the result; extract the text content and usage
+ let scorecardText = stdout;
+ let usage;
+ try {
+ const jsonOutput = JSON.parse(stdout);
+ scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+ // Capture usage data from claude --print output
+ if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+ const u = jsonOutput.usage ?? {};
+ usage = {
+ costUsd: jsonOutput.total_cost_usd ?? 0,
+ inputTokens: u.input_tokens ?? 0,
+ outputTokens: u.output_tokens ?? 0,
+ cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+ cacheReadTokens: u.cache_read_input_tokens ?? 0,
+ durationMs: jsonOutput.duration_ms ?? 0,
+ };
+ }
+ }
+ catch {
+ // stdout might be raw JSON scorecard already
+ }
+ // Extract JSON from possible markdown code fences
+ const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+ if (jsonMatch?.[1]) {
+ scorecardText = jsonMatch[1];
+ }
+ const scorecard = JSON.parse(scorecardText.trim());
+ if (usage)
+ scorecard.usage = usage;
+ scorecard.telemetryPosted = false;
+ if (opts.evalEndpoint) {
+ await postTelemetry(opts.evalEndpoint, scorecard);
+ scorecard.telemetryPosted = true;
+ }
+ await logWriter.append(scorecard);
+ ingestScorecard(opts.projectRoot, scorecard);
+ }
+ catch (err) {
+ const errorEntry = {
+ version: 1,
+ timestamp: new Date().toISOString(),
+ mode: opts.mode,
+ changeId: opts.changeId,
+ error: true,
+ message: err instanceof Error ? err.message : String(err),
+ };
+ await logWriter.append(errorEntry);
+ }
+ });
+ }
+ /**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+ export async function runEvaluatorSync(opts) {
+ const tracer = initEvalOtel(opts.projectRoot);
+ const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+ const projectGroup = getProjectGroupId(opts.projectRoot);
+ const result = await withSpan(tracer, "eval.run", {
+ changeId: opts.changeId,
+ source,
+ mode: opts.mode,
+ projectGroup,
+ entrypoint: "runEvaluatorSync",
+ }, () => runEvaluatorSyncInner(opts, projectGroup));
+ await shutdownEvalOtel();
+ return result;
+ }
+ async function runEvaluatorSyncInner(opts, projectGroup) {
+ const prompt = buildEvaluatorPrompt({
+ rubric: V1_RUBRIC,
+ changeId: opts.changeId,
+ transcriptPath: opts.transcriptPath,
+ mode: opts.mode,
+ projectGroup,
+ });
+ const allowedTools = [
+ "Read",
+ "Grep",
+ "Glob",
+ "Bash(jj:*)",
+ "Bash(git:*)",
+ "mcp__graphiti__*",
+ "mcp__indusk__*",
+ "mcp__codegraphcontext__*",
+ ];
+ const args = [
+ "--print",
+ "--output-format",
+ "json",
+ "--model",
+ "opus",
+ "--permission-mode",
+ "acceptEdits",
+ "--allowed-tools",
+ allowedTools.join(","),
+ ];
+ return new Promise((resolve) => {
+ const child = spawn("claude", args, {
+ cwd: opts.projectRoot,
+ stdio: ["pipe", "pipe", "pipe"],
+ env: { ...process.env },
+ });
+ child.stdin?.write(prompt);
+ child.stdin?.end();
+ let stdout = "";
+ let stderr = "";
+ child.stdout?.on("data", (chunk) => {
+ stdout += chunk.toString();
+ });
+ child.stderr?.on("data", (chunk) => {
+ stderr += chunk.toString();
+ });
+ child.on("close", async (code) => {
+ const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+ try {
+ if (code !== 0) {
+ throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+ }
+ let scorecardText = stdout;
+ let syncUsage;
+ try {
+ const jsonOutput = JSON.parse(stdout);
+ scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+ if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+ const u = jsonOutput.usage ?? {};
+ syncUsage = {
+ costUsd: jsonOutput.total_cost_usd ?? 0,
+ inputTokens: u.input_tokens ?? 0,
+ outputTokens: u.output_tokens ?? 0,
+ cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+ cacheReadTokens: u.cache_read_input_tokens ?? 0,
+ durationMs: jsonOutput.duration_ms ?? 0,
+ };
+ }
+ }
+ catch {
+ // raw JSON
+ }
+ const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+ if (jsonMatch?.[1]) {
+ scorecardText = jsonMatch[1];
+ }
+ const scorecard = JSON.parse(scorecardText.trim());
+ if (syncUsage)
+ scorecard.usage = syncUsage;
+ scorecard.telemetryPosted = false;
+ if (opts.evalEndpoint) {
+ await postTelemetry(opts.evalEndpoint, scorecard);
+ scorecard.telemetryPosted = true;
+ }
+ await logWriter.append(scorecard);
+ ingestScorecard(opts.projectRoot, scorecard);
+ resolve(scorecard);
+ }
+ catch (err) {
+ const errorEntry = {
+ version: 1,
+ timestamp: new Date().toISOString(),
+ mode: opts.mode,
+ changeId: opts.changeId,
+ error: true,
+ message: err instanceof Error ? err.message : String(err),
+ };
+ await logWriter.append(errorEntry);
+ resolve(errorEntry);
+ }
+ });
+ });
+ }
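For readers following the close-handler parsing above, the sketch below shows the rough shape of the `claude --print --output-format json` envelope the runner expects. Only the field names the code reads are taken from the diff; the values are invented and the real CLI output may carry additional fields.

```ts
// Illustrative envelope, assumption-only values; field names mirror what the
// runner reads (result, total_cost_usd, duration_ms, usage.*).
const exampleClaudeOutput = {
  // Scorecard text; the runner also strips a fenced json block if present.
  result: '{ "questions": [], "graphitiWrites": 0 }',
  total_cost_usd: 0.42,
  duration_ms: 95000,
  usage: {
    input_tokens: 12345,
    output_tokens: 678,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 9000,
  },
};
```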
@@ -0,0 +1,61 @@
+ /**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+ import { type Attributes, type Span, type Tracer } from "@opentelemetry/api";
+ export interface EvalOtelConfig {
+ enabled: boolean;
+ endpoint: string | null;
+ dataset: string;
+ }
+ /**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ * else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+ export declare function isEvalOtelEnabled(projectRoot: string): EvalOtelConfig;
+ /**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+ export declare function initEvalOtel(projectRoot: string): Tracer;
+ /**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+ export declare function withSpan<T>(tracer: Tracer, name: string, attrs: Attributes | undefined, fn: (span: Span) => Promise<T> | T): Promise<T>;
+ /**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+ export declare function shutdownEvalOtel(): Promise<void>;
+ /**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+ export declare function __resetEvalOtelForTests(): void;
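A minimal usage sketch of the tracing helpers declared above, assuming tracing has been enabled via the env vars or config keys described in the header comment; the span name and attribute values are placeholders.

```ts
// Hypothetical lifecycle: init (no-op when disabled), wrap a step, flush.
import { initEvalOtel, withSpan, shutdownEvalOtel } from "./otel.js";

const tracer = initEvalOtel(process.cwd()); // no-op tracer when disabled
const answer = await withSpan(tracer, "eval.example_step", { changeId: "abc12345" }, async () => {
  // ...do the actual work; thrown errors are recorded on the span and re-thrown
  return 42;
});
await shutdownEvalOtel(); // flush batched spans before the process exits
console.info(answer);
```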
@@ -0,0 +1,189 @@
+ /**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+ import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
+ import { join, resolve } from "node:path";
+ import { SpanStatusCode, trace } from "@opentelemetry/api";
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
+ import { resourceFromAttributes } from "@opentelemetry/resources";
+ import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
+ import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+ import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+ const TRACER_NAME = "@infinitedusky/indusk-mcp/eval";
+ const SERVICE_NAME = "indusk-eval-agent";
+ function syslog(projectRoot, msg) {
+ try {
+ const logDir = resolve(projectRoot, ".indusk", "eval");
+ mkdirSync(logDir, { recursive: true });
+ appendFileSync(resolve(logDir, "system.log"), `${new Date().toISOString()} ${msg}\n`);
+ }
+ catch {
+ // logging should never break anything
+ }
+ }
+ const DEFAULT_DATASET = "agent";
+ /**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ * else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+ export function isEvalOtelEnabled(projectRoot) {
+ const envFlag = process.env.INDUSK_EVAL_OTEL;
+ const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? null;
+ const envDataset = process.env.INDUSK_EVAL_OTEL_DATASET;
+ let configEnabled = false;
+ let configDataset;
+ const configPath = join(projectRoot, ".indusk", "config.json");
+ if (existsSync(configPath)) {
+ try {
+ const config = JSON.parse(readFileSync(configPath, "utf-8"));
+ configEnabled = config?.eval?.otel?.enabled === true;
+ if (typeof config?.eval?.otel?.dataset === "string") {
+ configDataset = config.eval.otel.dataset;
+ }
+ }
+ catch {
+ // malformed config — treat as disabled
+ }
+ }
+ const envForcesEnabled = envFlag !== undefined && envFlag !== "" && envFlag !== "0" && envFlag.toLowerCase() !== "false";
+ const dataset = envDataset && envDataset !== "" ? envDataset : (configDataset ?? DEFAULT_DATASET);
+ return {
+ enabled: envForcesEnabled || configEnabled,
+ endpoint,
+ dataset,
+ };
+ }
+ let activeProvider = null;
+ /**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+ export function initEvalOtel(projectRoot) {
+ const { enabled, endpoint, dataset } = isEvalOtelEnabled(projectRoot);
+ if (!enabled) {
+ return trace.getTracer(TRACER_NAME);
+ }
+ if (!endpoint) {
+ syslog(projectRoot, "eval.otel.enabled but OTEL_EXPORTER_OTLP_ENDPOINT is unset — falling back to no-op tracer");
+ return trace.getTracer(TRACER_NAME);
+ }
+ if (activeProvider) {
+ return trace.getTracer(TRACER_NAME);
+ }
+ // Build exporter headers. We pass Authorization and Dash0-Dataset in the
+ // constructor rather than relying on OTEL_EXPORTER_OTLP_HEADERS env parsing,
+ // because the OTel SDK's env parser has proven unreliable for tokens with
+ // spaces (e.g., "Bearer auth_xxx") in practice — the header silently fails
+ // to attach and exports retry-loop to no effect.
+ //
+ // Precedence:
+ // 1. User-set `OTEL_EXPORTER_OTLP_HEADERS` env (handled by SDK, takes top precedence per OTel spec)
+ // 2. Explicit constructor headers below (our defaults)
+ //
+ // DASH0_API_TOKEN is the conventional name we inherit from the Dash0 CLI.
+ // If set, we build a Bearer header. If not, we rely on the user's env.
+ const headers = {
+ "Dash0-Dataset": dataset,
+ };
+ if (process.env.DASH0_API_TOKEN) {
+ headers.Authorization = `Bearer ${process.env.DASH0_API_TOKEN}`;
+ }
+ try {
+ const exporter = new OTLPTraceExporter({
+ url: endpoint.endsWith("/v1/traces") ? endpoint : `${endpoint.replace(/\/$/, "")}/v1/traces`,
+ headers,
+ });
+ const provider = new NodeTracerProvider({
+ resource: resourceFromAttributes({
+ [ATTR_SERVICE_NAME]: SERVICE_NAME,
+ }),
+ spanProcessors: [new BatchSpanProcessor(exporter)],
+ });
+ provider.register();
+ activeProvider = provider;
+ syslog(projectRoot, `eval.otel initialized — endpoint: ${endpoint}, dataset: ${dataset}`);
+ }
+ catch (err) {
+ const message = err instanceof Error ? err.message : String(err);
+ syslog(projectRoot, `eval.otel init failed — falling back to no-op tracer: ${message}`);
+ }
+ return trace.getTracer(TRACER_NAME);
+ }
+ /**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+ export async function withSpan(tracer, name, attrs, fn) {
+ return tracer.startActiveSpan(name, { attributes: attrs ?? {} }, async (span) => {
+ try {
+ return await fn(span);
+ }
+ catch (err) {
+ span.recordException(err instanceof Error ? err : new Error(String(err)));
+ span.setStatus({ code: SpanStatusCode.ERROR });
+ throw err;
+ }
+ finally {
+ span.end();
+ }
+ });
+ }
+ /**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+ export async function shutdownEvalOtel() {
+ if (!activeProvider)
+ return;
+ try {
+ await activeProvider.forceFlush();
+ await activeProvider.shutdown();
+ }
+ catch {
+ // shutdown is best-effort
+ }
+ finally {
+ activeProvider = null;
+ }
+ }
+ /**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+ export function __resetEvalOtelForTests() {
+ // Tear down any provider left over from a previous test. This un-registers
+ // from the global OTel API, so `trace.getTracer()` falls back to the no-op
+ // tracer until a new provider is registered.
+ if (activeProvider) {
+ void activeProvider.shutdown().catch(() => { });
+ }
+ activeProvider = null;
+ trace.disable();
+ }
@@ -0,0 +1,20 @@
+ /**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+ import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+ /**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+ export declare function runPersistentEval(opts: {
+ projectRoot: string;
+ changeId: string;
+ transcriptPath: string;
+ mode: "eval" | "baseline";
+ evalEndpoint?: string;
+ }): Promise<EvalScorecard | EvalErrorEntry>;
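A minimal sketch of calling the persistent entry point declared above; the import path and literal option values are assumptions, while the first-call catchup and later session resume come from the doc comment.

```ts
// Hypothetical caller; a second invocation with a new changeId would resume
// the stored session instead of repeating the catchup.
import { runPersistentEval } from "./persistent-evaluator.js";

const scorecardOrError = await runPersistentEval({
  projectRoot: process.cwd(),
  changeId: "abc12345",
  transcriptPath: "/tmp/session-transcript.jsonl", // hypothetical path
  mode: "eval",
  evalEndpoint: "https://example.invalid/telemetry", // optional fire-and-forget POST target
});
console.info("error" in scorecardOrError ? "eval failed" : "scorecard written");
```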
@@ -0,0 +1,244 @@
+ /**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+ import { spawn } from "node:child_process";
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+ import { dirname, join } from "node:path";
+ import { getProjectGroupId } from "../config.js";
+ import { readUnprocessedHighlights } from "../highlights/highlights.js";
+ import { ingestScorecard } from "./findings.js";
+ import { EvalLogWriter } from "./log-writer.js";
+ import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+ import { buildEvaluatorPrompt } from "./prompt-builder.js";
+ import { V1_RUBRIC } from "./rubric.js";
+ function getSessionPath(projectRoot) {
+ return join(projectRoot, ".indusk", "eval", "evaluator-session.json");
+ }
+ function getEvalLogPath(projectRoot) {
+ return join(projectRoot, ".indusk", "eval", "results.log");
+ }
+ function readSession(projectRoot) {
+ const path = getSessionPath(projectRoot);
+ if (!existsSync(path))
+ return null;
+ try {
+ return JSON.parse(readFileSync(path, "utf8"));
+ }
+ catch {
+ return null;
+ }
+ }
+ function writeSession(projectRoot, session) {
+ const path = getSessionPath(projectRoot);
+ mkdirSync(dirname(path), { recursive: true });
+ writeFileSync(path, `${JSON.stringify(session, null, 2)}\n`);
+ }
+ function clearSession(projectRoot) {
+ const path = getSessionPath(projectRoot);
+ if (existsSync(path)) {
+ const { unlinkSync } = require("node:fs");
+ unlinkSync(path);
+ }
+ }
+ const ALLOWED_TOOLS = [
+ "Read",
+ "Grep",
+ "Glob",
+ "Bash(jj:*)",
+ "Bash(git:*)",
+ "mcp__graphiti__*",
+ "mcp__indusk__*",
+ "mcp__codegraphcontext__*",
+ ];
+ function parseClaudeOutput(stdout) {
+ let scorecardText = stdout;
+ let usage;
+ let sessionId;
+ try {
+ const jsonOutput = JSON.parse(stdout);
+ scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+ sessionId = jsonOutput.session_id;
+ if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+ const u = jsonOutput.usage ?? {};
+ usage = {
+ costUsd: jsonOutput.total_cost_usd ?? 0,
+ inputTokens: u.input_tokens ?? 0,
+ outputTokens: u.output_tokens ?? 0,
+ cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+ cacheReadTokens: u.cache_read_input_tokens ?? 0,
+ durationMs: jsonOutput.duration_ms ?? 0,
+ };
+ }
+ }
+ catch {
+ // raw output
+ }
+ const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+ if (jsonMatch?.[1]) {
+ scorecardText = jsonMatch[1];
+ }
+ return { scorecardText, usage, sessionId };
+ }
+ async function spawnClaude(args, prompt, cwd) {
+ return new Promise((resolve) => {
+ const child = spawn("claude", args, {
+ cwd,
+ stdio: ["pipe", "pipe", "pipe"],
+ env: { ...process.env },
+ });
+ child.stdin?.write(prompt);
+ child.stdin?.end();
+ let stdout = "";
+ let stderr = "";
+ child.stdout?.on("data", (chunk) => {
+ stdout += chunk.toString();
+ });
+ child.stderr?.on("data", (chunk) => {
+ stderr += chunk.toString();
+ });
+ child.on("close", (code) => {
+ resolve({ stdout, stderr, code });
+ });
+ });
+ }
+ /**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+ export async function runPersistentEval(opts) {
+ const tracer = initEvalOtel(opts.projectRoot);
+ const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+ const projectGroup = getProjectGroupId(opts.projectRoot);
+ // Peek at the highlights queue before spawning — gives us observability
+ // into how much work the Claude subprocess will do without having to
+ // span per-highlight (which would require Claude-Code-internal OTel).
+ let unprocessedCount = 0;
+ try {
+ unprocessedCount = readUnprocessedHighlights(opts.projectRoot).length;
+ }
+ catch {
+ // reading the queue is best-effort — never block the evaluator
+ }
+ const result = await withSpan(tracer, "eval.run", {
+ changeId: opts.changeId,
+ source,
+ mode: opts.mode,
+ projectGroup,
+ "highlights.unprocessed_count": unprocessedCount,
+ }, async (rootSpan) => {
+ const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+ const session = await withSpan(tracer, "eval.read_session", undefined, () => readSession(opts.projectRoot));
+ rootSpan.setAttribute("resumed", session !== null);
+ try {
+ const { args, prompt } = await withSpan(tracer, "eval.build_prompt", { resumed: session !== null }, () => {
+ if (session) {
+ const resumePrompt = `Evaluate a new commit. Change ID: ${opts.changeId}
+
+ Run \`jj diff -r ${opts.changeId}\` to see what changed. Then answer the same evaluation questions as before. Read the changed files for full context.
+
+ Output ONLY the JSON scorecard as before — no commentary.`;
+ return {
+ args: [
+ "--print",
+ "--output-format",
+ "json",
+ "--resume",
+ session.sessionId,
+ "--allowed-tools",
+ ALLOWED_TOOLS.join(","),
+ ],
+ prompt: resumePrompt,
+ };
+ }
+ return {
+ args: [
+ "--print",
+ "--output-format",
+ "json",
+ "--model",
+ "opus",
+ "--permission-mode",
+ "acceptEdits",
+ "--allowed-tools",
+ ALLOWED_TOOLS.join(","),
+ ],
+ prompt: buildEvaluatorPrompt({
+ rubric: V1_RUBRIC,
+ changeId: opts.changeId,
+ transcriptPath: opts.transcriptPath,
+ mode: opts.mode,
+ projectGroup,
+ }),
+ };
+ });
+ const claudeResult = await withSpan(tracer, "eval.spawn_claude", {
+ "args.resumed": session !== null,
+ "args.model": session ? "(resumed)" : "opus",
+ }, async (span) => {
+ const spawned = await spawnClaude(args, prompt, opts.projectRoot);
+ span.setAttribute("exit.code", spawned.code ?? -1);
+ if (spawned.code !== 0) {
+ span.setAttribute("exit.stderr_tail", spawned.stderr.slice(-500));
+ }
+ return spawned;
+ });
+ if (claudeResult.code !== 0) {
+ if (session) {
+ await withSpan(tracer, "eval.clear_stale_session", undefined, () => clearSession(opts.projectRoot));
+ // Recurse — the retry produces its own root span
+ return runPersistentEval(opts);
+ }
+ throw new Error(`claude exited with code ${claudeResult.code}: ${claudeResult.stderr.slice(0, 500)}`);
+ }
+ const parsed = await withSpan(tracer, "eval.parse_output", undefined, (span) => {
+ const out = parseClaudeOutput(claudeResult.stdout);
+ if (out.sessionId)
+ span.setAttribute("session_id", out.sessionId);
+ if (out.usage) {
+ span.setAttribute("cost_usd", out.usage.costUsd);
+ span.setAttribute("input_tokens", out.usage.inputTokens);
+ span.setAttribute("output_tokens", out.usage.outputTokens);
+ }
+ return out;
+ });
+ const scorecard = JSON.parse(parsed.scorecardText.trim());
+ if (parsed.usage)
+ scorecard.usage = parsed.usage;
+ scorecard.telemetryPosted = false;
+ await withSpan(tracer, "eval.update_session", undefined, () => {
+ const newSession = {
+ sessionId: parsed.sessionId ?? session?.sessionId ?? "unknown",
+ createdAt: session?.createdAt ?? new Date().toISOString(),
+ lastEvalAt: new Date().toISOString(),
+ evalCount: (session?.evalCount ?? 0) + 1,
+ };
+ writeSession(opts.projectRoot, newSession);
+ });
+ await withSpan(tracer, "eval.write_scorecard", undefined, async () => {
+ await logWriter.append(scorecard);
+ ingestScorecard(opts.projectRoot, scorecard);
+ });
+ return scorecard;
+ }
+ catch (err) {
+ const errorEntry = {
+ version: 1,
+ timestamp: new Date().toISOString(),
+ mode: opts.mode,
+ changeId: opts.changeId,
+ error: true,
+ message: err instanceof Error ? err.message : String(err),
+ };
+ await logWriter.append(errorEntry);
+ return errorEntry;
+ }
+ });
+ // Flush OTel so batched spans ship before the detached process exits.
+ await shutdownEvalOtel();
+ return result;
+ }
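Based on the fields `writeSession` persists above, `.indusk/eval/evaluator-session.json` ends up looking roughly like the sketch below; the values are illustrative only.

```ts
// Example session state after two evals; field names come from the newSession
// object written above, values are made up.
const exampleSession = {
  sessionId: "c0ffee00-0000-0000-0000-000000000000", // session_id returned by claude --print
  createdAt: "2025-01-01T12:00:00.000Z",
  lastEvalAt: "2025-01-01T13:30:00.000Z",
  evalCount: 2,
};
```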
@@ -1,11 +1,11 @@
  /**
- * Builds the judge agent's system prompt.
+ * Builds the evaluator agent's system prompt.
  *
- * The prompt instructs the judge to: do catchup, read the transcript, read the
+ * The prompt instructs the evaluator to: do catchup, read the transcript, read the
  * diff itself via jj, answer each rubric question, write findings to Graphiti
  * (eval mode only), and output a JSON scorecard.
  *
- * The diff is NOT embedded in the prompt — the judge reads it via tool calls.
+ * The diff is NOT embedded in the prompt — the evaluator reads it via tool calls.
  * This keeps the prompt small regardless of commit size.
  */
  import type { RubricQuestion } from "./types.js";
@@ -16,4 +16,4 @@ export interface PromptBuilderOptions {
  mode: "eval" | "baseline";
  projectGroup: string;
  }
- export declare function buildJudgePrompt(opts: PromptBuilderOptions): string;
+ export declare function buildEvaluatorPrompt(opts: PromptBuilderOptions): string;
@@ -1,14 +1,14 @@
  /**
- * Builds the judge agent's system prompt.
+ * Builds the evaluator agent's system prompt.
  *
- * The prompt instructs the judge to: do catchup, read the transcript, read the
+ * The prompt instructs the evaluator to: do catchup, read the transcript, read the
  * diff itself via jj, answer each rubric question, write findings to Graphiti
  * (eval mode only), and output a JSON scorecard.
  *
- * The diff is NOT embedded in the prompt — the judge reads it via tool calls.
+ * The diff is NOT embedded in the prompt — the evaluator reads it via tool calls.
  * This keeps the prompt small regardless of commit size.
  */
- export function buildJudgePrompt(opts) {
+ export function buildEvaluatorPrompt(opts) {
  const questionsBlock = opts.rubric
  .map((q, i) => `${i + 1}. **${q.id}**: ${q.question}\n Guidance: ${q.guidance}`)
  .join("\n\n");
@@ -62,7 +62,7 @@ If the tool is unavailable, skip silently and set graphitiWrites to 0.`
  ### Step 6: Graphiti writes

  Baseline mode — do NOT write to Graphiti. Set graphitiWrites to 0.`;
- return `You are the InDusk evaluation judge. Your job is to evaluate the quality of work done by an AI agent on a software project.
+ return `You are the InDusk eval agent (evaluator). Your job is to evaluate the quality of work done by an AI agent on a software project.

  You have full read access to the codebase, MCP tools (Graphiti, code graph, InDusk), and the session transcript. You cannot edit files.

@@ -2,7 +2,7 @@
  * Types for the context system evaluation.
  *
  * The scorecard is the unit of evaluation — one per commit. Questions are the
- * rubric, defined in rubric.ts and answered by the judge agent.
+ * rubric, defined in rubric.ts and answered by the eval agent (evaluator).
  */
  export interface RubricQuestion {
  id: string;
@@ -2,7 +2,7 @@
  * Types for the context system evaluation.
  *
  * The scorecard is the unit of evaluation — one per commit. Questions are the
- * rubric, defined in rubric.ts and answered by the judge agent.
+ * rubric, defined in rubric.ts and answered by the eval agent (evaluator).
  */
  export function isScorecard(entry) {
  return (!("error" in entry) && "questions" in entry && Array.isArray(entry.questions));
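The full `EvalScorecard` and `EvalErrorEntry` definitions live in `types.d.ts`, which this diff does not show; the sketch below lists only the fields that code elsewhere in this diff reads or writes, as a partial, inferred view.

```ts
// Partial, inferred view of the eval log entry shapes — not the package's
// actual type definitions.
interface EvalUsage {
  costUsd: number;
  inputTokens: number;
  outputTokens: number;
  cacheCreationTokens: number;
  cacheReadTokens: number;
  durationMs: number;
}

interface EvalErrorEntryLike {
  version: 1;
  timestamp: string; // ISO-8601
  mode: "eval" | "baseline";
  changeId: string;
  error: true;
  message: string;
}

interface EvalScorecardLike {
  questions: unknown[]; // per-question results; detailed shape not in this diff
  usage?: EvalUsage; // attached from claude --print usage data
  telemetryPosted?: boolean;
}
```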
@@ -4,13 +4,13 @@
  * Dual-mode eval trigger.
  *
  * 1) PostToolUse hook mode (default): fires on Bash tool calls containing
- * `jj describe`. Reads the hook event JSON from stdin. Spawns the judge
+ * `jj describe`. Reads the hook event JSON from stdin. Spawns the evaluator
  * runner as a detached background process.
  *
  * 2) CLI mode (`--source <tag>`): invoked manually by skills (e.g., handoff)
  * at session end. No stdin read, no `jj describe` filter. Uses the current
- * @ change and passes the source tag to the judge via INDUSK_EVAL_SOURCE.
- * The judge may skip diff-based scoring when source != "commit" but still
+ * @ change and passes the source tag to the evaluator via INDUSK_EVAL_SOURCE.
+ * The evaluator may skip diff-based scoring when source != "commit" but still
  * processes the highlights queue.
  *
  * Exit 0 always — this is advisory, not blocking.
@@ -142,9 +142,12 @@ const transcriptPath =
  const hookDir = dirname(fileURLToPath(import.meta.url));
  const candidates = [
  // Source repo (apps/indusk-mcp/hooks/ → apps/indusk-mcp/dist/)
- resolve(hookDir, "../dist/lib/eval/judge-runner.js"),
+ resolve(hookDir, "../dist/lib/eval/evaluator-runner.js"),
  // Installed package (hooks/ → dist/)
- resolve(hookDir, "../../node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/judge-runner.js"),
+ resolve(
+ hookDir,
+ "../../node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js",
+ ),
  // Global npx cache
  ...(() => {
  try {
@@ -153,24 +156,24 @@ const candidates = [
  return [
  resolve(
  dirname(which),
- "../lib/node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/judge-runner.js",
+ "../lib/node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js",
  ),
  ];
  } catch {}
  return [];
  })(),
  ];
- let judgeRunnerPath = null;
+ let evaluatorRunnerPath = null;
  for (const c of candidates) {
  syslog(projectRoot, `candidate: ${c} — ${existsSync(c) ? "found" : "missing"}`);
  if (existsSync(c)) {
- judgeRunnerPath = c;
+ evaluatorRunnerPath = c;
  break;
  }
  }
- syslog(projectRoot, `judgeRunnerPath: ${judgeRunnerPath ?? "NOT FOUND"}`);
+ syslog(projectRoot, `evaluatorRunnerPath: ${evaluatorRunnerPath ?? "NOT FOUND"}`);

- if (!judgeRunnerPath) {
+ if (!evaluatorRunnerPath) {
  // Can't find the package — log error and exit
  const { mkdirSync, appendFileSync } = await import("node:fs");
  const logPath = resolve(projectRoot, ".indusk", "eval", "results.log");
@@ -182,14 +185,14 @@ if (!judgeRunnerPath) {
  changeId,
  error: true,
  message:
- "Could not find @infinitedusky/indusk-mcp package — eval judge not available. Run: npm i -g @infinitedusky/indusk-mcp",
+ "Could not find @infinitedusky/indusk-mcp package — eval evaluator not available. Run: npm i -g @infinitedusky/indusk-mcp",
  });
  appendFileSync(logPath, `${entry}\n`, "utf8");
  process.exit(0);
  }

  // Surface unresolved findings from previous evals
- const findingsPath = judgeRunnerPath.replace("judge-runner.js", "findings.js");
+ const findingsPath = evaluatorRunnerPath.replace("evaluator-runner.js", "findings.js");
  if (existsSync(findingsPath)) {
  try {
  const { getUnresolvedFindings } = await import(findingsPath);
@@ -207,18 +210,23 @@ if (existsSync(findingsPath)) {
  }
  }

- // Use persistent judge — resumes existing session if available, otherwise does full catchup.
- const persistentJudgePath = judgeRunnerPath.replace("judge-runner.js", "persistent-judge.js");
- const useModule = existsSync(persistentJudgePath) ? persistentJudgePath : judgeRunnerPath;
- const useFunction = existsSync(persistentJudgePath) ? "runPersistentEval" : "runJudgeSync";
+ // Use persistent evaluator — resumes existing session if available, otherwise does full catchup.
+ const persistentEvaluatorPath = evaluatorRunnerPath.replace(
+ "evaluator-runner.js",
+ "persistent-evaluator.js",
+ );
+ const useModule = existsSync(persistentEvaluatorPath)
+ ? persistentEvaluatorPath
+ : evaluatorRunnerPath;
+ const useFunction = existsSync(persistentEvaluatorPath) ? "runPersistentEval" : "runEvaluatorSync";

  syslog(
  projectRoot,
- `spawning judge — module: ${useModule}, function: ${useFunction}, changeId: ${changeId}`,
+ `spawning evaluator — module: ${useModule}, function: ${useFunction}, changeId: ${changeId}`,
  );

  const syslogPath = resolve(projectRoot, ".indusk", "eval", "system.log");
- const judgeScript = `
+ const evaluatorScript = `
  const fs = require("fs");
  const path = require("path");
  function syslog(msg) {
@@ -227,10 +235,10 @@ function syslog(msg) {
  fs.appendFileSync("${syslogPath}", new Date().toISOString() + " " + msg + "\\n");
  } catch {}
  }
- syslog("judge process started — changeId: ${changeId}");
+ syslog("evaluator process started — changeId: ${changeId}");
  import("${useModule}")
  .then(m => {
- syslog("judge module loaded — calling ${useFunction}");
+ syslog("evaluator module loaded — calling ${useFunction}");
  return m.${useFunction}({
  projectRoot: ${JSON.stringify(projectRoot)},
  changeId: ${JSON.stringify(changeId)},
@@ -241,11 +249,11 @@ import("${useModule}")
  })
  .then((result) => {
  const hasError = result && result.error;
- syslog("judge completed — " + (hasError ? "error: " + result.message : "scorecard written"));
+ syslog("evaluator completed — " + (hasError ? "error: " + result.message : "scorecard written"));
  process.exit(0);
  })
  .catch(err => {
- syslog("judge crashed — " + (err.message || String(err)));
+ syslog("evaluator crashed — " + (err.message || String(err)));
  const logPath = path.join(${JSON.stringify(projectRoot)}, ".indusk", "eval", "results.log");
  fs.mkdirSync(path.dirname(logPath), { recursive: true });
  const entry = JSON.stringify({
@@ -261,7 +269,7 @@ import("${useModule}")
  });
  `;

- const child = spawn("node", ["--input-type=module", "-e", judgeScript], {
+ const child = spawn("node", ["--input-type=module", "-e", evaluatorScript], {
  cwd: projectRoot,
  stdio: "ignore",
  detached: true,
@@ -270,24 +278,24 @@ const child = spawn("node", ["--input-type=module", "-e", judgeScript], {

  child.unref();

- syslog(projectRoot, `judge spawned — source: ${source}, pid: ${child.pid}`);
+ syslog(projectRoot, `evaluator spawned — source: ${source}, pid: ${child.pid}`);

  if (cliSource !== null) {
  // CLI mode — write a brief notice to stderr and exit
  process.stderr.write(
- `📊 Eval judge spawned (source=${source}) for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
+ `📊 Eval evaluator spawned (source=${source}) for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
  );
  } else {
  // Hook mode — output structured hook response
  const output = JSON.stringify({
  hookSpecificOutput: {
  hookEventName: "PostToolUse",
- message: `Eval judge spawned for change ${changeId.slice(0, 8)}`,
+ message: `Eval evaluator spawned for change ${changeId.slice(0, 8)}`,
  },
  });
  process.stdout.write(output);
  process.stderr.write(
- `📊 Eval judge spawned in background for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
+ `📊 Eval evaluator spawned in background for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
  );
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@infinitedusky/indusk-mcp",
- "version": "1.17.0",
+ "version": "1.18.1",
  "description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
  "type": "module",
  "files": [
@@ -28,6 +28,12 @@
  },
  "dependencies": {
  "@modelcontextprotocol/sdk": "^1.12.1",
+ "@opentelemetry/api": "^1.9.0",
+ "@opentelemetry/exporter-trace-otlp-http": "^0.214.0",
+ "@opentelemetry/resources": "^2.6.0",
+ "@opentelemetry/sdk-trace-base": "^2.6.0",
+ "@opentelemetry/sdk-trace-node": "^2.6.0",
+ "@opentelemetry/semantic-conventions": "^1.40.0",
  "commander": "^13.0.0",
  "falkordb": "^6.6.2",
  "glob": "^11.0.0",
@@ -2,19 +2,19 @@ You can evaluate the current session's work quality on demand.

  ## When to Use

- - `/eval review` — run the eval judge against the current working copy
+ - `/eval review` — run the eval evaluator against the current working copy
  - Mid-session quality check before committing
  - When you want to see how the work scores against the rubric

  ## What It Does

- Runs the same judge process as the automatic eval hook, but against uncommitted changes instead of a committed change. Uses `jj diff` for the current working copy diff and the current session's transcript.
+ Runs the same evaluator process as the automatic eval hook, but against uncommitted changes instead of a committed change. Uses `jj diff` for the current working copy diff and the current session's transcript.

  ## Process

  1. Get the current diff: `jj diff`
- 2. Build the judge prompt with the v1 rubric
- 3. Run the judge (uses `runJudgeSync` from `apps/indusk-mcp/src/lib/eval/judge-runner.ts`)
+ 2. Build the evaluator prompt with the v1 rubric
+ 3. Run the evaluator (uses `runEvaluatorSync` from `apps/indusk-mcp/src/lib/eval/evaluator-runner.ts`)
  4. Display the scorecard inline
  5. Append results to `.indusk/eval/results.log`

@@ -23,7 +23,7 @@ Runs the same judge process as the automatic eval hook, but against uncommitted
  When the user says `/eval review` or asks for a quality check:

  1. Get the current change ID: `jj log -r @ --no-graph -T change_id`
- 2. Call `runJudgeSync` with mode `"eval"` and the current transcript path
+ 2. Call `runEvaluatorSync` with mode `"eval"` and the current transcript path
  3. Present the scorecard to the user:
  - Overall summary
  - Per-question results with evidence
@@ -32,6 +32,6 @@ When the user says `/eval review` or asks for a quality check:
  ## Important

  - This is a quality check, not a blocker — findings are informational
- - The judge has full MCP access and does a real catchup
+ - The evaluator has full MCP access and does a real catchup
  - Results are logged to the same eval log as automatic evaluations
- - If the judge fails, show the error — don't silently skip
+ - If the evaluator fails, show the error — don't silently skip
package/skills/handoff.md CHANGED
@@ -62,7 +62,7 @@ Run this from the project root:
  node .claude/hooks/eval-trigger.js --source handoff
  ```

- The trigger spawns the judge in the background and returns immediately — it never blocks handoff. The judge processes the highlights queue and, because `INDUSK_EVAL_SOURCE=handoff` is set in the environment, may skip diff-based rubric scoring (there's no new commit). Highlights still get materialized into Graphiti episodes.
+ The trigger spawns the evaluator in the background and returns immediately — it never blocks handoff. The evaluator processes the highlights queue and, because `INDUSK_EVAL_SOURCE=handoff` is set in the environment, may skip diff-based rubric scoring (there's no new commit). Highlights still get materialized into Graphiti episodes.

  If the hook isn't installed or Node isn't on PATH, the handoff still succeeds — the highlights remain queued for the next `jj describe` in a future session.