@infinitedusky/indusk-mcp 1.17.0 → 1.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -212,7 +212,7 @@ export async function evalBaseline(projectRoot, opts) {
  }
  // Run the smart evaluator against the baseline
  console.info("Running smart evaluator against baseline...");
- const { runJudgeSync } = await import("../../lib/eval/judge-runner.js");
+ const { runEvaluatorSync } = await import("../../lib/eval/evaluator-runner.js");
  let changeId;
  try {
  changeId = execSync("jj log -r @ --no-graph -T change_id", {
@@ -223,7 +223,7 @@ export async function evalBaseline(projectRoot, opts) {
  catch {
  changeId = "baseline-unknown";
  }
- const evalResult = await runJudgeSync({
+ const evalResult = await runEvaluatorSync({
  projectRoot: worktreePath,
  changeId,
  transcriptPath: "(baseline — no transcript)",
@@ -0,0 +1,28 @@
+ /**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+ import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+ export interface EvaluatorRunOptions {
+ projectRoot: string;
+ changeId: string;
+ transcriptPath: string;
+ mode: "eval" | "baseline";
+ evalEndpoint?: string;
+ }
+ /**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+ export declare function runEvaluatorBackground(opts: EvaluatorRunOptions): void;
+ /**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+ export declare function runEvaluatorSync(opts: EvaluatorRunOptions): Promise<EvalScorecard | EvalErrorEntry>;
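The new `evaluator-runner.d.ts` surface is small; as a rough orientation, a caller might drive it like the sketch below. The relative import path, literal values, and logging are illustrative assumptions, not part of the published package.

```ts
// Hypothetical caller of the synchronous evaluator entry point declared above.
import { runEvaluatorSync } from "./evaluator-runner.js";

const result = await runEvaluatorSync({
  projectRoot: process.cwd(),
  changeId: "abc12345", // e.g. output of `jj log -r @ --no-graph -T change_id`
  transcriptPath: "/tmp/session-transcript.jsonl", // hypothetical path
  mode: "eval",
});

// EvalErrorEntry carries `error: true` and `message`; a scorecard does not.
if ("error" in result) {
  console.error(`eval failed: ${result.message}`);
} else {
  console.info("scorecard appended to .indusk/eval/results.log");
}
```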
@@ -0,0 +1,266 @@
+ /**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+ import { spawn } from "node:child_process";
+ import { join } from "node:path";
+ import { getProjectGroupId } from "../config.js";
+ import { ingestScorecard } from "./findings.js";
+ import { EvalLogWriter } from "./log-writer.js";
+ import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+ import { buildEvaluatorPrompt } from "./prompt-builder.js";
+ import { V1_RUBRIC } from "./rubric.js";
+ function getEvalLogPath(projectRoot) {
+ return join(projectRoot, ".indusk", "eval", "results.log");
+ }
+ async function postTelemetry(endpoint, scorecard) {
+ try {
+ const controller = new AbortController();
+ const timeout = setTimeout(() => controller.abort(), 5000);
+ await fetch(endpoint, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(scorecard),
+ signal: controller.signal,
+ });
+ clearTimeout(timeout);
+ }
+ catch {
+ // fire-and-forget — silently ignore errors
+ }
+ }
+ /**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+ export function runEvaluatorBackground(opts) {
+ const projectGroup = getProjectGroupId(opts.projectRoot);
+ const prompt = buildEvaluatorPrompt({
+ rubric: V1_RUBRIC,
+ changeId: opts.changeId,
+ transcriptPath: opts.transcriptPath,
+ mode: opts.mode,
+ projectGroup,
+ });
+ const allowedTools = [
+ "Read",
+ "Grep",
+ "Glob",
+ "Bash(jj:*)",
+ "Bash(git:*)",
+ "mcp__graphiti__*",
+ "mcp__indusk__*",
+ "mcp__codegraphcontext__*",
+ ];
+ const args = [
+ "--print",
+ "--output-format",
+ "json",
+ "--model",
+ "opus",
+ "--permission-mode",
+ "acceptEdits",
+ "--allowed-tools",
+ allowedTools.join(","),
+ ];
+ // Not detached — the eval-trigger hook already spawns this in a separate
+ // node process. Detaching + unref causes the close handler to never fire.
+ const child = spawn("claude", args, {
+ cwd: opts.projectRoot,
+ stdio: ["pipe", "pipe", "pipe"],
+ env: { ...process.env },
+ });
+ // Pipe the prompt via stdin (too large for CLI arg)
+ child.stdin?.write(prompt);
+ child.stdin?.end();
+ let stdout = "";
+ let stderr = "";
+ child.stdout?.on("data", (chunk) => {
+ stdout += chunk.toString();
+ });
+ child.stderr?.on("data", (chunk) => {
+ stderr += chunk.toString();
+ });
+ child.on("close", async (code) => {
+ const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+ try {
+ if (code !== 0) {
+ throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+ }
+ // --output-format json wraps the result; extract the text content and usage
+ let scorecardText = stdout;
+ let usage;
+ try {
+ const jsonOutput = JSON.parse(stdout);
+ scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+ // Capture usage data from claude --print output
+ if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+ const u = jsonOutput.usage ?? {};
+ usage = {
+ costUsd: jsonOutput.total_cost_usd ?? 0,
+ inputTokens: u.input_tokens ?? 0,
+ outputTokens: u.output_tokens ?? 0,
+ cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+ cacheReadTokens: u.cache_read_input_tokens ?? 0,
+ durationMs: jsonOutput.duration_ms ?? 0,
+ };
+ }
+ }
+ catch {
+ // stdout might be raw JSON scorecard already
+ }
+ // Extract JSON from possible markdown code fences
+ const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+ if (jsonMatch?.[1]) {
+ scorecardText = jsonMatch[1];
+ }
+ const scorecard = JSON.parse(scorecardText.trim());
+ if (usage)
+ scorecard.usage = usage;
+ scorecard.telemetryPosted = false;
+ if (opts.evalEndpoint) {
+ await postTelemetry(opts.evalEndpoint, scorecard);
+ scorecard.telemetryPosted = true;
+ }
+ await logWriter.append(scorecard);
+ ingestScorecard(opts.projectRoot, scorecard);
+ }
+ catch (err) {
+ const errorEntry = {
+ version: 1,
+ timestamp: new Date().toISOString(),
+ mode: opts.mode,
+ changeId: opts.changeId,
+ error: true,
+ message: err instanceof Error ? err.message : String(err),
+ };
+ await logWriter.append(errorEntry);
+ }
+ });
+ }
+ /**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+ export async function runEvaluatorSync(opts) {
+ const tracer = initEvalOtel(opts.projectRoot);
+ const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+ const projectGroup = getProjectGroupId(opts.projectRoot);
+ const result = await withSpan(tracer, "eval.run", {
+ changeId: opts.changeId,
+ source,
+ mode: opts.mode,
+ projectGroup,
+ entrypoint: "runEvaluatorSync",
+ }, () => runEvaluatorSyncInner(opts, projectGroup));
+ await shutdownEvalOtel();
+ return result;
+ }
+ async function runEvaluatorSyncInner(opts, projectGroup) {
+ const prompt = buildEvaluatorPrompt({
+ rubric: V1_RUBRIC,
+ changeId: opts.changeId,
+ transcriptPath: opts.transcriptPath,
+ mode: opts.mode,
+ projectGroup,
+ });
+ const allowedTools = [
+ "Read",
+ "Grep",
+ "Glob",
+ "Bash(jj:*)",
+ "Bash(git:*)",
+ "mcp__graphiti__*",
+ "mcp__indusk__*",
+ "mcp__codegraphcontext__*",
+ ];
+ const args = [
+ "--print",
+ "--output-format",
+ "json",
+ "--model",
+ "opus",
+ "--permission-mode",
+ "acceptEdits",
+ "--allowed-tools",
+ allowedTools.join(","),
+ ];
+ return new Promise((resolve) => {
+ const child = spawn("claude", args, {
+ cwd: opts.projectRoot,
+ stdio: ["pipe", "pipe", "pipe"],
+ env: { ...process.env },
+ });
+ child.stdin?.write(prompt);
+ child.stdin?.end();
+ let stdout = "";
+ let stderr = "";
+ child.stdout?.on("data", (chunk) => {
+ stdout += chunk.toString();
+ });
+ child.stderr?.on("data", (chunk) => {
+ stderr += chunk.toString();
+ });
+ child.on("close", async (code) => {
+ const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+ try {
+ if (code !== 0) {
+ throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+ }
+ let scorecardText = stdout;
+ let syncUsage;
+ try {
+ const jsonOutput = JSON.parse(stdout);
+ scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+ if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+ const u = jsonOutput.usage ?? {};
+ syncUsage = {
+ costUsd: jsonOutput.total_cost_usd ?? 0,
+ inputTokens: u.input_tokens ?? 0,
+ outputTokens: u.output_tokens ?? 0,
+ cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+ cacheReadTokens: u.cache_read_input_tokens ?? 0,
+ durationMs: jsonOutput.duration_ms ?? 0,
+ };
+ }
+ }
+ catch {
+ // raw JSON
+ }
+ const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+ if (jsonMatch?.[1]) {
+ scorecardText = jsonMatch[1];
+ }
+ const scorecard = JSON.parse(scorecardText.trim());
+ if (syncUsage)
+ scorecard.usage = syncUsage;
+ scorecard.telemetryPosted = false;
+ if (opts.evalEndpoint) {
+ await postTelemetry(opts.evalEndpoint, scorecard);
+ scorecard.telemetryPosted = true;
+ }
+ await logWriter.append(scorecard);
+ ingestScorecard(opts.projectRoot, scorecard);
+ resolve(scorecard);
+ }
+ catch (err) {
+ const errorEntry = {
+ version: 1,
+ timestamp: new Date().toISOString(),
+ mode: opts.mode,
+ changeId: opts.changeId,
+ error: true,
+ message: err instanceof Error ? err.message : String(err),
+ };
+ await logWriter.append(errorEntry);
+ resolve(errorEntry);
+ }
+ });
+ });
+ }
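For readers following the close-handler parsing above, the sketch below shows the rough shape of the `claude --print --output-format json` envelope the runner expects. Only the field names the code reads are taken from the diff; the values are invented and the real CLI output may carry additional fields.

```ts
// Illustrative envelope, assumption-only values; field names mirror what the
// runner reads (result, total_cost_usd, duration_ms, usage.*).
const exampleClaudeOutput = {
  // Scorecard text; the runner also strips a fenced json block if present.
  result: '{ "questions": [], "graphitiWrites": 0 }',
  total_cost_usd: 0.42,
  duration_ms: 95000,
  usage: {
    input_tokens: 12345,
    output_tokens: 678,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 9000,
  },
};
```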
@@ -0,0 +1,61 @@
+ /**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+ import { type Attributes, type Span, type Tracer } from "@opentelemetry/api";
+ export interface EvalOtelConfig {
+ enabled: boolean;
+ endpoint: string | null;
+ dataset: string;
+ }
+ /**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ * else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+ export declare function isEvalOtelEnabled(projectRoot: string): EvalOtelConfig;
+ /**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+ export declare function initEvalOtel(projectRoot: string): Tracer;
+ /**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+ export declare function withSpan<T>(tracer: Tracer, name: string, attrs: Attributes | undefined, fn: (span: Span) => Promise<T> | T): Promise<T>;
+ /**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+ export declare function shutdownEvalOtel(): Promise<void>;
+ /**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+ export declare function __resetEvalOtelForTests(): void;
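A minimal usage sketch of the tracing helpers declared above, assuming tracing has been enabled via the env vars or config keys described in the header comment; the span name and attribute values are placeholders.

```ts
// Hypothetical lifecycle: init (no-op when disabled), wrap a step, flush.
import { initEvalOtel, withSpan, shutdownEvalOtel } from "./otel.js";

const tracer = initEvalOtel(process.cwd()); // no-op tracer when disabled
const answer = await withSpan(tracer, "eval.example_step", { changeId: "abc12345" }, async () => {
  // ...do the actual work; thrown errors are recorded on the span and re-thrown
  return 42;
});
await shutdownEvalOtel(); // flush batched spans before the process exits
console.info(answer);
```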
@@ -0,0 +1,189 @@
+ /**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+ import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
+ import { join, resolve } from "node:path";
+ import { SpanStatusCode, trace } from "@opentelemetry/api";
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
+ import { resourceFromAttributes } from "@opentelemetry/resources";
+ import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
+ import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+ import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+ const TRACER_NAME = "@infinitedusky/indusk-mcp/eval";
+ const SERVICE_NAME = "indusk-eval-agent";
+ function syslog(projectRoot, msg) {
+ try {
+ const logDir = resolve(projectRoot, ".indusk", "eval");
+ mkdirSync(logDir, { recursive: true });
+ appendFileSync(resolve(logDir, "system.log"), `${new Date().toISOString()} ${msg}\n`);
+ }
+ catch {
+ // logging should never break anything
+ }
+ }
+ const DEFAULT_DATASET = "agent";
+ /**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ * else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+ export function isEvalOtelEnabled(projectRoot) {
+ const envFlag = process.env.INDUSK_EVAL_OTEL;
+ const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? null;
+ const envDataset = process.env.INDUSK_EVAL_OTEL_DATASET;
+ let configEnabled = false;
+ let configDataset;
+ const configPath = join(projectRoot, ".indusk", "config.json");
+ if (existsSync(configPath)) {
+ try {
+ const config = JSON.parse(readFileSync(configPath, "utf-8"));
+ configEnabled = config?.eval?.otel?.enabled === true;
+ if (typeof config?.eval?.otel?.dataset === "string") {
+ configDataset = config.eval.otel.dataset;
+ }
+ }
+ catch {
+ // malformed config — treat as disabled
+ }
+ }
+ const envForcesEnabled = envFlag !== undefined && envFlag !== "" && envFlag !== "0" && envFlag.toLowerCase() !== "false";
+ const dataset = envDataset && envDataset !== "" ? envDataset : (configDataset ?? DEFAULT_DATASET);
+ return {
+ enabled: envForcesEnabled || configEnabled,
+ endpoint,
+ dataset,
+ };
+ }
+ let activeProvider = null;
+ /**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+ export function initEvalOtel(projectRoot) {
+ const { enabled, endpoint, dataset } = isEvalOtelEnabled(projectRoot);
+ if (!enabled) {
+ return trace.getTracer(TRACER_NAME);
+ }
+ if (!endpoint) {
+ syslog(projectRoot, "eval.otel.enabled but OTEL_EXPORTER_OTLP_ENDPOINT is unset — falling back to no-op tracer");
+ return trace.getTracer(TRACER_NAME);
+ }
+ if (activeProvider) {
+ return trace.getTracer(TRACER_NAME);
+ }
+ // Build exporter headers. We pass Authorization and Dash0-Dataset in the
+ // constructor rather than relying on OTEL_EXPORTER_OTLP_HEADERS env parsing,
+ // because the OTel SDK's env parser has proven unreliable for tokens with
+ // spaces (e.g., "Bearer auth_xxx") in practice — the header silently fails
+ // to attach and exports retry-loop to no effect.
+ //
+ // Precedence:
+ // 1. User-set `OTEL_EXPORTER_OTLP_HEADERS` env (handled by SDK, takes top precedence per OTel spec)
+ // 2. Explicit constructor headers below (our defaults)
+ //
+ // DASH0_API_TOKEN is the conventional name we inherit from the Dash0 CLI.
+ // If set, we build a Bearer header. If not, we rely on the user's env.
+ const headers = {
+ "Dash0-Dataset": dataset,
+ };
+ if (process.env.DASH0_API_TOKEN) {
+ headers.Authorization = `Bearer ${process.env.DASH0_API_TOKEN}`;
+ }
+ try {
+ const exporter = new OTLPTraceExporter({
+ url: endpoint.endsWith("/v1/traces") ? endpoint : `${endpoint.replace(/\/$/, "")}/v1/traces`,
+ headers,
+ });
+ const provider = new NodeTracerProvider({
+ resource: resourceFromAttributes({
+ [ATTR_SERVICE_NAME]: SERVICE_NAME,
+ }),
+ spanProcessors: [new BatchSpanProcessor(exporter)],
+ });
+ provider.register();
+ activeProvider = provider;
+ syslog(projectRoot, `eval.otel initialized — endpoint: ${endpoint}, dataset: ${dataset}`);
+ }
+ catch (err) {
+ const message = err instanceof Error ? err.message : String(err);
+ syslog(projectRoot, `eval.otel init failed — falling back to no-op tracer: ${message}`);
+ }
+ return trace.getTracer(TRACER_NAME);
+ }
+ /**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+ export async function withSpan(tracer, name, attrs, fn) {
+ return tracer.startActiveSpan(name, { attributes: attrs ?? {} }, async (span) => {
+ try {
+ return await fn(span);
+ }
+ catch (err) {
+ span.recordException(err instanceof Error ? err : new Error(String(err)));
+ span.setStatus({ code: SpanStatusCode.ERROR });
+ throw err;
+ }
+ finally {
+ span.end();
+ }
+ });
+ }
+ /**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+ export async function shutdownEvalOtel() {
+ if (!activeProvider)
+ return;
+ try {
+ await activeProvider.forceFlush();
+ await activeProvider.shutdown();
+ }
+ catch {
+ // shutdown is best-effort
+ }
+ finally {
+ activeProvider = null;
+ }
+ }
+ /**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+ export function __resetEvalOtelForTests() {
+ // Tear down any provider left over from a previous test. This un-registers
+ // from the global OTel API, so `trace.getTracer()` falls back to the no-op
+ // tracer until a new provider is registered.
+ if (activeProvider) {
+ void activeProvider.shutdown().catch(() => { });
+ }
+ activeProvider = null;
+ trace.disable();
+ }
@@ -0,0 +1,20 @@
+ /**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+ import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+ /**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+ export declare function runPersistentEval(opts: {
+ projectRoot: string;
+ changeId: string;
+ transcriptPath: string;
+ mode: "eval" | "baseline";
+ evalEndpoint?: string;
+ }): Promise<EvalScorecard | EvalErrorEntry>;
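A minimal sketch of calling the persistent entry point declared above; the import path and literal option values are assumptions, while the first-call catchup and later session resume come from the doc comment.

```ts
// Hypothetical caller; a second invocation with a new changeId would resume
// the stored session instead of repeating the catchup.
import { runPersistentEval } from "./persistent-evaluator.js";

const scorecardOrError = await runPersistentEval({
  projectRoot: process.cwd(),
  changeId: "abc12345",
  transcriptPath: "/tmp/session-transcript.jsonl", // hypothetical path
  mode: "eval",
  evalEndpoint: "https://example.invalid/telemetry", // optional fire-and-forget POST target
});
console.info("error" in scorecardOrError ? "eval failed" : "scorecard written");
```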
@@ -0,0 +1,244 @@
+ /**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+ import { spawn } from "node:child_process";
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+ import { dirname, join } from "node:path";
+ import { getProjectGroupId } from "../config.js";
+ import { readUnprocessedHighlights } from "../highlights/highlights.js";
+ import { ingestScorecard } from "./findings.js";
+ import { EvalLogWriter } from "./log-writer.js";
+ import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+ import { buildEvaluatorPrompt } from "./prompt-builder.js";
+ import { V1_RUBRIC } from "./rubric.js";
+ function getSessionPath(projectRoot) {
+ return join(projectRoot, ".indusk", "eval", "evaluator-session.json");
+ }
+ function getEvalLogPath(projectRoot) {
+ return join(projectRoot, ".indusk", "eval", "results.log");
+ }
+ function readSession(projectRoot) {
+ const path = getSessionPath(projectRoot);
+ if (!existsSync(path))
+ return null;
+ try {
+ return JSON.parse(readFileSync(path, "utf8"));
+ }
+ catch {
+ return null;
+ }
+ }
+ function writeSession(projectRoot, session) {
+ const path = getSessionPath(projectRoot);
+ mkdirSync(dirname(path), { recursive: true });
+ writeFileSync(path, `${JSON.stringify(session, null, 2)}\n`);
+ }
+ function clearSession(projectRoot) {
+ const path = getSessionPath(projectRoot);
+ if (existsSync(path)) {
+ const { unlinkSync } = require("node:fs");
+ unlinkSync(path);
+ }
+ }
+ const ALLOWED_TOOLS = [
+ "Read",
+ "Grep",
+ "Glob",
+ "Bash(jj:*)",
+ "Bash(git:*)",
+ "mcp__graphiti__*",
+ "mcp__indusk__*",
+ "mcp__codegraphcontext__*",
+ ];
+ function parseClaudeOutput(stdout) {
+ let scorecardText = stdout;
+ let usage;
+ let sessionId;
+ try {
+ const jsonOutput = JSON.parse(stdout);
+ scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+ sessionId = jsonOutput.session_id;
+ if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+ const u = jsonOutput.usage ?? {};
+ usage = {
+ costUsd: jsonOutput.total_cost_usd ?? 0,
+ inputTokens: u.input_tokens ?? 0,
+ outputTokens: u.output_tokens ?? 0,
+ cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+ cacheReadTokens: u.cache_read_input_tokens ?? 0,
+ durationMs: jsonOutput.duration_ms ?? 0,
+ };
+ }
+ }
+ catch {
+ // raw output
+ }
+ const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+ if (jsonMatch?.[1]) {
+ scorecardText = jsonMatch[1];
+ }
+ return { scorecardText, usage, sessionId };
+ }
+ async function spawnClaude(args, prompt, cwd) {
+ return new Promise((resolve) => {
+ const child = spawn("claude", args, {
+ cwd,
+ stdio: ["pipe", "pipe", "pipe"],
+ env: { ...process.env },
+ });
+ child.stdin?.write(prompt);
+ child.stdin?.end();
+ let stdout = "";
+ let stderr = "";
+ child.stdout?.on("data", (chunk) => {
+ stdout += chunk.toString();
+ });
+ child.stderr?.on("data", (chunk) => {
+ stderr += chunk.toString();
+ });
+ child.on("close", (code) => {
+ resolve({ stdout, stderr, code });
+ });
+ });
+ }
+ /**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+ export async function runPersistentEval(opts) {
+ const tracer = initEvalOtel(opts.projectRoot);
+ const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+ const projectGroup = getProjectGroupId(opts.projectRoot);
+ // Peek at the highlights queue before spawning — gives us observability
+ // into how much work the Claude subprocess will do without having to
+ // span per-highlight (which would require Claude-Code-internal OTel).
+ let unprocessedCount = 0;
+ try {
+ unprocessedCount = readUnprocessedHighlights(opts.projectRoot).length;
+ }
+ catch {
+ // reading the queue is best-effort — never block the evaluator
+ }
+ const result = await withSpan(tracer, "eval.run", {
+ changeId: opts.changeId,
+ source,
+ mode: opts.mode,
+ projectGroup,
+ "highlights.unprocessed_count": unprocessedCount,
+ }, async (rootSpan) => {
+ const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+ const session = await withSpan(tracer, "eval.read_session", undefined, () => readSession(opts.projectRoot));
+ rootSpan.setAttribute("resumed", session !== null);
+ try {
+ const { args, prompt } = await withSpan(tracer, "eval.build_prompt", { resumed: session !== null }, () => {
+ if (session) {
+ const resumePrompt = `Evaluate a new commit. Change ID: ${opts.changeId}
+
+ Run \`jj diff -r ${opts.changeId}\` to see what changed. Then answer the same evaluation questions as before. Read the changed files for full context.
+
+ Output ONLY the JSON scorecard as before — no commentary.`;
+ return {
+ args: [
+ "--print",
+ "--output-format",
+ "json",
+ "--resume",
+ session.sessionId,
+ "--allowed-tools",
+ ALLOWED_TOOLS.join(","),
+ ],
+ prompt: resumePrompt,
+ };
+ }
+ return {
+ args: [
+ "--print",
+ "--output-format",
+ "json",
+ "--model",
+ "opus",
+ "--permission-mode",
+ "acceptEdits",
+ "--allowed-tools",
+ ALLOWED_TOOLS.join(","),
+ ],
+ prompt: buildEvaluatorPrompt({
+ rubric: V1_RUBRIC,
+ changeId: opts.changeId,
+ transcriptPath: opts.transcriptPath,
+ mode: opts.mode,
+ projectGroup,
+ }),
+ };
+ });
+ const claudeResult = await withSpan(tracer, "eval.spawn_claude", {
+ "args.resumed": session !== null,
+ "args.model": session ? "(resumed)" : "opus",
+ }, async (span) => {
+ const spawned = await spawnClaude(args, prompt, opts.projectRoot);
+ span.setAttribute("exit.code", spawned.code ?? -1);
+ if (spawned.code !== 0) {
+ span.setAttribute("exit.stderr_tail", spawned.stderr.slice(-500));
+ }
+ return spawned;
+ });
+ if (claudeResult.code !== 0) {
+ if (session) {
+ await withSpan(tracer, "eval.clear_stale_session", undefined, () => clearSession(opts.projectRoot));
+ // Recurse — the retry produces its own root span
+ return runPersistentEval(opts);
+ }
+ throw new Error(`claude exited with code ${claudeResult.code}: ${claudeResult.stderr.slice(0, 500)}`);
+ }
+ const parsed = await withSpan(tracer, "eval.parse_output", undefined, (span) => {
+ const out = parseClaudeOutput(claudeResult.stdout);
+ if (out.sessionId)
+ span.setAttribute("session_id", out.sessionId);
+ if (out.usage) {
+ span.setAttribute("cost_usd", out.usage.costUsd);
+ span.setAttribute("input_tokens", out.usage.inputTokens);
+ span.setAttribute("output_tokens", out.usage.outputTokens);
+ }
+ return out;
+ });
+ const scorecard = JSON.parse(parsed.scorecardText.trim());
+ if (parsed.usage)
+ scorecard.usage = parsed.usage;
+ scorecard.telemetryPosted = false;
+ await withSpan(tracer, "eval.update_session", undefined, () => {
+ const newSession = {
+ sessionId: parsed.sessionId ?? session?.sessionId ?? "unknown",
+ createdAt: session?.createdAt ?? new Date().toISOString(),
+ lastEvalAt: new Date().toISOString(),
+ evalCount: (session?.evalCount ?? 0) + 1,
+ };
+ writeSession(opts.projectRoot, newSession);
+ });
+ await withSpan(tracer, "eval.write_scorecard", undefined, async () => {
+ await logWriter.append(scorecard);
+ ingestScorecard(opts.projectRoot, scorecard);
+ });
+ return scorecard;
+ }
+ catch (err) {
+ const errorEntry = {
+ version: 1,
+ timestamp: new Date().toISOString(),
+ mode: opts.mode,
+ changeId: opts.changeId,
+ error: true,
+ message: err instanceof Error ? err.message : String(err),
+ };
+ await logWriter.append(errorEntry);
+ return errorEntry;
+ }
+ });
+ // Flush OTel so batched spans ship before the detached process exits.
+ await shutdownEvalOtel();
+ return result;
+ }
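Based on the fields `writeSession` persists above, `.indusk/eval/evaluator-session.json` ends up looking roughly like the sketch below; the values are illustrative only.

```ts
// Example session state after two evals; field names come from the newSession
// object written above, values are made up.
const exampleSession = {
  sessionId: "c0ffee00-0000-0000-0000-000000000000", // session_id returned by claude --print
  createdAt: "2025-01-01T12:00:00.000Z",
  lastEvalAt: "2025-01-01T13:30:00.000Z",
  evalCount: 2,
};
```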
@@ -1,11 +1,11 @@
  /**
- * Builds the judge agent's system prompt.
+ * Builds the evaluator agent's system prompt.
  *
- * The prompt instructs the judge to: do catchup, read the transcript, read the
+ * The prompt instructs the evaluator to: do catchup, read the transcript, read the
  * diff itself via jj, answer each rubric question, write findings to Graphiti
  * (eval mode only), and output a JSON scorecard.
  *
- * The diff is NOT embedded in the prompt — the judge reads it via tool calls.
+ * The diff is NOT embedded in the prompt — the evaluator reads it via tool calls.
  * This keeps the prompt small regardless of commit size.
  */
  import type { RubricQuestion } from "./types.js";
@@ -16,4 +16,4 @@ export interface PromptBuilderOptions {
  mode: "eval" | "baseline";
  projectGroup: string;
  }
- export declare function buildJudgePrompt(opts: PromptBuilderOptions): string;
+ export declare function buildEvaluatorPrompt(opts: PromptBuilderOptions): string;
@@ -1,14 +1,14 @@
  /**
- * Builds the judge agent's system prompt.
+ * Builds the evaluator agent's system prompt.
  *
- * The prompt instructs the judge to: do catchup, read the transcript, read the
+ * The prompt instructs the evaluator to: do catchup, read the transcript, read the
  * diff itself via jj, answer each rubric question, write findings to Graphiti
  * (eval mode only), and output a JSON scorecard.
  *
- * The diff is NOT embedded in the prompt — the judge reads it via tool calls.
+ * The diff is NOT embedded in the prompt — the evaluator reads it via tool calls.
  * This keeps the prompt small regardless of commit size.
  */
- export function buildJudgePrompt(opts) {
+ export function buildEvaluatorPrompt(opts) {
  const questionsBlock = opts.rubric
  .map((q, i) => `${i + 1}. **${q.id}**: ${q.question}\n Guidance: ${q.guidance}`)
  .join("\n\n");
@@ -62,7 +62,7 @@ If the tool is unavailable, skip silently and set graphitiWrites to 0.`
  ### Step 6: Graphiti writes

  Baseline mode — do NOT write to Graphiti. Set graphitiWrites to 0.`;
- return `You are the InDusk evaluation judge. Your job is to evaluate the quality of work done by an AI agent on a software project.
+ return `You are the InDusk eval agent (evaluator). Your job is to evaluate the quality of work done by an AI agent on a software project.

  You have full read access to the codebase, MCP tools (Graphiti, code graph, InDusk), and the session transcript. You cannot edit files.

@@ -2,7 +2,7 @@
  * Types for the context system evaluation.
  *
  * The scorecard is the unit of evaluation — one per commit. Questions are the
- * rubric, defined in rubric.ts and answered by the judge agent.
+ * rubric, defined in rubric.ts and answered by the eval agent (evaluator).
  */
  export interface RubricQuestion {
  id: string;
@@ -2,7 +2,7 @@
  * Types for the context system evaluation.
  *
  * The scorecard is the unit of evaluation — one per commit. Questions are the
- * rubric, defined in rubric.ts and answered by the judge agent.
+ * rubric, defined in rubric.ts and answered by the eval agent (evaluator).
  */
  export function isScorecard(entry) {
  return (!("error" in entry) && "questions" in entry && Array.isArray(entry.questions));
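The full `EvalScorecard` and `EvalErrorEntry` definitions live in `types.d.ts`, which this diff does not show; the sketch below lists only the fields that code elsewhere in this diff reads or writes, as a partial, inferred view.

```ts
// Partial, inferred view of the eval log entry shapes — not the package's
// actual type definitions.
interface EvalUsage {
  costUsd: number;
  inputTokens: number;
  outputTokens: number;
  cacheCreationTokens: number;
  cacheReadTokens: number;
  durationMs: number;
}

interface EvalErrorEntryLike {
  version: 1;
  timestamp: string; // ISO-8601
  mode: "eval" | "baseline";
  changeId: string;
  error: true;
  message: string;
}

interface EvalScorecardLike {
  questions: unknown[]; // per-question results; detailed shape not in this diff
  usage?: EvalUsage; // attached from claude --print usage data
  telemetryPosted?: boolean;
}
```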
@@ -4,13 +4,13 @@
  * Dual-mode eval trigger.
  *
  * 1) PostToolUse hook mode (default): fires on Bash tool calls containing
- * `jj describe`. Reads the hook event JSON from stdin. Spawns the judge
+ * `jj describe`. Reads the hook event JSON from stdin. Spawns the evaluator
  * runner as a detached background process.
  *
  * 2) CLI mode (`--source <tag>`): invoked manually by skills (e.g., handoff)
  * at session end. No stdin read, no `jj describe` filter. Uses the current
- * @ change and passes the source tag to the judge via INDUSK_EVAL_SOURCE.
- * The judge may skip diff-based scoring when source != "commit" but still
+ * @ change and passes the source tag to the evaluator via INDUSK_EVAL_SOURCE.
+ * The evaluator may skip diff-based scoring when source != "commit" but still
  * processes the highlights queue.
  *
  * Exit 0 always — this is advisory, not blocking.
@@ -142,9 +142,12 @@ const transcriptPath =
  const hookDir = dirname(fileURLToPath(import.meta.url));
  const candidates = [
  // Source repo (apps/indusk-mcp/hooks/ → apps/indusk-mcp/dist/)
- resolve(hookDir, "../dist/lib/eval/judge-runner.js"),
+ resolve(hookDir, "../dist/lib/eval/evaluator-runner.js"),
  // Installed package (hooks/ → dist/)
- resolve(hookDir, "../../node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/judge-runner.js"),
+ resolve(
+ hookDir,
+ "../../node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js",
+ ),
  // Global npx cache
  ...(() => {
  try {
@@ -153,24 +156,24 @@ const candidates = [
  return [
  resolve(
  dirname(which),
- "../lib/node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/judge-runner.js",
+ "../lib/node_modules/@infinitedusky/indusk-mcp/dist/lib/eval/evaluator-runner.js",
  ),
  ];
  } catch {}
  return [];
  })(),
  ];
- let judgeRunnerPath = null;
+ let evaluatorRunnerPath = null;
  for (const c of candidates) {
  syslog(projectRoot, `candidate: ${c} — ${existsSync(c) ? "found" : "missing"}`);
  if (existsSync(c)) {
- judgeRunnerPath = c;
+ evaluatorRunnerPath = c;
  break;
  }
  }
- syslog(projectRoot, `judgeRunnerPath: ${judgeRunnerPath ?? "NOT FOUND"}`);
+ syslog(projectRoot, `evaluatorRunnerPath: ${evaluatorRunnerPath ?? "NOT FOUND"}`);

- if (!judgeRunnerPath) {
+ if (!evaluatorRunnerPath) {
  // Can't find the package — log error and exit
  const { mkdirSync, appendFileSync } = await import("node:fs");
  const logPath = resolve(projectRoot, ".indusk", "eval", "results.log");
@@ -182,14 +185,14 @@ if (!judgeRunnerPath) {
  changeId,
  error: true,
  message:
- "Could not find @infinitedusky/indusk-mcp package — eval judge not available. Run: npm i -g @infinitedusky/indusk-mcp",
+ "Could not find @infinitedusky/indusk-mcp package — eval evaluator not available. Run: npm i -g @infinitedusky/indusk-mcp",
  });
  appendFileSync(logPath, `${entry}\n`, "utf8");
  process.exit(0);
  }

  // Surface unresolved findings from previous evals
- const findingsPath = judgeRunnerPath.replace("judge-runner.js", "findings.js");
+ const findingsPath = evaluatorRunnerPath.replace("evaluator-runner.js", "findings.js");
  if (existsSync(findingsPath)) {
  try {
  const { getUnresolvedFindings } = await import(findingsPath);
@@ -207,18 +210,23 @@ if (existsSync(findingsPath)) {
  }
  }

- // Use persistent judge — resumes existing session if available, otherwise does full catchup.
- const persistentJudgePath = judgeRunnerPath.replace("judge-runner.js", "persistent-judge.js");
- const useModule = existsSync(persistentJudgePath) ? persistentJudgePath : judgeRunnerPath;
- const useFunction = existsSync(persistentJudgePath) ? "runPersistentEval" : "runJudgeSync";
+ // Use persistent evaluator — resumes existing session if available, otherwise does full catchup.
+ const persistentEvaluatorPath = evaluatorRunnerPath.replace(
+ "evaluator-runner.js",
+ "persistent-evaluator.js",
+ );
+ const useModule = existsSync(persistentEvaluatorPath)
+ ? persistentEvaluatorPath
+ : evaluatorRunnerPath;
+ const useFunction = existsSync(persistentEvaluatorPath) ? "runPersistentEval" : "runEvaluatorSync";

  syslog(
  projectRoot,
- `spawning judge — module: ${useModule}, function: ${useFunction}, changeId: ${changeId}`,
+ `spawning evaluator — module: ${useModule}, function: ${useFunction}, changeId: ${changeId}`,
  );

  const syslogPath = resolve(projectRoot, ".indusk", "eval", "system.log");
- const judgeScript = `
+ const evaluatorScript = `
  const fs = require("fs");
  const path = require("path");
  function syslog(msg) {
@@ -227,10 +235,10 @@ function syslog(msg) {
  fs.appendFileSync("${syslogPath}", new Date().toISOString() + " " + msg + "\\n");
  } catch {}
  }
- syslog("judge process started — changeId: ${changeId}");
+ syslog("evaluator process started — changeId: ${changeId}");
  import("${useModule}")
  .then(m => {
- syslog("judge module loaded — calling ${useFunction}");
+ syslog("evaluator module loaded — calling ${useFunction}");
  return m.${useFunction}({
  projectRoot: ${JSON.stringify(projectRoot)},
  changeId: ${JSON.stringify(changeId)},
@@ -241,11 +249,11 @@ import("${useModule}")
  })
  .then((result) => {
  const hasError = result && result.error;
- syslog("judge completed — " + (hasError ? "error: " + result.message : "scorecard written"));
+ syslog("evaluator completed — " + (hasError ? "error: " + result.message : "scorecard written"));
  process.exit(0);
  })
  .catch(err => {
- syslog("judge crashed — " + (err.message || String(err)));
+ syslog("evaluator crashed — " + (err.message || String(err)));
  const logPath = path.join(${JSON.stringify(projectRoot)}, ".indusk", "eval", "results.log");
  fs.mkdirSync(path.dirname(logPath), { recursive: true });
  const entry = JSON.stringify({
@@ -261,7 +269,7 @@ import("${useModule}")
  });
  `;

- const child = spawn("node", ["--input-type=module", "-e", judgeScript], {
+ const child = spawn("node", ["--input-type=module", "-e", evaluatorScript], {
  cwd: projectRoot,
  stdio: "ignore",
  detached: true,
@@ -270,24 +278,24 @@ const child = spawn("node", ["--input-type=module", "-e", judgeScript], {

  child.unref();

- syslog(projectRoot, `judge spawned — source: ${source}, pid: ${child.pid}`);
+ syslog(projectRoot, `evaluator spawned — source: ${source}, pid: ${child.pid}`);

  if (cliSource !== null) {
  // CLI mode — write a brief notice to stderr and exit
  process.stderr.write(
- `📊 Eval judge spawned (source=${source}) for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
+ `📊 Eval evaluator spawned (source=${source}) for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
  );
  } else {
  // Hook mode — output structured hook response
  const output = JSON.stringify({
  hookSpecificOutput: {
  hookEventName: "PostToolUse",
- message: `Eval judge spawned for change ${changeId.slice(0, 8)}`,
+ message: `Eval evaluator spawned for change ${changeId.slice(0, 8)}`,
  },
  });
  process.stdout.write(output);
  process.stderr.write(
- `📊 Eval judge spawned in background for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
+ `📊 Eval evaluator spawned in background for ${changeId.slice(0, 8)}. Results will appear in .indusk/eval/results.log\n`,
  );
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@infinitedusky/indusk-mcp",
- "version": "1.17.0",
+ "version": "1.18.1",
  "description": "InDusk development system — skills, MCP tools, and CLI for structured AI-assisted development",
  "type": "module",
  "files": [
@@ -28,6 +28,12 @@
  },
  "dependencies": {
  "@modelcontextprotocol/sdk": "^1.12.1",
+ "@opentelemetry/api": "^1.9.0",
+ "@opentelemetry/exporter-trace-otlp-http": "^0.214.0",
+ "@opentelemetry/resources": "^2.6.0",
+ "@opentelemetry/sdk-trace-base": "^2.6.0",
+ "@opentelemetry/sdk-trace-node": "^2.6.0",
+ "@opentelemetry/semantic-conventions": "^1.40.0",
  "commander": "^13.0.0",
  "falkordb": "^6.6.2",
  "glob": "^11.0.0",
@@ -2,19 +2,19 @@ You can evaluate the current session's work quality on demand.

  ## When to Use

- - `/eval review` — run the eval judge against the current working copy
+ - `/eval review` — run the eval evaluator against the current working copy
  - Mid-session quality check before committing
  - When you want to see how the work scores against the rubric

  ## What It Does

- Runs the same judge process as the automatic eval hook, but against uncommitted changes instead of a committed change. Uses `jj diff` for the current working copy diff and the current session's transcript.
+ Runs the same evaluator process as the automatic eval hook, but against uncommitted changes instead of a committed change. Uses `jj diff` for the current working copy diff and the current session's transcript.

  ## Process

  1. Get the current diff: `jj diff`
- 2. Build the judge prompt with the v1 rubric
- 3. Run the judge (uses `runJudgeSync` from `apps/indusk-mcp/src/lib/eval/judge-runner.ts`)
+ 2. Build the evaluator prompt with the v1 rubric
+ 3. Run the evaluator (uses `runEvaluatorSync` from `apps/indusk-mcp/src/lib/eval/evaluator-runner.ts`)
  4. Display the scorecard inline
  5. Append results to `.indusk/eval/results.log`

@@ -23,7 +23,7 @@ Runs the same judge process as the automatic eval hook, but against uncommitted
  When the user says `/eval review` or asks for a quality check:

  1. Get the current change ID: `jj log -r @ --no-graph -T change_id`
- 2. Call `runJudgeSync` with mode `"eval"` and the current transcript path
+ 2. Call `runEvaluatorSync` with mode `"eval"` and the current transcript path
  3. Present the scorecard to the user:
  - Overall summary
  - Per-question results with evidence
@@ -32,6 +32,6 @@ When the user says `/eval review` or asks for a quality check:
  ## Important

  - This is a quality check, not a blocker — findings are informational
- - The judge has full MCP access and does a real catchup
+ - The evaluator has full MCP access and does a real catchup
  - Results are logged to the same eval log as automatic evaluations
- - If the judge fails, show the error — don't silently skip
+ - If the evaluator fails, show the error — don't silently skip
package/skills/handoff.md CHANGED
@@ -62,7 +62,7 @@ Run this from the project root:
  node .claude/hooks/eval-trigger.js --source handoff
  ```

- The trigger spawns the judge in the background and returns immediately — it never blocks handoff. The judge processes the highlights queue and, because `INDUSK_EVAL_SOURCE=handoff` is set in the environment, may skip diff-based rubric scoring (there's no new commit). Highlights still get materialized into Graphiti episodes.
+ The trigger spawns the evaluator in the background and returns immediately — it never blocks handoff. The evaluator processes the highlights queue and, because `INDUSK_EVAL_SOURCE=handoff` is set in the environment, may skip diff-based rubric scoring (there's no new commit). Highlights still get materialized into Graphiti episodes.

  If the hook isn't installed or Node isn't on PATH, the handoff still succeeds — the highlights remain queued for the next `jj describe` in a future session.