@infinitedusky/indusk-mcp 1.16.1 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/dist/bin/commands/eval.js +2 -2
  2. package/dist/bin/commands/extensions.js +5 -2
  3. package/dist/bin/commands/init-docs.js +2 -2
  4. package/dist/lib/eval/evaluator-runner.d.ts +28 -0
  5. package/dist/lib/eval/evaluator-runner.js +266 -0
  6. package/dist/lib/eval/otel.d.ts +61 -0
  7. package/dist/lib/eval/otel.js +177 -0
  8. package/dist/lib/eval/persistent-evaluator.d.ts +20 -0
  9. package/dist/lib/eval/persistent-evaluator.js +244 -0
  10. package/dist/lib/eval/prompt-builder.d.ts +4 -4
  11. package/dist/lib/eval/prompt-builder.js +36 -11
  12. package/dist/lib/eval/types.d.ts +1 -1
  13. package/dist/lib/eval/types.js +1 -1
  14. package/dist/lib/highlights/highlights.d.ts +48 -0
  15. package/dist/lib/highlights/highlights.js +136 -0
  16. package/dist/lib/semantic-graph/index.d.ts +1 -1
  17. package/dist/lib/trajectory/audit.js +4 -4
  18. package/dist/server/index.js +2 -0
  19. package/dist/tools/highlight-tools.d.ts +18 -0
  20. package/dist/tools/highlight-tools.js +78 -0
  21. package/hooks/check-catchup.js +18 -7
  22. package/hooks/eval-trigger.js +94 -50
  23. package/hooks/gate-reminder.js +1 -3
  24. package/package.json +7 -1
  25. package/skills/eval-review.md +7 -7
  26. package/skills/handoff.md +14 -0
  27. package/skills/highlight.md +50 -0
  28. package/skills/planner.md +12 -16
  29. package/skills/retrospective.md +23 -17
  30. package/skills/work.md +8 -14
  31. package/templates/FullscreenDiagram.vue +3 -3
  32. package/templates/filtering-exporter.ts +3 -16
  33. package/templates/instrumentation.ts +4 -5
  34. package/templates/instrumentation.web.ts +19 -15
  35. package/templates/logger.ts +1 -1
package/dist/bin/commands/eval.js
@@ -212,7 +212,7 @@ export async function evalBaseline(projectRoot, opts) {
     }
     // Run the smart evaluator against the baseline
     console.info("Running smart evaluator against baseline...");
-    const { runJudgeSync } = await import("../../lib/eval/judge-runner.js");
+    const { runEvaluatorSync } = await import("../../lib/eval/evaluator-runner.js");
     let changeId;
     try {
         changeId = execSync("jj log -r @ --no-graph -T change_id", {
@@ -223,7 +223,7 @@ export async function evalBaseline(projectRoot, opts) {
     catch {
         changeId = "baseline-unknown";
     }
-    const evalResult = await runJudgeSync({
+    const evalResult = await runEvaluatorSync({
        projectRoot: worktreePath,
        changeId,
        transcriptPath: "(baseline — no transcript)",
package/dist/bin/commands/extensions.js
@@ -374,7 +374,7 @@ export async function extensionsUpdate(projectRoot, names) {
             continue;
         try {
             if (!ext.manifest._source) {
-                if (names && names.includes(name)) {
+                if (names?.includes(name)) {
                     console.info(` ${name}: built-in extension — updated via package update, not extensions update`);
                 }
                 continue;
@@ -633,7 +633,10 @@ function printMcpInstructions(name, manifest) {
     const needsAuth = server.headers && Object.keys(server.headers).length > 0;
     // Remove first, then add — ensures clean state
     try {
-        execSync(`claude mcp remove -s project ${name}`, { timeout: 10000, stdio: ["ignore", "pipe", "pipe"] });
+        execSync(`claude mcp remove -s project ${name}`, {
+            timeout: 10000,
+            stdio: ["ignore", "pipe", "pipe"],
+        });
     }
     catch {
         // not registered yet, fine
package/dist/bin/commands/init-docs.js
@@ -24,7 +24,7 @@ export async function initDocs(projectRoot) {
         mkdirSync(join(docsDir, dir), { recursive: true });
     }
     // package.json
-    writeFileSync(join(docsDir, "package.json"), JSON.stringify({
+    writeFileSync(join(docsDir, "package.json"), `${JSON.stringify({
        name: `${projectName}-docs`,
        version: "0.1.0",
        private: true,
@@ -42,7 +42,7 @@ export async function initDocs(projectRoot) {
         "vitepress-plugin-mermaid": "^2.0.10",
         vue: "^3.4.15",
     },
-    }, null, "\t") + "\n");
+    }, null, "\t")}\n`);
     // .vitepress/config.ts
     writeFileSync(join(docsDir, "src/.vitepress/config.ts"), `import { defineConfig } from "vitepress";
import llmstxt from "vitepress-plugin-llms";
package/dist/lib/eval/evaluator-runner.d.ts
@@ -0,0 +1,28 @@
+/**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+export interface EvaluatorRunOptions {
+    projectRoot: string;
+    changeId: string;
+    transcriptPath: string;
+    mode: "eval" | "baseline";
+    evalEndpoint?: string;
+}
+/**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+export declare function runEvaluatorBackground(opts: EvaluatorRunOptions): void;
+/**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+export declare function runEvaluatorSync(opts: EvaluatorRunOptions): Promise<EvalScorecard | EvalErrorEntry>;
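For orientation, a minimal usage sketch of the new synchronous entry point declared above. The paths and change id are invented, and it assumes scorecard objects do not themselves carry an `error: true` flag (only `EvalErrorEntry` does, per the runner implementation below):

```ts
import { runEvaluatorSync } from "./evaluator-runner.js";

const result = await runEvaluatorSync({
    projectRoot: "/tmp/proj",             // hypothetical checkout
    changeId: "abc123",                   // hypothetical jj change id
    transcriptPath: "/tmp/transcript.md", // transcript fed to the evaluator
    mode: "eval",
    // evalEndpoint omitted — no telemetry POST
});

if ("error" in result && result.error) {
    console.error("evaluator failed:", result.message);
} else {
    console.info("scorecard appended to .indusk/eval/results.log:", result);
}
```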
package/dist/lib/eval/evaluator-runner.js
@@ -0,0 +1,266 @@
+/**
+ * Evaluator runner — spawns a background `claude --print` process that evaluates
+ * a commit and writes results to the eval log.
+ *
+ * The evaluator is a detached child process so the calling hook can exit immediately.
+ * Results appear asynchronously in `.indusk/eval/results.log`.
+ */
+import { spawn } from "node:child_process";
+import { join } from "node:path";
+import { getProjectGroupId } from "../config.js";
+import { ingestScorecard } from "./findings.js";
+import { EvalLogWriter } from "./log-writer.js";
+import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
+import { buildEvaluatorPrompt } from "./prompt-builder.js";
+import { V1_RUBRIC } from "./rubric.js";
+function getEvalLogPath(projectRoot) {
+    return join(projectRoot, ".indusk", "eval", "results.log");
+}
+async function postTelemetry(endpoint, scorecard) {
+    try {
+        const controller = new AbortController();
+        const timeout = setTimeout(() => controller.abort(), 5000);
+        await fetch(endpoint, {
+            method: "POST",
+            headers: { "Content-Type": "application/json" },
+            body: JSON.stringify(scorecard),
+            signal: controller.signal,
+        });
+        clearTimeout(timeout);
+    }
+    catch {
+        // fire-and-forget — silently ignore errors
+    }
+}
+/**
+ * Run the evaluator as a detached background process.
+ *
+ * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
+ * Collects stdout, parses the scorecard JSON, and appends to the eval log.
+ * If anything fails, logs an error entry instead of silently dropping.
+ */
+export function runEvaluatorBackground(opts) {
+    const projectGroup = getProjectGroupId(opts.projectRoot);
+    const prompt = buildEvaluatorPrompt({
+        rubric: V1_RUBRIC,
+        changeId: opts.changeId,
+        transcriptPath: opts.transcriptPath,
+        mode: opts.mode,
+        projectGroup,
+    });
+    const allowedTools = [
+        "Read",
+        "Grep",
+        "Glob",
+        "Bash(jj:*)",
+        "Bash(git:*)",
+        "mcp__graphiti__*",
+        "mcp__indusk__*",
+        "mcp__codegraphcontext__*",
+    ];
+    const args = [
+        "--print",
+        "--output-format",
+        "json",
+        "--model",
+        "opus",
+        "--permission-mode",
+        "acceptEdits",
+        "--allowed-tools",
+        allowedTools.join(","),
+    ];
+    // Not detached — the eval-trigger hook already spawns this in a separate
+    // node process. Detaching + unref causes the close handler to never fire.
+    const child = spawn("claude", args, {
+        cwd: opts.projectRoot,
+        stdio: ["pipe", "pipe", "pipe"],
+        env: { ...process.env },
+    });
+    // Pipe the prompt via stdin (too large for CLI arg)
+    child.stdin?.write(prompt);
+    child.stdin?.end();
+    let stdout = "";
+    let stderr = "";
+    child.stdout?.on("data", (chunk) => {
+        stdout += chunk.toString();
+    });
+    child.stderr?.on("data", (chunk) => {
+        stderr += chunk.toString();
+    });
+    child.on("close", async (code) => {
+        const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+        try {
+            if (code !== 0) {
+                throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+            }
+            // --output-format json wraps the result; extract the text content and usage
+            let scorecardText = stdout;
+            let usage;
+            try {
+                const jsonOutput = JSON.parse(stdout);
+                scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+                // Capture usage data from claude --print output
+                if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+                    const u = jsonOutput.usage ?? {};
+                    usage = {
+                        costUsd: jsonOutput.total_cost_usd ?? 0,
+                        inputTokens: u.input_tokens ?? 0,
+                        outputTokens: u.output_tokens ?? 0,
+                        cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+                        cacheReadTokens: u.cache_read_input_tokens ?? 0,
+                        durationMs: jsonOutput.duration_ms ?? 0,
+                    };
+                }
+            }
+            catch {
+                // stdout might be raw JSON scorecard already
+            }
+            // Extract JSON from possible markdown code fences
+            const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+            if (jsonMatch?.[1]) {
+                scorecardText = jsonMatch[1];
+            }
+            const scorecard = JSON.parse(scorecardText.trim());
+            if (usage)
+                scorecard.usage = usage;
+            scorecard.telemetryPosted = false;
+            if (opts.evalEndpoint) {
+                await postTelemetry(opts.evalEndpoint, scorecard);
+                scorecard.telemetryPosted = true;
+            }
+            await logWriter.append(scorecard);
+            ingestScorecard(opts.projectRoot, scorecard);
+        }
+        catch (err) {
+            const errorEntry = {
+                version: 1,
+                timestamp: new Date().toISOString(),
+                mode: opts.mode,
+                changeId: opts.changeId,
+                error: true,
+                message: err instanceof Error ? err.message : String(err),
+            };
+            await logWriter.append(errorEntry);
+        }
+    });
+}
+/**
+ * Run the evaluator synchronously (for testing and manual invocation).
+ * Returns the scorecard or error entry.
+ */
+export async function runEvaluatorSync(opts) {
+    const tracer = initEvalOtel(opts.projectRoot);
+    const source = process.env.INDUSK_EVAL_SOURCE ?? "commit";
+    const projectGroup = getProjectGroupId(opts.projectRoot);
+    const result = await withSpan(tracer, "eval.run", {
+        changeId: opts.changeId,
+        source,
+        mode: opts.mode,
+        projectGroup,
+        entrypoint: "runEvaluatorSync",
+    }, () => runEvaluatorSyncInner(opts, projectGroup));
+    await shutdownEvalOtel();
+    return result;
+}
+async function runEvaluatorSyncInner(opts, projectGroup) {
+    const prompt = buildEvaluatorPrompt({
+        rubric: V1_RUBRIC,
+        changeId: opts.changeId,
+        transcriptPath: opts.transcriptPath,
+        mode: opts.mode,
+        projectGroup,
+    });
+    const allowedTools = [
+        "Read",
+        "Grep",
+        "Glob",
+        "Bash(jj:*)",
+        "Bash(git:*)",
+        "mcp__graphiti__*",
+        "mcp__indusk__*",
+        "mcp__codegraphcontext__*",
+    ];
+    const args = [
+        "--print",
+        "--output-format",
+        "json",
+        "--model",
+        "opus",
+        "--permission-mode",
+        "acceptEdits",
+        "--allowed-tools",
+        allowedTools.join(","),
+    ];
+    return new Promise((resolve) => {
+        const child = spawn("claude", args, {
+            cwd: opts.projectRoot,
+            stdio: ["pipe", "pipe", "pipe"],
+            env: { ...process.env },
+        });
+        child.stdin?.write(prompt);
+        child.stdin?.end();
+        let stdout = "";
+        let stderr = "";
+        child.stdout?.on("data", (chunk) => {
+            stdout += chunk.toString();
+        });
+        child.stderr?.on("data", (chunk) => {
+            stderr += chunk.toString();
+        });
+        child.on("close", async (code) => {
+            const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
+            try {
+                if (code !== 0) {
+                    throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
+                }
+                let scorecardText = stdout;
+                let syncUsage;
+                try {
+                    const jsonOutput = JSON.parse(stdout);
+                    scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
+                    if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
+                        const u = jsonOutput.usage ?? {};
+                        syncUsage = {
+                            costUsd: jsonOutput.total_cost_usd ?? 0,
+                            inputTokens: u.input_tokens ?? 0,
+                            outputTokens: u.output_tokens ?? 0,
+                            cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
+                            cacheReadTokens: u.cache_read_input_tokens ?? 0,
+                            durationMs: jsonOutput.duration_ms ?? 0,
+                        };
+                    }
+                }
+                catch {
+                    // raw JSON
+                }
+                const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+                if (jsonMatch?.[1]) {
+                    scorecardText = jsonMatch[1];
+                }
+                const scorecard = JSON.parse(scorecardText.trim());
+                if (syncUsage)
+                    scorecard.usage = syncUsage;
+                scorecard.telemetryPosted = false;
+                if (opts.evalEndpoint) {
+                    await postTelemetry(opts.evalEndpoint, scorecard);
+                    scorecard.telemetryPosted = true;
+                }
+                await logWriter.append(scorecard);
+                ingestScorecard(opts.projectRoot, scorecard);
+                resolve(scorecard);
+            }
+            catch (err) {
+                const errorEntry = {
+                    version: 1,
+                    timestamp: new Date().toISOString(),
+                    mode: opts.mode,
+                    changeId: opts.changeId,
+                    error: true,
+                    message: err instanceof Error ? err.message : String(err),
+                };
+                await logWriter.append(errorEntry);
+                resolve(errorEntry);
+            }
+        });
+    });
+}
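The close handlers above unwrap a JSON envelope from `claude --print --output-format json`. A sketch of an input they would accept — the field names mirror what the code reads, but every value here is invented, as is the scorecard shape inside `result`:

```ts
// Build the ``` delimiter indirectly so this example stays valid inside a fence.
const fence = "\u0060\u0060\u0060";
const envelope = {
    result: `${fence}json\n{ "changeId": "abc123", "notes": "invented" }\n${fence}`,
    total_cost_usd: 0.042,
    duration_ms: 31500,
    usage: {
        input_tokens: 12000,
        output_tokens: 800,
        cache_creation_input_tokens: 2000,
        cache_read_input_tokens: 9000,
    },
};
// The handler JSON.parses stdout, unwraps `result` (falling back to `text`
// or `content`), strips any json code fence with the regex shown above,
// parses the remainder as the scorecard, and attaches the usage block as
// scorecard.usage before appending to results.log.
console.info(JSON.stringify(envelope));
```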
package/dist/lib/eval/otel.d.ts
@@ -0,0 +1,61 @@
+/**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+import { type Attributes, type Span, type Tracer } from "@opentelemetry/api";
+export interface EvalOtelConfig {
+    enabled: boolean;
+    endpoint: string | null;
+    dataset: string;
+}
+/**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ *   else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+export declare function isEvalOtelEnabled(projectRoot: string): EvalOtelConfig;
+/**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+export declare function initEvalOtel(projectRoot: string): Tracer;
+/**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+export declare function withSpan<T>(tracer: Tracer, name: string, attrs: Attributes | undefined, fn: (span: Span) => Promise<T> | T): Promise<T>;
+/**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+export declare function shutdownEvalOtel(): Promise<void>;
+/**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+export declare function __resetEvalOtelForTests(): void;
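A sketch of the resolution rules documented on `isEvalOtelEnabled`, using invented paths and an invented endpoint, and assuming the implementation shown in the next file:

```ts
import { mkdirSync, writeFileSync } from "node:fs";
import { isEvalOtelEnabled } from "./otel.js";

// Enable via config; let env vars override the dataset.
mkdirSync("/tmp/proj/.indusk", { recursive: true });
writeFileSync("/tmp/proj/.indusk/config.json", JSON.stringify({
    eval: { otel: { enabled: true, dataset: "agent" } },
}));
process.env.OTEL_EXPORTER_OTLP_ENDPOINT = "https://otlp.example.com"; // hypothetical receiver
process.env.INDUSK_EVAL_OTEL_DATASET = "eval-ci"; // env var wins over config's "agent"

console.info(isEvalOtelEnabled("/tmp/proj"));
// → { enabled: true, endpoint: "https://otlp.example.com", dataset: "eval-ci" }
```

Note that `INDUSK_EVAL_OTEL=0` or `false` does not force-disable: config `enabled: true` still wins, since the two sources are OR'd together.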
package/dist/lib/eval/otel.js
@@ -0,0 +1,177 @@
+/**
+ * OpenTelemetry tracing for the eval agent (evaluator).
+ *
+ * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
+ * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * (Dash0 or any OTLP HTTP receiver).
+ *
+ * Default OFF — zero cost in normal operation (no SDK init, no network).
+ *
+ * Graceful degradation: when enabled but endpoint missing, log a warning
+ * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
+ * throws, same behavior. The evaluator never fails because of OTel.
+ */
+import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { SpanStatusCode, trace } from "@opentelemetry/api";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
+import { resourceFromAttributes } from "@opentelemetry/resources";
+import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
+import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+const TRACER_NAME = "@infinitedusky/indusk-mcp/eval";
+const SERVICE_NAME = "indusk-eval-agent";
+function syslog(projectRoot, msg) {
+    try {
+        const logDir = resolve(projectRoot, ".indusk", "eval");
+        mkdirSync(logDir, { recursive: true });
+        appendFileSync(resolve(logDir, "system.log"), `${new Date().toISOString()} ${msg}\n`);
+    }
+    catch {
+        // logging should never break anything
+    }
+}
+const DEFAULT_DATASET = "agent";
+/**
+ * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
+ * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
+ * env vars. Does not init anything or touch the network.
+ *
+ * Resolution:
+ * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
+ * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
+ * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
+ *   else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
+ */
+export function isEvalOtelEnabled(projectRoot) {
+    const envFlag = process.env.INDUSK_EVAL_OTEL;
+    const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? null;
+    const envDataset = process.env.INDUSK_EVAL_OTEL_DATASET;
+    let configEnabled = false;
+    let configDataset;
+    const configPath = join(projectRoot, ".indusk", "config.json");
+    if (existsSync(configPath)) {
+        try {
+            const config = JSON.parse(readFileSync(configPath, "utf-8"));
+            configEnabled = config?.eval?.otel?.enabled === true;
+            if (typeof config?.eval?.otel?.dataset === "string") {
+                configDataset = config.eval.otel.dataset;
+            }
+        }
+        catch {
+            // malformed config — treat as disabled
+        }
+    }
+    const envForcesEnabled = envFlag !== undefined && envFlag !== "" && envFlag !== "0" && envFlag.toLowerCase() !== "false";
+    const dataset = envDataset && envDataset !== "" ? envDataset : (configDataset ?? DEFAULT_DATASET);
+    return {
+        enabled: envForcesEnabled || configEnabled,
+        endpoint,
+        dataset,
+    };
+}
+let activeProvider = null;
+/**
+ * Initialize OTel tracing for the evaluator if enabled + endpoint set.
+ * Returns a Tracer — real when enabled, no-op when not.
+ *
+ * The no-op path costs nothing: no provider registered, no network, the
+ * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
+ *
+ * Safe to call multiple times — subsequent calls return the same tracer.
+ */
+export function initEvalOtel(projectRoot) {
+    const { enabled, endpoint, dataset } = isEvalOtelEnabled(projectRoot);
+    if (!enabled) {
+        return trace.getTracer(TRACER_NAME);
+    }
+    if (!endpoint) {
+        syslog(projectRoot, "eval.otel.enabled but OTEL_EXPORTER_OTLP_ENDPOINT is unset — falling back to no-op tracer");
+        return trace.getTracer(TRACER_NAME);
+    }
+    if (activeProvider) {
+        return trace.getTracer(TRACER_NAME);
+    }
+    try {
+        const exporter = new OTLPTraceExporter({
+            url: endpoint.endsWith("/v1/traces") ? endpoint : `${endpoint.replace(/\/$/, "")}/v1/traces`,
+            // Route agent spans to the Dash0 dataset named `dataset`. Default
+            // is "agent". Env-set headers (OTEL_EXPORTER_OTLP_HEADERS) take
+            // precedence — per the OTel SDK contract — so a user-provided
+            // Dash0-Dataset in env overrides this default.
+            headers: {
+                "Dash0-Dataset": dataset,
+            },
+        });
+        const provider = new NodeTracerProvider({
+            resource: resourceFromAttributes({
+                [ATTR_SERVICE_NAME]: SERVICE_NAME,
+            }),
+            spanProcessors: [new BatchSpanProcessor(exporter)],
+        });
+        provider.register();
+        activeProvider = provider;
+        syslog(projectRoot, `eval.otel initialized — endpoint: ${endpoint}, dataset: ${dataset}`);
+    }
+    catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        syslog(projectRoot, `eval.otel init failed — falling back to no-op tracer: ${message}`);
+    }
+    return trace.getTracer(TRACER_NAME);
+}
+/**
+ * Run `fn` inside an active span. Closes the span in `finally`. On thrown
+ * error, records the exception on the span and sets status to ERROR, then
+ * re-throws so callers can still handle it.
+ *
+ * Use this for every lifecycle step in the evaluator so spans close even
+ * when Claude exits non-zero or a downstream step throws.
+ */
+export async function withSpan(tracer, name, attrs, fn) {
+    return tracer.startActiveSpan(name, { attributes: attrs ?? {} }, async (span) => {
+        try {
+            return await fn(span);
+        }
+        catch (err) {
+            span.recordException(err instanceof Error ? err : new Error(String(err)));
+            span.setStatus({ code: SpanStatusCode.ERROR });
+            throw err;
+        }
+        finally {
+            span.end();
+        }
+    });
+}
+/**
+ * Flush and shut down the active provider. Call this before `process.exit()`
+ * in detached processes so batched spans are not lost. No-op if no provider
+ * is active.
+ */
+export async function shutdownEvalOtel() {
+    if (!activeProvider)
+        return;
+    try {
+        await activeProvider.forceFlush();
+        await activeProvider.shutdown();
+    }
+    catch {
+        // shutdown is best-effort
+    }
+    finally {
+        activeProvider = null;
+    }
+}
+/**
+ * Test hook: reset the module's state AND the global OTel API so each test
+ * starts fresh. Not part of the public API.
+ */
+export function __resetEvalOtelForTests() {
+    // Tear down any provider left over from a previous test. This un-registers
+    // from the global OTel API, so `trace.getTracer()` falls back to the no-op
+    // tracer until a new provider is registered.
+    if (activeProvider) {
+        void activeProvider.shutdown().catch(() => { });
+    }
+    activeProvider = null;
+    trace.disable();
+}
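A minimal lifecycle sketch for the helpers above, with an invented project root, span name, and attributes. It works identically whether OTel resolves to enabled or not, which is the point of the no-op fallback:

```ts
import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";

const tracer = initEvalOtel("/tmp/proj"); // no-op tracer unless enabled + endpoint resolve

const answer = await withSpan(tracer, "eval.step", { changeId: "abc123" }, async (span) => {
    span.addEvent("step-started"); // recorded on real spans, ignored on no-op spans
    return 42;                     // the span ends in withSpan's finally block either way
});

await shutdownEvalOtel(); // flush batched spans before a detached process exits
console.info(answer);     // 42
```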
package/dist/lib/eval/persistent-evaluator.d.ts
@@ -0,0 +1,20 @@
+/**
+ * Persistent evaluator session management.
+ *
+ * First eval spawns a new session with full catchup. Subsequent evals resume
+ * the same session — no catchup cost, just "evaluate this change."
+ *
+ * Session state stored in `.indusk/eval/evaluator-session.json`.
+ */
+import type { EvalErrorEntry, EvalScorecard } from "./types.js";
+/**
+ * Run eval using a persistent session. First call does catchup + eval.
+ * Subsequent calls resume the session with just the new change.
+ */
+export declare function runPersistentEval(opts: {
+    projectRoot: string;
+    changeId: string;
+    transcriptPath: string;
+    mode: "eval" | "baseline";
+    evalEndpoint?: string;
+}): Promise<EvalScorecard | EvalErrorEntry>;
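A usage sketch of the session-reuse behavior described above, derived only from this declaration; the paths and change ids are invented:

```ts
import { runPersistentEval } from "./persistent-evaluator.js";

// First call: spawns a fresh session and pays the full catchup cost.
const first = await runPersistentEval({
    projectRoot: "/tmp/proj",
    changeId: "abc123",
    transcriptPath: "/tmp/transcript-1.md",
    mode: "eval",
});

// Later call with the same projectRoot: resumes the session recorded in
// .indusk/eval/evaluator-session.json and only evaluates the new change.
const second = await runPersistentEval({
    projectRoot: "/tmp/proj",
    changeId: "def456",
    transcriptPath: "/tmp/transcript-2.md",
    mode: "eval",
});
```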