@infinitedusky/indusk-mcp 1.16.1 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/commands/eval.js +2 -2
- package/dist/bin/commands/extensions.js +5 -2
- package/dist/bin/commands/init-docs.js +2 -2
- package/dist/lib/eval/evaluator-runner.d.ts +28 -0
- package/dist/lib/eval/evaluator-runner.js +266 -0
- package/dist/lib/eval/otel.d.ts +61 -0
- package/dist/lib/eval/otel.js +177 -0
- package/dist/lib/eval/persistent-evaluator.d.ts +20 -0
- package/dist/lib/eval/persistent-evaluator.js +244 -0
- package/dist/lib/eval/prompt-builder.d.ts +4 -4
- package/dist/lib/eval/prompt-builder.js +36 -11
- package/dist/lib/eval/types.d.ts +1 -1
- package/dist/lib/eval/types.js +1 -1
- package/dist/lib/highlights/highlights.d.ts +48 -0
- package/dist/lib/highlights/highlights.js +136 -0
- package/dist/lib/semantic-graph/index.d.ts +1 -1
- package/dist/lib/trajectory/audit.js +4 -4
- package/dist/server/index.js +2 -0
- package/dist/tools/highlight-tools.d.ts +18 -0
- package/dist/tools/highlight-tools.js +78 -0
- package/hooks/check-catchup.js +18 -7
- package/hooks/eval-trigger.js +94 -50
- package/hooks/gate-reminder.js +1 -3
- package/package.json +7 -1
- package/skills/eval-review.md +7 -7
- package/skills/handoff.md +14 -0
- package/skills/highlight.md +50 -0
- package/skills/planner.md +12 -16
- package/skills/retrospective.md +23 -17
- package/skills/work.md +8 -14
- package/templates/FullscreenDiagram.vue +3 -3
- package/templates/filtering-exporter.ts +3 -16
- package/templates/instrumentation.ts +4 -5
- package/templates/instrumentation.web.ts +19 -15
- package/templates/logger.ts +1 -1
|
@@ -212,7 +212,7 @@ export async function evalBaseline(projectRoot, opts) {
|
|
|
212
212
|
}
|
|
213
213
|
// Run the smart evaluator against the baseline
|
|
214
214
|
console.info("Running smart evaluator against baseline...");
|
|
215
|
-
const {
|
|
215
|
+
const { runEvaluatorSync } = await import("../../lib/eval/evaluator-runner.js");
|
|
216
216
|
let changeId;
|
|
217
217
|
try {
|
|
218
218
|
changeId = execSync("jj log -r @ --no-graph -T change_id", {
|
|
@@ -223,7 +223,7 @@ export async function evalBaseline(projectRoot, opts) {
|
|
|
223
223
|
catch {
|
|
224
224
|
changeId = "baseline-unknown";
|
|
225
225
|
}
|
|
226
|
-
const evalResult = await
|
|
226
|
+
const evalResult = await runEvaluatorSync({
|
|
227
227
|
projectRoot: worktreePath,
|
|
228
228
|
changeId,
|
|
229
229
|
transcriptPath: "(baseline — no transcript)",
|
|
@@ -374,7 +374,7 @@ export async function extensionsUpdate(projectRoot, names) {
|
|
|
374
374
|
continue;
|
|
375
375
|
try {
|
|
376
376
|
if (!ext.manifest._source) {
|
|
377
|
-
if (names
|
|
377
|
+
if (names?.includes(name)) {
|
|
378
378
|
console.info(` ${name}: built-in extension — updated via package update, not extensions update`);
|
|
379
379
|
}
|
|
380
380
|
continue;
|
|
@@ -633,7 +633,10 @@ function printMcpInstructions(name, manifest) {
|
|
|
633
633
|
const needsAuth = server.headers && Object.keys(server.headers).length > 0;
|
|
634
634
|
// Remove first, then add — ensures clean state
|
|
635
635
|
try {
|
|
636
|
-
execSync(`claude mcp remove -s project ${name}`, {
|
|
636
|
+
execSync(`claude mcp remove -s project ${name}`, {
|
|
637
|
+
timeout: 10000,
|
|
638
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
639
|
+
});
|
|
637
640
|
}
|
|
638
641
|
catch {
|
|
639
642
|
// not registered yet, fine
|
|
@@ -24,7 +24,7 @@ export async function initDocs(projectRoot) {
|
|
|
24
24
|
mkdirSync(join(docsDir, dir), { recursive: true });
|
|
25
25
|
}
|
|
26
26
|
// package.json
|
|
27
|
-
writeFileSync(join(docsDir, "package.json"), JSON.stringify({
|
|
27
|
+
writeFileSync(join(docsDir, "package.json"), `${JSON.stringify({
|
|
28
28
|
name: `${projectName}-docs`,
|
|
29
29
|
version: "0.1.0",
|
|
30
30
|
private: true,
|
|
@@ -42,7 +42,7 @@ export async function initDocs(projectRoot) {
|
|
|
42
42
|
"vitepress-plugin-mermaid": "^2.0.10",
|
|
43
43
|
vue: "^3.4.15",
|
|
44
44
|
},
|
|
45
|
-
}, null, "\t")
|
|
45
|
+
}, null, "\t")}\n`);
|
|
46
46
|
// .vitepress/config.ts
|
|
47
47
|
writeFileSync(join(docsDir, "src/.vitepress/config.ts"), `import { defineConfig } from "vitepress";
|
|
48
48
|
import llmstxt from "vitepress-plugin-llms";
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * Evaluator runner — spawns a background `claude --print` process that evaluates
 * a commit and writes results to the eval log.
 *
 * The evaluator is a detached child process so the calling hook can exit immediately.
 * Results appear asynchronously in `.indusk/eval/results.log`.
 *
 * NOTE(review): the implementation (evaluator-runner.js) explicitly says the
 * child is NOT detached — confirm and reconcile this header with the code.
 */
import type { EvalErrorEntry, EvalScorecard } from "./types.js";
/** Options shared by both evaluator entry points. */
export interface EvaluatorRunOptions {
    /** Absolute path of the project being evaluated (used as the child's cwd). */
    projectRoot: string;
    /** VCS change identifier the resulting scorecard is attributed to. */
    changeId: string;
    /** Path to the session transcript referenced by the evaluator prompt. */
    transcriptPath: string;
    /** "eval" for normal runs, "baseline" for baseline scoring. */
    mode: "eval" | "baseline";
    /** Optional HTTP endpoint the scorecard is POSTed to (fire-and-forget). */
    evalEndpoint?: string;
}
/**
 * Run the evaluator as a detached background process.
 *
 * Spawns `claude --print` with the evaluator prompt and allowed tools whitelist.
 * Collects stdout, parses the scorecard JSON, and appends to the eval log.
 * If anything fails, logs an error entry instead of silently dropping.
 */
export declare function runEvaluatorBackground(opts: EvaluatorRunOptions): void;
/**
 * Run the evaluator synchronously (for testing and manual invocation).
 * Returns the scorecard or error entry.
 */
export declare function runEvaluatorSync(opts: EvaluatorRunOptions): Promise<EvalScorecard | EvalErrorEntry>;
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluator runner — spawns a background `claude --print` process that evaluates
|
|
3
|
+
* a commit and writes results to the eval log.
|
|
4
|
+
*
|
|
5
|
+
* The evaluator is a detached child process so the calling hook can exit immediately.
|
|
6
|
+
* Results appear asynchronously in `.indusk/eval/results.log`.
|
|
7
|
+
*/
|
|
8
|
+
import { spawn } from "node:child_process";
|
|
9
|
+
import { join } from "node:path";
|
|
10
|
+
import { getProjectGroupId } from "../config.js";
|
|
11
|
+
import { ingestScorecard } from "./findings.js";
|
|
12
|
+
import { EvalLogWriter } from "./log-writer.js";
|
|
13
|
+
import { initEvalOtel, shutdownEvalOtel, withSpan } from "./otel.js";
|
|
14
|
+
import { buildEvaluatorPrompt } from "./prompt-builder.js";
|
|
15
|
+
import { V1_RUBRIC } from "./rubric.js";
|
|
16
|
+
/** Absolute path of the append-only eval results log for a project root. */
function getEvalLogPath(projectRoot) {
    const relativeSegments = [".indusk", "eval", "results.log"];
    return join(projectRoot, ...relativeSegments);
}
|
|
19
|
+
/**
 * POST a scorecard to a telemetry endpoint, best-effort.
 *
 * Aborts the request after 5 seconds via AbortController. Every failure
 * (network error, timeout, abort) is swallowed — telemetry must never break
 * the evaluator.
 *
 * BUG FIX: the abort timer is now cleared in `finally`. Previously it was
 * cleared only on the success path, so a rejected fetch left a pending 5 s
 * timer that kept the event loop alive.
 */
async function postTelemetry(endpoint, scorecard) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        await fetch(endpoint, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify(scorecard),
            signal: controller.signal,
        });
    }
    catch {
        // fire-and-forget — silently ignore errors
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
35
|
+
/**
 * Run the evaluator as a background `claude --print` process.
 *
 * Spawns `claude` with the evaluator prompt (piped via stdin — too large for a
 * CLI argument) and a read-only allowed-tools whitelist. Collects stdout,
 * unwraps the `--output-format json` envelope and optional markdown code
 * fences, parses the scorecard JSON, and appends it to the eval log. If
 * anything fails, an error entry is logged instead of silently dropping.
 *
 * Intentionally NOT detached — the eval-trigger hook already runs this in a
 * separate node process, and detach+unref would prevent the `close` handler
 * from ever firing.
 */
export function runEvaluatorBackground(opts) {
    const projectGroup = getProjectGroupId(opts.projectRoot);
    const prompt = buildEvaluatorPrompt({
        rubric: V1_RUBRIC,
        changeId: opts.changeId,
        transcriptPath: opts.transcriptPath,
        mode: opts.mode,
        projectGroup,
    });
    // Read-only tool whitelist — the evaluator must not modify the worktree.
    const allowedTools = [
        "Read",
        "Grep",
        "Glob",
        "Bash(jj:*)",
        "Bash(git:*)",
        "mcp__graphiti__*",
        "mcp__indusk__*",
        "mcp__codegraphcontext__*",
    ];
    const args = [
        "--print",
        "--output-format",
        "json",
        "--model",
        "opus",
        "--permission-mode",
        "acceptEdits",
        "--allowed-tools",
        allowedTools.join(","),
    ];
    const child = spawn("claude", args, {
        cwd: opts.projectRoot,
        stdio: ["pipe", "pipe", "pipe"],
        env: { ...process.env },
    });
    // Append an error entry to the eval log; never throws.
    const logErrorEntry = async (message) => {
        try {
            const writer = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
            await writer.append({
                version: 1,
                timestamp: new Date().toISOString(),
                mode: opts.mode,
                changeId: opts.changeId,
                error: true,
                message,
            });
        }
        catch {
            // logging is best-effort
        }
    };
    // BUG FIX: without an 'error' listener, a failed spawn (e.g. `claude`
    // missing from PATH) emits an unhandled 'error' event and crashes the
    // process instead of producing an eval-log error entry.
    child.on("error", (err) => {
        void logErrorEntry(`failed to spawn claude: ${err.message}`);
    });
    // Pipe the prompt via stdin (too large for CLI arg)
    child.stdin?.write(prompt);
    child.stdin?.end();
    let stdout = "";
    let stderr = "";
    child.stdout?.on("data", (chunk) => {
        stdout += chunk.toString();
    });
    child.stderr?.on("data", (chunk) => {
        stderr += chunk.toString();
    });
    child.on("close", async (code) => {
        const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
        try {
            if (code !== 0) {
                throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
            }
            // --output-format json wraps the result; extract the text content and usage
            let scorecardText = stdout;
            let usage;
            try {
                const jsonOutput = JSON.parse(stdout);
                scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
                // Capture usage data from claude --print output
                if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
                    const u = jsonOutput.usage ?? {};
                    usage = {
                        costUsd: jsonOutput.total_cost_usd ?? 0,
                        inputTokens: u.input_tokens ?? 0,
                        outputTokens: u.output_tokens ?? 0,
                        cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
                        cacheReadTokens: u.cache_read_input_tokens ?? 0,
                        durationMs: jsonOutput.duration_ms ?? 0,
                    };
                }
            }
            catch {
                // stdout might be raw JSON scorecard already
            }
            // Extract JSON from possible markdown code fences
            const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
            if (jsonMatch?.[1]) {
                scorecardText = jsonMatch[1];
            }
            const scorecard = JSON.parse(scorecardText.trim());
            if (usage)
                scorecard.usage = usage;
            // NOTE: telemetryPosted records that a POST was attempted —
            // postTelemetry swallows failures, so "posted" is best-effort.
            scorecard.telemetryPosted = false;
            if (opts.evalEndpoint) {
                await postTelemetry(opts.evalEndpoint, scorecard);
                scorecard.telemetryPosted = true;
            }
            await logWriter.append(scorecard);
            ingestScorecard(opts.projectRoot, scorecard);
        }
        catch (err) {
            // BUG FIX: append is now guarded — a rejected append previously
            // surfaced as an unhandled promise rejection inside this handler.
            await logErrorEntry(err instanceof Error ? err.message : String(err));
        }
    });
}
|
|
147
|
+
/**
 * Run the evaluator synchronously (for testing and manual invocation),
 * wrapped in an `eval.run` OTel span. Returns the scorecard or error entry.
 */
export async function runEvaluatorSync(opts) {
    const tracer = initEvalOtel(opts.projectRoot);
    const projectGroup = getProjectGroupId(opts.projectRoot);
    // Span attributes identify how and why this eval was triggered.
    const spanAttributes = {
        changeId: opts.changeId,
        source: process.env.INDUSK_EVAL_SOURCE ?? "commit",
        mode: opts.mode,
        projectGroup,
        entrypoint: "runEvaluatorSync",
    };
    const outcome = await withSpan(tracer, "eval.run", spanAttributes, () => runEvaluatorSyncInner(opts, projectGroup));
    // Flush batched spans before returning — callers may exit immediately.
    await shutdownEvalOtel();
    return outcome;
}
|
|
165
|
+
/**
 * Synchronous-evaluation worker: spawns `claude --print`, parses the scorecard
 * from stdout, posts telemetry when configured, appends to the eval log, and
 * resolves with either the scorecard or an error entry.
 *
 * The returned promise ALWAYS settles: non-zero exit codes, parse failures,
 * and spawn failures all resolve with an error entry rather than rejecting
 * or hanging.
 */
async function runEvaluatorSyncInner(opts, projectGroup) {
    const prompt = buildEvaluatorPrompt({
        rubric: V1_RUBRIC,
        changeId: opts.changeId,
        transcriptPath: opts.transcriptPath,
        mode: opts.mode,
        projectGroup,
    });
    // Read-only tool whitelist — the evaluator must not modify the worktree.
    const allowedTools = [
        "Read",
        "Grep",
        "Glob",
        "Bash(jj:*)",
        "Bash(git:*)",
        "mcp__graphiti__*",
        "mcp__indusk__*",
        "mcp__codegraphcontext__*",
    ];
    const args = [
        "--print",
        "--output-format",
        "json",
        "--model",
        "opus",
        "--permission-mode",
        "acceptEdits",
        "--allowed-tools",
        allowedTools.join(","),
    ];
    return new Promise((resolve) => {
        // Guard against settling twice (e.g. 'error' followed by 'close').
        let settled = false;
        const finish = (value) => {
            if (!settled) {
                settled = true;
                resolve(value);
            }
        };
        const makeErrorEntry = (message) => ({
            version: 1,
            timestamp: new Date().toISOString(),
            mode: opts.mode,
            changeId: opts.changeId,
            error: true,
            message,
        });
        const child = spawn("claude", args, {
            cwd: opts.projectRoot,
            stdio: ["pipe", "pipe", "pipe"],
            env: { ...process.env },
        });
        // BUG FIX: without this handler, a failed spawn (e.g. `claude` missing
        // from PATH) emits an unhandled 'error' event — crashing the process —
        // and 'close' may never fire, so the original promise hung forever.
        child.on("error", async (err) => {
            const errorEntry = makeErrorEntry(`failed to spawn claude: ${err.message}`);
            try {
                await new EvalLogWriter(getEvalLogPath(opts.projectRoot)).append(errorEntry);
            }
            catch {
                // logging is best-effort
            }
            finish(errorEntry);
        });
        // Pipe the prompt via stdin — too large for a CLI argument.
        child.stdin?.write(prompt);
        child.stdin?.end();
        let stdout = "";
        let stderr = "";
        child.stdout?.on("data", (chunk) => {
            stdout += chunk.toString();
        });
        child.stderr?.on("data", (chunk) => {
            stderr += chunk.toString();
        });
        child.on("close", async (code) => {
            const logWriter = new EvalLogWriter(getEvalLogPath(opts.projectRoot));
            try {
                if (code !== 0) {
                    throw new Error(`claude exited with code ${code}: ${stderr.slice(0, 500)}`);
                }
                // --output-format json wraps the result; unwrap text + usage.
                let scorecardText = stdout;
                let syncUsage;
                try {
                    const jsonOutput = JSON.parse(stdout);
                    scorecardText = jsonOutput.result ?? jsonOutput.text ?? jsonOutput.content ?? stdout;
                    if (jsonOutput.total_cost_usd !== undefined || jsonOutput.usage) {
                        const u = jsonOutput.usage ?? {};
                        syncUsage = {
                            costUsd: jsonOutput.total_cost_usd ?? 0,
                            inputTokens: u.input_tokens ?? 0,
                            outputTokens: u.output_tokens ?? 0,
                            cacheCreationTokens: u.cache_creation_input_tokens ?? 0,
                            cacheReadTokens: u.cache_read_input_tokens ?? 0,
                            durationMs: jsonOutput.duration_ms ?? 0,
                        };
                    }
                }
                catch {
                    // raw JSON
                }
                // Strip optional markdown code fences around the scorecard.
                const jsonMatch = scorecardText.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
                if (jsonMatch?.[1]) {
                    scorecardText = jsonMatch[1];
                }
                const scorecard = JSON.parse(scorecardText.trim());
                if (syncUsage)
                    scorecard.usage = syncUsage;
                // NOTE: telemetryPosted records that a POST was attempted —
                // postTelemetry swallows failures, so "posted" is best-effort.
                scorecard.telemetryPosted = false;
                if (opts.evalEndpoint) {
                    await postTelemetry(opts.evalEndpoint, scorecard);
                    scorecard.telemetryPosted = true;
                }
                await logWriter.append(scorecard);
                ingestScorecard(opts.projectRoot, scorecard);
                finish(scorecard);
            }
            catch (err) {
                const errorEntry = makeErrorEntry(err instanceof Error ? err.message : String(err));
                await logWriter.append(errorEntry);
                finish(errorEntry);
            }
        });
    });
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
 * OpenTelemetry tracing for the eval agent (evaluator).
 *
 * Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
 * `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
 * (Dash0 or any OTLP HTTP receiver).
 *
 * Default OFF — zero cost in normal operation (no SDK init, no network).
 *
 * Graceful degradation: when enabled but endpoint missing, log a warning
 * to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
 * throws, same behavior. The evaluator never fails because of OTel.
 */
import { type Attributes, type Span, type Tracer } from "@opentelemetry/api";
/** Resolved OTel settings for the evaluator (env vars take precedence over config). */
export interface EvalOtelConfig {
    /** Whether tracing should be initialized at all. */
    enabled: boolean;
    /** OTLP HTTP endpoint from `OTEL_EXPORTER_OTLP_ENDPOINT`; null when unset. */
    endpoint: string | null;
    /** Dash0 dataset name sent as the `Dash0-Dataset` header; defaults to "agent". */
    dataset: string;
}
/**
 * Pure predicate — reads `.indusk/config.json` `eval.otel.{enabled,dataset}` and
 * the `INDUSK_EVAL_OTEL` / `INDUSK_EVAL_OTEL_DATASET` / `OTEL_EXPORTER_OTLP_ENDPOINT`
 * env vars. Does not init anything or touch the network.
 *
 * Resolution:
 * - `enabled`: `INDUSK_EVAL_OTEL=1` (truthy) wins, else config `eval.otel.enabled`, else false.
 * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT` (null if unset).
 * - `dataset`: `INDUSK_EVAL_OTEL_DATASET` env var wins, else config `eval.otel.dataset`,
 *   else `"agent"` default. Sent as the `Dash0-Dataset` header on every OTLP export.
 */
export declare function isEvalOtelEnabled(projectRoot: string): EvalOtelConfig;
/**
 * Initialize OTel tracing for the evaluator if enabled + endpoint set.
 * Returns a Tracer — real when enabled, no-op when not.
 *
 * The no-op path costs nothing: no provider registered, no network, the
 * returned tracer's `startSpan` / `startActiveSpan` produce no-op spans.
 *
 * Safe to call multiple times — subsequent calls return the same tracer.
 */
export declare function initEvalOtel(projectRoot: string): Tracer;
/**
 * Run `fn` inside an active span. Closes the span in `finally`. On thrown
 * error, records the exception on the span and sets status to ERROR, then
 * re-throws so callers can still handle it.
 *
 * Use this for every lifecycle step in the evaluator so spans close even
 * when Claude exits non-zero or a downstream step throws.
 */
export declare function withSpan<T>(tracer: Tracer, name: string, attrs: Attributes | undefined, fn: (span: Span) => Promise<T> | T): Promise<T>;
/**
 * Flush and shut down the active provider. Call this before `process.exit()`
 * in detached processes so batched spans are not lost. No-op if no provider
 * is active.
 */
export declare function shutdownEvalOtel(): Promise<void>;
/**
 * Test hook: reset the module's state AND the global OTel API so each test
 * starts fresh. Not part of the public API.
 */
export declare function __resetEvalOtelForTests(): void;
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenTelemetry tracing for the eval agent (evaluator).
|
|
3
|
+
*
|
|
4
|
+
* Opt-in via `eval.otel.enabled: true` in `.indusk/config.json` OR
|
|
5
|
+
* `INDUSK_EVAL_OTEL=1` env var. Exports to `OTEL_EXPORTER_OTLP_ENDPOINT`
|
|
6
|
+
* (Dash0 or any OTLP HTTP receiver).
|
|
7
|
+
*
|
|
8
|
+
* Default OFF — zero cost in normal operation (no SDK init, no network).
|
|
9
|
+
*
|
|
10
|
+
* Graceful degradation: when enabled but endpoint missing, log a warning
|
|
11
|
+
* to `.indusk/eval/system.log` and return a no-op tracer. When SDK init
|
|
12
|
+
* throws, same behavior. The evaluator never fails because of OTel.
|
|
13
|
+
*/
|
|
14
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs";
|
|
15
|
+
import { join, resolve } from "node:path";
|
|
16
|
+
import { SpanStatusCode, trace } from "@opentelemetry/api";
|
|
17
|
+
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
18
|
+
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
19
|
+
import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
|
|
20
|
+
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
21
|
+
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
|
|
22
|
+
const TRACER_NAME = "@infinitedusky/indusk-mcp/eval";
|
|
23
|
+
const SERVICE_NAME = "indusk-eval-agent";
|
|
24
|
+
/**
 * Append a timestamped line to `.indusk/eval/system.log` under the project
 * root. Creates the log directory on demand. Never throws — diagnostics must
 * not break the evaluator.
 */
function syslog(projectRoot, msg) {
    try {
        const evalDir = resolve(projectRoot, ".indusk", "eval");
        mkdirSync(evalDir, { recursive: true });
        const line = `${new Date().toISOString()} ${msg}\n`;
        appendFileSync(resolve(evalDir, "system.log"), line);
    }
    catch {
        // logging should never break anything
    }
}
|
|
34
|
+
const DEFAULT_DATASET = "agent";
/**
 * Pure predicate — resolves the evaluator's OTel settings from env vars and
 * `.indusk/config.json`. Does not init anything or touch the network.
 *
 * Resolution order:
 * - `enabled`: a truthy `INDUSK_EVAL_OTEL` ("" / "0" / "false" are falsy) wins,
 *   else config `eval.otel.enabled === true`, else false.
 * - `endpoint`: `OTEL_EXPORTER_OTLP_ENDPOINT`, or null when unset.
 * - `dataset`: non-empty `INDUSK_EVAL_OTEL_DATASET`, else config
 *   `eval.otel.dataset` (string), else the `"agent"` default.
 */
export function isEvalOtelEnabled(projectRoot) {
    // Environment inputs, read once up front.
    const rawFlag = process.env.INDUSK_EVAL_OTEL;
    const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? null;
    const datasetFromEnv = process.env.INDUSK_EVAL_OTEL_DATASET;
    // Config inputs — the file is optional and may be malformed.
    let enabledFromConfig = false;
    let datasetFromConfig;
    const configPath = join(projectRoot, ".indusk", "config.json");
    if (existsSync(configPath)) {
        try {
            const parsed = JSON.parse(readFileSync(configPath, "utf-8"));
            const otel = parsed?.eval?.otel;
            enabledFromConfig = otel?.enabled === true;
            if (typeof otel?.dataset === "string") {
                datasetFromConfig = otel.dataset;
            }
        }
        catch {
            // malformed config — treat as disabled
        }
    }
    const flagIsTruthy = rawFlag !== undefined
        && rawFlag !== ""
        && rawFlag !== "0"
        && rawFlag.toLowerCase() !== "false";
    const dataset = datasetFromEnv && datasetFromEnv !== ""
        ? datasetFromEnv
        : (datasetFromConfig ?? DEFAULT_DATASET);
    return {
        enabled: flagIsTruthy || enabledFromConfig,
        endpoint,
        dataset,
    };
}
|
|
73
|
+
let activeProvider = null;
|
|
74
|
+
/**
 * Initialize OTel tracing for the evaluator if enabled and an endpoint is set.
 * Returns a Tracer — real when a provider is registered, no-op otherwise.
 *
 * The no-op path costs nothing: no provider registered, no network touched.
 * Safe to call multiple times — once a provider is active, later calls just
 * return the tracer again. Init failures degrade to the no-op tracer and log
 * to `.indusk/eval/system.log`; the evaluator never fails because of OTel.
 */
export function initEvalOtel(projectRoot) {
    const { enabled, endpoint, dataset } = isEvalOtelEnabled(projectRoot);
    if (!enabled) {
        return trace.getTracer(TRACER_NAME);
    }
    if (!endpoint) {
        syslog(projectRoot, "eval.otel.enabled but OTEL_EXPORTER_OTLP_ENDPOINT is unset — falling back to no-op tracer");
        return trace.getTracer(TRACER_NAME);
    }
    if (activeProvider) {
        return trace.getTracer(TRACER_NAME);
    }
    try {
        // Normalize the endpoint so it always targets the OTLP traces path.
        const url = endpoint.endsWith("/v1/traces")
            ? endpoint
            : `${endpoint.replace(/\/$/, "")}/v1/traces`;
        // Route agent spans to the Dash0 dataset named `dataset`. Default
        // is "agent". Env-set headers (OTEL_EXPORTER_OTLP_HEADERS) take
        // precedence — per the OTel SDK contract — so a user-provided
        // Dash0-Dataset in env overrides this default.
        const exporter = new OTLPTraceExporter({
            url,
            headers: {
                "Dash0-Dataset": dataset,
            },
        });
        const provider = new NodeTracerProvider({
            resource: resourceFromAttributes({
                [ATTR_SERVICE_NAME]: SERVICE_NAME,
            }),
            spanProcessors: [new BatchSpanProcessor(exporter)],
        });
        provider.register();
        activeProvider = provider;
        syslog(projectRoot, `eval.otel initialized — endpoint: ${endpoint}, dataset: ${dataset}`);
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        syslog(projectRoot, `eval.otel init failed — falling back to no-op tracer: ${message}`);
    }
    return trace.getTracer(TRACER_NAME);
}
|
|
122
|
+
/**
|
|
123
|
+
* Run `fn` inside an active span. Closes the span in `finally`. On thrown
|
|
124
|
+
* error, records the exception on the span and sets status to ERROR, then
|
|
125
|
+
* re-throws so callers can still handle it.
|
|
126
|
+
*
|
|
127
|
+
* Use this for every lifecycle step in the evaluator so spans close even
|
|
128
|
+
* when Claude exits non-zero or a downstream step throws.
|
|
129
|
+
*/
|
|
130
|
+
export async function withSpan(tracer, name, attrs, fn) {
|
|
131
|
+
return tracer.startActiveSpan(name, { attributes: attrs ?? {} }, async (span) => {
|
|
132
|
+
try {
|
|
133
|
+
return await fn(span);
|
|
134
|
+
}
|
|
135
|
+
catch (err) {
|
|
136
|
+
span.recordException(err instanceof Error ? err : new Error(String(err)));
|
|
137
|
+
span.setStatus({ code: SpanStatusCode.ERROR });
|
|
138
|
+
throw err;
|
|
139
|
+
}
|
|
140
|
+
finally {
|
|
141
|
+
span.end();
|
|
142
|
+
}
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
/**
 * Flush and shut down the active tracer provider, then clear it.
 *
 * Call before `process.exit()` in detached processes so batched spans are not
 * lost. No-op when no provider is active; flush/shutdown failures are
 * swallowed (best-effort), but the provider reference is always cleared.
 */
export async function shutdownEvalOtel() {
    const provider = activeProvider;
    if (provider == null) {
        return;
    }
    try {
        await provider.forceFlush();
        await provider.shutdown();
    }
    catch {
        // shutdown is best-effort
    }
    finally {
        activeProvider = null;
    }
}
|
|
164
|
+
/**
 * Test hook: reset this module's state AND the global OTel API so each test
 * starts fresh. Not part of the public API.
 *
 * Shuts down any leftover provider (fire-and-forget), clears the module
 * reference, and disables the global trace API so `trace.getTracer()` falls
 * back to the no-op tracer until a new provider is registered.
 */
export function __resetEvalOtelForTests() {
    const leftover = activeProvider;
    activeProvider = null;
    if (leftover) {
        void leftover.shutdown().catch(() => { });
    }
    trace.disable();
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Persistent evaluator session management.
 *
 * First eval spawns a new session with full catchup. Subsequent evals resume
 * the same session — no catchup cost, just "evaluate this change."
 *
 * Session state stored in `.indusk/eval/evaluator-session.json`.
 */
import type { EvalErrorEntry, EvalScorecard } from "./types.js";
/**
 * Run eval using a persistent session. First call does catchup + eval.
 * Subsequent calls resume the session with just the new change.
 */
export declare function runPersistentEval(opts: {
    /** Absolute path of the project under evaluation. */
    projectRoot: string;
    /** VCS change identifier the resulting scorecard is attributed to. */
    changeId: string;
    /** Path to the session transcript handed to the evaluator. */
    transcriptPath: string;
    /** "eval" for normal runs, "baseline" for baseline scoring. */
    mode: "eval" | "baseline";
    /** Optional HTTP endpoint the scorecard is POSTed to (fire-and-forget). */
    evalEndpoint?: string;
}): Promise<EvalScorecard | EvalErrorEntry>;
|