zidane 5.9.1 → 5.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/{agent-CHemm-6O.d.ts → agent-CDFbxbHd.d.ts} +2 -2
  2. package/dist/agent-CDFbxbHd.d.ts.map +1 -0
  3. package/dist/chat/pure.d.ts +3 -3
  4. package/dist/chat.d.ts +6 -6
  5. package/dist/chat.js +2 -2
  6. package/dist/contexts/docker.d.ts +1 -1
  7. package/dist/contexts/docker.js +11 -0
  8. package/dist/contexts/docker.js.map +1 -1
  9. package/dist/contexts.d.ts +3 -3
  10. package/dist/eval.d.ts +2 -0
  11. package/dist/eval.js +1323 -0
  12. package/dist/eval.js.map +1 -0
  13. package/dist/{headless-CZuZ_565.js → headless-C596K-Lk.js} +3 -3
  14. package/dist/{headless-CZuZ_565.js.map → headless-C596K-Lk.js.map} +1 -1
  15. package/dist/headless.d.ts +1 -1
  16. package/dist/headless.js +1 -1
  17. package/dist/{index-CZOwAJIX.d.ts → index-ClYUGpmF.d.ts} +2 -2
  18. package/dist/{index-CZOwAJIX.d.ts.map → index-ClYUGpmF.d.ts.map} +1 -1
  19. package/dist/{index-OfPNcoad.d.ts → index-DWw_PtJH.d.ts} +3 -3
  20. package/dist/{index-OfPNcoad.d.ts.map → index-DWw_PtJH.d.ts.map} +1 -1
  21. package/dist/{index-DjIJ2qWV.d.ts → index-D_i2Nhts.d.ts} +446 -4
  22. package/dist/index-D_i2Nhts.d.ts.map +1 -0
  23. package/dist/index.d.ts +6 -6
  24. package/dist/index.js +7 -6
  25. package/dist/index.js.map +1 -1
  26. package/dist/{login-BouPxVYR.js → login-Ujwd1HEb.js} +2 -2
  27. package/dist/{login-BouPxVYR.js.map → login-Ujwd1HEb.js.map} +1 -1
  28. package/dist/mcp.d.ts +1 -1
  29. package/dist/{presets-Cugb5FrA.js → presets-ga6dMRVd.js} +2 -2
  30. package/dist/{presets-Cugb5FrA.js.map → presets-ga6dMRVd.js.map} +1 -1
  31. package/dist/presets.d.ts +2 -2
  32. package/dist/presets.js +1 -1
  33. package/dist/{providers-CHV6e9Ik.js → providers-CezC9my7.js} +28 -2
  34. package/dist/providers-CezC9my7.js.map +1 -0
  35. package/dist/providers.d.ts +1 -1
  36. package/dist/providers.js +1 -1
  37. package/dist/restate.d.ts +1 -1
  38. package/dist/session/sqlite.d.ts +1 -1
  39. package/dist/session.d.ts +1 -1
  40. package/dist/skills.d.ts +2 -2
  41. package/dist/{tool-formatters-pndTz5iT.d.ts → tool-formatters-B3fSafpr.d.ts} +2 -2
  42. package/dist/{tool-formatters-pndTz5iT.d.ts.map → tool-formatters-B3fSafpr.d.ts.map} +1 -1
  43. package/dist/tools/fetch-url.d.ts +1 -1
  44. package/dist/tools/web-search.d.ts +1 -1
  45. package/dist/{tools-CSmcSu_B.js → tools-CAdllmWO.js} +2 -2
  46. package/dist/{tools-CSmcSu_B.js.map → tools-CAdllmWO.js.map} +1 -1
  47. package/dist/tools.d.ts +2 -2
  48. package/dist/tools.js +1 -1
  49. package/dist/{transcript-anchors-BROJfwC8.d.ts → transcript-anchors-BHVVVhR0.d.ts} +4 -4
  50. package/dist/{transcript-anchors-BROJfwC8.d.ts.map → transcript-anchors-BHVVVhR0.d.ts.map} +1 -1
  51. package/dist/{transcript-anchors-Bncn747q.js → transcript-anchors-DFknQ4ew.js} +4 -4
  52. package/dist/{transcript-anchors-Bncn747q.js.map → transcript-anchors-DFknQ4ew.js.map} +1 -1
  53. package/dist/tui.d.ts +3 -3
  54. package/dist/tui.js +3 -3
  55. package/dist/{turn-operations-BYnXjY80.d.ts → turn-operations-BMxugnqg.d.ts} +3 -3
  56. package/dist/{turn-operations-BYnXjY80.d.ts.map → turn-operations-BMxugnqg.d.ts.map} +1 -1
  57. package/dist/{types-CEAMIUXw.d.ts → types-CB1J-DQw.d.ts} +14 -2
  58. package/dist/types-CB1J-DQw.d.ts.map +1 -0
  59. package/dist/types.d.ts +4 -4
  60. package/package.json +10 -3
  61. package/scripts/eval.ts +140 -0
  62. package/dist/agent-CHemm-6O.d.ts.map +0 -1
  63. package/dist/index-DjIJ2qWV.d.ts.map +0 -1
  64. package/dist/providers-CHV6e9Ik.js.map +0 -1
  65. package/dist/types-CEAMIUXw.d.ts.map +0 -1
package/dist/eval.js ADDED
@@ -0,0 +1,1323 @@
1
+ import { n as createProcessContext } from "./contexts-BD2U_xpi.js";
2
+ import { n as runHeadless, t as headlessEventToJsonl } from "./headless-C596K-Lk.js";
3
+ import { i as createMemoryStore, t as createSession } from "./session-7CKYn9qT.js";
4
+ import { join, relative, resolve } from "node:path";
5
+ import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
6
+ import { tmpdir } from "node:os";
7
+ import chalk from "chalk";
8
+ //#region src/eval.ts
9
/**
 * Identity helper used to declare the metric set of an eval.
 *
 * The map is handed back untouched so it can be passed straight to
 * `EvalCaseOptions.metrics`. A declared metric that is never emitted is
 * recorded as missing with a normalized score of 0, while emitting a metric
 * that was never declared is treated as an authoring error and throws.
 *
 * @param spec Map of metric id to metric spec.
 * @returns The very same `spec` object.
 */
function defineMetrics(spec) {
  const declared = spec;
  return declared;
}
18
/**
 * Efficiency metrics that every eval gets for free; their raw values are
 * derived from the `HeadlessResult` of the run.
 */
const EFFICIENCY_METRICS = {
  // Wall-clock time, capped at five minutes for normalization.
  "execution-time": { min: 0, max: 300000, direction: "lower-is-better", tags: ["@efficiency"], description: "Wall-clock run time (ms)." },
  // Combined input and output token count, capped at 200k.
  "provider-tokens": { min: 0, max: 200000, direction: "lower-is-better", tags: ["@efficiency"], description: "Input + output tokens." },
  // Ratio in the 0..1 range; a higher cache hit rate is better.
  "cache-read-rate": { min: 0, max: 1, direction: "higher-is-better", tags: ["@efficiency"], description: "Cache-read tokens / input tokens." }
};
42
/**
 * Map a raw metric value onto the 0..1 range described by `spec`.
 *
 * The raw value is clamped to `[spec.min, spec.max]` first. For
 * "lower-is-better" metrics the result is inverted so that 1 is always the
 * best score. A spec with a zero-width range always normalizes to 1.
 *
 * @param raw Raw metric value.
 * @param spec Metric spec providing `min`, `max` and `direction`.
 * @returns Normalized score.
 */
function normalizeMetric(raw, spec) {
  const { min, max, direction } = spec;
  const span = max - min;
  if (span === 0) {
    return 1;
  }
  const clamped = Math.min(max, Math.max(min, raw));
  const fraction = (clamped - min) / span;
  if (direction === "lower-is-better") {
    return 1 - fraction;
  }
  return fraction;
}
48
/**
 * Error raised by `runEvalCase` when a scorer emits a metric id that was not
 * declared through `defineMetrics`. It signals an authoring mistake (typo or
 * stale id) rather than a low score, so it fails the eval test instead of being
 * recorded as a failed scorer. Exported so downstream harnesses can narrow on it.
 */
class EvalMetricError extends Error {
  /** @param message Human-readable description of the offending metric. */
  constructor(message) {
    super(message);
    this.name = "EvalMetricError";
  }
}
60
/**
 * Wrap an execution context so that repeated agent runs share a single
 * spawned handle until the caller explicitly disposes it. Handy for eval
 * fixtures that want a per-test Docker/process lifetime while still calling
 * `runHeadless()` once per turn.
 *
 * @param base Execution context exposing `spawn(config)` and `destroy(handle)`.
 * @returns `execution` (a context whose `spawn` reuses one handle and whose
 *   `destroy` is a no-op), `dispose()` (destroys the shared handle, memoized)
 *   and `handle()` (the current handle, if any).
 */
function createReusableExecutionContext(base) {
  let handle;
  let pendingSpawn;
  let isDisposed = false;
  let pendingDispose;

  // Returns the shared handle, spawning it on first use. Concurrent callers
  // share one in-flight spawn; a failed spawn is forgotten so it can be retried.
  function acquire(config) {
    if (isDisposed) {
      throw new Error("Reusable execution context has been disposed");
    }
    if (handle) {
      return Promise.resolve(handle);
    }
    if (pendingSpawn == null) {
      pendingSpawn = base.spawn(config).then(
        (spawned) => {
          handle = spawned;
          return spawned;
        },
        (err) => {
          pendingSpawn = undefined;
          throw err;
        }
      );
    }
    return pendingSpawn;
  }

  // Tears down the shared handle, waiting for an in-flight spawn if needed.
  async function release() {
    isDisposed = true;
    let active = handle;
    if (active == null) {
      // A spawn failure here simply means there is nothing to destroy.
      active = await pendingSpawn?.catch(() => undefined);
    }
    handle = undefined;
    pendingSpawn = undefined;
    if (active) {
      await base.destroy(active);
    }
  }

  return {
    execution: {
      ...base,
      async spawn(config) {
        return acquire(config);
      },
      // Per-run destroy is intentionally a no-op; the owner calls dispose().
      async destroy() {}
    },
    dispose() {
      if (pendingDispose == null) {
        pendingDispose = release();
      }
      return pendingDispose;
    },
    handle: () => handle
  };
}
103
/**
 * Multi-turn eval agent built on the low-level headless runner. Session and
 * execution lifetime are kept out of any Playwright/Bolt-specific fixture
 * layer, so downstream platforms can wrap this with their own
 * `agent.run(...)` shape.
 *
 * @param options Agent options. `cwd`, `execution`, `session`, `store`,
 *   `mcpServers` and `onEvent` are handled here; every other key is forwarded
 *   to `runHeadless` as a default for each run.
 * @returns An object with `run(runOptions)`, cumulative `stats`, a `session()`
 *   accessor, `dispose()` and `Symbol.asyncDispose`.
 */
function createEvalAgent(options) {
  const {
    cwd,
    execution,
    session: initialSession,
    store: initialStore,
    mcpServers: baseMcpServers,
    onEvent: baseOnEvent,
    ...headlessDefaults
  } = options;
  // One execution handle is shared by all runs until dispose().
  const executionOwner = createReusableExecutionContext(execution ?? createProcessContext({ cwd }));
  const store = initialStore ?? createMemoryStore();
  let sessionPromise = initialSession ? Promise.resolve(initialSession) : undefined;
  let disposed = false;
  let runInFlight = false;
  let disposePromise;
  // Running totals across every run() of this agent.
  const stats = {
    runs: 0,
    input: 0,
    output: 0,
    cacheRead: 0,
    cacheCreation: 0,
    cost: 0,
    turns: 0,
    toolCalls: 0,
    durationMs: 0
  };

  // Lazily creates the session on first use; rejects once disposed.
  const getSession = async () => {
    if (disposed) {
      throw new Error("Eval agent has been disposed");
    }
    if (sessionPromise == null) {
      sessionPromise = createSession({ store });
    }
    return sessionPromise;
  };

  // Agent-level servers (value or factory) come first, then per-run servers.
  const resolveMcpServers = async (runServers) => {
    const baseServers = typeof baseMcpServers === "function" ? await baseMcpServers() : baseMcpServers;
    const merged = [...(baseServers ?? []), ...(runServers ?? [])];
    return merged.length > 0 ? merged : undefined;
  };

  // Executes one headless run against the shared session and execution.
  const run = async (runOptions) => {
    if (runInFlight) {
      throw new Error("Eval agent already has a run in progress");
    }
    runInFlight = true;
    try {
      const session = await getSession();
      // Remember where this run starts so only its turns are reported back.
      const turnsBefore = session.turns.length;
      const { mcpServers: runMcpServers, onEvent: runOnEvent, ...perRunOptions } = runOptions;
      const mcpServers = await resolveMcpServers(runMcpServers);
      const result = await runHeadless({
        ...headlessDefaults,
        ...perRunOptions,
        session,
        execution: executionOwner.execution,
        ...(mcpServers ? { mcpServers } : {}),
        onEvent(event) {
          baseOnEvent?.(event);
          runOnEvent?.(event);
        }
      });
      const newTranscript = result.transcript.slice(turnsBefore);
      const { usage } = result;
      const runStats = {
        input: usage.input,
        output: usage.output,
        cacheRead: usage.cacheRead,
        cacheCreation: usage.cacheCreation,
        cost: usage.cost ?? 0,
        durationMs: result.durationMs,
        turns: result.turns,
        toolCalls: countToolCallsInTurns(newTranscript)
      };
      accumulateEvalAgentStats(stats, runStats);
      return { result, stats: runStats, newTranscript };
    } finally {
      runInFlight = false;
    }
  };

  // Memoized teardown: marks the agent disposed and releases the execution.
  const dispose = () => {
    if (disposePromise == null) {
      disposePromise = (async () => {
        disposed = true;
        await executionOwner.dispose();
      })();
    }
    return disposePromise;
  };

  return {
    run,
    stats,
    session: getSession,
    dispose,
    [Symbol.asyncDispose]: dispose
  };
}
191
/**
 * Turn a transcript into an ordered, grouped trajectory.
 *
 * Assistant turns are visited in order and each thinking / text / tool-call
 * block becomes a `think` / `text` / `tool` step; consecutive steps with the
 * same kind and name are merged and counted. Tool step durations are
 * approximated by splitting the `createdAt` gap to the following turn evenly
 * across that turn's tool calls.
 *
 * @param transcript Array of turns with `role`, `content` and `createdAt`.
 * @returns `{ steps, totalBlocks }` where `totalBlocks` counts every block
 *   before grouping.
 */
function buildTrajectory(transcript) {
  const steps = [];
  let totalBlocks = 0;

  // Appends one block, folding it into the previous step when it matches.
  const record = (kind, name, durationMs) => {
    totalBlocks += 1;
    const previous = steps[steps.length - 1];
    const sameAsPrevious = previous && previous.kind === kind && previous.name === name;
    if (!sameAsPrevious) {
      const step = { kind };
      if (name) step.name = name;
      step.count = 1;
      if (durationMs > 0) step.durationMs = durationMs;
      steps.push(step);
      return;
    }
    previous.count += 1;
    if (durationMs > 0) {
      previous.durationMs = (previous.durationMs ?? 0) + durationMs;
    }
  };

  transcript.forEach((turn, index) => {
    if (turn.role !== "assistant") return;
    const following = transcript[index + 1];
    const turnDuration = following ? Math.max(0, following.createdAt - turn.createdAt) : 0;
    const relevant = turn.content.filter((block) => {
      const { type } = block;
      return type === "thinking" || type === "redacted_thinking" || type === "text" || type === "tool_call";
    });
    const toolCount = relevant.filter((block) => block.type === "tool_call").length;
    const perTool = toolCount > 0 ? Math.round(turnDuration / toolCount) : 0;
    for (const block of relevant) {
      switch (block.type) {
        case "thinking":
        case "redacted_thinking":
          record("think", undefined, 0);
          break;
        case "text":
          record("text", undefined, 0);
          break;
        case "tool_call":
          record("tool", block.name, perTool);
          break;
      }
    }
  });

  return { steps, totalBlocks };
}
232
/**
 * Render a trajectory as a single uncolored timeline string, with one label
 * per step separated by arrows.
 *
 * @param trajectory Object with a `steps` array.
 * @returns The joined timeline.
 */
function formatTrajectoryLine(trajectory) {
  const labels = [];
  for (const step of trajectory.steps) {
    labels.push(stepLabel(step));
  }
  return labels.join(" → ");
}
236
/**
 * Label for one trajectory step: the tool name for tool steps (or "tool" when
 * unnamed), otherwise the step kind. Grouped steps get a "×N" suffix.
 *
 * @param step Step with `kind`, `count` and an optional `name`.
 * @returns Display label.
 */
function stepLabel(step) {
  let label = step.kind;
  if (step.kind === "tool") {
    label = step.name ?? "tool";
  }
  if (step.count > 1) {
    return `${label} ×${step.count}`;
  }
  return label;
}
240
/** Module-level list of registered eval definitions (`{ id, define }`). */
const evalRegistry = [];

/**
 * Placeholder provider handed to eval factories when only their static
 * identity (suite / id) is needed. Its message builders return empty messages
 * and `stream` always rejects, so it can never be used for a real run.
 */
const STUB_PROVIDER = {
  name: "stub",
  meta: { defaultModel: "stub" },
  formatTools(tools) {
    return tools;
  },
  userMessage() {
    return { role: "user", content: [] };
  },
  assistantMessage() {
    return { role: "assistant", content: [] };
  },
  toolResultsMessage() {
    return { role: "user", content: [] };
  },
  async stream() {
    throw new Error("stub provider is not runnable");
  }
};
261
/**
 * Register an eval definition. Eval files call this at module load, and the
 * harness (`buildRegisteredEvals`) materializes the definitions with a real
 * provider at run time. Keeping the factory lazy lets one registry serve any
 * provider/judge pairing without re-importing eval files.
 *
 * A definition whose registry id is already present is not added again.
 *
 * @param define Factory that receives `{ provider, judge }` and returns the eval.
 * @returns The same factory, so a file can also `export default` it for
 *   direct, registry-free use (e.g. hermetic unit tests).
 */
function defineEval(define) {
  const id = safeRegistryId(define);
  const alreadyRegistered = evalRegistry.some((entry) => entry.id === id);
  if (!alreadyRegistered) {
    evalRegistry.push({ id, define });
  }
  return define;
}
278
/**
 * Materialize every registered eval for one provider/judge pair.
 *
 * @param ctx Context passed to each registered factory.
 * @returns The built evals, in registration order.
 */
function buildRegisteredEvals(ctx) {
  const built = [];
  // Iterate over a copy so the set of entries is fixed for this call.
  for (const entry of evalRegistry.slice()) {
    built.push(entry.define(ctx));
  }
  return built;
}
282
/** Empty the eval registry in place — a helper for test isolation. */
function clearRegisteredEvals() {
  evalRegistry.splice(0, evalRegistry.length);
}
286
/**
 * Derive a registry id (`<suite>/<id>`, suite defaulting to "eval") by
 * invoking the factory with stub providers. If the factory throws or returns
 * nothing usable, fall back to a positional id based on the registry size.
 *
 * @param define Eval factory to probe.
 * @returns Registry id string.
 */
function safeRegistryId(define) {
  const stubContext = { provider: STUB_PROVIDER, judge: STUB_PROVIDER };
  try {
    const probe = define(stubContext);
    const suite = probe.suite ?? "eval";
    return `${suite}/${probe.id}`;
  } catch {
    // Best effort only: a factory that cannot run against stubs still registers.
    return `eval/${evalRegistry.length}`;
  }
}
297
/** System prompt used by the LLM judge when the caller does not supply one. */
const DEFAULT_JUDGE_SYSTEM = `You are a strict software eval judge.
Grade only the supplied output against the rubric.
Call the \`submit_grade\` tool with your score and reasoning.
The score must be between 0 and 1.`;
303
/**
 * Tool definition the judge is forced to call; its input schema is what
 * enforces the shape of the grade (`score` and `reasoning` required,
 * `feedback` optional, no extra properties).
 */
const JUDGE_TOOL = {
  name: "submit_grade",
  description: "Submit the grade for the output under evaluation.",
  inputSchema: {
    type: "object",
    properties: {
      score: { type: "number", minimum: 0, maximum: 1, description: "Quality score from 0 (worst) to 1 (best)." },
      reasoning: { type: "string", description: "Brief justification for the score." },
      feedback: { type: "string", description: "Optional actionable feedback." }
    },
    required: ["score", "reasoning"],
    additionalProperties: false
  }
};
329
/**
 * Run one eval case end to end: prepare the workspace, execute the headless
 * run, emit efficiency metrics, run the scorers, finalize metrics and tag
 * scores, optionally write artifacts, and always clean the workspace up.
 *
 * @param options Eval case options. `id`, `suite`, `tags`, `artifactDir`,
 *   `workspace`, `scorers`, `onEvent`, `metrics` and `sourceFile` are consumed
 *   here; the remaining keys are forwarded to `runHeadless`.
 * @returns The eval result (score, pass flag, scorer results, metrics, tag
 *   scores, trajectory, events) plus `artifacts` when an artifact dir is set.
 * @throws EvalMetricError when a scorer emits a metric id that is not declared
 *   (or built in), or emits a non-finite value.
 */
async function runEvalCase(options) {
  const { id, suite, tags = [], artifactDir, workspace, scorers = [], onEvent, ...headless } = options;
  const { metrics: declaredMetrics, sourceFile, ...headlessRest } = headless;
  const events = [];
  const caseArtifactDir = artifactDir ? join(artifactDir, safeSegment(suite ?? "eval"), safeSegment(id)) : undefined;
  const workspaceState = await prepareWorkspace(headlessRest.execution, workspace);
  const caseMetricIds = new Set(Object.keys(declaredMetrics ?? {}));
  const hasCaseMetrics = caseMetricIds.size > 0;
  // Declared metrics may override the built-in efficiency specs.
  const metricSpecs = {
    ...EFFICIENCY_METRICS,
    ...declaredMetrics
  };
  const rawMetrics = /* @__PURE__ */ new Map();
  const emitMetric = (metricId, raw) => {
    // Own-property check: the `in` operator would also accept inherited keys
    // such as "constructor" or "toString" and let an undeclared id through.
    if (!Object.prototype.hasOwnProperty.call(metricSpecs, metricId)) throw new EvalMetricError(`Eval ${id}: metric "${metricId}" emitted but not declared in defineMetrics`);
    if (!Number.isFinite(raw)) throw new EvalMetricError(`Eval ${id}: metric "${metricId}" emitted a non-finite value (${raw}); scores require finite numbers`);
    rawMetrics.set(metricId, raw);
  };
  try {
    const result = await runHeadless({
      ...headlessRest,
      ...(workspaceState.execution ? { execution: workspaceState.execution } : {}),
      onEvent(event) {
        // Keep every event for scorers/artifacts, then forward to the caller.
        events.push(event);
        onEvent?.(event);
      }
    });
    emitEfficiencyMetrics(emitMetric, result);
    const workspaceSnapshot = workspaceState.snapshot();
    const workspaceError = workspaceState.error();
    const scores = await runScorers({
      id,
      ...(suite ? { suite } : {}),
      tags,
      result,
      events,
      metric: emitMetric,
      ...(workspaceSnapshot ? { workspace: workspaceSnapshot } : {}),
      ...(caseArtifactDir ? { artifactDir: caseArtifactDir } : {})
    }, scorers);
    const metrics = finalizeEvalMetrics(metricSpecs, rawMetrics);
    const tagScores = computeEvalTagScores(metrics);
    const caseMetrics = metrics.filter((metric) => caseMetricIds.has(metric.id));
    // Case-declared metrics drive the headline score; otherwise fall back to
    // the aggregate of the scorer results.
    const score = hasCaseMetrics && caseMetrics.length > 0 ? meanNormalized(caseMetrics) : aggregateScore(scores);
    const evalResult = {
      id,
      ...(suite ? { suite } : {}),
      tags,
      result,
      score,
      passed: result.status === "completed" && scores.every((entry) => entry.passed),
      scores,
      metrics,
      tagScores,
      trajectory: buildTrajectory(result.transcript),
      events,
      ...(sourceFile ? { sourceFile } : {}),
      ...(workspaceSnapshot ? { workspace: workspaceSnapshot } : {}),
      ...(workspaceError ? { workspaceError } : {})
    };
    const artifacts = caseArtifactDir ? await writeEvalArtifacts(caseArtifactDir, evalResult) : undefined;
    return {
      ...evalResult,
      ...(artifacts ? { artifacts } : {})
    };
  } finally {
    await workspaceState.cleanup();
  }
}
398
/**
 * Scorer factory: passes (score 1) when the run status is "completed".
 * On failure the details carry the actual status and the run error.
 *
 * @param name Scorer name; defaults to "status.completed".
 * @returns Scorer taking `{ result }`.
 */
function statusCompleted(name = "status.completed") {
  return ({ result }) => {
    const completed = result.status === "completed";
    let details;
    if (!completed) {
      details = { status: result.status, error: result.error };
    }
    return {
      name,
      passed: completed,
      score: completed ? 1 : 0,
      details
    };
  };
}
409
/**
 * Scorer factory: passes when the workspace snapshot contains a file whose
 * path equals `path`. A missing workspace counts as "not found".
 *
 * @param path Path to look for in `workspace.files`.
 * @param name Scorer name; defaults to `file.exists:<path>`.
 * @returns Scorer taking `{ workspace }`.
 */
function fileExists(path, name = `file.exists:${path}`) {
  return ({ workspace }) => {
    const present = workspace != null
      ? workspace.files.some((entry) => entry.path === path)
      : false;
    if (present) {
      return { name, passed: true, score: 1, details: undefined };
    }
    return { name, passed: false, score: 0, details: `Missing ${path}` };
  };
}
420
/**
 * Scorer factory: passes when at least one of `paths` is present in the
 * workspace snapshot.
 *
 * @param paths Candidate paths, checked in order.
 * @param name Scorer name; defaults to `file.existsOneOf:<paths joined by |>`.
 * @returns Scorer taking `{ workspace }`.
 */
function fileExistsOneOf(paths, name = `file.existsOneOf:${paths.join("|")}`) {
  return ({ workspace }) => {
    const isPresent = (candidate) => workspace?.files.some((entry) => entry.path === candidate);
    const match = paths.find((candidate) => isPresent(candidate));
    let details;
    if (!match) {
      details = `Missing one of: ${paths.join(", ")}`;
    }
    return {
      name,
      passed: Boolean(match),
      score: match ? 1 : 0,
      details
    };
  };
}
431
/**
 * Scorer factory: passes when the file at `path` contains `expected`.
 *
 * `expected` may be a substring or a RegExp. Scorers are reused across runs,
 * so for regexes `lastIndex` is reset around the match: a global (`g`) or
 * sticky (`y`) regex is stateful and would otherwise alternate between
 * matching and not matching identical content on successive calls.
 *
 * @param path Path of the file to inspect in `workspace.files`.
 * @param expected Substring or RegExp the content must match.
 * @param name Scorer name; defaults to `file.contains:<path>`.
 * @returns Scorer taking `{ workspace }`. A missing file is scored against
 *   empty content and reported as missing in `details`.
 */
function fileContains(path, expected, name = `file.contains:${path}`) {
  return ({ workspace }) => {
    const file = workspace?.files.find((candidate) => candidate.path === path);
    const content = file?.content ?? "";
    let passed;
    if (typeof expected === "string") {
      passed = content.includes(expected);
    } else {
      // Start every match from the beginning and leave the regex clean.
      expected.lastIndex = 0;
      passed = expected.test(content);
      expected.lastIndex = 0;
    }
    let details;
    if (!passed) {
      details = file ? `Expected ${path} to contain ${String(expected)}` : `Missing ${path}`;
    }
    return {
      name,
      passed,
      score: passed ? 1 : 0,
      details
    };
  };
}
444
/**
 * Scorer factory grading how closely a file's content matches `expected`,
 * ignoring leading/trailing whitespace on both sides:
 *
 * - 1    — trimmed content equals trimmed `expected`
 * - 0.75 — trimmed content contains trimmed `expected`
 * - 0    — otherwise, or the file is missing
 *
 * The scorer passes at 0.75 or above.
 *
 * A former 0.5 tier (`content.includes(expected)`) was removed because it
 * could never be reached: any content containing `expected` also contains its
 * trimmed form once trimmed, so such content always hit the 0.75 tier first.
 * Scores are therefore unchanged.
 *
 * @param path Path of the file to inspect in `workspace.files`.
 * @param expected Expected content.
 * @param name Scorer name; defaults to `file.quality:<path>`.
 * @returns Scorer taking `{ workspace }`.
 */
function fileContentQuality(path, expected, name = `file.quality:${path}`) {
  return ({ workspace }) => {
    const content = workspace?.files.find((file) => file.path === path)?.content;
    if (content === undefined) {
      return {
        name,
        passed: false,
        score: 0,
        details: `Missing ${path}`
      };
    }
    const normalized = content.trim();
    const expectedNormalized = expected.trim();
    let score = 0;
    if (normalized === expectedNormalized) score = 1;
    else if (normalized.includes(expectedNormalized)) score = .75;
    const passed = score >= .75;
    return {
      name,
      passed,
      score,
      details: passed ? undefined : `Expected ${path} to closely match ${JSON.stringify(expectedNormalized)}`
    };
  };
}
464
/**
 * Scorer factory that asks an LLM to grade the run against a rubric.
 *
 * The judge is forced to call the `submit_grade` tool; its score is read from
 * that call (falling back to streamed/returned text) and the scorer passes at
 * a score of 0.7 or above. When `options.metric` is set the score is also
 * emitted under that metric id.
 *
 * @param options Judge options: `provider`, `rubric`, and optional `name`,
 *   `model`, `system`, `input(ctx)`, `maxTokens` (default 600), `timeoutMs`
 *   (default 60000; a value of 0 or less disables the timeout) and `metric`.
 * @returns Async scorer taking the scorer context.
 */
function llmJudge(options) {
  const name = options.name ?? "llm.judge";
  const model = options.model ?? options.provider.meta.defaultModel;
  return async (ctx) => {
    const { provider } = options;
    const userMessage = provider.userMessage(renderJudgePrompt({
      rubric: options.rubric,
      input: options.input?.(ctx) ?? defaultJudgeInput(ctx)
    }));
    let streamedText = "";
    const controller = new AbortController();
    const timeoutMs = options.timeoutMs ?? 6e4;
    const timeoutEnabled = timeoutMs > 0;
    const makeTimeoutError = () => /* @__PURE__ */ new Error(`LLM judge timed out after ${timeoutMs}ms`);
    let timedOut = false;
    let timer;
    if (timeoutEnabled) {
      timer = setTimeout(() => {
        timedOut = true;
        controller.abort(makeTimeoutError());
      }, timeoutMs);
    }
    try {
      const request = {
        model,
        system: options.system ?? DEFAULT_JUDGE_SYSTEM,
        tools: provider.formatTools([JUDGE_TOOL]),
        messages: [userMessage],
        maxTokens: options.maxTokens ?? 600,
        cache: false,
        signal: controller.signal,
        // Force the structured grading tool rather than free-form text.
        toolChoice: {
          type: "tool",
          name: JUDGE_TOOL.name
        }
      };
      const streamPromise = provider.stream(request, {
        onText(delta) {
          streamedText += delta;
        }
      });
      let result;
      if (timeoutEnabled) {
        // Race against the abort signal so a provider that ignores the
        // signal still cannot hang the scorer past the timeout.
        const timeoutPromise = new Promise((_resolve, reject) => {
          controller.signal.addEventListener("abort", () => {
            if (timedOut) reject(makeTimeoutError());
          }, { once: true });
        });
        result = await Promise.race([streamPromise, timeoutPromise]);
      } else {
        result = await streamPromise;
      }
      const parsed = readJudgeResult(result.toolCalls, streamedText || result.text);
      if (options.metric) ctx.metric(options.metric, parsed.score);
      const details = { reasoning: parsed.reasoning };
      if (parsed.feedback) details.feedback = parsed.feedback;
      if (parsed.raw) details.raw = parsed.raw;
      details.usage = result.usage;
      return {
        name,
        passed: parsed.score >= .7,
        score: parsed.score,
        details
      };
    } finally {
      if (timer) clearTimeout(timer);
    }
  };
}
520
/**
 * Extract the grade from a judge response. Prefers the `submit_grade` tool
 * call; when the judge did not call it, the fallback text is parsed instead.
 *
 * @param toolCalls Tool calls returned by the provider.
 * @param fallbackText Text to parse when no grading tool call is present.
 * @returns Object with `score`, `reasoning` and, when provided as a string,
 *   `feedback` (or whatever `parseJudgeOutput` returns on the fallback path).
 */
function readJudgeResult(toolCalls, fallbackText) {
  const gradeCall = toolCalls.find((candidate) => candidate.name === JUDGE_TOOL.name);
  if (!gradeCall) {
    return parseJudgeOutput(fallbackText);
  }
  const input = gradeCall.input;
  const graded = {
    score: clampScore(input.score),
    reasoning: typeof input.reasoning === "string" ? input.reasoning : ""
  };
  if (typeof input.feedback === "string") {
    graded.feedback = input.feedback;
  }
  return graded;
}
532
/**
 * Combine a group of pass/fail scorers into one `0..1` functionality metric
 * (the fraction that passed) and emit it. Useful for checks such as
 * "N of M files present". An empty group counts as fully passing.
 *
 * @param metricId Metric id to emit the fraction under.
 * @param scorers Scorers to run against the same context.
 * @param name Scorer name; defaults to `metricId`.
 * @returns Async scorer whose details list each check and its outcome.
 */
function functionalityMetric(metricId, scorers, name = metricId) {
  return async (ctx) => {
    const results = await runScorers(ctx, scorers);
    const total = results.length;
    let passed = 0;
    const checks = [];
    for (const outcome of results) {
      if (outcome.passed) passed += 1;
      checks.push({ name: outcome.name, passed: outcome.passed });
    }
    const fraction = total === 0 ? 1 : passed / total;
    ctx.metric(metricId, fraction);
    return {
      name,
      passed: passed === total,
      score: fraction,
      details: { passed, total, checks }
    };
  };
}
557
/**
 * Render a plain-text, multi-line summary of a single eval case: identity,
 * status, score, timing, usage, optional workspace/artifact info and one
 * line per scorer (followed by its string details or reasoning, if any).
 *
 * @param result Eval case result as produced by `runEvalCase`.
 * @returns Newline-joined summary.
 */
function formatEvalCaseSummary(result) {
  const run = result.result;
  const { usage } = run;
  const suitePrefix = result.suite ? `${result.suite}/` : "";
  const costSuffix = usage.cost !== undefined ? ` cost=$${usage.cost.toFixed(6)}` : "";
  const lines = [];
  lines.push(`Eval ${suitePrefix}${result.id}`);
  lines.push(`status: ${run.status}`);
  lines.push(`passed: ${result.passed}`);
  lines.push(`score: ${result.score.toFixed(2)}`);
  lines.push(`duration: ${run.durationMs}ms`);
  lines.push(`turns: ${run.turns}`);
  lines.push(`tool calls: ${run.numToolCalls}`);
  lines.push(`usage: input=${usage.input} output=${usage.output} cacheRead=${usage.cacheRead} cacheCreation=${usage.cacheCreation}${costSuffix}`);
  if (result.workspace) {
    lines.push(`workspace files: ${result.workspace.files.length}`);
  }
  if (result.workspaceError) {
    lines.push(`workspace error: ${result.workspaceError}`);
  }
  if (result.artifacts) {
    lines.push(`artifacts: ${result.artifacts.dir}`);
  }
  if (result.scores.length > 0) {
    lines.push("scores:");
    for (const entry of result.scores) {
      // Scorers without a numeric score are shown as 1.00 / 0.00 by pass flag.
      let value;
      if (entry.score !== undefined) value = entry.score.toFixed(2);
      else value = entry.passed ? "1.00" : "0.00";
      lines.push(` - ${entry.name}: ${value} ${entry.passed ? "pass" : "fail"}`);
      const { details } = entry;
      if (typeof details === "string" && details.length > 0) {
        lines.push(` ${details}`);
      } else if (isRecord(details) && typeof details.reasoning === "string") {
        lines.push(` ${details.reasoning}`);
      }
    }
  }
  return lines.join("\n");
}
582
/**
 * Format a whole eval run using the default formatting options.
 *
 * @param results Eval case results.
 * @returns The formatted run summary.
 */
function formatEvalRunSummary(results) {
  const summaryText = formatEvalRunSummaryWithOptions(results);
  return summaryText;
}
585
/**
 * Register one test per eval case (times `repeat`) against a test runner,
 * funnel every result into a shared reporter, and print the aggregated summary
 * once after all cases.
 *
 * A failing test means the agent run itself broke (provider/tool/timeout). Low
 * scores are reported, not asserted — see {@link RegisterEvalTestsOptions.failOnIncomplete}.
 *
 * `repeat` and `concurrency` are normalized to integers of at least 1. A value
 * that is not a number (for example `NaN` from a malformed environment
 * variable) falls back to 1 instead of silently registering no tests or
 * producing a pool that never starts a task.
 *
 * @param options Runner (`it` / `afterAll`), `cases`, and optional `reporter`,
 *   `failOnIncomplete` (default true), `printSummary` (default true),
 *   `repeat`, `concurrency`, `artifactDir` and `dispose`.
 * @returns The reporter that collects the results.
 */
function registerEvalTests(options) {
  const reporter = options.reporter ?? createEvalRunReporter();
  const failOnIncomplete = options.failOnIncomplete ?? true;
  const printSummary = options.printSummary ?? true;
  const requestedRepeat = Number(options.repeat ?? 1);
  // A non-finite repeat can never be satisfied, so it is treated as 1.
  const repeat = Number.isFinite(requestedRepeat) ? Math.max(1, Math.floor(requestedRepeat)) : 1;
  const requestedConcurrency = Number(options.concurrency ?? 1);
  // Infinity stays valid here and simply means "unbounded".
  const concurrency = Number.isNaN(requestedConcurrency) ? 1 : Math.max(1, Math.floor(requestedConcurrency));
  options.runner.afterAll(async () => {
    try {
      await reporter.flush();
      if (printSummary) console.log(`\n${reporter.format()}\n`);
    } finally {
      // Release caller resources even when flushing or formatting fails.
      if (options.dispose) await options.dispose();
    }
  });
  const work = [];
  for (const evalCase of options.cases) {
    const baseLabel = `${evalCase.suite ? `${evalCase.suite}/` : ""}${evalCase.id}`;
    for (let run = 1; run <= repeat; run++) {
      // Repeats get distinct labels and ids so their results do not collide.
      const label = repeat > 1 ? `${baseLabel} #${run}` : baseLabel;
      const caseId = repeat > 1 ? `${evalCase.id}-repeat-${run}` : evalCase.id;
      work.push({
        label,
        run: () => runEvalCase({
          ...evalCase,
          id: caseId,
          ...(options.artifactDir ? { artifactDir: options.artifactDir } : {})
        })
      });
    }
  }
  const schedule = createLazyPool(concurrency);
  for (const item of work) {
    options.runner.it(item.label, async () => {
      const result = await schedule(item.run);
      await reporter.record(result);
      if (failOnIncomplete && result.result.status !== "completed") throw new Error(`Eval ${item.label} did not complete: ${JSON.stringify(result.result.error ?? { status: result.result.status }, null, 2)}`);
    });
  }
  return reporter;
}
629
/**
 * Lazily run tasks with a bounded concurrency `limit`. Nothing starts until a
 * runner invokes the corresponding test callback, so filtered/skipped tests do
 * not leak model or Docker work in the background.
 *
 * The limit is normalized to an integer of at least 1: fractional limits are
 * floored, and `NaN`, zero or negative limits fall back to 1. Without this a
 * `NaN`/zero limit would queue tasks forever and a fractional limit such as
 * 1.5 would let two tasks run at once. `Infinity` means unbounded.
 *
 * @param limit Maximum number of tasks running at the same time.
 * @returns `schedule(task)`: queues `task` (a function returning a value or a
 *   promise) and returns a promise that settles with the task's outcome.
 */
function createLazyPool(limit) {
  const requested = Number(limit);
  const capacity = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
  const queue = [];
  let active = 0;
  // Start queued tasks while there is spare capacity. Each start bumps
  // `active` synchronously, so the loop condition stays accurate.
  const drain = () => {
    while (active < capacity && queue.length > 0) {
      const start = queue.shift();
      start();
    }
  };
  return (task) => new Promise((resolve, reject) => {
    const run = async () => {
      active++;
      try {
        resolve(await task());
      } catch (err) {
        reject(err);
      } finally {
        active--;
        drain();
      }
    };
    // `run` settles the outer promise itself and never rejects.
    queue.push(() => void run());
    drain();
  });
}
657
/**
 * Create a reporter that accumulates eval case results in memory and, when
 * `options.outputDir` is set, also persists them as JSON.
 *
 * @param options Optional `outputDir` (per-case files go under `cases/`, the
 *   run summary to `run-summary.json`) and `color` (forwarded to formatting).
 * @returns Reporter with a `results` getter, `record(result)`, `flush()`
 *   (builds and optionally writes the run summary) and `format()`.
 */
function createEvalRunReporter(options = {}) {
  const collected = [];

  // Serialize with a trailing newline, as the original files are written.
  const toJsonFile = (value) => `${JSON.stringify(value, null, 2)}\n`;

  async function record(result) {
    collected.push(result);
    if (!options.outputDir) return;
    const casesDir = join(options.outputDir, "cases");
    const fileName = `${safeSegment(result.suite ?? "eval")}--${safeSegment(result.id)}.json`;
    const casePath = join(casesDir, fileName);
    await mkdir(casesDir, { recursive: true });
    await writeFile(casePath, toJsonFile(result));
  }

  async function flush() {
    const summary = buildEvalRunSummary(collected);
    if (options.outputDir) {
      await mkdir(options.outputDir, { recursive: true });
      await writeFile(join(options.outputDir, "run-summary.json"), toJsonFile(summary));
    }
    return summary;
  }

  return {
    get results() {
      return collected;
    },
    record,
    flush,
    format() {
      return formatEvalRunSummaryWithOptions(collected, { color: options.color });
    }
  };
}
684
/**
 * Return a copy of `results` ordered by their `suite/id` key, so that the
 * completion order of parallel runs does not shuffle the output.
 *
 * @param results Eval case results; the input is not modified.
 * @returns New, sorted array.
 */
function sortCases(results) {
  const keyOf = (entry) => `${entry.suite ?? ""}/${entry.id}`;
  const ordered = Array.from(results);
  ordered.sort((left, right) => keyOf(left).localeCompare(keyOf(right)));
  return ordered;
}
692
/**
 * Build the aggregate summary of an eval run: counts, mean score, total
 * duration and usage, a compact per-case list (sorted by `suite/id`), and the
 * aggregated metrics and tag scores.
 *
 * @param input Eval case results in any order.
 * @returns The run summary object.
 */
function buildEvalRunSummary(input) {
  const results = sortCases(input);
  const usage = { input: 0, output: 0, cacheRead: 0, cacheCreation: 0, cost: 0 };
  let totalScore = 0;
  let totalDurationMs = 0;
  let passedCount = 0;
  const cases = [];
  for (const entry of results) {
    const run = entry.result;
    usage.input += run.usage.input;
    usage.output += run.usage.output;
    usage.cacheRead += run.usage.cacheRead;
    usage.cacheCreation += run.usage.cacheCreation;
    // Cost is optional on a run; a missing cost contributes nothing.
    usage.cost += run.usage.cost ?? 0;
    totalScore += entry.score;
    totalDurationMs += run.durationMs;
    if (entry.passed) passedCount += 1;
    cases.push({
      id: entry.id,
      ...(entry.suite ? { suite: entry.suite } : {}),
      passed: entry.passed,
      score: entry.score,
      status: run.status,
      durationMs: run.durationMs,
      scores: entry.scores,
      metrics: entry.metrics,
      tagScores: entry.tagScores,
      trajectory: entry.trajectory
    });
  }
  const count = results.length;
  return {
    count,
    passed: passedCount,
    score: count > 0 ? totalScore / count : 0,
    durationMs: totalDurationMs,
    usage,
    cases,
    metrics: aggregateMetrics(results),
    tagScores: aggregateTagScores(results)
  };
}
731
/**
 * Group every metric across all results by metric id and compute statistics
 * over its raw and normalized values. Direction and tags are taken from the
 * first occurrence of each metric.
 *
 * @param results Eval case results, each with a `metrics` array.
 * @returns One aggregate per metric id, in first-seen order.
 */
function aggregateMetrics(results) {
  const grouped = /* @__PURE__ */ new Map();
  for (const entry of results) {
    for (const metric of entry.metrics) {
      if (!grouped.has(metric.id)) grouped.set(metric.id, []);
      grouped.get(metric.id).push(metric);
    }
  }
  const aggregates = [];
  for (const [id, metrics] of grouped) {
    const [first] = metrics;
    aggregates.push({
      id,
      direction: first.direction,
      tags: first.tags,
      raw: metricStats(metrics.map((metric) => metric.raw)),
      normalized: metricStats(metrics.map((metric) => metric.normalized))
    });
  }
  return aggregates;
}
746
/**
 * Average each tag's score across all results that report that tag.
 *
 * @param results Eval case results, each with a `tagScores` map.
 * @returns Map of tag to mean score, in first-seen tag order.
 */
function aggregateTagScores(results) {
  const valuesByTag = /* @__PURE__ */ new Map();
  for (const entry of results) {
    for (const [tag, value] of Object.entries(entry.tagScores)) {
      if (!valuesByTag.has(tag)) valuesByTag.set(tag, []);
      valuesByTag.get(tag).push(value);
    }
  }
  const averages = {};
  valuesByTag.forEach((values, tag) => {
    averages[tag] = mean(values);
  });
  return averages;
}
757
/**
 * Summary statistics for a list of numbers.
 *
 * The input array is not mutated; a sorted copy is used for min/max and the
 * percentiles. Empty input yields zeros throughout.
 *
 * @param {number[]} values - Samples to summarise.
 * @returns {{ mean: number, min: number, max: number, p50: number, p90: number, zeroCount: number, values: number[] }}
 *   The statistics, plus the original `values` array by reference.
 */
function metricStats(values) {
  const ascending = Array.from(values).sort((left, right) => left - right);
  const lowest = ascending[0];
  const highest = ascending[ascending.length - 1];
  const zeros = values.filter((value) => value === 0);
  return {
    mean: mean(values),
    min: lowest ?? 0,
    max: highest ?? 0,
    p50: percentile(ascending, 0.5),
    p90: percentile(ascending, 0.9),
    zeroCount: zeros.length,
    values,
  };
}
769
/**
 * Nearest-rank percentile of an already ascending-sorted array.
 *
 * @param {number[]} sorted - Values sorted in ascending order.
 * @param {number} q - Quantile as a fraction (e.g. 0.5 for the median).
 * @returns {number} The selected element, or 0 for an empty array.
 */
function percentile(sorted, q) {
  const size = sorted.length;
  if (size === 0) return 0;
  const rank = Math.ceil(q * size) - 1;
  // Clamp into the valid index range so q <= 0 and q >= 1 stay in bounds.
  const index = Math.min(size - 1, Math.max(0, rank));
  return sorted[index];
}
773
/**
 * Render a human-readable summary of an eval run: a results table, an
 * optional per-tag table, then a detail section for every case.
 *
 * @param {*} input - Eval results; passed to `sortCases` to obtain the ordered list.
 * @param {{ color?: boolean }} [options] - `color` is forwarded to
 *   `createEvalColors`; path hyperlinks are emitted unless it is exactly `false`.
 * @returns {string} The summary text without trailing newlines.
 */
function formatEvalRunSummaryWithOptions(input, options = {}) {
  const cases = sortCases(input);
  const color = createEvalColors(options.color);
  const title = color.heading("Eval run summary");
  if (cases.length === 0) return `${title}\n${color.muted("no evals ran")}`;

  const summary = buildEvalRunSummary(cases);
  const lines = [title, renderResultsTable(cases, summary, color)];
  const tagTable = renderTagTable(summary.tagScores, color);
  if (tagTable) lines.push("", tagTable);
  lines.push("");

  // NOTE(review): hyperlinks are only disabled by an explicit `color: false`,
  // whereas colors fall back to auto-detection when `color` is undefined —
  // confirm that emitting OSC 8 sequences in that case is intended.
  const emitHyperlinks = options.color !== false;
  const pathLabel = (target) => {
    const absolute = absPath(target);
    if (!emitHyperlinks) return absolute;
    return oscLink(absolute, absolute);
  };

  for (const entry of cases) {
    const prefix = entry.suite ? `${entry.suite}/` : "";
    const label = `${prefix}${entry.id}`;
    lines.push(`${color.dot(entry.passed)} ${color.caseStatus(entry.passed, label)}`);
    lines.push(`${color.muted("score")} ${color.score(entry.score)}`);

    const tagParts = Object.entries(entry.tagScores).map(([tag, value]) => `${color.tag(tag)} ${color.score(value)}`);
    if (tagParts.length > 0) lines.push(`${color.muted("tags:")} ${tagParts.join(" ")}`);

    for (const metric of entry.metrics) {
      const note = metric.missing
        ? color.fail(" (not emitted)")
        : color.muted(` (${formatMetricRaw(metric)})`);
      lines.push(`${color.muted(padEnd(metric.id, 28))} ${color.score(metric.normalized)}${note}`);
    }

    // Only failing scorers that carry a textual detail are listed.
    for (const score of entry.scores) {
      if (score.passed) continue;
      const detail = scoreDetailLine(score);
      if (!detail) continue;
      lines.push(`${color.dot(score.passed)} ${color.muted(`${score.name}: ${truncate(detail, 140)}`)}`);
    }

    if (entry.sourceFile) lines.push(`${color.muted("case:")} ${color.muted(pathLabel(entry.sourceFile))}`);
    const resultTarget = caseLinkTarget(entry);
    if (resultTarget) lines.push(`${color.muted("result:")} ${color.muted(pathLabel(resultTarget))}`);
    if (entry.trajectory.steps.length > 0) {
      lines.push(`${color.muted(`turns: ${entry.trajectory.totalBlocks}`)} ${formatTrajectoryColored(entry.trajectory, color)}`);
    }
    lines.push("");
  }
  return lines.join("\n").replace(/\n+$/, "");
}
814
/**
 * Render the per-tag score table as box-drawing text.
 *
 * Column widths are derived from the content so that borders, header and
 * rows always line up (the score column was previously a fixed 9 dashes wide
 * while its cells were narrower, leaving the right border misaligned).
 * Scores are right-aligned, matching the SCORE column of the results table.
 *
 * @param {Record<string, number>} tagScores - Score per tag.
 * @param {{ heading: Function, muted: Function, score: Function }} color -
 *   Painter set; `color.score(value)` is expected to render `value.toFixed(2)`
 *   (optionally wrapped in styling), as both painters in this file do.
 * @returns {string | undefined} The table, or `undefined` when there are no tags.
 */
function renderTagTable(tagScores, color) {
  const entries = Object.entries(tagScores);
  if (entries.length === 0) return void 0;
  const rows = entries.map(([tag, value]) => ({ tag, value, text: value.toFixed(2) }));
  const tagWidth = Math.max("TAG".length, ...rows.map((row) => row.tag.length));
  const scoreWidth = Math.max("SCORE".length, ...rows.map((row) => row.text.length));
  const rule = (left, joint, right) =>
    `${left}${"─".repeat(tagWidth + 2)}${joint}${"─".repeat(scoreWidth + 2)}${right}`;
  return [
    rule("┌", "┬", "┐"),
    `│ ${color.heading("TAG".padEnd(tagWidth))} │ ${color.heading("SCORE".padStart(scoreWidth))} │`,
    rule("├", "┼", "┤"),
    ...rows.map((row) => {
      // Pad outside the painted text so styling escape codes do not skew the width.
      const gap = " ".repeat(scoreWidth - row.text.length);
      return `│ ${color.muted(row.tag.padEnd(tagWidth))} │ ${gap}${color.score(row.value)} │`;
    }),
    rule("└", "┴", "┘"),
  ].join("\n");
}
829
/**
 * Describe a metric's raw value with a direction arrow, e.g. `raw 1,234 ↓`.
 *
 * Integers go through `formatInt`; other numbers are shown with 3 decimals.
 * The arrow is `↓` for "lower-is-better" and `↑` for anything else.
 *
 * @param {{ raw: number, direction: string }} metric - Finalized metric.
 * @returns {string} The formatted description.
 */
function formatMetricRaw(metric) {
  const { raw, direction } = metric;
  const amount = Number.isInteger(raw) ? formatInt(raw) : raw.toFixed(3);
  const arrow = direction === "lower-is-better" ? "↓" : "↑";
  return `raw ${amount} ${arrow}`;
}
832
/**
 * Pick the filesystem target an eval case should link to.
 *
 * The written `result.json` path (`artifacts.result`) wins; the artifact
 * directory (`artifacts.dir`) is the fallback. Yields `undefined` when the
 * case has no artifacts.
 */
function caseLinkTarget(result) {
  const artifacts = result.artifacts;
  if (artifacts?.result != null) return artifacts.result;
  return artifacts?.dir;
}
840
/**
 * Wrap `text` in an OSC 8 terminal hyperlink.
 *
 * Targets that already start with `file://`, `http://` or `https://` are used
 * as-is; anything else is resolved to an absolute path and prefixed with
 * `file://`.
 */
function oscLink(target, text) {
  const hasScheme = /^(?:file|https?):\/\//.test(target);
  let url = target;
  if (!hasScheme) url = `file://${resolve(target)}`;
  const opener = `\x1B]8;;${url}\x1B\\`;
  const closer = "\x1B]8;;\x1B\\";
  return `${opener}${text}${closer}`;
}
847
/** Turn a path or `file://` URL into an absolute filesystem path (relative inputs are resolved). */
function absPath(target) {
  const scheme = "file://";
  const local = target.startsWith(scheme) ? target.slice(scheme.length) : target;
  return resolve(local);
}
851
/**
 * Render the per-case results table with a trailing TOTAL row.
 *
 * Column widths are computed from the unpainted cell text; painting is
 * applied per cell by `paintCell` after padding.
 *
 * @param {Array<object>} results - Ordered eval case results.
 * @param {object} summary - Run summary (`passed`, `count`, `score`, `usage`, `durationMs`).
 * @param {object} color - Painter set from `createEvalColors`.
 * @returns {string} The table as box-drawing text.
 */
function renderResultsTable(results, summary, color) {
  const columns = [
    { title: "EVAL", align: "left" },
    { title: "STATUS", align: "left" },
    { title: "SCORE", align: "right" },
    { title: "IN", align: "right" },
    { title: "OUT", align: "right" },
    { title: "CACHE R", align: "right" },
    { title: "CACHE W", align: "right" },
    { title: "COST", align: "right" },
    { title: "TIME", align: "right" },
  ];
  const statusText = (passed) => (passed ? "pass" : "fail");
  const usageCells = (usage, cost) => [
    formatInt(usage.input),
    formatInt(usage.output),
    formatInt(usage.cacheRead),
    formatInt(usage.cacheCreation),
    formatCost(cost),
  ];

  const rows = results.map((result) => {
    const { usage, durationMs } = result.result;
    const prefix = result.suite ? `${result.suite}/` : "";
    return {
      passed: result.passed,
      score: result.score,
      cells: [
        `${prefix}${result.id}`,
        statusText(result.passed),
        result.score.toFixed(2),
        ...usageCells(usage, usage.cost ?? 0),
        formatDuration(durationMs),
      ],
    };
  });

  const allPassed = summary.passed === summary.count;
  const totals = {
    passed: allPassed,
    score: summary.score,
    cells: [
      `TOTAL (${summary.passed}/${summary.count})`,
      statusText(allPassed),
      summary.score.toFixed(2),
      ...usageCells(summary.usage, summary.usage.cost),
      formatDuration(summary.durationMs),
    ],
  };

  // Each column is as wide as its widest cell, including header and totals.
  const widths = columns.map(({ title }, index) =>
    [...rows, totals].reduce((widest, row) => Math.max(widest, row.cells[index].length), title.length));
  const border = (left, joint, right) =>
    `${left}${widths.map((width) => "─".repeat(width + 2)).join(joint)}${right}`;
  const renderRow = (cells, paint) => {
    const rendered = cells.map((cell, index) => {
      const padded = columns[index].align === "right"
        ? padStart(cell, widths[index])
        : padEnd(cell, widths[index]);
      return ` ${paint ? paint(index, cell, padded) : padded} `;
    });
    return `│${rendered.join("│")}│`;
  };
  const separator = border("├", "┼", "┤");

  return [
    border("┌", "┬", "┐"),
    renderRow(columns.map((column) => column.title), (_index, _raw, padded) => color.heading(padded)),
    separator,
    ...rows.map((row) =>
      renderRow(row.cells, (index, raw, padded) => paintCell(index, raw, padded, row.passed, row.score, color))),
    separator,
    renderRow(totals.cells, (index, raw, padded) =>
      paintCell(index, raw, padded, totals.passed, totals.score, color, true)),
    border("└", "┴", "┘"),
  ].join("\n");
}
928
/**
 * Apply the column-specific styling to one padded table cell.
 *
 * @param {number} col - Column index (0 label, 1 status, 2 score, 7 cost; others muted).
 * @param {string} raw - Unpadded cell text.
 * @param {string} padded - Cell text padded to the column width.
 * @param {boolean} passed - Whether the row's case (or the whole run) passed.
 * @param {number} score - Score used to pick the score colour.
 * @param {object} color - Painter set from `createEvalColors`.
 * @param {boolean} [bold=false] - Render the label column as a heading (totals row).
 * @returns {string} The painted cell.
 */
function paintCell(col, raw, padded, passed, score, color, bold = false) {
  switch (col) {
    case 0:
      return bold ? color.heading(padded) : color.caseStatus(passed, padded);
    case 1:
      // Swap only the text so the padding stays outside the styling codes.
      return padded.replace(raw, color.status(passed));
    case 2:
      return padded.replace(raw, color.score(score));
    case 7:
      return color.cost(padded);
    default:
      return color.muted(padded);
  }
}
935
/**
 * Extract a one-line explanation from a scorer result.
 *
 * @param {{ details?: unknown }} score - Scorer output.
 * @returns {string | undefined} `details` when it is a string, `details.reasoning`
 *   when that is a string, otherwise `undefined`.
 */
function scoreDetailLine(score) {
  const { details } = score;
  if (typeof details === "string") return details;
  if (!isRecord(details)) return undefined;
  return typeof details.reasoning === "string" ? details.reasoning : undefined;
}
939
/** Format a number with `en-US` digit grouping (e.g. `1,234,567`). */
function formatInt(value) {
  const locale = "en-US";
  return value.toLocaleString(locale);
}
942
/** Format a positive cost as dollars with 4 decimals; anything else renders as `-`. */
function formatCost(value) {
  if (!(value > 0)) return "-";
  return `$${value.toFixed(4)}`;
}
945
/**
 * Format a duration in milliseconds: whole milliseconds below one second,
 * otherwise seconds (one decimal unless the value is a whole number of seconds).
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} e.g. `250ms`, `1s`, `1.5s`.
 */
function formatDuration(ms) {
  const wholeMs = Math.round(ms);
  if (wholeMs < 1000) return `${wholeMs}ms`;
  const seconds = wholeMs / 1000;
  const text = Number.isInteger(seconds) ? String(seconds) : seconds.toFixed(1);
  return `${text}s`;
}
951
/** Right-pad `value` with spaces to `width` characters; longer values are returned unchanged. */
function padEnd(value, width) {
  return value.padEnd(width, " ");
}
954
/** Left-pad `value` with spaces to `width` characters; longer values are returned unchanged. */
function padStart(value, width) {
  return value.padStart(width, " ");
}
957
/**
 * Collapse all whitespace runs to single spaces, trim, and cut the result to
 * at most `max` characters, ending with `…` when it had to be shortened.
 *
 * @param {string} value - Text to shorten.
 * @param {number} max - Maximum length of the returned string.
 * @returns {string} The single-line, possibly truncated text.
 */
function truncate(value, max) {
  const collapsed = value.split(/\s+/).filter(Boolean).join(" ");
  if (collapsed.length <= max) return collapsed;
  return `${collapsed.slice(0, max - 1)}…`;
}
961
/**
 * Set up the execution context and bookkeeping for an eval workspace.
 *
 * Without a `workspace` nothing is wrapped and every accessor is a no-op.
 * Otherwise the given `execution` (or a new process context, rooted at
 * `workspace.cwd` or a fresh temp directory) is wrapped with the workspace
 * lifecycle so seeding and snapshot capture happen around spawn/destroy.
 *
 * @param {object | undefined} execution - Existing execution context, if any.
 * @param {object | undefined} workspace - Workspace options (`cwd`, `retain`, ...).
 * @returns {Promise<{ execution?: object, snapshot: Function, error: Function, cleanup: Function }>}
 *   `cleanup` removes the temp directory created here unless `workspace.retain` is set.
 */
async function prepareWorkspace(execution, workspace) {
  if (!workspace) {
    const nothing = () => void 0;
    return {
      snapshot: nothing,
      error: nothing,
      cleanup: async () => {},
    };
  }

  let ownedTempDir;
  let base = execution;
  if (!base) {
    let cwd = workspace.cwd;
    if (cwd == null) cwd = await mkdtemp(join(tmpdir(), "zidane-eval-"));
    // Only a directory created here is eligible for removal during cleanup.
    if (!workspace.cwd) ownedTempDir = cwd;
    base = createProcessContext({ cwd });
  }

  const captured = {};
  const cleanup = async () => {
    if (!ownedTempDir || workspace.retain) return;
    await rm(ownedTempDir, { recursive: true, force: true });
  };
  return {
    execution: withWorkspaceLifecycle(base, workspace, captured),
    snapshot: () => captured.snapshot,
    error: () => captured.error,
    cleanup,
  };
}
987
/**
 * Wrap an execution context so workspaces are seeded on spawn and captured
 * on destroy.
 *
 * - `spawn` defaults `cwd` to `workspace.cwd` when the caller gave none, then
 *   seeds the workspace when `workspace.seedDir` is set. If seeding fails the
 *   freshly spawned handle is destroyed before the error is rethrown, so a
 *   failed seed does not leak the handle (the caller never receives it and
 *   could not destroy it itself).
 * - `destroy` captures a snapshot into `captured.snapshot`; a capture failure
 *   is recorded in `captured.error` and the handle is destroyed regardless.
 *
 * @param {object} execution - Underlying context (`spawn`, `destroy`, ...).
 * @param {object} workspace - Workspace options (`cwd`, `seedDir`, ...).
 * @param {{ snapshot?: object, error?: string }} captured - Mutable sink for capture results.
 * @returns {object} A copy of `execution` with wrapped `spawn` and `destroy`.
 */
function withWorkspaceLifecycle(execution, workspace, captured) {
  return {
    ...execution,
    async spawn(config) {
      const handle = await execution.spawn({
        ...config,
        ...(workspace.cwd && !config?.cwd ? { cwd: workspace.cwd } : {}),
      });
      if (workspace.seedDir) {
        try {
          await seedWorkspace(execution, handle, workspace);
        } catch (err) {
          try {
            // Use the raw destroy: there is nothing worth snapshotting yet.
            await execution.destroy(handle);
          } catch {
            // Best-effort cleanup; the seeding error below is the one that matters.
          }
          throw err;
        }
      }
      return handle;
    },
    async destroy(handle) {
      try {
        captured.snapshot = await captureWorkspace(execution, handle, workspace);
      } catch (err) {
        captured.error = err instanceof Error ? err.message : String(err);
      } finally {
        await execution.destroy(handle);
      }
    },
  };
}
1009
/**
 * Copy the contents of `workspace.seedDir` into the workspace via a shell
 * command run through `execution.exec`.
 *
 * @param {object} execution - Context providing `exec(handle, command)`.
 * @param {object} handle - Spawned handle to run the command in.
 * @param {{ seedDir: string, seedTarget?: string }} workspace - Seed source and
 *   destination (destination defaults to `.`).
 * @throws {Error} When the command exits non-zero.
 */
async function seedWorkspace(execution, handle, workspace) {
  const destination = shellQuote(workspace.seedTarget ?? ".");
  // The trailing "/." copies the directory's contents rather than the directory itself.
  const source = shellQuote(`${workspace.seedDir}/.`);
  const outcome = await execution.exec(handle, `mkdir -p ${destination} && cp -R ${source} ${destination}`);
  if (outcome.exitCode === 0) return;
  throw new Error(`Failed to seed eval workspace: ${outcome.stderr || outcome.stdout}`);
}
1014
/**
 * Snapshot the files of a workspace.
 *
 * Lists regular files under each capture path (default `.`) with `find`,
 * pruning `node_modules` and `.git`, then reads every distinct path through
 * `execution.readFile`. Files containing a NUL character are flagged as
 * binary and their content omitted; text longer than `workspace.maxFileChars`
 * (default 256 * 1024) is cut and flagged as truncated.
 *
 * @param {object} execution - Context providing `exec` and `readFile`.
 * @param {{ cwd: string }} handle - Spawned handle whose `cwd` is listed.
 * @param {{ capture?: string[], maxFileChars?: number }} workspace - Capture options.
 * @returns {Promise<{ cwd: string, files: Array<object> }>} The snapshot.
 * @throws {Error} When the listing command exits non-zero.
 */
async function captureWorkspace(execution, handle, workspace) {
  const requested = workspace.capture?.length ? workspace.capture : ["."];
  const targets = requested.map(shellQuote).join(" ");
  const findFiles = "find \"$p\" \\( -path \"*/node_modules/*\" -o -path \"*/.git/*\" \\) -prune -o -type f -print0";
  const perPath = `if [ -e "$p" ]; then ${findFiles} ; fi`;
  const command = `cd ${shellQuote(handle.cwd)} && for p in ${targets} ; do ${perPath} done`;

  const listed = await execution.exec(handle, command);
  if (listed.exitCode !== 0) throw new Error(`Failed to list eval workspace: ${listed.stderr || listed.stdout}`);

  const limit = workspace.maxFileChars ?? 256 * 1024;
  // A Set keeps first-seen order while dropping paths listed more than once.
  const uniquePaths = new Set(
    listed.stdout
      .split("\0")
      .filter(Boolean)
      .map((rawPath) => normalizeSnapshotPath(rawPath)),
  );

  const files = [];
  for (const path of uniquePaths) {
    const content = await execution.readFile(handle, path);
    const truncated = content.length > limit;
    const entry = { path, size: content.length };
    if (content.includes("\0")) entry.binary = true;
    else entry.content = truncated ? content.slice(0, limit) : content;
    if (truncated) entry.truncated = true;
    files.push(entry);
  }
  return { cwd: handle.cwd, files };
}
1051
/**
 * Run scorers one after another and collect their results.
 *
 * A scorer that throws is recorded as a failed score (score 0, with the error
 * message as details) instead of aborting the run — except `EvalMetricError`,
 * which is rethrown.
 *
 * @param {object} ctx - Context handed to every scorer.
 * @param {Iterable<Function>} scorers - Scorer functions, sync or async.
 * @returns {Promise<Array<object>>} One score per scorer, in order.
 */
async function runScorers(ctx, scorers) {
  const collected = [];
  for (const scorer of scorers) {
    let outcome;
    try {
      outcome = await scorer(ctx);
    } catch (err) {
      if (err instanceof EvalMetricError) throw err;
      outcome = {
        name: scorer.name || "anonymous-scorer",
        passed: false,
        score: 0,
        details: err instanceof Error ? err.message : String(err),
      };
    }
    collected.push(outcome);
  }
  return collected;
}
1066
/**
 * Average scorer results into one score.
 *
 * A scorer without a numeric `score` counts as 1 when it passed and 0
 * otherwise. With no scorers at all the score is 1.
 *
 * @param {Array<{ score?: number, passed?: boolean }>} scores - Scorer outputs.
 * @returns {number} The mean score.
 */
function aggregateScore(scores) {
  if (scores.length === 0) return 1;
  const total = scores.reduce((sum, entry) => {
    const contribution = entry.score ?? (entry.passed ? 1 : 0);
    return sum + contribution;
  }, 0);
  return total / scores.length;
}
1070
/**
 * Derive the built-in efficiency metric values from a run result.
 *
 * @param {{ durationMs: number, usage: { input: number, output: number, cacheRead: number } }} result - Run result.
 * @returns {{ "execution-time": number, "provider-tokens": number, "cache-read-rate": number }}
 *   Duration, `input + output` tokens, and `cacheRead / (input + cacheRead)`
 *   (0 when that denominator is not positive).
 */
function efficiencyMetricValues(result) {
  const { input, output, cacheRead } = result.usage;
  const readTotal = input + cacheRead;
  const cacheReadRate = readTotal > 0 ? cacheRead / readTotal : 0;
  return {
    "execution-time": result.durationMs,
    "provider-tokens": input + output,
    "cache-read-rate": cacheReadRate,
  };
}
1079
/**
 * Emit every efficiency metric of `result` through `emit(id, rawValue)`.
 *
 * @param {(id: string, raw: number) => void} emit - Metric sink.
 * @param {object} result - Run result passed to `efficiencyMetricValues`.
 */
function emitEfficiencyMetrics(emit, result) {
  const values = efficiencyMetricValues(result);
  Object.keys(values).forEach((id) => {
    emit(id, values[id]);
  });
}
1083
/**
 * Turn declared metric specs plus emitted raw values into finalized metrics.
 *
 * Every spec yields one entry. A metric that was never emitted gets
 * `raw: 0`, `normalized: 0` and `missing: true`; an emitted one is normalized
 * with `normalizeMetric`.
 *
 * @param {Record<string, object>} specs - Metric specs keyed by metric id.
 * @param {Map<string, number>} raw - Emitted raw values keyed by metric id.
 * @returns {Array<object>} Finalized metrics in spec order.
 */
function finalizeEvalMetrics(specs, raw) {
  return Object.entries(specs).map(([metricId, spec]) => {
    const emitted = raw.has(metricId);
    const metric = {
      id: metricId,
      raw: 0,
      normalized: 0,
      direction: spec.direction,
      min: spec.min,
      max: spec.max,
      tags: spec.tags ?? [],
    };
    if (emitted) {
      const value = raw.get(metricId);
      metric.raw = value;
      metric.normalized = normalizeMetric(value, spec);
    }
    if (spec.description) metric.description = spec.description;
    if (!emitted) metric.missing = true;
    return metric;
  });
}
1100
/**
 * Compute one score per tag as the mean normalized value of the metrics
 * carrying that tag.
 *
 * @param {Array<{ tags: Iterable<string>, normalized: number }>} metrics - Finalized metrics.
 * @returns {Record<string, number>} Mean normalized value per tag.
 */
function computeEvalTagScores(metrics) {
  const buckets = new Map();
  for (const { tags, normalized } of metrics) {
    for (const tag of tags) {
      const bucket = buckets.get(tag);
      if (bucket) bucket.push(normalized);
      else buckets.set(tag, [normalized]);
    }
  }
  const scores = {};
  buckets.forEach((values, tag) => {
    scores[tag] = mean(values);
  });
  return scores;
}
1111
/** Mean of the `normalized` values of `metrics`; 0 when there are none. */
function meanNormalized(metrics) {
  if (metrics.length === 0) return 0;
  const normalizedValues = metrics.map(({ normalized }) => normalized);
  return mean(normalizedValues);
}
1114
/**
 * Add one run's statistics to a running total, in place.
 *
 * Increments `acc.runs` and sums the token, cost, turn, tool-call and
 * duration counters. A missing `turn.cost` counts as 0 — the same treatment
 * the run summary and results table give to an absent usage cost — so one
 * run without cost information does not turn the accumulated cost into NaN.
 *
 * @param {object} acc - Mutable accumulator with numeric counters.
 * @param {object} turn - Statistics of the run to add.
 */
function accumulateEvalAgentStats(acc, turn) {
  acc.runs += 1;
  acc.input += turn.input;
  acc.output += turn.output;
  acc.cacheRead += turn.cacheRead;
  acc.cacheCreation += turn.cacheCreation;
  acc.cost += turn.cost ?? 0;
  acc.turns += turn.turns;
  acc.toolCalls += turn.toolCalls;
  acc.durationMs += turn.durationMs;
}
1125
/**
 * Count the `tool_call` content blocks across all assistant turns.
 *
 * @param {Iterable<{ role: string, content: Array<{ type: string }> }>} turns - Transcript turns.
 * @returns {number} Number of tool-call blocks in turns whose role is "assistant".
 */
function countToolCallsInTurns(turns) {
  const isToolCall = (block) => block.type === "tool_call";
  let total = 0;
  for (const turn of turns) {
    if (turn.role === "assistant") total += turn.content.filter(isToolCall).length;
  }
  return total;
}
1133
/** Arithmetic mean of `values`; 0 for an empty array. */
function mean(values) {
  const count = values.length;
  if (count === 0) return 0;
  const total = values.reduce((sum, value) => sum + value, 0);
  return total / count;
}
1136
/** True for any non-null value whose `typeof` is "object" (arrays included). */
function isRecord(value) {
  if (value === null) return false;
  return typeof value === "object";
}
1139
/**
 * Build the painter set used by the eval summary renderers.
 *
 * With colors disabled every painter returns plain text (status as
 * "pass"/"fail", scores with two decimals, dots as "+"/"x"). With colors
 * enabled the same painters wrap their output with chalk styles; scores are
 * green from 0.9, yellow from 0.7 and red below.
 *
 * @param {boolean} [enabled] - Whether to colorize; defaults to `shouldUseColor()`.
 * @returns {object} Painters: heading, pass, fail, muted, cost, status, score,
 *   caseStatus, dot, tag, step.
 */
function createEvalColors(enabled = shouldUseColor()) {
  if (enabled) {
    const paintScore = (value) => {
      const text = value.toFixed(2);
      if (value >= 0.9) return chalk.green(text);
      if (value >= 0.7) return chalk.yellow(text);
      return chalk.red(text);
    };
    const byOutcome = (passed, value) => (passed ? chalk.green(value) : chalk.red(value));
    return {
      heading: (value) => chalk.bold(value),
      pass: (value) => chalk.green(value),
      fail: (value) => chalk.red(value),
      muted: (value) => chalk.gray(value),
      cost: (value) => chalk.cyan(value),
      status: (passed) => byOutcome(passed, passed ? "pass" : "fail"),
      score: paintScore,
      caseStatus: (passed, value) => byOutcome(passed, value),
      dot: (passed) => byOutcome(passed, "●"),
      tag: (value) => chalk.bold(chalk.whiteBright(value)),
      step: (kind, value) => paintStep(kind, value),
    };
  }

  const identity = (value) => value;
  return {
    heading: identity,
    pass: identity,
    fail: identity,
    muted: identity,
    cost: identity,
    status: (passed) => (passed ? "pass" : "fail"),
    score: (value) => value.toFixed(2),
    caseStatus: (_passed, value) => value,
    dot: (passed) => (passed ? "+" : "x"),
    tag: identity,
    step: (_kind, value) => value,
  };
}
1167
/**
 * Colour one trajectory step label: "think" is magenta, "text" is blue, and
 * any other kind is coloured by the first tool family its label matches
 * (write/edit, read/search, shell, spawn), falling back to white.
 */
function paintStep(kind, value) {
  if (kind === "think") return chalk.magenta(value);
  if (kind === "text") return chalk.blue(value);
  // Order matters: the first matching family decides the colour.
  const families = [
    [/write|edit|create/, "green"],
    [/read|list|glob|grep|search/, "cyan"],
    [/shell|bash|exec|run/, "yellow"],
    [/spawn|task|agent/, "magentaBright"],
  ];
  for (const [pattern, style] of families) {
    if (pattern.test(value)) return chalk[style](value);
  }
  return chalk.white(value);
}
1177
/**
 * Render a trajectory as a single arrow-separated, coloured line.
 *
 * Tool steps are labelled with their name (or "tool"), other steps with their
 * kind; repeated steps get a `×count` suffix, and steps of at least 100 ms a
 * muted duration.
 *
 * @param {{ steps: Array<object> }} trajectory - Trajectory with ordered steps.
 * @param {object} color - Painter set providing `step` and `muted`.
 * @returns {string} The rendered line.
 */
function formatTrajectoryColored(trajectory, color) {
  const rendered = trajectory.steps.map((step) => {
    let label = step.kind === "tool" ? step.name ?? "tool" : step.kind;
    if (step.count > 1) label = `${label} ×${step.count}`;
    const painted = color.step(step.kind, label);
    const hasTiming = step.durationMs && step.durationMs >= 100;
    const timing = hasTiming ? color.muted(`(${formatDuration(step.durationMs)})`) : "";
    if (!timing) return painted;
    return `${painted} ${timing}`;
  });
  return rendered.join(color.muted(" → "));
}
1186
/** Colors are used only when stdout is a TTY and `NO_COLOR` is unset or empty. */
function shouldUseColor() {
  if (!process.stdout.isTTY) return false;
  return !process.env.NO_COLOR;
}
1189
/**
 * Build the user prompt sent to the LLM judge: the rubric, the output to
 * grade, and an instruction to answer with JSON only.
 *
 * @param {{ rubric: string, input: string }} input - Rubric and text to grade.
 * @returns {string} The prompt text.
 */
function renderJudgePrompt(input) {
  const sections = [
    ["Rubric:", input.rubric],
    ["Output to grade:", input.input],
  ];
  // Missing values render as an empty line, matching Array#join semantics.
  const body = sections.map(([title, text]) => `${title}\n${text ?? ""}`).join("\n\n");
  return `${body}\n\nReturn JSON only.`;
}
1200
/**
 * Default text handed to the judge: the run's final answer followed, when a
 * workspace snapshot exists, by every captured file (binary content is
 * replaced by a placeholder and truncated files are marked).
 *
 * @param {{ result: { finalText: string }, workspace?: { files: Array<object> } }} ctx - Scorer context.
 * @returns {string} The text to grade.
 */
function defaultJudgeInput(ctx) {
  const sections = [`Final answer:\n${ctx.result.finalText}`];
  if (ctx.workspace) {
    const renderedFiles = ctx.workspace.files.map((file) => {
      const marker = file.truncated ? " (truncated)" : "";
      const body = file.binary ? "[binary omitted]" : file.content ?? "";
      return `--- ${file.path}${marker} ---\n${body}`;
    });
    sections.push(`Workspace files:\n${renderedFiles.join("\n")}`);
  }
  return sections.join("\n\n");
}
1205
/**
 * Parse a judge model's reply into `{ score, reasoning, feedback?, raw? }`.
 *
 * Attempts, in order:
 *  1. a JSON object (optionally fenced or surrounded by prose) carrying a
 *     `score` and/or `reasoning` field;
 *  2. a `score: <number>` / `score = <number>` fragment anywhere in the text;
 *  3. a bare fraction such as `3/4`;
 *  4. a bare number.
 * Scores are clamped with `clampScore`. When nothing matches the score is 0
 * with an explanatory reasoning.
 *
 * @param {string} raw - Raw judge output.
 * @returns {{ score: number, reasoning: string, feedback?: string, raw?: string }}
 */
function parseJudgeOutput(raw) {
  const text = stripCodeFences(raw.trim());
  const withRaw = (score, reasoning) => ({ score, reasoning, raw: text });

  const candidate = extractJsonObject(text);
  if (candidate) {
    try {
      const parsed = JSON.parse(candidate);
      const hasVerdict = parsed.score !== void 0 || parsed.reasoning !== void 0;
      if (hasVerdict) {
        const verdict = {
          score: clampScore(parsed.score),
          reasoning: typeof parsed.reasoning === "string" ? parsed.reasoning : "",
        };
        if (typeof parsed.feedback === "string") verdict.feedback = parsed.feedback;
        // Keep the full reply only when the JSON was embedded in other text.
        if (candidate !== text) verdict.raw = text;
        return verdict;
      }
    } catch {
      // Not valid JSON after all; fall through to the textual fallbacks.
    }
  }

  const labelled = /score["']?\s*[:=]\s*(-?\d+(?:\.\d+)?)/i.exec(text);
  if (labelled) return withRaw(clampScore(Number(labelled[1])), text.slice(0, 500));

  const fraction = /^(-?\d+(?:\.\d+)?)\s*\/\s*(\d+(?:\.\d+)?)$/.exec(text);
  if (fraction) {
    const denominator = Number(fraction[2]);
    const ratio = denominator === 0 ? 0 : Number(fraction[1]) / denominator;
    return withRaw(clampScore(ratio), text);
  }

  if (/^-?\d+(?:\.\d+)?$/.test(text)) return withRaw(clampScore(Number(text)), "");

  return withRaw(0, "Judge did not return a parseable score.");
}
1243
/**
 * Unwrap text that consists entirely of one markdown code fence (``` or
 * ```json) and trim the inner content; any other text is returned unchanged.
 */
function stripCodeFences(text) {
  const match = /^```(?:json)?[ \t]*\n([\s\S]*?)\n```$/i.exec(text);
  if (match === null) return text;
  return match[1].trim();
}
1248
/**
 * Return the first balanced `{...}` span of `text`, starting at the first
 * opening brace. Braces inside double-quoted strings (with backslash escapes)
 * are ignored. Yields `undefined` when there is no brace or it never closes.
 */
function extractJsonObject(text) {
  const start = text.indexOf("{");
  if (start < 0) return undefined;

  let depth = 0;
  let insideString = false;
  let afterBackslash = false;
  let cursor = start;
  while (cursor < text.length) {
    const position = cursor;
    const ch = text[position];
    cursor += 1;

    if (insideString) {
      if (afterBackslash) afterBackslash = false;
      else if (ch === "\\") afterBackslash = true;
      else if (ch === "\"") insideString = false;
      continue;
    }

    switch (ch) {
      case "\"":
        insideString = true;
        break;
      case "{":
        depth += 1;
        break;
      case "}":
        depth -= 1;
        if (depth === 0) return text.slice(start, position + 1);
        break;
      default:
        break;
    }
  }
  return undefined;
}
1271
/**
 * Coerce a value to a score in the range [0, 1].
 *
 * @param {unknown} value - Number or number-like value.
 * @returns {number} The clamped score; 0 when the value is not a finite number.
 */
function clampScore(value) {
  const numeric = typeof value === "number" ? value : Number(value);
  if (!Number.isFinite(numeric)) return 0;
  const atLeastZero = Math.max(0, numeric);
  return Math.min(1, atLeastZero);
}
1276
/**
 * Write an eval result's artifacts into `dir` (created if needed).
 *
 * Files written: `result.json`, `events.jsonl`, `transcript.json`, and
 * `workspace.json` when the result carries a workspace snapshot.
 *
 * @param {string} dir - Target directory.
 * @param {object} result - Eval case result (`events`, `result.transcript`, optional `workspace`).
 * @returns {Promise<{ dir: string, result: string, events: string, transcript: string, workspace?: string }>}
 *   Paths of everything written.
 */
async function writeEvalArtifacts(dir, result) {
  await mkdir(dir, { recursive: true });
  const toJsonFile = (value) => `${JSON.stringify(value, null, 2)}\n`;
  const paths = {
    dir,
    result: join(dir, "result.json"),
    events: join(dir, "events.jsonl"),
    transcript: join(dir, "transcript.json"),
  };
  if (result.workspace) paths.workspace = join(dir, "workspace.json");

  await writeFile(paths.result, toJsonFile(result));
  await writeFile(paths.events, result.events.map(headlessEventToJsonl).join(""));
  await writeFile(paths.transcript, toJsonFile(result.result.transcript));
  if (paths.workspace) await writeFile(paths.workspace, toJsonFile(result.workspace));
  return paths;
}
1294
/** Drop a leading `./` from a listed path; an empty result becomes `.`. */
function normalizeSnapshotPath(path) {
  const prefix = "./";
  if (path === "" || path === prefix) return ".";
  return path.startsWith(prefix) ? path.slice(prefix.length) : path;
}
1298
/**
 * Turn an arbitrary id into a single safe path segment.
 *
 * Runs of characters other than word characters, `.` and `-` become `-`, and
 * leading/trailing dashes are trimmed. A value that survives unchanged is
 * returned as-is; anything that had to be altered (or is empty) gets a short
 * hash of the original appended so distinct inputs stay distinct.
 *
 * The dot segments `.` and `..` consist solely of allowed characters but
 * would make a path built from them point at the current or parent directory,
 * so they are never returned verbatim and are replaced by `eval-<hash>`.
 *
 * @param {string} value - Suite or case identifier.
 * @returns {string} A non-empty segment that is neither `.` nor `..`.
 */
function safeSegment(value) {
  const sanitized = value.replace(/[^\w.-]+/g, "-").replace(/^-+|-+$/g, "");
  const isDotSegment = sanitized === "." || sanitized === "..";
  if (sanitized === value && sanitized.length > 0 && !isDotSegment) return sanitized;
  const stem = isDotSegment ? "eval" : sanitized || "eval";
  return `${stem}-${shortHash(value)}`;
}
1303
/**
 * Short, deterministic, non-cryptographic hash of a string: a 32-bit hash
 * over the string's UTF-16 code units, rendered as exactly 7 base-36 characters.
 *
 * @param {string} value - Text to hash.
 * @returns {string} Seven characters from `[0-9a-z]`.
 */
function shortHash(value) {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;
  let state = OFFSET_BASIS;
  for (let index = 0; index < value.length; index += 1) {
    // Math.imul keeps the multiplication within 32-bit integer arithmetic.
    state = Math.imul(state ^ value.charCodeAt(index), PRIME);
  }
  const digits = (state >>> 0).toString(36);
  return digits.padStart(7, "0").slice(0, 7);
}
1311
/**
 * Quote a string for use as a single POSIX shell word: wrap it in single
 * quotes and rewrite each embedded single quote as `'\''`.
 */
function shellQuote(value) {
  const escaped = value.split("'").join("'\\''");
  return `'${escaped}'`;
}
1314
/**
 * Absolute artifact directory for a result: `<root>/<suite>/<id>`, with both
 * segments passed through `safeSegment` and the suite defaulting to "eval".
 *
 * @param {string} root - Artifact root directory.
 * @param {{ suite?: string, id: string }} result - Eval case identity.
 * @returns {string} The resolved directory path.
 */
function artifactPath(root, result) {
  const segments = [result.suite ?? "eval", result.id].map((part) => safeSegment(part));
  return resolve(root, ...segments);
}
1317
/** Express `path` relative to the artifact `root`. */
function relativeArtifactPath(root, path) {
  const fromRoot = relative(root, path);
  return fromRoot;
}
1320
+ //#endregion
1321
+ export { EFFICIENCY_METRICS, EvalMetricError, artifactPath, buildEvalRunSummary, buildRegisteredEvals, buildTrajectory, clearRegisteredEvals, computeEvalTagScores, createEvalAgent, createEvalRunReporter, createReusableExecutionContext, defineEval, defineMetrics, efficiencyMetricValues, emitEfficiencyMetrics, fileContains, fileContentQuality, fileExists, fileExistsOneOf, finalizeEvalMetrics, formatEvalCaseSummary, formatEvalRunSummary, formatTrajectoryLine, functionalityMetric, llmJudge, normalizeMetric, registerEvalTests, relativeArtifactPath, runEvalCase, statusCompleted };
1322
+
1323
+ //# sourceMappingURL=eval.js.map