@alis-build/harness-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1396 @@
1
+ import { i as buildJudgeArgs } from "./claude-code-ycT0JQZF.js";
2
+ import { n as createLimit } from "./suite-chj0j22j.js";
3
+ import { spawn } from "node:child_process";
4
+ import { readFile } from "node:fs/promises";
5
+ import { parse } from "yaml";
6
+ import { createHash, randomUUID } from "node:crypto";
7
+ //#region src/types/eval-record.ts
8
/**
 * Schema version identifier for {@link EvalRunEnvelope} JSON documents.
 */
const EVAL_RUN_SCHEMA_VERSION = "1.0";

/**
 * Schema version identifier embedded in every {@link TrajectoryView} when it
 * is exported.
 */
const TRAJECTORY_SCHEMA_VERSION = "1.0";
12
+ //#endregion
13
+ //#region src/otel/attributes.ts
14
/**
 * Build an OTLP key/value attribute whose payload is carried as `stringValue`.
 *
 * @param {string} key - Attribute name.
 * @param {string} value - Attribute value, stored as given.
 * @returns {{ key: string, value: { stringValue: string } }}
 */
function strAttr(key, value) {
  const wrapped = { stringValue: value };
  return { key, value: wrapped };
}
20
/**
 * Build an OTLP key/value attribute whose payload is carried as `intValue`.
 *
 * The value is emitted as a plain decimal string. Inputs that are missing or
 * not finite become "0" and fractional inputs are truncated, so the output is
 * always an integer string (never "undefined", "NaN", "1.5" or "1e+21").
 *
 * @param {string} key - Attribute name.
 * @param {number} value - Integer-like value (e.g. a token count).
 * @returns {{ key: string, value: { intValue: string } }}
 */
function intAttr(key, value) {
  const numeric = Number(value);
  const whole = Number.isFinite(numeric) ? Math.trunc(numeric) : 0;
  // BigInt avoids exponent notation for very large magnitudes.
  return {
    key,
    value: { intValue: BigInt(whole).toString() }
  };
}
26
/**
 * Build an OTLP key/value attribute whose payload is carried as `boolValue`.
 *
 * @param {string} key - Attribute name.
 * @param {boolean} value - Attribute value, stored as given.
 * @returns {{ key: string, value: { boolValue: boolean } }}
 */
function boolAttr(key, value) {
  const wrapped = { boolValue: value };
  return { key, value: wrapped };
}
32
/**
 * Build an OTLP key/value attribute whose payload is the JSON serialization of
 * `value`, carried as `stringValue`.
 *
 * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
 * functions and symbols; those inputs are encoded as the JSON text "null" so
 * `stringValue` is always a string.
 *
 * @param {string} key - Attribute name.
 * @param {unknown} value - Any JSON-serializable value.
 * @returns {{ key: string, value: { stringValue: string } }}
 */
function jsonAttr(key, value) {
  const serialized = JSON.stringify(value) ?? "null";
  return {
    key,
    value: { stringValue: serialized }
  };
}
38
+ //#endregion
39
+ //#region src/otel/messages.ts
40
/**
 * Translate a turn's stop reason into the finish-reason vocabulary used in
 * the exported messages.
 *
 * @param {string | null | undefined} reason - Stop reason recorded on a turn.
 * @returns {string | undefined} `undefined` for a falsy reason; the mapped
 *   name for known reasons; otherwise the reason unchanged.
 */
function mapStopReason(reason) {
  if (!reason) return undefined;
  if (reason === "end_turn" || reason === "stop_sequence") return "stop";
  if (reason === "tool_use") return "tool_calls";
  if (reason === "max_tokens") return "length";
  return reason;
}
50
/**
 * Convert a recorded tool call into a `tool_call` message part.
 *
 * @param {{ callId: string, name: string, args?: unknown }} call
 * @returns {{ type: "tool_call", id: string, name: string, arguments: unknown }}
 *   `arguments` falls back to an empty object when `call.args` is null/undefined.
 */
function toolCallPart(call) {
  const callArguments = call.args ?? {};
  return {
    type: "tool_call",
    id: call.callId,
    name: call.name,
    arguments: callArguments
  };
}
58
/**
 * Convert a recorded tool call into a `tool_call_response` message part that
 * carries the call's result.
 *
 * @param {{ callId: string, result: unknown }} call
 * @returns {{ type: "tool_call_response", id: string, result: unknown }}
 */
function toolResponsePart(call) {
  const { callId, result } = call;
  return { type: "tool_call_response", id: callId, result };
}
65
/**
 * Build the assistant message for one turn: an optional text part followed by
 * one `tool_call` part per tool call, plus `finish_reason` when the turn's
 * stop reason maps to a truthy value.
 *
 * @param {{ text?: string, toolCalls: Array<object>, stopReason?: string }} turn
 * @returns {{ role: "assistant", parts: object[], finish_reason?: string }}
 */
function assistantMessageFromTurn(turn) {
  const textParts = turn.text ? [{ type: "text", content: turn.text }] : [];
  const callParts = [];
  for (const toolCall of turn.toolCalls) callParts.push(toolCallPart(toolCall));
  const message = {
    role: "assistant",
    parts: [...textParts, ...callParts]
  };
  const finishReason = mapStopReason(turn.stopReason);
  // The key is omitted entirely (not set to undefined) when there is no reason.
  if (finishReason) message.finish_reason = finishReason;
  return message;
}
79
/**
 * Build the `tool` role message that carries the results of a turn's tool
 * calls. Calls whose `result` is exactly `null` are left out.
 *
 * @param {Array<{ callId: string, result: unknown }>} calls
 * @returns {{ role: "tool", parts: object[] } | null} `null` when no call has
 *   a non-null result.
 */
function toolResultsMessage(calls) {
  const responseParts = calls.flatMap((call) => (call.result !== null ? [toolResponsePart(call)] : []));
  return responseParts.length > 0 ? { role: "tool", parts: responseParts } : null;
}
87
/**
 * Reconstruct the message history that precedes the assistant turn at
 * `turnIndex`: the user prompt (when truthy), then for every earlier position
 * in `view.turns` the assistant message and, if any results exist, the tool
 * results message.
 *
 * @param {{ turns: Array<object> }} view - Trajectory whose turns are indexed by position.
 * @param {number} turnIndex - Exclusive upper bound of the positions to replay.
 * @param {string} [prompt] - Original user prompt, if known.
 * @returns {object[]} Messages in chronological order.
 */
function inputMessagesBeforeTurn(view, turnIndex, prompt) {
  const history = prompt
    ? [{ role: "user", parts: [{ type: "text", content: prompt }] }]
    : [];
  let position = 0;
  while (position < turnIndex) {
    const priorTurn = view.turns[position];
    position += 1;
    // Missing entries are skipped rather than treated as an error.
    if (!priorTurn) continue;
    history.push(assistantMessageFromTurn(priorTurn));
    const toolMessage = toolResultsMessage(priorTurn.toolCalls);
    if (toolMessage) history.push(toolMessage);
  }
  return history;
}
108
+ //#endregion
109
+ //#region src/otel/types.ts
110
/** OTLP span kinds, as their enum integers. */
const SpanKind = { INTERNAL: 1, CLIENT: 2 };

/** OTLP status codes, as their enum integers. */
const StatusCode = { UNSET: 0, OK: 1, ERROR: 2 };
121
+ //#endregion
122
+ //#region src/otel/emitter.ts
123
/**
 * TrajectoryView → OTLP JSON export using OpenTelemetry GenAI semantic conventions.
 *
 * Produces an `ExportTraceServiceRequest` suitable for OTLP/HTTP JSON ingestion.
 * Assertions continue to use {@link TrajectoryView} directly; this is export-only.
 */
const INSTRUMENTATION_VERSION = "0.1.0";

/**
 * Build the root `invoke_agent` span covering the whole run.
 *
 * @param {object} view - Trajectory being exported.
 * @param {object} ctx - Shared ids and names (`traceId`, `rootSpanId`, `agentName`, `providerName`).
 * @param {string} startNs - Span start, nanoseconds as a string.
 * @param {string} endNs - Span end, nanoseconds as a string.
 */
function otlpRootSpan(view, ctx, startNs, endNs) {
  return {
    traceId: ctx.traceId,
    spanId: ctx.rootSpanId,
    name: "invoke_agent",
    kind: SpanKind.INTERNAL,
    startTimeUnixNano: startNs,
    endTimeUnixNano: endNs,
    attributes: [
      strAttr("gen_ai.operation.name", "invoke_agent"),
      strAttr("gen_ai.agent.name", ctx.agentName),
      strAttr("gen_ai.provider.name", ctx.providerName),
      strAttr("gen_ai.conversation.id", view.meta.sessionId),
      strAttr("gen_ai.request.model", view.meta.model),
      strAttr("gen_ai.response.model", view.meta.model),
      intAttr("gen_ai.usage.input_tokens", view.usage.inputTokens),
      intAttr("gen_ai.usage.output_tokens", view.usage.outputTokens),
      boolAttr("harness_eval.success", view.success)
    ],
    status: viewStatus(view)
  };
}

/**
 * Build the `chat {model}` span for one assistant turn, including the input
 * history before the turn and the turn's own output message.
 */
function otlpChatSpan(view, turn, timing, ctx) {
  const inputMessages = inputMessagesBeforeTurn(view, turn.turnIndex, ctx.prompt);
  const outputMessages = [assistantMessageFromTurn(turn)];
  return {
    traceId: ctx.traceId,
    spanId: spanIdFromKey(ctx.traceId, `chat:${turn.turnIndex}`),
    parentSpanId: ctx.rootSpanId,
    name: `chat ${view.meta.model}`,
    kind: SpanKind.CLIENT,
    startTimeUnixNano: timing.startNs,
    endTimeUnixNano: timing.endNs,
    attributes: [
      strAttr("gen_ai.operation.name", "chat"),
      strAttr("gen_ai.provider.name", ctx.providerName),
      strAttr("gen_ai.request.model", view.meta.model),
      strAttr("gen_ai.response.model", view.meta.model),
      ...inputMessages.length > 0 ? [jsonAttr("gen_ai.input.messages", inputMessages)] : [],
      jsonAttr("gen_ai.output.messages", outputMessages),
      ...turn.stopReason ? [jsonAttr("gen_ai.response.finish_reasons", [mapStopReason(turn.stopReason) ?? turn.stopReason])] : []
    ],
    status: { code: StatusCode.OK }
  };
}

/**
 * Build the `execute_tool {name}` span for one tool call. All tool calls of a
 * turn share the timing slot passed in.
 */
function otlpToolSpan(call, timing, ctx) {
  return {
    traceId: ctx.traceId,
    spanId: spanIdFromKey(ctx.traceId, `tool:${call.callId}`),
    parentSpanId: ctx.rootSpanId,
    name: `execute_tool ${call.name}`,
    kind: SpanKind.INTERNAL,
    startTimeUnixNano: timing.startNs,
    endTimeUnixNano: timing.endNs,
    attributes: [
      strAttr("gen_ai.operation.name", "execute_tool"),
      strAttr("gen_ai.provider.name", ctx.providerName),
      strAttr("gen_ai.tool.name", call.name),
      strAttr("gen_ai.tool.call.id", call.callId),
      jsonAttr("gen_ai.tool.call.arguments", call.args ?? {}),
      ...call.result !== null ? [jsonAttr("gen_ai.tool.call.result", call.result)] : [],
      ...call.namespace ? [strAttr("harness_eval.tool.namespace", call.namespace)] : [],
      boolAttr("harness_eval.tool.is_error", call.isError)
    ],
    status: call.isError ? {
      code: StatusCode.ERROR,
      message: "tool reported error"
    } : { code: StatusCode.OK }
  };
}

/**
 * Map a {@link TrajectoryView} to OTLP trace JSON.
 *
 * Span tree (siblings under `invoke_agent`, not nested):
 * ```
 * invoke_agent
 * ├── chat {model}
 * ├── execute_tool {name}
 * ├── chat {model}
 * └── ...
 * ```
 *
 * The run's time window ends at `options.endTimeMs` (default: now) and spans
 * `view.usage.durationMs` milliseconds, with a minimum of 1 ms. A missing or
 * non-finite duration is treated as 1 ms so timestamps never become "NaN".
 *
 * @param {object} view - Trajectory to export.
 * @param {object} [options]
 * @param {string} [options.agentName="claude-code"]
 * @param {string} [options.providerName="anthropic"]
 * @param {string} [options.serviceName="harness-eval"]
 * @param {string} [options.instrumentationScope="@alis-build/harness-eval"]
 * @param {number} [options.endTimeMs] - End of the run, epoch milliseconds.
 * @param {string} [options.prompt] - User prompt included in chat input history.
 * @returns {{ resourceSpans: object[] }} OTLP `ExportTraceServiceRequest` JSON.
 */
function trajectoryToOtlp(view, options = {}) {
  const agentName = options.agentName ?? "claude-code";
  const providerName = options.providerName ?? "anthropic";
  const serviceName = options.serviceName ?? "harness-eval";
  const scopeName = options.instrumentationScope ?? "@alis-build/harness-eval";
  const traceId = traceIdFromSession(view.meta.sessionId);
  const rootSpanId = spanIdFromKey(traceId, "invoke_agent");
  // Guard against undefined/NaN/Infinity durations, which would poison every timestamp.
  const reportedMs = Number(view.usage.durationMs);
  const durationMs = Number.isFinite(reportedMs) ? Math.max(reportedMs, 1) : 1;
  const endMs = options.endTimeMs ?? Date.now();
  const startMs = endMs - durationMs;
  const ctx = {
    traceId,
    rootSpanId,
    agentName,
    providerName,
    prompt: options.prompt
  };
  const timings = buildSpanTimings(view, startMs, endMs);
  const spans = [otlpRootSpan(view, ctx, msToNs(startMs), msToNs(endMs))];
  // One timing slot per chat span, plus one shared slot per turn that has tool calls.
  let opIndex = 0;
  for (const turn of view.turns) {
    spans.push(otlpChatSpan(view, turn, timings[opIndex++], ctx));
    if (turn.toolCalls.length === 0) continue;
    const toolTiming = timings[opIndex++];
    for (const call of turn.toolCalls) spans.push(otlpToolSpan(call, toolTiming, ctx));
  }
  return { resourceSpans: [{
    resource: { attributes: [strAttr("service.name", serviceName), strAttr("gen_ai.agent.name", agentName)] },
    scopeSpans: [{
      scope: {
        name: scopeName,
        version: INSTRUMENTATION_VERSION
      },
      spans
    }]
  }] };
}

/** Alias matching the implementation plan naming. */
const emitOtel = trajectoryToOtlp;
243
/**
 * Derive the root span status from the run outcome.
 *
 * @param {{ success: boolean }} view
 * @returns {{ code: number, message?: string }} OK when `view.success` is
 *   truthy, otherwise ERROR with a fixed message.
 */
function viewStatus(view) {
  return view.success
    ? { code: StatusCode.OK }
    : { code: StatusCode.ERROR, message: "harness run did not complete successfully" };
}
250
/**
 * Split the run window evenly into consecutive time slots: one per turn (the
 * chat span) plus one more for each turn that has tool calls.
 *
 * @param {{ turns: Array<{ toolCalls: unknown[] }> }} view
 * @param {number} startMs - Window start, epoch milliseconds.
 * @param {number} endMs - Window end, epoch milliseconds.
 * @returns {Array<{ startNs: string, endNs: string }>} Slots in order; empty
 *   when the view has no turns.
 */
function buildSpanTimings(view, startMs, endMs) {
  let slotCount = 0;
  for (const turn of view.turns) {
    slotCount += turn.toolCalls.length > 0 ? 2 : 1;
  }
  if (slotCount === 0) return [];
  // The window is treated as at least 1 ms wide so slots never have zero width.
  const slotMs = Math.max(endMs - startMs, 1) / slotCount;
  const timings = [];
  let cursor = startMs;
  for (let slot = 0; slot < slotCount; slot++) {
    const next = cursor + slotMs;
    timings.push({ startNs: msToNs(cursor), endNs: msToNs(next) });
    cursor = next;
  }
  return timings;
}
271
/**
 * Derive a deterministic trace id from a session id: the first 32 hex
 * characters (upper-cased) of a SHA-256 over a namespaced session string.
 *
 * @param {string} sessionId
 * @returns {string} 32 upper-case hex characters.
 */
function traceIdFromSession(sessionId) {
  const hasher = createHash("sha256");
  hasher.update(`harness-eval:trace:${sessionId}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 32).toUpperCase();
}
274
/**
 * Derive a deterministic span id from a trace id and a span key: the first 16
 * hex characters (upper-cased) of a SHA-256 over `"{traceId}:span:{key}"`.
 *
 * @param {string} traceId
 * @param {string} key - Span key, unique within the trace.
 * @returns {string} 16 upper-case hex characters.
 */
function spanIdFromKey(traceId, key) {
  const hasher = createHash("sha256");
  hasher.update(`${traceId}:span:${key}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 16).toUpperCase();
}
277
/**
 * Convert milliseconds to nanoseconds, rounded to an integer and returned as
 * a string.
 *
 * @param {number} ms
 * @returns {string}
 */
function msToNs(ms) {
  const nanos = Math.round(ms * 1e6);
  return String(nanos);
}
280
+ //#endregion
281
+ //#region src/grader/prompt.ts
282
/**
 * Assemble the prompt sent to the judge model.
 *
 * The prompt consists of the optional (trimmed) system instruction, fixed
 * grading rules, the eval prompt, the execution transcript, the numbered
 * expectations, and the required JSON output schema.
 *
 * @param {{ expectations: string[], prompt: string, transcript: string, systemInstruction?: string }} input
 * @returns {string} The full grader prompt.
 */
function buildGraderPrompt(input) {
  const numbered = input.expectations.map((expectation, index) => `${index + 1}. ${expectation}`);
  const expectationList = numbered.join("\n");
  let preamble = "";
  if (input.systemInstruction) preamble = `${input.systemInstruction.trim()}\n\n`;
  return `${preamble}You are an automated evaluation grader (not the agent under test). Your only job is to score expectations against the transcript below.

Your job is to evaluate each expectation against the transcript and final response.
PASS only when there is clear evidence in the transcript or final response.
When uncertain, FAIL — burden of proof is on PASS.

Also critique the expectations themselves if any are trivially satisfied or miss important outcomes.

## Eval prompt

${input.prompt}

## Execution transcript

${input.transcript}

## Expectations to grade

${expectationList}

## Output format

Respond with ONLY a single JSON object (no markdown fences, no commentary) matching this schema:

{
"expectations": [
{ "text": "<original expectation>", "passed": true|false, "evidence": "<quote or description>" }
],
"summary": { "passed": <int>, "failed": <int>, "total": <int>, "pass_rate": <0.0-1.0> },
"eval_feedback": {
"suggestions": [{ "assertion": "<optional>", "reason": "<string>" }],
"overall": "<brief assessment>"
}
}

Include every expectation in the same order. summary must match the expectations array.`;
}
321
+ //#endregion
322
+ //#region src/grader/parse.ts
323
/**
 * Pull the judge's response text out of the subprocess stdout.
 *
 * When stdout parses as JSON, three shapes are recognized: an array of events,
 * a single `result` event with a string `result`, and a single `assistant`
 * event with a message. Anything else, including non-JSON output, yields the
 * trimmed stdout unchanged.
 *
 * @param {string} stdout - Raw stdout of the judge process.
 * @returns {string} Response text; "" when stdout is blank.
 */
function extractClaudeResponseText(stdout) {
  const raw = stdout.trim();
  if (raw.length === 0) return "";
  try {
    const payload = JSON.parse(raw);
    if (Array.isArray(payload)) {
      const fromEvents = extractFromEventArray(payload);
      return fromEvents ?? raw;
    }
    const isEventObject = payload !== null && typeof payload === "object";
    if (isEventObject && payload.type === "result" && typeof payload.result === "string") {
      return payload.result;
    }
    if (isEventObject && payload.type === "assistant" && payload.message) {
      const assistantText = textFromAssistantMessage(payload.message);
      if (assistantText) return assistantText;
    }
  } catch {
    // Deliberate best-effort: stdout that is not JSON is returned as plain text below.
  }
  return raw;
}
340
/**
 * Extract the response text from an array of judge events.
 *
 * The first `result` event wins when its `result` is a non-empty string.
 * Otherwise the text of the last `assistant` event that yields text is used.
 * A non-string `result` is ignored, matching the single-event path in
 * `extractClaudeResponseText`, so callers always receive a string or `null`.
 *
 * @param {unknown[]} events - Parsed event array.
 * @returns {string | null} Response text, or `null` when none is found.
 */
function extractFromEventArray(events) {
  const isEventOfType = (event, type) => typeof event === "object" && event !== null && event.type === type;
  const resultEvent = events.find((event) => isEventOfType(event, "result"));
  if (typeof resultEvent?.result === "string" && resultEvent.result) return resultEvent.result;
  let lastAssistantText = null;
  for (const event of events) {
    if (!isEventOfType(event, "assistant")) continue;
    const text = textFromAssistantMessage(event.message);
    if (text) lastAssistantText = text;
  }
  return lastAssistantText;
}
351
/**
 * Extract plain text from an assistant message.
 *
 * String content is returned as-is. Array content is reduced to its `text`
 * blocks, joined with newlines.
 *
 * @param {unknown} message - Assistant message object.
 * @returns {string | null} Text, or `null` when the message has no usable text.
 */
function textFromAssistantMessage(message) {
  if (!message || typeof message !== "object") return null;
  const { content } = message;
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return null;
  const isTextBlock = (block) => typeof block === "object" && block !== null && block.type === "text" && typeof block.text === "string";
  const pieces = content.filter(isTextBlock).map((block) => block.text);
  return pieces.length === 0 ? null : pieces.join("\n");
}
360
/**
 * Parse the judge's response into a normalized grading result.
 *
 * Two candidates are tried in order: the whole trimmed text, then the JSON
 * block extracted from it. The first candidate that parses and yields at
 * least one expectation is returned.
 *
 * @param {string} text - Judge response text.
 * @returns {object | null} Normalized grading, or `null` when nothing usable parses.
 */
function parseGraderJson(text) {
  for (const candidate of [text.trim(), extractJsonBlock(text)]) {
    if (!candidate) continue;
    let normalized;
    try {
      normalized = normalizeGraderJson(JSON.parse(candidate));
    } catch {
      // This candidate is not usable JSON; try the next one.
      continue;
    }
    if (normalized.expectations.length > 0) return normalized;
  }
  return null;
}
373
/**
 * Locate a JSON-looking block inside free text.
 *
 * The body of the first Markdown code fence (optionally tagged `json`) is
 * preferred. Otherwise the span from the first `{` to the last `}` is used.
 *
 * @param {string} text
 * @returns {string | null} Candidate JSON text, or `null` when none is found.
 */
function extractJsonBlock(text) {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(text);
  const fencedBody = fenced?.[1];
  if (fencedBody) return fencedBody.trim();
  const open = text.indexOf("{");
  if (open < 0) return null;
  const close = text.lastIndexOf("}");
  return close > open ? text.slice(open, close + 1) : null;
}
381
/**
 * Normalize the judge's raw JSON into `{ expectations, summary, evalFeedback }`.
 *
 * An expectation counts as passed only when `passed` is the boolean `true` or
 * the string "true" (case-insensitive). Plain truthiness is not used, because
 * `Boolean("false")` is `true` and would turn a failing verdict into a pass.
 * Null entries are tolerated and count as failed.
 *
 * Summary fields reported by the judge are kept when present; missing ones are
 * computed from the normalized expectations.
 *
 * @param {object} raw - Parsed judge output.
 * @returns {{ expectations: Array<{ text: string, passed: boolean, evidence: string }>,
 *   summary: { passed: number, failed: number, total: number, passRate: number },
 *   evalFeedback: { suggestions: Array<{ assertion: unknown, reason: string }>, overall: string } | undefined }}
 */
function normalizeGraderJson(raw) {
  const isPass = (verdict) => verdict === true || (typeof verdict === "string" && verdict.trim().toLowerCase() === "true");
  const expectations = (raw.expectations ?? []).map((e) => ({
    text: e?.text ?? "",
    passed: isPass(e?.passed),
    evidence: e?.evidence ?? ""
  }));
  const passed = expectations.filter((e) => e.passed).length;
  const total = expectations.length;
  const failed = total - passed;
  const passRate = raw.summary?.pass_rate ?? raw.summary?.passRate ?? (total === 0 ? 0 : passed / total);
  const summary = {
    passed: raw.summary?.passed ?? passed,
    failed: raw.summary?.failed ?? failed,
    total: raw.summary?.total ?? total,
    passRate
  };
  let evalFeedback;
  if (raw.eval_feedback) {
    evalFeedback = {
      suggestions: (raw.eval_feedback.suggestions ?? []).map((s) => ({
        assertion: s.assertion,
        reason: s.reason ?? ""
      })),
      overall: raw.eval_feedback.overall ?? ""
    };
  }
  return {
    expectations,
    summary,
    evalFeedback
  };
}
411
+ //#endregion
412
+ //#region src/grader/claude-grader.ts
413
/**
 * Grade expectations by spawning Claude as judge (skill-creator grader pattern).
 */
const DEFAULT_TIMEOUT_MS = 300000;

/**
 * Defaults for the judge subprocess. Grading is a single-shot JSON response,
 * not an agent session; without these, Claude Code may load plugins/MCP and
 * loop on tools until timeout.
 */
const JUDGE_CLAUDE_DEFAULTS = {
  maxTurns: 1,
  bare: true,
  disableSlashCommands: true,
  noSessionPersistence: true
};

/**
 * Layer caller-supplied Claude Code options over the judge defaults.
 *
 * @param {object} [claudeCode] - Overrides; its keys win over the defaults.
 * @returns {object} A new merged options object.
 */
function mergeJudgeClaudeOptions(claudeCode) {
  const merged = { ...JUDGE_CLAUDE_DEFAULTS, ...claudeCode };
  return merged;
}
433
/**
 * Create a grading function bound to a fixed set of judge options.
 *
 * @param {object} [options] - Options forwarded to `runClaudeGrader` on every call.
 * @returns {(input: object) => Promise<object>} Function that grades one input.
 */
function createClaudeGrader(options = {}) {
  const grade = (input) => runClaudeGrader(input, options);
  return grade;
}
436
/**
 * Grade one set of expectations by running the judge binary and parsing its
 * JSON verdict.
 *
 * Verdicts are matched to `input.expectations` by position. When the judge
 * output cannot be parsed, every expectation is marked failed and `error`
 * carries the first 200 characters of the response.
 *
 * @param {{ expectations: string[], prompt: string, transcript: string, systemInstruction?: string }} input
 * @param {object} [options]
 * @param {string} [options.binary] - Judge executable; falls back to `claudeCode.binary`, then "claude".
 * @param {string} [options.model] - Judge model; falls back to `claudeCode.model`.
 * @param {number} [options.timeoutMs] - Subprocess timeout; defaults to `DEFAULT_TIMEOUT_MS`.
 * @param {object} [options.env] - Extra environment variables for the subprocess.
 * @param {string} [options.cwd] - Working directory for the subprocess.
 * @param {object} [options.claudeCode] - Claude Code options layered over the judge defaults.
 * @returns {Promise<object>} `{ expectations, summary, evalFeedback? , error? }`.
 */
async function runClaudeGrader(input, options = {}) {
  const { claudeCode } = options;
  const binary = options.binary ?? claudeCode?.binary ?? "claude";
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const prompt = buildGraderPrompt(input);
  const model = options.model ?? claudeCode?.model;
  const judgeArgs = buildJudgeArgs(prompt, {
    ...mergeJudgeClaudeOptions(claudeCode),
    model
  });
  const stdout = await spawnCollectStdout(binary, judgeArgs, timeoutMs, options.env, options.cwd);
  const responseText = extractClaudeResponseText(stdout);
  const parsed = parseGraderJson(responseText);
  if (!parsed) {
    const expectedCount = input.expectations.length;
    return {
      expectations: input.expectations.map((text) => ({
        text,
        passed: false,
        evidence: "Grader returned unparseable output"
      })),
      summary: {
        passed: 0,
        failed: expectedCount,
        total: expectedCount,
        passRate: 0
      },
      error: `failed to parse grader JSON from response: ${responseText.slice(0, 200)}`
    };
  }
  // The caller's expectation text is authoritative; only the verdict and
  // evidence are taken from the judge, matched by position.
  const expectations = input.expectations.map((text, index) => {
    const verdict = parsed.expectations[index];
    return {
      text,
      passed: verdict?.passed ?? false,
      evidence: verdict?.evidence ?? "No evidence returned"
    };
  });
  const total = expectations.length;
  const passed = expectations.filter((entry) => entry.passed).length;
  return {
    expectations,
    summary: {
      passed,
      failed: total - passed,
      total,
      passRate: total === 0 ? 0 : passed / total
    },
    evalFeedback: parsed.evalFeedback
  };
}
481
/**
 * Run a subprocess and resolve with everything it wrote to stdout.
 *
 * On timeout the child receives SIGTERM and the promise rejects immediately.
 * If the child has still not closed after a short grace period it is sent
 * SIGKILL, so a judge that ignores SIGTERM cannot linger as a leaked process.
 *
 * @param {string} binary - Executable to run.
 * @param {string[]} args - Command-line arguments.
 * @param {number} timeoutMs - Time allowed before the child is terminated.
 * @param {object} [extraEnv] - Extra environment variables for the child.
 * @param {string} [cwd] - Working directory for the child.
 * @returns {Promise<string>} Collected stdout. Rejects on spawn error, on
 *   timeout, or when the child exits non-zero without producing any stdout.
 */
function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
  const KILL_GRACE_MS = 5e3;
  return new Promise((resolve, reject) => {
    const child = spawn(binary, args, {
      env: buildChildEnv(extraEnv),
      cwd,
      stdio: [
        "ignore",
        "pipe",
        "pipe"
      ]
    });
    const chunks = [];
    const stderrChunks = [];
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (c) => chunks.push(c));
    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (c) => stderrChunks.push(c));
    let killTimer;
    const timer = setTimeout(() => {
      child.kill("SIGTERM");
      // Escalate if SIGTERM is ignored; cleared in finalize() once the child closes.
      killTimer = setTimeout(() => child.kill("SIGKILL"), KILL_GRACE_MS);
      killTimer.unref();
      const stderrHint = stderrChunks.join("").trim().slice(0, 400);
      reject(new Error(`grader timed out after ${timeoutMs}ms` + (stderrHint ? ` (stderr: ${stderrHint})` : "")));
    }, timeoutMs);
    const finalize = (err) => {
      clearTimeout(timer);
      clearTimeout(killTimer);
      // After a timeout rejection these calls are no-ops: a promise settles only once.
      if (err) reject(err);
      else resolve(chunks.join(""));
    };
    child.on("error", (err) => finalize(err));
    child.on("close", (code) => {
      // A non-zero exit is tolerated as long as some stdout was produced.
      if (code !== 0 && chunks.length === 0) finalize(new Error(`grader exited ${code}: ${stderrChunks.join("").slice(0, 500)}`));
      else finalize();
    });
  });
}
515
/**
 * Build the environment for the judge subprocess: the current process
 * environment overlaid with `extraEnv`, with `CLAUDECODE` removed.
 *
 * @param {object} [extraEnv] - Extra variables; they win over `process.env`.
 * @returns {object} A new environment object.
 */
function buildChildEnv(extraEnv) {
  const { CLAUDECODE: _removed, ...env } = { ...process.env, ...extraEnv };
  return env;
}
523
+ //#endregion
524
+ //#region src/grader/expectations.ts
525
/**
 * Load an expectations sidecar file (YAML or JSON).
 *
 * Files whose path ends in `.json` (case-insensitive) are parsed as JSON;
 * everything else is parsed as YAML. The document must be a plain mapping of
 * case id to a list; a top-level array is rejected, because its indices would
 * otherwise be taken for case ids.
 *
 * @param {string} path - Path of the expectations file.
 * @returns {Promise<Record<string, string[]>>} Case id → expectation strings
 *   (every list item is converted with `String`).
 * @throws {Error} When the document is not a mapping or a case's value is not an array.
 */
async function loadExpectationsMap(path) {
  const text = await readFile(path, "utf8");
  const isJson = path.trim().toLowerCase().endsWith(".json");
  const raw = isJson ? JSON.parse(text) : parse(text);
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
    throw new Error(`expectations file must be an object mapping case ids to lists`);
  }
  const map = {};
  for (const [caseId, value] of Object.entries(raw)) {
    if (!Array.isArray(value)) throw new Error(`expectations for case "${caseId}" must be an array of strings`);
    map[caseId] = value.map(String);
  }
  return map;
}
542
+ //#endregion
543
+ //#region src/grader/transcript.ts
544
/** Maximum number of characters of a tool result kept in the transcript. */
const MAX_RESULT_CHARS = 4e3;

/**
 * Render a trajectory as a plain-text transcript for the judge.
 *
 * Sections, in order: the user prompt (when truthy), one section per
 * assistant turn (text, tool calls with arguments and results, stop reason),
 * the final response when it is not already the text of some turn, and
 * session metadata.
 *
 * @param {object} view - Trajectory to render.
 * @param {string} [prompt] - Original user prompt.
 * @returns {string} Transcript with trailing whitespace removed.
 */
function trajectoryToTranscript(view, prompt) {
  const out = [];
  if (prompt) out.push("## User prompt", "", prompt, "");
  for (const turn of view.turns) {
    out.push(`## Assistant turn ${turn.turnIndex + 1}`, "");
    if (turn.text) out.push(turn.text, "");
    for (const call of turn.toolCalls) {
      out.push(`[Tool call] ${call.name} (id=${call.callId})`);
      out.push(`Arguments: ${formatJson$1(call.args)}`);
      if (call.result === null) {
        out.push("[Tool result] (none observed)");
      } else {
        out.push(`[Tool result] ${formatResult(call.result)}`);
        if (call.isError) out.push("(tool reported error)");
      }
      out.push("");
    }
    if (turn.stopReason) out.push(`Stop reason: ${turn.stopReason}`, "");
  }
  // Avoid repeating the final response when a turn already carries that exact text.
  const alreadyShown = view.turns.some((turn) => turn.text === view.finalResponse);
  if (view.finalResponse && !alreadyShown) out.push("## Final response", "", view.finalResponse, "");
  const metadata = [
    "## Session metadata",
    `session_id: ${view.meta.sessionId}`,
    `model: ${view.meta.model}`,
    `cwd: ${view.meta.cwd}`,
    `success: ${view.success}`,
    `tool_calls: ${view.toolCalls.length}`,
    `duration_ms: ${view.usage.durationMs}`,
    `input_tokens: ${view.usage.inputTokens}`,
    `output_tokens: ${view.usage.outputTokens}`
  ];
  out.push(...metadata);
  return out.join("\n").trimEnd();
}
567
/**
 * Serialize a value for the transcript, always returning a string.
 *
 * `JSON.stringify` returns `undefined` for `undefined`, functions and symbols,
 * and throws on cycles or BigInt; in all of those cases the `String(value)`
 * form is used instead so callers can safely read `.length` on the result.
 *
 * @param {unknown} value
 * @returns {string}
 */
function formatJson$1(value) {
  try {
    return JSON.stringify(value) ?? String(value);
  } catch {
    // Cycles / BigInt: fall back to the plain string form.
    return String(value);
  }
}
574
/**
 * Render a tool result for the transcript: strings are used as-is, other
 * values are JSON-serialized; the text is then truncated.
 *
 * @param {unknown} result
 * @returns {string}
 */
function formatResult(result) {
  const rendered = typeof result === "string" ? result : formatJson$1(result);
  return truncate(rendered);
}
578
/**
 * Shorten text to at most `maxChars` UTF-16 code units, appending a
 * "… (truncated)" marker when anything was cut.
 *
 * The cut point is moved back by one unit when it would fall between the two
 * halves of a surrogate pair, so the output never ends in a lone surrogate.
 *
 * @param {string} text
 * @param {number} [maxChars=MAX_RESULT_CHARS] - Maximum length kept before the marker.
 * @returns {string}
 */
function truncate(text, maxChars = MAX_RESULT_CHARS) {
  if (text.length <= maxChars) return text;
  let cut = maxChars;
  const lastKept = text.charCodeAt(cut - 1);
  // 0xD800–0xDBFF is the high-surrogate range; keeping it alone would emit a broken character.
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  return `${text.slice(0, cut)}… (truncated)`;
}
582
+ //#endregion
583
+ //#region src/grader/grade-report.ts
584
/**
 * Grade a harness-eval SuiteReport with outcome expectations (LLM judge).
 *
 * Every repetition that has an adapter result and at least one expectation is
 * graded, with at most `options.maxConcurrent` (default 2) gradings in flight.
 * Expectations come from the cell itself or, failing that, from the sidecar
 * file at `options.expectationsPath`. A grading that throws is recorded as a
 * result with all expectations failed and `graderError` set.
 *
 * Results are ordered by case id, then cell label, then repetition index
 * compared numerically (so repetition 10 follows repetition 9, not 1).
 *
 * @param {object} report - Suite report; `report.cells` is iterated.
 * @param {object} [options]
 * @param {string} [options.expectationsPath] - Sidecar expectations file.
 * @param {Function} [options.gradeFn] - Custom grader; defaults to the Claude judge.
 * @param {number} [options.maxConcurrent=2]
 * @param {Function} [options.onProgress] - Receives `grade-start`, `grade-complete` and `grade-done` events.
 * @param {string} [options.systemInstruction]
 * @param {string} [options.sourceReport]
 * @param {string} [options.gradingConfigPath]
 * @returns {Promise<object>} Grading report `{ gradedAt, sourceReport, gradingConfigPath, results, summary }`.
 */
async function gradeReport(report, options = {}) {
  const expectationsMap = options.expectationsPath ? await loadExpectationsMap(options.expectationsPath) : {};
  const gradeFn = options.gradeFn ?? createClaudeGrader({
    binary: options.binary,
    model: options.model,
    timeoutMs: options.timeoutMs,
    env: options.env,
    cwd: options.cwd,
    claudeCode: options.claudeCode
  });
  const limit = createLimit(options.maxConcurrent ?? 2);
  const tasks = [];
  for (const cell of report.cells) {
    // Expectations on the cell take precedence over the sidecar map.
    const expectations = cell.expectations ?? expectationsMap[cell.caseId] ?? [];
    if (expectations.length === 0) continue;
    for (const rep of cell.repetitions) {
      if (!rep.adapterResult) continue;
      tasks.push({
        cell,
        rep,
        expectations
      });
    }
  }
  const gradeStartTs = Date.now();
  options.onProgress?.({
    kind: "grade-start",
    total: tasks.length
  });
  const results = await Promise.all(tasks.map(({ cell, rep, expectations }) => limit(async () => {
    const start = Date.now();
    const view = rep.adapterResult.view;
    const prompt = cell.prompt ?? "";
    const transcript = trajectoryToTranscript(view, prompt);
    try {
      const graded = await gradeFn({
        prompt,
        transcript,
        expectations,
        systemInstruction: options.systemInstruction
      });
      const result = {
        caseId: cell.caseId,
        cellLabel: cell.cell.label,
        repetitionIndex: rep.repetitionIndex,
        prompt,
        expectations: graded.expectations,
        summary: graded.summary,
        evalFeedback: graded.evalFeedback,
        graderError: graded.error,
        durationMs: Date.now() - start
      };
      options.onProgress?.({
        kind: "grade-complete",
        caseId: result.caseId,
        cellLabel: result.cellLabel,
        repetitionIndex: result.repetitionIndex,
        passed: result.summary.passed,
        failed: result.summary.failed,
        durationMs: result.durationMs,
        graderError: result.graderError
      });
      return result;
    } catch (err) {
      // A failed grading is reported as an all-failed result instead of aborting the run.
      const message = err instanceof Error ? err.message : String(err);
      const result = {
        caseId: cell.caseId,
        cellLabel: cell.cell.label,
        repetitionIndex: rep.repetitionIndex,
        prompt,
        expectations: expectations.map((text) => ({
          text,
          passed: false,
          evidence: message
        })),
        summary: {
          passed: 0,
          failed: expectations.length,
          total: expectations.length,
          passRate: 0
        },
        graderError: message,
        durationMs: Date.now() - start
      };
      options.onProgress?.({
        kind: "grade-complete",
        caseId: result.caseId,
        cellLabel: result.cellLabel,
        repetitionIndex: result.repetitionIndex,
        passed: 0,
        failed: expectations.length,
        durationMs: result.durationMs,
        graderError: message
      });
      return result;
    }
  })));
  // Compare field by field; a concatenated string key would order "10" before "2".
  results.sort((a, b) =>
    String(a.caseId).localeCompare(String(b.caseId))
    || String(a.cellLabel).localeCompare(String(b.cellLabel))
    || Number(a.repetitionIndex) - Number(b.repetitionIndex));
  const totalExpectations = results.reduce((n, r) => n + r.summary.total, 0);
  const passedExpectations = results.reduce((n, r) => n + r.summary.passed, 0);
  options.onProgress?.({
    kind: "grade-done",
    durationMs: Date.now() - gradeStartTs,
    totalExpectations,
    passedExpectations
  });
  return {
    gradedAt: new Date().toISOString(),
    sourceReport: options.sourceReport ?? "",
    gradingConfigPath: options.gradingConfigPath,
    results,
    summary: {
      passed: passedExpectations,
      failed: totalExpectations - passedExpectations,
      total: totalExpectations,
      passRate: totalExpectations === 0 ? 0 : passedExpectations / totalExpectations
    }
  };
}
710
/**
 * Read a suite report JSON file from disk and parse it.
 *
 * @param {string} path - Path of the report file (read as UTF-8).
 * @returns {Promise<unknown>} The parsed JSON document.
 */
async function loadSuiteReport(path) {
  const contents = await readFile(path, "utf8");
  const report = JSON.parse(contents);
  return report;
}
714
+ //#endregion
715
+ //#region src/grader/resolve-grade-options.ts
716
/**
 * Merge standalone grading YAML with CLI flags (CLI wins).
 *
 * `binary` and `model` are resolved to top-level options and blanked out of
 * the returned `claudeCode` object.
 *
 * @param {object} [fileConfig] - Parsed grading config; only its `judge` section is read.
 * @param {object} [cli] - CLI flags.
 * @param {string} [configPath] - Path of the grading config, recorded as `gradingConfigPath`.
 * @returns {object} Options for `gradeReport`.
 * @throws {Error} When the configured adapter is not "claude-code".
 */
function resolveGradeOptions(fileConfig, cli = {}, configPath) {
  const judge = fileConfig?.judge;
  const adapter = judge?.adapter ?? "claude-code";
  if (adapter !== "claude-code") {
    throw new Error(`unsupported grading adapter "${adapter}" (only claude-code today)`);
  }
  const fileClaudeCode = judge?.claudeCode ?? {};
  const resolvedBinary = cli.binary ?? fileClaudeCode.binary;
  const resolvedModel = cli.model ?? judge?.model ?? fileClaudeCode.model;
  return {
    sourceReport: cli.sourceReport,
    expectationsPath: cli.expectationsPath,
    model: resolvedModel,
    binary: resolvedBinary,
    timeoutMs: cli.timeoutMs ?? judge?.timeoutMs,
    maxConcurrent: cli.maxConcurrent ?? judge?.maxConcurrent,
    systemInstruction: judge?.system_instruction,
    env: judge?.env,
    cwd: judge?.cwd,
    claudeCode: {
      ...fileClaudeCode,
      binary: undefined,
      model: undefined
    },
    gradingConfigPath: configPath
  };
}
744
+ //#endregion
745
+ //#region src/grader/format-console.ts
746
/** ANSI escape sequences used when rendering the grading report. */
const RESET$1 = "\x1B[0m";
const GREEN$1 = "\x1B[32m";
const RED$1 = "\x1B[31m";
const DIM = "\x1B[2m";

/**
 * Render a grading report for the terminal.
 *
 * Each graded repetition gets a PASS/FAIL header (PASS requires zero failed
 * expectations and no grader error), one line per expectation with its
 * evidence, and a per-repetition tally. An overall tally closes the output.
 *
 * @param {object} report - Grading report with `results` and `summary`.
 * @param {boolean} [color=true] - Wrap status text in ANSI colors.
 * @returns {string}
 */
function formatGradingConsole(report, color = true) {
  if (report.results.length === 0) {
    return "No repetitions graded. Add expectations to the suite YAML or pass --expectations.";
  }
  // Wrap text in an ANSI style only when color output is enabled.
  const paint = (ansi, text) => (color ? `${ansi}${text}${RESET$1}` : text);
  const out = [];
  for (const result of report.results) {
    const ok = result.summary.failed === 0 && !result.graderError;
    const status = ok ? paint(GREEN$1, "PASS") : paint(RED$1, "FAIL");
    out.push(`${result.caseId} @ ${result.cellLabel} rep${result.repetitionIndex} ${status}`);
    if (result.graderError) out.push(` ${paint(RED$1, `grader error: ${result.graderError}`)}`);
    for (const exp of result.expectations) {
      const marker = exp.passed ? paint(GREEN$1, "✓") : paint(RED$1, "✗");
      out.push(` ├─ ${exp.text} ${marker}`);
      if (!exp.passed || exp.evidence) out.push(` │ ${paint(DIM, exp.evidence)}`);
    }
    const pct = (result.summary.passRate * 100).toFixed(0);
    out.push(` └─ ${result.summary.passed}/${result.summary.total} (${pct}%) expectations`);
    out.push("");
  }
  const overallPct = (report.summary.passRate * 100).toFixed(0);
  out.push(`Overall: ${report.summary.passed}/${report.summary.total} (${overallPct}%) expectations passed`);
  return out.join("\n").trimEnd();
}
773
/**
 * Decide whether a grading report counts as passed: no result may have a
 * grader error, a failed expectation, or zero graded expectations.
 *
 * @param {{ results: Array<{ graderError?: string, summary: { failed: number, total: number } }> }} report
 * @returns {boolean} `true` when no result violates the conditions above
 *   (including when there are no results at all).
 */
function gradingReportPassed(report) {
  const hasProblem = (r) => Boolean(r.graderError) || r.summary.failed !== 0 || !(r.summary.total > 0);
  return !report.results.some(hasProblem);
}
776
+ //#endregion
777
+ //#region src/reporter/format-console.ts
778
/** ANSI escape sequences used when rendering the suite report. */
const RESET = "\x1B[0m";
const GREEN = "\x1B[32m";
const RED = "\x1B[31m";
const YELLOW = "\x1B[33m";

/**
 * Render report rows for the terminal.
 *
 * Each row gets a PASS/FAIL header (with an adapter-error note when any
 * repetitions crashed), an optional category line, and one line per assertion
 * showing pass rate against threshold. When baseline data is attached, the
 * change is shown as an arrow plus the magnitude of the delta; the arrow
 * alone carries the direction, so the number is printed without a sign.
 *
 * @param {object[]} rows - Renderable rows (see `toRenderableRows`).
 * @param {boolean} [color=true] - Wrap status text in ANSI colors.
 * @returns {string}
 */
function formatConsole(rows, color = true) {
  const lines = [];
  for (const row of rows) {
    const status = row.passed ? color ? `${GREEN}PASS${RESET}` : "PASS" : color ? `${RED}FAIL${RESET}` : "FAIL";
    const crashNote = row.adapterErrors > 0 ? ` ${color ? YELLOW : ""}[${row.adapterErrors} adapter errors]${color ? RESET : ""}` : "";
    lines.push(`${row.caseId} @ ${row.cellLabel} ${status}${crashNote}`);
    if (row.category) lines.push(` category: ${row.category}`);
    for (const stat of row.stats) {
      const marker = stat.meetsThreshold ? color ? `${GREEN}✓${RESET}` : "✓" : color ? `${RED}✗${RESET}` : "✗";
      const rateStr = formatRate$1(stat);
      const thresholdPct = (stat.threshold * 100).toFixed(0);
      let line = ` ├─ ${stat.description}: ${rateStr} [threshold ${thresholdPct}%] ${marker}`;
      if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
        const arrow = stat.delta >= 0 ? "↑" : "↓";
        const basePct = (stat.baselinePassRate * 100).toFixed(0);
        const curPct = (stat.passRate * 100).toFixed(0);
        // Magnitude only: "↓20%" rather than the double-negative "↓-20%".
        const deltaPct = Math.abs(stat.delta * 100).toFixed(0);
        line += ` (${basePct}% → ${curPct}% (${arrow}${deltaPct}%))`;
      }
      lines.push(line);
    }
    lines.push("");
  }
  return lines.join("\n").trimEnd();
}

/**
 * Format an assertion's pass rate as "passed/evaluated (pct%)", or a crash
 * note when no repetition could be evaluated.
 *
 * @param {{ evaluatedCount: number, totalReps: number, passedCount: number, passRate: number }} stat
 * @returns {string}
 */
function formatRate$1(stat) {
  if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
  const pct = (stat.passRate * 100).toFixed(0);
  return `${stat.passedCount}/${stat.evaluatedCount} (${pct}%)`;
}
812
+ //#endregion
813
+ //#region src/reporter/format-json.ts
814
/**
 * Serialize a report as pretty-printed JSON with two-space indentation.
 *
 * @param {unknown} report
 * @returns {string}
 */
function formatJson(report) {
  const indent = 2;
  return JSON.stringify(report, null, indent);
}
817
+ //#endregion
818
+ //#region src/reporter/format-markdown.ts
819
/**
 * Render report rows as a Markdown document: one section per row with a
 * PASS/FAIL heading, optional category and notes, and a table of assertions.
 *
 * Assertion descriptions are free text, so `|` is escaped and line breaks are
 * collapsed to spaces before they are placed in a table cell; otherwise a
 * description containing either would break the row.
 *
 * @param {object[]} rows - Renderable rows (see `toRenderableRows`).
 * @returns {string}
 */
function formatMarkdown(rows) {
  const escapeCell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = ["# Harness Eval Report", ""];
  for (const row of rows) {
    const status = row.passed ? "PASS" : "FAIL";
    const crashNote = row.adapterErrors > 0 ? ` (${row.adapterErrors} adapter errors)` : "";
    lines.push(`## ${row.caseId} @ ${row.cellLabel} — ${status}${crashNote}`);
    if (row.category) lines.push(`**Category:** ${row.category}`);
    if (row.notes) lines.push("<details><summary>Notes</summary>", row.notes, "</details>");
    lines.push("");
    lines.push("| Assertion | Result | Threshold | Status |");
    lines.push("| --- | --- | --- | --- |");
    for (const stat of row.stats) {
      const rateStr = formatRate(stat);
      const threshold = `${(stat.threshold * 100).toFixed(0)}%`;
      const statusCell = stat.meetsThreshold ? "✓" : "✗";
      let result = rateStr;
      if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
        const base = (stat.baselinePassRate * 100).toFixed(0);
        const cur = (stat.passRate * 100).toFixed(0);
        const d = (stat.delta * 100).toFixed(0);
        // Negative deltas already carry "-" from toFixed; only "+" needs adding.
        const sign = stat.delta >= 0 ? "+" : "";
        result += ` (${base}% → ${cur}%, ${sign}${d}%)`;
      }
      lines.push(`| ${escapeCell(stat.description)} | ${result} | ${threshold} | ${statusCell} |`);
    }
    lines.push("");
  }
  return lines.join("\n").trimEnd();
}

/**
 * Format an assertion's pass rate as "passed/evaluated (pct%)", or a crash
 * note when no repetition could be evaluated.
 *
 * @param {{ evaluatedCount: number, totalReps: number, passedCount: number, passRate: number }} stat
 * @returns {string}
 */
function formatRate(stat) {
  if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
  const pct = (stat.passRate * 100).toFixed(0);
  return `${stat.passedCount}/${stat.evaluatedCount} (${pct}%)`;
}
853
+ //#endregion
854
+ //#region src/reporter/renderable.ts
855
/**
 * Project every cell of a report into a flat renderable row.
 *
 * @param {object} report - Report exposing a `cells` array.
 * @returns {Array<object>} One row per cell, in report order.
 */
function toRenderableRows(report) {
	const { cells } = report;
	return cells.map((entry) => cellToRow(entry));
}
858
/**
 * Annotate row stats with baseline pass rates and deltas.
 *
 * Baseline cells are matched to rows by `caseId` plus cell label, and stats
 * are paired with baseline stats by position. Rows or stats without a
 * baseline counterpart are returned untouched.
 *
 * @param {Array<object>} rows - Renderable rows (`caseId`, `cellLabel`, `stats`).
 * @param {object} baseline - Baseline report; each cell has `caseId`, `cell.label` and `assertionStats`.
 * @returns {Array<object>} New rows; matched stats gain `baselinePassRate` and `delta`.
 */
function applyBaseline(rows, baseline) {
	const baselineByKey = new Map();
	for (const baseCell of baseline.cells) {
		baselineByKey.set(`${baseCell.caseId}::${baseCell.cell.label}`, baseCell);
	}
	const withDelta = (stat, baseStat) => {
		if (!baseStat) return stat;
		return {
			...stat,
			baselinePassRate: baseStat.passRate,
			delta: stat.passRate - baseStat.passRate
		};
	};
	return rows.map((row) => {
		const match = baselineByKey.get(`${row.caseId}::${row.cellLabel}`);
		if (!match) return row;
		return {
			...row,
			stats: row.stats.map((stat, position) => withDelta(stat, match.assertionStats[position]))
		};
	});
}
879
/**
 * Flatten one report cell into a renderable row.
 *
 * Each assertion stat is copied field by field and extended with the cell's
 * repetition count and adapter-error count.
 *
 * @param {object} cell - Report cell (`caseId`, `category`, `notes`, `cell.label`, `passed`, `adapterErrors`, `repetitions`, `assertionStats`).
 * @returns {object} Row with `caseId`, `category`, `notes`, `cellLabel`, `passed`, `adapterErrors`, `totalReps` and `stats`.
 */
function cellToRow(cell) {
	const { adapterErrors } = cell;
	const totalReps = cell.repetitions.length;
	const toRowStat = ({ description, threshold, passedCount, evaluatedCount, passRate, meetsThreshold }) => ({
		description,
		threshold,
		passedCount,
		evaluatedCount,
		totalReps,
		adapterErrors,
		passRate,
		meetsThreshold
	});
	const stats = cell.assertionStats.map((entry) => toRowStat(entry));
	return {
		caseId: cell.caseId,
		category: cell.category,
		notes: cell.notes,
		cellLabel: cell.cell.label,
		passed: cell.passed,
		adapterErrors,
		totalReps,
		stats
	};
}
902
+ //#endregion
903
+ //#region src/reporter/index.ts
904
/**
 * Format a report in the requested output format.
 *
 * `"json"` serializes the report as-is. Every other format first projects the
 * report into renderable rows (with baseline deltas when `options.baseline`
 * is set); `"markdown"` then renders a Markdown document and anything else
 * falls through to the console renderer.
 *
 * @param {object} report - Report to format.
 * @param {object} options - `format`, optional `baseline`, optional `color`.
 *   When `color` is nullish, colour is enabled only for the `"console"` format.
 * @returns {string} Formatted report.
 */
function formatReport(report, options) {
	const { format, baseline, color } = options;
	if (format === "json") {
		return formatJson(report);
	}
	const plainRows = toRenderableRows(report);
	const rows = baseline ? applyBaseline(plainRows, baseline) : plainRows;
	if (format === "markdown") {
		return formatMarkdown(rows);
	}
	return formatConsole(rows, color ?? format === "console");
}
912
+ //#endregion
913
+ //#region src/eval-interchange/build.ts
914
/** Agent identifier used when a caller does not supply one. */
const DEFAULT_AGENT_ID = "agent";
/**
 * Serialize tool-call arguments to a JSON string.
 *
 * @param {unknown} args - Tool arguments; `null` / `undefined` serialize as `{}`.
 * @returns {string} JSON text.
 */
function serializeToolInput(args) {
	const payload = args ?? {};
	return JSON.stringify(payload);
}
918
/**
 * Parse a serialized tool input, falling back to the raw value.
 *
 * @param {unknown} toolInput - Usually a JSON string.
 * @returns {unknown} The parsed JSON value, or `toolInput` unchanged when it is not valid JSON.
 */
function parseToolInput(toolInput) {
	let decoded;
	try {
		decoded = JSON.parse(toolInput);
	} catch {
		// Not JSON: hand the input back untouched.
		return toolInput;
	}
	return decoded;
}
925
/**
 * Convert a trajectory tool call (`name`, `args`) into interchange form
 * (`tool_name`, JSON-string `tool_input`).
 *
 * @param {object} toolCall - Tool call with `name` and optional `args`.
 * @returns {{tool_name: *, tool_input: string}} Interchange tool call.
 */
function toolCallToInterchange(toolCall) {
	const { name, args } = toolCall;
	return {
		tool_name: name,
		tool_input: serializeToolInput(args)
	};
}
931
/**
 * Convert an interchange tool call into tabular form, where `tool_input` is
 * the parsed value rather than its JSON string.
 *
 * @param {object} toolCall - Interchange tool call (`tool_name`, `tool_input`).
 * @returns {{tool_name: *, tool_input: *}} Tabular tool call.
 */
function interchangeToTabular(toolCall) {
	const { tool_name, tool_input } = toolCall;
	return {
		tool_name,
		tool_input: parseToolInput(tool_input)
	};
}
937
/**
 * Derive the predicted trajectory (interchange tool calls) from a trajectory view.
 *
 * @param {object} view - View exposing a `toolCalls` array.
 * @returns {Array<object>} Interchange tool calls in view order.
 */
function predictedTrajectoryFromView(view) {
	const { toolCalls } = view;
	return toolCalls.map((call) => toolCallToInterchange(call));
}
940
/**
 * Build an agent trace (agent registry plus per-turn events) from a view.
 *
 * For each turn the events are, in order: a text event when the turn has
 * text, then for every tool call a `function_call` event followed by a
 * `function_response` event when the call has a non-nullish result. Every
 * event is authored by `agentId` and lists the view's available tools as
 * `active_tools`.
 *
 * @param {object} view - View with `meta.model`, `meta.availableTools` and `turns`.
 * @param {string} [agentId] - Agent identifier; defaults to `DEFAULT_AGENT_ID`.
 * @returns {{agents: object, turns: Array<object>}} Agent trace.
 */
function buildAgentTrace(view, agentId = DEFAULT_AGENT_ID) {
	const asToolRef = (name) => ({ name });
	const agents = {
		[agentId]: {
			agent_id: agentId,
			agent_type: "assistant",
			description: view.meta.model,
			tools: view.meta.availableTools.map((name) => asToolRef(name))
		}
	};
	// One tool list shared by every event of the trace.
	const activeTools = view.meta.availableTools.map((name) => asToolRef(name));
	const makeEvent = (part) => ({
		author: agentId,
		content: { parts: [part] },
		active_tools: activeTools
	});
	const turns = view.turns.map((turn) => {
		const events = [];
		if (turn.text) {
			events.push(makeEvent({ text: turn.text }));
		}
		for (const call of turn.toolCalls) {
			events.push(makeEvent({
				function_call: {
					name: call.name,
					args: call.args ?? {}
				}
			}));
			const hasResult = call.result !== null && call.result !== undefined;
			if (hasResult) {
				events.push(makeEvent({
					function_response: {
						name: call.name,
						response: call.result
					}
				}));
			}
		}
		return {
			turn_index: turn.turnIndex,
			events
		};
	});
	return {
		agents,
		turns
	};
}
982
/**
 * Convert a view's recorded duration from milliseconds to seconds.
 *
 * @param {object} view - View exposing `usage.durationMs`.
 * @returns {number} Duration in seconds.
 */
function latencyInSeconds(view) {
	const { durationMs } = view.usage;
	return durationMs / 1e3;
}
985
+ //#endregion
986
+ //#region src/metrics/trajectory.ts
987
/**
 * Normalize a tool call so `tool_input` is always a string.
 *
 * String inputs are kept verbatim; anything else is serialized with
 * `serializeToolInput`. Only `tool_name` and `tool_input` are carried over.
 *
 * @param {object} toolCall - Tool call with `tool_name` and `tool_input`.
 * @returns {{tool_name: *, tool_input: string}} Normalized tool call.
 */
function normalizeToolCall$1(toolCall) {
	const { tool_name, tool_input } = toolCall;
	const serialized = typeof tool_input === "string" ? tool_input : serializeToolInput(tool_input);
	return {
		tool_name,
		tool_input: serialized
	};
}
997
/**
 * Normalize every tool call of a trajectory (string `tool_input`).
 *
 * @param {Array<object>} trajectory - Tool calls.
 * @returns {Array<object>} Normalized tool calls, same order.
 */
function normalizeTrajectory(trajectory) {
	return trajectory.map((call, position, all) => normalizeToolCall$1(call, position, all));
}
1000
/**
 * Build the comparison key of a normalized tool call: tool name and tool
 * input joined by a NUL character.
 *
 * @param {object} toolCall - Tool call with `tool_name` and `tool_input`.
 * @returns {string} Comparison key.
 */
function toolCallKey(toolCall) {
	const { tool_name, tool_input } = toolCall;
	return `${tool_name}\0${tool_input}`;
}
1003
/**
 * Count how many predicted tool calls can be paired with a distinct,
 * identical reference tool call (multiset intersection size).
 *
 * @param {Array<object>} predicted - Normalized predicted tool calls.
 * @param {Array<object>} reference - Normalized reference tool calls.
 * @returns {number} Number of matched calls.
 */
function multisetIntersectionSize(predicted, reference) {
	// Remaining unmatched occurrences per reference key.
	const remaining = /* @__PURE__ */ new Map();
	reference.forEach((call) => {
		const key = toolCallKey(call);
		remaining.set(key, (remaining.get(key) ?? 0) + 1);
	});
	return predicted.reduce((matched, call) => {
		const key = toolCallKey(call);
		const available = remaining.get(key) ?? 0;
		if (available <= 0) return matched;
		remaining.set(key, available - 1);
		return matched + 1;
	}, 0);
}
1020
/**
 * Check whether every reference tool call appears in `predicted`, in the
 * same relative order (extra predicted calls in between are allowed).
 *
 * @param {Array<object>} predicted - Normalized predicted tool calls.
 * @param {Array<object>} reference - Normalized reference tool calls.
 * @returns {boolean} `true` when all reference calls were matched in order.
 */
function isSubsequence(predicted, reference) {
	const needed = reference.length;
	let cursor = 0;
	for (let i = 0; cursor < needed && i < predicted.length; i += 1) {
		if (toolCallKey(predicted[i]) === toolCallKey(reference[cursor])) {
			cursor += 1;
		}
	}
	return cursor === needed;
}
1028
/**
 * Check whether two tool-call lists have the same length and pairwise
 * identical comparison keys.
 *
 * @param {Array<object>} left - Normalized tool calls.
 * @param {Array<object>} right - Normalized tool calls.
 * @returns {boolean} `true` when both lists match position by position.
 */
function arraysEqual(left, right) {
	if (left.length !== right.length) {
		return false;
	}
	return !left.some((call, position) => toolCallKey(call) !== toolCallKey(right[position]));
}
1035
/**
 * Score 1 when the predicted trajectory equals the reference call for call
 * (same tools, same inputs, same order), otherwise 0.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {0|1} Match score.
 */
function trajectoryExactMatch(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	if (arraysEqual(normalizedPredicted, normalizedReference)) return 1;
	return 0;
}
1038
/**
 * Score 1 when all reference tool calls occur in the predicted trajectory in
 * the same relative order, otherwise 0.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {0|1} Match score.
 */
function trajectoryInOrderMatch(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	if (isSubsequence(normalizedPredicted, normalizedReference)) return 1;
	return 0;
}
1041
/**
 * Score 1 when predicted and reference trajectories contain exactly the same
 * tool calls regardless of order, otherwise 0.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {0|1} Match score.
 */
function trajectoryAnyOrderMatch(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	if (normalizedPredicted.length !== normalizedReference.length) {
		return 0;
	}
	// Sorting both key lists makes the comparison order-independent.
	const sortedKeys = (calls) => calls.map((call) => toolCallKey(call)).sort();
	const predictedKeys = sortedKeys(normalizedPredicted);
	const referenceKeys = sortedKeys(normalizedReference);
	const sameKeys = predictedKeys.every((key, position) => key === referenceKeys[position]);
	return sameKeys ? 1 : 0;
}
1049
/**
 * Fraction of predicted tool calls that also occur in the reference
 * (multiset overlap divided by the predicted count).
 *
 * An empty prediction scores 1 against an empty reference and 0 otherwise.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {number} Precision in [0, 1].
 */
function trajectoryPrecision(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const predictedCount = normalizedPredicted.length;
	if (predictedCount === 0) {
		return reference.length === 0 ? 1 : 0;
	}
	const overlap = multisetIntersectionSize(normalizedPredicted, normalizeTrajectory(reference));
	return overlap / predictedCount;
}
1054
/**
 * Fraction of reference tool calls that also occur in the prediction
 * (multiset overlap divided by the reference count).
 *
 * An empty reference scores 1 against an empty prediction and 0 otherwise.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {number} Recall in [0, 1].
 */
function trajectoryRecall(predicted, reference) {
	const normalizedReference = normalizeTrajectory(reference);
	const referenceCount = normalizedReference.length;
	if (referenceCount === 0) {
		return predicted.length === 0 ? 1 : 0;
	}
	const overlap = multisetIntersectionSize(normalizeTrajectory(predicted), normalizedReference);
	return overlap / referenceCount;
}
1059
/**
 * Score 1 when both trajectories consist of exactly one tool call and those
 * two calls are identical, otherwise 0.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {0|1} Match score.
 */
function trajectorySingleToolUse(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	const bothSingle = normalizedPredicted.length === 1 && normalizedReference.length === 1;
	if (!bothSingle) {
		return 0;
	}
	const [onlyPredicted] = normalizedPredicted;
	const [onlyReference] = normalizedReference;
	return toolCallKey(onlyPredicted) === toolCallKey(onlyReference) ? 1 : 0;
}
1065
/**
 * Compute all trajectory-level metrics for a predicted/reference pair.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @returns {object} Scores keyed by `trajectory_exact_match`, `trajectory_in_order_match`,
 *   `trajectory_any_order_match`, `trajectory_precision`, `trajectory_recall` and
 *   `trajectory_single_tool_use`.
 */
function computeTrajectoryMetrics(predicted, reference) {
	const trajectory_exact_match = trajectoryExactMatch(predicted, reference);
	const trajectory_in_order_match = trajectoryInOrderMatch(predicted, reference);
	const trajectory_any_order_match = trajectoryAnyOrderMatch(predicted, reference);
	const trajectory_precision = trajectoryPrecision(predicted, reference);
	const trajectory_recall = trajectoryRecall(predicted, reference);
	const trajectory_single_tool_use = trajectorySingleToolUse(predicted, reference);
	return {
		trajectory_exact_match,
		trajectory_in_order_match,
		trajectory_any_order_match,
		trajectory_precision,
		trajectory_recall,
		trajectory_single_tool_use
	};
}
1075
+ //#endregion
1076
+ //#region src/metrics/tool-calls.ts
1077
/**
 * Reduce a tool call to `tool_name` plus a string `tool_input`.
 *
 * A `tool_input` that is already a string is passed through; any other value
 * is serialized with `serializeToolInput`.
 *
 * @param {object} toolCall - Tool call with `tool_name` and `tool_input`.
 * @returns {{tool_name: *, tool_input: string}} Normalized tool call.
 */
function normalizeToolCall(toolCall) {
	const rawInput = toolCall.tool_input;
	let textInput;
	if (typeof rawInput === "string") {
		textInput = rawInput;
	} else {
		textInput = serializeToolInput(rawInput);
	}
	return {
		tool_name: toolCall.tool_name,
		tool_input: textInput
	};
}
1087
/**
 * Parse a tool call's input and return it only when it is a plain
 * (non-null, non-array) object.
 *
 * @param {object} toolCall - Tool call with a `tool_input`.
 * @returns {object|null} Parsed argument object, or `null` for any other shape.
 */
function parsedArgs(toolCall) {
	const value = parseToolInput(toolCall.tool_input);
	const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
	return isRecord ? value : null;
}
1092
/**
 * Score whether a tool call is well-formed.
 *
 * A call is valid (1) when `tool_name` is a non-blank string and the
 * normalized `tool_input` parses as JSON; otherwise it scores 0.
 *
 * A missing or non-string `tool_name` (or a nullish tool call) is scored as
 * invalid instead of raising a TypeError, so one malformed prediction cannot
 * abort the metric computation for a whole trajectory.
 *
 * @param {object} toolCall - Tool call with `tool_name` and `tool_input`.
 * @returns {0|1} Validity score.
 */
function toolCallValid(toolCall) {
	const name = toolCall?.tool_name;
	if (typeof name !== "string" || !name.trim()) return 0;
	const normalized = normalizeToolCall(toolCall);
	try {
		JSON.parse(normalized.tool_input);
		return 1;
	} catch {
		// Input is not valid JSON.
		return 0;
	}
}
1102
/**
 * Score 1 when the predicted and reference tool calls name the same tool,
 * otherwise 0.
 *
 * @param {object} predicted - Predicted tool call.
 * @param {object} reference - Reference tool call.
 * @returns {0|1} Match score.
 */
function toolNameMatch(predicted, reference) {
	const { tool_name: predictedName } = normalizeToolCall(predicted);
	const { tool_name: referenceName } = normalizeToolCall(reference);
	if (predictedName === referenceName) return 1;
	return 0;
}
1107
/**
 * Score 1 when both tool calls name the same tool and their argument objects
 * have exactly the same set of keys, otherwise 0.
 *
 * Calls whose input does not parse to a plain object score 0.
 *
 * @param {object} predicted - Predicted tool call.
 * @param {object} reference - Reference tool call.
 * @returns {0|1} Match score.
 */
function toolParameterKeyMatch(predicted, reference) {
	if (toolNameMatch(predicted, reference) === 0) {
		return 0;
	}
	const predictedArgs = parsedArgs(normalizeToolCall(predicted));
	const referenceArgs = parsedArgs(normalizeToolCall(reference));
	if (predictedArgs === null || referenceArgs === null) {
		return 0;
	}
	const predictedKeys = Object.keys(predictedArgs).sort();
	const referenceKeys = Object.keys(referenceArgs).sort();
	if (predictedKeys.length !== referenceKeys.length) {
		return 0;
	}
	for (let i = 0; i < predictedKeys.length; i += 1) {
		if (predictedKeys[i] !== referenceKeys[i]) return 0;
	}
	return 1;
}
1117
/**
 * Compare two JSON-compatible argument values.
 *
 * Previously both branches ran the same `JSON.stringify` comparison, so the
 * flag had no effect and nested objects that differed only in key order
 * (e.g. `{a:1,b:2}` vs `{b:2,a:1}`) were reported as different.
 *
 * - Strict (`useStrictStringMatch` truthy): the serialized JSON strings must
 *   be identical, including object key order (the original behaviour).
 * - Non-strict: values are compared structurally; object keys are sorted
 *   before serialization so key order is ignored. Array order, value types
 *   and string contents still have to match exactly.
 *
 * NOTE(review): the intended meaning of the non-strict mode is not visible in
 * this file; key-order-insensitive comparison is a conservative reading —
 * confirm against the metric specification.
 *
 * @param {unknown} left - First value.
 * @param {unknown} right - Second value.
 * @param {boolean} useStrictStringMatch - Require byte-identical serialization.
 * @returns {boolean} `true` when the values are considered equal.
 */
function valuesEqual(left, right, useStrictStringMatch) {
	if (useStrictStringMatch) return JSON.stringify(left) === JSON.stringify(right);
	// Replacer that re-emits every plain object with its keys in sorted order.
	const sortObjectKeys = (_key, value) => {
		if (value === null || typeof value !== "object" || Array.isArray(value)) return value;
		return Object.fromEntries(Object.keys(value).sort().map((key) => [key, value[key]]));
	};
	return JSON.stringify(left, sortObjectKeys) === JSON.stringify(right, sortObjectKeys);
}
1121
/**
 * Score 1 when both tool calls name the same tool, have the same argument
 * keys, and every argument value compares equal; otherwise 0.
 *
 * @param {object} predicted - Predicted tool call.
 * @param {object} reference - Reference tool call.
 * @param {object} [options] - `useStrictStringMatch` (defaults to `false`) is forwarded to `valuesEqual`.
 * @returns {0|1} Match score.
 */
function toolParameterKvMatch(predicted, reference, options = {}) {
	if (toolParameterKeyMatch(predicted, reference) === 0) {
		return 0;
	}
	const predictedArgs = parsedArgs(normalizeToolCall(predicted));
	const referenceArgs = parsedArgs(normalizeToolCall(reference));
	const allEqual = Object.keys(referenceArgs).every((key) => {
		const strict = options.useStrictStringMatch ?? false;
		return valuesEqual(predictedArgs[key], referenceArgs[key], strict);
	});
	return allEqual ? 1 : 0;
}
1128
/**
 * Compute per-call tool metrics by pairing predicted and reference tool
 * calls position by position.
 *
 * Each score is the number of matching positions divided by the longer of
 * the two lists (at least 1). Validity is counted for every predicted call;
 * name, key and key/value matches are counted only where a reference call
 * exists at the same position.
 *
 * @param {Array<object>} predicted - Predicted tool calls.
 * @param {Array<object>} reference - Reference tool calls.
 * @param {object} [options] - Forwarded to `toolParameterKvMatch`.
 * @returns {object} `tool_call_valid`, `tool_name_match`, `tool_parameter_key_match`, `tool_parameter_kv_match`.
 */
function computeToolCallMetrics(predicted, reference, options = {}) {
	const pairCount = Math.max(predicted.length, reference.length, 1);
	const tally = {
		valid: 0,
		name: 0,
		key: 0,
		kv: 0
	};
	// Positions past the end of `predicted` contribute nothing, so only the
	// predicted range needs to be visited.
	for (let position = 0; position < predicted.length; position += 1) {
		const predictedCall = predicted[position];
		if (!predictedCall) continue;
		tally.valid += toolCallValid(predictedCall);
		const referenceCall = reference[position];
		if (!referenceCall) continue;
		tally.name += toolNameMatch(predictedCall, referenceCall);
		tally.key += toolParameterKeyMatch(predictedCall, referenceCall);
		tally.kv += toolParameterKvMatch(predictedCall, referenceCall, options);
	}
	return {
		tool_call_valid: tally.valid / pairCount,
		tool_name_match: tally.name / pairCount,
		tool_parameter_key_match: tally.key / pairCount,
		tool_parameter_kv_match: tally.kv / pairCount
	};
}
1151
+ //#endregion
1152
+ //#region src/eval-interchange/projections.ts
1153
+ /**
1154
+ * Envelope projection methods for eval interchange output.
1155
+ */
1156
/**
 * Resolve the interchange fields of a repetition.
 *
 * Without a trajectory only an empty `predicted_trajectory` is returned.
 * Otherwise each field uses the value already stored on the repetition and
 * falls back to deriving it from the trajectory.
 *
 * @param {object} repetition - Repetition record, optionally with `trajectory` and precomputed interchange fields.
 * @returns {object} `predicted_trajectory`, plus `agent_trace`, `latency_in_seconds` and `failure` when a trajectory exists.
 */
function repetitionInterchangeFields(repetition) {
	const { trajectory } = repetition;
	if (!trajectory) {
		return { predicted_trajectory: [] };
	}
	const predicted_trajectory = repetition.predicted_trajectory ?? predictedTrajectoryFromView(trajectory);
	const agent_trace = repetition.agent_trace ?? buildAgentTrace(trajectory);
	const latency_in_seconds = repetition.latency_in_seconds ?? latencyInSeconds(trajectory);
	const failure = repetition.failure ?? (trajectory.success ? 0 : 1);
	return {
		predicted_trajectory,
		agent_trace,
		latency_in_seconds,
		failure
	};
}
1165
/**
 * Read the reference trajectory attached to a cell.
 *
 * @param {object} cell - Envelope cell.
 * @returns {*} The cell's `reference_trajectory` (may be `undefined`).
 */
function referenceTrajectoryForCell(cell) {
	const { reference_trajectory } = cell;
	return reference_trajectory;
}
1168
/**
 * Project one repetition into a dataset row.
 *
 * A repetition without a trajectory becomes a failed row (`failure: 1`) with
 * no response and an empty predicted trajectory. Otherwise the row carries
 * the final response and the tabular predicted trajectory; latency falls
 * back to the repetition's own duration and failure falls back to 1.
 *
 * @param {object} cell - Envelope cell (`prompt`, `reference_trajectory`, `human_ratings`).
 * @param {object} repetition - Repetition record.
 * @returns {object} Dataset row.
 */
function repetitionToDatasetRow(cell, repetition) {
	const fields = repetitionInterchangeFields(repetition);
	const { trajectory } = repetition;
	const fallbackLatency = () => repetition.durationMs / 1e3;
	if (!trajectory) {
		return {
			prompt: cell.prompt,
			response: undefined,
			predicted_trajectory: [],
			reference_trajectory: referenceTrajectoryForCell(cell),
			latency_in_seconds: fallbackLatency(),
			failure: 1,
			human_ratings: cell.human_ratings
		};
	}
	return {
		prompt: cell.prompt,
		response: trajectory.finalResponse,
		predicted_trajectory: fields.predicted_trajectory.map((call) => interchangeToTabular(call)),
		reference_trajectory: referenceTrajectoryForCell(cell),
		latency_in_seconds: fields.latency_in_seconds ?? fallbackLatency(),
		failure: fields.failure ?? 1,
		human_ratings: cell.human_ratings
	};
}
1189
/**
 * Project one repetition into a proto-style instance.
 *
 * @param {object} cell - Envelope cell (`prompt`, `reference_trajectory`).
 * @param {object} repetition - Repetition record.
 * @returns {object|null} Instance with `prompt`, `response`, `predicted_trajectory` and
 *   `reference_trajectory` (tool inputs as strings), or `null` when the repetition has no trajectory.
 */
function repetitionToProtoInstance(cell, repetition) {
	const fields = repetitionInterchangeFields(repetition);
	const { trajectory } = repetition;
	if (!trajectory) {
		return null;
	}
	const reference = referenceTrajectoryForCell(cell);
	// Reference inputs may be strings or parsed values; the instance always carries strings.
	const asStringInput = (input) => typeof input === "string" ? input : JSON.stringify(input ?? {});
	let reference_trajectory;
	if (reference) {
		reference_trajectory = {
			tool_calls: reference.map((call) => ({
				tool_name: call.tool_name,
				tool_input: asStringInput(call.tool_input)
			}))
		};
	}
	return {
		prompt: cell.prompt,
		response: trajectory.finalResponse,
		predicted_trajectory: { tool_calls: fields.predicted_trajectory },
		reference_trajectory
	};
}
1203
/**
 * Resolve the agent trace of a repetition.
 *
 * @param {object} repetition - Repetition record.
 * @returns {object|null} The agent trace, or `null` when none is available.
 */
function repetitionToAgentTrace(repetition) {
	const { agent_trace } = repetitionInterchangeFields(repetition);
	return agent_trace ?? null;
}
1206
/**
 * Compute trajectory and tool-call metrics for one repetition.
 *
 * The predicted trajectory is taken from the repetition when present, else
 * derived from its trajectory view, else treated as empty.
 *
 * @param {object} repetition - Repetition record.
 * @param {Array<object>} [referenceTrajectory] - Reference tool calls.
 * @returns {object} `{ trajectoryMetrics, toolCallMetrics }`, or `{}` when there is no non-empty reference.
 */
function computeRepetitionMetrics(repetition, referenceTrajectory) {
	if (!referenceTrajectory?.length) {
		return {};
	}
	let predicted = repetition.predicted_trajectory;
	if (predicted === null || predicted === undefined) {
		predicted = repetition.trajectory ? predictedTrajectoryFromView(repetition.trajectory) : [];
	}
	const predictedTabular = predicted.map((call) => interchangeToTabular(call));
	const trajectoryMetrics = computeTrajectoryMetrics(predictedTabular, referenceTrajectory);
	const toolCallMetrics = computeToolCallMetrics(predictedTabular, referenceTrajectory);
	return {
		trajectoryMetrics,
		toolCallMetrics
	};
}
1214
/**
 * Project an envelope into dataset rows, one per repetition of every cell.
 *
 * @param {object} envelope - Envelope with `cells`, each holding `repetitions`.
 * @returns {Array<object>} Dataset rows in cell/repetition order.
 */
function toTrajectory(envelope) {
	return envelope.cells.flatMap((cell) =>
		cell.repetitions
			.map((repetition) => repetitionToDatasetRow(cell, repetition))
			.filter((row) => Boolean(row))
	);
}
1222
/**
 * Project an envelope into proto-style instances, skipping repetitions that
 * yield no instance.
 *
 * @param {object} envelope - Envelope with `cells`, each holding `repetitions`.
 * @returns {Array<object>} Instances in cell/repetition order.
 */
function toProtoInstances(envelope) {
	return envelope.cells.flatMap((cell) =>
		cell.repetitions
			.map((repetition) => repetitionToProtoInstance(cell, repetition))
			.filter((instance) => Boolean(instance))
	);
}
1230
/**
 * Collect the agent traces of all repetitions in an envelope, skipping
 * repetitions without a trace.
 *
 * @param {object} envelope - Envelope with `cells`, each holding `repetitions`.
 * @returns {Array<object>} Agent traces in cell/repetition order.
 */
function toAgentTrace(envelope) {
	return envelope.cells.flatMap((cell) =>
		cell.repetitions
			.map((repetition) => repetitionToAgentTrace(repetition))
			.filter((trace) => Boolean(trace))
	);
}
1238
/**
 * Attach interchange fields and metrics to a repetition that has a trajectory.
 *
 * The predicted trajectory, agent trace, latency and failure flag are always
 * recomputed from the trajectory; metrics are computed against the given
 * reference. A repetition without a trajectory is returned as-is.
 *
 * @param {object} repetition - Repetition record.
 * @param {Array<object>} [referenceTrajectory] - Reference tool calls.
 * @returns {object} The original repetition, or a copy extended with `predicted_trajectory`,
 *   `agent_trace`, `latency_in_seconds`, `failure`, `trajectoryMetrics` and `toolCallMetrics`.
 */
function enrichRepetitionWithInterchange(repetition, referenceTrajectory) {
	const { trajectory } = repetition;
	if (!trajectory) {
		return repetition;
	}
	const withInterchange = {
		...repetition,
		predicted_trajectory: predictedTrajectoryFromView(trajectory),
		agent_trace: buildAgentTrace(trajectory),
		latency_in_seconds: latencyInSeconds(trajectory),
		failure: trajectory.success ? 0 : 1
	};
	const { trajectoryMetrics, toolCallMetrics } = computeRepetitionMetrics(withInterchange, referenceTrajectory);
	return {
		...withInterchange,
		trajectoryMetrics,
		toolCallMetrics
	};
}
1261
+ //#endregion
1262
+ //#region src/eval-record/build.ts
1263
+ /**
1264
+ * Build {@link EvalRunEnvelope} from harness-eval run and grading reports.
1265
+ */
1266
/**
 * Pull the `rawEvents` array off an adapter result, if it has one.
 *
 * @param {unknown} adapterResult - Adapter result of unknown shape.
 * @returns {Array|undefined} The `rawEvents` array, or `undefined` when the
 *   result is not an object or `rawEvents` is missing / not an array.
 */
function extractRawEvents(adapterResult) {
	if (adapterResult === null || typeof adapterResult !== "object") {
		return undefined;
	}
	if (!("rawEvents" in adapterResult)) {
		return undefined;
	}
	const { rawEvents } = adapterResult;
	return Array.isArray(rawEvents) ? rawEvents : undefined;
}
1269
/**
 * Decide whether a cell passed outcome grading.
 *
 * Only repetitions carrying `outcomeGrades` are considered. The cell passes
 * when none of them has a grader error or a failed expectation.
 *
 * @param {string} caseId - Case identifier (not used by the computation).
 * @param {string} cellLabel - Cell label (not used by the computation).
 * @param {Array<object>} repetitions - Repetition records of the cell.
 * @returns {boolean|undefined} Pass/fail, or `undefined` when no repetition was graded.
 */
function outcomePassForCell(caseId, cellLabel, repetitions) {
	const graded = repetitions.filter((repetition) => repetition.outcomeGrades);
	if (graded.length === 0) {
		return undefined;
	}
	const anyFailure = graded.some(({ outcomeGrades }) => outcomeGrades.error !== undefined || outcomeGrades.summary.failed !== 0);
	return !anyFailure;
}
1274
/**
 * Convert a {@link SuiteReport} (and optional grading) into a versioned
 * {@link EvalRunEnvelope} for storage or API handoff.
 *
 * For every repetition the envelope carries the assertion results and either
 * the adapter error or the trajectory (plus diagnostics and optional
 * artifacts), the matching outcome grades when grading was supplied, and the
 * derived interchange fields / metrics.
 *
 * Grading results are indexed once by (caseId, cellLabel, repetitionIndex)
 * instead of being scanned linearly for every repetition. The index keeps the
 * first result for a given triple and compares keys without string coercion,
 * matching the previous `Array.prototype.find` lookup. A grading object
 * without a `results` array is treated as having no grades.
 *
 * @param {object} report - Suite report with `cells`, `startedAt` and `durationMs`.
 * @param {object} [options] - Envelope options.
 * @param {boolean} [options.includeTranscript] - Attach a transcript artifact unless explicitly `false`.
 * @param {boolean} [options.includeRawStreamEvents] - Attach raw stream events only when exactly `true`.
 * @param {object} [options.grading] - Grading report (`results`, optional `judge`).
 * @param {string} [options.runId] - Run identifier; a random UUID when omitted.
 * @param {object} [options.suite] - Suite descriptor, copied through.
 * @param {object} [options.harness] - `adapter` (defaults to "claude-code"), `frameworkVersion`, `harnessVersion`.
 * @param {object} [options.provenance] - Provenance descriptor, copied through.
 * @returns {object} Eval run envelope.
 */
function buildEvalRunEnvelope(report, options = {}) {
	const includeTranscript = options.includeTranscript !== false;
	const includeRaw = options.includeRawStreamEvents === true;
	const judge = options.grading?.judge ?? { id: "harness-eval/claude-grader" };
	// caseId -> cellLabel -> repetitionIndex -> first grading result.
	const gradesByCase = /* @__PURE__ */ new Map();
	for (const result of options.grading?.results ?? []) {
		let byLabel = gradesByCase.get(result.caseId);
		if (!byLabel) {
			byLabel = /* @__PURE__ */ new Map();
			gradesByCase.set(result.caseId, byLabel);
		}
		let byRepetition = byLabel.get(result.cellLabel);
		if (!byRepetition) {
			byRepetition = /* @__PURE__ */ new Map();
			byLabel.set(result.cellLabel, byRepetition);
		}
		// Keep the first result for a triple, as a linear search would.
		if (!byRepetition.has(result.repetitionIndex)) byRepetition.set(result.repetitionIndex, result);
	}
	const findGrade = (caseId, cellLabel, repetitionIndex) => gradesByCase.get(caseId)?.get(cellLabel)?.get(repetitionIndex);
	const cells = report.cells.map((cell) => {
		const prompt = cell.prompt ?? "";
		const referenceTrajectory = cell.reference_trajectory;
		const repetitions = cell.repetitions.map((rep) => {
			const base = {
				repetitionIndex: rep.repetitionIndex,
				durationMs: rep.durationMs,
				assertionResults: rep.assertionResults
			};
			if (rep.error) {
				// Crashed repetition: record the error only; no trajectory, grades or metrics.
				base.error = {
					message: rep.error.message,
					diagnostics: rep.error.diagnostics
				};
				return base;
			}
			if (rep.adapterResult) {
				base.trajectory = {
					...rep.adapterResult.view,
					schemaVersion: "1.0"
				};
				base.diagnostics = rep.adapterResult.diagnostics;
				const artifacts = {};
				if (includeTranscript) artifacts.transcript = trajectoryToTranscript(rep.adapterResult.view, prompt);
				if (includeRaw) {
					const raw = extractRawEvents(rep.adapterResult);
					if (raw) artifacts.rawStreamEvents = raw;
				}
				// Omit the artifacts field entirely when nothing was collected.
				if (Object.keys(artifacts).length > 0) base.artifacts = artifacts;
			}
			const graded = findGrade(cell.caseId, cell.cell.label, rep.repetitionIndex);
			if (graded) base.outcomeGrades = {
				judge,
				expectations: graded.expectations,
				summary: graded.summary,
				evalFeedback: graded.evalFeedback,
				error: graded.graderError
			};
			return enrichRepetitionWithInterchange(base, referenceTrajectory);
		});
		return {
			caseId: cell.caseId,
			category: cell.category,
			notes: cell.notes,
			prompt: cell.prompt,
			expectations: cell.expectations,
			reference_trajectory: cell.reference_trajectory,
			human_ratings: cell.human_ratings,
			cellLabel: cell.cell.label,
			axes: cell.cell.axes,
			assertionStats: cell.assertionStats,
			adapterErrors: cell.adapterErrors,
			behavioralPass: cell.passed,
			outcomePass: outcomePassForCell(cell.caseId, cell.cell.label, repetitions),
			repetitions
		};
	});
	const cellsPassed = cells.filter((c) => c.behavioralPass).length;
	// Run-level outcome is defined only when at least one cell was graded.
	const gradedCells = cells.filter((c) => c.outcomePass !== undefined);
	const outcomePass = gradedCells.length > 0 ? gradedCells.every((c) => c.outcomePass === true) : undefined;
	return {
		schemaVersion: "1.0",
		runId: options.runId ?? randomUUID(),
		startedAt: report.startedAt,
		durationMs: report.durationMs,
		suite: options.suite,
		harness: {
			adapter: options.harness?.adapter ?? "claude-code",
			frameworkVersion: options.harness?.frameworkVersion,
			harnessVersion: options.harness?.harnessVersion
		},
		provenance: options.provenance,
		summary: {
			cellsTotal: cells.length,
			cellsPassed,
			behavioralPass: cellsPassed === cells.length,
			outcomePass
		},
		cells
	};
}
1363
/**
 * Build an envelope from an on-disk report plus optional grading and suite files.
 *
 * - `reportPath` is read as UTF-8 JSON and used as the suite report.
 * - When `options.gradingPath` is set, the grading JSON is loaded and replaces
 *   `options.grading`; only the judge is taken from `options.grading` (with
 *   the default grader id as fallback).
 * - When `options.suitePath` is set, the suite descriptor gains `uri` (the
 *   path) and `contentHash` (hex SHA-256 of the file content).
 *
 * @param {string} reportPath - Path to the report JSON.
 * @param {object} [options] - Options for `buildEvalRunEnvelope` plus `gradingPath` / `suitePath`.
 * @returns {Promise<object>} The eval run envelope.
 */
async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
	const report = JSON.parse(await readFile(reportPath, "utf8"));
	let grading = options.grading;
	if (options.gradingPath) {
		const parsedGrading = JSON.parse(await readFile(options.gradingPath, "utf8"));
		const judge = options.grading?.judge ?? { id: "harness-eval/claude-grader" };
		grading = {
			gradedAt: parsedGrading.gradedAt,
			sourceReport: parsedGrading.sourceReport,
			results: parsedGrading.results,
			judge
		};
	}
	let suite = options.suite;
	if (options.suitePath) {
		const suiteContent = await readFile(options.suitePath, "utf8");
		const contentHash = createHash("sha256").update(suiteContent).digest("hex");
		suite = {
			...suite,
			uri: options.suitePath,
			contentHash
		};
	}
	const envelopeOptions = {
		...options,
		suite,
		grading
	};
	return buildEvalRunEnvelope(report, envelopeOptions);
}
1393
+ //#endregion
1394
+ export { TRAJECTORY_SCHEMA_VERSION as A, gradeReport as C, emitOtel as D, createClaudeGrader as E, trajectoryToOtlp as O, resolveGradeOptions as S, trajectoryToTranscript as T, trajectoryRecall as _, toProtoInstances as a, formatGradingConsole as b, toolCallValid as c, toolParameterKvMatch as d, computeTrajectoryMetrics as f, trajectoryPrecision as g, trajectoryInOrderMatch as h, toAgentTrace as i, EVAL_RUN_SCHEMA_VERSION as k, toolNameMatch as l, trajectoryExactMatch as m, buildEvalRunEnvelopeFromFiles as n, toTrajectory as o, trajectoryAnyOrderMatch as p, enrichRepetitionWithInterchange as r, computeToolCallMetrics as s, buildEvalRunEnvelope as t, toolParameterKeyMatch as u, trajectorySingleToolUse as v, loadSuiteReport as w, gradingReportPassed as x, formatReport as y };
1395
+
1396
+ //# sourceMappingURL=build-DsVJ_UeU.js.map