@alis-build/harness-eval 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -4
- package/dist/adapters/claude-code/index.d.ts +1 -1
- package/dist/adapters/claude-code/index.js +1 -1
- package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
- package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
- package/dist/cli/bin.js +109 -12
- package/dist/cli/bin.js.map +1 -1
- package/dist/config/loader.d.ts +1 -1
- package/dist/config/loader.js +1 -1
- package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
- package/dist/index.d.ts +270 -152
- package/dist/index.js +124 -5
- package/dist/index.js.map +1 -0
- package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
- package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
- package/dist/loader-DcI0KfRX.js.map +1 -0
- package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
- package/dist/projections-BcX7w-f6.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
- package/dist/suite-Dlzl-HI0.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
- package/package.json +4 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { i as buildJudgeArgs } from "./claude-code-
|
|
2
|
-
import { n as createLimit } from "./suite-
|
|
1
|
+
import { i as buildJudgeArgs } from "./claude-code-DZ4Vkgp6.js";
|
|
2
|
+
import { n as createLimit } from "./suite-Dlzl-HI0.js";
|
|
3
3
|
import { spawn } from "node:child_process";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { parse } from "yaml";
|
|
@@ -11,24 +11,28 @@ const EVAL_RUN_SCHEMA_VERSION = "1.0";
|
|
|
11
11
|
const TRAJECTORY_SCHEMA_VERSION = "1.0";
|
|
12
12
|
//#endregion
|
|
13
13
|
//#region src/otel/attributes.ts
|
|
14
|
+
/**
 * Build a string-typed OTLP attribute.
 *
 * @param key - Attribute key.
 * @param value - Stored under `stringValue` exactly as given (no coercion).
 * @returns `{ key, value: { stringValue } }`.
 */
function strAttr(key, value) {
  const wrapped = { stringValue: value };
  return { key, value: wrapped };
}
|
|
21
|
+
/**
 * Build an integer-typed OTLP attribute.
 *
 * The value is carried on the wire as a decimal string under `intValue`.
 *
 * @param key - Attribute key.
 * @param value - Number to encode; converted with `String(value)`.
 * @returns `{ key, value: { intValue } }`.
 */
function intAttr(key, value) {
  const intValue = String(value);
  return { key, value: { intValue } };
}
|
|
28
|
+
/**
 * Build a boolean-typed OTLP attribute.
 *
 * @param key - Attribute key.
 * @param value - Stored under `boolValue` exactly as given (no coercion).
 * @returns `{ key, value: { boolValue } }`.
 */
function boolAttr(key, value) {
  const wrapped = { boolValue: value };
  return { key, value: wrapped };
}
|
|
35
|
+
/** Build a JSON-serialized string attribute (common for message arrays). */
|
|
32
36
|
function jsonAttr(key, value) {
|
|
33
37
|
return {
|
|
34
38
|
key,
|
|
@@ -37,6 +41,11 @@ function jsonAttr(key, value) {
|
|
|
37
41
|
}
|
|
38
42
|
//#endregion
|
|
39
43
|
//#region src/otel/messages.ts
|
|
44
|
+
/**
|
|
45
|
+
* Map harness stop reasons to GenAI semconv finish_reason values.
|
|
46
|
+
*
|
|
47
|
+
* Unknown reasons pass through unchanged for forward compatibility.
|
|
48
|
+
*/
|
|
40
49
|
function mapStopReason(reason) {
|
|
41
50
|
if (!reason) return void 0;
|
|
42
51
|
switch (reason) {
|
|
@@ -47,6 +56,7 @@ function mapStopReason(reason) {
|
|
|
47
56
|
default: return reason;
|
|
48
57
|
}
|
|
49
58
|
}
|
|
59
|
+
/** Build a tool_call part from a {@link ToolCall}. */
|
|
50
60
|
function toolCallPart(call) {
|
|
51
61
|
return {
|
|
52
62
|
type: "tool_call",
|
|
@@ -55,6 +65,7 @@ function toolCallPart(call) {
|
|
|
55
65
|
arguments: call.args ?? {}
|
|
56
66
|
};
|
|
57
67
|
}
|
|
68
|
+
/** Build a tool_call_response part from a {@link ToolCall} result. */
|
|
58
69
|
function toolResponsePart(call) {
|
|
59
70
|
return {
|
|
60
71
|
type: "tool_call_response",
|
|
@@ -62,6 +73,7 @@ function toolResponsePart(call) {
|
|
|
62
73
|
result: call.result
|
|
63
74
|
};
|
|
64
75
|
}
|
|
76
|
+
/** Convert one assistant turn to a GenAI semconv assistant message. */
|
|
65
77
|
function assistantMessageFromTurn(turn) {
|
|
66
78
|
const parts = [];
|
|
67
79
|
if (turn.text) parts.push({
|
|
@@ -76,6 +88,7 @@ function assistantMessageFromTurn(turn) {
|
|
|
76
88
|
...finish ? { finish_reason: finish } : {}
|
|
77
89
|
};
|
|
78
90
|
}
|
|
91
|
+
/** Aggregate tool results from a turn into a single tool-role message, if any. */
|
|
79
92
|
function toolResultsMessage(calls) {
|
|
80
93
|
const parts = calls.filter((c) => c.result !== null).map((c) => toolResponsePart(c));
|
|
81
94
|
if (parts.length === 0) return null;
|
|
@@ -238,8 +251,9 @@ function trajectoryToOtlp(view, options = {}) {
|
|
|
238
251
|
}]
|
|
239
252
|
}] };
|
|
240
253
|
}
|
|
241
|
-
/** Alias
|
|
254
|
+
/** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
|
|
242
255
|
const emitOtel = trajectoryToOtlp;
|
|
256
|
+
/** Map view success flag to OTLP span status on the root invoke_agent span. */
|
|
243
257
|
function viewStatus(view) {
|
|
244
258
|
if (view.success) return { code: StatusCode.OK };
|
|
245
259
|
return {
|
|
@@ -247,6 +261,13 @@ function viewStatus(view) {
|
|
|
247
261
|
message: "harness run did not complete successfully"
|
|
248
262
|
};
|
|
249
263
|
}
|
|
264
|
+
/**
|
|
265
|
+
* Assign synthetic timestamps to chat and tool spans.
|
|
266
|
+
*
|
|
267
|
+
* Stream-json does not carry per-turn wall times, so we divide the session
|
|
268
|
+
* duration evenly across chat/tool slots for OTLP consumers that require
|
|
269
|
+
* start/end times on every span.
|
|
270
|
+
*/
|
|
250
271
|
function buildSpanTimings(view, startMs, endMs) {
|
|
251
272
|
const slots = [];
|
|
252
273
|
for (const turn of view.turns) {
|
|
@@ -268,17 +289,31 @@ function buildSpanTimings(view, startMs, endMs) {
|
|
|
268
289
|
}
|
|
269
290
|
return timings;
|
|
270
291
|
}
|
|
292
|
+
/**
 * Derive a deterministic 128-bit trace id from the harness session id.
 *
 * Hashes `harness-eval:trace:<sessionId>` with SHA-256 and keeps the first
 * 32 hex characters, upper-cased, so the same session always maps to the
 * same trace.
 *
 * @param sessionId - Harness session identifier (interpolated into the hash input).
 * @returns 32-character upper-case hex string.
 */
function traceIdFromSession(sessionId) {
  const hasher = createHash("sha256");
  hasher.update(`harness-eval:trace:${sessionId}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 32).toUpperCase();
}
|
|
300
|
+
/**
 * Derive a deterministic 64-bit span id from a trace id and a logical span key.
 *
 * Hashes `<traceId>:span:<key>` with SHA-256 and keeps the first 16 hex
 * characters, upper-cased.
 *
 * @param traceId - Trace id the span belongs to.
 * @param key - Logical span key within the trace.
 * @returns 16-character upper-case hex string.
 */
function spanIdFromKey(traceId, key) {
  const hasher = createHash("sha256");
  hasher.update(`${traceId}:span:${key}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 16).toUpperCase();
}
|
|
306
|
+
/**
 * Convert milliseconds since epoch to an OTLP nanosecond timestamp string.
 *
 * @param ms - Milliseconds (fractions allowed); scaled by 1e6 and rounded
 *   to the nearest integer nanosecond.
 * @returns Decimal string of the rounded nanosecond value.
 */
function msToNs(ms) {
  const nanos = Math.round(ms * 1e6);
  return String(nanos);
}
|
|
280
310
|
//#endregion
|
|
281
311
|
//#region src/grader/prompt.ts
|
|
312
|
+
/**
|
|
313
|
+
* Build the full grader prompt including eval prompt, transcript, and schema.
|
|
314
|
+
*
|
|
315
|
+
* When `systemInstruction` is set it is prepended as a judge-specific prefix.
|
|
316
|
+
*/
|
|
282
317
|
function buildGraderPrompt(input) {
|
|
283
318
|
const expectationList = input.expectations.map((e, i) => `${i + 1}. ${e}`).join("\n");
|
|
284
319
|
return `${input.systemInstruction ? `${input.systemInstruction.trim()}\n\n` : ""}You are an automated evaluation grader (not the agent under test). Your only job is to score expectations against the transcript below.
|
|
@@ -320,6 +355,13 @@ Include every expectation in the same order. summary must match the expectations
|
|
|
320
355
|
}
|
|
321
356
|
//#endregion
|
|
322
357
|
//#region src/grader/parse.ts
|
|
358
|
+
/**
|
|
359
|
+
* Extract assistant text from Claude stdout.
|
|
360
|
+
*
|
|
361
|
+
* Handles plain text, single JSON result envelopes, stream-json arrays, and
|
|
362
|
+
* assistant message objects — the judge subprocess may emit any of these
|
|
363
|
+
* depending on Claude Code version and flags.
|
|
364
|
+
*/
|
|
323
365
|
function extractClaudeResponseText(stdout) {
|
|
324
366
|
const trimmed = stdout.trim();
|
|
325
367
|
if (!trimmed) return "";
|
|
@@ -337,6 +379,7 @@ function extractClaudeResponseText(stdout) {
|
|
|
337
379
|
} catch {}
|
|
338
380
|
return trimmed;
|
|
339
381
|
}
|
|
382
|
+
/** Walk a stream-json event array and return the final assistant or result text. */
|
|
340
383
|
function extractFromEventArray(events) {
|
|
341
384
|
const result = events.find((e) => typeof e === "object" && e !== null && e.type === "result");
|
|
342
385
|
if (result?.result) return result.result;
|
|
@@ -348,6 +391,7 @@ function extractFromEventArray(events) {
|
|
|
348
391
|
if (assistantTexts.length > 0) return assistantTexts[assistantTexts.length - 1];
|
|
349
392
|
return null;
|
|
350
393
|
}
|
|
394
|
+
/** Concatenate text blocks from an Anthropic-style assistant message object. */
|
|
351
395
|
function textFromAssistantMessage(message) {
|
|
352
396
|
if (!message || typeof message !== "object") return null;
|
|
353
397
|
const content = message.content;
|
|
@@ -357,6 +401,12 @@ function textFromAssistantMessage(message) {
|
|
|
357
401
|
for (const block of content) if (typeof block === "object" && block !== null && block.type === "text" && typeof block.text === "string") texts.push(block.text);
|
|
358
402
|
return texts.length > 0 ? texts.join("\n") : null;
|
|
359
403
|
}
|
|
404
|
+
/**
|
|
405
|
+
* Parse grader JSON from response text.
|
|
406
|
+
*
|
|
407
|
+
* Tries the raw string first, then fenced code blocks and brace-delimited
|
|
408
|
+
* substrings. Returns null when no valid expectations array is found.
|
|
409
|
+
*/
|
|
360
410
|
function parseGraderJson(text) {
|
|
361
411
|
const candidates = [text.trim(), extractJsonBlock(text)];
|
|
362
412
|
for (const candidate of candidates) {
|
|
@@ -370,6 +420,7 @@ function parseGraderJson(text) {
|
|
|
370
420
|
}
|
|
371
421
|
return null;
|
|
372
422
|
}
|
|
423
|
+
/** Extract JSON from markdown fences or the outermost `{...}` substring. */
|
|
373
424
|
function extractJsonBlock(text) {
|
|
374
425
|
const fence = text.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
375
426
|
if (fence?.[1]) return fence[1].trim();
|
|
@@ -378,6 +429,7 @@ function extractJsonBlock(text) {
|
|
|
378
429
|
if (start >= 0 && end > start) return text.slice(start, end + 1);
|
|
379
430
|
return null;
|
|
380
431
|
}
|
|
432
|
+
/** Map raw grader JSON to runtime {@link GraderOutput} with computed summary. */
|
|
381
433
|
function normalizeGraderJson(raw) {
|
|
382
434
|
const expectations = (raw.expectations ?? []).map((e) => ({
|
|
383
435
|
text: e.text ?? "",
|
|
@@ -424,15 +476,22 @@ const JUDGE_CLAUDE_DEFAULTS = {
|
|
|
424
476
|
disableSlashCommands: true,
|
|
425
477
|
noSessionPersistence: true
|
|
426
478
|
};
|
|
479
|
+
/**
 * Merge user-supplied Claude Code options over the judge defaults.
 *
 * Keys present in `claudeCode` override `JUDGE_CLAUDE_DEFAULTS`; a nullish
 * `claudeCode` yields a shallow copy of the defaults.
 *
 * @param claudeCode - Optional overrides.
 * @returns A new merged options object.
 */
function mergeJudgeClaudeOptions(claudeCode) {
  const merged = { ...JUDGE_CLAUDE_DEFAULTS, ...claudeCode };
  return merged;
}
|
|
486
|
+
/**
 * Factory returning a grader function bound to subprocess options.
 *
 * @param options - Options forwarded unchanged to `runClaudeGrader` on every call.
 * @returns Function taking a grader input and returning whatever
 *   `runClaudeGrader(input, options)` returns.
 */
function createClaudeGrader(options = {}) {
  const gradeWithBoundOptions = (input) => runClaudeGrader(input, options);
  return gradeWithBoundOptions;
}
|
|
490
|
+
/**
|
|
491
|
+
* Spawn Claude as judge, parse JSON response, align with input expectations.
|
|
492
|
+
*
|
|
493
|
+
* Unparseable output fails all expectations and sets {@link GraderOutput.error}.
|
|
494
|
+
*/
|
|
436
495
|
async function runClaudeGrader(input, options = {}) {
|
|
437
496
|
const binary = options.binary ?? options.claudeCode?.binary ?? "claude";
|
|
438
497
|
const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
@@ -478,6 +537,12 @@ async function runClaudeGrader(input, options = {}) {
|
|
|
478
537
|
evalFeedback: parsed.evalFeedback
|
|
479
538
|
};
|
|
480
539
|
}
|
|
540
|
+
/**
|
|
541
|
+
* Spawn a child process and collect stdout until exit or timeout.
|
|
542
|
+
*
|
|
543
|
+
* Non-zero exit with empty stdout is treated as failure; partial stdout on
|
|
544
|
+
* non-zero exit is retained (Claude sometimes exits non-zero after emitting JSON).
|
|
545
|
+
*/
|
|
481
546
|
function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
|
|
482
547
|
return new Promise((resolve, reject) => {
|
|
483
548
|
const child = spawn(binary, args, {
|
|
@@ -512,6 +577,9 @@ function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
|
|
|
512
577
|
});
|
|
513
578
|
});
|
|
514
579
|
}
|
|
580
|
+
/**
|
|
581
|
+
* Build subprocess env, stripping CLAUDECODE to avoid nested-session guards.
|
|
582
|
+
*/
|
|
515
583
|
function buildChildEnv(extraEnv) {
|
|
516
584
|
const env = {
|
|
517
585
|
...process.env,
|
|
@@ -525,6 +593,11 @@ function buildChildEnv(extraEnv) {
|
|
|
525
593
|
/**
|
|
526
594
|
* Load expectations sidecar (YAML or JSON).
|
|
527
595
|
*/
|
|
596
|
+
/**
|
|
597
|
+
* Load expectations sidecar (YAML or JSON).
|
|
598
|
+
*
|
|
599
|
+
* File format: `{ "<caseId>": ["expectation 1", ...], ... }`.
|
|
600
|
+
*/
|
|
528
601
|
async function loadExpectationsMap(path) {
|
|
529
602
|
const text = await readFile(path, "utf8");
|
|
530
603
|
const trimmed = path.trim().toLowerCase();
|
|
@@ -541,7 +614,14 @@ async function loadExpectationsMap(path) {
|
|
|
541
614
|
}
|
|
542
615
|
//#endregion
|
|
543
616
|
//#region src/grader/transcript.ts
|
|
617
|
+
/** Maximum characters per tool result embedded in grader transcripts. */
|
|
544
618
|
const MAX_RESULT_CHARS = 4e3;
|
|
619
|
+
/**
|
|
620
|
+
* Render a {@link TrajectoryView} as markdown for LLM graders.
|
|
621
|
+
*
|
|
622
|
+
* Tool results are truncated at {@link MAX_RESULT_CHARS} to keep judge
|
|
623
|
+
* prompts within reasonable token limits.
|
|
624
|
+
*/
|
|
545
625
|
function trajectoryToTranscript(view, prompt) {
|
|
546
626
|
const lines = [];
|
|
547
627
|
if (prompt) lines.push("## User prompt", "", prompt, "");
|
|
@@ -564,6 +644,7 @@ function trajectoryToTranscript(view, prompt) {
|
|
|
564
644
|
lines.push("## Session metadata", `session_id: ${view.meta.sessionId}`, `model: ${view.meta.model}`, `cwd: ${view.meta.cwd}`, `success: ${view.success}`, `tool_calls: ${view.toolCalls.length}`, `duration_ms: ${view.usage.durationMs}`, `input_tokens: ${view.usage.inputTokens}`, `output_tokens: ${view.usage.outputTokens}`);
|
|
565
645
|
return lines.join("\n").trimEnd();
|
|
566
646
|
}
|
|
647
|
+
/** Format unknown values as JSON for transcript embedding. */
|
|
567
648
|
function formatJson$1(value) {
|
|
568
649
|
try {
|
|
569
650
|
return JSON.stringify(value);
|
|
@@ -571,10 +652,12 @@ function formatJson$1(value) {
|
|
|
571
652
|
return String(value);
|
|
572
653
|
}
|
|
573
654
|
}
|
|
655
|
+
/**
 * Format a tool result for the transcript.
 *
 * Strings are used as-is; any other value is first rendered with
 * `formatJson$1`. The text is then passed through `truncate`.
 *
 * @param result - Tool result of any type.
 * @returns The (possibly truncated) text.
 */
function formatResult(result) {
  const text = typeof result === "string" ? result : formatJson$1(result);
  return truncate(text);
}
|
|
660
|
+
/** Truncate text with ellipsis when exceeding the transcript size budget. */
|
|
578
661
|
function truncate(text) {
|
|
579
662
|
if (text.length <= MAX_RESULT_CHARS) return text;
|
|
580
663
|
return `${text.slice(0, MAX_RESULT_CHARS)}… (truncated)`;
|
|
@@ -584,6 +667,12 @@ function truncate(text) {
|
|
|
584
667
|
/**
|
|
585
668
|
* Grade a harness-eval SuiteReport with outcome expectations (LLM judge).
|
|
586
669
|
*/
|
|
670
|
+
/**
|
|
671
|
+
* Grade every repetition in a {@link SuiteReport} that has expectations.
|
|
672
|
+
*
|
|
673
|
+
* Expectations come from inline case fields or an optional sidecar YAML/JSON
|
|
674
|
+
* map. Runs are concurrent under {@link GradeReportOptions.maxConcurrent}.
|
|
675
|
+
*/
|
|
587
676
|
async function gradeReport(report, options = {}) {
|
|
588
677
|
const expectationsMap = options.expectationsPath ? await loadExpectationsMap(options.expectationsPath) : {};
|
|
589
678
|
const gradeFn = options.gradeFn ?? createClaudeGrader({
|
|
@@ -707,6 +796,7 @@ async function gradeReport(report, options = {}) {
|
|
|
707
796
|
}
|
|
708
797
|
};
|
|
709
798
|
}
|
|
799
|
+
/** Load a suite report JSON file produced by `harness-eval run`. */
|
|
710
800
|
async function loadSuiteReport(path) {
|
|
711
801
|
const text = await readFile(path, "utf8");
|
|
712
802
|
return JSON.parse(text);
|
|
@@ -747,6 +837,11 @@ const RESET$1 = "\x1B[0m";
|
|
|
747
837
|
const GREEN$1 = "\x1B[32m";
|
|
748
838
|
const RED$1 = "\x1B[31m";
|
|
749
839
|
const DIM = "\x1B[2m";
|
|
840
|
+
/**
|
|
841
|
+
* Format a {@link SuiteGradingReport} for terminal output.
|
|
842
|
+
*
|
|
843
|
+
* @param color When true, emit ANSI status colors (default for TTY console).
|
|
844
|
+
*/
|
|
750
845
|
function formatGradingConsole(report, color = true) {
|
|
751
846
|
const lines = [];
|
|
752
847
|
if (report.results.length === 0) {
|
|
@@ -770,6 +865,7 @@ function formatGradingConsole(report, color = true) {
|
|
|
770
865
|
lines.push(`Overall: ${report.summary.passed}/${report.summary.total} (${overallPct}%) expectations passed`);
|
|
771
866
|
return lines.join("\n").trimEnd();
|
|
772
867
|
}
|
|
868
|
+
/**
 * True when every graded result passed all expectations without grader errors.
 *
 * A result counts as failing when it has a truthy `graderError`, a non-zero
 * `summary.failed`, or a `summary.total` that is not greater than zero.
 * An empty `results` list yields `true` (nothing failed).
 *
 * @param report - Grading report with a `results` array.
 * @returns Whether no result is failing.
 */
function gradingReportPassed(report) {
  const isFailing = (r) => r.graderError || r.summary.failed !== 0 || !(r.summary.total > 0);
  return !report.results.some(isFailing);
}
|
|
@@ -779,6 +875,11 @@ const RESET = "\x1B[0m";
|
|
|
779
875
|
const GREEN = "\x1B[32m";
|
|
780
876
|
const RED = "\x1B[31m";
|
|
781
877
|
const YELLOW = "\x1B[33m";
|
|
878
|
+
/**
|
|
879
|
+
* Render renderable rows as ANSI-colored console output.
|
|
880
|
+
*
|
|
881
|
+
* @param color When false, emit plain text without escape codes.
|
|
882
|
+
*/
|
|
782
883
|
function formatConsole(rows, color = true) {
|
|
783
884
|
const lines = [];
|
|
784
885
|
for (const row of rows) {
|
|
@@ -804,6 +905,7 @@ function formatConsole(rows, color = true) {
|
|
|
804
905
|
}
|
|
805
906
|
return lines.join("\n").trimEnd();
|
|
806
907
|
}
|
|
908
|
+
/** Format pass rate for display, noting when all reps crashed. */
|
|
807
909
|
function formatRate$1(stat) {
|
|
808
910
|
if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
|
|
809
911
|
const pct = (stat.passRate * 100).toFixed(0);
|
|
@@ -811,11 +913,17 @@ function formatRate$1(stat) {
|
|
|
811
913
|
}
|
|
812
914
|
//#endregion
|
|
813
915
|
//#region src/reporter/format-json.ts
|
|
916
|
+
/**
 * Serialize a suite report as indented JSON (no transformation).
 *
 * Used by `--format json` and `--output` persistence.
 *
 * @param report - Report object to serialize.
 * @returns JSON text indented with two spaces.
 */
function formatJson(report) {
  const indent = 2;
  return JSON.stringify(report, null, indent);
}
|
|
817
924
|
//#endregion
|
|
818
925
|
//#region src/reporter/format-markdown.ts
|
|
926
|
+
/** Render renderable rows as a GitHub-flavored markdown report. */
|
|
819
927
|
function formatMarkdown(rows) {
|
|
820
928
|
const lines = ["# Harness Eval Report", ""];
|
|
821
929
|
for (const row of rows) {
|
|
@@ -845,6 +953,7 @@ function formatMarkdown(rows) {
|
|
|
845
953
|
}
|
|
846
954
|
return lines.join("\n").trimEnd();
|
|
847
955
|
}
|
|
956
|
+
/** Format pass rate for markdown tables, noting when all reps crashed. */
|
|
848
957
|
function formatRate(stat) {
|
|
849
958
|
if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
|
|
850
959
|
const pct = (stat.passRate * 100).toFixed(0);
|
|
@@ -852,9 +961,15 @@ function formatRate(stat) {
|
|
|
852
961
|
}
|
|
853
962
|
//#endregion
|
|
854
963
|
//#region src/reporter/renderable.ts
|
|
964
|
+
/**
 * Map a suite report to formatter-ready rows, one per cell.
 *
 * @param report - Suite report with a `cells` array.
 * @returns Rows produced by `cellToRow`, in cell order.
 */
function toRenderableRows(report) {
  const { cells } = report;
  return cells.map((cell) => cellToRow(cell));
}
|
|
968
|
+
/**
|
|
969
|
+
* Attach baseline pass-rate deltas to matching rows.
|
|
970
|
+
*
|
|
971
|
+
* Rows without a matching baseline cell are returned unchanged.
|
|
972
|
+
*/
|
|
858
973
|
function applyBaseline(rows, baseline) {
|
|
859
974
|
const baselineMap = new Map(baseline.cells.map((c) => [`${c.caseId}::${c.cell.label}`, c]));
|
|
860
975
|
return rows.map((row) => {
|
|
@@ -876,6 +991,7 @@ function applyBaseline(rows, baseline) {
|
|
|
876
991
|
};
|
|
877
992
|
});
|
|
878
993
|
}
|
|
994
|
+
/** Convert one {@link CellReport} to a {@link RenderableRow}. */
|
|
879
995
|
function cellToRow(cell) {
|
|
880
996
|
const totalReps = cell.repetitions.length;
|
|
881
997
|
const stats = cell.assertionStats.map((s) => ({
|
|
@@ -901,6 +1017,12 @@ function cellToRow(cell) {
|
|
|
901
1017
|
}
|
|
902
1018
|
//#endregion
|
|
903
1019
|
//#region src/reporter/index.ts
|
|
1020
|
+
/**
|
|
1021
|
+
* Format a {@link SuiteReport} for console, markdown, or JSON output.
|
|
1022
|
+
*
|
|
1023
|
+
* JSON format bypasses the renderable intermediate model and serializes the
|
|
1024
|
+
* report directly. Console and markdown apply optional baseline deltas.
|
|
1025
|
+
*/
|
|
904
1026
|
function formatReport(report, options) {
|
|
905
1027
|
if (options.format === "json") return formatJson(report);
|
|
906
1028
|
let rows = toRenderableRows(report);
|
|
@@ -910,81 +1032,149 @@ function formatReport(report, options) {
|
|
|
910
1032
|
return formatConsole(rows, useColor);
|
|
911
1033
|
}
|
|
912
1034
|
//#endregion
|
|
913
|
-
//#region src/eval-interchange/
|
|
914
|
-
|
|
1035
|
+
//#region src/eval-interchange/normalize.ts
|
|
1036
|
+
/**
 * Serialize tool arguments to the wire string format.
 *
 * String inputs pass through unchanged (including the empty string), e.g.
 * pre-serialized reference steps. Anything else is JSON-stringified, with
 * `null`/`undefined` treated as an empty object (`"{}"`).
 *
 * @param args - Tool arguments from harness or suite YAML.
 * @returns The string as given, or the JSON serialization of `args ?? {}`.
 */
function serializeToolInput(args) {
  const alreadySerialized = typeof args === "string";
  return alreadySerialized ? args : JSON.stringify(args ?? {});
}
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
1049
|
+
/**
 * Normalize a tool name according to suite reference configuration.
 *
 * In `"bare"` mode everything up to and including the last `__` is removed
 * (`mcp__api__foo` → `foo`). Any other mode, or a name without `__`, returns
 * the name unchanged.
 *
 * @param toolName - Raw tool name from harness or suite.
 * @param mode - `"bare"` strips the prefix; any other value preserves the name.
 * @returns The normalized tool name.
 */
function normalizeReferenceToolName(toolName, mode) {
  if (mode === "bare") {
    const separator = toolName.lastIndexOf("__");
    if (separator !== -1) return toolName.slice(separator + 2);
  }
  return toolName;
}
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
1064
|
+
/**
 * Convert a harness or suite trajectory into the protojson wire shape.
 *
 * Each entry may use the harness shape (`name` / `args`) or the reference
 * shape (`tool_name` / `tool_input`); the presence of the `name` / `args`
 * keys decides which is read. Names go through
 * `normalizeReferenceToolName` with the selected mode and arguments through
 * `serializeToolInput`.
 *
 * @param trajectory - Tool calls in harness or YAML reference shape.
 * @param options.toolNameMode - Name mode passed to the normalizer; defaults to `"harness"`.
 * @returns `{ toolCalls: [{ toolName, toolInput }, ...] }` in input order.
 */
function toProtojsonTrajectory(trajectory, options = {}) {
  const toolNameMode = options.toolNameMode ?? "harness";
  const toWireCall = (toolCall) => {
    let name;
    if ("name" in toolCall) name = toolCall.name;
    else name = toolCall.tool_name;
    let args;
    if ("args" in toolCall) args = toolCall.args;
    else args = toolCall.tool_input;
    const toolName = normalizeReferenceToolName(name, toolNameMode);
    const toolInput = serializeToolInput(args);
    return { toolName, toolInput };
  };
  return { toolCalls: trajectory.map(toWireCall) };
}
|
|
931
|
-
|
|
1085
|
+
//#endregion
|
|
1086
|
+
//#region src/eval-interchange/protojson/trajectory-instances.ts
|
|
1087
|
+
/**
|
|
1088
|
+
* Build Vertex Trajectory*Instance protojson wire objects.
|
|
1089
|
+
*
|
|
1090
|
+
* Each trajectory metric in Vertex EvaluateInstances expects a specific
|
|
1091
|
+
* protobuf message. This module constructs all six instance payloads from
|
|
1092
|
+
* one predicted/reference pair so callers can batch-upload via JSONL.
|
|
1093
|
+
*/
|
|
1094
|
+
/**
 * Build a pair instance with predicted and reference trajectories.
 *
 * Both sides are converted with `toProtojsonTrajectory` using the same
 * `referenceToolNameMode`, so tool names on the two sides are normalized
 * identically.
 *
 * @param predicted - Predicted tool calls.
 * @param reference - Reference tool calls.
 * @param referenceToolNameMode - Tool-name mode applied to both trajectories.
 * @returns `{ predictedTrajectory, referenceTrajectory }`.
 */
function pairInstance(predicted, reference, referenceToolNameMode) {
  const convert = (trajectory) => toProtojsonTrajectory(trajectory, { toolNameMode: referenceToolNameMode });
  const predictedTrajectory = convert(predicted);
  const referenceTrajectory = convert(reference);
  return { predictedTrajectory, referenceTrajectory };
}
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
const activeTools = view.meta.availableTools.map((name) => ({ name }));
|
|
1107
|
+
/**
 * Build all Trajectory*Instance payloads for one predicted/reference pair.
 *
 * The five pair metrics (exact, in-order, any-order, precision, recall) all
 * reference the same pair object; `singleToolUse` carries only the predicted
 * trajectory.
 *
 * @param options.predicted - Predicted tool calls.
 * @param options.reference - Reference tool calls.
 * @param options.referenceToolNameMode - Tool-name mode; defaults to `"harness"`.
 * @returns Object keyed by metric name with the corresponding instance payload.
 */
function toTrajectoryInstances(options) {
  const mode = options.referenceToolNameMode ?? "harness";
  const pair = pairInstance(options.predicted, options.reference, mode);
  const { predictedTrajectory } = pair;
  const instances = {
    exactMatch: pair,
    inOrderMatch: pair,
    anyOrderMatch: pair,
    precision: pair,
    recall: pair,
    singleToolUse: { predictedTrajectory },
  };
  return instances;
}
|
|
982
|
-
|
|
983
|
-
|
|
1126
|
+
/**
 * Convert suite reference steps to a protojson trajectory.
 *
 * @param reference - Reference tool calls.
 * @param referenceToolNameMode - Tool-name mode; defaults to `"harness"`.
 * @returns Result of `toProtojsonTrajectory` for the reference steps.
 */
function toReferenceTrajectory(reference, referenceToolNameMode = "harness") {
  const options = { toolNameMode: referenceToolNameMode };
  return toProtojsonTrajectory(reference, options);
}
|
|
1132
|
+
/** Trajectory instance key → protobuf message type name. */
const TRAJECTORY_INSTANCE_MESSAGE_TYPES = new Map([
  ["exactMatch", "TrajectoryExactMatchInstance"],
  ["inOrderMatch", "TrajectoryInOrderMatchInstance"],
  ["anyOrderMatch", "TrajectoryAnyOrderMatchInstance"],
  ["precision", "TrajectoryPrecisionInstance"],
  ["recall", "TrajectoryRecallInstance"],
  ["singleToolUse", "TrajectorySingleToolUseInstance"],
]);
/**
 * Map a trajectory instance key to its protobuf message type name.
 *
 * @param key - One of `exactMatch`, `inOrderMatch`, `anyOrderMatch`,
 *   `precision`, `recall`, `singleToolUse`.
 * @returns The message type name, or `undefined` for any other key.
 */
function trajectoryInstanceMessageType(key) {
  return TRAJECTORY_INSTANCE_MESSAGE_TYPES.get(key);
}
|
|
1147
|
+
//#endregion
|
|
1148
|
+
//#region src/eval-interchange/protojson/evaluation-instance.ts
|
|
1149
|
+
/**
 * Build an EvaluationInstance protojson object from harness strings.
 *
 * Each provided field is wrapped as `{ text }`. Fields that are `undefined`
 * are left off the result entirely rather than emitted as empty wrappers.
 *
 * @param options.prompt - Case prompt sent to the agent.
 * @param options.response - Final agent response from the trajectory.
 * @param options.reference - Optional reference answer text.
 * @returns Object containing only the fields that were provided.
 */
function toEvaluationInstance(options) {
  const instance = {};
  for (const field of ["prompt", "response", "reference"]) {
    const text = options[field];
    if (text !== void 0) instance[field] = { text };
  }
  return instance;
}
|
|
985
1166
|
//#endregion
|
|
986
1167
|
//#region src/metrics/trajectory.ts
|
|
987
|
-
|
|
1168
|
+
/**
|
|
1169
|
+
* Trajectory-level metrics for comparing predicted and reference tool-call sequences.
|
|
1170
|
+
*
|
|
1171
|
+
* Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
|
|
1172
|
+
* in-order, any-order, precision, recall, single tool use). Tool calls are
|
|
1173
|
+
* compared by `(tool_name, serialized tool_input)` identity after normalization.
|
|
1174
|
+
*
|
|
1175
|
+
* Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
|
|
1176
|
+
*/
|
|
1177
|
+
function normalizeToolCall(toolCall) {
|
|
988
1178
|
if (typeof toolCall.tool_input === "string") return {
|
|
989
1179
|
tool_name: toolCall.tool_name,
|
|
990
1180
|
tool_input: toolCall.tool_input
|
|
@@ -995,11 +1185,17 @@ function normalizeToolCall$1(toolCall) {
|
|
|
995
1185
|
};
|
|
996
1186
|
}
|
|
997
1187
|
/**
 * Apply `normalizeToolCall` to every tool call in a trajectory.
 *
 * @param trajectory - Tool calls to normalize.
 * @returns New array of normalized tool calls in the same order.
 */
function normalizeTrajectory(trajectory) {
  return trajectory.map((toolCall) => normalizeToolCall(toolCall));
}
|
|
1190
|
+
/**
 * Composite key for multiset and equality checks.
 *
 * Joins the tool name and the tool input with a NUL character.
 *
 * @param toolCall - Tool call with `tool_name` and `tool_input`.
 * @returns `<tool_name>\0<tool_input>`.
 */
function toolCallKey(toolCall) {
  const { tool_name, tool_input } = toolCall;
  return `${tool_name}\0${tool_input}`;
}
|
|
1194
|
+
/**
|
|
1195
|
+
* Count predicted tool calls that appear in reference (multiset intersection).
|
|
1196
|
+
*
|
|
1197
|
+
* Duplicate tool calls are matched one-for-one; order does not matter.
|
|
1198
|
+
*/
|
|
1003
1199
|
function multisetIntersectionSize(predicted, reference) {
|
|
1004
1200
|
const refCounts = /* @__PURE__ */ new Map();
|
|
1005
1201
|
for (const toolCall of reference) {
|
|
@@ -1017,6 +1213,12 @@ function multisetIntersectionSize(predicted, reference) {
|
|
|
1017
1213
|
}
|
|
1018
1214
|
return matched;
|
|
1019
1215
|
}
|
|
1216
|
+
/**
|
|
1217
|
+
* Whether reference appears as a subsequence of predicted (order preserved).
|
|
1218
|
+
*
|
|
1219
|
+
* Extra predicted calls between reference steps are allowed (in-order match
|
|
1220
|
+
* semantics per Vertex).
|
|
1221
|
+
*/
|
|
1020
1222
|
function isSubsequence(predicted, reference) {
|
|
1021
1223
|
let refIndex = 0;
|
|
1022
1224
|
for (const toolCall of predicted) {
|
|
@@ -1032,12 +1234,15 @@ function arraysEqual(left, right) {
|
|
|
1032
1234
|
return toolCallKey(toolCall) === toolCallKey(other);
|
|
1033
1235
|
});
|
|
1034
1236
|
}
|
|
1237
|
+
/** Score 1 when the normalized trajectories compare equal via arraysEqual, otherwise 0. */
function trajectoryExactMatch(predicted, reference) {
  const predictedCalls = normalizeTrajectory(predicted);
  const referenceCalls = normalizeTrajectory(reference);
  if (arraysEqual(predictedCalls, referenceCalls)) return 1;
  return 0;
}
|
|
1241
|
+
/** Score 1 when isSubsequence accepts the normalized (predicted, reference) pair, otherwise 0. */
function trajectoryInOrderMatch(predicted, reference) {
  const predictedCalls = normalizeTrajectory(predicted);
  const referenceCalls = normalizeTrajectory(reference);
  if (isSubsequence(predictedCalls, referenceCalls)) return 1;
  return 0;
}
|
|
1245
|
+
/** Same multiset of tool calls; length must match. */
|
|
1041
1246
|
function trajectoryAnyOrderMatch(predicted, reference) {
|
|
1042
1247
|
const predictedNorm = normalizeTrajectory(predicted);
|
|
1043
1248
|
const referenceNorm = normalizeTrajectory(reference);
|
|
@@ -1046,22 +1251,34 @@ function trajectoryAnyOrderMatch(predicted, reference) {
|
|
|
1046
1251
|
const referenceKeys = referenceNorm.map(toolCallKey).sort();
|
|
1047
1252
|
return predictedKeys.every((key, index) => key === referenceKeys[index]) ? 1 : 0;
|
|
1048
1253
|
}
|
|
1254
|
+
/**
 * Multiset precision: matched tool calls divided by the number of predicted calls.
 *
 * An empty prediction scores 1 only when reference is empty as well, else 0.
 */
function trajectoryPrecision(predicted, reference) {
  const predictedCalls = normalizeTrajectory(predicted);
  const predictedCount = predictedCalls.length;
  if (predictedCount === 0) {
    // Nothing was predicted, so there is no denominator to divide by.
    return reference.length === 0 ? 1 : 0;
  }
  const matched = multisetIntersectionSize(predictedCalls, normalizeTrajectory(reference));
  return matched / predictedCount;
}
|
|
1264
|
+
/**
 * Multiset recall: matched tool calls divided by the number of reference calls.
 *
 * An empty reference scores 1 only when predicted is empty as well, else 0.
 */
function trajectoryRecall(predicted, reference) {
  const referenceCalls = normalizeTrajectory(reference);
  const referenceCount = referenceCalls.length;
  if (referenceCount === 0) {
    // No reference calls, so there is no denominator to divide by.
    return predicted.length === 0 ? 1 : 0;
  }
  const matched = multisetIntersectionSize(normalizeTrajectory(predicted), referenceCalls);
  return matched / referenceCount;
}
|
|
1274
|
+
/** Score 1 only when each trajectory holds exactly one call and their keys are identical. */
function trajectorySingleToolUse(predicted, reference) {
  const predictedCalls = normalizeTrajectory(predicted);
  const referenceCalls = normalizeTrajectory(reference);
  const bothSingle = predictedCalls.length === 1 && referenceCalls.length === 1;
  if (!bothSingle) return 0;
  const predictedKey = toolCallKey(predictedCalls[0]);
  const referenceKey = toolCallKey(referenceCalls[0]);
  return predictedKey === referenceKey ? 1 : 0;
}
|
|
1281
|
+
/** Compute all trajectory metrics in one pass. */
|
|
1065
1282
|
function computeTrajectoryMetrics(predicted, reference) {
|
|
1066
1283
|
return {
|
|
1067
1284
|
trajectory_exact_match: trajectoryExactMatch(predicted, reference),
|
|
@@ -1072,201 +1289,144 @@ function computeTrajectoryMetrics(predicted, reference) {
|
|
|
1072
1289
|
trajectory_single_tool_use: trajectorySingleToolUse(predicted, reference)
|
|
1073
1290
|
};
|
|
1074
1291
|
}
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
};
|
|
1082
|
-
return {
|
|
1083
|
-
tool_name: toolCall.tool_name,
|
|
1084
|
-
tool_input: serializeToolInput(toolCall.tool_input)
|
|
1085
|
-
};
|
|
1086
|
-
}
|
|
1087
|
-
function parsedArgs(toolCall) {
|
|
1088
|
-
const parsed = parseToolInput(toolCall.tool_input);
|
|
1089
|
-
if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) return null;
|
|
1090
|
-
return parsed;
|
|
1091
|
-
}
|
|
1092
|
-
function toolCallValid(toolCall) {
|
|
1093
|
-
const normalized = normalizeToolCall(toolCall);
|
|
1094
|
-
if (!normalized.tool_name.trim()) return 0;
|
|
1292
|
+
/**
 * Try to decode a tool_input value as JSON.
 *
 * @param toolInput - Value handed to JSON.parse (a wire-format string in normal use).
 * @returns The parsed JSON value, or `toolInput` itself when parsing throws.
 */
function parseToolInput(toolInput) {
  let parsed;
  try {
    parsed = JSON.parse(toolInput);
  } catch {
    // Not valid JSON: hand the input back untouched.
    return toolInput;
  }
  return parsed;
}
|
|
1151
1304
|
//#endregion
|
|
1152
|
-
//#region src/eval-interchange/
|
|
1305
|
+
//#region src/eval-interchange/protojson/harness-metrics.ts
|
|
1153
1306
|
/**
|
|
1154
|
-
*
|
|
1307
|
+
* Harness-owned trajectory metric scores in Vertex camelCase field names.
|
|
1308
|
+
*
|
|
1309
|
+
* Wraps {@link computeTrajectoryMetrics} for envelope export. External
|
|
1310
|
+
* systems can compare harness-precomputed scores against Vertex EvaluateInstances
|
|
1311
|
+
* results without reimplementing trajectory matching logic.
|
|
1155
1312
|
*/
|
|
1156
|
-
|
|
1157
|
-
|
|
1313
|
+
/**
 * Run computeTrajectoryMetrics and re-key its snake_case scores as camelCase.
 *
 * Every predicted and reference tool name is first passed through
 * `normalizeReferenceToolName` (defined elsewhere) together with the selected
 * mode. The mode exists so that reference steps authored with bare names
 * (e.g. `ListLandingZones`) can match harness MCP names
 * (e.g. `mcp__plugin__ListLandingZones`).
 *
 * @param predicted - Tool calls exposing `name` and `args`.
 * @param reference - Reference steps exposing `tool_name` and `tool_input`.
 * @param options.referenceToolNameMode - Name normalization mode; defaults to "harness".
 * @returns The six trajectory scores under camelCase keys.
 */
function toHarnessMetrics(predicted, reference, options = {}) {
  const nameMode = options.referenceToolNameMode ?? "harness";
  const predictedCalls = predicted.map((call) => {
    const tool_name = normalizeReferenceToolName(call.name, nameMode);
    return { tool_name, tool_input: call.args };
  });
  const referenceCalls = reference.map((step) => {
    const tool_name = normalizeReferenceToolName(step.tool_name, nameMode);
    return { tool_name, tool_input: step.tool_input };
  });
  const scores = computeTrajectoryMetrics(predictedCalls, referenceCalls);
  // [camelCase output key, snake_case source key], in output order.
  const fieldPairs = [
    ["trajectoryExactMatch", "trajectory_exact_match"],
    ["trajectoryInOrderMatch", "trajectory_in_order_match"],
    ["trajectoryAnyOrderMatch", "trajectory_any_order_match"],
    ["trajectoryPrecision", "trajectory_precision"],
    ["trajectoryRecall", "trajectory_recall"],
    ["trajectorySingleToolUse", "trajectory_single_tool_use"]
  ];
  return Object.fromEntries(fieldPairs.map(([camelKey, snakeKey]) => [camelKey, scores[snakeKey]]));
}
|
|
1165
|
-
|
|
1166
|
-
|
|
1343
|
+
//#endregion
|
|
1344
|
+
//#region src/eval-interchange/enrich.ts
|
|
1345
|
+
/**
|
|
1346
|
+
* Enrich eval repetitions with Vertex protojson interchange fields.
|
|
1347
|
+
*
|
|
1348
|
+
* Called during envelope build for each successful repetition. Adds
|
|
1349
|
+
* `evaluationInstance`, optional `trajectoryInstances` / `harnessMetrics`
|
|
1350
|
+
* when a suite reference exists, and Vertex-style `latencySeconds` / `failure`
|
|
1351
|
+
* flags derived from trajectory success.
|
|
1352
|
+
*/
|
|
1353
|
+
/** Return `reference.steps`, or undefined when `reference` is null or undefined. */
function referenceSteps(reference) {
  if (reference === null || reference === undefined) return undefined;
  return reference.steps;
}
|
|
1168
|
-
|
|
1169
|
-
|
|
1357
|
+
/**
 * Return a copy of one repetition extended with protojson interchange fields.
 *
 * Without a trajectory the copy only gains `failure: 1`. Otherwise it gains
 * `evaluationInstance`, `latencySeconds` (trajectory usage duration in seconds)
 * and `failure` (0 when the trajectory reports success, else 1).
 * `trajectoryInstances` and `harnessMetrics` are added only when the reference
 * config has a non-empty `steps` list.
 *
 * @param repetition - Base repetition; never mutated, always spread into a new object.
 * @param options.prompt - Prompt forwarded to toEvaluationInstance.
 * @param options.reference - Reference trajectory config (`steps`, `tool_name_mode`), if any.
 */
function enrichRepetitionWithProtojson(repetition, options = {}) {
  const { trajectory } = repetition;
  if (!trajectory) {
    return { ...repetition, failure: 1 };
  }
  const { prompt, reference } = options;
  const predicted = trajectory.toolCalls;
  const steps = referenceSteps(reference);
  const referenceToolNameMode = reference?.tool_name_mode ?? "harness";
  const evaluationInstance = toEvaluationInstance({
    prompt,
    response: trajectory.finalResponse
  });
  const enriched = {
    ...repetition,
    evaluationInstance,
    latencySeconds: trajectory.usage.durationMs / 1e3,
    failure: trajectory.success ? 0 : 1
  };
  // No reference steps: nothing to compare the predicted calls against.
  if (!steps?.length) return enriched;
  enriched.trajectoryInstances = toTrajectoryInstances({
    predicted,
    reference: steps,
    referenceToolNameMode
  });
  enriched.harnessMetrics = toHarnessMetrics(predicted, steps, { referenceToolNameMode });
  return enriched;
}
|
|
1261
1395
|
//#endregion
|
|
1262
1396
|
//#region src/eval-record/build.ts
|
|
1263
1397
|
/**
|
|
1264
1398
|
* Build {@link EvalRunEnvelope} from harness-eval run and grading reports.
|
|
1399
|
+
*
|
|
1400
|
+
* This is the canonical export path from in-process or on-disk {@link SuiteReport}
|
|
1401
|
+
* JSON into the cross-harness eval record contract. It stitches together:
|
|
1402
|
+
*
|
|
1403
|
+
* - Behavioral assertion results from the runner
|
|
1404
|
+
* - Optional outcome grades from the LLM grader
|
|
1405
|
+
* - Vertex protojson interchange fields via {@link enrichRepetitionWithProtojson}
|
|
1406
|
+
* - Optional artifacts (transcript, raw stream-json) controlled by build options
|
|
1407
|
+
*
|
|
1408
|
+
* Downstream consumers include CI gates, databases, and the `harness-eval envelope`
|
|
1409
|
+
* CLI projection commands.
|
|
1410
|
+
*/
|
|
1411
|
+
/**
 * Read `rawEvents` off an adapter result without assuming its concrete type.
 *
 * @param adapterResult - Any value; only non-null objects are inspected.
 * @returns The `rawEvents` array when that property exists and is an array,
 *   otherwise undefined.
 */
function extractRawEvents(adapterResult) {
  if (adapterResult === null || typeof adapterResult !== "object") return undefined;
  if (!("rawEvents" in adapterResult)) return undefined;
  const { rawEvents } = adapterResult;
  return Array.isArray(rawEvents) ? rawEvents : undefined;
}
|
|
1269
|
-
|
|
1420
|
+
/**
|
|
1421
|
+
* Derive cell-level outcome pass from graded repetitions.
|
|
1422
|
+
*
|
|
1423
|
+
* Returns `undefined` when no repetition was graded (outcome gate not applicable).
|
|
1424
|
+
* When graded, every repetition must have zero failed expectations and no grader error.
|
|
1425
|
+
*
|
|
1426
|
+
* @param _caseId - Reserved for future per-case outcome rules; unused today.
|
|
1427
|
+
* @param _cellLabel - Reserved for future per-cell outcome rules; unused today.
|
|
1428
|
+
*/
|
|
1429
|
+
function outcomePassForCell(_caseId, _cellLabel, repetitions) {
|
|
1270
1430
|
const graded = repetitions.filter((r) => r.outcomeGrades);
|
|
1271
1431
|
if (graded.length === 0) return void 0;
|
|
1272
1432
|
return graded.every((r) => r.outcomeGrades.error === void 0 && r.outcomeGrades.summary.failed === 0);
|
|
@@ -1274,6 +1434,10 @@ function outcomePassForCell(caseId, cellLabel, repetitions) {
|
|
|
1274
1434
|
/**
|
|
1275
1435
|
* Convert a {@link SuiteReport} (and optional grading) into a versioned
|
|
1276
1436
|
* {@link EvalRunEnvelope} for storage or API handoff.
|
|
1437
|
+
*
|
|
1438
|
+
* @param report - Runner output for one suite execution.
|
|
1439
|
+
* @param options - Provenance, grading merge, and artifact inclusion flags.
|
|
1440
|
+
* @returns A fully populated envelope with protojson interchange fields on each repetition.
|
|
1277
1441
|
*/
|
|
1278
1442
|
function buildEvalRunEnvelope(report, options = {}) {
|
|
1279
1443
|
const includeTranscript = options.includeTranscript !== false;
|
|
@@ -1281,7 +1445,8 @@ function buildEvalRunEnvelope(report, options = {}) {
|
|
|
1281
1445
|
const judge = options.grading?.judge ?? { id: "harness-eval/claude-grader" };
|
|
1282
1446
|
const cells = report.cells.map((cell) => {
|
|
1283
1447
|
const prompt = cell.prompt ?? "";
|
|
1284
|
-
const
|
|
1448
|
+
const referenceTrajectoryConfig = cell.reference_trajectory;
|
|
1449
|
+
const referenceTrajectory = referenceTrajectoryConfig ? toReferenceTrajectory(referenceTrajectoryConfig.steps, referenceTrajectoryConfig.tool_name_mode ?? "harness") : void 0;
|
|
1285
1450
|
const repetitions = cell.repetitions.map((rep) => {
|
|
1286
1451
|
const base = {
|
|
1287
1452
|
repetitionIndex: rep.repetitionIndex,
|
|
@@ -1317,7 +1482,10 @@ function buildEvalRunEnvelope(report, options = {}) {
|
|
|
1317
1482
|
evalFeedback: graded.evalFeedback,
|
|
1318
1483
|
error: graded.graderError
|
|
1319
1484
|
};
|
|
1320
|
-
return
|
|
1485
|
+
return enrichRepetitionWithProtojson(base, {
|
|
1486
|
+
prompt,
|
|
1487
|
+
reference: referenceTrajectoryConfig
|
|
1488
|
+
});
|
|
1321
1489
|
});
|
|
1322
1490
|
return {
|
|
1323
1491
|
caseId: cell.caseId,
|
|
@@ -1325,8 +1493,8 @@ function buildEvalRunEnvelope(report, options = {}) {
|
|
|
1325
1493
|
notes: cell.notes,
|
|
1326
1494
|
prompt: cell.prompt,
|
|
1327
1495
|
expectations: cell.expectations,
|
|
1328
|
-
|
|
1329
|
-
|
|
1496
|
+
referenceTrajectory,
|
|
1497
|
+
humanRatings: cell.human_ratings,
|
|
1330
1498
|
cellLabel: cell.cell.label,
|
|
1331
1499
|
axes: cell.cell.axes,
|
|
1332
1500
|
assertionStats: cell.assertionStats,
|
|
@@ -1360,7 +1528,16 @@ function buildEvalRunEnvelope(report, options = {}) {
|
|
|
1360
1528
|
cells
|
|
1361
1529
|
};
|
|
1362
1530
|
}
|
|
1363
|
-
/**
|
|
1531
|
+
/**
|
|
1532
|
+
* Build an envelope from on-disk runner and grader JSON artifacts.
|
|
1533
|
+
*
|
|
1534
|
+
* Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
|
|
1535
|
+
* outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
|
|
1536
|
+
* attaches suite URI and SHA-256 content hash for reproducibility.
|
|
1537
|
+
*
|
|
1538
|
+
* @param reportPath - Path to the suite run report JSON from `harness-eval run`.
|
|
1539
|
+
* @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
|
|
1540
|
+
*/
|
|
1364
1541
|
async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
|
|
1365
1542
|
const reportText = await readFile(reportPath, "utf8");
|
|
1366
1543
|
const report = JSON.parse(reportText);
|
|
@@ -1391,6 +1568,72 @@ async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
|
|
|
1391
1568
|
});
|
|
1392
1569
|
}
|
|
1393
1570
|
//#endregion
|
|
1394
|
-
|
|
1571
|
+
//#region src/eval-interchange/projections.ts
|
|
1572
|
+
/** Keys looked up on `trajectoryInstances`, listed in the order rows are emitted. */
const TRAJECTORY_INSTANCE_KEYS = [
  "exactMatch", "inOrderMatch", "anyOrderMatch",
  "precision", "recall", "singleToolUse"
];
|
|
1581
|
+
/**
 * Build one flat dataset row from a cell and one of its repetitions.
 *
 * `caseId`, `prompt` and `humanRatings` come from the cell; `response` is the
 * text of `evaluationInstance.response` when present. `latencySeconds` falls
 * back to `durationMs / 1000`, and `failure` falls back to 0/1 derived from
 * `trajectory.success`, whenever the repetition does not carry those fields.
 */
function repetitionToDatasetRow(cell, repetition) {
  const { evaluationInstance } = repetition;
  const latencySeconds = repetition.latencySeconds ?? repetition.durationMs / 1e3;
  let failure = repetition.failure;
  if (failure === undefined || failure === null) {
    failure = repetition.trajectory?.success ? 0 : 1;
  }
  return {
    caseId: cell.caseId,
    repetitionIndex: repetition.repetitionIndex,
    prompt: cell.prompt,
    response: evaluationInstance?.response?.text,
    evaluationInstance,
    latencySeconds,
    failure,
    humanRatings: cell.humanRatings
  };
}
|
|
1599
|
+
/**
 * Turn one repetition's `trajectoryInstances` into type-tagged rows.
 *
 * Keys are visited in TRAJECTORY_INSTANCE_KEYS order and falsy entries are
 * skipped. A repetition without `trajectoryInstances` yields an empty array.
 */
function repetitionToInstanceRows(cell, repetition) {
  const { trajectoryInstances } = repetition;
  if (!trajectoryInstances) return [];
  return TRAJECTORY_INSTANCE_KEYS.flatMap((key) => {
    const instance = trajectoryInstances[key];
    if (!instance) return [];
    return [{
      messageType: trajectoryInstanceMessageType(key),
      caseId: cell.caseId,
      repetitionIndex: repetition.repetitionIndex,
      instance
    }];
  });
}
|
|
1620
|
+
/**
 * Trajectory projection: one dataset row per repetition, cell by cell.
 */
function toTrajectory(envelope) {
  return envelope.cells.flatMap((cell) =>
    cell.repetitions.map((repetition) => repetitionToDatasetRow(cell, repetition))
  );
}
|
|
1628
|
+
/**
 * Instances projection: the instance rows of every repetition, concatenated
 * in cell order and then repetition order.
 */
function toInstancesJsonl(envelope) {
  return envelope.cells.flatMap((cell) =>
    cell.repetitions.flatMap((repetition) => repetitionToInstanceRows(cell, repetition))
  );
}
|
|
1636
|
+
//#endregion
|
|
1637
|
+
export { loadSuiteReport as C, trajectoryToOtlp as D, emitOtel as E, EVAL_RUN_SCHEMA_VERSION as O, gradeReport as S, createClaudeGrader as T, serializeToolInput as _, enrichRepetitionWithProtojson as a, gradingReportPassed as b, parseToolInput as c, trajectoryInOrderMatch as d, trajectoryPrecision as f, toTrajectoryInstances as g, toEvaluationInstance as h, buildEvalRunEnvelopeFromFiles as i, TRAJECTORY_SCHEMA_VERSION as k, trajectoryAnyOrderMatch as l, trajectorySingleToolUse as m, toTrajectory as n, toHarnessMetrics as o, trajectoryRecall as p, buildEvalRunEnvelope as r, computeTrajectoryMetrics as s, toInstancesJsonl as t, trajectoryExactMatch as u, formatReport as v, trajectoryToTranscript as w, resolveGradeOptions as x, formatGradingConsole as y };
|
|
1395
1638
|
|
|
1396
|
-
//# sourceMappingURL=
|
|
1639
|
+
//# sourceMappingURL=projections-BcX7w-f6.js.map
|