@alis-build/harness-eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +104 -10
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +204 -127
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
  15. package/dist/index.d.ts +397 -153
  16. package/dist/index.js +125 -5
  17. package/dist/index.js.map +1 -0
  18. package/dist/loader-B1WmGGzf.d.ts +107 -0
  19. package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
  20. package/dist/loader-DnQ6Jt0i.js.map +1 -0
  21. package/dist/reporter-Biy-5-9M.js +2216 -0
  22. package/dist/reporter-Biy-5-9M.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
  26. package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
  27. package/dist/suite-BcP64nlb.js.map +1 -0
  28. package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
  29. package/dist/types-Bac8_Ixb.js +246 -0
  30. package/dist/types-Bac8_Ixb.js.map +1 -0
  31. package/dist/types-Bu8uOZZN.d.ts +77 -0
  32. package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
  33. package/package.json +7 -2
  34. package/schemas/eval-interchange-instances.schema.json +196 -0
  35. package/schemas/eval-interchange.schema.json +65 -52
  36. package/schemas/eval-run-envelope.schema.json +182 -425
  37. package/dist/build-DsVJ_UeU.js +0 -1396
  38. package/dist/build-DsVJ_UeU.js.map +0 -1
  39. package/dist/claude-code-ycT0JQZF.js.map +0 -1
  40. package/dist/loader-BCnFJ8rm.js.map +0 -1
  41. package/dist/loader-DTvoVfN0.d.ts +0 -33
  42. package/dist/suite-chj0j22j.js.map +0 -1
  43. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  44. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
@@ -0,0 +1,2216 @@
1
+ import { i as buildJudgeArgs } from "./claude-code-C_7hxC8z.js";
2
+ import { n as createLimit, t as runSuite, u as getAdapter } from "./suite-BcP64nlb.js";
3
+ import { s as buildJudgeArgs$1 } from "./codex-0cHO2te9.js";
4
+ import { i as loadGradingConfig, l as ConfigError, o as loadSuiteDocument, s as DEFAULT_PIPELINE_OUTPUTS, t as loadSuite } from "./loader-DnQ6Jt0i.js";
5
+ import { spawn } from "node:child_process";
6
+ import { readFile, stat, writeFile } from "node:fs/promises";
7
+ import { basename, dirname, join, resolve } from "node:path";
8
+ import { createHash, randomUUID } from "node:crypto";
9
+ import { parse } from "yaml";
10
+ import { fileURLToPath } from "node:url";
11
+ //#region src/types/eval-record.ts
12
+ /** Schema version for {@link EvalRunEnvelope} JSON documents. */
13
+ const EVAL_RUN_SCHEMA_VERSION = "1.0";
14
+ /** Schema version embedded in each {@link TrajectoryView} at export time. */
15
+ const TRAJECTORY_SCHEMA_VERSION = "1.0";
16
+ //#endregion
17
+ //#region src/grader/prompt.ts
18
+ /**
19
+ * Build the full grader prompt including eval prompt, transcript, and schema.
20
+ *
21
+ * When `systemInstruction` is set it is trimmed and prepended as a judge-specific
+ * prefix, separated from the rest of the prompt by a blank line. Expectations are
+ * rendered as a 1-based numbered list, one per line.
+ *
+ * @param input - Reads `expectations` (array), `prompt`, `transcript` and the
+ * optional `systemInstruction`.
+ * @returns The prompt text, ending with the JSON output-format instructions.
22
+ */
23
+ function buildGraderPrompt(input) {
24
+ const expectationList = input.expectations.map((e, i) => `${i + 1}. ${e}`).join("\n");
25
+ return `${input.systemInstruction ? `${input.systemInstruction.trim()}\n\n` : ""}You are an automated evaluation grader (not the agent under test). Your only job is to score expectations against the transcript below.
26
+
27
+ Your job is to evaluate each expectation against the transcript and final response.
28
+ PASS only when there is clear evidence in the transcript or final response.
29
+ When uncertain, FAIL — burden of proof is on PASS.
30
+
31
+ Also critique the expectations themselves if any are trivially satisfied or miss important outcomes.
32
+
33
+ ## Eval prompt
34
+
35
+ ${input.prompt}
36
+
37
+ ## Execution transcript
38
+
39
+ ${input.transcript}
40
+
41
+ ## Expectations to grade
42
+
43
+ ${expectationList}
44
+
45
+ ## Output format
46
+
47
+ Respond with ONLY a single JSON object (no markdown fences, no commentary) matching this schema:
48
+
49
+ {
50
+ "expectations": [
51
+ { "text": "<original expectation>", "passed": true|false, "evidence": "<quote or description>" }
52
+ ],
53
+ "summary": { "passed": <int>, "failed": <int>, "total": <int>, "pass_rate": <0.0-1.0> },
54
+ "eval_feedback": {
55
+ "suggestions": [{ "assertion": "<optional>", "reason": "<string>" }],
56
+ "overall": "<brief assessment>"
57
+ }
58
+ }
59
+
60
+ Include every expectation in the same order. summary must match the expectations array.`;
61
+ }
62
+ //#endregion
63
+ //#region src/grader/parse.ts
64
/**
 * Pull the assistant's reply text out of raw Claude judge stdout.
 *
 * Stdout may be plain text, a single `result` envelope, a single `assistant`
 * event, or a JSON array of stream events. Anything that cannot be decoded
 * (including non-JSON output) is returned as the trimmed stdout.
 *
 * @param stdout - Raw stdout captured from the judge subprocess.
 * @returns Extracted reply text; `""` when stdout is blank.
 */
function extractClaudeResponseText(stdout) {
  const raw = stdout.trim();
  if (!raw) return "";
  let extracted;
  try {
    const payload = JSON.parse(raw);
    if (Array.isArray(payload)) {
      extracted = extractFromEventArray(payload) ?? raw;
    } else if (payload !== null && typeof payload === "object") {
      if (payload.type === "result" && typeof payload.result === "string") {
        extracted = payload.result;
      } else if (payload.type === "assistant" && payload.message) {
        // An empty extraction falls through to the raw stdout below.
        extracted = textFromAssistantMessage(payload.message) || undefined;
      }
    }
  } catch {
    // Not JSON (or an undecodable event): treat stdout as plain text.
  }
  return extracted ?? raw;
}
88
/**
 * Extract assistant text from Codex judge stdout.
 *
 * Plain text is returned trimmed. When stdout is a JSONL event stream (e.g.
 * from accidental `--json` usage) the text of the LAST `item.completed` event
 * that carries an assistant message is returned instead. A stream consisting
 * of a single event line is handled the same way as a multi-line stream.
 *
 * Both `assistant_message` and `agent_message` item types are accepted, in
 * either the `type` or `item_type` field.
 * NOTE(review): `agent_message` is understood to be the name used by newer
 * Codex CLI builds — confirm against the CLI version in use.
 *
 * @param stdout - Raw stdout captured from the judge subprocess.
 * @returns Extracted message text, or the trimmed stdout when no event matches.
 */
function extractCodexResponseText(stdout) {
  const trimmed = stdout.trim();
  if (!trimmed) return "";
  const messageTypes = new Set(["assistant_message", "agent_message"]);
  const lines = trimmed.split("\n").filter((line) => line.trim().length > 0);
  // Walk backwards so the final assistant message wins.
  for (let i = lines.length - 1; i >= 0; i--) {
    let event;
    try {
      event = JSON.parse(lines[i]);
    } catch {
      continue; // Not a JSON line; keep scanning earlier lines.
    }
    if (event === null || typeof event !== "object" || event.type !== "item.completed") continue;
    const item = event.item;
    if (item === null || typeof item !== "object") continue;
    const isMessage = messageTypes.has(item.type) || messageTypes.has(item.item_type);
    // Only non-empty strings are usable by the JSON parser downstream.
    if (isMessage && typeof item.text === "string" && item.text) return item.text;
  }
  return trimmed;
}
105
/**
 * Scan a stream-json event array for the judge's reply.
 *
 * The first `result` event wins when its `result` field is truthy; otherwise
 * the text of the last `assistant` event that yields any text is used.
 *
 * @param events - Parsed JSON array of stream events (entries may be any value).
 * @returns The reply text, or `null` when nothing usable is present.
 */
function extractFromEventArray(events) {
  const hasType = (entry, type) => entry !== null && typeof entry === "object" && entry.type === type;
  const resultEvent = events.find((entry) => hasType(entry, "result"));
  if (resultEvent?.result) return resultEvent.result;
  let latestAssistantText = null;
  for (const entry of events) {
    if (!hasType(entry, "assistant")) continue;
    const text = textFromAssistantMessage(entry.message);
    if (text) latestAssistantText = text;
  }
  return latestAssistantText;
}
117
/**
 * Collect the text of an Anthropic-style assistant message.
 *
 * String content is returned as-is; array content contributes every block of
 * `type: "text"` with a string `text`, joined by newlines.
 *
 * @param message - Assistant message object (any other value yields `null`).
 * @returns The message text, or `null` when there is no text content.
 */
function textFromAssistantMessage(message) {
  if (!message || typeof message !== "object") return null;
  const { content } = message;
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return null;
  const parts = content
    .filter((block) => block !== null && typeof block === "object" && block.type === "text" && typeof block.text === "string")
    .map((block) => block.text);
  return parts.length === 0 ? null : parts.join("\n");
}
127
/**
 * Decode the grader's JSON verdict from response text.
 *
 * Two candidates are tried in order: the trimmed text itself, then whatever
 * {@link extractJsonBlock} pulls out of it. The first candidate that parses
 * and normalizes to a non-empty expectations array wins.
 *
 * @param text - Response text produced by the judge.
 * @returns Normalized grader output, or `null` when no candidate qualifies.
 */
function parseGraderJson(text) {
  const attempts = [text.trim(), extractJsonBlock(text)].filter(Boolean);
  for (const source of attempts) {
    let graded;
    try {
      graded = normalizeGraderJson(JSON.parse(source));
    } catch {
      continue; // Invalid JSON or unexpected shape: try the next candidate.
    }
    if (graded.expectations.length > 0) return graded;
  }
  return null;
}
146
/**
 * Locate a JSON-looking substring inside free-form text.
 *
 * A non-empty markdown code fence (optionally tagged `json`) takes priority;
 * otherwise the span from the first `{` to the last `}` is used.
 *
 * @param text - Text that may wrap a JSON object.
 * @returns The candidate substring, or `null` when neither form is present.
 */
function extractJsonBlock(text) {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(text);
  if (fenced?.[1]) return fenced[1].trim();
  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  return open >= 0 && close > open ? text.slice(open, close + 1) : null;
}
155
/**
 * Map raw grader JSON to runtime {@link GraderOutput}.
 *
 * Each expectation gets defaulted `text`/`evidence` and a strict boolean
 * `passed`. Summary fields reported by the judge take precedence; any that
 * are missing are computed from the normalized expectations.
 *
 * `passed` coercion: string verdicts count as a pass only when they spell
 * `true` (case-insensitive, surrounding whitespace ignored), so a judge that
 * emits `"passed": "false"` is not scored as a PASS. Non-string values keep
 * ordinary truthiness.
 *
 * @param raw - Parsed JSON object from the judge (must be non-null).
 * @returns `{ expectations, summary, evalFeedback }`; `evalFeedback` is
 *   `undefined` when the judge sent no `eval_feedback`.
 * @throws {TypeError} When `raw` is nullish or `expectations` is not an array
 *   of objects (callers treat this as "unparseable").
 */
function normalizeGraderJson(raw) {
  const toPassed = (value) => (typeof value === "string" ? value.trim().toLowerCase() === "true" : Boolean(value));
  const expectations = (raw.expectations ?? []).map((e) => ({
    text: e.text ?? "",
    passed: toPassed(e.passed),
    evidence: e.evidence ?? "",
  }));
  const total = expectations.length;
  const passed = expectations.filter((e) => e.passed).length;
  const failed = total - passed;
  const passRate = raw.summary?.pass_rate ?? raw.summary?.passRate ?? (total === 0 ? 0 : passed / total);
  const summary = {
    passed: raw.summary?.passed ?? passed,
    failed: raw.summary?.failed ?? failed,
    total: raw.summary?.total ?? total,
    passRate,
  };
  let evalFeedback;
  if (raw.eval_feedback) {
    evalFeedback = {
      suggestions: (raw.eval_feedback.suggestions ?? []).map((s) => ({
        assertion: s.assertion,
        reason: s.reason ?? "",
      })),
      overall: raw.eval_feedback.overall ?? "",
    };
  }
  return {
    expectations,
    summary,
    evalFeedback,
  };
}
186
+ //#endregion
187
+ //#region src/grader/spawn-judge.ts
188
+ /**
189
+ * Shared subprocess utilities for judge graders (Claude + Codex).
190
+ *
191
+ * Owns detached spawn, process-group teardown, and SIGTERM → SIGKILL
192
+ * escalation so both graders share one implementation.
193
+ */
194
+ const KILL_GRACE_MS = 5e3;
195
/**
 * Signal a detached child's whole process group, falling back to signalling
 * only the child when the group kill throws. Failures of the fallback are
 * ignored (teardown is best-effort).
 *
 * @param child - Child process handle; a handle without a pid is a no-op.
 * @param signal - Signal to deliver (e.g. `"SIGTERM"`).
 */
function killTree(child, signal) {
  const { pid } = child;
  if (pid === undefined) return;
  try {
    // A negative pid addresses the process group led by the child.
    process.kill(-pid, signal);
    return;
  } catch {
    // Group kill failed; fall through to the single-process kill.
  }
  try {
    child.kill(signal);
  } catch {
    // Best-effort teardown: nothing further to do.
  }
}
206
/**
 * Run a judge subprocess in its own process group and gather its stdout.
 *
 * Resolves with everything written to stdout once the process closes, even
 * on a non-zero exit, as long as some stdout was produced. Rejects when the
 * process cannot be spawned, when it exits non-zero without any stdout, or
 * when `timeoutMs` elapses (the group then receives SIGTERM, followed by
 * SIGKILL after the grace period if it has not closed).
 *
 * @param options - `{ binary, args, timeoutMs, env, cwd }`; `env` defaults to
 *   the current process environment.
 * @returns Promise of the collected stdout text.
 */
function spawnCollectStdout(options) {
  const { binary: command, args: argv, timeoutMs, env: childEnv, cwd: workingDir } = options;
  return new Promise((resolve, reject) => {
    const child = spawn(command, argv, {
      env: childEnv ?? process.env,
      cwd: workingDir,
      stdio: ["ignore", "pipe", "pipe"],
      detached: true,
    });
    const stdoutParts = [];
    const stderrParts = [];
    const capture = (stream, sink) => {
      stream?.setEncoding("utf8");
      stream?.on("data", (piece) => sink.push(piece));
    };
    capture(child.stdout, stdoutParts);
    capture(child.stderr, stderrParts);

    let escalationTimer = null;
    const timeoutTimer = setTimeout(() => {
      killTree(child, "SIGTERM");
      escalationTimer = setTimeout(() => killTree(child, "SIGKILL"), KILL_GRACE_MS);
      const stderrHint = stderrParts.join("").trim().slice(0, 400);
      reject(new Error(`grader timed out after ${timeoutMs}ms` + (stderrHint ? ` (stderr: ${stderrHint})` : "")));
    }, timeoutMs);

    // Settle exactly once from the child's point of view; timers are always released.
    function settle(failure) {
      clearTimeout(timeoutTimer);
      if (escalationTimer) clearTimeout(escalationTimer);
      if (failure) {
        reject(failure);
      } else {
        resolve(stdoutParts.join(""));
      }
    }

    child.on("error", (err) => settle(err));
    child.on("close", (code) => {
      const failedSilently = code !== 0 && stdoutParts.length === 0;
      if (failedSilently) {
        settle(new Error(`grader exited ${code}: ${stderrParts.join("").slice(0, 500)}`));
      } else {
        settle();
      }
    });
  });
}
251
+ //#endregion
252
+ //#region src/grader/claude-grader.ts
253
+ /**
254
+ * Grade expectations by spawning Claude as judge (skill-creator grader pattern).
255
+ */
256
+ const DEFAULT_TIMEOUT_MS$1 = 3e5;
257
/**
 * Defaults applied to every Claude judge run: one turn, bare mode, no slash
 * commands, no session persistence — grading is a single-shot JSON answer.
 */
const JUDGE_CLAUDE_DEFAULTS = {
  maxTurns: 1,
  bare: true,
  disableSlashCommands: true,
  noSessionPersistence: true,
};
/**
 * Layer caller-provided Claude Code options on top of the judge defaults.
 *
 * @param claudeCode - Optional overrides; its keys win over the defaults.
 * @returns A new options object (the defaults object is never mutated).
 */
function mergeJudgeClaudeOptions(claudeCode) {
  return Object.assign({}, JUDGE_CLAUDE_DEFAULTS, claudeCode);
}
274
/**
 * Create a {@link GraderFn} that grades with the Claude judge using the
 * supplied subprocess options on every call.
 *
 * @param options - Options forwarded unchanged to `runClaudeGrader`.
 * @returns Function taking grader input and returning the grading promise.
 */
function createClaudeGrader(options = {}) {
  return function (input) {
    return runClaudeGrader(input, options);
  };
}
278
/**
 * Grade one transcript with Claude as judge.
 *
 * Spawns the judge, extracts its reply text, parses the JSON verdict, and
 * re-aligns verdicts with `input.expectations` by position (missing verdicts
 * count as failed). The returned summary is always recomputed locally.
 *
 * When the reply cannot be parsed, every expectation is failed and
 * {@link GraderOutput.error} carries the first 200 characters of the reply.
 *
 * @param input - Grader input (`prompt`, `transcript`, `expectations`, ...).
 * @param options - Binary, model, timeout, env, cwd and Claude Code overrides.
 * @returns The grading outcome for this transcript.
 */
async function runClaudeGrader(input, options = {}) {
  const { claudeCode } = options;
  const judgeArgs = buildJudgeArgs(buildGraderPrompt(input), {
    ...mergeJudgeClaudeOptions(claudeCode),
    model: options.model ?? claudeCode?.model,
  });
  const stdout = await spawnCollectStdout({
    binary: options.binary ?? claudeCode?.binary ?? "claude",
    args: judgeArgs,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT_MS$1,
    env: buildChildEnv(options.env),
    cwd: options.cwd,
  });
  const responseText = extractClaudeResponseText(stdout);
  const parsed = parseGraderJson(responseText);
  const total = input.expectations.length;

  if (!parsed) {
    return {
      expectations: input.expectations.map((text) => ({
        text,
        passed: false,
        evidence: "Grader returned unparseable output",
      })),
      summary: { passed: 0, failed: total, total, passRate: 0 },
      error: `failed to parse grader JSON from response: ${responseText.slice(0, 200)}`,
    };
  }

  // Verdicts are matched by index; the judge's own text is not trusted.
  let passed = 0;
  const expectations = input.expectations.map((text, index) => {
    const verdict = parsed.expectations[index];
    const didPass = verdict?.passed ?? false;
    if (didPass) passed += 1;
    return {
      text,
      passed: didPass,
      evidence: verdict?.evidence ?? "No evidence returned",
    };
  });
  return {
    expectations,
    summary: {
      passed,
      failed: total - passed,
      total,
      passRate: total === 0 ? 0 : passed / total,
    },
    evalFeedback: parsed.evalFeedback,
  };
}
334
/**
 * Compose the judge subprocess environment: the current process environment
 * overlaid with `extraEnv`, minus `CLAUDECODE` (removed to avoid
 * nested-session guards).
 *
 * @param extraEnv - Optional variables that override inherited ones.
 * @returns A fresh environment object without a `CLAUDECODE` key.
 */
function buildChildEnv(extraEnv) {
  const { CLAUDECODE: _dropped, ...env } = { ...process.env, ...extraEnv };
  return env;
}
345
+ //#endregion
346
+ //#region src/grader/codex-grader.ts
347
+ /**
348
+ * Grade expectations by spawning Codex as judge.
349
+ */
350
+ const DEFAULT_TIMEOUT_MS = 3e5;
351
/**
 * Defaults applied to every Codex judge run: ephemeral session, user config
 * ignored, git-repo check skipped.
 */
const JUDGE_CODEX_DEFAULTS = {
  ephemeral: true,
  ignoreUserConfig: true,
  skipGitRepoCheck: true,
};
/**
 * Layer caller-provided Codex options on top of the judge defaults.
 *
 * @param codex - Optional overrides; its keys win over the defaults.
 * @returns A new options object (the defaults object is never mutated).
 */
function mergeJudgeCodexOptions(codex) {
  return Object.assign({}, JUDGE_CODEX_DEFAULTS, codex);
}
364
/**
 * Create a {@link GraderFn} that grades with the Codex judge using the
 * supplied subprocess options on every call.
 *
 * @param options - Options forwarded unchanged to `runCodexGrader`.
 * @returns Function taking grader input and returning the grading promise.
 */
function createCodexGrader(options = {}) {
  return function (input) {
    return runCodexGrader(input, options);
  };
}
368
/**
 * Grade one transcript with Codex as judge.
 *
 * Spawns the judge, extracts its reply text, parses the JSON verdict, and
 * re-aligns verdicts with `input.expectations` by position (missing verdicts
 * count as failed). The returned summary is always recomputed locally.
 *
 * When the reply cannot be parsed, every expectation is failed and
 * {@link GraderOutput.error} carries the first 200 characters of the reply.
 *
 * @param input - Grader input (`prompt`, `transcript`, `expectations`, ...).
 * @param options - Binary, model, timeout, env, cwd and Codex overrides.
 * @returns The grading outcome for this transcript.
 */
async function runCodexGrader(input, options = {}) {
  const codexOptions = options.codex;
  const judgeArgs = buildJudgeArgs$1(buildGraderPrompt(input), {
    ...mergeJudgeCodexOptions(codexOptions),
    model: options.model ?? codexOptions?.model,
    cwd: options.cwd,
  });
  const stdout = await spawnCollectStdout({
    binary: options.binary ?? codexOptions?.binary ?? "codex",
    args: judgeArgs,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT_MS,
    env: { ...process.env, ...options.env },
    cwd: options.cwd,
  });
  const responseText = extractCodexResponseText(stdout);
  const parsed = parseGraderJson(responseText);
  const wanted = input.expectations;

  if (!parsed) {
    const unparseable = (text) => ({
      text,
      passed: false,
      evidence: "Grader returned unparseable output",
    });
    return {
      expectations: wanted.map(unparseable),
      summary: { passed: 0, failed: wanted.length, total: wanted.length, passRate: 0 },
      error: `failed to parse grader JSON from response: ${responseText.slice(0, 200)}`,
    };
  }

  // Verdicts are matched by index; the judge's own text is not trusted.
  const expectations = [];
  let passed = 0;
  wanted.forEach((text, index) => {
    const verdict = parsed.expectations[index];
    const didPass = verdict?.passed ?? false;
    if (didPass) passed += 1;
    expectations.push({
      text,
      passed: didPass,
      evidence: verdict?.evidence ?? "No evidence returned",
    });
  });
  const total = expectations.length;
  return {
    expectations,
    summary: {
      passed,
      failed: total - passed,
      total,
      passRate: total === 0 ? 0 : passed / total,
    },
    evalFeedback: parsed.evalFeedback,
  };
}
428
+ //#endregion
429
+ //#region src/grader/expectations.ts
430
/**
 * Load an expectations sidecar (YAML or JSON).
 *
 * File format: `{ "<caseId>": ["expectation 1", ...], ... }`. Files whose
 * path ends in `.json` (case-insensitive) are parsed as JSON; everything else
 * is parsed as YAML. Every expectation entry is converted with `String`.
 *
 * @param path - Path to the sidecar file.
 * @returns Map from case id to its list of expectation strings.
 * @throws {Error} When the document is not a plain mapping (arrays and
 *   scalars are rejected) or a case's value is not an array. Read and parse
 *   errors propagate unchanged.
 */
async function loadExpectationsMap(path) {
  const text = await readFile(path, "utf8");
  const isJson = path.trim().toLowerCase().endsWith(".json");
  const raw = isJson ? JSON.parse(text) : parse(text);
  // Arrays are objects too; without this check they would be mapped by index.
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
    throw new Error(`expectations file must be an object mapping case ids to lists`);
  }
  const entries = Object.entries(raw).map(([caseId, value]) => {
    if (!Array.isArray(value)) throw new Error(`expectations for case "${caseId}" must be an array of strings`);
    return [caseId, value.map(String)];
  });
  // fromEntries defines own properties, so a "__proto__" case id cannot
  // replace the map's prototype the way plain assignment would.
  return Object.fromEntries(entries);
}
452
+ //#endregion
453
+ //#region src/grader/transcript.ts
454
+ /** Maximum characters per tool result embedded in grader transcripts. */
455
+ const MAX_RESULT_CHARS = 4e3;
456
/**
 * Render a {@link TrajectoryView} as markdown for LLM graders.
 *
 * Layout: optional user prompt, one section per assistant turn (text, tool
 * calls with arguments and results, stop reason), the final response when it
 * does not already appear verbatim as a turn's text, then session metadata.
 *
 * A tool call whose `result` is `null` OR `undefined` is reported as
 * "(none observed)". `undefined` occurs when the key is absent (e.g. dropped
 * by a JSON round trip) and must not be formatted as a real result.
 *
 * @param view - Trajectory to render (`turns`, `finalResponse`, `meta`,
 *   `success`, `toolCalls`, `usage` are read).
 * @param prompt - User prompt; the section is omitted when falsy.
 * @returns Markdown transcript without trailing whitespace.
 */
function trajectoryToTranscript(view, prompt) {
  const lines = [];
  if (prompt) lines.push("## User prompt", "", prompt, "");
  for (const turn of view.turns) {
    lines.push(`## Assistant turn ${turn.turnIndex + 1}`, "");
    if (turn.text) lines.push(turn.text, "");
    for (const call of turn.toolCalls) {
      lines.push(`[Tool call] ${call.name} (id=${call.callId})`);
      lines.push(`Arguments: ${formatJson$1(call.args)}`);
      if (call.result === null || call.result === undefined) {
        lines.push("[Tool result] (none observed)");
      } else {
        lines.push(`[Tool result] ${formatResult(call.result)}`);
        if (call.isError) lines.push("(tool reported error)");
      }
      lines.push("");
    }
    if (turn.stopReason) lines.push(`Stop reason: ${turn.stopReason}`, "");
  }
  // Avoid repeating the final response when a turn already printed it.
  const finalInTurns = view.turns.some((t) => t.text === view.finalResponse);
  if (view.finalResponse && !finalInTurns) lines.push("## Final response", "", view.finalResponse, "");
  lines.push(
    "## Session metadata",
    `session_id: ${view.meta.sessionId}`,
    `model: ${view.meta.model}`,
    `cwd: ${view.meta.cwd}`,
    `success: ${view.success}`,
    `tool_calls: ${view.toolCalls.length}`,
    `duration_ms: ${view.usage.durationMs}`,
    `input_tokens: ${view.usage.inputTokens}`,
    `output_tokens: ${view.usage.outputTokens}`,
  );
  return lines.join("\n").trimEnd();
}
484
/**
 * Format an arbitrary value as JSON text for transcript embedding.
 *
 * Always returns a string: values `JSON.stringify` cannot represent
 * (`undefined`, functions, symbols — for which it returns `undefined`) and
 * values it throws on (cycles, BigInt) fall back to `String(value)`.
 *
 * @param value - Any value.
 * @returns JSON text, or the `String()` form when JSON is unavailable.
 */
function formatJson$1(value) {
  try {
    const json = JSON.stringify(value);
    // stringify yields undefined (not a string) for non-serializable roots.
    return json === undefined ? String(value) : json;
  } catch {
    return String(value);
  }
}
492
/**
 * Render a tool result for the transcript: strings are used verbatim, other
 * values are converted to JSON text first; the outcome is then truncated.
 *
 * @param result - Tool result value.
 * @returns Possibly truncated text form of the result.
 */
function formatResult(result) {
  const rendered = typeof result === "string" ? result : formatJson$1(result);
  return truncate(rendered);
}
497
/**
 * Cap text at `MAX_RESULT_CHARS` characters, appending an ellipsis marker
 * when anything was cut off.
 *
 * @param text - Text to cap.
 * @returns The original text when within budget, else the capped form.
 */
function truncate(text) {
  const overBudget = text.length > MAX_RESULT_CHARS;
  return overBudget ? `${text.slice(0, MAX_RESULT_CHARS)}… (truncated)` : text;
}
502
+ //#endregion
503
+ //#region src/eval-record/judge-metadata.ts
504
/**
 * Translate a grading adapter id into a stable judge identifier of the form
 * `harness-eval/<name>-grader`. `"claude-code"` and a missing adapter both
 * map to the Claude grader id.
 *
 * @param adapter - Grading adapter id, if any.
 * @returns The judge identifier string.
 */
function judgeIdForAdapter(adapter) {
  if (adapter === "codex") return "harness-eval/codex-grader";
  if (adapter === "claude-code" || !adapter) return "harness-eval/claude-grader";
  return `harness-eval/${adapter}-grader`;
}
512
/**
 * Assemble {@link JudgeInfo} from a grading adapter and optional overrides.
 *
 * The adapter defaults to `"claude-code"`; the id defaults to the identifier
 * derived from the adapter; the model is passed through untouched.
 *
 * @param options - `{ adapter?, id?, model? }`.
 * @returns `{ id, model, adapter }`.
 */
function resolveJudgeInfo(options) {
  const judge = {
    id: options.id,
    model: options.model,
    adapter: options.adapter ?? "claude-code",
  };
  if (judge.id === null || judge.id === undefined) judge.id = judgeIdForAdapter(judge.adapter);
  return judge;
}
521
/**
 * Derive judge metadata from a parsed grading config.
 *
 * Model precedence: `judge.model`, then `judge.codex.model`, then
 * `judge.claudeCode.model`. The adapter defaults to `"claude-code"`.
 *
 * @param config - Grading config with a `judge` section.
 * @returns The {@link JudgeInfo} for that config.
 */
function judgeInfoFromGradingConfig(config) {
  const { judge } = config;
  const adapter = judge.adapter ?? "claude-code";
  const model = judge.model ?? judge.codex?.model ?? judge.claudeCode?.model;
  return resolveJudgeInfo({ adapter, model });
}
528
+ //#endregion
529
+ //#region src/grader/grade-report.ts
530
/**
 * Grade every repetition in a {@link SuiteReport} that has expectations
 * (LLM judge).
 *
 * Expectations come from the cell's inline `expectations` or, when that is
 * absent, from the optional sidecar map loaded from
 * {@link GradeReportOptions.expectationsPath}. Repetitions without an adapter
 * result, and cells without expectations, are skipped. Grading runs
 * concurrently, limited by {@link GradeReportOptions.maxConcurrent}
 * (default 2).
 *
 * A grader that throws (or returns an unusable value) does not abort the
 * run: that repetition is recorded with every expectation failed and the
 * error message as `graderError`.
 *
 * Results are ordered by case id, then cell label, then repetition index
 * compared NUMERICALLY (so repetition 2 precedes repetition 10).
 *
 * Progress events: one `grade-start`, one `grade-complete` per graded
 * repetition, one `grade-done`. An exception thrown by `onProgress` itself
 * propagates to the caller rather than being recorded as a grader failure.
 *
 * @param report - Suite report whose `cells` are graded.
 * @param options - Grader selection/config, concurrency, progress callback
 *   and report metadata.
 * @returns The grading report with per-repetition results and totals.
 */
async function gradeReport(report, options = {}) {
  const expectationsMap = options.expectationsPath ? await loadExpectationsMap(options.expectationsPath) : {};

  const subprocessOptions = {
    binary: options.binary,
    model: options.model,
    timeoutMs: options.timeoutMs,
    env: options.env,
    cwd: options.cwd,
  };
  const gradeFn =
    options.gradeFn ??
    (options.judgeAdapter === "codex"
      ? createCodexGrader({ ...subprocessOptions, codex: options.codex })
      : createClaudeGrader({ ...subprocessOptions, claudeCode: options.claudeCode }));

  const limit = createLimit(options.maxConcurrent ?? 2);
  const tasks = [];
  for (const cell of report.cells) {
    const expectations = cell.expectations ?? expectationsMap[cell.caseId] ?? [];
    if (expectations.length === 0) continue;
    for (const rep of cell.repetitions) {
      if (!rep.adapterResult) continue;
      tasks.push({ cell, rep, expectations });
    }
  }

  /** Grader output used when the grader itself failed. */
  const failedOutput = (expectations, message) => ({
    expectations: expectations.map((text) => ({ text, passed: false, evidence: message })),
    summary: {
      passed: 0,
      failed: expectations.length,
      total: expectations.length,
      passRate: 0,
    },
    error: message,
  });

  /** Grade a single repetition and report its completion. */
  const gradeOne = async ({ cell, rep, expectations }) => {
    const start = Date.now();
    const view = rep.adapterResult.view;
    const prompt = cell.prompt ?? "";
    const transcript = trajectoryToTranscript(view, prompt);
    const toResult = (graded) => ({
      caseId: cell.caseId,
      cellLabel: cell.cell.label,
      repetitionIndex: rep.repetitionIndex,
      prompt,
      expectations: graded.expectations,
      summary: graded.summary,
      evalFeedback: graded.evalFeedback,
      graderError: graded.error,
      durationMs: Date.now() - start,
    });
    let result;
    try {
      // Building the result inside the try also covers graders that
      // resolve with an unusable value.
      result = toResult(
        await gradeFn({
          prompt,
          transcript,
          expectations,
          systemInstruction: options.systemInstruction,
        }),
      );
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      result = toResult(failedOutput(expectations, message));
    }
    options.onProgress?.({
      kind: "grade-complete",
      caseId: result.caseId,
      cellLabel: result.cellLabel,
      repetitionIndex: result.repetitionIndex,
      passed: result.summary.passed,
      failed: result.summary.failed,
      durationMs: result.durationMs,
      graderError: result.graderError,
    });
    return result;
  };

  const gradeStartTs = Date.now();
  options.onProgress?.({
    kind: "grade-start",
    total: tasks.length,
  });
  const results = await Promise.all(tasks.map((task) => limit(() => gradeOne(task))));

  // Compare field by field: a single concatenated string key would order
  // repetition indexes lexicographically ("10" before "2").
  results.sort(
    (a, b) =>
      String(a.caseId).localeCompare(String(b.caseId)) ||
      String(a.cellLabel).localeCompare(String(b.cellLabel)) ||
      a.repetitionIndex - b.repetitionIndex,
  );

  const totalExpectations = results.reduce((n, r) => n + r.summary.total, 0);
  const passedExpectations = results.reduce((n, r) => n + r.summary.passed, 0);
  options.onProgress?.({
    kind: "grade-done",
    durationMs: Date.now() - gradeStartTs,
    totalExpectations,
    passedExpectations,
  });
  return {
    gradedAt: new Date().toISOString(),
    sourceReport: options.sourceReport ?? "",
    gradingConfigPath: options.gradingConfigPath,
    judge: resolveJudgeInfo({
      adapter: options.judgeAdapter ?? "claude-code",
      model: options.model,
    }),
    results,
    summary: {
      passed: passedExpectations,
      failed: totalExpectations - passedExpectations,
      total: totalExpectations,
      passRate: totalExpectations === 0 ? 0 : passedExpectations / totalExpectations,
    },
  };
}
673
/**
 * Read a suite report JSON file (as produced by `harness-eval run`) and
 * return its parsed contents. Read and JSON syntax errors propagate.
 *
 * @param path - Path to the report file.
 * @returns The parsed report.
 */
async function loadSuiteReport(path) {
  return JSON.parse(await readFile(path, "utf8"));
}
678
+ //#endregion
679
+ //#region src/grader/resolve-grade-options.ts
680
/**
 * Combine a standalone grading config with CLI flags; CLI values win.
 *
 * `binary` and `model` are resolved once (CLI, then `judge.model` for the
 * model, then the selected adapter's block) and blanked inside the returned
 * adapter block so the top-level values are the single source.
 *
 * @param fileConfig - Parsed grading config, if any (`judge` section is read).
 * @param cli - CLI overrides (`binary`, `model`, `timeoutMs`,
 *   `maxConcurrent`, `sourceReport`, `expectationsPath`).
 * @param configPath - Path of the grading config, recorded in the result.
 * @returns Options for `gradeReport`, carrying either a `codex` or a
 *   `claudeCode` block to match `judgeAdapter`.
 * @throws {Error} When the configured adapter is neither `claude-code` nor
 *   `codex`.
 */
function resolveGradeOptions(fileConfig, cli = {}, configPath) {
  const judge = fileConfig?.judge;
  const adapter = judge?.adapter ?? "claude-code";
  const adapterKey = adapter === "codex" ? "codex" : "claudeCode";
  const adapterOptions = judge?.[adapterKey] ?? {};
  const binary = cli.binary ?? adapterOptions.binary;
  const model = cli.model ?? judge?.model ?? adapterOptions.model;
  if (adapter !== "codex" && adapter !== "claude-code") {
    throw new Error(`unsupported grading adapter "${adapter}" (supported: claude-code, codex)`);
  }
  return {
    sourceReport: cli.sourceReport,
    expectationsPath: cli.expectationsPath,
    model,
    binary,
    timeoutMs: cli.timeoutMs ?? judge?.timeoutMs,
    maxConcurrent: cli.maxConcurrent ?? judge?.maxConcurrent,
    systemInstruction: judge?.system_instruction,
    env: judge?.env,
    cwd: judge?.cwd,
    judgeAdapter: adapter,
    [adapterKey]: {
      ...adapterOptions,
      binary: undefined,
      model: undefined,
    },
    gradingConfigPath: configPath,
  };
}
729
+ //#endregion
730
+ //#region src/grader/format-console.ts
731
const RESET$1 = "\x1B[0m";
const GREEN$1 = "\x1B[32m";
const RED$1 = "\x1B[31m";
const DIM = "\x1B[2m";
/**
 * Render a {@link SuiteGradingReport} as terminal text.
 *
 * Each graded repetition gets a PASS/FAIL header (PASS requires zero failed
 * expectations and no grader error), an optional grader-error line, one line
 * per expectation (plus its evidence when the expectation failed or evidence
 * is present) and a per-repetition tally. An overall tally closes the output.
 *
 * @param report - Grading report to render.
 * @param color When true, emit ANSI status colors (default for TTY console).
 * @returns The formatted text, without trailing whitespace.
 */
function formatGradingConsole(report, color = true) {
  if (report.results.length === 0) {
    return "No repetitions graded. Add expectations to the suite YAML or pass --expectations.";
  }
  // Wrap text in an ANSI code only when color output is requested.
  const paint = (code, text) => (color ? `${code}${text}${RESET$1}` : text);
  const percent = (rate) => (rate * 100).toFixed(0);
  const out = [];
  for (const { caseId, cellLabel, repetitionIndex, graderError, summary, expectations } of report.results) {
    const ok = summary.failed === 0 && !graderError;
    out.push(`${caseId} @ ${cellLabel} rep${repetitionIndex} ${ok ? paint(GREEN$1, "PASS") : paint(RED$1, "FAIL")}`);
    if (graderError) out.push(` ${paint(RED$1, `grader error: ${graderError}`)}`);
    for (const { text, passed, evidence } of expectations) {
      out.push(` ├─ ${text} ${passed ? paint(GREEN$1, "✓") : paint(RED$1, "✗")}`);
      if (!passed || evidence) out.push(` │ ${paint(DIM, evidence)}`);
    }
    out.push(` └─ ${summary.passed}/${summary.total} (${percent(summary.passRate)}%) expectations`, "");
  }
  out.push(`Overall: ${report.summary.passed}/${report.summary.total} (${percent(report.summary.passRate)}%) expectations passed`);
  return out.join("\n").trimEnd();
}
763
/**
 * Decide whether a grading report counts as passed: no graded repetition may
 * have a grader error, a failed expectation, or a non-positive expectation
 * total. A report with no graded repetitions passes vacuously.
 *
 * @param report - Grading report to check.
 * @returns `true` when no repetition is disqualifying.
 */
function gradingReportPassed(report) {
  const isDisqualifying = (rep) => Boolean(rep.graderError) || rep.summary.failed !== 0 || !(rep.summary.total > 0);
  return !report.results.some(isDisqualifying);
}
767
+ //#endregion
768
+ //#region src/eval-interchange/normalize.ts
769
/**
 * Turn tool arguments into the string form used for `toolInput` on the wire.
 *
 * Strings are returned untouched (they are treated as already serialized).
 * `null` / `undefined` serialize as `{}`; every other value goes through
 * `JSON.stringify`.
 *
 * @param args - Tool arguments from harness or suite YAML.
 * @returns JSON string suitable for {@link ProtojsonToolCall.toolInput}.
 */
function serializeToolInput(args) {
  const alreadySerialized = typeof args === "string";
  if (alreadySerialized) {
    return args;
  }
  const payload = args === null || args === undefined ? {} : args;
  return JSON.stringify(payload);
}
782
/**
 * Apply the suite's tool-name mode to a single tool name.
 *
 * Only `"bare"` changes anything: the name is cut down to whatever follows
 * its last `__` (`mcp__api__foo` → `foo`). Names without `__`, and every
 * other mode, are returned as given.
 *
 * @param toolName - Raw tool name from harness or suite.
 * @param mode - `"bare"` strips the prefix; any other value preserves the name.
 */
function normalizeReferenceToolName(toolName, mode) {
  if (mode === "bare") {
    const lastSeparator = toolName.lastIndexOf("__");
    return lastSeparator < 0 ? toolName : toolName.slice(lastSeparator + 2);
  }
  return toolName;
}
797
/**
 * Map a list of tool calls onto `{ toolCalls: [{ toolName, toolInput }] }`.
 *
 * Each entry may use either field spelling: `name` / `args` or
 * `tool_name` / `tool_input` (the presence of the `name` / `args` key
 * decides which is read). Names go through
 * {@link normalizeReferenceToolName} with the selected mode and arguments
 * through {@link serializeToolInput}.
 *
 * @param trajectory - Tool calls in either supported shape.
 * @param options.toolNameMode - Name mode; defaults to `"harness"` when nullish.
 */
function toProtojsonTrajectory(trajectory, options = {}) {
  const mode = options.toolNameMode ?? "harness";

  // Convert one entry, reading whichever field spelling it carries.
  const toWireCall = (entry) => {
    const rawName = "name" in entry ? entry.name : entry.tool_name;
    const rawArgs = "args" in entry ? entry.args : entry.tool_input;
    return {
      toolName: normalizeReferenceToolName(rawName, mode),
      toolInput: serializeToolInput(rawArgs)
    };
  };

  return { toolCalls: trajectory.map((entry) => toWireCall(entry)) };
}
818
+ //#endregion
819
+ //#region src/eval-interchange/protojson/trajectory-instances.ts
820
+ /**
821
+ * Build Vertex Trajectory*Instance protojson wire objects.
822
+ *
823
+ * Each trajectory metric in Vertex EvaluateInstances expects a specific
824
+ * protobuf message. This module constructs all six instance payloads from
825
+ * one predicted/reference pair so callers can batch-upload via JSONL.
826
+ */
827
/**
 * Build the predicted/reference pair shared by the pair-style instances.
 *
 * Both trajectories are converted with the same tool-name mode, so names on
 * the two sides are normalized identically.
 *
 * @param predicted - Predicted tool calls.
 * @param reference - Reference tool calls.
 * @param referenceToolNameMode - Mode forwarded as `toolNameMode` for both sides.
 */
function pairInstance(predicted, reference, referenceToolNameMode) {
  const predictedTrajectory = toProtojsonTrajectory(predicted, { toolNameMode: referenceToolNameMode });
  const referenceTrajectory = toProtojsonTrajectory(reference, { toolNameMode: referenceToolNameMode });
  return { predictedTrajectory, referenceTrajectory };
}
840
/**
 * Produce the six trajectory instance payloads for one predicted/reference pair.
 *
 * The five pair metrics all point at the same pair object; `singleToolUse`
 * carries only the predicted trajectory.
 *
 * @param options.predicted - Predicted tool calls.
 * @param options.reference - Reference tool calls.
 * @param options.referenceToolNameMode - Name mode; defaults to `"harness"` when nullish.
 */
function toTrajectoryInstances(options) {
  const mode = options.referenceToolNameMode ?? "harness";
  const sharedPair = pairInstance(options.predicted, options.reference, mode);
  const { predictedTrajectory } = sharedPair;
  return {
    exactMatch: sharedPair,
    inOrderMatch: sharedPair,
    anyOrderMatch: sharedPair,
    precision: sharedPair,
    recall: sharedPair,
    singleToolUse: { predictedTrajectory }
  };
}
859
/**
 * Convert reference steps into a protojson trajectory.
 *
 * @param reference - Reference steps.
 * @param referenceToolNameMode - Name mode, `"harness"` by default.
 */
function toReferenceTrajectory(reference, referenceToolNameMode = "harness") {
  const conversionOptions = { toolNameMode: referenceToolNameMode };
  return toProtojsonTrajectory(reference, conversionOptions);
}
865
/**
 * Look up the message type name for a trajectory instance key.
 *
 * Used as `messageType` in {@link InstancesJsonlRow}. Keys outside the six
 * known ones yield `undefined`.
 */
function trajectoryInstanceMessageType(key) {
  const messageTypes = new Map([
    ["exactMatch", "TrajectoryExactMatchInstance"],
    ["inOrderMatch", "TrajectoryInOrderMatchInstance"],
    ["anyOrderMatch", "TrajectoryAnyOrderMatchInstance"],
    ["precision", "TrajectoryPrecisionInstance"],
    ["recall", "TrajectoryRecallInstance"],
    ["singleToolUse", "TrajectorySingleToolUseInstance"]
  ]);
  return messageTypes.get(key);
}
880
+ //#endregion
881
+ //#region src/eval-interchange/protojson/evaluation-instance.ts
882
/**
 * Assemble an evaluation instance from plain strings.
 *
 * Each of `prompt`, `response`, and `reference` is wrapped as `{ text }`
 * when it is not `undefined`; fields that are `undefined` are left off the
 * result entirely.
 *
 * @param options.prompt - Prompt text.
 * @param options.response - Response text.
 * @param options.reference - Optional reference text.
 */
function toEvaluationInstance(options) {
  const instance = {};
  for (const field of ["prompt", "response", "reference"]) {
    const text = options[field];
    if (text !== undefined) {
      instance[field] = { text };
    }
  }
  return instance;
}
899
+ //#endregion
900
+ //#region src/metrics/trajectory.ts
901
+ /**
902
+ * Trajectory-level metrics for comparing predicted and reference tool-call sequences.
903
+ *
904
+ * Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
905
+ * in-order, any-order, precision, recall, single tool use). Tool calls are
906
+ * compared by `(tool_name, serialized tool_input)` identity after normalization.
907
+ *
908
+ * Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
909
+ */
910
/**
 * Bring one tool call into comparison form: `tool_name` unchanged and
 * `tool_input` as a string. String inputs are kept as-is; anything else is
 * passed through {@link serializeToolInput}.
 */
function normalizeToolCall(toolCall) {
  const { tool_name, tool_input } = toolCall;
  const serializedInput = typeof tool_input === "string" ? tool_input : serializeToolInput(tool_input);
  return { tool_name, tool_input: serializedInput };
}
920
/** Normalize every tool call in a trajectory via {@link normalizeToolCall}. */
function normalizeTrajectory(trajectory) {
  return trajectory.map((toolCall) => normalizeToolCall(toolCall));
}
923
/**
 * Composite identity key for a tool call: name and input joined by a NUL
 * character. Used for equality and multiset counting.
 */
function toolCallKey(toolCall) {
  const { tool_name, tool_input } = toolCall;
  return `${tool_name}\0${tool_input}`;
}
927
/**
 * Size of the multiset intersection of two tool-call lists.
 *
 * Reference calls are tallied by key; each predicted call then consumes one
 * remaining tally for its key if available. Order is irrelevant and
 * duplicates pair off one-for-one.
 */
function multisetIntersectionSize(predicted, reference) {
  const remaining = new Map();
  for (const refCall of reference) {
    const refKey = toolCallKey(refCall);
    const seen = remaining.get(refKey) ?? 0;
    remaining.set(refKey, seen + 1);
  }

  let hits = 0;
  for (const predictedCall of predicted) {
    const predictedKey = toolCallKey(predictedCall);
    const available = remaining.get(predictedKey) ?? 0;
    if (available <= 0) continue;
    remaining.set(predictedKey, available - 1);
    hits += 1;
  }
  return hits;
}
949
/**
 * Check that `reference` occurs within `predicted` in the same order.
 *
 * Walks `predicted` once, advancing through `reference` on each key match;
 * unmatched predicted calls in between are ignored. An empty `reference`
 * is always a subsequence.
 */
function isSubsequence(predicted, reference) {
  const needed = reference.length;
  let matchedCount = 0;
  for (let i = 0; i < predicted.length && matchedCount < needed; i += 1) {
    if (toolCallKey(predicted[i]) === toolCallKey(reference[matchedCount])) {
      matchedCount += 1;
    }
  }
  return matchedCount === needed;
}
963
/**
 * Position-wise equality of two tool-call lists, compared by
 * {@link toolCallKey}. Lists of different length are never equal.
 */
function arraysEqual(left, right) {
  if (left.length !== right.length) {
    return false;
  }
  const hasMismatch = left.some((call, position) => toolCallKey(call) !== toolCallKey(right[position]));
  return !hasMismatch;
}
970
/** 1 when the normalized trajectories are equal call-for-call, otherwise 0. */
function trajectoryExactMatch(predicted, reference) {
  const predictedNorm = normalizeTrajectory(predicted);
  const referenceNorm = normalizeTrajectory(reference);
  if (arraysEqual(predictedNorm, referenceNorm)) {
    return 1;
  }
  return 0;
}
974
/**
 * 1 when the normalized reference is an ordered subsequence of the
 * normalized prediction (extra predicted calls allowed), otherwise 0.
 */
function trajectoryInOrderMatch(predicted, reference) {
  const predictedNorm = normalizeTrajectory(predicted);
  const referenceNorm = normalizeTrajectory(reference);
  if (isSubsequence(predictedNorm, referenceNorm)) {
    return 1;
  }
  return 0;
}
978
/**
 * 1 when both trajectories contain the same tool calls with the same
 * multiplicities, in any order; otherwise 0. Different lengths give 0.
 */
function trajectoryAnyOrderMatch(predicted, reference) {
  const predictedNorm = normalizeTrajectory(predicted);
  const referenceNorm = normalizeTrajectory(reference);
  if (predictedNorm.length !== referenceNorm.length) {
    return 0;
  }
  // Sorting both key lists lets equal multisets line up position by position.
  const sortedPredicted = predictedNorm.map((call) => toolCallKey(call)).sort();
  const sortedReference = referenceNorm.map((call) => toolCallKey(call)).sort();
  for (let i = 0; i < sortedPredicted.length; i += 1) {
    if (sortedPredicted[i] !== sortedReference[i]) {
      return 0;
    }
  }
  return 1;
}
987
/**
 * Share of predicted tool calls that also occur in the reference (multiset
 * matching).
 *
 * With no predicted calls the result is 1 if the reference is empty as
 * well, else 0.
 */
function trajectoryPrecision(predicted, reference) {
  const predictedNorm = normalizeTrajectory(predicted);
  const predictedCount = predictedNorm.length;
  if (predictedCount === 0) {
    return reference.length === 0 ? 1 : 0;
  }
  const matched = multisetIntersectionSize(predictedNorm, normalizeTrajectory(reference));
  return matched / predictedCount;
}
997
/**
 * Share of reference tool calls that are matched by the prediction
 * (multiset matching).
 *
 * With an empty reference the result is 1 if the prediction is empty as
 * well, else 0.
 */
function trajectoryRecall(predicted, reference) {
  const referenceNorm = normalizeTrajectory(reference);
  const referenceCount = referenceNorm.length;
  if (referenceCount === 0) {
    return predicted.length === 0 ? 1 : 0;
  }
  const matched = multisetIntersectionSize(normalizeTrajectory(predicted), referenceNorm);
  return matched / referenceCount;
}
1007
/**
 * 1 when prediction and reference each consist of exactly one tool call and
 * those two calls have the same key; otherwise 0.
 */
function trajectorySingleToolUse(predicted, reference) {
  const predictedNorm = normalizeTrajectory(predicted);
  const referenceNorm = normalizeTrajectory(reference);
  const bothSingle = predictedNorm.length === 1 && referenceNorm.length === 1;
  if (!bothSingle) {
    return 0;
  }
  const [onlyPredicted] = predictedNorm;
  const [onlyReference] = referenceNorm;
  return toolCallKey(onlyPredicted) === toolCallKey(onlyReference) ? 1 : 0;
}
1014
/**
 * Evaluate all six trajectory metrics for one predicted/reference pair and
 * return them under their snake_case names.
 */
function computeTrajectoryMetrics(predicted, reference) {
  const exactMatch = trajectoryExactMatch(predicted, reference);
  const inOrderMatch = trajectoryInOrderMatch(predicted, reference);
  const anyOrderMatch = trajectoryAnyOrderMatch(predicted, reference);
  const precision = trajectoryPrecision(predicted, reference);
  const recall = trajectoryRecall(predicted, reference);
  const singleToolUse = trajectorySingleToolUse(predicted, reference);
  return {
    trajectory_exact_match: exactMatch,
    trajectory_in_order_match: inOrderMatch,
    trajectory_any_order_match: anyOrderMatch,
    trajectory_precision: precision,
    trajectory_recall: recall,
    trajectory_single_tool_use: singleToolUse
  };
}
1025
/**
 * Best-effort JSON decode of a `tool_input` string.
 *
 * @param toolInput - Serialized tool input.
 * @returns The parsed JSON value, or `toolInput` itself when parsing throws.
 */
function parseToolInput(toolInput) {
  let parsed;
  try {
    parsed = JSON.parse(toolInput);
  } catch {
    // Not valid JSON: hand the raw input back unchanged.
    return toolInput;
  }
  return parsed;
}
1037
+ //#endregion
1038
+ //#region src/eval-interchange/protojson/harness-metrics.ts
1039
+ /**
1040
+ * Harness-owned trajectory metric scores in Vertex camelCase field names.
1041
+ *
1042
+ * Wraps {@link computeTrajectoryMetrics} for envelope export. External
1043
+ * systems can compare harness-precomputed scores against Vertex EvaluateInstances
1044
+ * results without reimplementing trajectory matching logic.
1045
+ */
1046
/**
 * Run {@link computeTrajectoryMetrics} on a harness prediction and suite
 * reference, returning the scores under camelCase keys.
 *
 * Predicted calls are read as `name` / `args`, reference steps as
 * `tool_name` / `tool_input`. Tool names on both sides go through
 * {@link normalizeReferenceToolName} with the same mode before comparison,
 * so in `"bare"` mode both are reduced to the part after the last `__`.
 *
 * @param predicted - Tool calls with `name` and `args`.
 * @param reference - Reference steps with `tool_name` and `tool_input`.
 * @param options.referenceToolNameMode - Name mode; defaults to `"harness"` when nullish.
 */
function toHarnessMetrics(predicted, reference, options = {}) {
  const mode = options.referenceToolNameMode ?? "harness";

  const predictedCalls = predicted.map((toolCall) => ({
    tool_name: normalizeReferenceToolName(toolCall.name, mode),
    tool_input: toolCall.args
  }));
  const referenceCalls = reference.map((step) => ({
    tool_name: normalizeReferenceToolName(step.tool_name, mode),
    tool_input: step.tool_input
  }));

  const {
    trajectory_exact_match: trajectoryExactMatch,
    trajectory_in_order_match: trajectoryInOrderMatch,
    trajectory_any_order_match: trajectoryAnyOrderMatch,
    trajectory_precision: trajectoryPrecision,
    trajectory_recall: trajectoryRecall,
    trajectory_single_tool_use: trajectorySingleToolUse
  } = computeTrajectoryMetrics(predictedCalls, referenceCalls);

  return {
    trajectoryExactMatch,
    trajectoryInOrderMatch,
    trajectoryAnyOrderMatch,
    trajectoryPrecision,
    trajectoryRecall,
    trajectorySingleToolUse
  };
}
1076
+ //#endregion
1077
+ //#region src/eval-interchange/enrich.ts
1078
+ /**
1079
+ * Enrich eval repetitions with Vertex protojson interchange fields.
1080
+ *
1081
+ * Called during envelope build for each successful repetition. Adds
1082
+ * `evaluationInstance`, optional `trajectoryInstances` / `harnessMetrics`
1083
+ * when a suite reference exists, and Vertex-style `latencySeconds` / `failure`
1084
+ * flags derived from trajectory success.
1085
+ */
1086
/** Return `reference.steps`, or `undefined` when `reference` is nullish. */
function referenceSteps(reference) {
  if (reference === null || reference === undefined) {
    return undefined;
  }
  return reference.steps;
}
1090
/**
 * Add protojson interchange fields to a single repetition.
 *
 * Without a trajectory the repetition is copied with `failure: 1` and
 * nothing else. Otherwise the copy gains `evaluationInstance`,
 * `latencySeconds` (trajectory usage duration in seconds) and `failure`
 * (0 when the trajectory reports success, else 1). `trajectoryInstances`
 * and `harnessMetrics` are added only when the reference has at least one
 * step.
 *
 * @param repetition - Base repetition; not mutated.
 * @param options.prompt - Prompt text for the evaluation instance.
 * @param options.reference - Reference config (`steps`, `tool_name_mode`), if any.
 */
function enrichRepetitionWithProtojson(repetition, options = {}) {
  const { trajectory } = repetition;
  if (!trajectory) {
    return { ...repetition, failure: 1 };
  }

  const predicted = trajectory.toolCalls;
  const steps = referenceSteps(options.reference);
  const referenceToolNameMode = options.reference?.tool_name_mode ?? "harness";

  const evaluationInstance = toEvaluationInstance({
    prompt: options.prompt,
    response: trajectory.finalResponse
  });
  const enriched = {
    ...repetition,
    evaluationInstance,
    latencySeconds: trajectory.usage.durationMs / 1e3,
    failure: trajectory.success ? 0 : 1
  };

  const hasReference = Boolean(steps?.length);
  if (hasReference) {
    enriched.trajectoryInstances = toTrajectoryInstances({
      predicted,
      reference: steps,
      referenceToolNameMode
    });
    enriched.harnessMetrics = toHarnessMetrics(predicted, steps, { referenceToolNameMode });
  }
  return enriched;
}
1128
+ //#endregion
1129
+ //#region src/eval-record/build.ts
1130
+ /**
1131
+ * Build {@link EvalRunEnvelope} from harness-eval run and grading reports.
1132
+ *
1133
+ * This is the canonical export path from in-process or on-disk {@link SuiteReport}
1134
+ * JSON into the cross-harness eval record contract. It stitches together:
1135
+ *
1136
+ * - Behavioral assertion results from the runner
1137
+ * - Optional outcome grades from the LLM grader
1138
+ * - Vertex protojson interchange fields via {@link enrichRepetitionWithProtojson}
1139
+ * - Optional artifacts (transcript, raw stream-json) controlled by build options
1140
+ *
1141
+ * Downstream consumers include CI gates, databases, and the `harness-eval envelope`
1142
+ * CLI projection commands.
1143
+ */
1144
/**
 * Read `rawEvents` off an adapter result when it is present and is an array.
 *
 * @param adapterResult - Any value; only non-null objects are inspected.
 * @returns The `rawEvents` array, or `undefined` in every other case.
 */
function extractRawEvents(adapterResult) {
  if (adapterResult === null || typeof adapterResult !== "object") {
    return undefined;
  }
  if (!("rawEvents" in adapterResult)) {
    return undefined;
  }
  const { rawEvents } = adapterResult;
  return Array.isArray(rawEvents) ? rawEvents : undefined;
}
1153
/**
 * Cell-level outcome verdict from its graded repetitions.
 *
 * Repetitions without `outcomeGrades` are ignored. If none are graded the
 * result is `undefined`; otherwise it is `true` only when every graded
 * repetition has no grader error and zero failed expectations.
 *
 * @param _caseId - Unused.
 * @param _cellLabel - Unused.
 * @param repetitions - Repetitions of the cell.
 */
function outcomePassForCell(_caseId, _cellLabel, repetitions) {
  const graded = repetitions.filter((rep) => rep.outcomeGrades);
  if (graded.length === 0) {
    return undefined;
  }
  const anyFailure = graded.some(
    ({ outcomeGrades }) => outcomeGrades.error !== undefined || outcomeGrades.summary.failed !== 0
  );
  return !anyFailure;
}
1167
/**
 * Pick the judge metadata to record in the envelope.
 *
 * Order: an explicit `options.grading.judge`; else the judge derived from
 * the grading config at `options.gradingConfigPath` (any failure while
 * loading it is ignored); else `resolveJudgeInfo` for the `"claude-code"`
 * adapter.
 */
async function resolveEnvelopeJudge(options) {
  const explicitJudge = options.grading?.judge;
  if (explicitJudge) {
    return explicitJudge;
  }

  const { gradingConfigPath } = options;
  if (gradingConfigPath) {
    try {
      const gradingConfig = await loadGradingConfig(resolve(gradingConfigPath));
      return judgeInfoFromGradingConfig(gradingConfig);
    } catch {
      // Best-effort lookup: fall through to the default judge below.
    }
  }
  return resolveJudgeInfo({ adapter: "claude-code" });
}
1175
/**
 * Work out the path to hand to {@link loadSuite}.
 *
 * A path whose basename is `suite.yaml` maps to its containing directory.
 * Any other path is returned as an absolute path, whether or not it is a
 * directory or even exists (a failing `stat` is ignored).
 */
async function resolveSuiteLoadPath(suitePath) {
  const absolutePath = resolve(suitePath);
  if (basename(absolutePath) === "suite.yaml") {
    return dirname(absolutePath);
  }
  try {
    const stats = await stat(absolutePath);
    if (stats.isDirectory()) {
      return absolutePath;
    }
  } catch {
    // Missing or unreadable path: fall back to the absolute path as-is.
  }
  return absolutePath;
}
1184
/**
 * Read the suite YAML as UTF-8 text.
 *
 * When `suitePath` itself is named `suite.yaml` that file is read;
 * otherwise `suite.yaml` inside the resolved load path is read.
 */
async function readSuiteYamlContent(suitePath) {
  const loadPath = await resolveSuiteLoadPath(suitePath);
  const absolutePath = resolve(suitePath);
  const yamlPath = basename(absolutePath) === "suite.yaml" ? absolutePath : join(loadPath, "suite.yaml");
  return readFile(yamlPath, "utf8");
}
1189
/**
 * Decide which harness adapter name to record in the envelope.
 *
 * Order: an explicit `options.harnessAdapter`; else the `adapter` of the
 * suite loaded from `options.suitePath` (load failures are ignored); else
 * `"claude-code"`.
 */
async function resolveEnvelopeHarnessAdapter(options) {
  const { harnessAdapter, suitePath } = options;
  if (harnessAdapter) {
    return harnessAdapter;
  }
  if (suitePath) {
    try {
      const loadPath = await resolveSuiteLoadPath(suitePath);
      const suite = await loadSuite(loadPath);
      if (suite.adapter) {
        return suite.adapter;
      }
    } catch {
      // Best-effort: an unloadable suite falls through to the default.
    }
  }
  return "claude-code";
}
1197
/**
 * Turn a {@link SuiteReport} (plus optional grading) into an
 * {@link EvalRunEnvelope} with `schemaVersion: "1.0"`.
 *
 * Per repetition:
 * - an errored repetition keeps only its index, duration, assertion results
 *   and the error's `message` / `diagnostics`, and is returned as-is;
 * - otherwise the adapter view becomes the trajectory, optional artifacts
 *   (transcript, raw stream events) are attached, a matching grading result
 *   (same case id, cell label and repetition index) becomes `outcomeGrades`,
 *   and the record is passed through {@link enrichRepetitionWithProtojson}.
 *
 * @param report - Runner output for one suite execution.
 * @param options - `runId`, `suite`, `harness`, `provenance`, `grading`,
 *   `includeTranscript` (on unless exactly `false`) and
 *   `includeRawStreamEvents` (off unless exactly `true`).
 * @returns The assembled envelope.
 */
function buildEvalRunEnvelope(report, options = {}) {
  const wantTranscript = options.includeTranscript !== false;
  const wantRawEvents = options.includeRawStreamEvents === true;
  const judge = options.grading?.judge ?? resolveJudgeInfo({ adapter: "claude-code" });

  const cells = report.cells.map((cell) => {
    const prompt = cell.prompt ?? "";
    const referenceConfig = cell.reference_trajectory;
    let referenceTrajectory;
    if (referenceConfig) {
      referenceTrajectory = toReferenceTrajectory(referenceConfig.steps, referenceConfig.tool_name_mode ?? "harness");
    }

    const repetitions = cell.repetitions.map((rep) => {
      const record = {
        repetitionIndex: rep.repetitionIndex,
        durationMs: rep.durationMs,
        assertionResults: rep.assertionResults
      };

      // Errored repetitions stop here: no trajectory, grades, or enrichment.
      if (rep.error) {
        const { message, diagnostics } = rep.error;
        record.error = { message, diagnostics };
        return record;
      }

      const { adapterResult } = rep;
      if (adapterResult) {
        record.trajectory = { ...adapterResult.view, schemaVersion: "1.0" };
        record.diagnostics = adapterResult.diagnostics;

        const artifacts = {};
        if (wantTranscript) {
          artifacts.transcript = trajectoryToTranscript(adapterResult.view, prompt);
        }
        if (wantRawEvents) {
          const rawEvents = extractRawEvents(adapterResult);
          if (rawEvents) artifacts.rawStreamEvents = rawEvents;
        }
        // Only attach the artifacts object when something was collected.
        if (Object.keys(artifacts).length > 0) {
          record.artifacts = artifacts;
        }
      }

      const grade = options.grading?.results.find(
        (candidate) =>
          candidate.caseId === cell.caseId &&
          candidate.cellLabel === cell.cell.label &&
          candidate.repetitionIndex === rep.repetitionIndex
      );
      if (grade) {
        record.outcomeGrades = {
          judge,
          expectations: grade.expectations,
          summary: grade.summary,
          evalFeedback: grade.evalFeedback,
          error: grade.graderError
        };
      }

      return enrichRepetitionWithProtojson(record, { prompt, reference: referenceConfig });
    });

    return {
      caseId: cell.caseId,
      category: cell.category,
      notes: cell.notes,
      prompt: cell.prompt,
      expectations: cell.expectations,
      referenceTrajectory,
      humanRatings: cell.human_ratings,
      cellLabel: cell.cell.label,
      axes: cell.cell.axes,
      assertionStats: cell.assertionStats,
      adapterErrors: cell.adapterErrors,
      behavioralPass: cell.passed,
      outcomePass: outcomePassForCell(cell.caseId, cell.cell.label, repetitions),
      repetitions
    };
  });

  let cellsPassed = 0;
  for (const builtCell of cells) {
    if (builtCell.behavioralPass) cellsPassed += 1;
  }

  // Suite-level outcome is only defined when at least one cell was graded.
  const gradedCells = cells.filter((builtCell) => builtCell.outcomePass !== undefined);
  let outcomePass;
  if (gradedCells.length > 0) {
    outcomePass = gradedCells.every((builtCell) => builtCell.outcomePass === true);
  }

  const harnessOptions = options.harness;
  return {
    schemaVersion: "1.0",
    runId: options.runId ?? randomUUID(),
    startedAt: report.startedAt,
    durationMs: report.durationMs,
    suite: options.suite,
    harness: {
      adapter: harnessOptions?.adapter ?? "claude-code",
      frameworkVersion: harnessOptions?.frameworkVersion,
      harnessVersion: harnessOptions?.harnessVersion
    },
    provenance: options.provenance,
    summary: {
      cellsTotal: cells.length,
      cellsPassed,
      behavioralPass: cellsPassed === cells.length,
      outcomePass
    },
    cells
  };
}
1294
/**
 * Build an envelope from JSON artifacts on disk.
 *
 * Steps, in order:
 * 1. Read and parse `reportPath`.
 * 2. Resolve the harness adapter from `options.harness.adapter` /
 *    `options.suitePath`.
 * 3. If `options.gradingPath` is set, read it and use its `gradedAt`,
 *    `sourceReport`, `results`, and judge (the recorded `judge`, else one
 *    resolved from its `gradingConfigPath`) in place of `options.grading`.
 * 4. If `options.suitePath` is set, add `uri` and a SHA-256 hex
 *    `contentHash` of the suite YAML to `options.suite`.
 * 5. Delegate to {@link buildEvalRunEnvelope}.
 *
 * @param reportPath - Path to the suite run report JSON.
 * @param options - Build options plus `gradingPath` / `suitePath`.
 */
async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
  const report = JSON.parse(await readFile(reportPath, "utf8"));

  const adapter = await resolveEnvelopeHarnessAdapter({
    harnessAdapter: options.harness?.adapter,
    suitePath: options.suitePath
  });

  let grading = options.grading;
  if (options.gradingPath) {
    const gradingFile = JSON.parse(await readFile(options.gradingPath, "utf8"));
    const judge =
      gradingFile.judge ?? (await resolveEnvelopeJudge({ gradingConfigPath: gradingFile.gradingConfigPath }));
    const { gradedAt, sourceReport, results } = gradingFile;
    grading = { gradedAt, sourceReport, results, judge };
  }

  let suite = options.suite;
  if (options.suitePath) {
    const yamlText = await readSuiteYamlContent(options.suitePath);
    const contentHash = createHash("sha256").update(yamlText).digest("hex");
    suite = { ...suite, uri: options.suitePath, contentHash };
  }

  const harness = { ...options.harness, adapter };
  return buildEvalRunEnvelope(report, { ...options, suite, grading, harness });
}
1342
+ //#endregion
1343
+ //#region src/eval-interchange/projections.ts
1344
/**
 * Keys of the trajectory instances block, in the fixed order in which
 * instance rows are emitted.
 */
const TRAJECTORY_INSTANCE_KEYS = ["exactMatch", "inOrderMatch", "anyOrderMatch", "precision", "recall", "singleToolUse"];
1353
/**
 * Project one repetition of a cell onto a flat dataset row.
 *
 * `latencySeconds` falls back to `durationMs / 1000` and `failure` falls
 * back to the trajectory's success flag (missing trajectory → 1) when the
 * repetition does not carry those fields itself.
 */
function repetitionToDatasetRow(cell, repetition) {
  const { evaluationInstance } = repetition;
  const fallbackFailure = () => (repetition.trajectory?.success ? 0 : 1);
  return {
    caseId: cell.caseId,
    repetitionIndex: repetition.repetitionIndex,
    prompt: cell.prompt,
    response: evaluationInstance?.response?.text,
    evaluationInstance,
    latencySeconds: repetition.latencySeconds ?? repetition.durationMs / 1e3,
    failure: repetition.failure ?? fallbackFailure(),
    humanRatings: cell.humanRatings
  };
}
1371
/**
 * Expand a repetition's `trajectoryInstances` into one tagged row per
 * instance, in {@link TRAJECTORY_INSTANCE_KEYS} order.
 *
 * Keys with no instance are skipped; a repetition with no
 * `trajectoryInstances` yields an empty array.
 */
function repetitionToInstanceRows(cell, repetition) {
  const instances = repetition.trajectoryInstances;
  if (!instances) {
    return [];
  }
  return TRAJECTORY_INSTANCE_KEYS.flatMap((key) => {
    const instance = instances[key];
    if (!instance) {
      return [];
    }
    return [
      {
        messageType: trajectoryInstanceMessageType(key),
        caseId: cell.caseId,
        repetitionIndex: repetition.repetitionIndex,
        instance
      }
    ];
  });
}
1392
/**
 * Trajectory projection: one dataset row for every repetition of every
 * cell, in envelope order.
 */
function toTrajectory(envelope) {
  return envelope.cells.flatMap((cell) =>
    cell.repetitions.map((repetition) => repetitionToDatasetRow(cell, repetition))
  );
}
1400
/**
 * Instances projection: the instance rows of every repetition of every
 * cell, concatenated in envelope order.
 */
function toInstancesJsonl(envelope) {
  return envelope.cells.flatMap((cell) =>
    cell.repetitions.flatMap((repetition) => repetitionToInstanceRows(cell, repetition))
  );
}
1408
+ //#endregion
1409
+ //#region src/pipeline/resolve-inputs.ts
1410
+ /**
1411
+ * Resolve pipeline step inputs and outputs with precedence rules.
1412
+ *
1413
+ * Precedence: CLI override > explicit YAML > prior step in this run > default path on disk > error.
1414
+ */
1415
/**
 * Resolve the input/output paths of each enabled pipeline step.
 *
 * A step is resolved only when it is listed in `steps` and configured in
 * `pipeline`. Step outputs are resolved against `suiteDir`, with overrides
 * taking precedence over the pipeline config. Report and grading inputs are
 * delegated to {@link resolveReportPath} /
 * {@link resolveOptionalGradingPath}, which prefer an explicit path, then
 * the output of a step already executed in this run, then the default
 * output path on disk.
 *
 * @param options - `suitePath`, `suiteDir`, `pipeline`, `steps`, optional
 *   `overrides`, and optional `executed` (outputs of steps already run).
 * @returns An object with `suitePath` plus `run` / `grade` / `envelope`
 *   entries for the enabled steps.
 */
async function resolvePipelineInputs(options) {
  const { suitePath, suiteDir, pipeline, steps, overrides } = options;
  const executed = options.executed ?? {};
  const enabled = new Set(steps);
  const inSuiteDir = (target) => resolve(suiteDir, target);

  const resolved = { suitePath: resolve(suitePath) };
  const defaultRunOutput = inSuiteDir(pipeline.run?.output ?? DEFAULT_PIPELINE_OUTPUTS.run);
  const defaultGradeOutput = inSuiteDir(pipeline.grade?.output ?? DEFAULT_PIPELINE_OUTPUTS.grade);

  const runConfig = pipeline.run;
  if (enabled.has("run") && runConfig) {
    const runOverrides = overrides?.run;
    resolved.run = {
      output: inSuiteDir(runOverrides?.output ?? runConfig.output),
      maxConcurrent: runOverrides?.maxConcurrent ?? runConfig.maxConcurrent
    };
  }

  const gradeConfig = pipeline.grade;
  if (enabled.has("grade") && gradeConfig) {
    const gradeOverrides = overrides?.grade;
    const input = await resolveReportPath({
      explicit: gradeOverrides?.input ?? gradeConfig.input,
      executedOutput: executed.run?.output,
      defaultPath: defaultRunOutput,
      label: "grade input (report)"
    });
    resolved.grade = {
      input,
      output: inSuiteDir(gradeOverrides?.output ?? gradeConfig.output),
      maxConcurrent: gradeOverrides?.maxConcurrent ?? gradeConfig.maxConcurrent
    };
  }

  const envelopeConfig = pipeline.envelope;
  if (enabled.has("envelope") && envelopeConfig) {
    const envelopeOverrides = overrides?.envelope;
    const report = await resolveReportPath({
      explicit: envelopeOverrides?.report ?? envelopeConfig.report,
      executedOutput: executed.run?.output,
      defaultPath: defaultRunOutput,
      label: "envelope report"
    });
    const grading = await resolveOptionalGradingPath({
      explicit: envelopeOverrides?.grading ?? envelopeConfig.grading,
      executedOutput: executed.grade?.output,
      defaultPath: defaultGradeOutput
    });
    resolved.envelope = {
      report,
      grading,
      output: inSuiteDir(envelopeOverrides?.output ?? envelopeConfig.output),
      projection: envelopeOverrides?.projection ?? envelopeConfig.projection ?? "envelope",
      includeRawStreamEvents: envelopeConfig.includeRawStreamEvents ?? false,
      noTranscript: envelopeConfig.noTranscript ?? false
    };
  }

  return resolved;
}
1456
/**
 * Resolve a required report path.
 *
 * Tries, in order: `options.explicit`, `options.executedOutput` (both made
 * absolute), then `options.defaultPath` if it exists on disk.
 *
 * @throws {ConfigError} When none of the three candidates applies.
 */
async function resolveReportPath(options) {
  const { explicit, executedOutput, defaultPath, label } = options;
  if (explicit) {
    return resolve(explicit);
  }
  if (executedOutput) {
    return resolve(executedOutput);
  }
  const defaultExists = await pathExists(defaultPath);
  if (!defaultExists) {
    throw new ConfigError(`pipeline: could not resolve ${label}; specify an explicit path or run the run step first`, defaultPath);
  }
  return defaultPath;
}
1466
/**
 * Resolve an optional grading path.
 *
 * Tries `options.explicit`, then `options.executedOutput` (both made
 * absolute), then `options.defaultPath` if it exists on disk. Resolves to
 * `undefined` when none applies.
 */
async function resolveOptionalGradingPath(options) {
  const { explicit, executedOutput, defaultPath } = options;
  if (explicit) {
    return resolve(explicit);
  }
  if (executedOutput) {
    return resolve(executedOutput);
  }
  const defaultExists = await pathExists(defaultPath);
  return defaultExists ? defaultPath : undefined;
}
1472
/**
 * Resolve to `true` when `stat` succeeds for `filePath`, and to `false`
 * when it fails for any reason.
 */
async function pathExists(filePath) {
  let exists = true;
  try {
    await stat(filePath);
  } catch {
    exists = false;
  }
  return exists;
}
1480
/**
 * Find a grading artifact path from a suite document's `pipeline` block.
 *
 * Candidates, in order: `pipeline.envelope.grading`, then
 * `pipeline.grade.output`. The first one that is set and exists on disk is
 * returned exactly as written in the document. Resolves to `undefined` when
 * the suite document cannot be loaded, has no `pipeline`, or no candidate
 * exists.
 *
 * NOTE(review): candidates are checked as given, not resolved against the
 * suite directory — confirm whether `loadSuiteDocument` already returns
 * usable paths.
 */
async function resolveGradingArtifactFromSuite(suitePath) {
  let doc;
  try {
    doc = await loadSuiteDocument(suitePath);
  } catch {
    return undefined;
  }
  const { pipeline } = doc;
  if (!pipeline) {
    return undefined;
  }

  const candidates = [pipeline.envelope?.grading, pipeline.grade?.output];
  for (const candidate of candidates) {
    if (candidate && (await pathExists(candidate))) {
      return candidate;
    }
  }
  return undefined;
}
1499
/**
 * Turn a `--steps run,grade,envelope` argument into the list of steps to execute.
 *
 * Only steps present in the `pipeline` block count as configured. Without
 * `stepsArg` every configured step is returned. The result always follows the
 * canonical run → grade → envelope order, regardless of the order requested.
 *
 * @throws {ConfigError} When nothing is configured, or a requested step is
 *   unknown or not configured.
 */
function parsePipelineSteps(pipeline, stepsArg) {
  const canonicalOrder = ["run", "grade", "envelope"];
  const configured = canonicalOrder.filter((name) => pipeline[name] !== void 0);
  if (configured.length === 0) throw new ConfigError("pipeline block has no steps configured");
  if (!stepsArg) return configured;
  const wanted = new Set();
  for (const piece of stepsArg.split(",")) {
    const step = piece.trim();
    if (step === "") continue;
    if (!canonicalOrder.includes(step)) throw new ConfigError(`unknown pipeline step "${step}"; valid steps are: run, grade, envelope`);
    if (!configured.includes(step)) throw new ConfigError(`pipeline step "${step}" is not configured in suite.yaml`);
    wanted.add(step);
  }
  return configured.filter((step) => wanted.has(step));
}
1520
/** Absolute directory that contains the given suite file. */
function suiteDirectoryFromPath(suitePath) {
  const absoluteSuitePath = resolve(suitePath);
  return dirname(absoluteSuitePath);
}
1524
+ //#endregion
1525
+ //#region src/cli/args.ts
1526
/**
 * Parse process argv into command, positional args, and options.
 *
 * Rules:
 * - The first token is the command unless it starts with `-`.
 * - `--` ends option parsing; everything after it is positional.
 * - `--key value` and `-k value` store a string; a flag followed by another
 *   flag (or by nothing) stores `true`.
 * - `--key=value` stores `value` under `key`. Previously this form was recorded
 *   as a boolean flag literally named `key=value`, so the value was lost.
 *
 * @param {string[]} argv Arguments after the executable and script name.
 * @returns {{ command: string | undefined, positional: string[], options: Record<string, string | true> }}
 */
function parseArgs(argv) {
  const positional = [];
  const options = {};
  const args = [...argv];
  let command;
  if (args.length > 0 && !args[0].startsWith("-")) command = args.shift();
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--") {
      positional.push(...args.slice(i + 1));
      break;
    }
    const isLong = arg.startsWith("--");
    const isShort = !isLong && arg.startsWith("-") && arg.length === 2;
    if (!isLong && !isShort) {
      positional.push(arg);
      continue;
    }
    const key = arg.slice(isLong ? 2 : 1);
    // Inline values are a long-option convention only; a leading "=" is kept as part of the key.
    const eq = isLong ? key.indexOf("=") : -1;
    if (eq > 0) {
      options[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }
    const next = args[i + 1];
    if (next && !next.startsWith("-")) {
      options[key] = next;
      i++;
    } else {
      options[key] = true;
    }
  }
  return {
    command,
    positional,
    options
  };
}
1561
/** Look up `name` and return it only when it holds a string; flags and missing keys give `undefined`. */
function getOption(options, name) {
  const raw = options[name];
  if (typeof raw !== "string") return void 0;
  return raw;
}
1566
/**
 * Read option `name` as a base-10 integer.
 *
 * `defaultValue` is returned when the option is absent, is a boolean flag, or
 * does not parse to a finite number.
 */
function getOptionInt(options, name, defaultValue) {
  const raw = getOption(options, name);
  const parsed = raw === void 0 ? Number.NaN : Number.parseInt(raw, 10);
  return Number.isFinite(parsed) ? parsed : defaultValue;
}
1574
/** Whether `name` was given as a bare flag (`true`) or with the literal string value `"true"`. */
function hasOption(options, name) {
  const raw = options[name];
  if (raw === true) return true;
  return raw === "true";
}
1579
+ //#endregion
1580
+ //#region src/cli/commands/envelope.ts
1581
+ /**
1582
+ * `harness-eval envelope` — build EvalRunEnvelope and interchange projections.
1583
+ *
1584
+ * Reads a suite run report (and optional grading JSON), builds a versioned
1585
+ * {@link EvalRunEnvelope}, and serializes one of three projections:
1586
+ *
1587
+ * - `envelope` — full nested JSON document (default)
1588
+ * - `trajectory` — JSONL of {@link EvalDatasetRow} per repetition
1589
+ * - `instances` — JSONL of {@link InstancesJsonlRow} for Vertex batch upload
1590
+ *
1591
+ * Exit code 0 when behavioral pass, 1 when any cell failed assertions.
1592
+ */
1593
/** Projection names accepted by `--projection`. */
const PROJECTIONS = /* @__PURE__ */ new Set(["envelope", "trajectory", "instances"]);
/**
 * Validate the `--projection` CLI value.
 *
 * @returns `"envelope"` when the flag is omitted, the value itself when it is a
 *   known projection, and `undefined` for anything else.
 */
function parseEnvelopeProjection(value) {
  if (value === void 0) return "envelope";
  return PROJECTIONS.has(value) ? value : void 0;
}
1607
/**
 * Render an envelope as text for the requested projection.
 *
 * `trajectory` and `instances` produce NDJSON (one JSON object per line, with a
 * trailing newline); every other projection value produces the whole envelope
 * as 2-space-indented JSON followed by a newline.
 */
function serializeEnvelopeProjection(envelope, projection) {
  const asNdjson = (rows) => `${rows.map((row) => JSON.stringify(row)).join("\n")}\n`;
  if (projection === "trajectory") return asNdjson(toTrajectory(envelope));
  if (projection === "instances") return asNdjson(toInstancesJsonl(envelope));
  return `${JSON.stringify(envelope, null, 2)}\n`;
}
1619
/**
 * Read the harness-eval package version for envelope `harness.frameworkVersion`.
 *
 * The original fixed path `../../../package.json` matches the
 * `src/cli/commands/` source layout but not this bundled chunk, where it points
 * above the package root. Instead, walk upward from this module's directory and
 * use the nearest `package.json` that declares a string `version`.
 *
 * @returns {Promise<string | undefined>} The version, or `undefined` when no
 *   such manifest is found before the filesystem root.
 */
async function readFrameworkVersion() {
  let dir;
  try {
    dir = dirname(fileURLToPath(import.meta.url));
  } catch {
    // Non-file module URL: there is no directory to search from.
    return void 0;
  }
  for (;;) {
    try {
      const manifest = JSON.parse(await readFile(join(dir, "package.json"), "utf8"));
      if (typeof manifest?.version === "string") return manifest.version;
    } catch {
      // No readable manifest at this level; keep walking up.
    }
    const parent = dirname(dir);
    // dirname() is a fixed point at the filesystem root, which ends the walk.
    if (parent === dir) return void 0;
    dir = parent;
  }
}
1628
/**
 * CLI entry point for the `envelope` subcommand.
 *
 * Reads the report named by the first positional argument, optionally joins a
 * grading artifact (`--grading`, or discovered through `--suite`), and writes
 * the chosen projection to `--output` or stdout.
 *
 * Serializing and writing the output now sit inside the same error path as
 * building the envelope, so an unwritable `--output` yields exit code 2, as the
 * contract below states, instead of a rejected promise.
 *
 * @returns Process exit code: 0 on behavioral pass, 1 on failure, 2 on usage/error.
 */
async function envelopeCommand(args) {
  const reportPath = args.positional[0];
  if (!reportPath) {
    console.error("usage: harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances] [--include-raw-stream-events] [--no-transcript]");
    return 2;
  }
  // Validate cheap arguments before touching the filesystem.
  const projection = parseEnvelopeProjection(getOption(args.options, "projection"));
  if (!projection) {
    console.error("invalid --projection; expected envelope, trajectory, or instances");
    return 2;
  }
  const outputPath = getOption(args.options, "output");
  const suitePath = getOption(args.options, "suite");
  let gradingPath = getOption(args.options, "grading");
  if (!gradingPath && suitePath) gradingPath = await resolveGradingArtifactFromSuite(suitePath);
  try {
    const frameworkVersion = await readFrameworkVersion();
    const envelope = await buildEvalRunEnvelopeFromFiles(reportPath, {
      gradingPath,
      suitePath,
      includeTranscript: !hasOption(args.options, "no-transcript"),
      includeRawStreamEvents: hasOption(args.options, "include-raw-stream-events"),
      harness: { frameworkVersion }
    });
    const serialized = serializeEnvelopeProjection(envelope, projection);
    if (outputPath) await writeFile(outputPath, serialized, "utf8");
    else process.stdout.write(serialized);
    return envelope.summary.behavioralPass ? 0 : 1;
  } catch (err) {
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  }
}
1667
+ //#endregion
1668
+ //#region src/pipeline/run-pipeline.ts
1669
+ /**
1670
+ * Orchestrate run → grade → envelope pipeline steps.
1671
+ */
1672
/**
 * Run the configured pipeline steps in order, stopping at the first failure.
 *
 * Inputs are re-resolved before each step so a step can consume the output
 * recorded by an earlier one. A step whose resolved inputs are missing is
 * skipped. The result lists the steps up to and including the failing one, or
 * all selected steps with exit code 0 when none failed.
 *
 * @throws {ConfigError} When the document has no pipeline block, or the grade
 *   step is selected without an inline `judge:` block.
 */
async function runPipeline(doc, options = {}) {
  if (!doc.pipeline) throw new ConfigError("suite document has no pipeline block", doc.suitePath);
  const steps = parsePipelineSteps(doc.pipeline, options.steps);
  const suiteDir = suiteDirectoryFromPath(doc.suitePath);
  const executed = {};
  let runReport;
  // Failure result covering the first `count` steps (step names are unique).
  const failedAfter = (count) => ({
    exitCode: 1,
    stepsRun: steps.slice(0, count),
    runReport
  });
  for (const [position, step] of steps.entries()) {
    const resolved = await resolvePipelineInputs({
      suitePath: doc.suitePath,
      suiteDir,
      pipeline: doc.pipeline,
      steps: [step],
      executed,
      overrides: options.overrides
    });
    switch (step) {
      case "run": {
        const runInputs = resolved.run;
        if (!runInputs) break;
        const adapter = getAdapter(doc.suite.adapter ?? "claude-code");
        runReport = await runSuite(doc.suite, {
          adapter,
          maxConcurrent: runInputs.maxConcurrent ?? options.maxConcurrent ?? 4,
          onProgress: options.onRunProgress
        });
        await writeFile(runInputs.output, JSON.stringify(runReport, null, 2), "utf8");
        executed.run = { output: runInputs.output };
        const everyCellPassed = runReport.cells.every((cell) => cell.passed);
        if (!everyCellPassed) return failedAfter(position + 1);
        break;
      }
      case "grade": {
        const gradeInputs = resolved.grade;
        if (!gradeInputs) break;
        if (!doc.judge) throw new ConfigError("grade step requires inline judge: block in suite.yaml", doc.suitePath);
        const gradeOptions = resolveGradeOptions({ judge: doc.judge }, {
          sourceReport: gradeInputs.input,
          maxConcurrent: gradeInputs.maxConcurrent
        }, doc.suitePath);
        const sourceReport = await loadSuiteReport(gradeInputs.input);
        const grading = await gradeReport(sourceReport, {
          ...gradeOptions,
          onProgress: options.onGradeProgress
        });
        await writeFile(gradeInputs.output, JSON.stringify(grading, null, 2), "utf8");
        executed.grade = {
          input: gradeInputs.input,
          output: gradeInputs.output
        };
        if (!gradingReportPassed(grading)) return failedAfter(position + 1);
        break;
      }
      case "envelope": {
        const envelopeInputs = resolved.envelope;
        if (!envelopeInputs) break;
        const envelope = await buildEvalRunEnvelopeFromFiles(envelopeInputs.report, {
          gradingPath: envelopeInputs.grading,
          suitePath: doc.suitePath,
          includeTranscript: !envelopeInputs.noTranscript,
          includeRawStreamEvents: envelopeInputs.includeRawStreamEvents,
          harness: { frameworkVersion: options.frameworkVersion }
        });
        const serialized = serializeEnvelopeProjection(envelope, envelopeInputs.projection);
        await writeFile(envelopeInputs.output, serialized, "utf8");
        const { behavioralPass, outcomePass } = envelope.summary;
        // An absent outcome verdict is not a failure; only an explicit falsy one is.
        const outcomeFailed = outcomePass !== void 0 && !outcomePass;
        if (!behavioralPass || outcomeFailed) return failedAfter(position + 1);
        break;
      }
    }
  }
  return {
    exitCode: 0,
    stepsRun: steps,
    runReport
  };
}
1753
+ //#endregion
1754
+ //#region src/otel/attributes.ts
1755
/** OTLP key/value pair whose value is carried as `stringValue`. */
function strAttr(key, value) {
  const wrapped = { stringValue: value };
  return { key, value: wrapped };
}
1762
/** OTLP key/value pair whose value is carried as `intValue`, stringified with `String`. */
function intAttr(key, value) {
  const wrapped = { intValue: String(value) };
  return { key, value: wrapped };
}
1769
/** OTLP key/value pair whose value is carried as `boolValue`. */
function boolAttr(key, value) {
  const wrapped = { boolValue: value };
  return { key, value: wrapped };
}
1776
/** OTLP string attribute holding `JSON.stringify(value)`; used for structured payloads such as message arrays. */
function jsonAttr(key, value) {
  const encoded = JSON.stringify(value);
  return { key, value: { stringValue: encoded } };
}
1783
+ //#endregion
1784
+ //#region src/otel/messages.ts
1785
/**
 * Translate a harness stop reason into a GenAI semconv `finish_reason`.
 *
 * Falsy input gives `undefined`. Reasons without a mapping are returned
 * unchanged so newer harness values still flow through.
 */
function mapStopReason(reason) {
  if (!reason) return void 0;
  if (reason === "end_turn" || reason === "stop_sequence") return "stop";
  if (reason === "tool_use") return "tool_calls";
  if (reason === "max_tokens") return "length";
  return reason;
}
1800
/** Message part describing a tool invocation; missing arguments become an empty object. */
function toolCallPart(call) {
  const { callId, name, args } = call;
  return {
    type: "tool_call",
    id: callId,
    name,
    arguments: args ?? {}
  };
}
1809
/** Message part carrying the result recorded for a tool invocation. */
function toolResponsePart(call) {
  const { callId, result } = call;
  return {
    type: "tool_call_response",
    id: callId,
    result
  };
}
1817
/**
 * Build the GenAI semconv assistant message for one turn: an optional text
 * part, then one tool_call part per tool call, plus `finish_reason` when the
 * turn's stop reason maps to one.
 */
function assistantMessageFromTurn(turn) {
  const parts = turn.text ? [{ type: "text", content: turn.text }] : [];
  for (const invocation of turn.toolCalls) {
    parts.push(toolCallPart(invocation));
  }
  const message = { role: "assistant", parts };
  const finishReason = mapStopReason(turn.stopReason);
  if (finishReason) message.finish_reason = finishReason;
  return message;
}
1832
/**
 * Collect the results of a turn's tool calls into one tool-role message.
 * Calls whose `result` is `null` are left out; with nothing left the function
 * returns `null`.
 */
function toolResultsMessage(calls) {
  const parts = calls.flatMap((call) => (call.result !== null ? [toolResponsePart(call)] : []));
  return parts.length === 0 ? null : { role: "tool", parts };
}
1841
/**
 * Reconstruct the conversation visible to the model before the assistant turn
 * at position `turnIndex` in `view.turns`: the user prompt (when given),
 * followed by each earlier assistant message and its tool results, if any.
 */
function inputMessagesBeforeTurn(view, turnIndex, prompt) {
  const history = prompt ? [{ role: "user", parts: [{ type: "text", content: prompt }] }] : [];
  for (let position = 0; position < turnIndex; position++) {
    const earlier = view.turns[position];
    if (earlier) {
      history.push(assistantMessageFromTurn(earlier));
      const results = toolResultsMessage(earlier.toolCalls);
      if (results) history.push(results);
    }
  }
  return history;
}
1862
+ //#endregion
1863
+ //#region src/otel/types.ts
1864
/**
 * OTLP span kinds (enum integers) used by this exporter.
 *
 * Values follow the OTLP `Span.SpanKind` protobuf enum, where INTERNAL = 1,
 * SERVER = 2 and CLIENT = 3. CLIENT was previously 2, which is SERVER on the
 * wire, so chat spans were exported with the wrong kind.
 */
const SpanKind = {
  INTERNAL: 1,
  CLIENT: 3
};
/** OTLP status codes (`Status.StatusCode` protobuf enum). */
const StatusCode = {
  UNSET: 0,
  OK: 1,
  ERROR: 2
};
1875
+ //#endregion
1876
+ //#region src/otel/emitter.ts
1877
+ /**
1878
+ * TrajectoryView → OTLP JSON export using OpenTelemetry GenAI semantic conventions.
1879
+ *
1880
+ * Produces an `ExportTraceServiceRequest` suitable for OTLP/HTTP JSON ingestion.
1881
+ * Assertions continue to use {@link TrajectoryView} directly; this is export-only.
1882
+ */
1883
/** Version reported in the instrumentation scope of every export. */
const INSTRUMENTATION_VERSION = "0.1.0";
/**
 * Convert a {@link TrajectoryView} into an OTLP trace export.
 *
 * One root `invoke_agent` span is emitted, followed by a `chat {model}` span per
 * turn and an `execute_tool {name}` span per tool call. All child spans are
 * direct children of the root (siblings, not nested):
 * ```
 * invoke_agent
 * ├── chat {model}
 * ├── execute_tool {name}
 * ├── chat {model}
 * └── ...
 * ```
 * Times are synthetic: the session ends at `options.endTimeMs` (default: now)
 * and starts `view.usage.durationMs` (at least 1 ms) earlier. Tool calls of one
 * turn share a single time slot.
 */
function trajectoryToOtlp(view, options = {}) {
  const agentName = options.agentName ?? "claude-code";
  const providerName = options.providerName ?? "anthropic";
  const serviceName = options.serviceName ?? "harness-eval";
  const scopeName = options.instrumentationScope ?? "@alis-build/harness-eval";
  const traceId = traceIdFromSession(view.meta.sessionId);
  const rootSpanId = spanIdFromKey(traceId, "invoke_agent");
  const endMs = options.endTimeMs ?? Date.now();
  const startMs = endMs - Math.max(view.usage.durationMs, 1);
  const timings = buildSpanTimings(view, startMs, endMs);

  // Span for one model call; carries prior history as input and this turn as output.
  const chatSpan = (turn, timing) => {
    const inputMessages = inputMessagesBeforeTurn(view, turn.turnIndex, options.prompt);
    const outputMessages = [assistantMessageFromTurn(turn)];
    const attributes = [
      strAttr("gen_ai.operation.name", "chat"),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.request.model", view.meta.model),
      strAttr("gen_ai.response.model", view.meta.model)
    ];
    if (inputMessages.length > 0) attributes.push(jsonAttr("gen_ai.input.messages", inputMessages));
    attributes.push(jsonAttr("gen_ai.output.messages", outputMessages));
    if (turn.stopReason) {
      attributes.push(jsonAttr("gen_ai.response.finish_reasons", [mapStopReason(turn.stopReason) ?? turn.stopReason]));
    }
    return {
      traceId,
      spanId: spanIdFromKey(traceId, `chat:${turn.turnIndex}`),
      parentSpanId: rootSpanId,
      name: `chat ${view.meta.model}`,
      kind: SpanKind.CLIENT,
      startTimeUnixNano: timing.startNs,
      endTimeUnixNano: timing.endNs,
      attributes,
      status: { code: StatusCode.OK }
    };
  };

  // Span for one tool execution; its status mirrors the call's error flag.
  const toolSpan = (call, timing) => {
    const attributes = [
      strAttr("gen_ai.operation.name", "execute_tool"),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.tool.name", call.name),
      strAttr("gen_ai.tool.call.id", call.callId),
      jsonAttr("gen_ai.tool.call.arguments", call.args ?? {})
    ];
    if (call.result !== null) attributes.push(jsonAttr("gen_ai.tool.call.result", call.result));
    if (call.namespace) attributes.push(strAttr("harness_eval.tool.namespace", call.namespace));
    attributes.push(boolAttr("harness_eval.tool.is_error", call.isError));
    return {
      traceId,
      spanId: spanIdFromKey(traceId, `tool:${call.callId}`),
      parentSpanId: rootSpanId,
      name: `execute_tool ${call.name}`,
      kind: SpanKind.INTERNAL,
      startTimeUnixNano: timing.startNs,
      endTimeUnixNano: timing.endNs,
      attributes,
      status: call.isError
        ? { code: StatusCode.ERROR, message: "tool reported error" }
        : { code: StatusCode.OK }
    };
  };

  const spans = [{
    traceId,
    spanId: rootSpanId,
    name: "invoke_agent",
    kind: SpanKind.INTERNAL,
    startTimeUnixNano: msToNs(startMs),
    endTimeUnixNano: msToNs(endMs),
    attributes: [
      strAttr("gen_ai.operation.name", "invoke_agent"),
      strAttr("gen_ai.agent.name", agentName),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.conversation.id", view.meta.sessionId),
      strAttr("gen_ai.request.model", view.meta.model),
      strAttr("gen_ai.response.model", view.meta.model),
      intAttr("gen_ai.usage.input_tokens", view.usage.inputTokens),
      intAttr("gen_ai.usage.output_tokens", view.usage.outputTokens),
      boolAttr("harness_eval.success", view.success)
    ],
    status: viewStatus(view)
  }];

  // Timing slots are consumed in order: one per chat, plus one per turn that has tool calls.
  let slot = 0;
  for (const turn of view.turns) {
    const chatTiming = timings[slot];
    slot += 1;
    spans.push(chatSpan(turn, chatTiming));
    if (turn.toolCalls.length === 0) continue;
    const toolTiming = timings[slot];
    slot += 1;
    for (const call of turn.toolCalls) {
      spans.push(toolSpan(call, toolTiming));
    }
  }

  const resource = { attributes: [strAttr("service.name", serviceName), strAttr("gen_ai.agent.name", agentName)] };
  const scope = {
    name: scopeName,
    version: INSTRUMENTATION_VERSION
  };
  return { resourceSpans: [{
    resource,
    scopeSpans: [{
      scope,
      spans
    }]
  }] };
}
/** Second name for {@link trajectoryToOtlp}, kept to match the implementation plan naming. */
const emitOtel = trajectoryToOtlp;
1997
/** Status for the root invoke_agent span: OK when `view.success` is truthy, otherwise ERROR with a fixed message. */
function viewStatus(view) {
  if (!view.success) {
    return {
      code: StatusCode.ERROR,
      message: "harness run did not complete successfully"
    };
  }
  return { code: StatusCode.OK };
}
2005
/**
 * Produce synthetic start/end timestamps for chat and tool spans.
 *
 * Per-turn wall times are not available from stream-json, so the session
 * window (at least 1 ms wide) is cut into equal slots: one per turn for the
 * chat, plus one more for each turn that made tool calls. Slots are returned
 * in that order as nanosecond strings.
 */
function buildSpanTimings(view, startMs, endMs) {
  let slotCount = 0;
  for (const turn of view.turns) {
    slotCount += turn.toolCalls.length > 0 ? 2 : 1;
  }
  if (slotCount === 0) return [];
  const slotMs = Math.max(endMs - startMs, 1) / slotCount;
  const timings = [];
  let cursor = startMs;
  for (let filled = 0; filled < slotCount; filled++) {
    const slotEnd = cursor + slotMs;
    timings.push({
      startNs: msToNs(cursor),
      endNs: msToNs(slotEnd)
    });
    cursor = slotEnd;
  }
  return timings;
}
2033
/**
 * Deterministic 128-bit trace id for a harness session.
 *
 * The id is the first 32 hex characters (upper-cased) of a SHA-256 over the
 * namespaced session id, so a given session always maps to the same trace.
 */
function traceIdFromSession(sessionId) {
  const hasher = createHash("sha256");
  hasher.update(`harness-eval:trace:${sessionId}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 32).toUpperCase();
}
2041
/**
 * Deterministic 64-bit span id: the first 16 hex characters (upper-cased) of a
 * SHA-256 over the trace id combined with a logical span key.
 */
function spanIdFromKey(traceId, key) {
  const hasher = createHash("sha256");
  hasher.update(`${traceId}:span:${key}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 16).toUpperCase();
}
2047
/** Epoch milliseconds to the decimal-string nanosecond timestamp OTLP expects, rounded to the nearest nanosecond. */
function msToNs(ms) {
  const nanos = Math.round(ms * 1e6);
  return String(nanos);
}
2051
+ //#endregion
2052
+ //#region src/reporter/format-console.ts
2053
const RESET = "\x1B[0m";
const GREEN = "\x1B[32m";
const RED = "\x1B[31m";
const YELLOW = "\x1B[33m";
/**
 * Render renderable rows as console text, one section per row: a PASS/FAIL
 * header, an optional category line, and one line per assertion with its rate,
 * threshold and (when present) baseline delta.
 *
 * @param color When false, emit plain text without ANSI escape codes.
 */
function formatConsole(rows, color = true) {
  // Wrap text in an ANSI color only when coloring is enabled.
  const paint = (code, text) => (color ? `${code}${text}${RESET}` : text);
  const asPercent = (fraction) => (fraction * 100).toFixed(0);
  const lines = [];
  for (const row of rows) {
    const status = row.passed ? paint(GREEN, "PASS") : paint(RED, "FAIL");
    let crashNote = "";
    if (row.adapterErrors > 0) crashNote = ` ${paint(YELLOW, `[${row.adapterErrors} adapter errors]`)}`;
    lines.push(`${row.caseId} @ ${row.cellLabel} ${status}${crashNote}`);
    if (row.category) lines.push(` category: ${row.category}`);
    for (const stat of row.stats) {
      const marker = stat.meetsThreshold ? paint(GREEN, "✓") : paint(RED, "✗");
      const rateStr = formatRate$1(stat);
      let line = ` ├─ ${stat.description}: ${rateStr} [threshold ${asPercent(stat.threshold)}%] ${marker}`;
      const hasBaseline = stat.delta !== void 0 && stat.baselinePassRate !== void 0;
      if (hasBaseline) {
        const arrow = stat.delta >= 0 ? "↑" : "↓";
        line += ` (${asPercent(stat.baselinePassRate)}% → ${asPercent(stat.passRate)}% (${arrow}${asPercent(stat.delta)}%))`;
      }
      lines.push(line);
    }
    lines.push("");
  }
  return lines.join("\n").trimEnd();
}
2087
/** Pass rate as `passed/evaluated (pct%)`; when nothing was evaluated, say that every repetition crashed. */
function formatRate$1(stat) {
  const { evaluatedCount, passedCount, passRate, totalReps } = stat;
  if (evaluatedCount === 0) return `0/${totalReps} (all reps crashed)`;
  return `${passedCount}/${evaluatedCount} (${(passRate * 100).toFixed(0)}%)`;
}
2093
+ //#endregion
2094
+ //#region src/reporter/format-json.ts
2095
/**
 * Dump a suite report as 2-space-indented JSON, with no reshaping.
 *
 * Backs `--format json` and `--output` persistence.
 */
function formatJson(report) {
  const indent = 2;
  return JSON.stringify(report, null, indent);
}
2103
+ //#endregion
2104
+ //#region src/reporter/format-markdown.ts
2105
/**
 * Render renderable rows as a GitHub-flavored markdown report: a top-level
 * title, then per row a heading, optional category and collapsible notes, and
 * a table with one line per assertion.
 */
function formatMarkdown(rows) {
  const asPercent = (fraction) => (fraction * 100).toFixed(0);
  const lines = ["# Harness Eval Report", ""];
  for (const row of rows) {
    const verdict = row.passed ? "PASS" : "FAIL";
    const crashNote = row.adapterErrors > 0 ? ` (${row.adapterErrors} adapter errors)` : "";
    lines.push(`## ${row.caseId} @ ${row.cellLabel} — ${verdict}${crashNote}`);
    if (row.category) lines.push(`**Category:** ${row.category}`);
    if (row.notes) lines.push("<details><summary>Notes</summary>", row.notes, "</details>");
    lines.push("", "| Assertion | Result | Threshold | Status |", "| --- | --- | --- | --- |");
    for (const stat of row.stats) {
      let result = formatRate(stat);
      const hasBaseline = stat.delta !== void 0 && stat.baselinePassRate !== void 0;
      if (hasBaseline) {
        const sign = stat.delta >= 0 ? "+" : "";
        result += ` (${asPercent(stat.baselinePassRate)}% → ${asPercent(stat.passRate)}%, ${sign}${asPercent(stat.delta)}%)`;
      }
      const threshold = `${asPercent(stat.threshold)}%`;
      const statusCell = stat.meetsThreshold ? "✓" : "✗";
      lines.push(`| ${stat.description} | ${result} | ${threshold} | ${statusCell} |`);
    }
    lines.push("");
  }
  return lines.join("\n").trimEnd();
}
2135
/** Pass rate cell for markdown tables: `passed/evaluated (pct%)`, or a crash note when nothing was evaluated. */
function formatRate(stat) {
  const { evaluatedCount, passedCount, passRate, totalReps } = stat;
  if (evaluatedCount === 0) return `0/${totalReps} (all reps crashed)`;
  return `${passedCount}/${evaluatedCount} (${(passRate * 100).toFixed(0)}%)`;
}
2141
+ //#endregion
2142
+ //#region src/reporter/renderable.ts
2143
/** Produce one formatter-ready row for every cell in the report, in report order. */
function toRenderableRows(report) {
  const { cells } = report;
  return cells.map((cell) => cellToRow(cell));
}
2147
/**
 * Annotate rows with baseline pass rates and deltas.
 *
 * Rows are matched to baseline cells by case id plus cell label; assertion
 * stats are matched by position. Rows (and individual stats) without a
 * baseline counterpart are returned unchanged.
 */
function applyBaseline(rows, baseline) {
  const keyFor = (caseId, label) => `${caseId}::${label}`;
  const baselineByKey = new Map();
  for (const cell of baseline.cells) {
    baselineByKey.set(keyFor(cell.caseId, cell.cell.label), cell);
  }
  return rows.map((row) => {
    const prior = baselineByKey.get(keyFor(row.caseId, row.cellLabel));
    if (!prior) return row;
    const stats = row.stats.map((stat, position) => {
      const priorStat = prior.assertionStats[position];
      if (!priorStat) return stat;
      return {
        ...stat,
        baselinePassRate: priorStat.passRate,
        delta: stat.passRate - priorStat.passRate
      };
    });
    return { ...row, stats };
  });
}
2173
/**
 * Flatten one {@link CellReport} into a {@link RenderableRow}. Each assertion
 * stat is copied together with the cell-level repetition count and adapter
 * error count so formatters need no access to the cell.
 */
function cellToRow(cell) {
  const { caseId, category, notes, passed, adapterErrors } = cell;
  const totalReps = cell.repetitions.length;
  const stats = [];
  for (const source of cell.assertionStats) {
    stats.push({
      description: source.description,
      threshold: source.threshold,
      passedCount: source.passedCount,
      evaluatedCount: source.evaluatedCount,
      totalReps,
      adapterErrors,
      passRate: source.passRate,
      meetsThreshold: source.meetsThreshold
    });
  }
  return {
    caseId,
    category,
    notes,
    cellLabel: cell.cell.label,
    passed,
    adapterErrors,
    totalReps,
    stats
  };
}
2197
+ //#endregion
2198
+ //#region src/reporter/index.ts
2199
/**
 * Render a {@link SuiteReport} in the requested format.
 *
 * `json` serializes the report directly. Every other format first goes through
 * renderable rows, with baseline deltas applied when `options.baseline` is
 * given: `markdown` renders a markdown document, anything else renders console
 * text. Console color defaults to on for the `console` format unless
 * `options.color` says otherwise.
 */
function formatReport(report, options) {
  const { format, baseline, color } = options;
  if (format === "json") return formatJson(report);
  const plainRows = toRenderableRows(report);
  const rows = baseline ? applyBaseline(plainRows, baseline) : plainRows;
  if (format === "markdown") return formatMarkdown(rows);
  return formatConsole(rows, color ?? format === "console");
}
2213
+ //#endregion
2214
+ export { serializeToolInput as A, TRAJECTORY_SCHEMA_VERSION as B, trajectoryExactMatch as C, trajectorySingleToolUse as D, trajectoryRecall as E, loadSuiteReport as F, trajectoryToTranscript as I, createCodexGrader as L, gradingReportPassed as M, resolveGradeOptions as N, toEvaluationInstance as O, gradeReport as P, createClaudeGrader as R, trajectoryAnyOrderMatch as S, trajectoryPrecision as T, buildEvalRunEnvelopeFromFiles as _, envelopeCommand as a, computeTrajectoryMetrics as b, getOptionInt as c, resolveGradingArtifactFromSuite as d, resolvePipelineInputs as f, buildEvalRunEnvelope as g, toTrajectory as h, runPipeline as i, formatGradingConsole as j, toTrajectoryInstances as k, hasOption as l, toInstancesJsonl as m, emitOtel as n, parseEnvelopeProjection as o, suiteDirectoryFromPath as p, trajectoryToOtlp as r, getOption as s, formatReport as t, parseArgs as u, enrichRepetitionWithProtojson as v, trajectoryInOrderMatch as w, parseToolInput as x, toHarnessMetrics as y, EVAL_RUN_SCHEMA_VERSION as z };
2215
+
2216
+ //# sourceMappingURL=reporter-Biy-5-9M.js.map