npm - @alis-build/harness-eval - Versions diffs - 0.1.0 → 0.1.2 - Mend

@alis-build/harness-eval 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

package/README.md +17 -4
package/dist/adapters/claude-code/index.d.ts +1 -1
package/dist/adapters/claude-code/index.js +1 -1
package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
package/dist/cli/bin.js +109 -12
package/dist/cli/bin.js.map +1 -1
package/dist/config/loader.d.ts +1 -1
package/dist/config/loader.js +1 -1
package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
package/dist/index.d.ts +270 -152
package/dist/index.js +124 -5
package/dist/index.js.map +1 -0
package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
package/dist/loader-DcI0KfRX.js.map +1 -0
package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
package/dist/projections-BcX7w-f6.js.map +1 -0
package/dist/runner/suite.d.ts +1 -1
package/dist/runner/suite.js +1 -1
package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
package/dist/suite-Dlzl-HI0.js.map +1 -0
package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
package/package.json +4 -2
package/schemas/eval-interchange-instances.schema.json +196 -0
package/schemas/eval-interchange.schema.json +65 -52
package/schemas/eval-run-envelope.schema.json +182 -425
package/dist/build-DsVJ_UeU.js.map +0 -1
package/dist/loader-BCnFJ8rm.js.map +0 -1
package/dist/suite-chj0j22j.js.map +0 -1
package/schemas/eval-interchange-agent-trace.schema.json +0 -322
package/schemas/eval-interchange-proto-instance.schema.json +0 -106

package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} RENAMED

@@ -1,5 +1,5 @@
-import { i as buildJudgeArgs } from "./claude-code-ycT0JQZF.js";
-import { n as createLimit } from "./suite-chj0j22j.js";
+import { i as buildJudgeArgs } from "./claude-code-DZ4Vkgp6.js";
+import { n as createLimit } from "./suite-Dlzl-HI0.js";
 import { spawn } from "node:child_process";
 import { readFile } from "node:fs/promises";
 import { parse } from "yaml";
@@ -11,24 +11,28 @@ const EVAL_RUN_SCHEMA_VERSION = "1.0";
 const TRAJECTORY_SCHEMA_VERSION = "1.0";
 //#endregion
 //#region src/otel/attributes.ts
+/** Build a string-typed OTLP attribute. */
 function strAttr(key, value) {
 	return {
 		key,
 		value: { stringValue: value }
 	};
 }
+/** Build an integer-typed OTLP attribute (stored as decimal string). */
 function intAttr(key, value) {
 	return {
 		key,
 		value: { intValue: String(value) }
 	};
 }
+/** Build a boolean-typed OTLP attribute. */
 function boolAttr(key, value) {
 	return {
 		key,
 		value: { boolValue: value }
 	};
 }
+/** Build a JSON-serialized string attribute (common for message arrays). */
 function jsonAttr(key, value) {
 	return {
 		key,
@@ -37,6 +41,11 @@ function jsonAttr(key, value) {
 }
 //#endregion
 //#region src/otel/messages.ts
+/**
+* Map harness stop reasons to GenAI semconv finish_reason values.
+*
+* Unknown reasons pass through unchanged for forward compatibility.
+*/
 function mapStopReason(reason) {
 	if (!reason) return void 0;
 	switch (reason) {
@@ -47,6 +56,7 @@ function mapStopReason(reason) {
 		default: return reason;
 	}
 }
+/** Build a tool_call part from a {@link ToolCall}. */
 function toolCallPart(call) {
 	return {
 		type: "tool_call",
@@ -55,6 +65,7 @@ function toolCallPart(call) {
 		arguments: call.args ?? {}
 	};
 }
+/** Build a tool_call_response part from a {@link ToolCall} result. */
 function toolResponsePart(call) {
 	return {
 		type: "tool_call_response",
@@ -62,6 +73,7 @@ function toolResponsePart(call) {
 		result: call.result
 	};
 }
+/** Convert one assistant turn to a GenAI semconv assistant message. */
 function assistantMessageFromTurn(turn) {
 	const parts = [];
 	if (turn.text) parts.push({
@@ -76,6 +88,7 @@ function assistantMessageFromTurn(turn) {
 		...finish ? { finish_reason: finish } : {}
 	};
 }
+/** Aggregate tool results from a turn into a single tool-role message, if any. */
 function toolResultsMessage(calls) {
 	const parts = calls.filter((c) => c.result !== null).map((c) => toolResponsePart(c));
 	if (parts.length === 0) return null;
@@ -238,8 +251,9 @@ function trajectoryToOtlp(view, options = {}) {
 		}]
 	}] };
 }
-/** Alias matching the implementation plan naming. */
+/** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
 const emitOtel = trajectoryToOtlp;
+/** Map view success flag to OTLP span status on the root invoke_agent span. */
 function viewStatus(view) {
 	if (view.success) return { code: StatusCode.OK };
 	return {
@@ -247,6 +261,13 @@ function viewStatus(view) {
 		message: "harness run did not complete successfully"
 	};
 }
+/**
+* Assign synthetic timestamps to chat and tool spans.
+*
+* Stream-json does not carry per-turn wall times, so we divide the session
+* duration evenly across chat/tool slots for OTLP consumers that require
+* start/end times on every span.
+*/
 function buildSpanTimings(view, startMs, endMs) {
 	const slots = [];
 	for (const turn of view.turns) {
@@ -268,17 +289,31 @@ function buildSpanTimings(view, startMs, endMs) {
 	}
 	return timings;
 }
+/**
+* Derive a deterministic 128-bit trace id from the harness session id.
+*
+* Uses SHA-256 truncation so the same session always maps to the same trace.
+*/
 function traceIdFromSession(sessionId) {
 	return createHash("sha256").update(`harness-eval:trace:${sessionId}`).digest("hex").slice(0, 32).toUpperCase();
 }
+/**
+* Derive a deterministic 64-bit span id from trace id and a logical span key.
+*/
 function spanIdFromKey(traceId, key) {
 	return createHash("sha256").update(`${traceId}:span:${key}`).digest("hex").slice(0, 16).toUpperCase();
 }
+/** Convert milliseconds since epoch to OTLP nanosecond timestamp string. */
 function msToNs(ms) {
 	return String(Math.round(ms * 1e6));
 }
 //#endregion
 //#region src/grader/prompt.ts
+/**
+* Build the full grader prompt including eval prompt, transcript, and schema.
+*
+* When `systemInstruction` is set it is prepended as a judge-specific prefix.
+*/
 function buildGraderPrompt(input) {
 	const expectationList = input.expectations.map((e, i) => `${i + 1}. ${e}`).join("\n");
 	return `${input.systemInstruction ? `${input.systemInstruction.trim()}\n\n` : ""}You are an automated evaluation grader (not the agent under test). Your only job is to score expectations against the transcript below.
@@ -320,6 +355,13 @@ Include every expectation in the same order. summary must match the expectations
 }
 //#endregion
 //#region src/grader/parse.ts
+/**
+* Extract assistant text from Claude stdout.
+*
+* Handles plain text, single JSON result envelopes, stream-json arrays, and
+* assistant message objects — the judge subprocess may emit any of these
+* depending on Claude Code version and flags.
+*/
 function extractClaudeResponseText(stdout) {
 	const trimmed = stdout.trim();
 	if (!trimmed) return "";
@@ -337,6 +379,7 @@ function extractClaudeResponseText(stdout) {
 	} catch {}
 	return trimmed;
 }
+/** Walk a stream-json event array and return the final assistant or result text. */
 function extractFromEventArray(events) {
 	const result = events.find((e) => typeof e === "object" && e !== null && e.type === "result");
 	if (result?.result) return result.result;
@@ -348,6 +391,7 @@ function extractFromEventArray(events) {
 	if (assistantTexts.length > 0) return assistantTexts[assistantTexts.length - 1];
 	return null;
 }
+/** Concatenate text blocks from an Anthropic-style assistant message object. */
 function textFromAssistantMessage(message) {
 	if (!message || typeof message !== "object") return null;
 	const content = message.content;
@@ -357,6 +401,12 @@ function textFromAssistantMessage(message) {
 	for (const block of content) if (typeof block === "object" && block !== null && block.type === "text" && typeof block.text === "string") texts.push(block.text);
 	return texts.length > 0 ? texts.join("\n") : null;
 }
+/**
+* Parse grader JSON from response text.
+*
+* Tries the raw string first, then fenced code blocks and brace-delimited
+* substrings. Returns null when no valid expectations array is found.
+*/
 function parseGraderJson(text) {
 	const candidates = [text.trim(), extractJsonBlock(text)];
 	for (const candidate of candidates) {
@@ -370,6 +420,7 @@ function parseGraderJson(text) {
 	}
 	return null;
 }
+/** Extract JSON from markdown fences or the outermost `{...}` substring. */
 function extractJsonBlock(text) {
 	const fence = text.match(/```(?:json)?\s*([\s\S]*?)```/);
 	if (fence?.[1]) return fence[1].trim();
@@ -378,6 +429,7 @@ function extractJsonBlock(text) {
 	if (start >= 0 && end > start) return text.slice(start, end + 1);
 	return null;
 }
+/** Map raw grader JSON to runtime {@link GraderOutput} with computed summary. */
 function normalizeGraderJson(raw) {
 	const expectations = (raw.expectations ?? []).map((e) => ({
 		text: e.text ?? "",
@@ -424,15 +476,22 @@ const JUDGE_CLAUDE_DEFAULTS = {
 	disableSlashCommands: true,
 	noSessionPersistence: true
 };
+/** Merge user-supplied Claude Code options over judge-safe defaults. */
 function mergeJudgeClaudeOptions(claudeCode) {
 	return {
 		...JUDGE_CLAUDE_DEFAULTS,
 		...claudeCode
 	};
 }
+/** Factory returning a {@link GraderFn} bound to subprocess options. */
 function createClaudeGrader(options = {}) {
 	return (input) => runClaudeGrader(input, options);
 }
+/**
+* Spawn Claude as judge, parse JSON response, align with input expectations.
+*
+* Unparseable output fails all expectations and sets {@link GraderOutput.error}.
+*/
 async function runClaudeGrader(input, options = {}) {
 	const binary = options.binary ?? options.claudeCode?.binary ?? "claude";
 	const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
@@ -478,6 +537,12 @@ async function runClaudeGrader(input, options = {}) {
 		evalFeedback: parsed.evalFeedback
 	};
 }
+/**
+* Spawn a child process and collect stdout until exit or timeout.
+*
+* Non-zero exit with empty stdout is treated as failure; partial stdout on
+* non-zero exit is retained (Claude sometimes exits non-zero after emitting JSON).
+*/
 function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
 	return new Promise((resolve, reject) => {
 		const child = spawn(binary, args, {
@@ -512,6 +577,9 @@ function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
 		});
 	});
 }
+/**
+* Build subprocess env, stripping CLAUDECODE to avoid nested-session guards.
+*/
 function buildChildEnv(extraEnv) {
 	const env = {
 		...process.env,
@@ -525,6 +593,11 @@ function buildChildEnv(extraEnv) {
 /**
 * Load expectations sidecar (YAML or JSON).
 */
+/**
+* Load expectations sidecar (YAML or JSON).
+*
+* File format: `{ "<caseId>": ["expectation 1", ...], ... }`.
+*/
 async function loadExpectationsMap(path) {
 	const text = await readFile(path, "utf8");
 	const trimmed = path.trim().toLowerCase();
@@ -541,7 +614,14 @@ async function loadExpectationsMap(path) {
 }
 //#endregion
 //#region src/grader/transcript.ts
+/** Maximum characters per tool result embedded in grader transcripts. */
 const MAX_RESULT_CHARS = 4e3;
+/**
+* Render a {@link TrajectoryView} as markdown for LLM graders.
+*
+* Tool results are truncated at {@link MAX_RESULT_CHARS} to keep judge
+* prompts within reasonable token limits.
+*/
 function trajectoryToTranscript(view, prompt) {
 	const lines = [];
 	if (prompt) lines.push("## User prompt", "", prompt, "");
@@ -564,6 +644,7 @@ function trajectoryToTranscript(view, prompt) {
 	lines.push("## Session metadata", `session_id: ${view.meta.sessionId}`, `model: ${view.meta.model}`, `cwd: ${view.meta.cwd}`, `success: ${view.success}`, `tool_calls: ${view.toolCalls.length}`, `duration_ms: ${view.usage.durationMs}`, `input_tokens: ${view.usage.inputTokens}`, `output_tokens: ${view.usage.outputTokens}`);
 	return lines.join("\n").trimEnd();
 }
+/** Format unknown values as JSON for transcript embedding. */
 function formatJson$1(value) {
 	try {
 		return JSON.stringify(value);
@@ -571,10 +652,12 @@ function formatJson$1(value) {
 		return String(value);
 	}
 }
+/** Format a tool result, truncating long string or JSON payloads. */
 function formatResult(result) {
 	if (typeof result === "string") return truncate(result);
 	return truncate(formatJson$1(result));
 }
+/** Truncate text with ellipsis when exceeding the transcript size budget. */
 function truncate(text) {
 	if (text.length <= MAX_RESULT_CHARS) return text;
 	return `${text.slice(0, MAX_RESULT_CHARS)}… (truncated)`;
@@ -584,6 +667,12 @@ function truncate(text) {
 /**
 * Grade a harness-eval SuiteReport with outcome expectations (LLM judge).
 */
+/**
+* Grade every repetition in a {@link SuiteReport} that has expectations.
+*
+* Expectations come from inline case fields or an optional sidecar YAML/JSON
+* map. Runs are concurrent under {@link GradeReportOptions.maxConcurrent}.
+*/
 async function gradeReport(report, options = {}) {
 	const expectationsMap = options.expectationsPath ? await loadExpectationsMap(options.expectationsPath) : {};
 	const gradeFn = options.gradeFn ?? createClaudeGrader({
@@ -707,6 +796,7 @@ async function gradeReport(report, options = {}) {
 		}
 	};
 }
+/** Load a suite report JSON file produced by `harness-eval run`. */
 async function loadSuiteReport(path) {
 	const text = await readFile(path, "utf8");
 	return JSON.parse(text);
@@ -747,6 +837,11 @@ const RESET$1 = "\x1B[0m";
 const GREEN$1 = "\x1B[32m";
 const RED$1 = "\x1B[31m";
 const DIM = "\x1B[2m";
+/**
+* Format a {@link SuiteGradingReport} for terminal output.
+*
+* @param color When true, emit ANSI status colors (default for TTY console).
+*/
 function formatGradingConsole(report, color = true) {
 	const lines = [];
 	if (report.results.length === 0) {
@@ -770,6 +865,7 @@ function formatGradingConsole(report, color = true) {
 	lines.push(`Overall: ${report.summary.passed}/${report.summary.total} (${overallPct}%) expectations passed`);
 	return lines.join("\n").trimEnd();
 }
+/** True when every graded rep passed all expectations without grader errors. */
 function gradingReportPassed(report) {
 	return report.results.every((r) => !r.graderError && r.summary.failed === 0 && r.summary.total > 0);
 }
@@ -779,6 +875,11 @@ const RESET = "\x1B[0m";
 const GREEN = "\x1B[32m";
 const RED = "\x1B[31m";
 const YELLOW = "\x1B[33m";
+/**
+* Render renderable rows as ANSI-colored console output.
+*
+* @param color When false, emit plain text without escape codes.
+*/
 function formatConsole(rows, color = true) {
 	const lines = [];
 	for (const row of rows) {
@@ -804,6 +905,7 @@ function formatConsole(rows, color = true) {
 	}
 	return lines.join("\n").trimEnd();
 }
+/** Format pass rate for display, noting when all reps crashed. */
 function formatRate$1(stat) {
 	if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
 	const pct = (stat.passRate * 100).toFixed(0);
@@ -811,11 +913,17 @@ function formatRate$1(stat) {
 }
 //#endregion
 //#region src/reporter/format-json.ts
+/**
+* Serialize a suite report as indented JSON (no transformation).
+*
+* Used by `--format json` and `--output` persistence.
+*/
 function formatJson(report) {
 	return JSON.stringify(report, null, 2);
 }
 //#endregion
 //#region src/reporter/format-markdown.ts
+/** Render renderable rows as a GitHub-flavored markdown report. */
 function formatMarkdown(rows) {
 	const lines = ["# Harness Eval Report", ""];
 	for (const row of rows) {
@@ -845,6 +953,7 @@ function formatMarkdown(rows) {
 	}
 	return lines.join("\n").trimEnd();
 }
+/** Format pass rate for markdown tables, noting when all reps crashed. */
 function formatRate(stat) {
 	if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
 	const pct = (stat.passRate * 100).toFixed(0);
@@ -852,9 +961,15 @@ function formatRate(stat) {
 }
 //#endregion
 //#region src/reporter/renderable.ts
+/** Map a suite report to formatter-ready rows (one per cell). */
 function toRenderableRows(report) {
 	return report.cells.map((cell) => cellToRow(cell));
 }
+/**
+* Attach baseline pass-rate deltas to matching rows.
+*
+* Rows without a matching baseline cell are returned unchanged.
+*/
 function applyBaseline(rows, baseline) {
 	const baselineMap = new Map(baseline.cells.map((c) => [`${c.caseId}::${c.cell.label}`, c]));
 	return rows.map((row) => {
@@ -876,6 +991,7 @@ function applyBaseline(rows, baseline) {
 		};
 	});
 }
+/** Convert one {@link CellReport} to a {@link RenderableRow}. */
 function cellToRow(cell) {
 	const totalReps = cell.repetitions.length;
 	const stats = cell.assertionStats.map((s) => ({
@@ -901,6 +1017,12 @@ function cellToRow(cell) {
 }
 //#endregion
 //#region src/reporter/index.ts
+/**
+* Format a {@link SuiteReport} for console, markdown, or JSON output.
+*
+* JSON format bypasses the renderable intermediate model and serializes the
+* report directly. Console and markdown apply optional baseline deltas.
+*/
 function formatReport(report, options) {
 	if (options.format === "json") return formatJson(report);
 	let rows = toRenderableRows(report);
@@ -910,81 +1032,149 @@ function formatReport(report, options) {
 	return formatConsole(rows, useColor);
 }
 //#endregion
-//#region src/eval-interchange/build.ts
-const DEFAULT_AGENT_ID = "agent";
+//#region src/eval-interchange/normalize.ts
+/**
+* Serialize tool arguments to the Vertex wire string format.
+*
+* Already-string inputs pass through unchanged (e.g. pre-serialized reference
+* steps). Objects and nullish values become JSON strings; empty input becomes `{}`.
+*
+* @param args - Tool arguments from harness or suite YAML.
+* @returns JSON string suitable for {@link ProtojsonToolCall.toolInput}.
+*/
 function serializeToolInput(args) {
+	if (typeof args === "string") return args;
 	return JSON.stringify(args ?? {});
 }
-function parseToolInput(toolInput) {
-	try {
-		return JSON.parse(toolInput);
-	} catch {
-		return toolInput;
-	}
+/**
+* Normalize a tool name according to suite reference configuration.
+*
+* In `"bare"` mode, strips the MCP namespace prefix (`mcp__api__foo` → `foo`)
+* so reference trajectories authored with bare names match harness tool names.
+*
+* @param toolName - Raw tool name from harness or suite.
+* @param mode - `"harness"` preserves the name; `"bare"` strips after last `__`.
+*/
+function normalizeReferenceToolName(toolName, mode) {
+	if (mode !== "bare") return toolName;
+	const separator = toolName.lastIndexOf("__");
+	if (separator === -1) return toolName;
+	return toolName.slice(separator + 2);
 }
-function toolCallToInterchange(toolCall) {
-	return {
-		tool_name: toolCall.name,
-		tool_input: serializeToolInput(toolCall.args)
-	};
+/**
+* Convert a harness or suite trajectory into Vertex protojson wire format.
+*
+* `toolNameMode` controls MCP prefix stripping for every tool name in the
+* trajectory. Suite reference steps and predicted harness tool calls use the
+* same mode so comparisons stay consistent across metrics and instances.
+*
+* @param trajectory - Tool calls in harness or YAML reference shape.
+* @param options.toolNameMode - `"harness"` keeps full names; `"bare"` strips after last `__`.
+*/
+function toProtojsonTrajectory(trajectory, options = {}) {
+	const toolNameMode = options.toolNameMode ?? "harness";
+	return { toolCalls: trajectory.map((toolCall) => {
+		const name = "name" in toolCall ? toolCall.name : toolCall.tool_name;
+		const args = "args" in toolCall ? toolCall.args : toolCall.tool_input;
+		return {
+			toolName: normalizeReferenceToolName(name, toolNameMode),
+			toolInput: serializeToolInput(args)
+		};
+	}) };
 }
-function interchangeToTabular(toolCall) {
+//#endregion
+//#region src/eval-interchange/protojson/trajectory-instances.ts
+/**
+* Build Vertex Trajectory*Instance protojson wire objects.
+*
+* Each trajectory metric in Vertex EvaluateInstances expects a specific
+* protobuf message. This module constructs all six instance payloads from
+* one predicted/reference pair so callers can batch-upload via JSONL.
+*/
+/**
+* Build a pair instance with predicted and reference trajectories.
+*
+* Both sides use the same `referenceToolNameMode` so wire payloads align with
+* {@link toHarnessMetrics} and Vertex EvaluateInstances sees comparable names.
+* In `"bare"` mode, MCP prefixes are stripped on predicted and reference alike.
+*/
+function pairInstance(predicted, reference, referenceToolNameMode) {
 	return {
-		tool_name: toolCall.tool_name,
-		tool_input: parseToolInput(toolCall.tool_input)
+		predictedTrajectory: toProtojsonTrajectory(predicted, { toolNameMode: referenceToolNameMode }),
+		referenceTrajectory: toProtojsonTrajectory(reference, { toolNameMode: referenceToolNameMode })
 	};
 }
-function predictedTrajectoryFromView(view) {
-	return view.toolCalls.map(toolCallToInterchange);
-}
-function buildAgentTrace(view, agentId = DEFAULT_AGENT_ID) {
-	const agents = { [agentId]: {
-		agent_id: agentId,
-		agent_type: "assistant",
-		description: view.meta.model,
-		tools: view.meta.availableTools.map((name) => ({ name }))
-	} };
-	const activeTools = view.meta.availableTools.map((name) => ({ name }));
+/**
+* Build all Trajectory*Instance payloads for one predicted/reference pair.
+*
+* Pair metrics (exact, in-order, any-order, precision, recall) share the
+* same trajectory pair; single-tool-use omits the reference trajectory
+* per Vertex API shape.
+*/
+function toTrajectoryInstances(options) {
+	const referenceToolNameMode = options.referenceToolNameMode ?? "harness";
+	const pair = pairInstance(options.predicted, options.reference, referenceToolNameMode);
 	return {
-		agents,
-		turns: view.turns.map((turn) => {
-			const events = [];
-			if (turn.text) events.push({
-				author: agentId,
-				content: { parts: [{ text: turn.text }] },
-				active_tools: activeTools
-			});
-			for (const toolCall of turn.toolCalls) {
-				events.push({
-					author: agentId,
-					content: { parts: [{ function_call: {
-						name: toolCall.name,
-						args: toolCall.args ?? {}
-					} }] },
-					active_tools: activeTools
-				});
-				if (toolCall.result !== null && toolCall.result !== void 0) events.push({
-					author: agentId,
-					content: { parts: [{ function_response: {
-						name: toolCall.name,
-						response: toolCall.result
-					} }] },
-					active_tools: activeTools
-				});
-			}
-			return {
-				turn_index: turn.turnIndex,
-				events
-			};
-		})
+		exactMatch: pair,
+		inOrderMatch: pair,
+		anyOrderMatch: pair,
+		precision: pair,
+		recall: pair,
+		singleToolUse: { predictedTrajectory: pair.predictedTrajectory }
 	};
 }
-function latencyInSeconds(view) {
-	return view.usage.durationMs / 1e3;
+/**
+* Convert suite reference steps to cell-level protojson trajectory export.
+*/
+function toReferenceTrajectory(reference, referenceToolNameMode = "harness") {
+	return toProtojsonTrajectory(reference, { toolNameMode: referenceToolNameMode });
+}
+/**
+* Map a trajectory instance key to the Vertex protobuf message type name.
+*
+* Used as `messageType` in {@link InstancesJsonlRow} for EvaluateInstances batching.
+*/
+function trajectoryInstanceMessageType(key) {
+	switch (key) {
+		case "exactMatch": return "TrajectoryExactMatchInstance";
+		case "inOrderMatch": return "TrajectoryInOrderMatchInstance";
+		case "anyOrderMatch": return "TrajectoryAnyOrderMatchInstance";
+		case "precision": return "TrajectoryPrecisionInstance";
+		case "recall": return "TrajectoryRecallInstance";
+		case "singleToolUse": return "TrajectorySingleToolUseInstance";
+	}
+}
+//#endregion
+//#region src/eval-interchange/protojson/evaluation-instance.ts
+/**
+* Build an EvaluationInstance protojson object from harness strings.
+*
+* Omitted fields are excluded from the output object rather than set to
+* empty wrappers — protojson omits unset optional fields.
+*
+* @param options.prompt - Case prompt sent to the agent.
+* @param options.response - Final agent response from the trajectory.
+* @param options.reference - Optional reference answer text (rare in harness eval).
+*/
+function toEvaluationInstance(options) {
+	const instance = {};
+	if (options.prompt !== void 0) instance.prompt = { text: options.prompt };
+	if (options.response !== void 0) instance.response = { text: options.response };
+	if (options.reference !== void 0) instance.reference = { text: options.reference };
+	return instance;
 }
 //#endregion
 //#region src/metrics/trajectory.ts
-function normalizeToolCall$1(toolCall) {
+/**
+* Trajectory-level metrics for comparing predicted and reference tool-call sequences.
+*
+* Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
+* in-order, any-order, precision, recall, single tool use). Tool calls are
+* compared by `(tool_name, serialized tool_input)` identity after normalization.
+*
+* Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
+*/
+function normalizeToolCall(toolCall) {
 	if (typeof toolCall.tool_input === "string") return {
 		tool_name: toolCall.tool_name,
 		tool_input: toolCall.tool_input
@@ -995,11 +1185,17 @@ function normalizeToolCall$1(toolCall) {
 	};
 }
 function normalizeTrajectory(trajectory) {
-	return trajectory.map(normalizeToolCall$1);
+	return trajectory.map(normalizeToolCall);
 }
+/** Stable composite key for multiset and equality checks. */
 function toolCallKey(toolCall) {
 	return `${toolCall.tool_name}\0${toolCall.tool_input}`;
 }
+/**
+* Count predicted tool calls that appear in reference (multiset intersection).
+*
+* Duplicate tool calls are matched one-for-one; order does not matter.
+*/
 function multisetIntersectionSize(predicted, reference) {
 	const refCounts = /* @__PURE__ */ new Map();
 	for (const toolCall of reference) {
@@ -1017,6 +1213,12 @@ function multisetIntersectionSize(predicted, reference) {
 	}
 	return matched;
 }
+/**
+* Whether reference appears as a subsequence of predicted (order preserved).
+*
+* Extra predicted calls between reference steps are allowed (in-order match
+* semantics per Vertex).
+*/
 function isSubsequence(predicted, reference) {
 	let refIndex = 0;
 	for (const toolCall of predicted) {
@@ -1032,12 +1234,15 @@ function arraysEqual(left, right) {
 		return toolCallKey(toolCall) === toolCallKey(other);
 	});
 }
+/** Exact sequence equality after normalization. */
 function trajectoryExactMatch(predicted, reference) {
 	return arraysEqual(normalizeTrajectory(predicted), normalizeTrajectory(reference)) ? 1 : 0;
 }
+/** Reference is a subsequence of predicted (order preserved, extras allowed). */
 function trajectoryInOrderMatch(predicted, reference) {
 	return isSubsequence(normalizeTrajectory(predicted), normalizeTrajectory(reference)) ? 1 : 0;
 }
+/** Same multiset of tool calls; length must match. */
 function trajectoryAnyOrderMatch(predicted, reference) {
 	const predictedNorm = normalizeTrajectory(predicted);
 	const referenceNorm = normalizeTrajectory(reference);
@@ -1046,22 +1251,34 @@ function trajectoryAnyOrderMatch(predicted, reference) {
 	const referenceKeys = referenceNorm.map(toolCallKey).sort();
 	return predictedKeys.every((key, index) => key === referenceKeys[index]) ? 1 : 0;
 }
+/**
+* Fraction of predicted tool calls that appear in reference (multiset).
+*
+* Returns 1 when both trajectories are empty.
+*/
 function trajectoryPrecision(predicted, reference) {
 	const predictedNorm = normalizeTrajectory(predicted);
 	if (predictedNorm.length === 0) return reference.length === 0 ? 1 : 0;
 	return multisetIntersectionSize(predictedNorm, normalizeTrajectory(reference)) / predictedNorm.length;
 }
+/**
+* Fraction of reference tool calls matched in predicted (multiset recall).
+*
+* Returns 1 when reference is empty and predicted is empty.
+*/
 function trajectoryRecall(predicted, reference) {
 	const referenceNorm = normalizeTrajectory(reference);
 	if (referenceNorm.length === 0) return predicted.length === 0 ? 1 : 0;
 	return multisetIntersectionSize(normalizeTrajectory(predicted), referenceNorm) / referenceNorm.length;
 }
+/** Both trajectories have exactly one call and they match. */
 function trajectorySingleToolUse(predicted, reference) {
 	const predictedNorm = normalizeTrajectory(predicted);
 	const referenceNorm = normalizeTrajectory(reference);
 	if (predictedNorm.length !== 1 || referenceNorm.length !== 1) return 0;
 	return toolCallKey(predictedNorm[0]) === toolCallKey(referenceNorm[0]) ? 1 : 0;
 }
+/** Compute all trajectory metrics in one pass. */
 function computeTrajectoryMetrics(predicted, reference) {
 	return {
 		trajectory_exact_match: trajectoryExactMatch(predicted, reference),
@@ -1072,201 +1289,144 @@ function computeTrajectoryMetrics(predicted, reference) {
 		trajectory_single_tool_use: trajectorySingleToolUse(predicted, reference)
 	};
 }
-//#endregion
-//#region src/metrics/tool-calls.ts
-function normalizeToolCall(toolCall) {
-	if (typeof toolCall.tool_input === "string") return {
-		tool_name: toolCall.tool_name,
-		tool_input: toolCall.tool_input
-	};
-	return {
-		tool_name: toolCall.tool_name,
-		tool_input: serializeToolInput(toolCall.tool_input)
-	};
-}
-function parsedArgs(toolCall) {
-	const parsed = parseToolInput(toolCall.tool_input);
-	if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) return null;
-	return parsed;
-}
-function toolCallValid(toolCall) {
-	const normalized = normalizeToolCall(toolCall);
-	if (!normalized.tool_name.trim()) return 0;
+/**
+* Parse a wire tool_input string to JSON, or return the raw string on failure.
+*
+* Exported for tool-call metrics that need structured arg comparison.
+*/
+function parseToolInput(toolInput) {
 	try {
-		JSON.parse(normalized.tool_input);
-		return 1;
+		return JSON.parse(toolInput);
 	} catch {
-		return 0;
-	}
-}
-function toolNameMatch(predicted, reference) {
-	const predictedNorm = normalizeToolCall(predicted);
-	const referenceNorm = normalizeToolCall(reference);
-	return predictedNorm.tool_name === referenceNorm.tool_name ? 1 : 0;
-}
-function toolParameterKeyMatch(predicted, reference) {
-	if (toolNameMatch(predicted, reference) === 0) return 0;
-	const predictedArgs = parsedArgs(normalizeToolCall(predicted));
-	const referenceArgs = parsedArgs(normalizeToolCall(reference));
-	if (predictedArgs === null || referenceArgs === null) return 0;
-	const predictedKeys = Object.keys(predictedArgs).sort();
-	const referenceKeys = Object.keys(referenceArgs).sort();
-	if (predictedKeys.length !== referenceKeys.length) return 0;
-	return predictedKeys.every((key, index) => key === referenceKeys[index]) ? 1 : 0;
-}
-function valuesEqual(left, right, useStrictStringMatch) {
-	if (useStrictStringMatch) return JSON.stringify(left) === JSON.stringify(right);
-	return JSON.stringify(left) === JSON.stringify(right);
-}
-function toolParameterKvMatch(predicted, reference, options = {}) {
-	if (toolParameterKeyMatch(predicted, reference) === 0) return 0;
-	const predictedArgs = parsedArgs(normalizeToolCall(predicted));
-	const referenceArgs = parsedArgs(normalizeToolCall(reference));
-	for (const key of Object.keys(referenceArgs)) if (!valuesEqual(predictedArgs[key], referenceArgs[key], options.useStrictStringMatch ?? false)) return 0;
-	return 1;
-}
-function computeToolCallMetrics(predicted, reference, options = {}) {
-	const pairCount = Math.max(predicted.length, reference.length, 1);
-	let valid = 0;
-	let nameMatch = 0;
-	let keyMatch = 0;
-	let kvMatch = 0;
-	for (let index = 0; index < pairCount; index += 1) {
-		const predictedCall = predicted[index];
-		const referenceCall = reference[index];
-		if (!predictedCall) continue;
-		valid += toolCallValid(predictedCall);
-		if (!referenceCall) continue;
-		nameMatch += toolNameMatch(predictedCall, referenceCall);
-		keyMatch += toolParameterKeyMatch(predictedCall, referenceCall);
-		kvMatch += toolParameterKvMatch(predictedCall, referenceCall, options);
+		return toolInput;
 	}
-	return {
-		tool_call_valid: valid / pairCount,
-		tool_name_match: nameMatch / pairCount,
-		tool_parameter_key_match: keyMatch / pairCount,
-		tool_parameter_kv_match: kvMatch / pairCount
-	};
 }
 //#endregion
-//#region src/eval-interchange/projections.ts
+//#region src/eval-interchange/protojson/harness-metrics.ts
 /**
-* Envelope projection methods for eval interchange output.
+* Harness-owned trajectory metric scores in Vertex camelCase field names.
+*
+* Wraps {@link computeTrajectoryMetrics} for envelope export. External
+* systems can compare harness-precomputed scores against Vertex EvaluateInstances
+* results without reimplementing trajectory matching logic.
 */
-function repetitionInterchangeFields(repetition) {
-	if (!repetition.trajectory) return { predicted_trajectory: [] };
+/**
+* Compute trajectory metrics and map snake_case keys to Vertex camelCase.
+*
+* When `referenceToolNameMode` is `"bare"`, both predicted and reference tool
+* names are stripped to the suffix after the last `__` so suite reference steps
+* authored with bare names (e.g. `ListLandingZones`) match harness MCP names
+* (e.g. `mcp__plugin__ListLandingZones`).
+*
+* @param predicted - Tool calls from the harness trajectory view.
+* @param reference - Reference steps from suite YAML.
+* @param options.referenceToolNameMode - Name normalization mode from suite YAML.
+*/
+function toHarnessMetrics(predicted, reference, options = {}) {
+	const referenceToolNameMode = options.referenceToolNameMode ?? "harness";
+	const metrics = computeTrajectoryMetrics(predicted.map((toolCall) => ({
+		tool_name: normalizeReferenceToolName(toolCall.name, referenceToolNameMode),
+		tool_input: toolCall.args
+	})), reference.map((step) => ({
+		tool_name: normalizeReferenceToolName(step.tool_name, referenceToolNameMode),
+		tool_input: step.tool_input
+	})));
 	return {
-		predicted_trajectory: repetition.predicted_trajectory ?? predictedTrajectoryFromView(repetition.trajectory),
-		agent_trace: repetition.agent_trace ?? buildAgentTrace(repetition.trajectory),
-		latency_in_seconds: repetition.latency_in_seconds ?? latencyInSeconds(repetition.trajectory),
-		failure: repetition.failure ?? (repetition.trajectory.success ? 0 : 1)
+		trajectoryExactMatch: metrics.trajectory_exact_match,
+		trajectoryInOrderMatch: metrics.trajectory_in_order_match,
+		trajectoryAnyOrderMatch: metrics.trajectory_any_order_match,
+		trajectoryPrecision: metrics.trajectory_precision,
+		trajectoryRecall: metrics.trajectory_recall,
+		trajectorySingleToolUse: metrics.trajectory_single_tool_use
 	};
 }
-function referenceTrajectoryForCell(cell) {
-	return cell.reference_trajectory;
+//#endregion
+//#region src/eval-interchange/enrich.ts
+/**
+* Enrich eval repetitions with Vertex protojson interchange fields.
+*
+* Called during envelope build; handles repetitions with or without a
+* trajectory. When a trajectory is present, adds `evaluationInstance`,
+* optional `trajectoryInstances` / `harnessMetrics` when a suite reference
+* exists, and Vertex-style `latencySeconds` / `failure` fields derived from
+* the trajectory. Repetitions without a trajectory only receive `failure: 1`.
+*/
+/**
+* Extract reference steps from suite config when present.
+*
+* @param reference - Suite reference trajectory config; may be null or undefined.
+* @returns `reference.steps`, or `undefined` when `reference` is nullish.
+*/
+function referenceSteps(reference) {
+	return reference?.steps;
 }
-function repetitionToDatasetRow(cell, repetition) {
-	const fields = repetitionInterchangeFields(repetition);
+/**
+* Attach Vertex protojson interchange fields to one {@link EvalRepetition}.
+*
+* When no trajectory exists (adapter error), sets `failure: 1` and skips
+* protojson payloads. Trajectory instances and harness metrics are only
+* computed when the suite defines a non-empty reference trajectory.
+*
+* @param repetition - Base repetition from the runner (trajectory, assertions, grades).
+* @param options.prompt - Case prompt for EvaluationInstance.
+* @param options.reference - Suite reference trajectory config, if any.
+*/
+function enrichRepetitionWithProtojson(repetition, options = {}) {
 	if (!repetition.trajectory) return {
-		prompt: cell.prompt,
-		response: void 0,
-		predicted_trajectory: [],
-		reference_trajectory: referenceTrajectoryForCell(cell),
-		latency_in_seconds: repetition.durationMs / 1e3,
-		failure: 1,
-		human_ratings: cell.human_ratings
-	};
-	return {
-		prompt: cell.prompt,
-		response: repetition.trajectory.finalResponse,
-		predicted_trajectory: fields.predicted_trajectory.map(interchangeToTabular),
-		reference_trajectory: referenceTrajectoryForCell(cell),
-		latency_in_seconds: fields.latency_in_seconds ?? repetition.durationMs / 1e3,
-		failure: fields.failure ?? 1,
-		human_ratings: cell.human_ratings
-	};
-}
-function repetitionToProtoInstance(cell, repetition) {
-	const fields = repetitionInterchangeFields(repetition);
-	if (!repetition.trajectory) return null;
-	const reference = referenceTrajectoryForCell(cell);
-	return {
-		prompt: cell.prompt,
-		response: repetition.trajectory.finalResponse,
-		predicted_trajectory: { tool_calls: fields.predicted_trajectory },
-		reference_trajectory: reference ? { tool_calls: reference.map((toolCall) => ({
-			tool_name: toolCall.tool_name,
-			tool_input: typeof toolCall.tool_input === "string" ? toolCall.tool_input : JSON.stringify(toolCall.tool_input ?? {})
-		})) } : void 0
-	};
-}
-function repetitionToAgentTrace(repetition) {
-	return repetitionInterchangeFields(repetition).agent_trace ?? null;
-}
-function computeRepetitionMetrics(repetition, referenceTrajectory) {
-	if (!referenceTrajectory?.length) return {};
-	const predictedTabular = (repetition.predicted_trajectory ?? (repetition.trajectory ? predictedTrajectoryFromView(repetition.trajectory) : [])).map(interchangeToTabular);
-	return {
-		trajectoryMetrics: computeTrajectoryMetrics(predictedTabular, referenceTrajectory),
-		toolCallMetrics: computeToolCallMetrics(predictedTabular, referenceTrajectory)
-	};
-}
-function toTrajectory(envelope) {
-	const rows = [];
-	for (const cell of envelope.cells) for (const repetition of cell.repetitions) {
-		const row = repetitionToDatasetRow(cell, repetition);
-		if (row) rows.push(row);
-	}
-	return rows;
-}
-function toProtoInstances(envelope) {
-	const instances = [];
-	for (const cell of envelope.cells) for (const repetition of cell.repetitions) {
-		const instance = repetitionToProtoInstance(cell, repetition);
-		if (instance) instances.push(instance);
-	}
-	return instances;
-}
-function toAgentTrace(envelope) {
-	const traces = [];
-	for (const cell of envelope.cells) for (const repetition of cell.repetitions) {
-		const trace = repetitionToAgentTrace(repetition);
-		if (trace) traces.push(trace);
-	}
-	return traces;
-}
-function enrichRepetitionWithInterchange(repetition, referenceTrajectory) {
-	if (!repetition.trajectory) return repetition;
-	const predicted_trajectory = predictedTrajectoryFromView(repetition.trajectory);
-	const agent_trace = buildAgentTrace(repetition.trajectory);
-	const latency_in_seconds = latencyInSeconds(repetition.trajectory);
-	const failure = repetition.trajectory.success ? 0 : 1;
-	const metrics = computeRepetitionMetrics({
 		...repetition,
-		predicted_trajectory,
-		agent_trace,
-		latency_in_seconds,
-		failure
-	}, referenceTrajectory);
-	return {
+		failure: 1
+	};
+	const predicted = repetition.trajectory.toolCalls;
+	const referenceStepsList = referenceSteps(options.reference);
+	const referenceToolNameMode = options.reference?.tool_name_mode ?? "harness";
+	const enriched = {
 		...repetition,
-		predicted_trajectory,
-		agent_trace,
-		latency_in_seconds,
-		failure,
-		trajectoryMetrics: metrics.trajectoryMetrics,
-		toolCallMetrics: metrics.toolCallMetrics
+		evaluationInstance: toEvaluationInstance({
+			prompt: options.prompt,
+			response: repetition.trajectory.finalResponse
+		}),
+		latencySeconds: repetition.trajectory.usage.durationMs / 1e3,
+		failure: repetition.trajectory.success ? 0 : 1
 	};
+	if (referenceStepsList?.length) {
+		enriched.trajectoryInstances = toTrajectoryInstances({
+			predicted,
+			reference: referenceStepsList,
+			referenceToolNameMode
+		});
+		enriched.harnessMetrics = toHarnessMetrics(predicted, referenceStepsList, { referenceToolNameMode });
+	}
+	return enriched;
 }
 //#endregion
 //#region src/eval-record/build.ts
 /**
 * Build {@link EvalRunEnvelope} from harness-eval run and grading reports.
+*
+* This is the canonical export path from in-process or on-disk {@link SuiteReport}
+* JSON into the cross-harness eval record contract. It stitches together:
+*
+*   - Behavioral assertion results from the runner
+*   - Optional outcome grades from the LLM grader
+*   - Vertex protojson interchange fields via {@link enrichRepetitionWithProtojson}
+*   - Optional artifacts (transcript, raw stream-json) controlled by build options
+*
+* Downstream consumers include CI gates, databases, and the `harness-eval envelope`
+* CLI projection commands.
+*/
+/**
+* Pull raw stream-json events from an adapter result when the adapter exposes them.
+*
+* Adapters may attach `rawEvents` for debug-only envelope export; this helper
+* avoids coupling the builder to a specific adapter result type.
 */
 function extractRawEvents(adapterResult) {
 	// Primitives and null cannot carry a `rawEvents` property.
 	const isObjectLike = adapterResult !== null && typeof adapterResult === "object";
 	if (!isObjectLike || !("rawEvents" in adapterResult)) return void 0;
 	// Only a real array is passed through; any other value is treated as absent.
 	if (!Array.isArray(adapterResult.rawEvents)) return void 0;
 	return adapterResult.rawEvents;
 }
-function outcomePassForCell(caseId, cellLabel, repetitions) {
+/**
+* Derive cell-level outcome pass from graded repetitions.
+*
+* Returns `undefined` when no repetition was graded (outcome gate not applicable).
+* When graded, every repetition must have zero failed expectations and no grader error.
+*
+* @param _caseId - Reserved for future per-case outcome rules; unused today.
+* @param _cellLabel - Reserved for future per-cell outcome rules; unused today.
+*/
+function outcomePassForCell(_caseId, _cellLabel, repetitions) {
 	const graded = repetitions.filter((r) => r.outcomeGrades);
 	if (graded.length === 0) return void 0;
 	return graded.every((r) => r.outcomeGrades.error === void 0 && r.outcomeGrades.summary.failed === 0);
@@ -1274,6 +1434,10 @@ function outcomePassForCell(caseId, cellLabel, repetitions) {
 /**
 * Convert a {@link SuiteReport} (and optional grading) into a versioned
 * {@link EvalRunEnvelope} for storage or API handoff.
+*
+* @param report - Runner output for one suite execution.
+* @param options - Provenance, grading merge, and artifact inclusion flags.
+* @returns A fully populated envelope with protojson interchange fields on each repetition.
 */
 function buildEvalRunEnvelope(report, options = {}) {
 	const includeTranscript = options.includeTranscript !== false;
@@ -1281,7 +1445,8 @@ function buildEvalRunEnvelope(report, options = {}) {
 	const judge = options.grading?.judge ?? { id: "harness-eval/claude-grader" };
 	const cells = report.cells.map((cell) => {
 		const prompt = cell.prompt ?? "";
-		const referenceTrajectory = cell.reference_trajectory;
+		const referenceTrajectoryConfig = cell.reference_trajectory;
+		const referenceTrajectory = referenceTrajectoryConfig ? toReferenceTrajectory(referenceTrajectoryConfig.steps, referenceTrajectoryConfig.tool_name_mode ?? "harness") : void 0;
 		const repetitions = cell.repetitions.map((rep) => {
 			const base = {
 				repetitionIndex: rep.repetitionIndex,
@@ -1317,7 +1482,10 @@ function buildEvalRunEnvelope(report, options = {}) {
 				evalFeedback: graded.evalFeedback,
 				error: graded.graderError
 			};
-			return enrichRepetitionWithInterchange(base, referenceTrajectory);
+			return enrichRepetitionWithProtojson(base, {
+				prompt,
+				reference: referenceTrajectoryConfig
+			});
 		});
 		return {
 			caseId: cell.caseId,
@@ -1325,8 +1493,8 @@ function buildEvalRunEnvelope(report, options = {}) {
 			notes: cell.notes,
 			prompt: cell.prompt,
 			expectations: cell.expectations,
-			reference_trajectory: cell.reference_trajectory,
-			human_ratings: cell.human_ratings,
+			referenceTrajectory,
+			humanRatings: cell.human_ratings,
 			cellLabel: cell.cell.label,
 			axes: cell.cell.axes,
 			assertionStats: cell.assertionStats,
@@ -1360,7 +1528,16 @@ function buildEvalRunEnvelope(report, options = {}) {
 		cells
 	};
 }
-/** Build envelope from on-disk report + optional grading JSON paths. */
+/**
+* Build an envelope from on-disk runner and grader JSON artifacts.
+*
+* Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
+* outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
+* attaches suite URI and SHA-256 content hash for reproducibility.
+*
+* @param reportPath - Path to the suite run report JSON from `harness-eval run`.
+* @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
+*/
 async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
 	const reportText = await readFile(reportPath, "utf8");
 	const report = JSON.parse(reportText);
@@ -1391,6 +1568,72 @@ async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
 	});
 }
 //#endregion
-export { TRAJECTORY_SCHEMA_VERSION as A, gradeReport as C, emitOtel as D, createClaudeGrader as E, trajectoryToOtlp as O, resolveGradeOptions as S, trajectoryToTranscript as T, trajectoryRecall as _, toProtoInstances as a, formatGradingConsole as b, toolCallValid as c, toolParameterKvMatch as d, computeTrajectoryMetrics as f, trajectoryPrecision as g, trajectoryInOrderMatch as h, toAgentTrace as i, EVAL_RUN_SCHEMA_VERSION as k, toolNameMatch as l, trajectoryExactMatch as m, buildEvalRunEnvelopeFromFiles as n, toTrajectory as o, trajectoryAnyOrderMatch as p, enrichRepetitionWithInterchange as r, computeToolCallMetrics as s, buildEvalRunEnvelope as t, toolParameterKeyMatch as u, trajectorySingleToolUse as v, loadSuiteReport as w, gradingReportPassed as x, formatReport as y };
+//#region src/eval-interchange/projections.ts
+/**
+* Trajectory instance keys emitted in stable order for JSONL export.
+*
+* `repetitionToInstanceRows` iterates this list, so its order is the order of
+* the rows produced for each repetition.
+*/
+const TRAJECTORY_INSTANCE_KEYS = [
+	"exactMatch",
+	"inOrderMatch",
+	"anyOrderMatch",
+	"precision",
+	"recall",
+	"singleToolUse"
+];
+/**
+* Flatten one repetition into a trajectory dataset row.
+*
+* Pulls prompt from the cell, response from evaluationInstance, and falls
+* back to duration-based latency when enrich did not set latencySeconds.
+* When `failure` is null or undefined it is derived from `trajectory.success`:
+* 0 on success, 1 otherwise (including when the repetition has no trajectory).
+*
+* @param cell - Envelope cell supplying `caseId`, `prompt`, and `humanRatings`.
+* @param repetition - One repetition belonging to that cell.
+* @returns A flat row object; `response` is `undefined` when the repetition
+*   has no `evaluationInstance.response.text`.
+*/
+function repetitionToDatasetRow(cell, repetition) {
+	return {
+		caseId: cell.caseId,
+		repetitionIndex: repetition.repetitionIndex,
+		prompt: cell.prompt,
+		response: repetition.evaluationInstance?.response?.text,
+		evaluationInstance: repetition.evaluationInstance,
+		latencySeconds: repetition.latencySeconds ?? repetition.durationMs / 1e3,
+		failure: repetition.failure ?? (repetition.trajectory?.success ? 0 : 1),
+		humanRatings: cell.humanRatings
+	};
+}
+/**
+* Expand one repetition into type-tagged instance rows for EvaluateInstances.
+*
+* Returns an empty array when the repetition has no reference trajectory
+* (and therefore no trajectoryInstances block). Keys from
+* `TRAJECTORY_INSTANCE_KEYS` whose instance is missing or falsy are skipped,
+* so a repetition may yield fewer rows than there are keys.
+*
+* @param cell - Envelope cell; only `caseId` is read.
+* @param repetition - Repetition whose `trajectoryInstances` are expanded.
+* @returns Rows of `{ messageType, caseId, repetitionIndex, instance }` in
+*   `TRAJECTORY_INSTANCE_KEYS` order.
+*/
+function repetitionToInstanceRows(cell, repetition) {
+	if (!repetition.trajectoryInstances) return [];
+	const rows = [];
+	for (const key of TRAJECTORY_INSTANCE_KEYS) {
+		const instance = repetition.trajectoryInstances[key];
+		if (!instance) continue;
+		rows.push({
+			messageType: trajectoryInstanceMessageType(key),
+			caseId: cell.caseId,
+			repetitionIndex: repetition.repetitionIndex,
+			instance
+		});
+	}
+	return rows;
+}
+/**
+* Trajectory projection — all repetitions in the envelope as dataset rows.
+*
+* Emits exactly one row per repetition, in cell order and then repetition
+* order; no repetition is filtered out.
+*
+* @param envelope - Eval run envelope whose `cells[].repetitions[]` are flattened.
+* @returns Array of rows produced by `repetitionToDatasetRow`.
+*/
+function toTrajectory(envelope) {
+	const rows = [];
+	for (const cell of envelope.cells) for (const repetition of cell.repetitions) rows.push(repetitionToDatasetRow(cell, repetition));
+	return rows;
+}
+/**
+* Instances projection — all trajectory metric instances as JSONL rows.
+*
+* Returns an array of row objects, not a serialized string; turning the rows
+* into JSONL text is presumably left to the caller — TODO confirm.
+* Repetitions without `trajectoryInstances` contribute no rows.
+*
+* @param envelope - Eval run envelope whose `cells[].repetitions[]` are expanded.
+* @returns Concatenated rows from `repetitionToInstanceRows`, in cell order and
+*   then repetition order.
+*/
+function toInstancesJsonl(envelope) {
+	const rows = [];
+	for (const cell of envelope.cells) for (const repetition of cell.repetitions) rows.push(...repetitionToInstanceRows(cell, repetition));
+	return rows;
+}
+//#endregion
+export { loadSuiteReport as C, trajectoryToOtlp as D, emitOtel as E, EVAL_RUN_SCHEMA_VERSION as O, gradeReport as S, createClaudeGrader as T, serializeToolInput as _, enrichRepetitionWithProtojson as a, gradingReportPassed as b, parseToolInput as c, trajectoryInOrderMatch as d, trajectoryPrecision as f, toTrajectoryInstances as g, toEvaluationInstance as h, buildEvalRunEnvelopeFromFiles as i, TRAJECTORY_SCHEMA_VERSION as k, trajectoryAnyOrderMatch as l, trajectorySingleToolUse as m, toTrajectory as n, toHarnessMetrics as o, trajectoryRecall as p, buildEvalRunEnvelope as r, computeTrajectoryMetrics as s, toInstancesJsonl as t, trajectoryExactMatch as u, formatReport as v, trajectoryToTranscript as w, resolveGradeOptions as x, formatGradingConsole as y };
-//# sourceMappingURL=build-DsVJ_UeU.js.map
+//# sourceMappingURL=projections-BcX7w-f6.js.map