@alis-build/harness-eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. package/README.md +104 -10
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +204 -127
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
  15. package/dist/index.d.ts +397 -153
  16. package/dist/index.js +125 -5
  17. package/dist/index.js.map +1 -0
  18. package/dist/loader-B1WmGGzf.d.ts +107 -0
  19. package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
  20. package/dist/loader-DnQ6Jt0i.js.map +1 -0
  21. package/dist/reporter-Biy-5-9M.js +2216 -0
  22. package/dist/reporter-Biy-5-9M.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
  26. package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
  27. package/dist/suite-BcP64nlb.js.map +1 -0
  28. package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
  29. package/dist/types-Bac8_Ixb.js +246 -0
  30. package/dist/types-Bac8_Ixb.js.map +1 -0
  31. package/dist/types-Bu8uOZZN.d.ts +77 -0
  32. package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
  33. package/package.json +7 -2
  34. package/schemas/eval-interchange-instances.schema.json +196 -0
  35. package/schemas/eval-interchange.schema.json +65 -52
  36. package/schemas/eval-run-envelope.schema.json +182 -425
  37. package/dist/build-DsVJ_UeU.js +0 -1396
  38. package/dist/build-DsVJ_UeU.js.map +0 -1
  39. package/dist/claude-code-ycT0JQZF.js.map +0 -1
  40. package/dist/loader-BCnFJ8rm.js.map +0 -1
  41. package/dist/loader-DTvoVfN0.d.ts +0 -33
  42. package/dist/suite-chj0j22j.js.map +0 -1
  43. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  44. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
@@ -0,0 +1,246 @@
1
+ //#region src/types/stream.ts
2
+ /** Type guards. Prefer these over manual `e.type === "..."` checks at call sites. */
3
/**
 * Type guard for the `system` / `init` event that opens a session.
 *
 * Null-safe: a non-object input (for example a stream line that parses to
 * `null`) yields `false` instead of throwing a TypeError.
 *
 * @param {unknown} e - Parsed stream event.
 * @returns {boolean} True when `e.type === "system"` and `e.subtype === "init"`.
 */
function isSystemInit(e) {
  return e?.type === "system" && e.subtype === "init";
}
6
/**
 * Type guard for the `system` / `api_retry` event.
 *
 * Null-safe: a non-object input yields `false` instead of throwing.
 *
 * @param {unknown} e - Parsed stream event.
 * @returns {boolean} True when `e.type === "system"` and `e.subtype === "api_retry"`.
 */
function isSystemRetry(e) {
  return e?.type === "system" && e.subtype === "api_retry";
}
9
/**
 * Type guard for `assistant` events.
 *
 * Null-safe: a non-object input yields `false` instead of throwing.
 *
 * @param {unknown} e - Parsed stream event.
 * @returns {boolean} True when `e.type === "assistant"`.
 */
function isAssistantMessage(e) {
  return e?.type === "assistant";
}
12
/**
 * Type guard for `user` events.
 *
 * Null-safe: a non-object input yields `false` instead of throwing.
 *
 * @param {unknown} e - Parsed stream event.
 * @returns {boolean} True when `e.type === "user"`.
 */
function isUserMessage(e) {
  return e?.type === "user";
}
15
/**
 * Type guard for the `result` event.
 *
 * Null-safe: a non-object input yields `false` instead of throwing.
 *
 * @param {unknown} e - Parsed stream event.
 * @returns {boolean} True when `e.type === "result"`.
 */
function isResult(e) {
  return e?.type === "result";
}
18
/**
 * Type guard for `text` content blocks.
 *
 * Null-safe: a `null`/`undefined` entry in a content array yields `false`
 * instead of throwing.
 *
 * @param {unknown} b - Content block from a message.
 * @returns {boolean} True when `b.type === "text"`.
 */
function isTextBlock(b) {
  return b?.type === "text";
}
21
/**
 * Type guard for `tool_use` content blocks.
 *
 * Null-safe: a `null`/`undefined` entry in a content array yields `false`
 * instead of throwing.
 *
 * @param {unknown} b - Content block from a message.
 * @returns {boolean} True when `b.type === "tool_use"`.
 */
function isToolUseBlock(b) {
  return b?.type === "tool_use";
}
24
/**
 * Type guard for `tool_result` content blocks.
 *
 * Null-safe: a `null`/`undefined` entry in a content array yields `false`
 * instead of throwing.
 *
 * @param {unknown} b - Content block from a message.
 * @returns {boolean} True when `b.type === "tool_result"`.
 */
function isToolResultBlock(b) {
  return b?.type === "tool_result";
}
27
+ //#endregion
28
+ //#region src/types/trajectory.ts
29
/**
 * Extract the MCP namespace prefix from a tool name.
 *
 * Claude Code formats MCP tool names as `mcp__<server>__<tool>`. The namespace
 * is the first two segments joined: `mcp__<server>`. Returns null for non-MCP
 * tool names (built-ins like `Bash`, `Read`, `Edit`).
 *
 * Also returns null, rather than throwing or inventing a namespace, when:
 *   - `toolName` is not a string (e.g. a malformed `tool_use` block with no name);
 *   - the name has fewer than three `__`-separated segments (`"mcp__api"`);
 *   - the server segment is empty (`"mcp____tool"` would otherwise yield `"mcp__"`).
 *
 * @param {string} toolName - Fully-qualified tool name.
 * @returns {string | null} `mcp__<server>`, or null when no namespace applies.
 *
 * @example
 * namespaceOf("mcp__api__search_skills") // "mcp__api"
 * namespaceOf("Bash") // null
 */
function namespaceOf(toolName) {
  if (typeof toolName !== "string" || !toolName.startsWith("mcp__")) return null;
  const [prefix, server, ...rest] = toolName.split("__");
  // Need at least prefix + server + tool, and a non-empty server name.
  if (rest.length === 0 || server === "") return null;
  return `${prefix}__${server}`;
}
46
+ //#endregion
47
+ //#region src/trajectory/builder.ts
48
/**
 * TrajectoryBuilder — consumes a stream of {@link StreamEvent} values and
 * produces a {@link TrajectoryView}.
 *
 * State machine: the builder is a small, tolerant state machine. Invariants:
 *
 *   - Exactly one `system/init` event opens the session. The builder requires
 *     it to be present before `build()`.
 *   - Each `assistant` event begins a new turn. Text blocks accumulate into
 *     the turn's text; `tool_use` blocks become `ToolCall` records.
 *   - `user` events with `tool_result` blocks deliver tool results back. We
 *     match them to pending calls by `tool_use_id`.
 *   - One `result` event closes the session and carries aggregate usage.
 *
 * The builder is *tolerant of partial streams*: a process killed mid-run
 * produces a coherent (but flagged) view. Tool calls without matching results
 * keep `result: null`. The `success` flag reflects whether a successful result
 * event was actually observed.
 *
 * It is also tolerant of malformed input: non-object events, messages with a
 * missing or non-array `content`, and non-object content blocks are skipped
 * instead of throwing.
 *
 * Why a class (not a reducer)?
 *   The internal `pendingCalls` map is mutable by design — we modify ToolCall
 *   objects in place when results arrive, so other parts of the view (which
 *   hold references to the same objects) see the update for free. A reducer
 *   would force a deep copy per result event, which is wasteful and would
 *   complicate identity-based queries.
 */
var TrajectoryBuilder = class {
  /** Session metadata captured from `system/init`; null until that event is seen. */
  meta = null;
  /** `Date.now()` at the moment `system/init` was consumed; null before then. */
  sessionStartTs = null;
  /** Assistant turns, in arrival order. */
  turns = [];
  /** Every tool call across all turns, in emission order. */
  allToolCalls = [];
  /**
   * tool_use_id → ToolCall, for matching results back to calls.
   * Entries are removed once a result is observed.
   */
  pendingCalls = /* @__PURE__ */ new Map();
  /** Retry records collected from `system/api_retry` events. */
  retries = [];
  finalUsage = null;
  finalCostUsd = 0;
  finalDurationMs = 0;
  finalNumTurns = 0;
  finalResultText = "";
  sawResultEvent = false;
  resultIsError = false;

  /**
   * Consume one event. Safe to call with events in stream order.
   *
   * Unknown event types are silently ignored — the schema evolves and we
   * don't want CI to break on a new event type we haven't modelled.
   * Non-object values (e.g. a stream line that parsed to `null`) are ignored
   * for the same reason.
   *
   * @param {object} event - One parsed stream event.
   * @returns {void}
   */
  consume(event) {
    if (event === null || typeof event !== "object") return;

    if (isSystemInit(event)) {
      this.meta = {
        sessionId: event.session_id,
        model: event.model,
        cwd: event.cwd,
        permissionMode: event.permissionMode,
        availableTools: event.tools ?? [],
        mcpServers: (event.mcp_servers ?? []).map((s) => ({
          name: s.name,
          status: s.status
        }))
      };
      this.sessionStartTs = Date.now();
      return;
    }

    if (isSystemRetry(event)) {
      this.retries.push({
        // Offset is wall-clock time since init was consumed; 0 if init has not arrived yet.
        offsetMs: this.sessionStartTs === null ? 0 : Date.now() - this.sessionStartTs,
        raw: event
      });
      return;
    }

    if (isAssistantMessage(event)) {
      this.handleAssistantMessage(event);
      return;
    }

    if (isUserMessage(event)) {
      this.handleUserMessage(event);
      return;
    }

    if (isResult(event)) {
      this.sawResultEvent = true;
      // Coerce so a missing `is_error` is stored as `false`, not `undefined`.
      this.resultIsError = Boolean(event.is_error);
      this.finalUsage = event.usage ?? null;
      this.finalCostUsd = event.total_cost_usd ?? 0;
      this.finalDurationMs = event.duration_ms ?? 0;
      this.finalNumTurns = event.num_turns ?? 0;
      this.finalResultText = event.result ?? "";
      return;
    }
    // Unknown event: ignored. See class doc.
  }

  /**
   * Finalize the view. Call after consuming the last event from the stream.
   *
   * Throws if no `system/init` was observed — at that point we have no model,
   * no session id, and no available-tools list, which means assertions like
   * "called any mcp__api__* tool" can't even be evaluated meaningfully.
   *
   * The returned `toolCalls` and `turns` arrays are the builder's own arrays,
   * not copies.
   *
   * @returns {object} The trajectory view.
   * @throws {Error} When no `system/init` event has been consumed.
   */
  build() {
    if (this.meta === null) throw new Error("TrajectoryBuilder.build() called before any system/init event was observed. The harness may have failed to start, or the stream was truncated before init.");

    const lastTurn = this.turns[this.turns.length - 1];
    // Text accumulated turn-by-turn takes precedence; the result event's
    // `result` text is used only when no assistant text was collected.
    const accumulatedText = this.turns
      .map((t) => t.text)
      .filter((t) => t.length > 0)
      .join("\n\n")
      .trim();

    return {
      meta: this.meta,
      toolCalls: this.allToolCalls,
      turns: this.turns,
      finalResponse: accumulatedText || this.finalResultText,
      finalStopReason: lastTurn?.stopReason ?? null,
      usage: {
        inputTokens: this.finalUsage?.input_tokens ?? 0,
        outputTokens: this.finalUsage?.output_tokens ?? 0,
        totalCostUsd: this.finalCostUsd,
        durationMs: this.finalDurationMs,
        // Falls back to the observed turn count when no (non-zero) count was reported.
        numTurns: this.finalNumTurns || this.turns.length
      },
      retries: this.retries,
      // Successful = saw a non-error result event.
      success: this.sawResultEvent && !this.resultIsError
    };
  }

  /**
   * Record one assistant turn: join its text blocks and register each
   * `tool_use` block as a pending ToolCall.
   *
   * A turn is recorded even when `message` or `message.content` is missing or
   * not an array; such a turn has empty text and no tool calls.
   *
   * @param {object} event - An `assistant` stream event.
   * @returns {void}
   */
  handleAssistantMessage(event) {
    const turnIndex = this.turns.length;
    const textChunks = [];
    const toolCallsThisTurn = [];
    const rawContent = event.message?.content;
    const blocks = Array.isArray(rawContent) ? rawContent : [];

    for (const block of blocks) {
      // Skip holes / primitives so the type guards never see a non-object.
      if (block === null || typeof block !== "object") continue;

      if (isTextBlock(block)) {
        textChunks.push(block.text);
        continue;
      }

      if (isToolUseBlock(block)) {
        const call = {
          name: block.name,
          // A block without a string name has no namespace to extract.
          namespace: typeof block.name === "string" ? namespaceOf(block.name) : null,
          callId: block.id,
          args: block.input,
          result: null,
          isError: false,
          turnIndex,
          callIndex: this.allToolCalls.length
        };
        this.allToolCalls.push(call);
        this.pendingCalls.set(block.id, call);
        toolCallsThisTurn.push(call);
        continue;
      }
      // Any other block type in an assistant message is dropped.
    }

    this.turns.push({
      turnIndex,
      text: textChunks.join("").trim(),
      toolCalls: toolCallsThisTurn,
      stopReason: event.message?.stop_reason ?? null
    });
  }

  /**
   * Deliver `tool_result` blocks from a user event to their pending calls.
   *
   * String content (a plain prompt) and missing content carry no tool
   * results and are ignored. Results whose `tool_use_id` has no pending call
   * are dropped rather than treated as an error.
   *
   * @param {object} event - A `user` stream event.
   * @returns {void}
   */
  handleUserMessage(event) {
    const content = event.message?.content;
    if (!Array.isArray(content)) return;

    for (const block of content) {
      if (block === null || typeof block !== "object") continue;
      if (!isToolResultBlock(block)) continue;

      const call = this.pendingCalls.get(block.tool_use_id);
      if (!call) continue;

      // Mutate in place: `allToolCalls` and the owning turn share this object.
      call.result = block.content;
      call.isError = block.is_error ?? false;
      this.pendingCalls.delete(block.tool_use_id);
    }
  }
};
214
/**
 * Convenience wrapper: feed every event from an iterable into a brand-new
 * {@link TrajectoryBuilder} and return the finished view.
 *
 * Use this when the whole event stream is available and only the final view
 * matters. For incremental use (e.g. showing partial state in a UI), create a
 * {@link TrajectoryBuilder} yourself and drive `consume()` / `build()` directly.
 *
 * @param {AsyncIterable<object>} events - Stream events, in stream order.
 * @returns {Promise<object>} The view returned by the builder's `build()`.
 */
async function buildTrajectory(events) {
  const trajectory = new TrajectoryBuilder();
  for await (const streamEvent of events) {
    trajectory.consume(streamEvent);
  }
  return trajectory.build();
}
227
+ //#endregion
228
+ //#region src/adapters/types.ts
229
/**
 * Thrown when the harness fails to produce a usable trajectory.
 *
 * Most commonly this means the process failed before emitting a usable
 * session init event. Inspect `diagnostics.stderr` for the cause.
 *
 * `diagnostics` is always an object: when the thrower passes nothing (or
 * null) it defaults to `{}`, so handlers can read `err.diagnostics.stderr`
 * without a secondary TypeError masking the original failure.
 */
var AdapterError = class extends Error {
  /** Partial process diagnostics supplied by the thrower (possibly empty). */
  diagnostics;

  /**
   * @param {string} message - Human-readable failure description.
   * @param {object} [diagnostics] - Whatever process diagnostics are available.
   * @param {{ cause?: unknown }} [options] - Standard `Error` options; use
   *   `cause` to chain the underlying error.
   */
  constructor(message, diagnostics, options) {
    super(message, options);
    this.diagnostics = diagnostics ?? {};
    this.name = "AdapterError";
  }
};
243
+ //#endregion
244
+ export { isAssistantMessage as a, isSystemRetry as c, isToolUseBlock as d, isUserMessage as f, namespaceOf as i, isTextBlock as l, TrajectoryBuilder as n, isResult as o, buildTrajectory as r, isSystemInit as s, AdapterError as t, isToolResultBlock as u };
245
+
246
+ //# sourceMappingURL=types-Bac8_Ixb.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types-Bac8_Ixb.js","names":[],"sources":["../src/types/stream.ts","../src/types/trajectory.ts","../src/trajectory/builder.ts","../src/adapters/types.ts"],"sourcesContent":["/**\n * Discriminated union of events emitted by Claude Code's\n * `--output-format stream-json` mode.\n *\n * The format is NDJSON (one JSON object per line on stdout). Each line has\n * a required `type` field and often a `subtype` for further disambiguation.\n *\n * Source notes: the stream-json schema is not formally documented as of mid-2026.\n * These types are derived from:\n * - https://code.claude.com/docs/en/headless\n * - https://github.com/anthropics/claude-code/issues/24612 (event-types tracking issue)\n * - https://takopi.dev/reference/runners/claude/stream-json-cheatsheet/\n * - The `@anthropic-ai/claude-agent-sdk` TypeScript declaration files,\n * which are the de-facto source of truth.\n *\n * When adding new event types, prefer extending the union here rather than\n * branching on `any` in callers. Unknown events should be tolerated silently\n * by the builder (the schema evolves and we don't want CI to break on a new\n * event type we haven't modelled yet).\n */\n\n/** Top-level discriminated union of stream-json events. */\nexport type StreamEvent =\n | SystemInitEvent\n | SystemRetryEvent\n | SystemPluginInstallEvent\n | SystemCompactBoundaryEvent\n | SystemUnknownEvent\n | AssistantMessageEvent\n | UserMessageEvent\n | ResultEvent;\n\n// system events\n\n/** Emitted once at session start. Carries the session-level metadata. */\nexport interface SystemInitEvent {\n type: \"system\";\n subtype: \"init\";\n session_id: string;\n cwd: string;\n model: string;\n permissionMode?: string;\n apiKeySource?: string;\n /** Names of tools available in the session (built-in + MCP). */\n tools: string[];\n /** MCP servers configured for this session, with connection status. 
*/\n mcp_servers: McpServerStatus[];\n}\n\nexport interface McpServerStatus {\n name: string;\n status: \"connected\" | \"disconnected\" | \"error\" | string;\n}\n\n/** Emitted when the API rate-limits us or otherwise asks for a retry. */\nexport interface SystemRetryEvent {\n type: \"system\";\n subtype: \"api_retry\";\n session_id: string;\n /** Implementation-defined retry payload (delay, reason, etc). */\n [key: string]: unknown;\n}\n\n/** Emitted while marketplace plugins are installing pre-session. */\nexport interface SystemPluginInstallEvent {\n type: \"system\";\n subtype: \"plugin_install\";\n session_id: string;\n [key: string]: unknown;\n}\n\n/** Emitted when Claude Code compacts the context window mid-session. */\nexport interface SystemCompactBoundaryEvent {\n type: \"system\";\n subtype: \"compact_boundary\";\n session_id: string;\n [key: string]: unknown;\n}\n\n/**\n * Catch-all for `type: \"system\"` events we haven't modelled.\n *\n * Keeps the union exhaustive while tolerating schema evolution. Callers should\n * either explicitly handle a known subtype or fall through to ignore.\n */\nexport interface SystemUnknownEvent {\n type: \"system\";\n subtype: string;\n session_id?: string;\n [key: string]: unknown;\n}\n\n// conversational events\n\n/** One assistant turn. The `message` field mirrors the Anthropic Messages API shape. */\nexport interface AssistantMessageEvent {\n type: \"assistant\";\n session_id: string;\n message: AssistantMessage;\n}\n\nexport interface AssistantMessage {\n id: string;\n type: \"message\";\n role: \"assistant\";\n content: ContentBlock[];\n model?: string;\n stop_reason?: StopReason | null;\n usage?: Usage;\n}\n\n/**\n * A user-role message in the stream.\n *\n * In stream-json these are usually *synthetic* — the harness injects them to\n * feed tool results back into the conversation after dispatching a tool. 
The\n * very first user message (the prompt) is also emitted here for completeness.\n */\nexport interface UserMessageEvent {\n type: \"user\";\n session_id: string;\n message: UserMessage;\n}\n\nexport interface UserMessage {\n role: \"user\";\n /** String for the initial prompt, array of blocks when carrying tool results. */\n content: ContentBlock[] | string;\n}\n\n// content blocks\n\nexport type ContentBlock = TextBlock | ToolUseBlock | ToolResultBlock;\n\nexport interface TextBlock {\n type: \"text\";\n text: string;\n}\n\nexport interface ToolUseBlock {\n type: \"tool_use\";\n /** Unique id assigned by the model; used to match tool_result back to this call. */\n id: string;\n /** Tool name. MCP tools follow the convention `mcp__<server>__<tool>`. */\n name: string;\n /** Arguments the model passed. Schema is per-tool. */\n input: unknown;\n}\n\nexport interface ToolResultBlock {\n type: \"tool_result\";\n /** The id of the corresponding tool_use block. */\n tool_use_id: string;\n /** Tool output. May be plain text or further content blocks for richer tools. */\n content: string | ContentBlock[];\n is_error?: boolean;\n}\n\n// result envelope\n\n/** Emitted once at session end. Carries aggregate usage and cost. */\nexport interface ResultEvent {\n type: \"result\";\n subtype: \"success\" | \"error\";\n session_id: string;\n total_cost_usd: number;\n is_error: boolean;\n duration_ms: number;\n duration_api_ms?: number;\n num_turns: number;\n /** The final text the harness returned, if any. */\n result?: string;\n usage?: Usage;\n}\n\n// shared scalars\n\n/**\n * Reasons the model can stop a turn. 
Open-ended string union because new\n * stop reasons appear over time.\n */\nexport type StopReason =\n | \"end_turn\"\n | \"tool_use\"\n | \"max_tokens\"\n | \"stop_sequence\"\n | (string & {});\n\nexport interface Usage {\n input_tokens: number;\n output_tokens: number;\n cache_creation_input_tokens?: number;\n cache_read_input_tokens?: number;\n}\n\n// type guards\n\n/** Type guards. Prefer these over manual `e.type === \"...\"` checks at call sites. */\n\nexport function isSystemInit(e: StreamEvent): e is SystemInitEvent {\n return e.type === \"system\" && (e as SystemInitEvent).subtype === \"init\";\n}\n\nexport function isSystemRetry(e: StreamEvent): e is SystemRetryEvent {\n return e.type === \"system\" && (e as SystemRetryEvent).subtype === \"api_retry\";\n}\n\nexport function isAssistantMessage(e: StreamEvent): e is AssistantMessageEvent {\n return e.type === \"assistant\";\n}\n\nexport function isUserMessage(e: StreamEvent): e is UserMessageEvent {\n return e.type === \"user\";\n}\n\nexport function isResult(e: StreamEvent): e is ResultEvent {\n return e.type === \"result\";\n}\n\nexport function isTextBlock(b: ContentBlock): b is TextBlock {\n return b.type === \"text\";\n}\n\nexport function isToolUseBlock(b: ContentBlock): b is ToolUseBlock {\n return b.type === \"tool_use\";\n}\n\nexport function isToolResultBlock(b: ContentBlock): b is ToolResultBlock {\n return b.type === \"tool_result\";\n}\n","/**\n * TrajectoryView — the assertion-friendly projection of a Claude Code session.\n *\n * The view is derived from the stream of {@link StreamEvent} values produced by\n * the harness, but is optimized for the queries that the assertion DSL needs to\n * express:\n *\n * - did tool X get called? (look at `toolCalls`)\n * - did tool A come before tool B? (compare `turnIndex` / `callIndex`)\n * - was a tool called with arguments matching predicate P? (`toolCalls[i].args`)\n * - did the agent answer without using any tool? 
(`toolCalls.length === 0`)\n *\n * The view is reconstructable from the raw events (lossless w.r.t. assertions),\n * but operating on it directly is dramatically simpler than walking event\n * streams or OTel span trees.\n *\n * Design notes:\n * - `turnIndex` and `callIndex` are the right primitives for ordering.\n * Wall-clock timestamps from the stream are unreliable for sub-second\n * ordering and parallel tool dispatch.\n * - Parallel tool calls (multiple `tool_use` blocks in one assistant message)\n * share a `turnIndex` but have distinct `callIndex` values in emission order.\n * - `namespace` is precomputed so assertions like `called(pattern: \"mcp__api__*\")`\n * can do a cheap string check.\n */\n\nimport type { StopReason } from \"./stream\";\n\nexport interface TrajectoryView {\n /** Session metadata, captured from the `system/init` event. */\n meta: SessionMeta;\n\n /** Every tool call, in global emission order. */\n toolCalls: ToolCall[];\n\n /** Each assistant turn: text content + any tool calls emitted in that turn. */\n turns: AssistantTurn[];\n\n /** All assistant text concatenated across turns. Useful for `response_contains`. */\n finalResponse: string;\n\n /** Stop reason of the *last* assistant turn. */\n finalStopReason: StopReason | null;\n\n /** Aggregate usage and cost from the result event. */\n usage: UsageSummary;\n\n /** Retry events observed during the run (rate limits, transient errors). */\n retries: RetryRecord[];\n\n /** Whether the result envelope indicated success. */\n success: boolean;\n}\n\nexport interface SessionMeta {\n sessionId: string;\n model: string;\n cwd: string;\n permissionMode?: string;\n /** Tool names the harness reported as available at session start. */\n availableTools: string[];\n /** MCP servers configured for the session, with connection status. */\n mcpServers: { name: string; status: string }[];\n}\n\nexport interface ToolCall {\n /** Fully-qualified tool name, e.g. 
`\"mcp__api__search_skills\"` or `\"Bash\"`. */\n name: string;\n\n /**\n * Namespace prefix for MCP-style names (`\"mcp__api\"`), or null for built-ins.\n * Precomputed via {@link namespaceOf} for cheap pattern matching.\n */\n namespace: string | null;\n\n /** The `tool_use` block's `id`; matches a later `tool_result.tool_use_id`. */\n callId: string;\n\n /** Args the model emitted on this call. Tool-specific schema. */\n args: unknown;\n\n /** Tool result, or null if no result was observed (e.g. process killed). */\n result: unknown | null;\n\n /** Whether the tool reported an error in its result. */\n isError: boolean;\n\n /**\n * Which assistant turn produced this call. Parallel calls within a single\n * assistant message share a `turnIndex`.\n */\n turnIndex: number;\n\n /** Index in the global ordered tool-call sequence. */\n callIndex: number;\n}\n\nexport interface AssistantTurn {\n turnIndex: number;\n /** Text emitted in this turn (may be empty if turn was tool-only). */\n text: string;\n /** Tool calls emitted in this turn, in their block order. */\n toolCalls: ToolCall[];\n /** Stop reason reported by the model for this turn. */\n stopReason: StopReason | null;\n}\n\nexport interface UsageSummary {\n inputTokens: number;\n outputTokens: number;\n totalCostUsd: number;\n durationMs: number;\n numTurns: number;\n}\n\nexport interface RetryRecord {\n /** ms since session start (approximate; the stream doesn't include precise ts). */\n offsetMs: number;\n /** Raw payload from the `system/api_retry` event for diagnostics. */\n raw: unknown;\n}\n\n// helpers\n\n/**\n * Extract the MCP namespace prefix from a tool name.\n *\n * Claude Code formats MCP tool names as `mcp__<server>__<tool>`. The namespace\n * is the first two segments joined: `mcp__<server>`. 
Returns null for non-MCP\n * tool names (built-ins like `Bash`, `Read`, `Edit`).\n *\n * @example\n * namespaceOf(\"mcp__api__search_skills\") // \"mcp__api\"\n * namespaceOf(\"Bash\") // null\n */\nexport function namespaceOf(toolName: string): string | null {\n if (!toolName.startsWith(\"mcp__\")) return null;\n const parts = toolName.split(\"__\");\n if (parts.length < 3) return null;\n return `${parts[0]}__${parts[1]}`;\n}\n","/**\n * TrajectoryBuilder — consumes a stream of {@link StreamEvent} values and\n * produces a {@link TrajectoryView}.\n *\n * State machine: the builder is a small, tolerant state machine. Invariants:\n *\n * - Exactly one `system/init` event opens the session. The builder requires\n * it to be present before `build()`.\n * - Each `assistant` event begins a new turn. Text blocks accumulate into\n * the turn's text; `tool_use` blocks become `ToolCall` records.\n * - `user` events with `tool_result` blocks deliver tool results back. We\n * match them to pending calls by `tool_use_id`.\n * - One `result` event closes the session and carries aggregate usage.\n *\n * The builder is *tolerant of partial streams*: a process killed mid-run\n * produces a coherent (but flagged) view. Tool calls without matching results\n * keep `result: null`. The `success` flag reflects whether a successful result\n * event was actually observed.\n *\n * Why a class (not a reducer)?\n * The internal `pendingCalls` map is mutable by design — we modify ToolCall\n * objects in place when results arrive, so other parts of the view (which\n * hold references to the same objects) see the update for free. 
A reducer\n * would force a deep copy per result event, which is wasteful and would\n * complicate identity-based queries.\n */\n\nimport {\n isAssistantMessage,\n isResult,\n isSystemInit,\n isTextBlock,\n isToolResultBlock,\n isToolUseBlock,\n isUserMessage,\n type StreamEvent,\n type Usage,\n} from \"../types/stream\";\nimport {\n namespaceOf,\n type AssistantTurn,\n type RetryRecord,\n type SessionMeta,\n type ToolCall,\n type TrajectoryView,\n} from \"../types/trajectory\";\n\nexport class TrajectoryBuilder {\n private meta: SessionMeta | null = null;\n private sessionStartTs: number | null = null;\n\n private turns: AssistantTurn[] = [];\n private allToolCalls: ToolCall[] = [];\n\n /**\n * tool_use_id → ToolCall, for matching results back to calls.\n * Entries are removed once a result is observed.\n */\n private pendingCalls: Map<string, ToolCall> = new Map();\n\n private retries: RetryRecord[] = [];\n\n private finalUsage: Usage | null = null;\n private finalCostUsd = 0;\n private finalDurationMs = 0;\n private finalNumTurns = 0;\n private finalResultText = \"\";\n private sawResultEvent = false;\n private resultIsError = false;\n\n /**\n * Consume one event. Safe to call with events in stream order.\n *\n * Unknown event types are silently ignored — the schema evolves and we\n * don't want CI to break on a new event type we haven't modelled.\n */\n consume(event: StreamEvent): void {\n if (isSystemInit(event)) {\n this.meta = {\n sessionId: event.session_id,\n model: event.model,\n cwd: event.cwd,\n permissionMode: event.permissionMode,\n availableTools: event.tools ?? [],\n mcpServers: (event.mcp_servers ?? []).map((s) => ({\n name: s.name,\n status: s.status,\n })),\n };\n this.sessionStartTs = Date.now();\n return;\n }\n\n if (event.type === \"system\" && event.subtype === \"api_retry\") {\n this.retries.push({\n offsetMs: this.sessionStartTs ? 
Date.now() - this.sessionStartTs : 0,\n raw: event,\n });\n return;\n }\n\n if (isAssistantMessage(event)) {\n this.handleAssistantMessage(event);\n return;\n }\n\n if (isUserMessage(event)) {\n this.handleUserMessage(event);\n return;\n }\n\n if (isResult(event)) {\n this.sawResultEvent = true;\n this.resultIsError = event.is_error;\n this.finalUsage = event.usage ?? null;\n this.finalCostUsd = event.total_cost_usd ?? 0;\n this.finalDurationMs = event.duration_ms ?? 0;\n this.finalNumTurns = event.num_turns ?? 0;\n this.finalResultText = event.result ?? \"\";\n return;\n }\n\n // Unknown event: ignored. See class doc.\n }\n\n /**\n * Finalize the view. Call after consuming the last event from the stream.\n *\n * Throws if no `system/init` was observed — at that point we have no model,\n * no session id, and no available-tools list, which means assertions like\n * \"called any mcp__api__* tool\" can't even be evaluated meaningfully.\n */\n build(): TrajectoryView {\n if (this.meta === null) {\n throw new Error(\n \"TrajectoryBuilder.build() called before any system/init event was observed. \" +\n \"The harness may have failed to start, or the stream was truncated before init.\",\n );\n }\n\n const lastTurn = this.turns[this.turns.length - 1];\n\n // Prefer the assistant text we accumulated turn-by-turn over the\n // `result.result` field, because the latter is sometimes a summary\n // and the former is exactly what the model said.\n const accumulatedText = this.turns\n .map((t) => t.text)\n .filter((t) => t.length > 0)\n .join(\"\\n\\n\")\n .trim();\n\n return {\n meta: this.meta,\n toolCalls: this.allToolCalls,\n turns: this.turns,\n finalResponse: accumulatedText || this.finalResultText,\n finalStopReason: lastTurn?.stopReason ?? null,\n usage: {\n inputTokens: this.finalUsage?.input_tokens ?? 0,\n outputTokens: this.finalUsage?.output_tokens ?? 
0,\n totalCostUsd: this.finalCostUsd,\n durationMs: this.finalDurationMs,\n // Fall back to observed turn count if the result event was missing.\n numTurns: this.finalNumTurns || this.turns.length,\n },\n retries: this.retries,\n // Successful = saw a non-error result envelope. Streams that ended without\n // a result event are reported as unsuccessful regardless of tool outcomes.\n success: this.sawResultEvent && !this.resultIsError,\n };\n }\n\n // private handlers\n\n private handleAssistantMessage(\n event: Extract<StreamEvent, { type: \"assistant\" }>,\n ): void {\n const turnIndex = this.turns.length;\n const textChunks: string[] = [];\n const toolCallsThisTurn: ToolCall[] = [];\n\n for (const block of event.message.content) {\n if (isTextBlock(block)) {\n textChunks.push(block.text);\n continue;\n }\n if (isToolUseBlock(block)) {\n const call: ToolCall = {\n name: block.name,\n namespace: namespaceOf(block.name),\n callId: block.id,\n args: block.input,\n result: null,\n isError: false,\n turnIndex,\n callIndex: this.allToolCalls.length,\n };\n this.allToolCalls.push(call);\n this.pendingCalls.set(block.id, call);\n toolCallsThisTurn.push(call);\n continue;\n }\n // tool_result blocks don't appear in assistant messages — those arrive\n // via user messages. If one does appear, ignore it; we'd rather drop\n // an unexpected block than crash the eval.\n }\n\n this.turns.push({\n turnIndex,\n text: textChunks.join(\"\").trim(),\n toolCalls: toolCallsThisTurn,\n stopReason: event.message.stop_reason ?? null,\n });\n }\n\n private handleUserMessage(\n event: Extract<StreamEvent, { type: \"user\" }>,\n ): void {\n const content = event.message.content;\n\n // The very first user message carries the prompt as a plain string. 
We\n // already know the prompt (the caller passed it to the adapter), so we\n // ignore this case — there's nothing assertion-relevant in it.\n if (typeof content === \"string\") return;\n\n for (const block of content) {\n if (!isToolResultBlock(block)) continue;\n\n const call = this.pendingCalls.get(block.tool_use_id);\n if (!call) {\n // Unmatched result: ignore. Can happen if events arrive out of order\n // or the corresponding tool_use was emitted in an earlier run that\n // we're resuming. Either way, dropping is safer than throwing.\n continue;\n }\n\n call.result = block.content;\n call.isError = block.is_error ?? false;\n this.pendingCalls.delete(block.tool_use_id);\n }\n }\n}\n\n/**\n * Convenience: drain an async iterable of events through a fresh builder.\n *\n * Suitable when you have the full event stream and just want the view.\n * For interactive/incremental scenarios (e.g. surfacing partial state in a UI)\n * instantiate {@link TrajectoryBuilder} directly and call `consume()` /\n * `build()` yourself.\n */\nexport async function buildTrajectory(\n events: AsyncIterable<StreamEvent>,\n): Promise<TrajectoryView> {\n const builder = new TrajectoryBuilder();\n for await (const event of events) {\n builder.consume(event);\n }\n return builder.build();\n}\n","/**\n * Generic harness adapter contract.\n *\n * Every harness adapter produces a {@link TrajectoryView} plus process\n * diagnostics. The runner and assertion engine depend only on these types —\n * not on any specific harness implementation.\n */\n\nimport type { TrajectoryView } from \"../types/trajectory\";\n\n/** Base config every adapter must accept. */\nexport interface BaseAdapterConfig {\n prompt: string;\n model?: string;\n timeoutMs?: number;\n signal?: AbortSignal;\n env?: Record<string, string>;\n cwd?: string;\n}\n\n/** Suite-level config: generic fields plus adapter-specific nested blocks. 
*/\nexport type SuiteConfig = Partial<BaseAdapterConfig> & {\n /** Claude Code adapter options (when `adapter` is `claude-code`). */\n claudeCode?: Record<string, unknown>;\n /** Codex CLI adapter options (when `adapter` is `codex`). */\n codex?: Record<string, unknown>;\n};\n\n/** Generic harness adapter interface. */\nexport interface HarnessAdapter<\n TConfig extends BaseAdapterConfig = BaseAdapterConfig,\n> {\n readonly id: string;\n run(config: TConfig): Promise<AdapterResult>;\n}\n\n/** Successful adapter run. */\nexport interface AdapterResult {\n view: TrajectoryView;\n diagnostics: AdapterDiagnostics;\n}\n\n/** Process-level diagnostics from any adapter. */\nexport interface AdapterDiagnostics {\n exitCode: number | null;\n signal: NodeJS.Signals | null;\n stderr: string;\n parseErrors: ParseErrorRecord[];\n timedOut: boolean;\n durationMs: number;\n}\n\nexport interface ParseErrorRecord {\n line: string;\n error: string;\n}\n\n/**\n * Thrown when the harness fails to produce a usable trajectory.\n *\n * Most commonly this means the process failed before emitting a usable\n * session init event. 
Inspect `diagnostics.stderr` for the cause.\n */\nexport class AdapterError extends Error {\n constructor(\n message: string,\n public readonly diagnostics: Partial<AdapterDiagnostics>,\n ) {\n super(message);\n this.name = \"AdapterError\";\n }\n}\n"],"mappings":";;AAuMA,SAAgB,aAAa,GAAsC;CACjE,OAAO,EAAE,SAAS,YAAa,EAAsB,YAAY;AACnE;AAEA,SAAgB,cAAc,GAAuC;CACnE,OAAO,EAAE,SAAS,YAAa,EAAuB,YAAY;AACpE;AAEA,SAAgB,mBAAmB,GAA4C;CAC7E,OAAO,EAAE,SAAS;AACpB;AAEA,SAAgB,cAAc,GAAuC;CACnE,OAAO,EAAE,SAAS;AACpB;AAEA,SAAgB,SAAS,GAAkC;CACzD,OAAO,EAAE,SAAS;AACpB;AAEA,SAAgB,YAAY,GAAiC;CAC3D,OAAO,EAAE,SAAS;AACpB;AAEA,SAAgB,eAAe,GAAoC;CACjE,OAAO,EAAE,SAAS;AACpB;AAEA,SAAgB,kBAAkB,GAAuC;CACvE,OAAO,EAAE,SAAS;AACpB;;;;;;;;;;;;;;AC9FA,SAAgB,YAAY,UAAiC;CAC3D,IAAI,CAAC,SAAS,WAAW,OAAO,GAAG,OAAO;CAC1C,MAAM,QAAQ,SAAS,MAAM,IAAI;CACjC,IAAI,MAAM,SAAS,GAAG,OAAO;CAC7B,OAAO,GAAG,MAAM,GAAG,IAAI,MAAM;AAC/B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AC7FA,IAAa,oBAAb,MAA+B;CAC7B,OAAmC;CACnC,iBAAwC;CAExC,QAAiC,CAAC;CAClC,eAAmC,CAAC;;;;;CAMpC,+BAA8C,IAAI,IAAI;CAEtD,UAAiC,CAAC;CAElC,aAAmC;CACnC,eAAuB;CACvB,kBAA0B;CAC1B,gBAAwB;CACxB,kBAA0B;CAC1B,iBAAyB;CACzB,gBAAwB;;;;;;;CAQxB,QAAQ,OAA0B;EAChC,IAAI,aAAa,KAAK,GAAG;GACvB,KAAK,OAAO;IACV,WAAW,MAAM;IACjB,OAAO,MAAM;IACb,KAAK,MAAM;IACX,gBAAgB,MAAM;IACtB,gBAAgB,MAAM,SAAS,CAAC;IAChC,aAAa,MAAM,eAAe,CAAC,EAAA,CAAG,KAAK,OAAO;KAChD,MAAM,EAAE;KACR,QAAQ,EAAE;IACZ,EAAE;GACJ;GACA,KAAK,iBAAiB,KAAK,IAAI;GAC/B;EACF;EAEA,IAAI,MAAM,SAAS,YAAY,MAAM,YAAY,aAAa;GAC5D,KAAK,QAAQ,KAAK;IAChB,UAAU,KAAK,iBAAiB,KAAK,IAAI,IAAI,KAAK,iBAAiB;IACnE,KAAK;GACP,CAAC;GACD;EACF;EAEA,IAAI,mBAAmB,KAAK,GAAG;GAC7B,KAAK,uBAAuB,KAAK;GACjC;EACF;EAEA,IAAI,cAAc,KAAK,GAAG;GACxB,KAAK,kBAAkB,KAAK;GAC5B;EACF;EAEA,IAAI,SAAS,KAAK,GAAG;GACnB,KAAK,iBAAiB;GACtB,KAAK,gBAAgB,MAAM;GAC3B,KAAK,aAAa,MAAM,SAAS;GACjC,KAAK,eAAe,MAAM,kBAAkB;GAC5C,KAAK,kBAAkB,MAAM,eAAe;GAC5C,KAAK,gBAAgB,MAAM,aAAa;GACxC,KAAK,kBAAkB,MAAM,UAAU;GACvC;EACF;CAGF;;;;;;;;CASA,QAAwB;EACtB,IAAI,KAAK,SAAS,MAChB,MAAM,IAAI,MACR,4JAEF;EAGF,MAAM,WAAW,KAAK,MAAM,KAAK,MAAM,SAAS;EAKhD,
MAAM,kBAAkB,KAAK,MAC1B,KAAK,MAAM,EAAE,IAAI,CAAC,CAClB,QAAQ,MAAM,EAAE,SAAS,CAAC,CAAC,CAC3B,KAAK,MAAM,CAAC,CACZ,KAAK;EAER,OAAO;GACL,MAAM,KAAK;GACX,WAAW,KAAK;GAChB,OAAO,KAAK;GACZ,eAAe,mBAAmB,KAAK;GACvC,iBAAiB,UAAU,cAAc;GACzC,OAAO;IACL,aAAa,KAAK,YAAY,gBAAgB;IAC9C,cAAc,KAAK,YAAY,iBAAiB;IAChD,cAAc,KAAK;IACnB,YAAY,KAAK;IAEjB,UAAU,KAAK,iBAAiB,KAAK,MAAM;GAC7C;GACA,SAAS,KAAK;GAGd,SAAS,KAAK,kBAAkB,CAAC,KAAK;EACxC;CACF;CAIA,uBACE,OACM;EACN,MAAM,YAAY,KAAK,MAAM;EAC7B,MAAM,aAAuB,CAAC;EAC9B,MAAM,oBAAgC,CAAC;EAEvC,KAAK,MAAM,SAAS,MAAM,QAAQ,SAAS;GACzC,IAAI,YAAY,KAAK,GAAG;IACtB,WAAW,KAAK,MAAM,IAAI;IAC1B;GACF;GACA,IAAI,eAAe,KAAK,GAAG;IACzB,MAAM,OAAiB;KACrB,MAAM,MAAM;KACZ,WAAW,YAAY,MAAM,IAAI;KACjC,QAAQ,MAAM;KACd,MAAM,MAAM;KACZ,QAAQ;KACR,SAAS;KACT;KACA,WAAW,KAAK,aAAa;IAC/B;IACA,KAAK,aAAa,KAAK,IAAI;IAC3B,KAAK,aAAa,IAAI,MAAM,IAAI,IAAI;IACpC,kBAAkB,KAAK,IAAI;IAC3B;GACF;EAIF;EAEA,KAAK,MAAM,KAAK;GACd;GACA,MAAM,WAAW,KAAK,EAAE,CAAC,CAAC,KAAK;GAC/B,WAAW;GACX,YAAY,MAAM,QAAQ,eAAe;EAC3C,CAAC;CACH;CAEA,kBACE,OACM;EACN,MAAM,UAAU,MAAM,QAAQ;EAK9B,IAAI,OAAO,YAAY,UAAU;EAEjC,KAAK,MAAM,SAAS,SAAS;GAC3B,IAAI,CAAC,kBAAkB,KAAK,GAAG;GAE/B,MAAM,OAAO,KAAK,aAAa,IAAI,MAAM,WAAW;GACpD,IAAI,CAAC,MAIH;GAGF,KAAK,SAAS,MAAM;GACpB,KAAK,UAAU,MAAM,YAAY;GACjC,KAAK,aAAa,OAAO,MAAM,WAAW;EAC5C;CACF;AACF;;;;;;;;;AAUA,eAAsB,gBACpB,QACyB;CACzB,MAAM,UAAU,IAAI,kBAAkB;CACtC,WAAW,MAAM,SAAS,QACxB,QAAQ,QAAQ,KAAK;CAEvB,OAAO,QAAQ,MAAM;AACvB;;;;;;;;;ACpMA,IAAa,eAAb,cAAkC,MAAM;CAGpB;CAFlB,YACE,SACA,aACA;EACA,MAAM,OAAO;EAFG,KAAA,cAAA;EAGhB,KAAK,OAAO;CACd;AACF"}
@@ -0,0 +1,77 @@
1
+ import { i as BaseAdapterConfig, r as AdapterResult, x as StreamEvent } from "./types-C0gBkl0-.js";
2
+
3
+ //#region src/adapters/codex/types.d.ts
4
+ /** Codex sandbox policies (`codex exec --sandbox`). */
5
+ type CodexSandbox = "read-only" | "workspace-write" | "danger-full-access";
6
+ /** Codex approval modes (`--ask-for-approval`). */
7
+ type CodexAskForApproval = "untrusted" | "on-request" | "never";
8
+ /** Codex-specific options (nested under `codex` in YAML). */
9
+ interface CodexOptions {
10
+ binary?: string;
11
+ model?: string;
12
+ profile?: string;
13
+ sandbox?: CodexSandbox;
14
+ addDirs?: string[];
15
+ /** Inline `-c key=value` overrides (repeatable on CLI). */
16
+ configOverrides?: string[];
17
+ askForApproval?: CodexAskForApproval;
18
+ dangerouslyBypassApprovalsAndSandbox?: boolean;
19
+ dangerouslyBypassHookTrust?: boolean;
20
+ ephemeral?: boolean;
21
+ ignoreUserConfig?: boolean;
22
+ skipGitRepoCheck?: boolean;
23
+ outputSchema?: string;
24
+ outputLastMessage?: string;
25
+ /**
26
+ * When true (default), harness runs auto-generate a temp `--output-last-message`
27
+ * path and read it back as `finalResponse` if JSONL has no assistant_message.
28
+ */
29
+ captureLastMessage?: boolean;
30
+ /**
31
+ * When true, each run uses a fresh temp `$CODEX_HOME` for isolation.
32
+ * Default false — inherit caller's ~/.codex config and auth.
33
+ */
34
+ isolateConfig?: boolean;
35
+ }
36
+ /** Configuration for a single Codex harness run. */
37
+ interface CodexAdapterConfig extends BaseAdapterConfig, CodexOptions {}
38
+ /** Codex run result — includes mapped stream events for debugging. */
39
+ interface CodexAdapterResult extends AdapterResult {
40
+ rawEvents: StreamEvent[];
41
+ }
42
+ /** Raw Codex `--json` thread event (partial — tolerate unknown fields). */
43
+ interface CodexJsonEvent {
44
+ type?: string;
45
+ thread_id?: string;
46
+ usage?: CodexUsage;
47
+ item?: CodexItem;
48
+ message?: string;
49
+ }
50
+ /** Token usage on a Codex turn or thread event. */
51
+ interface CodexUsage {
52
+ input_tokens?: number;
53
+ cached_input_tokens?: number;
54
+ output_tokens?: number;
55
+ reasoning_output_tokens?: number;
56
+ }
57
+ /** One item in a Codex thread (tool call, command, or assistant message). */
58
+ interface CodexItem {
59
+ id?: string;
60
+ type?: string;
61
+ item_type?: string;
62
+ server?: string;
63
+ tool?: string;
64
+ arguments?: unknown;
65
+ command?: string;
66
+ exit_code?: number;
67
+ aggregated_output?: string;
68
+ text?: string;
69
+ result?: unknown;
70
+ error?: {
71
+ message?: string;
72
+ } | null;
73
+ status?: string;
74
+ }
75
+ //#endregion
76
+ export { CodexOptions as i, CodexAdapterResult as n, CodexJsonEvent as r, CodexAdapterConfig as t };
77
+ //# sourceMappingURL=types-Bu8uOZZN.d.ts.map
@@ -265,7 +265,8 @@ interface BaseAdapterConfig {
265
265
  }
266
266
  /** Suite-level config: generic fields plus adapter-specific nested blocks. */
267
267
  type SuiteConfig = Partial<BaseAdapterConfig> & {
268
- /** Claude Code adapter options (when `adapter` is `claude-code`). */claudeCode?: Record<string, unknown>;
268
+ /** Claude Code adapter options (when `adapter` is `claude-code`). */claudeCode?: Record<string, unknown>; /** Codex CLI adapter options (when `adapter` is `codex`). */
269
+ codex?: Record<string, unknown>;
269
270
  };
270
271
  /** Generic harness adapter interface. */
271
272
  interface HarnessAdapter<TConfig extends BaseAdapterConfig = BaseAdapterConfig> {
@@ -302,4 +303,4 @@ declare class AdapterError extends Error {
302
303
  }
303
304
  //#endregion
304
305
  export { Usage as A, isUserMessage as B, SystemInitEvent as C, TextBlock as D, SystemUnknownEvent as E, isSystemInit as F, isSystemRetry as I, isTextBlock as L, UserMessageEvent as M, isAssistantMessage as N, ToolResultBlock as O, isResult as P, isToolResultBlock as R, SystemCompactBoundaryEvent as S, SystemRetryEvent as T, ContentBlock as _, HarnessAdapter as a, StopReason as b, AssistantTurn as c, ToolCall as d, TrajectoryView as f, AssistantMessageEvent as g, AssistantMessage as h, BaseAdapterConfig as i, UserMessage as j, ToolUseBlock as k, RetryRecord as l, namespaceOf as m, AdapterError as n, ParseErrorRecord as o, UsageSummary as p, AdapterResult as r, SuiteConfig as s, AdapterDiagnostics as t, SessionMeta as u, McpServerStatus as v, SystemPluginInstallEvent as w, StreamEvent as x, ResultEvent as y, isToolUseBlock as z };
305
- //# sourceMappingURL=types-B9H4IZtA.d.ts.map
306
+ //# sourceMappingURL=types-C0gBkl0-.d.ts.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@alis-build/harness-eval",
3
- "version": "0.1.1",
3
+ "version": "0.1.3",
4
4
  "description": "Harness-level eval framework for measuring AI coding agent tool-selection behavior",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -27,6 +27,10 @@
27
27
  "types": "./dist/adapters/claude-code/index.d.ts",
28
28
  "import": "./dist/adapters/claude-code/index.js"
29
29
  },
30
+ "./adapters/codex": {
31
+ "types": "./dist/adapters/codex/index.d.ts",
32
+ "import": "./dist/adapters/codex/index.js"
33
+ },
30
34
  "./runner": {
31
35
  "types": "./dist/runner/suite.d.ts",
32
36
  "import": "./dist/runner/suite.js"
@@ -62,6 +66,7 @@
62
66
  "zod": "^4.4.3"
63
67
  },
64
68
  "devDependencies": {
69
+ "@google-cloud/aiplatform": "^6.8.1",
65
70
  "@types/node": "^22.12.0",
66
71
  "tsdown": "^0.22.3",
67
72
  "tsx": "^4.22.4",
@@ -71,5 +76,5 @@
71
76
  "publishConfig": {
72
77
  "access": "public"
73
78
  },
74
- "packageManager": "pnpm@11.3.0"
79
+ "packageManager": "pnpm@11.8.0"
75
80
  }
@@ -0,0 +1,196 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#InstancesJsonlRow",
4
+ "title": "InstancesJsonlRow",
5
+ "description": "Type-tagged JSONL row for Vertex EvaluateInstances batching.",
6
+ "type": "object",
7
+ "properties": {
8
+ "messageType": {
9
+ "$ref": "#/$defs/__schema0"
10
+ },
11
+ "caseId": {
12
+ "$ref": "#/$defs/__schema1"
13
+ },
14
+ "repetitionIndex": {
15
+ "$ref": "#/$defs/__schema2"
16
+ },
17
+ "instance": {
18
+ "$ref": "#/$defs/__schema3"
19
+ }
20
+ },
21
+ "required": [
22
+ "messageType",
23
+ "caseId",
24
+ "repetitionIndex",
25
+ "instance"
26
+ ],
27
+ "additionalProperties": false,
28
+ "$defs": {
29
+ "__schema0": {
30
+ "type": "string",
31
+ "description": "Vertex protobuf message type name."
32
+ },
33
+ "__schema1": {
34
+ "type": "string",
35
+ "description": "Test case id."
36
+ },
37
+ "__schema2": {
38
+ "type": "integer",
39
+ "minimum": -9007199254740991,
40
+ "maximum": 9007199254740991,
41
+ "description": "Repetition index."
42
+ },
43
+ "__schema3": {
44
+ "anyOf": [
45
+ {
46
+ "$ref": "#/$defs/TrajectoryPairInstanceJson"
47
+ },
48
+ {
49
+ "$ref": "#/$defs/TrajectorySingleToolUseInstanceJson"
50
+ },
51
+ {
52
+ "$ref": "#/$defs/EvaluationInstanceJson"
53
+ }
54
+ ],
55
+ "description": "Protojson instance payload."
56
+ },
57
+ "TrajectoryPairInstanceJson": {
58
+ "type": "object",
59
+ "properties": {
60
+ "predictedTrajectory": {
61
+ "$ref": "#/$defs/__schema4"
62
+ },
63
+ "referenceTrajectory": {
64
+ "$ref": "#/$defs/__schema8"
65
+ }
66
+ },
67
+ "required": [
68
+ "predictedTrajectory",
69
+ "referenceTrajectory"
70
+ ],
71
+ "additionalProperties": false,
72
+ "title": "TrajectoryPairInstanceJson",
73
+ "description": "Shared shape for Trajectory*Match/Precision/Recall instances."
74
+ },
75
+ "__schema4": {
76
+ "description": "Predicted tool-call trajectory.",
77
+ "$ref": "#/$defs/ProtojsonTrajectory"
78
+ },
79
+ "__schema5": {
80
+ "type": "array",
81
+ "items": {
82
+ "$ref": "#/$defs/ProtojsonToolCall"
83
+ },
84
+ "description": "Ordered tool calls in the trajectory."
85
+ },
86
+ "ProtojsonToolCall": {
87
+ "type": "object",
88
+ "properties": {
89
+ "toolName": {
90
+ "$ref": "#/$defs/__schema6"
91
+ },
92
+ "toolInput": {
93
+ "$ref": "#/$defs/__schema7"
94
+ }
95
+ },
96
+ "required": [
97
+ "toolName",
98
+ "toolInput"
99
+ ],
100
+ "additionalProperties": false,
101
+ "title": "ProtojsonToolCall",
102
+ "description": "Tool call in Vertex EvaluationService wire format."
103
+ },
104
+ "__schema6": {
105
+ "type": "string",
106
+ "description": "Tool name as emitted by the agent."
107
+ },
108
+ "__schema7": {
109
+ "type": "string",
110
+ "description": "JSON-serialized tool arguments (Vertex wire format)."
111
+ },
112
+ "ProtojsonTrajectory": {
113
+ "type": "object",
114
+ "properties": {
115
+ "toolCalls": {
116
+ "$ref": "#/$defs/__schema5"
117
+ }
118
+ },
119
+ "required": [
120
+ "toolCalls"
121
+ ],
122
+ "additionalProperties": false,
123
+ "title": "ProtojsonTrajectory",
124
+ "description": "Vertex Trajectory message wire format."
125
+ },
126
+ "__schema8": {
127
+ "description": "Reference tool-call trajectory.",
128
+ "$ref": "#/$defs/ProtojsonTrajectory"
129
+ },
130
+ "TrajectorySingleToolUseInstanceJson": {
131
+ "type": "object",
132
+ "properties": {
133
+ "predictedTrajectory": {
134
+ "$ref": "#/$defs/__schema9"
135
+ }
136
+ },
137
+ "required": [
138
+ "predictedTrajectory"
139
+ ],
140
+ "additionalProperties": false,
141
+ "title": "TrajectorySingleToolUseInstanceJson",
142
+ "description": "Vertex TrajectorySingleToolUseInstance wire format."
143
+ },
144
+ "__schema9": {
145
+ "description": "Predicted tool-call trajectory.",
146
+ "$ref": "#/$defs/ProtojsonTrajectory"
147
+ },
148
+ "EvaluationInstanceJson": {
149
+ "type": "object",
150
+ "properties": {
151
+ "prompt": {
152
+ "$ref": "#/$defs/__schema10"
153
+ },
154
+ "response": {
155
+ "$ref": "#/$defs/__schema13"
156
+ },
157
+ "reference": {
158
+ "$ref": "#/$defs/__schema14"
159
+ }
160
+ },
161
+ "additionalProperties": false,
162
+ "title": "EvaluationInstanceJson",
163
+ "description": "Vertex EvaluationInstance wire format (agentEvalData omitted in v1)."
164
+ },
165
+ "__schema10": {
166
+ "description": "Eval prompt.",
167
+ "$ref": "#/$defs/InstanceData"
168
+ },
169
+ "InstanceData": {
170
+ "type": "object",
171
+ "properties": {
172
+ "text": {
173
+ "$ref": "#/$defs/__schema11"
174
+ }
175
+ },
176
+ "additionalProperties": false,
177
+ "title": "InstanceData",
178
+ "description": "EvaluationInstance prompt/response/reference text wrapper."
179
+ },
180
+ "__schema11": {
181
+ "description": "Plain text instance data.",
182
+ "$ref": "#/$defs/__schema12"
183
+ },
184
+ "__schema12": {
185
+ "type": "string"
186
+ },
187
+ "__schema13": {
188
+ "description": "Final agent response.",
189
+ "$ref": "#/$defs/InstanceData"
190
+ },
191
+ "__schema14": {
192
+ "description": "Reference answer text.",
193
+ "$ref": "#/$defs/InstanceData"
194
+ }
195
+ }
196
+ }