@alis-build/harness-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +700 -0
- package/dist/adapters/claude-code/index.d.ts +3 -0
- package/dist/adapters/claude-code/index.js +2 -0
- package/dist/build-DsVJ_UeU.js +1396 -0
- package/dist/build-DsVJ_UeU.js.map +1 -0
- package/dist/cardinality-DlE44e-4.js +31 -0
- package/dist/cardinality-DlE44e-4.js.map +1 -0
- package/dist/claude-code-ycT0JQZF.js +563 -0
- package/dist/claude-code-ycT0JQZF.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +623 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/config/loader.d.ts +2 -0
- package/dist/config/loader.js +2 -0
- package/dist/index-6Z17eKZx.d.ts +72 -0
- package/dist/index.d.ts +725 -0
- package/dist/index.js +5 -0
- package/dist/loader-BCnFJ8rm.js +717 -0
- package/dist/loader-BCnFJ8rm.js.map +1 -0
- package/dist/loader-DTvoVfN0.d.ts +33 -0
- package/dist/rolldown-runtime-D7D4PA-g.js +13 -0
- package/dist/runner/suite.d.ts +2 -0
- package/dist/runner/suite.js +2 -0
- package/dist/suite-BoOvK_lq.d.ts +7 -0
- package/dist/suite-chj0j22j.js +684 -0
- package/dist/suite-chj0j22j.js.map +1 -0
- package/dist/types-B9H4IZtA.d.ts +305 -0
- package/dist/types-BQol062t.d.ts +292 -0
- package/package.json +74 -0
- package/schemas/eval-interchange-agent-trace.schema.json +322 -0
- package/schemas/eval-interchange-proto-instance.schema.json +106 -0
- package/schemas/eval-interchange.schema.json +140 -0
- package/schemas/eval-run-envelope.schema.json +2195 -0
- package/schemas/trajectory-view.schema.json +441 -0
|
@@ -0,0 +1,1396 @@
|
|
|
1
|
+
import { i as buildJudgeArgs } from "./claude-code-ycT0JQZF.js";
|
|
2
|
+
import { n as createLimit } from "./suite-chj0j22j.js";
|
|
3
|
+
import { spawn } from "node:child_process";
|
|
4
|
+
import { readFile } from "node:fs/promises";
|
|
5
|
+
import { parse } from "yaml";
|
|
6
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
7
|
+
//#region src/types/eval-record.ts
|
|
8
|
+
/** Schema version for {@link EvalRunEnvelope} JSON documents. */
|
|
9
|
+
const EVAL_RUN_SCHEMA_VERSION = "1.0";
|
|
10
|
+
/** Schema version embedded in each {@link TrajectoryView} at export time. */
|
|
11
|
+
const TRAJECTORY_SCHEMA_VERSION = "1.0";
|
|
12
|
+
//#endregion
|
|
13
|
+
//#region src/otel/attributes.ts
|
|
14
|
+
/**
 * Build a key/value attribute whose value is carried as `stringValue`.
 *
 * @param {string} key - Attribute name.
 * @param {string} value - Attribute value, stored unchanged.
 * @returns {{key: string, value: {stringValue: string}}} The attribute record.
 */
function strAttr(key, value) {
  const wrapped = { stringValue: value };
  return { key, value: wrapped };
}
|
|
20
|
+
/**
 * Build a key/value attribute whose value is carried as `intValue`.
 * The number is emitted in its `String(...)` form rather than as a JSON number.
 *
 * @param {string} key - Attribute name.
 * @param {number} value - Integer to encode.
 * @returns {{key: string, value: {intValue: string}}} The attribute record.
 */
function intAttr(key, value) {
  const encoded = String(value);
  return { key, value: { intValue: encoded } };
}
|
|
26
|
+
/**
 * Build a key/value attribute whose value is carried as `boolValue`.
 *
 * @param {string} key - Attribute name.
 * @param {boolean} value - Flag to store, passed through unchanged.
 * @returns {{key: string, value: {boolValue: boolean}}} The attribute record.
 */
function boolAttr(key, value) {
  const wrapped = { boolValue: value };
  return { key, value: wrapped };
}
|
|
32
|
+
/**
 * Build a key/value attribute whose `stringValue` is the JSON serialization
 * of an arbitrary value.
 *
 * @param {string} key - Attribute name.
 * @param {unknown} value - Value to serialize with `JSON.stringify`.
 * @returns {{key: string, value: {stringValue: string}}} The attribute record.
 */
function jsonAttr(key, value) {
  const serialized = JSON.stringify(value);
  return { key, value: { stringValue: serialized } };
}
|
|
38
|
+
//#endregion
|
|
39
|
+
//#region src/otel/messages.ts
|
|
40
|
+
/**
 * Translate a turn's stop reason into the finish-reason vocabulary used in
 * the exported messages. Unknown reasons pass through unchanged.
 *
 * @param {string|null|undefined} reason - Stop reason reported for the turn.
 * @returns {string|undefined} Mapped reason, or `undefined` when none was given.
 */
function mapStopReason(reason) {
  if (!reason) return undefined;
  // A Map (not a plain object) so names like "toString" are never matched by accident.
  const finishReasons = new Map([
    ["end_turn", "stop"],
    ["tool_use", "tool_calls"],
    ["max_tokens", "length"],
    ["stop_sequence", "stop"],
  ]);
  return finishReasons.has(reason) ? finishReasons.get(reason) : reason;
}
|
|
50
|
+
/**
 * Convert a recorded tool call into a `tool_call` message part.
 *
 * @param {{callId: string, name: string, args?: unknown}} call - Recorded tool call.
 * @returns {{type: "tool_call", id: string, name: string, arguments: unknown}}
 *   The message part; nullish `args` become an empty object.
 */
function toolCallPart(call) {
  const { callId, name, args } = call;
  return {
    type: "tool_call",
    id: callId,
    name,
    arguments: args ?? {},
  };
}
|
|
58
|
+
/**
 * Convert a recorded tool call into a `tool_call_response` message part
 * carrying the call's result.
 *
 * @param {{callId: string, result: unknown}} call - Recorded tool call.
 * @returns {{type: "tool_call_response", id: string, result: unknown}} The message part.
 */
function toolResponsePart(call) {
  const { callId, result } = call;
  return { type: "tool_call_response", id: callId, result };
}
|
|
65
|
+
/**
 * Build the assistant message for one turn: an optional text part, then one
 * `tool_call` part per tool call, plus `finish_reason` when the turn has a
 * stop reason.
 *
 * @param {{text?: string, toolCalls: Iterable<object>, stopReason?: string}} turn - Turn to convert.
 * @returns {{role: "assistant", parts: object[], finish_reason?: string}} The message.
 */
function assistantMessageFromTurn(turn) {
  const textParts = turn.text ? [{ type: "text", content: turn.text }] : [];
  const callParts = [...turn.toolCalls].map((call) => toolCallPart(call));
  const message = { role: "assistant", parts: [...textParts, ...callParts] };
  const finishReason = mapStopReason(turn.stopReason);
  // Only attach finish_reason when a (mapped) reason exists.
  if (finishReason) message.finish_reason = finishReason;
  return message;
}
|
|
79
|
+
/**
 * Build the `tool` role message holding the responses for a turn's tool calls.
 * Calls whose `result` is `null` are left out.
 *
 * @param {Array<{callId: string, result: unknown}>} calls - Tool calls of one turn.
 * @returns {{role: "tool", parts: object[]}|null} The message, or `null` when no call has a result.
 */
function toolResultsMessage(calls) {
  const responses = [];
  calls.forEach((call) => {
    if (call.result === null) return;
    responses.push(toolResponsePart(call));
  });
  return responses.length === 0 ? null : { role: "tool", parts: responses };
}
|
|
87
|
+
/**
 * Input history before the assistant turn at `turnIndex`.
 *
 * The history starts with the user prompt (when one is given), followed by
 * the assistant message and the tool-results message (if any) of each turn
 * at positions `0 .. turnIndex - 1` in `view.turns`. Missing entries are skipped.
 *
 * @param {{turns: object[]}} view - Trajectory view that owns the turns.
 * @param {number} turnIndex - Position of the turn whose inputs are wanted.
 * @param {string} [prompt] - Original user prompt.
 * @returns {object[]} Messages in chronological order.
 */
function inputMessagesBeforeTurn(view, turnIndex, prompt) {
  const history = prompt
    ? [{ role: "user", parts: [{ type: "text", content: prompt }] }]
    : [];
  let position = 0;
  while (position < turnIndex) {
    const priorTurn = view.turns[position];
    position += 1;
    if (!priorTurn) continue;
    history.push(assistantMessageFromTurn(priorTurn));
    const toolResults = toolResultsMessage(priorTurn.toolCalls);
    if (toolResults) history.push(toolResults);
  }
  return history;
}
|
|
108
|
+
//#endregion
|
|
109
|
+
//#region src/otel/types.ts
|
|
110
|
+
/** OTLP span kinds (enum integers). */
|
|
111
|
+
const SpanKind = {
|
|
112
|
+
INTERNAL: 1,
|
|
113
|
+
CLIENT: 2
|
|
114
|
+
};
|
|
115
|
+
/** OTLP status codes. */
|
|
116
|
+
const StatusCode = {
|
|
117
|
+
UNSET: 0,
|
|
118
|
+
OK: 1,
|
|
119
|
+
ERROR: 2
|
|
120
|
+
};
|
|
121
|
+
//#endregion
|
|
122
|
+
//#region src/otel/emitter.ts
|
|
123
|
+
/**
|
|
124
|
+
* TrajectoryView → OTLP JSON export using OpenTelemetry GenAI semantic conventions.
|
|
125
|
+
*
|
|
126
|
+
* Produces an `ExportTraceServiceRequest` suitable for OTLP/HTTP JSON ingestion.
|
|
127
|
+
* Assertions continue to use {@link TrajectoryView} directly; this is export-only.
|
|
128
|
+
*/
|
|
129
|
+
const INSTRUMENTATION_VERSION = "0.1.0";
|
|
130
|
+
/**
 * Map a {@link TrajectoryView} to OTLP trace JSON.
 *
 * Span tree (siblings under `invoke_agent`, not nested):
 * ```
 * invoke_agent
 * ├── chat {model}
 * ├── execute_tool {name}
 * ├── chat {model}
 * └── ...
 * ```
 *
 * Trace and span ids are derived from the session id, so the same view always
 * yields the same ids. Timestamps are synthetic: the run is assumed to end at
 * `options.endTimeMs` (default: now) and to have lasted `view.usage.durationMs`
 * (at least 1 ms); that window is split evenly by `buildSpanTimings`. All tool
 * calls of one turn share a single slot.
 *
 * @param {object} view - Trajectory view to export.
 * @param {object} [options] - Optional overrides: `agentName`, `providerName`,
 *   `serviceName`, `instrumentationScope`, `endTimeMs`, `prompt`.
 * @returns {{resourceSpans: object[]}} Object with a single resource/scope holding all spans.
 */
function trajectoryToOtlp(view, options = {}) {
  const agentName = options.agentName ?? "claude-code";
  const providerName = options.providerName ?? "anthropic";
  const serviceName = options.serviceName ?? "harness-eval";
  const scopeName = options.instrumentationScope ?? "@alis-build/harness-eval";
  const { sessionId, model } = view.meta;

  const traceId = traceIdFromSession(sessionId);
  const rootSpanId = spanIdFromKey(traceId, "invoke_agent");
  const endMs = options.endTimeMs ?? Date.now();
  const startMs = endMs - Math.max(view.usage.durationMs, 1);
  const rootStartNs = msToNs(startMs);
  const rootEndNs = msToNs(endMs);
  const timings = buildSpanTimings(view, startMs, endMs);

  const rootSpan = {
    traceId,
    spanId: rootSpanId,
    name: "invoke_agent",
    kind: SpanKind.INTERNAL,
    startTimeUnixNano: rootStartNs,
    endTimeUnixNano: rootEndNs,
    attributes: [
      strAttr("gen_ai.operation.name", "invoke_agent"),
      strAttr("gen_ai.agent.name", agentName),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.conversation.id", sessionId),
      strAttr("gen_ai.request.model", model),
      strAttr("gen_ai.response.model", model),
      intAttr("gen_ai.usage.input_tokens", view.usage.inputTokens),
      intAttr("gen_ai.usage.output_tokens", view.usage.outputTokens),
      boolAttr("harness_eval.success", view.success),
    ],
    status: viewStatus(view),
  };

  // One `chat` span per turn, parented directly to the root span.
  const chatSpanFor = (turn, timing) => {
    const spanId = spanIdFromKey(traceId, `chat:${turn.turnIndex}`);
    const inputMessages = inputMessagesBeforeTurn(view, turn.turnIndex, options.prompt);
    const outputMessages = [assistantMessageFromTurn(turn)];
    const attributes = [
      strAttr("gen_ai.operation.name", "chat"),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.request.model", model),
      strAttr("gen_ai.response.model", model),
    ];
    if (inputMessages.length > 0) attributes.push(jsonAttr("gen_ai.input.messages", inputMessages));
    attributes.push(jsonAttr("gen_ai.output.messages", outputMessages));
    if (turn.stopReason) {
      const finishReason = mapStopReason(turn.stopReason) ?? turn.stopReason;
      attributes.push(jsonAttr("gen_ai.response.finish_reasons", [finishReason]));
    }
    return {
      traceId,
      spanId,
      parentSpanId: rootSpanId,
      name: `chat ${model}`,
      kind: SpanKind.CLIENT,
      startTimeUnixNano: timing.startNs,
      endTimeUnixNano: timing.endNs,
      attributes,
      status: { code: StatusCode.OK },
    };
  };

  // One `execute_tool` span per tool call, also parented to the root span.
  const toolSpanFor = (call, timing) => {
    const spanId = spanIdFromKey(traceId, `tool:${call.callId}`);
    const attributes = [
      strAttr("gen_ai.operation.name", "execute_tool"),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.tool.name", call.name),
      strAttr("gen_ai.tool.call.id", call.callId),
      jsonAttr("gen_ai.tool.call.arguments", call.args ?? {}),
    ];
    if (call.result !== null) attributes.push(jsonAttr("gen_ai.tool.call.result", call.result));
    if (call.namespace) attributes.push(strAttr("harness_eval.tool.namespace", call.namespace));
    attributes.push(boolAttr("harness_eval.tool.is_error", call.isError));
    const status = call.isError
      ? { code: StatusCode.ERROR, message: "tool reported error" }
      : { code: StatusCode.OK };
    return {
      traceId,
      spanId,
      parentSpanId: rootSpanId,
      name: `execute_tool ${call.name}`,
      kind: SpanKind.INTERNAL,
      startTimeUnixNano: timing.startNs,
      endTimeUnixNano: timing.endNs,
      attributes,
      status,
    };
  };

  const spans = [rootSpan];
  let slot = 0;
  for (const turn of view.turns) {
    spans.push(chatSpanFor(turn, timings[slot]));
    slot += 1;
    if (turn.toolCalls.length === 0) continue;
    // A turn with tool calls consumes one extra slot shared by all of its calls.
    const sharedToolTiming = timings[slot];
    slot += 1;
    for (const call of turn.toolCalls) spans.push(toolSpanFor(call, sharedToolTiming));
  }

  const resource = {
    attributes: [strAttr("service.name", serviceName), strAttr("gen_ai.agent.name", agentName)],
  };
  const scope = { name: scopeName, version: INSTRUMENTATION_VERSION };
  return { resourceSpans: [{ resource, scopeSpans: [{ scope, spans }] }] };
}
|
|
241
|
+
/** Alias matching the implementation plan naming. */
|
|
242
|
+
const emitOtel = trajectoryToOtlp;
|
|
243
|
+
/**
 * Derive the root span status from the view's overall success flag.
 *
 * @param {{success: boolean}} view - Trajectory view.
 * @returns {{code: number, message?: string}} OK status, or ERROR with a fixed message.
 */
function viewStatus(view) {
  return view.success
    ? { code: StatusCode.OK }
    : { code: StatusCode.ERROR, message: "harness run did not complete successfully" };
}
|
|
250
|
+
/**
 * Split the `[startMs, endMs]` window into equal, back-to-back slots: one per
 * turn, plus one more for every turn that made tool calls.
 *
 * @param {{turns: Array<{toolCalls: unknown[]}>}} view - Trajectory view.
 * @param {number} startMs - Window start, in milliseconds.
 * @param {number} endMs - Window end, in milliseconds.
 * @returns {Array<{startNs: string, endNs: string}>} Slot boundaries in slot
 *   order, formatted by `msToNs`; empty when the view has no turns.
 */
function buildSpanTimings(view, startMs, endMs) {
  let slotCount = 0;
  for (const turn of view.turns) {
    slotCount += turn.toolCalls.length > 0 ? 2 : 1;
  }
  if (slotCount === 0) return [];

  // The window is treated as at least 1 ms wide so every slot has some width.
  const slotWidthMs = Math.max(endMs - startMs, 1) / slotCount;
  const timings = [];
  let cursor = startMs;
  for (let i = 0; i < slotCount; i += 1) {
    const next = cursor + slotWidthMs;
    timings.push({ startNs: msToNs(cursor), endNs: msToNs(next) });
    cursor = next;
  }
  return timings;
}
|
|
271
|
+
/**
 * Derive a deterministic trace id from a session id: the first 32 hex
 * characters (upper-cased) of a SHA-256 over a namespaced session string.
 *
 * @param {string} sessionId - Session identifier of the run.
 * @returns {string} 32-character upper-case hex trace id.
 */
function traceIdFromSession(sessionId) {
  const hasher = createHash("sha256");
  hasher.update(`harness-eval:trace:${sessionId}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 32).toUpperCase();
}
|
|
274
|
+
/**
 * Derive a deterministic span id from the trace id and a span key: the first
 * 16 hex characters (upper-cased) of a SHA-256 over both.
 *
 * @param {string} traceId - Trace id the span belongs to.
 * @param {string} key - Key naming the span within the trace (e.g. `chat:0`).
 * @returns {string} 16-character upper-case hex span id.
 */
function spanIdFromKey(traceId, key) {
  const hasher = createHash("sha256");
  hasher.update(`${traceId}:span:${key}`);
  const hex = hasher.digest("hex");
  return hex.slice(0, 16).toUpperCase();
}
|
|
277
|
+
/**
 * Convert milliseconds to a nanosecond count rendered as a string.
 *
 * @param {number} ms - Time in milliseconds (may be fractional).
 * @returns {string} `ms * 1e6`, rounded to an integer, in `String(...)` form.
 */
function msToNs(ms) {
  const nanos = Math.round(ms * 1e6);
  return String(nanos);
}
|
|
280
|
+
//#endregion
|
|
281
|
+
//#region src/grader/prompt.ts
|
|
282
|
+
/**
 * Assemble the prompt sent to the judge model.
 *
 * The prompt is the optional (trimmed) system instruction, fixed grading
 * rules, the eval prompt, the execution transcript, the numbered expectations
 * and the required JSON output schema, in that order.
 *
 * NOTE(review): the schema sample lines are reproduced as they appear in this
 * view; any leading indentation they carry in the original source file is not
 * visible here — confirm against the original before merging.
 *
 * @param {{expectations: string[], prompt: string, transcript: string, systemInstruction?: string}} input
 *   Grading request.
 * @returns {string} Complete grader prompt.
 */
function buildGraderPrompt(input) {
  const numbered = input.expectations.map((expectation, index) => `${index + 1}. ${expectation}`);
  const preamble = input.systemInstruction ? `${input.systemInstruction.trim()}\n\n` : "";
  const sections = [
    "You are an automated evaluation grader (not the agent under test). Your only job is to score expectations against the transcript below.",
    "",
    "Your job is to evaluate each expectation against the transcript and final response.",
    "PASS only when there is clear evidence in the transcript or final response.",
    "When uncertain, FAIL — burden of proof is on PASS.",
    "",
    "Also critique the expectations themselves if any are trivially satisfied or miss important outcomes.",
    "",
    "## Eval prompt",
    "",
    `${input.prompt}`,
    "",
    "## Execution transcript",
    "",
    `${input.transcript}`,
    "",
    "## Expectations to grade",
    "",
    numbered.join("\n"),
    "",
    "## Output format",
    "",
    "Respond with ONLY a single JSON object (no markdown fences, no commentary) matching this schema:",
    "",
    "{",
    '"expectations": [',
    '{ "text": "<original expectation>", "passed": true|false, "evidence": "<quote or description>" }',
    "],",
    '"summary": { "passed": <int>, "failed": <int>, "total": <int>, "pass_rate": <0.0-1.0> },',
    '"eval_feedback": {',
    '"suggestions": [{ "assertion": "<optional>", "reason": "<string>" }],',
    '"overall": "<brief assessment>"',
    "}",
    "}",
    "",
    "Include every expectation in the same order. summary must match the expectations array.",
  ];
  return preamble + sections.join("\n");
}
|
|
321
|
+
//#endregion
|
|
322
|
+
//#region src/grader/parse.ts
|
|
323
|
+
/**
 * Pull the judge's reply text out of the CLI's stdout.
 *
 * Handles a JSON array of events, a single `result` event, or a single
 * `assistant` event. Anything else — including stdout that is not JSON at
 * all — is returned as the trimmed raw text.
 *
 * @param {string} stdout - Raw stdout captured from the judge process.
 * @returns {string} Extracted reply text, or `""` for blank stdout.
 */
function extractClaudeResponseText(stdout) {
  const raw = stdout.trim();
  if (raw === "") return "";
  try {
    const payload = JSON.parse(raw);
    if (Array.isArray(payload)) return extractFromEventArray(payload) ?? raw;
    const isObject = typeof payload === "object" && payload !== null;
    if (isObject && payload.type === "result" && typeof payload.result === "string") {
      return payload.result;
    }
    if (isObject && payload.type === "assistant" && payload.message) {
      const text = textFromAssistantMessage(payload.message);
      if (text) return text;
    }
  } catch {
    // Deliberate best-effort: any failure here falls through to the raw text.
  }
  return raw;
}
|
|
340
|
+
/**
 * Extract the reply text from an array of judge events.
 *
 * Prefers the `result` field of the first `result` event; otherwise falls
 * back to the text of the last `assistant` event that has any.
 *
 * The `result` field is only used when it is a non-empty string. The
 * single-event path in `extractClaudeResponseText` applies the same string
 * check, and downstream parsing calls `.trim()` on the returned value, so a
 * non-string payload must not be returned from here.
 *
 * @param {unknown[]} events - Parsed event array.
 * @returns {string|null} Reply text, or `null` when none can be found.
 */
function extractFromEventArray(events) {
  const isEventOfType = (event, type) =>
    typeof event === "object" && event !== null && event.type === type;

  const resultEvent = events.find((event) => isEventOfType(event, "result"));
  if (typeof resultEvent?.result === "string" && resultEvent.result) {
    return resultEvent.result;
  }

  let lastAssistantText = null;
  for (const event of events) {
    if (!isEventOfType(event, "assistant")) continue;
    const text = textFromAssistantMessage(event.message);
    if (text) lastAssistantText = text;
  }
  return lastAssistantText;
}
|
|
351
|
+
/**
 * Collect the text of an assistant message.
 *
 * @param {unknown} message - Message whose `content` is either a string or an
 *   array of content blocks.
 * @returns {string|null} The string content, or the `text` of every
 *   `type: "text"` block joined with newlines; `null` when there is no text.
 */
function textFromAssistantMessage(message) {
  if (!message || typeof message !== "object") return null;
  const { content } = message;
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return null;

  const isTextBlock = (block) =>
    typeof block === "object" && block !== null && block.type === "text" && typeof block.text === "string";
  const pieces = content.filter(isTextBlock).map((block) => block.text);
  return pieces.length > 0 ? pieces.join("\n") : null;
}
|
|
360
|
+
/**
 * Parse the judge's reply into a normalized grading result.
 *
 * Tries the whole trimmed reply first, then the JSON block extracted from it.
 * A candidate is accepted only if it parses and yields at least one expectation.
 *
 * @param {string} text - Judge reply text.
 * @returns {object|null} Normalized grading result, or `null` if nothing usable was found.
 */
function parseGraderJson(text) {
  for (const candidate of [text.trim(), extractJsonBlock(text)]) {
    if (!candidate) continue;
    let normalized;
    try {
      normalized = normalizeGraderJson(JSON.parse(candidate));
    } catch {
      // Not valid JSON, or not the expected shape — try the next candidate.
      continue;
    }
    if (normalized.expectations.length > 0) return normalized;
  }
  return null;
}
|
|
373
|
+
/**
 * Locate a JSON-looking block inside free-form text.
 *
 * Prefers the contents of the first Markdown code fence (optionally tagged
 * `json`); otherwise returns the span from the first `{` to the last `}`.
 *
 * @param {string} text - Text to search.
 * @returns {string|null} Candidate JSON text, or `null` if none was found.
 */
function extractJsonBlock(text) {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(text);
  const fencedBody = fenced?.[1];
  if (fencedBody) return fencedBody.trim();

  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  return open >= 0 && close > open ? text.slice(open, close + 1) : null;
}
|
|
381
|
+
/**
 * Normalize the judge's parsed JSON into the internal grading shape.
 *
 * Missing `text`/`evidence` default to `""`. Summary fields reported by the
 * judge take precedence; anything missing is computed from the expectations.
 *
 * The `passed` verdict is coerced conservatively: a string counts as a pass
 * only when it is `"true"` (ignoring case and surrounding whitespace). A plain
 * `Boolean(...)` would turn the string `"false"` into a pass, which contradicts
 * the grader's "when uncertain, FAIL" rule.
 *
 * @param {object} raw - Parsed judge output.
 * @returns {{expectations: Array<{text: string, passed: boolean, evidence: string}>,
 *   summary: {passed: number, failed: number, total: number, passRate: number},
 *   evalFeedback: ({suggestions: Array<{assertion: unknown, reason: string}>, overall: string}|undefined)}}
 *   Normalized grading result.
 * @throws {TypeError} When `raw` does not have the expected shape (callers treat this as unparseable).
 */
function normalizeGraderJson(raw) {
  const toVerdict = (value) => {
    if (typeof value === "string") return value.trim().toLowerCase() === "true";
    return Boolean(value);
  };

  const expectations = (raw.expectations ?? []).map((e) => ({
    text: e.text ?? "",
    passed: toVerdict(e.passed),
    evidence: e.evidence ?? "",
  }));

  const total = expectations.length;
  const passed = expectations.filter((e) => e.passed).length;
  const failed = total - passed;
  const passRate = raw.summary?.pass_rate ?? raw.summary?.passRate ?? (total === 0 ? 0 : passed / total);
  const summary = {
    passed: raw.summary?.passed ?? passed,
    failed: raw.summary?.failed ?? failed,
    total: raw.summary?.total ?? total,
    passRate,
  };

  let evalFeedback;
  if (raw.eval_feedback) {
    evalFeedback = {
      suggestions: (raw.eval_feedback.suggestions ?? []).map((s) => ({
        assertion: s.assertion,
        reason: s.reason ?? "",
      })),
      overall: raw.eval_feedback.overall ?? "",
    };
  }

  return { expectations, summary, evalFeedback };
}
|
|
411
|
+
//#endregion
|
|
412
|
+
//#region src/grader/claude-grader.ts
|
|
413
|
+
/**
|
|
414
|
+
* Grade expectations by spawning Claude as judge (skill-creator grader pattern).
|
|
415
|
+
*/
|
|
416
|
+
const DEFAULT_TIMEOUT_MS = 3e5;
|
|
417
|
+
/**
 * Judge subprocess defaults — grading is a single-shot JSON response, not an agent session.
 * Without these, Claude Code may load plugins/MCP and loop on tools until timeout.
 */
const JUDGE_CLAUDE_DEFAULTS = {
  maxTurns: 1,
  bare: true,
  disableSlashCommands: true,
  noSessionPersistence: true,
};

/**
 * Layer caller-supplied Claude Code options over the judge defaults.
 * Keys present in `claudeCode` win over the defaults.
 *
 * @param {object} [claudeCode] - Caller overrides; may be omitted.
 * @returns {object} A new options object; neither input is modified.
 */
function mergeJudgeClaudeOptions(claudeCode) {
  return Object.assign({}, JUDGE_CLAUDE_DEFAULTS, claudeCode);
}
|
|
433
|
+
/**
 * Create a grading function bound to a fixed set of judge options.
 *
 * @param {object} [options] - Options passed to `runClaudeGrader` on every call.
 * @returns {(input: object) => Promise<object>} Function that grades one input.
 */
function createClaudeGrader(options = {}) {
  return function gradeWithClaude(input) {
    return runClaudeGrader(input, options);
  };
}
|
|
436
|
+
/**
 * Grade one set of expectations by running the Claude CLI as a judge.
 *
 * Builds the grader prompt, spawns the judge binary, extracts and parses its
 * reply, then aligns the verdicts with `input.expectations` by position.
 * Expectations the judge did not return are recorded as failed. The summary
 * is always recomputed from the aligned verdicts.
 *
 * @param {{expectations: string[], prompt: string, transcript: string, systemInstruction?: string}} input
 *   Grading request.
 * @param {object} [options] - `binary`, `model`, `timeoutMs`, `env`, `cwd`, `claudeCode`.
 * @returns {Promise<object>} Grading result; when the reply cannot be parsed,
 *   every expectation fails and `error` describes the problem.
 */
async function runClaudeGrader(input, options = {}) {
  const { claudeCode } = options;
  const binary = options.binary ?? claudeCode?.binary ?? "claude";
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const prompt = buildGraderPrompt(input);
  const model = options.model ?? claudeCode?.model;

  const judgeArgs = buildJudgeArgs(prompt, { ...mergeJudgeClaudeOptions(claudeCode), model });
  const stdout = await spawnCollectStdout(binary, judgeArgs, timeoutMs, options.env, options.cwd);
  const responseText = extractClaudeResponseText(stdout);
  const parsed = parseGraderJson(responseText);

  if (!parsed) {
    const count = input.expectations.length;
    return {
      expectations: input.expectations.map((text) => ({
        text,
        passed: false,
        evidence: "Grader returned unparseable output",
      })),
      summary: { passed: 0, failed: count, total: count, passRate: 0 },
      error: `failed to parse grader JSON from response: ${responseText.slice(0, 200)}`,
    };
  }

  // Verdicts are matched to the requested expectations by index.
  const expectations = input.expectations.map((text, index) => {
    const verdict = parsed.expectations[index];
    return {
      text,
      passed: verdict?.passed ?? false,
      evidence: verdict?.evidence ?? "No evidence returned",
    };
  });
  const total = expectations.length;
  const passed = expectations.filter((entry) => entry.passed).length;
  return {
    expectations,
    summary: {
      passed,
      failed: total - passed,
      total,
      passRate: total === 0 ? 0 : passed / total,
    },
    evalFeedback: parsed.evalFeedback,
  };
}
|
|
481
|
+
/**
 * Run a child process and resolve with everything it wrote to stdout.
 *
 * stdin is ignored; stdout and stderr are captured as UTF-8. The promise
 * rejects when the process cannot be spawned, when it exits non-zero without
 * producing any stdout, or when `timeoutMs` elapses (the child is then sent
 * SIGTERM). A non-zero exit that did produce stdout still resolves.
 *
 * @param {string} binary - Executable to run.
 * @param {string[]} args - Command-line arguments.
 * @param {number} timeoutMs - Time budget in milliseconds.
 * @param {object} [extraEnv] - Extra environment variables for the child.
 * @param {string} [cwd] - Working directory for the child.
 * @returns {Promise<string>} Concatenated stdout.
 */
function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
  return new Promise((resolve, reject) => {
    const proc = spawn(binary, args, {
      env: buildChildEnv(extraEnv),
      cwd,
      stdio: ["ignore", "pipe", "pipe"],
    });

    const stdoutPieces = [];
    proc.stdout?.setEncoding("utf8");
    proc.stdout?.on("data", (piece) => stdoutPieces.push(piece));

    const stderrPieces = [];
    proc.stderr?.setEncoding("utf8");
    proc.stderr?.on("data", (piece) => stderrPieces.push(piece));

    const watchdog = setTimeout(() => {
      proc.kill("SIGTERM");
      const stderrHint = stderrPieces.join("").trim().slice(0, 400);
      const suffix = stderrHint ? ` (stderr: ${stderrHint})` : "";
      reject(new Error(`grader timed out after ${timeoutMs}ms` + suffix));
    }, timeoutMs);

    const settle = (failure) => {
      clearTimeout(watchdog);
      if (failure) {
        reject(failure);
        return;
      }
      resolve(stdoutPieces.join(""));
    };

    proc.on("error", (failure) => settle(failure));
    proc.on("close", (code) => {
      const exitedWithoutOutput = code !== 0 && stdoutPieces.length === 0;
      if (exitedWithoutOutput) {
        settle(new Error(`grader exited ${code}: ${stderrPieces.join("").slice(0, 500)}`));
      } else {
        settle();
      }
    });
  });
}
|
|
515
|
+
/**
 * Build the environment for the judge subprocess: the current process
 * environment overlaid with `extraEnv`, with `CLAUDECODE` removed.
 *
 * @param {object} [extraEnv] - Extra variables; they win over inherited ones.
 * @returns {object} A fresh environment object for the child.
 */
function buildChildEnv(extraEnv) {
  // CLAUDECODE is dropped even when the caller supplies it in extraEnv.
  const { CLAUDECODE: _dropped, ...childEnv } = { ...process.env, ...extraEnv };
  return childEnv;
}
|
|
523
|
+
//#endregion
|
|
524
|
+
//#region src/grader/expectations.ts
|
|
525
|
+
/**
 * Load expectations sidecar (YAML or JSON).
 *
 * The file must contain a mapping from case id to a list of expectations.
 * Files whose name ends in `.json` (any case) are parsed as JSON; everything
 * else is parsed as YAML. List items are converted to strings.
 *
 * A top-level array is rejected explicitly: `typeof [] === "object"`, so
 * without the check a list-of-lists file would be accepted with array indices
 * ("0", "1", …) as case ids. The result is built with `Object.fromEntries` so
 * a case id such as `__proto__` becomes an ordinary own key instead of
 * triggering the prototype setter.
 *
 * @param {string} path - Path to the expectations file.
 * @returns {Promise<Record<string, string[]>>} Expectations keyed by case id.
 * @throws {Error} When the document is not a mapping or a case's value is not an array.
 */
async function loadExpectationsMap(path) {
  const text = await readFile(path, "utf8");
  const isJson = path.trim().toLowerCase().endsWith(".json");
  const raw = isJson ? JSON.parse(text) : parse(text);
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
    throw new Error(`expectations file must be an object mapping case ids to lists`);
  }
  return Object.fromEntries(
    Object.entries(raw).map(([caseId, value]) => {
      if (!Array.isArray(value)) {
        throw new Error(`expectations for case "${caseId}" must be an array of strings`);
      }
      return [caseId, value.map(String)];
    }),
  );
}
|
|
542
|
+
//#endregion
|
|
543
|
+
//#region src/grader/transcript.ts
|
|
544
|
+
const MAX_RESULT_CHARS = 4e3;
|
|
545
|
+
/**
 * Render a trajectory view as a Markdown-style transcript for the judge.
 *
 * Sections, in order: the user prompt (if any); one section per assistant
 * turn with its text, tool calls, tool results and stop reason; the final
 * response when no turn's text equals it; and a session metadata footer.
 *
 * @param {object} view - Trajectory view to render.
 * @param {string} [prompt] - Original user prompt.
 * @returns {string} Transcript text with trailing whitespace removed.
 */
function trajectoryToTranscript(view, prompt) {
  const out = [];
  if (prompt) out.push("## User prompt", "", prompt, "");

  for (const turn of view.turns) {
    out.push(`## Assistant turn ${turn.turnIndex + 1}`, "");
    if (turn.text) out.push(turn.text, "");

    for (const call of turn.toolCalls) {
      out.push(`[Tool call] ${call.name} (id=${call.callId})`);
      out.push(`Arguments: ${formatJson$1(call.args)}`);
      if (call.result === null) {
        out.push("[Tool result] (none observed)");
      } else {
        out.push(`[Tool result] ${formatResult(call.result)}`);
        if (call.isError) out.push("(tool reported error)");
      }
      out.push("");
    }

    if (turn.stopReason) out.push(`Stop reason: ${turn.stopReason}`, "");
  }

  // Avoid repeating the final response when a turn already carries it verbatim.
  const alreadyShown = view.turns.some((turn) => turn.text === view.finalResponse);
  if (view.finalResponse && !alreadyShown) {
    out.push("## Final response", "", view.finalResponse, "");
  }

  const metadata = [
    "## Session metadata",
    `session_id: ${view.meta.sessionId}`,
    `model: ${view.meta.model}`,
    `cwd: ${view.meta.cwd}`,
    `success: ${view.success}`,
    `tool_calls: ${view.toolCalls.length}`,
    `duration_ms: ${view.usage.durationMs}`,
    `input_tokens: ${view.usage.inputTokens}`,
    `output_tokens: ${view.usage.outputTokens}`,
  ];
  out.push(...metadata);
  return out.join("\n").trimEnd();
}
|
|
567
|
+
/**
 * Render a value as compact JSON for the transcript, always returning a string.
 *
 * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
 * functions and symbols, and throws on circular structures or BigInt. Both
 * cases fall back to `String(value)` so callers that measure or slice the
 * result never receive a non-string.
 *
 * @param {unknown} value - Value to render.
 * @returns {string} JSON text, or the `String(...)` form when JSON is not possible.
 */
function formatJson$1(value) {
  try {
    return JSON.stringify(value) ?? String(value);
  } catch {
    return String(value);
  }
}
|
|
574
|
+
/**
 * Render a tool result for the transcript: strings are used as-is, anything
 * else is serialized via `formatJson$1`; the text is then length-capped.
 *
 * @param {unknown} result - Tool result.
 * @returns {string} Possibly truncated text.
 */
function formatResult(result) {
  const text = typeof result === "string" ? result : formatJson$1(result);
  return truncate(text);
}
|
|
578
|
+
/**
 * Cap text at `MAX_RESULT_CHARS` characters, appending a truncation marker
 * when anything was cut.
 *
 * @param {string} text - Text to cap.
 * @returns {string} The original text, or its prefix followed by `… (truncated)`.
 */
function truncate(text) {
  const overLimit = text.length > MAX_RESULT_CHARS;
  return overLimit ? `${text.slice(0, MAX_RESULT_CHARS)}… (truncated)` : text;
}
|
|
582
|
+
//#endregion
|
|
583
|
+
//#region src/grader/grade-report.ts
|
|
584
|
+
/**
 * Order grading results by case id, then cell label, then repetition index.
 * The repetition index is compared numerically so rep 10 sorts after rep 2.
 */
function compareGradeResults(a, b) {
  return (
    String(a.caseId).localeCompare(String(b.caseId)) ||
    String(a.cellLabel).localeCompare(String(b.cellLabel)) ||
    a.repetitionIndex - b.repetitionIndex
  );
}

/**
 * Grade a harness-eval SuiteReport with outcome expectations (LLM judge).
 *
 * Every repetition that has an adapter result and at least one expectation
 * (from the cell itself, else from the expectations sidecar) is graded, with
 * at most `options.maxConcurrent` (default 2) judge calls in flight.
 * A failing judge call does not abort the run: that repetition is recorded
 * with all expectations failed and the error message in `graderError`.
 *
 * Progress events (`grade-start`, `grade-complete`, `grade-done`) are sent to
 * `options.onProgress`. The `grade-complete` event is emitted after the
 * result is built and outside the judge `try`, so an exception thrown by the
 * callback propagates instead of being recorded as a grader failure.
 *
 * @param {{cells: object[]}} report - Suite report to grade.
 * @param {object} [options] - `expectationsPath`, `gradeFn`, `binary`, `model`,
 *   `timeoutMs`, `env`, `cwd`, `claudeCode`, `maxConcurrent`,
 *   `systemInstruction`, `onProgress`, `sourceReport`, `gradingConfigPath`.
 * @returns {Promise<object>} Grading report with per-repetition results and an overall summary.
 */
async function gradeReport(report, options = {}) {
  const expectationsMap = options.expectationsPath ? await loadExpectationsMap(options.expectationsPath) : {};
  const gradeFn = options.gradeFn ?? createClaudeGrader({
    binary: options.binary,
    model: options.model,
    timeoutMs: options.timeoutMs,
    env: options.env,
    cwd: options.cwd,
    claudeCode: options.claudeCode,
  });
  const limit = createLimit(options.maxConcurrent ?? 2);

  const tasks = [];
  for (const cell of report.cells) {
    // Expectations declared on the cell win over the sidecar file.
    const expectations = cell.expectations ?? expectationsMap[cell.caseId] ?? [];
    if (expectations.length === 0) continue;
    for (const rep of cell.repetitions) {
      if (!rep.adapterResult) continue;
      tasks.push({ cell, rep, expectations });
    }
  }

  const gradeStartTs = Date.now();
  options.onProgress?.({ kind: "grade-start", total: tasks.length });

  const gradeOne = async ({ cell, rep, expectations }) => {
    const start = Date.now();
    const view = rep.adapterResult.view;
    const prompt = cell.prompt ?? "";
    const transcript = trajectoryToTranscript(view, prompt);

    let result;
    try {
      const graded = await gradeFn({
        prompt,
        transcript,
        expectations,
        systemInstruction: options.systemInstruction,
      });
      result = {
        caseId: cell.caseId,
        cellLabel: cell.cell.label,
        repetitionIndex: rep.repetitionIndex,
        prompt,
        expectations: graded.expectations,
        summary: graded.summary,
        evalFeedback: graded.evalFeedback,
        graderError: graded.error,
        durationMs: Date.now() - start,
      };
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      result = {
        caseId: cell.caseId,
        cellLabel: cell.cell.label,
        repetitionIndex: rep.repetitionIndex,
        prompt,
        expectations: expectations.map((text) => ({ text, passed: false, evidence: message })),
        summary: {
          passed: 0,
          failed: expectations.length,
          total: expectations.length,
          passRate: 0,
        },
        graderError: message,
        durationMs: Date.now() - start,
      };
    }

    options.onProgress?.({
      kind: "grade-complete",
      caseId: result.caseId,
      cellLabel: result.cellLabel,
      repetitionIndex: result.repetitionIndex,
      passed: result.summary.passed,
      failed: result.summary.failed,
      durationMs: result.durationMs,
      graderError: result.graderError,
    });
    return result;
  };

  const results = await Promise.all(tasks.map((task) => limit(() => gradeOne(task))));
  results.sort(compareGradeResults);

  const totalExpectations = results.reduce((n, r) => n + r.summary.total, 0);
  const passedExpectations = results.reduce((n, r) => n + r.summary.passed, 0);
  options.onProgress?.({
    kind: "grade-done",
    durationMs: Date.now() - gradeStartTs,
    totalExpectations,
    passedExpectations,
  });

  return {
    gradedAt: new Date().toISOString(),
    sourceReport: options.sourceReport ?? "",
    gradingConfigPath: options.gradingConfigPath,
    results,
    summary: {
      passed: passedExpectations,
      failed: totalExpectations - passedExpectations,
      total: totalExpectations,
      passRate: totalExpectations === 0 ? 0 : passedExpectations / totalExpectations,
    },
  };
}
|
|
710
|
+
/**
 * Read a suite report from disk and parse it as JSON.
 *
 * @param {string} path - Path to the report file (read as UTF-8).
 * @returns {Promise<object>} Parsed report; the shape is not validated here.
 */
async function loadSuiteReport(path) {
  return JSON.parse(await readFile(path, "utf8"));
}
|
|
714
|
+
//#endregion
|
|
715
|
+
//#region src/grader/resolve-grade-options.ts
|
|
716
|
+
/**
 * Merge standalone grading YAML with CLI flags (CLI wins).
 *
 * `binary` and `model` are resolved to top-level options and blanked out in
 * the returned `claudeCode` object, so each has a single source of truth.
 *
 * @param {object} [fileConfig] - Parsed grading config; its `judge` section is used.
 * @param {object} [cli] - CLI overrides: `binary`, `model`, `timeoutMs`,
 *   `maxConcurrent`, `sourceReport`, `expectationsPath`.
 * @param {string} [configPath] - Path of the grading config, recorded in the result.
 * @returns {object} Options for `gradeReport`.
 * @throws {Error} When the judge adapter is anything other than `claude-code`.
 */
function resolveGradeOptions(fileConfig, cli = {}, configPath) {
  const judge = fileConfig?.judge;
  const adapter = judge?.adapter ?? "claude-code";
  if (adapter !== "claude-code") {
    throw new Error(`unsupported grading adapter "${adapter}" (only claude-code today)`);
  }

  const claudeCode = judge?.claudeCode ?? {};
  const resolved = {
    sourceReport: cli.sourceReport,
    expectationsPath: cli.expectationsPath,
    model: cli.model ?? judge?.model ?? claudeCode.model,
    binary: cli.binary ?? claudeCode.binary,
    timeoutMs: cli.timeoutMs ?? judge?.timeoutMs,
    maxConcurrent: cli.maxConcurrent ?? judge?.maxConcurrent,
    systemInstruction: judge?.system_instruction,
    env: judge?.env,
    cwd: judge?.cwd,
    claudeCode: { ...claudeCode, binary: undefined, model: undefined },
    gradingConfigPath: configPath,
  };
  return resolved;
}
|
|
744
|
+
//#endregion
|
|
745
|
+
//#region src/grader/format-console.ts
|
|
746
|
+
// ANSI escape sequences used by the grading console formatter.
const RESET$1 = "\x1B[0m";
const GREEN$1 = "\x1B[32m";
const RED$1 = "\x1B[31m";
const DIM = "\x1B[2m";
/**
 * Render a grading report as human-readable console text.
 *
 * Each graded repetition gets a PASS/FAIL header line, an optional grader
 * error line, one line per expectation (plus its evidence when the expectation
 * failed or evidence is present), and a per-repetition tally. An overall tally
 * closes the output.
 *
 * @param {object} report - Grading report with `results` and `summary`.
 * @param {boolean} [color=true] - Wrap status markers in ANSI colour codes.
 * @returns {string} The formatted text without trailing whitespace.
 */
function formatGradingConsole(report, color = true) {
	if (report.results.length === 0) {
		return "No repetitions graded. Add expectations to the suite YAML or pass --expectations.";
	}
	const paint = (code, text) => color ? `${code}${text}${RESET$1}` : text;
	const lines = [];
	for (const result of report.results) {
		const succeeded = result.summary.failed === 0 && !result.graderError;
		const status = succeeded ? paint(GREEN$1, "PASS") : paint(RED$1, "FAIL");
		lines.push(`${result.caseId} @ ${result.cellLabel} rep${result.repetitionIndex} ${status}`);
		if (result.graderError) lines.push(` ${paint(RED$1, `grader error: ${result.graderError}`)}`);
		for (const exp of result.expectations) {
			const marker = exp.passed ? paint(GREEN$1, "✓") : paint(RED$1, "✗");
			lines.push(` ├─ ${exp.text} ${marker}`);
			if (!exp.passed || exp.evidence) {
				// A failed expectation may carry no evidence at all; print a
				// placeholder rather than the literal string "undefined".
				const evidence = exp.evidence ?? "(no evidence provided)";
				lines.push(` │ ${paint(DIM, evidence)}`);
			}
		}
		const pct = (result.summary.passRate * 100).toFixed(0);
		lines.push(` └─ ${result.summary.passed}/${result.summary.total} (${pct}%) expectations`);
		lines.push("");
	}
	const overallPct = (report.summary.passRate * 100).toFixed(0);
	lines.push(`Overall: ${report.summary.passed}/${report.summary.total} (${overallPct}%) expectations passed`);
	return lines.join("\n").trimEnd();
}
|
|
773
|
+
/**
 * Decide whether every graded repetition in a report passed.
 *
 * A repetition passes when it has no grader error, no failed expectations and
 * at least one expectation. A report with no results yields `true`.
 *
 * @param {object} report - Grading report with a `results` array.
 * @returns {boolean} `true` when no repetition violates the rules above.
 */
function gradingReportPassed(report) {
	const repetitionPassed = ({ graderError, summary }) => {
		if (graderError) return false;
		if (summary.failed !== 0) return false;
		return summary.total > 0;
	};
	return report.results.every(repetitionPassed);
}
|
|
776
|
+
//#endregion
|
|
777
|
+
//#region src/reporter/format-console.ts
|
|
778
|
+
// ANSI escape sequences used by the suite-report console formatter.
const RESET = "\x1B[0m";
const GREEN = "\x1B[32m";
const RED = "\x1B[31m";
const YELLOW = "\x1B[33m";
/**
 * Render suite result rows as console text.
 *
 * Every row produces a header line (case, cell label, PASS/FAIL and an
 * adapter-error note when `adapterErrors > 0`), an optional category line and
 * one line per assertion statistic. When a stat carries both `delta` and
 * `baselinePassRate`, a baseline comparison is appended to its line.
 *
 * @param {object[]} rows - Renderable rows.
 * @param {boolean} [color=true] - Wrap markers in ANSI colour codes.
 * @returns {string} The formatted text without trailing whitespace.
 */
function formatConsole(rows, color = true) {
	const paint = (code, text) => color ? `${code}${text}${RESET}` : text;
	const asPercent = (fraction) => (fraction * 100).toFixed(0);
	const output = [];
	for (const row of rows) {
		const status = row.passed ? paint(GREEN, "PASS") : paint(RED, "FAIL");
		const crashNote = row.adapterErrors > 0 ? ` ${paint(YELLOW, `[${row.adapterErrors} adapter errors]`)}` : "";
		output.push(`${row.caseId} @ ${row.cellLabel} ${status}${crashNote}`);
		if (row.category) output.push(` category: ${row.category}`);
		for (const stat of row.stats) {
			const marker = stat.meetsThreshold ? paint(GREEN, "✓") : paint(RED, "✗");
			const rateStr = formatRate$1(stat);
			const thresholdPct = asPercent(stat.threshold);
			const summaryLine = ` ├─ ${stat.description}: ${rateStr} [threshold ${thresholdPct}%] ${marker}`;
			if (stat.delta === void 0 || stat.baselinePassRate === void 0) {
				output.push(summaryLine);
				continue;
			}
			const arrow = stat.delta >= 0 ? "↑" : "↓";
			const basePct = asPercent(stat.baselinePassRate);
			const curPct = asPercent(stat.passRate);
			const deltaPct = asPercent(stat.delta);
			output.push(`${summaryLine} (${basePct}% → ${curPct}% (${arrow}${deltaPct}%))`);
		}
		// Blank separator between rows; the final one is removed by trimEnd().
		output.push("");
	}
	return output.join("\n").trimEnd();
}
|
|
807
|
+
/**
 * Describe an assertion's pass rate as "passed/evaluated (pct%)".
 *
 * @param {object} stat - Reads `evaluatedCount`, `totalReps`, `passedCount`
 *   and `passRate`.
 * @returns {string} The rate text; when nothing was evaluated, a note that
 *   all repetitions crashed.
 */
function formatRate$1(stat) {
	const nothingEvaluated = stat.evaluatedCount === 0;
	return nothingEvaluated
		? `0/${stat.totalReps} (all reps crashed)`
		: `${stat.passedCount}/${stat.evaluatedCount} (${(stat.passRate * 100).toFixed(0)}%)`;
}
|
|
812
|
+
//#endregion
|
|
813
|
+
//#region src/reporter/format-json.ts
|
|
814
|
+
/**
 * Serialize a report as pretty-printed JSON (two-space indentation).
 *
 * @param {any} report - Value to serialize.
 * @returns {string} The JSON text.
 */
function formatJson(report) {
	const indentWidth = 2;
	return JSON.stringify(report, null, indentWidth);
}
|
|
817
|
+
//#endregion
|
|
818
|
+
//#region src/reporter/format-markdown.ts
|
|
819
|
+
/**
 * Render suite result rows as a Markdown document.
 *
 * Each row becomes a level-2 heading followed by an optional category line,
 * optional collapsible notes and a four-column table with one line per
 * assertion statistic. When a stat carries both `delta` and
 * `baselinePassRate`, a baseline comparison is appended to its result cell.
 *
 * @param {object[]} rows - Renderable rows.
 * @returns {string} The Markdown text without trailing whitespace.
 */
function formatMarkdown(rows) {
	// A table cell ends at the first unescaped pipe and a row cannot span lines,
	// so free-text descriptions must be sanitised before going into the table.
	const escapeCell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
	const asPercent = (fraction) => (fraction * 100).toFixed(0);
	const lines = ["# Harness Eval Report", ""];
	for (const row of rows) {
		const status = row.passed ? "PASS" : "FAIL";
		const crashNote = row.adapterErrors > 0 ? ` (${row.adapterErrors} adapter errors)` : "";
		lines.push(`## ${row.caseId} @ ${row.cellLabel} — ${status}${crashNote}`);
		if (row.category) lines.push(`**Category:** ${row.category}`);
		if (row.notes) lines.push("<details><summary>Notes</summary>", row.notes, "</details>");
		lines.push("");
		lines.push("| Assertion | Result | Threshold | Status |");
		lines.push("| --- | --- | --- | --- |");
		for (const stat of row.stats) {
			const rateStr = formatRate(stat);
			const threshold = `${asPercent(stat.threshold)}%`;
			const statusCell = stat.meetsThreshold ? "✓" : "✗";
			let result = rateStr;
			if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
				const base = asPercent(stat.baselinePassRate);
				const cur = asPercent(stat.passRate);
				const d = asPercent(stat.delta);
				// Negative deltas already carry their own minus sign.
				const sign = stat.delta >= 0 ? "+" : "";
				result += ` (${base}% → ${cur}%, ${sign}${d}%)`;
			}
			lines.push(`| ${escapeCell(stat.description)} | ${result} | ${threshold} | ${statusCell} |`);
		}
		lines.push("");
	}
	return lines.join("\n").trimEnd();
}
|
|
848
|
+
/**
 * Build the "passed/evaluated (pct%)" text for one assertion statistic.
 *
 * @param {object} stat - Reads `evaluatedCount`, `totalReps`, `passedCount`
 *   and `passRate`.
 * @returns {string} Rate text, or an "all reps crashed" note when no
 *   repetition was evaluated.
 */
function formatRate(stat) {
	const { evaluatedCount } = stat;
	if (evaluatedCount === 0) {
		return `0/${stat.totalReps} (all reps crashed)`;
	}
	const wholePercent = (stat.passRate * 100).toFixed(0);
	return `${stat.passedCount}/${evaluatedCount} (${wholePercent}%)`;
}
|
|
853
|
+
//#endregion
|
|
854
|
+
//#region src/reporter/renderable.ts
|
|
855
|
+
/**
 * Convert every cell of a suite report into a renderable row.
 *
 * @param {object} report - Suite report with a `cells` array.
 * @returns {object[]} One row per cell, in the same order.
 */
function toRenderableRows(report) {
	const { cells } = report;
	return cells.map((entry) => cellToRow(entry));
}
|
|
858
|
+
/**
 * Annotate rows with pass-rate deltas relative to a baseline report.
 *
 * Baseline cells are looked up by "caseId::label". Within a matched cell,
 * stats are paired with the baseline's `assertionStats` by position. Rows
 * without a baseline cell and stats without a baseline entry are returned
 * as the same object they came in as.
 *
 * @param {object[]} rows - Renderable rows.
 * @param {object} baseline - Baseline suite report with a `cells` array.
 * @returns {object[]} New array of rows; matched rows are shallow copies
 *   whose stats gained `baselinePassRate` and `delta`.
 */
function applyBaseline(rows, baseline) {
	const baselineByKey = /* @__PURE__ */ new Map();
	for (const cell of baseline.cells) {
		baselineByKey.set(`${cell.caseId}::${cell.cell.label}`, cell);
	}
	return rows.map((row) => {
		const reference = baselineByKey.get(`${row.caseId}::${row.cellLabel}`);
		if (!reference) return row;
		const annotated = row.stats.map((stat, position) => {
			const previous = reference.assertionStats[position];
			return previous
				? {
					...stat,
					baselinePassRate: previous.passRate,
					delta: stat.passRate - previous.passRate
				}
				: stat;
		});
		return {
			...row,
			stats: annotated
		};
	});
}
|
|
879
|
+
/**
 * Flatten one suite-report cell into a renderable row.
 *
 * @param {object} cell - Reads `caseId`, `category`, `notes`, `cell.label`,
 *   `passed`, `adapterErrors`, `repetitions` and `assertionStats`.
 * @returns {object} Row whose `stats` repeat the per-assertion numbers
 *   together with the cell-level `totalReps` and `adapterErrors`.
 */
function cellToRow(cell) {
	const totalReps = cell.repetitions.length;
	const toStatRow = (source) => ({
		description: source.description,
		threshold: source.threshold,
		passedCount: source.passedCount,
		evaluatedCount: source.evaluatedCount,
		totalReps,
		adapterErrors: cell.adapterErrors,
		passRate: source.passRate,
		meetsThreshold: source.meetsThreshold
	});
	return {
		caseId: cell.caseId,
		category: cell.category,
		notes: cell.notes,
		cellLabel: cell.cell.label,
		passed: cell.passed,
		adapterErrors: cell.adapterErrors,
		totalReps,
		stats: cell.assertionStats.map((source) => toStatRow(source))
	};
}
|
|
902
|
+
//#endregion
|
|
903
|
+
//#region src/reporter/index.ts
|
|
904
|
+
/**
 * Format a suite report in the requested output format.
 *
 * "json" serializes the report as-is. Every other format first converts the
 * report to renderable rows and, when `options.baseline` is set, annotates
 * them with baseline deltas. "markdown" then renders Markdown; anything else
 * renders console text.
 *
 * @param {object} report - Suite report.
 * @param {object} options - Reads `format`, `baseline` and `color`. When
 *   `color` is null/undefined, colour is used only for the "console" format.
 * @returns {string} The formatted report.
 */
function formatReport(report, options) {
	const { format, baseline } = options;
	if (format === "json") return formatJson(report);
	const plainRows = toRenderableRows(report);
	const rows = baseline ? applyBaseline(plainRows, baseline) : plainRows;
	if (format === "markdown") return formatMarkdown(rows);
	const useColor = options.color ?? format === "console";
	return formatConsole(rows, useColor);
}
|
|
912
|
+
//#endregion
|
|
913
|
+
//#region src/eval-interchange/build.ts
|
|
914
|
+
/** Agent id that buildAgentTrace falls back to when the caller passes none. */
const DEFAULT_AGENT_ID = "agent";
|
|
915
|
+
/**
 * Serialize tool arguments to a JSON string.
 *
 * @param {any} args - Tool arguments; null/undefined are treated as `{}`.
 * @returns {string} `JSON.stringify` of the arguments.
 */
function serializeToolInput(args) {
	const payload = args === null || args === void 0 ? {} : args;
	return JSON.stringify(payload);
}
|
|
918
|
+
/**
 * Parse a serialized tool input, tolerating input that is not JSON.
 *
 * @param {any} toolInput - Value handed to `JSON.parse`.
 * @returns {any} The parsed value, or `toolInput` unchanged when parsing throws.
 */
function parseToolInput(toolInput) {
	let parsed = toolInput;
	try {
		parsed = JSON.parse(toolInput);
	} catch {
		// Not JSON: deliberately fall back to the raw input.
	}
	return parsed;
}
|
|
925
|
+
/**
 * Convert a trajectory tool call into the interchange shape.
 *
 * @param {object} toolCall - Reads `name` and `args`.
 * @returns {{tool_name: any, tool_input: string}} Interchange tool call whose
 *   `tool_input` is the serialized arguments.
 */
function toolCallToInterchange(toolCall) {
	const { name, args } = toolCall;
	return {
		tool_name: name,
		tool_input: serializeToolInput(args)
	};
}
|
|
931
|
+
/**
 * Convert an interchange tool call into the tabular shape.
 *
 * @param {object} toolCall - Reads `tool_name` and `tool_input`.
 * @returns {{tool_name: any, tool_input: any}} Tool call whose `tool_input`
 *   went through `parseToolInput`.
 */
function interchangeToTabular(toolCall) {
	const { tool_name, tool_input } = toolCall;
	return {
		tool_name,
		tool_input: parseToolInput(tool_input)
	};
}
|
|
937
|
+
/**
 * Derive the predicted trajectory from a trajectory view.
 *
 * @param {object} view - Trajectory view with a `toolCalls` array.
 * @returns {object[]} One interchange tool call per entry of `view.toolCalls`.
 */
function predictedTrajectoryFromView(view) {
	const { toolCalls } = view;
	return toolCalls.map((call) => toolCallToInterchange(call));
}
|
|
940
|
+
/**
 * Build an agent-trace structure from a trajectory view.
 *
 * The trace declares a single agent (keyed by `agentId`) whose tool list comes
 * from `view.meta.availableTools`. Each turn becomes a list of events: a text
 * event when the turn has text, then for every tool call a `function_call`
 * event and, when the call has a non-null result, a `function_response` event.
 *
 * @param {object} view - Reads `meta.model`, `meta.availableTools` and `turns`.
 * @param {string} [agentId=DEFAULT_AGENT_ID] - Author id for every event.
 * @returns {{agents: object, turns: object[]}} The agent trace.
 */
function buildAgentTrace(view, agentId = DEFAULT_AGENT_ID) {
	const describeTools = () => view.meta.availableTools.map((name) => ({ name }));
	const agents = { [agentId]: {
		agent_id: agentId,
		agent_type: "assistant",
		description: view.meta.model,
		tools: describeTools()
	} };
	// Built separately from the agent's own tool list; every event shares this array.
	const activeTools = describeTools();
	const eventFor = (part) => ({
		author: agentId,
		content: { parts: [part] },
		active_tools: activeTools
	});
	const turns = view.turns.map((turn) => {
		const events = [];
		if (turn.text) events.push(eventFor({ text: turn.text }));
		for (const { name, args, result } of turn.toolCalls) {
			events.push(eventFor({ function_call: {
				name,
				args: args ?? {}
			} }));
			const hasResult = result !== null && result !== void 0;
			if (hasResult) events.push(eventFor({ function_response: {
				name,
				response: result
			} }));
		}
		return {
			turn_index: turn.turnIndex,
			events
		};
	});
	return {
		agents,
		turns
	};
}
|
|
982
|
+
/**
 * Convert a trajectory view's recorded duration to seconds.
 *
 * @param {object} view - Reads `usage.durationMs`.
 * @returns {number} `durationMs` divided by 1000.
 */
function latencyInSeconds(view) {
	const millisecondsPerSecond = 1e3;
	const { durationMs } = view.usage;
	return durationMs / millisecondsPerSecond;
}
|
|
985
|
+
//#endregion
|
|
986
|
+
//#region src/metrics/trajectory.ts
|
|
987
|
+
/**
 * Normalize a tool call so its `tool_input` is a canonical string.
 *
 * Trajectory metrics compare tool calls by string key, so two calls with the
 * same arguments must serialize to the same text. Object inputs, and string
 * inputs that parse to a JSON object or array, are re-serialized with object
 * keys sorted at every depth. This keeps key order and whitespace from causing
 * mismatches. Strings that are not JSON, or that parse to a primitive, are
 * kept verbatim.
 *
 * @param {object} toolCall - Reads `tool_name` and `tool_input`.
 * @returns {{tool_name: any, tool_input: string}} The normalized tool call.
 */
function normalizeToolCall$1(toolCall) {
	// Replacer that rebuilds each plain object with its keys in sorted order.
	// A null-prototype target keeps a "__proto__" key as ordinary data.
	const sortObjectKeys = (_key, value) => {
		if (value === null || typeof value !== "object" || Array.isArray(value)) return value;
		const ordered = Object.create(null);
		for (const name of Object.keys(value).sort()) ordered[name] = value[name];
		return ordered;
	};
	const canonicalize = (value) => JSON.stringify(value, sortObjectKeys);
	const { tool_name, tool_input } = toolCall;
	if (typeof tool_input !== "string") {
		return {
			tool_name,
			tool_input: canonicalize(tool_input ?? {})
		};
	}
	let parsed;
	try {
		parsed = JSON.parse(tool_input);
	} catch {
		// Free-form (non-JSON) input is compared as the literal string.
		return { tool_name, tool_input };
	}
	if (parsed === null || typeof parsed !== "object") return { tool_name, tool_input };
	return {
		tool_name,
		tool_input: canonicalize(parsed)
	};
}
|
|
997
|
+
/**
 * Normalize every tool call of a trajectory.
 *
 * @param {object[]} trajectory - Tool calls to normalize.
 * @returns {object[]} New array with each call passed through
 *   `normalizeToolCall$1`.
 */
function normalizeTrajectory(trajectory) {
	return trajectory.map((step) => normalizeToolCall$1(step));
}
|
|
1000
|
+
/**
 * Build the comparison key for a tool call.
 *
 * @param {object} toolCall - Reads `tool_name` and `tool_input`.
 * @returns {string} Name and input joined by a NUL character.
 */
function toolCallKey(toolCall) {
	const { tool_name: name, tool_input: input } = toolCall;
	return `${name}\0${input}`;
}
|
|
1003
|
+
/**
 * Count how many predicted tool calls can be paired with a distinct reference
 * tool call that has the same key.
 *
 * @param {Iterable<object>} predicted - Normalized predicted tool calls.
 * @param {Iterable<object>} reference - Normalized reference tool calls.
 * @returns {number} Size of the multiset intersection.
 */
function multisetIntersectionSize(predicted, reference) {
	// How many unmatched occurrences of each key the reference still has.
	const remaining = /* @__PURE__ */ new Map();
	for (const expected of reference) {
		const key = toolCallKey(expected);
		const seen = remaining.get(key) ?? 0;
		remaining.set(key, seen + 1);
	}
	let pairs = 0;
	for (const actual of predicted) {
		const key = toolCallKey(actual);
		const available = remaining.get(key) ?? 0;
		if (available <= 0) continue;
		remaining.set(key, available - 1);
		pairs += 1;
	}
	return pairs;
}
|
|
1020
|
+
/**
 * Check whether the reference tool calls occur, in order, within the
 * predicted tool calls (other predicted calls may be interleaved).
 *
 * @param {object[]} predicted - Normalized predicted tool calls.
 * @param {object[]} reference - Normalized reference tool calls.
 * @returns {boolean} `true` when every reference call was matched in order.
 */
function isSubsequence(predicted, reference) {
	const needed = reference.length;
	let matched = 0;
	let cursor = 0;
	while (cursor < predicted.length && matched < needed) {
		if (toolCallKey(predicted[cursor]) === toolCallKey(reference[matched])) matched += 1;
		cursor += 1;
	}
	return matched === needed;
}
|
|
1028
|
+
/**
 * Check whether two tool-call lists have the same length and the same key at
 * every position.
 *
 * @param {object[]} left - First list of normalized tool calls.
 * @param {object[]} right - Second list of normalized tool calls.
 * @returns {boolean} `true` when the lists match element by element.
 */
function arraysEqual(left, right) {
	const sameLength = left.length === right.length;
	return sameLength && left.every((toolCall, position) => toolCallKey(toolCall) === toolCallKey(right[position]));
}
|
|
1035
|
+
/**
 * Score 1 when the normalized predicted and reference trajectories are
 * identical call for call, otherwise 0.
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {0|1} The exact-match score.
 */
function trajectoryExactMatch(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	if (arraysEqual(normalizedPredicted, normalizedReference)) return 1;
	return 0;
}
|
|
1038
|
+
/**
 * Score 1 when `isSubsequence` accepts the normalized predicted and reference
 * trajectories, otherwise 0.
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {0|1} The in-order-match score.
 */
function trajectoryInOrderMatch(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	if (isSubsequence(normalizedPredicted, normalizedReference)) return 1;
	return 0;
}
|
|
1041
|
+
/**
 * Score 1 when the predicted and reference trajectories contain the same tool
 * calls with the same multiplicities, regardless of order; otherwise 0.
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {0|1} The any-order-match score.
 */
function trajectoryAnyOrderMatch(predicted, reference) {
	const sortedKeys = (calls) => calls.map(toolCallKey).sort();
	const predictedNorm = normalizeTrajectory(predicted);
	const referenceNorm = normalizeTrajectory(reference);
	if (predictedNorm.length !== referenceNorm.length) return 0;
	const predictedKeys = sortedKeys(predictedNorm);
	const referenceKeys = sortedKeys(referenceNorm);
	// Equal lengths are established above, so a positional walk is sufficient.
	for (let position = 0; position < predictedKeys.length; position += 1) {
		if (predictedKeys[position] !== referenceKeys[position]) return 0;
	}
	return 1;
}
|
|
1049
|
+
/**
 * Fraction of predicted tool calls that also appear in the reference
 * (multiset matching).
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {number} Matched calls divided by predicted calls. With no
 *   predicted calls: 1 when the reference is empty too, else 0.
 */
function trajectoryPrecision(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const predictedCount = normalizedPredicted.length;
	if (predictedCount === 0) {
		return reference.length === 0 ? 1 : 0;
	}
	const overlap = multisetIntersectionSize(normalizedPredicted, normalizeTrajectory(reference));
	return overlap / predictedCount;
}
|
|
1054
|
+
/**
 * Fraction of reference tool calls that also appear in the prediction
 * (multiset matching).
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {number} Matched calls divided by reference calls. With no
 *   reference calls: 1 when the prediction is empty too, else 0.
 */
function trajectoryRecall(predicted, reference) {
	const normalizedReference = normalizeTrajectory(reference);
	const referenceCount = normalizedReference.length;
	if (referenceCount === 0) {
		return predicted.length === 0 ? 1 : 0;
	}
	const overlap = multisetIntersectionSize(normalizeTrajectory(predicted), normalizedReference);
	return overlap / referenceCount;
}
|
|
1059
|
+
/**
 * Score 1 when both trajectories consist of exactly one tool call and those
 * two calls have the same key; otherwise 0.
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {0|1} The single-tool-use score.
 */
function trajectorySingleToolUse(predicted, reference) {
	const normalizedPredicted = normalizeTrajectory(predicted);
	const normalizedReference = normalizeTrajectory(reference);
	const bothSingle = normalizedPredicted.length === 1 && normalizedReference.length === 1;
	if (!bothSingle) return 0;
	const [onlyPredicted] = normalizedPredicted;
	const [onlyReference] = normalizedReference;
	return toolCallKey(onlyPredicted) === toolCallKey(onlyReference) ? 1 : 0;
}
|
|
1065
|
+
/**
 * Compute every trajectory metric for one predicted/reference pair.
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @returns {object} Scores keyed by metric name.
 */
function computeTrajectoryMetrics(predicted, reference) {
	// Listed in the order the keys appear in the result.
	const scorers = [
		["trajectory_exact_match", trajectoryExactMatch],
		["trajectory_in_order_match", trajectoryInOrderMatch],
		["trajectory_any_order_match", trajectoryAnyOrderMatch],
		["trajectory_precision", trajectoryPrecision],
		["trajectory_recall", trajectoryRecall],
		["trajectory_single_tool_use", trajectorySingleToolUse]
	];
	const scores = {};
	for (const [metric, score] of scorers) scores[metric] = score(predicted, reference);
	return scores;
}
|
|
1075
|
+
//#endregion
|
|
1076
|
+
//#region src/metrics/tool-calls.ts
|
|
1077
|
+
/**
 * Normalize a tool call so its `tool_input` is a string.
 *
 * @param {object} toolCall - Reads `tool_name` and `tool_input`.
 * @returns {{tool_name: any, tool_input: string}} String inputs are kept as
 *   they are; anything else goes through `serializeToolInput`.
 */
function normalizeToolCall(toolCall) {
	const { tool_name, tool_input } = toolCall;
	const inputText = typeof tool_input === "string" ? tool_input : serializeToolInput(tool_input);
	return {
		tool_name,
		tool_input: inputText
	};
}
|
|
1087
|
+
/**
 * Parse a tool call's input and return it only when it is a plain keyed object.
 *
 * @param {object} toolCall - Reads `tool_input`.
 * @returns {object|null} The parsed arguments, or `null` when the parsed value
 *   is null, not an object, or an array.
 */
function parsedArgs(toolCall) {
	const candidate = parseToolInput(toolCall.tool_input);
	const isKeyedObject = candidate !== null && typeof candidate === "object" && !Array.isArray(candidate);
	return isKeyedObject ? candidate : null;
}
|
|
1092
|
+
/**
 * Score 1 when a tool call has a non-blank name and an input that parses as
 * JSON, otherwise 0.
 *
 * @param {object} toolCall - Tool call to validate.
 * @returns {0|1} The validity score.
 */
function toolCallValid(toolCall) {
	const { tool_name, tool_input } = normalizeToolCall(toolCall);
	if (tool_name.trim() === "") return 0;
	let parses = true;
	try {
		JSON.parse(tool_input);
	} catch {
		// An unparseable input is the "invalid" outcome, not an error.
		parses = false;
	}
	return parses ? 1 : 0;
}
|
|
1102
|
+
/**
 * Score 1 when the predicted and reference tool calls name the same tool,
 * otherwise 0.
 *
 * @param {object} predicted - Predicted tool call.
 * @param {object} reference - Reference tool call.
 * @returns {0|1} The name-match score.
 */
function toolNameMatch(predicted, reference) {
	const { tool_name: predictedName } = normalizeToolCall(predicted);
	const { tool_name: referenceName } = normalizeToolCall(reference);
	if (predictedName === referenceName) return 1;
	return 0;
}
|
|
1107
|
+
/**
 * Score 1 when two tool calls name the same tool and their parsed arguments
 * have exactly the same set of top-level keys; otherwise 0.
 *
 * @param {object} predicted - Predicted tool call.
 * @param {object} reference - Reference tool call.
 * @returns {0|1} The key-match score (0 when either side's arguments do not
 *   parse to a keyed object).
 */
function toolParameterKeyMatch(predicted, reference) {
	if (toolNameMatch(predicted, reference) === 0) return 0;
	const predictedArgs = parsedArgs(normalizeToolCall(predicted));
	const referenceArgs = parsedArgs(normalizeToolCall(reference));
	if (predictedArgs === null || referenceArgs === null) return 0;
	const sortedKeys = (args) => Object.keys(args).sort();
	const predictedKeys = sortedKeys(predictedArgs);
	const referenceKeys = sortedKeys(referenceArgs);
	const sameKeys = predictedKeys.length === referenceKeys.length && predictedKeys.every((key, position) => key === referenceKeys[position]);
	return sameKeys ? 1 : 0;
}
|
|
1117
|
+
/**
 * Compare two parameter values for structural equality.
 *
 * Values are compared through their JSON serialization with object keys
 * sorted at every depth, so nested objects that differ only in key order are
 * equal. Array order still matters.
 *
 * @param {any} left - First value.
 * @param {any} right - Second value.
 * @param {boolean} [useStrictStringMatch] - Accepted for interface
 *   compatibility. The comparison has always been identical for both settings
 *   of this flag, and that is preserved here.
 * @returns {boolean} `true` when both values serialize identically.
 */
function valuesEqual(left, right, useStrictStringMatch) {
	// Replacer that rebuilds each plain object with its keys in sorted order.
	// A null-prototype target keeps a "__proto__" key as ordinary data.
	const sortObjectKeys = (_key, value) => {
		if (value === null || typeof value !== "object" || Array.isArray(value)) return value;
		const ordered = Object.create(null);
		for (const name of Object.keys(value).sort()) ordered[name] = value[name];
		return ordered;
	};
	return JSON.stringify(left, sortObjectKeys) === JSON.stringify(right, sortObjectKeys);
}
|
|
1121
|
+
/**
 * Score 1 when two tool calls name the same tool, have the same argument keys
 * and every argument value is equal according to `valuesEqual`; otherwise 0.
 *
 * @param {object} predicted - Predicted tool call.
 * @param {object} reference - Reference tool call.
 * @param {object} [options] - Reads `useStrictStringMatch` (defaults to
 *   `false`), which is forwarded to `valuesEqual`.
 * @returns {0|1} The key/value-match score.
 */
function toolParameterKvMatch(predicted, reference, options = {}) {
	if (toolParameterKeyMatch(predicted, reference) === 0) return 0;
	const predictedArgs = parsedArgs(normalizeToolCall(predicted));
	const referenceArgs = parsedArgs(normalizeToolCall(reference));
	const allEqual = Object.keys(referenceArgs).every((key) => valuesEqual(predictedArgs[key], referenceArgs[key], options.useStrictStringMatch ?? false));
	return allEqual ? 1 : 0;
}
|
|
1128
|
+
/**
 * Compute per-call tool metrics by pairing predicted and reference tool calls
 * position by position.
 *
 * Every metric is a sum of 0/1 scores divided by the same denominator: the
 * longer of the two lists, with a minimum of 1. Positions without a predicted
 * call add nothing; positions with a predicted call but no reference call add
 * only to `tool_call_valid`.
 *
 * @param {object[]} predicted - Predicted tool calls.
 * @param {object[]} reference - Reference tool calls.
 * @param {object} [options] - Forwarded to `toolParameterKvMatch`.
 * @returns {object} The four tool-call metrics.
 */
function computeToolCallMetrics(predicted, reference, options = {}) {
	const pairCount = Math.max(predicted.length, reference.length, 1);
	const totals = {
		valid: 0,
		name: 0,
		key: 0,
		kv: 0
	};
	// Indices past the end of `predicted` can never contribute, so walking the
	// predicted list covers every position that matters.
	for (const [position, predictedCall] of predicted.entries()) {
		if (!predictedCall) continue;
		totals.valid += toolCallValid(predictedCall);
		const referenceCall = reference[position];
		if (!referenceCall) continue;
		totals.name += toolNameMatch(predictedCall, referenceCall);
		totals.key += toolParameterKeyMatch(predictedCall, referenceCall);
		totals.kv += toolParameterKvMatch(predictedCall, referenceCall, options);
	}
	return {
		tool_call_valid: totals.valid / pairCount,
		tool_name_match: totals.name / pairCount,
		tool_parameter_key_match: totals.key / pairCount,
		tool_parameter_kv_match: totals.kv / pairCount
	};
}
|
|
1151
|
+
//#endregion
|
|
1152
|
+
//#region src/eval-interchange/projections.ts
|
|
1153
|
+
/**
|
|
1154
|
+
* Envelope projection methods for eval interchange output.
|
|
1155
|
+
*/
|
|
1156
|
+
/**
 * Collect the interchange fields of a repetition, deriving any that are
 * missing from its trajectory.
 *
 * @param {object} repetition - Reads `trajectory` plus any already-stored
 *   `predicted_trajectory`, `agent_trace`, `latency_in_seconds` and `failure`.
 * @returns {object} Without a trajectory, only an empty
 *   `predicted_trajectory`; otherwise all four fields, with stored values
 *   taking precedence over derived ones.
 */
function repetitionInterchangeFields(repetition) {
	const view = repetition.trajectory;
	if (!view) return { predicted_trajectory: [] };
	const {
		predicted_trajectory = null,
		agent_trace = null,
		latency_in_seconds = null,
		failure = null
	} = repetition;
	return {
		predicted_trajectory: predicted_trajectory ?? predictedTrajectoryFromView(view),
		agent_trace: agent_trace ?? buildAgentTrace(view),
		latency_in_seconds: latency_in_seconds ?? latencyInSeconds(view),
		failure: failure ?? (view.success ? 0 : 1)
	};
}
|
|
1165
|
+
/**
 * Read the reference trajectory stored on a cell.
 *
 * @param {object} cell - Envelope cell.
 * @returns {any} The cell's `reference_trajectory` (may be undefined).
 */
function referenceTrajectoryForCell(cell) {
	const { reference_trajectory: referenceTrajectory } = cell;
	return referenceTrajectory;
}
|
|
1168
|
+
/**
 * Project one repetition onto a tabular dataset row.
 *
 * @param {object} cell - Reads `prompt`, `human_ratings` and the reference
 *   trajectory.
 * @param {object} repetition - Reads `trajectory`, `durationMs` and the
 *   interchange fields.
 * @returns {object} Dataset row. A repetition without a trajectory is reported
 *   as a failure with no response and an empty predicted trajectory.
 */
function repetitionToDatasetRow(cell, repetition) {
	const fields = repetitionInterchangeFields(repetition);
	const view = repetition.trajectory;
	const wallClockSeconds = repetition.durationMs / 1e3;
	const ran = Boolean(view);
	return {
		prompt: cell.prompt,
		response: ran ? view.finalResponse : void 0,
		predicted_trajectory: ran ? fields.predicted_trajectory.map(interchangeToTabular) : [],
		reference_trajectory: referenceTrajectoryForCell(cell),
		latency_in_seconds: ran ? (fields.latency_in_seconds ?? wallClockSeconds) : wallClockSeconds,
		failure: ran ? (fields.failure ?? 1) : 1,
		human_ratings: cell.human_ratings
	};
}
|
|
1189
|
+
/**
 * Project one repetition onto a proto-style evaluation instance.
 *
 * @param {object} cell - Reads `prompt` and the reference trajectory.
 * @param {object} repetition - Reads `trajectory` and the interchange fields.
 * @returns {object|null} The instance, or `null` when the repetition has no
 *   trajectory. Reference tool inputs that are not already strings are
 *   JSON-serialized (null/undefined become "{}").
 */
function repetitionToProtoInstance(cell, repetition) {
	const { predicted_trajectory: predictedCalls } = repetitionInterchangeFields(repetition);
	const view = repetition.trajectory;
	if (!view) return null;
	const reference = referenceTrajectoryForCell(cell);
	const toProtoCall = ({ tool_name, tool_input }) => ({
		tool_name,
		tool_input: typeof tool_input === "string" ? tool_input : JSON.stringify(tool_input ?? {})
	});
	return {
		prompt: cell.prompt,
		response: view.finalResponse,
		predicted_trajectory: { tool_calls: predictedCalls },
		reference_trajectory: reference ? { tool_calls: reference.map(toProtoCall) } : void 0
	};
}
|
|
1203
|
+
/**
 * Extract the agent trace of a repetition.
 *
 * @param {object} repetition - Envelope repetition.
 * @returns {object|null} The agent trace from the interchange fields, or
 *   `null` when there is none.
 */
function repetitionToAgentTrace(repetition) {
	const { agent_trace: trace } = repetitionInterchangeFields(repetition);
	return trace ?? null;
}
|
|
1206
|
+
/**
 * Compute trajectory and tool-call metrics for one repetition.
 *
 * @param {object} repetition - Reads `predicted_trajectory`, falling back to
 *   the trajectory view (or an empty list when there is no view).
 * @param {object[]} [referenceTrajectory] - Reference tool calls.
 * @returns {object} `{}` when there is no non-empty reference; otherwise
 *   `trajectoryMetrics` and `toolCallMetrics`.
 */
function computeRepetitionMetrics(repetition, referenceTrajectory) {
	if (!referenceTrajectory?.length) return {};
	const derivedCalls = () => repetition.trajectory ? predictedTrajectoryFromView(repetition.trajectory) : [];
	const interchangeCalls = repetition.predicted_trajectory ?? derivedCalls();
	const predictedTabular = interchangeCalls.map(interchangeToTabular);
	return {
		trajectoryMetrics: computeTrajectoryMetrics(predictedTabular, referenceTrajectory),
		toolCallMetrics: computeToolCallMetrics(predictedTabular, referenceTrajectory)
	};
}
|
|
1214
|
+
/**
 * Project every repetition of an envelope onto dataset rows.
 *
 * @param {object} envelope - Envelope whose `cells` each hold `repetitions`.
 * @returns {object[]} Rows in cell order, then repetition order; falsy rows
 *   are dropped.
 */
function toTrajectory(envelope) {
	return envelope.cells
		.flatMap((cell) => cell.repetitions.map((repetition) => repetitionToDatasetRow(cell, repetition)))
		.filter((row) => Boolean(row));
}
|
|
1222
|
+
/**
 * Project every repetition of an envelope onto proto-style instances.
 *
 * @param {object} envelope - Envelope whose `cells` each hold `repetitions`.
 * @returns {object[]} Instances in cell order, then repetition order;
 *   repetitions that yield no instance are skipped.
 */
function toProtoInstances(envelope) {
	return envelope.cells.flatMap((cell) => cell.repetitions.flatMap((repetition) => {
		const instance = repetitionToProtoInstance(cell, repetition);
		return instance ? [instance] : [];
	}));
}
|
|
1230
|
+
/**
 * Collect the agent traces of every repetition in an envelope.
 *
 * @param {object} envelope - Envelope whose `cells` each hold `repetitions`.
 * @returns {object[]} Traces in cell order, then repetition order;
 *   repetitions without a trace are skipped.
 */
function toAgentTrace(envelope) {
	const collected = [];
	envelope.cells.forEach(({ repetitions }) => {
		repetitions.forEach((repetition) => {
			const trace = repetitionToAgentTrace(repetition);
			if (trace) collected.push(trace);
		});
	});
	return collected;
}
|
|
1238
|
+
/**
 * Attach interchange fields and reference-based metrics to a repetition.
 *
 * @param {object} repetition - Envelope repetition; returned untouched when it
 *   has no `trajectory`.
 * @param {object[]} [referenceTrajectory] - Reference tool calls used for the
 *   metrics.
 * @returns {object} The same object (no trajectory) or a shallow copy with
 *   `predicted_trajectory`, `agent_trace`, `latency_in_seconds`, `failure`,
 *   `trajectoryMetrics` and `toolCallMetrics` set from the trajectory.
 */
function enrichRepetitionWithInterchange(repetition, referenceTrajectory) {
	const view = repetition.trajectory;
	if (!view) return repetition;
	const withInterchange = {
		...repetition,
		predicted_trajectory: predictedTrajectoryFromView(view),
		agent_trace: buildAgentTrace(view),
		latency_in_seconds: latencyInSeconds(view),
		failure: view.success ? 0 : 1
	};
	const { trajectoryMetrics, toolCallMetrics } = computeRepetitionMetrics(withInterchange, referenceTrajectory);
	return {
		...withInterchange,
		trajectoryMetrics,
		toolCallMetrics
	};
}
|
|
1261
|
+
//#endregion
|
|
1262
|
+
//#region src/eval-record/build.ts
|
|
1263
|
+
/**
|
|
1264
|
+
* Build {@link EvalRunEnvelope} from harness-eval run and grading reports.
|
|
1265
|
+
*/
|
|
1266
|
+
/**
 * Pull the raw stream events out of an adapter result, if it has any.
 *
 * @param {any} adapterResult - Adapter result of unknown shape.
 * @returns {any[]|undefined} `adapterResult.rawEvents` when the result is an
 *   object whose `rawEvents` is an array; otherwise undefined.
 */
function extractRawEvents(adapterResult) {
	if (adapterResult === null || typeof adapterResult !== "object") return void 0;
	if (!("rawEvents" in adapterResult)) return void 0;
	return Array.isArray(adapterResult.rawEvents) ? adapterResult.rawEvents : void 0;
}
|
|
1269
|
+
/**
 * Decide whether a cell passed outcome grading.
 *
 * @param {string} caseId - Not used by the computation.
 * @param {string} cellLabel - Not used by the computation.
 * @param {object[]} repetitions - Envelope repetitions; only those with
 *   `outcomeGrades` are considered.
 * @returns {boolean|undefined} undefined when no repetition was graded;
 *   otherwise whether every graded repetition has no grading error and no
 *   failed expectation.
 */
function outcomePassForCell(caseId, cellLabel, repetitions) {
	let gradedCount = 0;
	let everyPassed = true;
	for (const repetition of repetitions) {
		const grades = repetition.outcomeGrades;
		if (!grades) continue;
		gradedCount += 1;
		// Once one graded repetition has failed the rest are not inspected.
		if (everyPassed && !(grades.error === void 0 && grades.summary.failed === 0)) everyPassed = false;
	}
	return gradedCount === 0 ? void 0 : everyPassed;
}
|
|
1274
|
+
/**
 * Convert a {@link SuiteReport} (and optional grading) into a versioned
 * {@link EvalRunEnvelope} for storage or API handoff.
 *
 * Each report repetition becomes an envelope repetition:
 * - a repetition with `error` keeps only its error details;
 * - otherwise the adapter's trajectory view, diagnostics and artifacts are
 *   attached, then the matching grading result (if any), then the interchange
 *   fields and metrics.
 *
 * @param {object} report - Suite report with `cells`, `startedAt`, `durationMs`.
 * @param {object} [options] - Reads `includeTranscript` (default on),
 *   `includeRawStreamEvents` (default off), `grading`, `runId`, `suite`,
 *   `harness` and `provenance`.
 * @returns {object} The envelope; `runId` is a fresh UUID unless provided.
 */
function buildEvalRunEnvelope(report, options = {}) {
	const includeTranscript = options.includeTranscript !== false;
	const includeRaw = options.includeRawStreamEvents === true;
	const judge = options.grading?.judge ?? { id: "harness-eval/claude-grader" };
	// Index grading results once (caseId -> cellLabel -> repetitionIndex) instead
	// of scanning the whole list for every repetition. The first result for a
	// given triple wins, the same result a linear search would return. A grading
	// object without a `results` list is treated as having no results.
	const gradedIndex = /* @__PURE__ */ new Map();
	const childMap = (parent, key) => {
		let child = parent.get(key);
		if (!child) {
			child = /* @__PURE__ */ new Map();
			parent.set(key, child);
		}
		return child;
	};
	for (const result of options.grading?.results ?? []) {
		const byRepetition = childMap(childMap(gradedIndex, result.caseId), result.cellLabel);
		if (!byRepetition.has(result.repetitionIndex)) byRepetition.set(result.repetitionIndex, result);
	}
	const findGraded = (caseId, cellLabel, repetitionIndex) => gradedIndex.get(caseId)?.get(cellLabel)?.get(repetitionIndex);
	const cells = report.cells.map((cell) => {
		const prompt = cell.prompt ?? "";
		const referenceTrajectory = cell.reference_trajectory;
		const repetitions = cell.repetitions.map((rep) => {
			const base = {
				repetitionIndex: rep.repetitionIndex,
				durationMs: rep.durationMs,
				assertionResults: rep.assertionResults
			};
			if (rep.error) {
				// Crashed repetitions carry no trajectory, grades or metrics.
				base.error = {
					message: rep.error.message,
					diagnostics: rep.error.diagnostics
				};
				return base;
			}
			if (rep.adapterResult) {
				base.trajectory = {
					...rep.adapterResult.view,
					schemaVersion: "1.0"
				};
				base.diagnostics = rep.adapterResult.diagnostics;
				const artifacts = {};
				if (includeTranscript) artifacts.transcript = trajectoryToTranscript(rep.adapterResult.view, prompt);
				if (includeRaw) {
					const raw = extractRawEvents(rep.adapterResult);
					if (raw) artifacts.rawStreamEvents = raw;
				}
				if (Object.keys(artifacts).length > 0) base.artifacts = artifacts;
			}
			const graded = findGraded(cell.caseId, cell.cell.label, rep.repetitionIndex);
			if (graded) base.outcomeGrades = {
				judge,
				expectations: graded.expectations,
				summary: graded.summary,
				evalFeedback: graded.evalFeedback,
				error: graded.graderError
			};
			return enrichRepetitionWithInterchange(base, referenceTrajectory);
		});
		return {
			caseId: cell.caseId,
			category: cell.category,
			notes: cell.notes,
			prompt: cell.prompt,
			expectations: cell.expectations,
			reference_trajectory: cell.reference_trajectory,
			human_ratings: cell.human_ratings,
			cellLabel: cell.cell.label,
			axes: cell.cell.axes,
			assertionStats: cell.assertionStats,
			adapterErrors: cell.adapterErrors,
			behavioralPass: cell.passed,
			outcomePass: outcomePassForCell(cell.caseId, cell.cell.label, repetitions),
			repetitions
		};
	});
	const cellsPassed = cells.filter((c) => c.behavioralPass).length;
	// Run-level outcome stays undefined unless at least one cell was graded.
	const gradedCells = cells.filter((c) => c.outcomePass !== void 0);
	const outcomePass = gradedCells.length > 0 ? gradedCells.every((c) => c.outcomePass === true) : void 0;
	return {
		schemaVersion: "1.0",
		runId: options.runId ?? randomUUID(),
		startedAt: report.startedAt,
		durationMs: report.durationMs,
		suite: options.suite,
		harness: {
			adapter: options.harness?.adapter ?? "claude-code",
			frameworkVersion: options.harness?.frameworkVersion,
			harnessVersion: options.harness?.harnessVersion
		},
		provenance: options.provenance,
		summary: {
			cellsTotal: cells.length,
			cellsPassed,
			behavioralPass: cellsPassed === cells.length,
			outcomePass
		},
		cells
	};
}
|
|
1363
|
+
/**
 * Build envelope from on-disk report + optional grading JSON paths.
 *
 * Files are read one after another: the report first, then
 * `options.gradingPath` (its `gradedAt`, `sourceReport` and `results` replace
 * any in-memory grading, while the judge still comes from
 * `options.grading.judge` or the default), then `options.suitePath` (recorded
 * on the suite as `uri` plus the SHA-256 hex digest of its content).
 *
 * @param {string} reportPath - Path of the suite report JSON.
 * @param {object} [options] - Envelope options plus `gradingPath` / `suitePath`.
 * @returns {Promise<object>} The envelope from `buildEvalRunEnvelope`.
 */
async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
	const readText = (path) => readFile(path, "utf8");
	const report = JSON.parse(await readText(reportPath));
	let grading = options.grading;
	if (options.gradingPath) {
		const { gradedAt, sourceReport, results } = JSON.parse(await readText(options.gradingPath));
		grading = {
			gradedAt,
			sourceReport,
			results,
			judge: options.grading?.judge ?? { id: "harness-eval/claude-grader" }
		};
	}
	let suite = options.suite;
	if (options.suitePath) {
		const suiteText = await readText(options.suitePath);
		const contentHash = createHash("sha256").update(suiteText).digest("hex");
		suite = {
			...suite,
			uri: options.suitePath,
			contentHash
		};
	}
	return buildEvalRunEnvelope(report, {
		...options,
		suite,
		grading
	});
}
|
|
1393
|
+
//#endregion
|
|
1394
|
+
export { TRAJECTORY_SCHEMA_VERSION as A, gradeReport as C, emitOtel as D, createClaudeGrader as E, trajectoryToOtlp as O, resolveGradeOptions as S, trajectoryToTranscript as T, trajectoryRecall as _, toProtoInstances as a, formatGradingConsole as b, toolCallValid as c, toolParameterKvMatch as d, computeTrajectoryMetrics as f, trajectoryPrecision as g, trajectoryInOrderMatch as h, toAgentTrace as i, EVAL_RUN_SCHEMA_VERSION as k, toolNameMatch as l, trajectoryExactMatch as m, buildEvalRunEnvelopeFromFiles as n, toTrajectory as o, trajectoryAnyOrderMatch as p, enrichRepetitionWithInterchange as r, computeToolCallMetrics as s, buildEvalRunEnvelope as t, toolParameterKeyMatch as u, trajectorySingleToolUse as v, loadSuiteReport as w, gradingReportPassed as x, formatReport as y };
|
|
1395
|
+
|
|
1396
|
+
//# sourceMappingURL=build-DsVJ_UeU.js.map
|