@alis-build/harness-eval 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +187 -30
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/claude-code/index.js +2 -1
- package/dist/adapters/codex/index.d.ts +68 -0
- package/dist/adapters/codex/index.js +3 -0
- package/dist/{claude-code-DZ4Vkgp6.js → claude-code-C_7hxC8z.js} +3 -245
- package/dist/claude-code-C_7hxC8z.js.map +1 -0
- package/dist/cli/bin.js +131 -151
- package/dist/cli/bin.js.map +1 -1
- package/dist/codex-0cHO2te9.js +496 -0
- package/dist/codex-0cHO2te9.js.map +1 -0
- package/dist/config/loader.d.ts +2 -2
- package/dist/config/loader.js +2 -2
- package/dist/{index-V22PrR0p.d.ts → index-C56AEDUr.d.ts} +2 -2
- package/dist/index.d.ts +134 -6
- package/dist/index.js +6 -5
- package/dist/index.js.map +1 -1
- package/dist/{loader-DcI0KfRX.js → loader-CiBm4Kf6.js} +491 -209
- package/dist/loader-CiBm4Kf6.js.map +1 -0
- package/dist/loader-CrmzNwkq.d.ts +107 -0
- package/dist/{projections-BcX7w-f6.js → reporter-BKCJZRYr.js} +1475 -729
- package/dist/reporter-BKCJZRYr.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-Dlzl-HI0.js → suite-C3-8EjUW.js} +558 -4
- package/dist/suite-C3-8EjUW.js.map +1 -0
- package/dist/{suite-DPJMIEbu.d.ts → suite-qyOGre2g.d.ts} +2 -2
- package/dist/types-Bac8_Ixb.js +246 -0
- package/dist/types-Bac8_Ixb.js.map +1 -0
- package/dist/{types-CD3TwOtZ.d.ts → types-CLt4Yygc.d.ts} +2 -2
- package/dist/{types-B9H4IZtA.d.ts → types-D0HR2WnP.d.ts} +9 -2
- package/dist/types-DFMpv_HJ.d.ts +77 -0
- package/package.json +11 -2
- package/schemas/eval-run-envelope.schema.json +193 -183
- package/dist/claude-code-DZ4Vkgp6.js.map +0 -1
- package/dist/loader-C9yQHUPC.d.ts +0 -50
- package/dist/loader-DcI0KfRX.js.map +0 -1
- package/dist/projections-BcX7w-f6.js.map +0 -1
- package/dist/suite-Dlzl-HI0.js.map +0 -1
|
@@ -1,506 +1,537 @@
|
|
|
1
|
-
import { i as buildJudgeArgs } from "./claude-code-
|
|
2
|
-
import { n as createLimit } from "./suite-
|
|
1
|
+
import { i as buildJudgeArgs } from "./claude-code-C_7hxC8z.js";
|
|
2
|
+
import { h as buildJudgeArgs$1, m as prepareGeminiCliEnv, n as createLimit, t as runSuite, u as getAdapter } from "./suite-C3-8EjUW.js";
|
|
3
|
+
import { s as buildJudgeArgs$2 } from "./codex-0cHO2te9.js";
|
|
4
|
+
import { i as loadGradingConfig, l as ConfigError, o as loadSuiteDocument, s as DEFAULT_PIPELINE_OUTPUTS, t as loadSuite } from "./loader-CiBm4Kf6.js";
|
|
3
5
|
import { spawn } from "node:child_process";
|
|
4
|
-
import { readFile } from "node:fs/promises";
|
|
5
|
-
import {
|
|
6
|
+
import { readFile, stat, writeFile } from "node:fs/promises";
|
|
7
|
+
import { basename, dirname, join, resolve } from "node:path";
|
|
6
8
|
import { createHash, randomUUID } from "node:crypto";
|
|
9
|
+
import { parse } from "yaml";
|
|
10
|
+
import { fileURLToPath } from "node:url";
|
|
7
11
|
//#region src/types/eval-record.ts
|
|
8
12
|
/** Schema version for {@link EvalRunEnvelope} JSON documents. */
|
|
9
13
|
const EVAL_RUN_SCHEMA_VERSION = "1.0";
|
|
10
14
|
/** Schema version embedded in each {@link TrajectoryView} at export time. */
|
|
11
15
|
const TRAJECTORY_SCHEMA_VERSION = "1.0";
|
|
12
16
|
//#endregion
|
|
13
|
-
//#region src/
|
|
14
|
-
/**
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
//#region src/grader/prompt.ts
|
|
18
|
+
/**
|
|
19
|
+
* Build the full grader prompt including eval prompt, transcript, and schema.
|
|
20
|
+
*
|
|
21
|
+
* When `systemInstruction` is set it is prepended as a judge-specific prefix.
|
|
22
|
+
*/
|
|
23
|
+
function buildGraderPrompt(input) {
|
|
24
|
+
const expectationList = input.expectations.map((e, i) => `${i + 1}. ${e}`).join("\n");
|
|
25
|
+
return `${input.systemInstruction ? `${input.systemInstruction.trim()}\n\n` : ""}You are an automated evaluation grader (not the agent under test). Your only job is to score expectations against the transcript below.
|
|
26
|
+
|
|
27
|
+
Your job is to evaluate each expectation against the transcript and final response.
|
|
28
|
+
PASS only when there is clear evidence in the transcript or final response.
|
|
29
|
+
When uncertain, FAIL — burden of proof is on PASS.
|
|
30
|
+
|
|
31
|
+
Also critique the expectations themselves if any are trivially satisfied or miss important outcomes.
|
|
32
|
+
|
|
33
|
+
## Eval prompt
|
|
34
|
+
|
|
35
|
+
${input.prompt}
|
|
36
|
+
|
|
37
|
+
## Execution transcript
|
|
38
|
+
|
|
39
|
+
${input.transcript}
|
|
40
|
+
|
|
41
|
+
## Expectations to grade
|
|
42
|
+
|
|
43
|
+
${expectationList}
|
|
44
|
+
|
|
45
|
+
## Output format
|
|
46
|
+
|
|
47
|
+
Respond with ONLY a single JSON object (no markdown fences, no commentary) matching this schema:
|
|
48
|
+
|
|
49
|
+
{
|
|
50
|
+
"expectations": [
|
|
51
|
+
{ "text": "<original expectation>", "passed": true|false, "evidence": "<quote or description>" }
|
|
52
|
+
],
|
|
53
|
+
"summary": { "passed": <int>, "failed": <int>, "total": <int>, "pass_rate": <0.0-1.0> },
|
|
54
|
+
"eval_feedback": {
|
|
55
|
+
"suggestions": [{ "assertion": "<optional>", "reason": "<string>" }],
|
|
56
|
+
"overall": "<brief assessment>"
|
|
57
|
+
}
|
|
20
58
|
}
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
return {
|
|
24
|
-
key,
|
|
25
|
-
value: { intValue: String(value) }
|
|
26
|
-
};
|
|
59
|
+
|
|
60
|
+
Include every expectation in the same order. summary must match the expectations array.`;
|
|
27
61
|
}
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
62
|
+
//#endregion
|
|
63
|
+
//#region src/grader/parse.ts
|
|
64
|
+
/**
|
|
65
|
+
* Extract assistant text from Claude stdout.
|
|
66
|
+
*
|
|
67
|
+
* Handles plain text, single JSON result envelopes, stream-json arrays, and
|
|
68
|
+
* assistant message objects — the judge subprocess may emit any of these
|
|
69
|
+
* depending on Claude Code version and flags.
|
|
70
|
+
*/
|
|
71
|
+
function extractClaudeResponseText(stdout) {
|
|
72
|
+
const trimmed = stdout.trim();
|
|
73
|
+
if (!trimmed) return "";
|
|
74
|
+
try {
|
|
75
|
+
const data = JSON.parse(trimmed);
|
|
76
|
+
if (Array.isArray(data)) return extractFromEventArray(data) ?? trimmed;
|
|
77
|
+
if (typeof data === "object" && data !== null) {
|
|
78
|
+
const event = data;
|
|
79
|
+
if (event.type === "result" && typeof event.result === "string") return event.result;
|
|
80
|
+
if (event.type === "assistant" && event.message) {
|
|
81
|
+
const text = textFromAssistantMessage(event.message);
|
|
82
|
+
if (text) return text;
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
} catch {}
|
|
86
|
+
return trimmed;
|
|
34
87
|
}
|
|
35
|
-
/**
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
88
|
+
/**
|
|
89
|
+
* Extract assistant text from Codex judge stdout.
|
|
90
|
+
*
|
|
91
|
+
* Handles plain text and JSONL streams from accidental `--json` usage.
|
|
92
|
+
*/
|
|
93
|
+
function extractCodexResponseText(stdout) {
|
|
94
|
+
const trimmed = stdout.trim();
|
|
95
|
+
if (!trimmed) return "";
|
|
96
|
+
const lines = trimmed.split("\n").filter((line) => line.trim().length > 0);
|
|
97
|
+
if (lines.length > 1) for (let i = lines.length - 1; i >= 0; i--) try {
|
|
98
|
+
const event = JSON.parse(lines[i]);
|
|
99
|
+
if (event.type === "item.completed" && (event.item?.type === "assistant_message" || event.item?.item_type === "assistant_message") && event.item.text) return event.item.text;
|
|
100
|
+
} catch {
|
|
101
|
+
continue;
|
|
102
|
+
}
|
|
103
|
+
return trimmed;
|
|
41
104
|
}
|
|
42
|
-
//#endregion
|
|
43
|
-
//#region src/otel/messages.ts
|
|
44
105
|
/**
|
|
45
|
-
*
|
|
106
|
+
* Extract judge response text from Gemini CLI `--output-format json` stdout.
|
|
46
107
|
*
|
|
47
|
-
*
|
|
108
|
+
* Parses `{ response, stats, error? }` and returns the `response` field when
|
|
109
|
+
* present. When stdout is empty, {@link spawnCollectStdout} may already have
|
|
110
|
+
* recovered the JSON payload from stderr before this runs.
|
|
48
111
|
*/
|
|
49
|
-
function
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
112
|
+
function extractGeminiCliResponseText(stdout) {
|
|
113
|
+
const trimmed = stdout.trim();
|
|
114
|
+
if (!trimmed) return "";
|
|
115
|
+
try {
|
|
116
|
+
const data = JSON.parse(trimmed);
|
|
117
|
+
if (typeof data.response === "string" && data.response.length > 0) return data.response;
|
|
118
|
+
if (data.error?.message) return data.error.message;
|
|
119
|
+
} catch {}
|
|
120
|
+
return trimmed;
|
|
121
|
+
}
|
|
122
|
+
/** Walk a stream-json event array and return the final assistant or result text. */
|
|
123
|
+
function extractFromEventArray(events) {
|
|
124
|
+
const result = events.find((e) => typeof e === "object" && e !== null && e.type === "result");
|
|
125
|
+
if (result?.result) return result.result;
|
|
126
|
+
const assistantTexts = [];
|
|
127
|
+
for (const event of events) if (typeof event === "object" && event !== null && event.type === "assistant") {
|
|
128
|
+
const text = textFromAssistantMessage(event.message);
|
|
129
|
+
if (text) assistantTexts.push(text);
|
|
57
130
|
}
|
|
131
|
+
if (assistantTexts.length > 0) return assistantTexts[assistantTexts.length - 1];
|
|
132
|
+
return null;
|
|
58
133
|
}
|
|
59
|
-
/**
|
|
60
|
-
function
|
|
61
|
-
return
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
134
|
+
/** Concatenate text blocks from an Anthropic-style assistant message object. */
|
|
135
|
+
function textFromAssistantMessage(message) {
|
|
136
|
+
if (!message || typeof message !== "object") return null;
|
|
137
|
+
const content = message.content;
|
|
138
|
+
if (typeof content === "string") return content;
|
|
139
|
+
if (!Array.isArray(content)) return null;
|
|
140
|
+
const texts = [];
|
|
141
|
+
for (const block of content) if (typeof block === "object" && block !== null && block.type === "text" && typeof block.text === "string") texts.push(block.text);
|
|
142
|
+
return texts.length > 0 ? texts.join("\n") : null;
|
|
67
143
|
}
|
|
68
|
-
/**
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
144
|
+
/**
|
|
145
|
+
* Parse grader JSON from response text.
|
|
146
|
+
*
|
|
147
|
+
* Tries the raw string first, then fenced code blocks and brace-delimited
|
|
148
|
+
* substrings. Returns null when no valid expectations array is found.
|
|
149
|
+
*/
|
|
150
|
+
function parseGraderJson(text) {
|
|
151
|
+
const candidates = [text.trim(), extractJsonBlock(text)];
|
|
152
|
+
for (const candidate of candidates) {
|
|
153
|
+
if (!candidate) continue;
|
|
154
|
+
try {
|
|
155
|
+
const normalized = normalizeGraderJson(JSON.parse(candidate));
|
|
156
|
+
if (normalized.expectations.length > 0) return normalized;
|
|
157
|
+
} catch {
|
|
158
|
+
continue;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
return null;
|
|
75
162
|
}
|
|
76
|
-
/**
|
|
77
|
-
function
|
|
78
|
-
const
|
|
79
|
-
if (
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
const finish = mapStopReason(turn.stopReason);
|
|
85
|
-
return {
|
|
86
|
-
role: "assistant",
|
|
87
|
-
parts,
|
|
88
|
-
...finish ? { finish_reason: finish } : {}
|
|
89
|
-
};
|
|
163
|
+
/** Extract JSON from markdown fences or the outermost `{...}` substring. */
|
|
164
|
+
function extractJsonBlock(text) {
|
|
165
|
+
const fence = text.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
166
|
+
if (fence?.[1]) return fence[1].trim();
|
|
167
|
+
const start = text.indexOf("{");
|
|
168
|
+
const end = text.lastIndexOf("}");
|
|
169
|
+
if (start >= 0 && end > start) return text.slice(start, end + 1);
|
|
170
|
+
return null;
|
|
90
171
|
}
|
|
91
|
-
/**
|
|
92
|
-
function
|
|
93
|
-
const
|
|
94
|
-
|
|
172
|
+
/** Map raw grader JSON to runtime {@link GraderOutput} with computed summary. */
|
|
173
|
+
function normalizeGraderJson(raw) {
|
|
174
|
+
const expectations = (raw.expectations ?? []).map((e) => ({
|
|
175
|
+
text: e.text ?? "",
|
|
176
|
+
passed: Boolean(e.passed),
|
|
177
|
+
evidence: e.evidence ?? ""
|
|
178
|
+
}));
|
|
179
|
+
const passed = expectations.filter((e) => e.passed).length;
|
|
180
|
+
const failed = expectations.length - passed;
|
|
181
|
+
const total = expectations.length;
|
|
182
|
+
const passRate = raw.summary?.pass_rate ?? raw.summary?.passRate ?? (total === 0 ? 0 : passed / total);
|
|
183
|
+
const summary = {
|
|
184
|
+
passed: raw.summary?.passed ?? passed,
|
|
185
|
+
failed: raw.summary?.failed ?? failed,
|
|
186
|
+
total: raw.summary?.total ?? total,
|
|
187
|
+
passRate
|
|
188
|
+
};
|
|
189
|
+
let evalFeedback;
|
|
190
|
+
if (raw.eval_feedback) evalFeedback = {
|
|
191
|
+
suggestions: (raw.eval_feedback.suggestions ?? []).map((s) => ({
|
|
192
|
+
assertion: s.assertion,
|
|
193
|
+
reason: s.reason ?? ""
|
|
194
|
+
})),
|
|
195
|
+
overall: raw.eval_feedback.overall ?? ""
|
|
196
|
+
};
|
|
95
197
|
return {
|
|
96
|
-
|
|
97
|
-
|
|
198
|
+
expectations,
|
|
199
|
+
summary,
|
|
200
|
+
evalFeedback
|
|
98
201
|
};
|
|
99
202
|
}
|
|
203
|
+
//#endregion
|
|
204
|
+
//#region src/grader/spawn-judge.ts
|
|
100
205
|
/**
|
|
101
|
-
*
|
|
206
|
+
* Shared subprocess utilities for judge graders (Claude, Codex, Gemini CLI).
|
|
207
|
+
*
|
|
208
|
+
* Owns detached spawn, process-group teardown, and SIGTERM → SIGKILL
|
|
209
|
+
* escalation so all graders share one implementation.
|
|
102
210
|
*/
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
const turn = view.turns[i];
|
|
114
|
-
if (!turn) continue;
|
|
115
|
-
messages.push(assistantMessageFromTurn(turn));
|
|
116
|
-
const toolMsg = toolResultsMessage(turn.toolCalls);
|
|
117
|
-
if (toolMsg) messages.push(toolMsg);
|
|
211
|
+
const KILL_GRACE_MS = 5e3;
|
|
212
|
+
/** Kill the detached process group (fallback to single process if group kill fails). */
|
|
213
|
+
function killTree(child, signal) {
|
|
214
|
+
if (child.pid === void 0) return;
|
|
215
|
+
try {
|
|
216
|
+
process.kill(-child.pid, signal);
|
|
217
|
+
} catch {
|
|
218
|
+
try {
|
|
219
|
+
child.kill(signal);
|
|
220
|
+
} catch {}
|
|
118
221
|
}
|
|
119
|
-
return messages;
|
|
120
222
|
}
|
|
121
|
-
//#endregion
|
|
122
|
-
//#region src/otel/types.ts
|
|
123
|
-
/** OTLP span kinds (enum integers). */
|
|
124
|
-
const SpanKind = {
|
|
125
|
-
INTERNAL: 1,
|
|
126
|
-
CLIENT: 2
|
|
127
|
-
};
|
|
128
|
-
/** OTLP status codes. */
|
|
129
|
-
const StatusCode = {
|
|
130
|
-
UNSET: 0,
|
|
131
|
-
OK: 1,
|
|
132
|
-
ERROR: 2
|
|
133
|
-
};
|
|
134
|
-
//#endregion
|
|
135
|
-
//#region src/otel/emitter.ts
|
|
136
|
-
/**
|
|
137
|
-
* TrajectoryView → OTLP JSON export using OpenTelemetry GenAI semantic conventions.
|
|
138
|
-
*
|
|
139
|
-
* Produces an `ExportTraceServiceRequest` suitable for OTLP/HTTP JSON ingestion.
|
|
140
|
-
* Assertions continue to use {@link TrajectoryView} directly; this is export-only.
|
|
141
|
-
*/
|
|
142
|
-
const INSTRUMENTATION_VERSION = "0.1.0";
|
|
143
223
|
/**
|
|
144
|
-
*
|
|
224
|
+
* Spawn a judge subprocess with process-group teardown and collect stdout.
|
|
145
225
|
*
|
|
146
|
-
*
|
|
147
|
-
*
|
|
148
|
-
* invoke_agent
|
|
149
|
-
* ├── chat {model}
|
|
150
|
-
* ├── execute_tool {name}
|
|
151
|
-
* ├── chat {model}
|
|
152
|
-
* └── ...
|
|
153
|
-
* ```
|
|
226
|
+
* Non-zero exit with empty stdout is treated as failure; partial stdout on
|
|
227
|
+
* non-zero exit is retained (judges sometimes exit non-zero after emitting JSON).
|
|
154
228
|
*/
|
|
155
|
-
function
|
|
156
|
-
const
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
const rootStartNs = msToNs(startMs);
|
|
166
|
-
const rootEndNs = msToNs(endMs);
|
|
167
|
-
const spans = [];
|
|
168
|
-
const timings = buildSpanTimings(view, startMs, endMs);
|
|
169
|
-
spans.push({
|
|
170
|
-
traceId,
|
|
171
|
-
spanId: rootSpanId,
|
|
172
|
-
name: "invoke_agent",
|
|
173
|
-
kind: SpanKind.INTERNAL,
|
|
174
|
-
startTimeUnixNano: rootStartNs,
|
|
175
|
-
endTimeUnixNano: rootEndNs,
|
|
176
|
-
attributes: [
|
|
177
|
-
strAttr("gen_ai.operation.name", "invoke_agent"),
|
|
178
|
-
strAttr("gen_ai.agent.name", agentName),
|
|
179
|
-
strAttr("gen_ai.provider.name", providerName),
|
|
180
|
-
strAttr("gen_ai.conversation.id", view.meta.sessionId),
|
|
181
|
-
strAttr("gen_ai.request.model", view.meta.model),
|
|
182
|
-
strAttr("gen_ai.response.model", view.meta.model),
|
|
183
|
-
intAttr("gen_ai.usage.input_tokens", view.usage.inputTokens),
|
|
184
|
-
intAttr("gen_ai.usage.output_tokens", view.usage.outputTokens),
|
|
185
|
-
boolAttr("harness_eval.success", view.success)
|
|
186
|
-
],
|
|
187
|
-
status: viewStatus(view)
|
|
188
|
-
});
|
|
189
|
-
let opIndex = 0;
|
|
190
|
-
for (const turn of view.turns) {
|
|
191
|
-
const chatTiming = timings[opIndex++];
|
|
192
|
-
const chatSpanId = spanIdFromKey(traceId, `chat:${turn.turnIndex}`);
|
|
193
|
-
const inputMessages = inputMessagesBeforeTurn(view, turn.turnIndex, options.prompt);
|
|
194
|
-
const outputMessages = [assistantMessageFromTurn(turn)];
|
|
195
|
-
spans.push({
|
|
196
|
-
traceId,
|
|
197
|
-
spanId: chatSpanId,
|
|
198
|
-
parentSpanId: rootSpanId,
|
|
199
|
-
name: `chat ${view.meta.model}`,
|
|
200
|
-
kind: SpanKind.CLIENT,
|
|
201
|
-
startTimeUnixNano: chatTiming.startNs,
|
|
202
|
-
endTimeUnixNano: chatTiming.endNs,
|
|
203
|
-
attributes: [
|
|
204
|
-
strAttr("gen_ai.operation.name", "chat"),
|
|
205
|
-
strAttr("gen_ai.provider.name", providerName),
|
|
206
|
-
strAttr("gen_ai.request.model", view.meta.model),
|
|
207
|
-
strAttr("gen_ai.response.model", view.meta.model),
|
|
208
|
-
...inputMessages.length > 0 ? [jsonAttr("gen_ai.input.messages", inputMessages)] : [],
|
|
209
|
-
jsonAttr("gen_ai.output.messages", outputMessages),
|
|
210
|
-
...turn.stopReason ? [jsonAttr("gen_ai.response.finish_reasons", [mapStopReason(turn.stopReason) ?? turn.stopReason])] : []
|
|
229
|
+
function spawnCollectStdout(options) {
|
|
230
|
+
const { binary, args, timeoutMs, env, cwd } = options;
|
|
231
|
+
return new Promise((resolve, reject) => {
|
|
232
|
+
const child = spawn(binary, args, {
|
|
233
|
+
env: env ?? process.env,
|
|
234
|
+
cwd,
|
|
235
|
+
stdio: [
|
|
236
|
+
"ignore",
|
|
237
|
+
"pipe",
|
|
238
|
+
"pipe"
|
|
211
239
|
],
|
|
212
|
-
|
|
240
|
+
detached: true
|
|
213
241
|
});
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
/** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
|
|
255
|
-
const emitOtel = trajectoryToOtlp;
|
|
256
|
-
/** Map view success flag to OTLP span status on the root invoke_agent span. */
|
|
257
|
-
function viewStatus(view) {
|
|
258
|
-
if (view.success) return { code: StatusCode.OK };
|
|
259
|
-
return {
|
|
260
|
-
code: StatusCode.ERROR,
|
|
261
|
-
message: "harness run did not complete successfully"
|
|
262
|
-
};
|
|
242
|
+
const chunks = [];
|
|
243
|
+
child.stdout?.setEncoding("utf8");
|
|
244
|
+
child.stdout?.on("data", (c) => chunks.push(c));
|
|
245
|
+
const stderrChunks = [];
|
|
246
|
+
child.stderr?.setEncoding("utf8");
|
|
247
|
+
child.stderr?.on("data", (c) => stderrChunks.push(c));
|
|
248
|
+
let killEscalation = null;
|
|
249
|
+
const timer = setTimeout(() => {
|
|
250
|
+
killTree(child, "SIGTERM");
|
|
251
|
+
killEscalation = setTimeout(() => killTree(child, "SIGKILL"), KILL_GRACE_MS);
|
|
252
|
+
const stderrHint = stderrChunks.join("").trim().slice(0, 400);
|
|
253
|
+
reject(/* @__PURE__ */ new Error(`grader timed out after ${timeoutMs}ms` + (stderrHint ? ` (stderr: ${stderrHint})` : "")));
|
|
254
|
+
}, timeoutMs);
|
|
255
|
+
const finalize = (err, output) => {
|
|
256
|
+
clearTimeout(timer);
|
|
257
|
+
if (killEscalation) clearTimeout(killEscalation);
|
|
258
|
+
if (err) reject(err);
|
|
259
|
+
else resolve(output ?? chunks.join(""));
|
|
260
|
+
};
|
|
261
|
+
child.on("error", (err) => finalize(err));
|
|
262
|
+
child.on("close", (code) => {
|
|
263
|
+
const stdout = chunks.join("");
|
|
264
|
+
const stderr = stderrChunks.join("");
|
|
265
|
+
if (stdout.length > 0) {
|
|
266
|
+
finalize(void 0, stdout);
|
|
267
|
+
return;
|
|
268
|
+
}
|
|
269
|
+
const stderrJson = extractJsonPayload(stderr);
|
|
270
|
+
if (stderrJson) {
|
|
271
|
+
finalize(void 0, stderrJson);
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
if (code !== 0) {
|
|
275
|
+
finalize(/* @__PURE__ */ new Error(`grader exited ${code}: ${stderr.slice(0, 500)}`));
|
|
276
|
+
return;
|
|
277
|
+
}
|
|
278
|
+
finalize(void 0, stdout);
|
|
279
|
+
});
|
|
280
|
+
});
|
|
263
281
|
}
|
|
264
282
|
/**
|
|
265
|
-
*
|
|
283
|
+
* Return trailing JSON object from mixed stderr output.
|
|
266
284
|
*
|
|
267
|
-
*
|
|
268
|
-
*
|
|
269
|
-
* start/end times on every span.
|
|
285
|
+
* Gemini CLI judge runs sometimes print warnings before the JSON envelope;
|
|
286
|
+
* scan from the first `{` and validate with `JSON.parse`.
|
|
270
287
|
*/
|
|
271
|
-
function
|
|
272
|
-
const
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
const slotStart = offset;
|
|
283
|
-
const slotEnd = offset + slotMs;
|
|
284
|
-
timings.push({
|
|
285
|
-
startNs: msToNs(slotStart),
|
|
286
|
-
endNs: msToNs(slotEnd)
|
|
287
|
-
});
|
|
288
|
-
offset = slotEnd;
|
|
288
|
+
function extractJsonPayload(text) {
|
|
289
|
+
const trimmed = text.trim();
|
|
290
|
+
if (!trimmed) return null;
|
|
291
|
+
const jsonStart = trimmed.indexOf("{");
|
|
292
|
+
if (jsonStart < 0) return null;
|
|
293
|
+
const candidate = trimmed.slice(jsonStart);
|
|
294
|
+
try {
|
|
295
|
+
JSON.parse(candidate);
|
|
296
|
+
return candidate;
|
|
297
|
+
} catch {
|
|
298
|
+
return null;
|
|
289
299
|
}
|
|
290
|
-
return timings;
|
|
291
300
|
}
|
|
301
|
+
//#endregion
|
|
302
|
+
//#region src/grader/claude-grader.ts
|
|
292
303
|
/**
|
|
293
|
-
*
|
|
294
|
-
*
|
|
295
|
-
* Uses SHA-256 truncation so the same session always maps to the same trace.
|
|
304
|
+
* Grade expectations by spawning Claude as judge (skill-creator grader pattern).
|
|
296
305
|
*/
|
|
297
|
-
|
|
298
|
-
return createHash("sha256").update(`harness-eval:trace:${sessionId}`).digest("hex").slice(0, 32).toUpperCase();
|
|
299
|
-
}
|
|
306
|
+
const DEFAULT_TIMEOUT_MS$2 = 3e5;
|
|
300
307
|
/**
|
|
301
|
-
*
|
|
308
|
+
* Judge subprocess defaults — grading is a single-shot JSON response, not an agent session.
|
|
309
|
+
* Without these, Claude Code may load plugins/MCP and loop on tools until timeout.
|
|
302
310
|
*/
|
|
303
|
-
|
|
304
|
-
|
|
311
|
+
const JUDGE_CLAUDE_DEFAULTS = {
|
|
312
|
+
maxTurns: 1,
|
|
313
|
+
bare: true,
|
|
314
|
+
disableSlashCommands: true,
|
|
315
|
+
noSessionPersistence: true
|
|
316
|
+
};
|
|
317
|
+
/** Merge user-supplied Claude Code options over judge-safe defaults. */
|
|
318
|
+
function mergeJudgeClaudeOptions(claudeCode) {
|
|
319
|
+
return {
|
|
320
|
+
...JUDGE_CLAUDE_DEFAULTS,
|
|
321
|
+
...claudeCode
|
|
322
|
+
};
|
|
305
323
|
}
|
|
306
|
-
/**
|
|
307
|
-
function
|
|
308
|
-
return
|
|
324
|
+
/** Factory returning a {@link GraderFn} bound to subprocess options. */
|
|
325
|
+
function createClaudeGrader(options = {}) {
|
|
326
|
+
return (input) => runClaudeGrader(input, options);
|
|
309
327
|
}
|
|
310
|
-
//#endregion
|
|
311
|
-
//#region src/grader/prompt.ts
|
|
312
328
|
/**
|
|
313
|
-
*
|
|
329
|
+
* Spawn Claude as judge, parse JSON response, align with input expectations.
|
|
314
330
|
*
|
|
315
|
-
*
|
|
331
|
+
* Unparseable output fails all expectations and sets {@link GraderOutput.error}.
|
|
316
332
|
*/
|
|
317
|
-
function
|
|
318
|
-
const
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
333
|
+
async function runClaudeGrader(input, options = {}) {
|
|
334
|
+
const binary = options.binary ?? options.claudeCode?.binary ?? "claude";
|
|
335
|
+
const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS$2;
|
|
336
|
+
const prompt = buildGraderPrompt(input);
|
|
337
|
+
const model = options.model ?? options.claudeCode?.model;
|
|
338
|
+
const responseText = extractClaudeResponseText(await spawnCollectStdout({
|
|
339
|
+
binary,
|
|
340
|
+
args: buildJudgeArgs(prompt, {
|
|
341
|
+
...mergeJudgeClaudeOptions(options.claudeCode),
|
|
342
|
+
model
|
|
343
|
+
}),
|
|
344
|
+
timeoutMs,
|
|
345
|
+
env: buildChildEnv(options.env),
|
|
346
|
+
cwd: options.cwd
|
|
347
|
+
}));
|
|
348
|
+
const parsed = parseGraderJson(responseText);
|
|
349
|
+
if (!parsed) return {
|
|
350
|
+
expectations: input.expectations.map((text) => ({
|
|
351
|
+
text,
|
|
352
|
+
passed: false,
|
|
353
|
+
evidence: "Grader returned unparseable output"
|
|
354
|
+
})),
|
|
355
|
+
summary: {
|
|
356
|
+
passed: 0,
|
|
357
|
+
failed: input.expectations.length,
|
|
358
|
+
total: input.expectations.length,
|
|
359
|
+
passRate: 0
|
|
360
|
+
},
|
|
361
|
+
error: `failed to parse grader JSON from response: ${responseText.slice(0, 200)}`
|
|
362
|
+
};
|
|
363
|
+
const expectations = input.expectations.map((text, i) => {
|
|
364
|
+
const graded = parsed.expectations[i];
|
|
365
|
+
return {
|
|
366
|
+
text,
|
|
367
|
+
passed: graded?.passed ?? false,
|
|
368
|
+
evidence: graded?.evidence ?? "No evidence returned"
|
|
369
|
+
};
|
|
370
|
+
});
|
|
371
|
+
const passed = expectations.filter((e) => e.passed).length;
|
|
372
|
+
const total = expectations.length;
|
|
373
|
+
return {
|
|
374
|
+
expectations,
|
|
375
|
+
summary: {
|
|
376
|
+
passed,
|
|
377
|
+
failed: total - passed,
|
|
378
|
+
total,
|
|
379
|
+
passRate: total === 0 ? 0 : passed / total
|
|
380
|
+
},
|
|
381
|
+
evalFeedback: parsed.evalFeedback
|
|
382
|
+
};
|
|
352
383
|
}
|
|
353
|
-
|
|
354
|
-
|
|
384
|
+
/**
|
|
385
|
+
* Build subprocess env, stripping CLAUDECODE to avoid nested-session guards.
|
|
386
|
+
*/
|
|
387
|
+
function buildChildEnv(extraEnv) {
|
|
388
|
+
const env = {
|
|
389
|
+
...process.env,
|
|
390
|
+
...extraEnv
|
|
391
|
+
};
|
|
392
|
+
delete env.CLAUDECODE;
|
|
393
|
+
return env;
|
|
355
394
|
}
|
|
356
395
|
//#endregion
|
|
357
|
-
//#region src/grader/
|
|
396
|
+
//#region src/grader/codex-grader.ts
|
|
358
397
|
/**
|
|
359
|
-
*
|
|
360
|
-
*
|
|
361
|
-
* Handles plain text, single JSON result envelopes, stream-json arrays, and
|
|
362
|
-
* assistant message objects — the judge subprocess may emit any of these
|
|
363
|
-
* depending on Claude Code version and flags.
|
|
398
|
+
* Grade expectations by spawning Codex as judge.
|
|
364
399
|
*/
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
}
|
|
379
|
-
} catch {}
|
|
380
|
-
return trimmed;
|
|
381
|
-
}
|
|
382
|
-
/** Walk a stream-json event array and return the final assistant or result text. */
|
|
383
|
-
function extractFromEventArray(events) {
|
|
384
|
-
const result = events.find((e) => typeof e === "object" && e !== null && e.type === "result");
|
|
385
|
-
if (result?.result) return result.result;
|
|
386
|
-
const assistantTexts = [];
|
|
387
|
-
for (const event of events) if (typeof event === "object" && event !== null && event.type === "assistant") {
|
|
388
|
-
const text = textFromAssistantMessage(event.message);
|
|
389
|
-
if (text) assistantTexts.push(text);
|
|
390
|
-
}
|
|
391
|
-
if (assistantTexts.length > 0) return assistantTexts[assistantTexts.length - 1];
|
|
392
|
-
return null;
|
|
400
|
+
const DEFAULT_TIMEOUT_MS$1 = 3e5;
|
|
401
|
+
/** Judge subprocess defaults — single-shot grading without persistent sessions. */
|
|
402
|
+
const JUDGE_CODEX_DEFAULTS = {
|
|
403
|
+
ephemeral: true,
|
|
404
|
+
ignoreUserConfig: true,
|
|
405
|
+
skipGitRepoCheck: true
|
|
406
|
+
};
|
|
407
|
+
/** Merge user-supplied Codex options over judge-safe defaults. */
|
|
408
|
+
function mergeJudgeCodexOptions(codex) {
|
|
409
|
+
return {
|
|
410
|
+
...JUDGE_CODEX_DEFAULTS,
|
|
411
|
+
...codex
|
|
412
|
+
};
|
|
393
413
|
}
|
|
394
|
-
/**
|
|
395
|
-
function
|
|
396
|
-
|
|
397
|
-
const content = message.content;
|
|
398
|
-
if (typeof content === "string") return content;
|
|
399
|
-
if (!Array.isArray(content)) return null;
|
|
400
|
-
const texts = [];
|
|
401
|
-
for (const block of content) if (typeof block === "object" && block !== null && block.type === "text" && typeof block.text === "string") texts.push(block.text);
|
|
402
|
-
return texts.length > 0 ? texts.join("\n") : null;
|
|
414
|
+
/** Factory returning a {@link GraderFn} bound to subprocess options. */
|
|
415
|
+
function createCodexGrader(options = {}) {
|
|
416
|
+
return (input) => runCodexGrader(input, options);
|
|
403
417
|
}
|
|
404
418
|
/**
|
|
405
|
-
*
|
|
419
|
+
* Spawn Codex as judge, parse JSON response, align with input expectations.
|
|
406
420
|
*
|
|
407
|
-
*
|
|
408
|
-
* substrings. Returns null when no valid expectations array is found.
|
|
421
|
+
* Unparseable output fails all expectations and sets {@link GraderOutput.error}.
|
|
409
422
|
*/
|
|
410
|
-
function
|
|
411
|
-
const
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
const end = text.lastIndexOf("}");
|
|
429
|
-
if (start >= 0 && end > start) return text.slice(start, end + 1);
|
|
430
|
-
return null;
|
|
431
|
-
}
|
|
432
|
-
/** Map raw grader JSON to runtime {@link GraderOutput} with computed summary. */
|
|
433
|
-
function normalizeGraderJson(raw) {
|
|
434
|
-
const expectations = (raw.expectations ?? []).map((e) => ({
|
|
435
|
-
text: e.text ?? "",
|
|
436
|
-
passed: Boolean(e.passed),
|
|
437
|
-
evidence: e.evidence ?? ""
|
|
423
|
+
async function runCodexGrader(input, options = {}) {
|
|
424
|
+
const binary = options.binary ?? options.codex?.binary ?? "codex";
|
|
425
|
+
const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS$1;
|
|
426
|
+
const prompt = buildGraderPrompt(input);
|
|
427
|
+
const model = options.model ?? options.codex?.model;
|
|
428
|
+
const responseText = extractCodexResponseText(await spawnCollectStdout({
|
|
429
|
+
binary,
|
|
430
|
+
args: buildJudgeArgs$2(prompt, {
|
|
431
|
+
...mergeJudgeCodexOptions(options.codex),
|
|
432
|
+
model,
|
|
433
|
+
cwd: options.cwd
|
|
434
|
+
}),
|
|
435
|
+
timeoutMs,
|
|
436
|
+
env: {
|
|
437
|
+
...process.env,
|
|
438
|
+
...options.env
|
|
439
|
+
},
|
|
440
|
+
cwd: options.cwd
|
|
438
441
|
}));
|
|
439
|
-
const
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
failed: raw.summary?.failed ?? failed,
|
|
446
|
-
total: raw.summary?.total ?? total,
|
|
447
|
-
passRate
|
|
448
|
-
};
|
|
449
|
-
let evalFeedback;
|
|
450
|
-
if (raw.eval_feedback) evalFeedback = {
|
|
451
|
-
suggestions: (raw.eval_feedback.suggestions ?? []).map((s) => ({
|
|
452
|
-
assertion: s.assertion,
|
|
453
|
-
reason: s.reason ?? ""
|
|
442
|
+
const parsed = parseGraderJson(responseText);
|
|
443
|
+
if (!parsed) return {
|
|
444
|
+
expectations: input.expectations.map((text) => ({
|
|
445
|
+
text,
|
|
446
|
+
passed: false,
|
|
447
|
+
evidence: "Grader returned unparseable output"
|
|
454
448
|
})),
|
|
455
|
-
|
|
449
|
+
summary: {
|
|
450
|
+
passed: 0,
|
|
451
|
+
failed: input.expectations.length,
|
|
452
|
+
total: input.expectations.length,
|
|
453
|
+
passRate: 0
|
|
454
|
+
},
|
|
455
|
+
error: `failed to parse grader JSON from response: ${responseText.slice(0, 200)}`
|
|
456
456
|
};
|
|
457
|
+
const expectations = input.expectations.map((text, i) => {
|
|
458
|
+
const graded = parsed.expectations[i];
|
|
459
|
+
return {
|
|
460
|
+
text,
|
|
461
|
+
passed: graded?.passed ?? false,
|
|
462
|
+
evidence: graded?.evidence ?? "No evidence returned"
|
|
463
|
+
};
|
|
464
|
+
});
|
|
465
|
+
const passed = expectations.filter((e) => e.passed).length;
|
|
466
|
+
const total = expectations.length;
|
|
457
467
|
return {
|
|
458
468
|
expectations,
|
|
459
|
-
summary
|
|
460
|
-
|
|
469
|
+
summary: {
|
|
470
|
+
passed,
|
|
471
|
+
failed: total - passed,
|
|
472
|
+
total,
|
|
473
|
+
passRate: total === 0 ? 0 : passed / total
|
|
474
|
+
},
|
|
475
|
+
evalFeedback: parsed.evalFeedback
|
|
461
476
|
};
|
|
462
477
|
}
|
|
463
478
|
//#endregion
|
|
464
|
-
//#region src/grader/
|
|
479
|
+
//#region src/grader/gemini-cli-grader.ts
|
|
465
480
|
/**
|
|
466
|
-
* Grade expectations by spawning
|
|
481
|
+
* Grade expectations by spawning Gemini CLI as judge.
|
|
467
482
|
*/
|
|
468
483
|
const DEFAULT_TIMEOUT_MS = 3e5;
|
|
469
|
-
/**
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
*/
|
|
473
|
-
|
|
474
|
-
maxTurns: 1,
|
|
475
|
-
bare: true,
|
|
476
|
-
disableSlashCommands: true,
|
|
477
|
-
noSessionPersistence: true
|
|
484
|
+
/** Judge subprocess defaults — single-shot grading without interactive approval. */
|
|
485
|
+
const JUDGE_GEMINI_CLI_DEFAULTS = {
|
|
486
|
+
approvalMode: "yolo",
|
|
487
|
+
/** Avoid loading user MCP servers, skills, and extensions for lightweight grading. */
|
|
488
|
+
isolateConfig: true
|
|
478
489
|
};
|
|
479
|
-
/** Merge user-supplied
|
|
480
|
-
function
|
|
490
|
+
/** Merge user-supplied Gemini CLI options over judge-safe defaults. */
|
|
491
|
+
function mergeJudgeGeminiCliOptions(geminiCli) {
|
|
481
492
|
return {
|
|
482
|
-
...
|
|
483
|
-
...
|
|
493
|
+
...JUDGE_GEMINI_CLI_DEFAULTS,
|
|
494
|
+
...geminiCli
|
|
484
495
|
};
|
|
485
496
|
}
|
|
486
497
|
/** Factory returning a {@link GraderFn} bound to subprocess options. */
|
|
487
|
-
function
|
|
488
|
-
return (input) =>
|
|
498
|
+
function createGeminiCliGrader(options = {}) {
|
|
499
|
+
return (input) => runGeminiCliGrader(input, options);
|
|
489
500
|
}
|
|
490
501
|
/**
|
|
491
|
-
* Spawn
|
|
502
|
+
* Spawn Gemini CLI as judge, parse JSON response, align with input expectations.
|
|
492
503
|
*
|
|
493
|
-
*
|
|
504
|
+
* Uses {@link prepareGeminiCliEnv} for config isolation and {@link spawnCollectStdout}
|
|
505
|
+
* which may recover JSON from stderr when stdout is empty. Unparseable output fails
|
|
506
|
+
* all expectations and sets {@link GraderOutput.error}.
|
|
494
507
|
*/
|
|
495
|
-
async function
|
|
496
|
-
const binary = options.binary ?? options.
|
|
508
|
+
async function runGeminiCliGrader(input, options = {}) {
|
|
509
|
+
const binary = options.binary ?? options.geminiCli?.binary ?? "gemini";
|
|
497
510
|
const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
498
511
|
const prompt = buildGraderPrompt(input);
|
|
499
|
-
const model = options.model ?? options.
|
|
500
|
-
const
|
|
501
|
-
|
|
512
|
+
const model = options.model ?? options.geminiCli?.model;
|
|
513
|
+
const geminiCli = mergeJudgeGeminiCliOptions(options.geminiCli);
|
|
514
|
+
const args = buildJudgeArgs$1(prompt, {
|
|
515
|
+
...geminiCli,
|
|
502
516
|
model
|
|
503
|
-
})
|
|
517
|
+
});
|
|
518
|
+
const { env, cleanup } = await prepareGeminiCliEnv({
|
|
519
|
+
isolateConfig: geminiCli.isolateConfig,
|
|
520
|
+
env: options.env
|
|
521
|
+
});
|
|
522
|
+
let stdout;
|
|
523
|
+
try {
|
|
524
|
+
stdout = await spawnCollectStdout({
|
|
525
|
+
binary,
|
|
526
|
+
args,
|
|
527
|
+
timeoutMs,
|
|
528
|
+
env,
|
|
529
|
+
cwd: options.cwd
|
|
530
|
+
});
|
|
531
|
+
} finally {
|
|
532
|
+
await cleanup();
|
|
533
|
+
}
|
|
534
|
+
const responseText = extractGeminiCliResponseText(stdout);
|
|
504
535
|
const parsed = parseGraderJson(responseText);
|
|
505
536
|
if (!parsed) return {
|
|
506
537
|
expectations: input.expectations.map((text) => ({
|
|
@@ -537,57 +568,6 @@ async function runClaudeGrader(input, options = {}) {
|
|
|
537
568
|
evalFeedback: parsed.evalFeedback
|
|
538
569
|
};
|
|
539
570
|
}
|
|
540
|
-
/**
|
|
541
|
-
* Spawn a child process and collect stdout until exit or timeout.
|
|
542
|
-
*
|
|
543
|
-
* Non-zero exit with empty stdout is treated as failure; partial stdout on
|
|
544
|
-
* non-zero exit is retained (Claude sometimes exits non-zero after emitting JSON).
|
|
545
|
-
*/
|
|
546
|
-
function spawnCollectStdout(binary, args, timeoutMs, extraEnv, cwd) {
|
|
547
|
-
return new Promise((resolve, reject) => {
|
|
548
|
-
const child = spawn(binary, args, {
|
|
549
|
-
env: buildChildEnv(extraEnv),
|
|
550
|
-
cwd,
|
|
551
|
-
stdio: [
|
|
552
|
-
"ignore",
|
|
553
|
-
"pipe",
|
|
554
|
-
"pipe"
|
|
555
|
-
]
|
|
556
|
-
});
|
|
557
|
-
const chunks = [];
|
|
558
|
-
child.stdout?.setEncoding("utf8");
|
|
559
|
-
child.stdout?.on("data", (c) => chunks.push(c));
|
|
560
|
-
const stderrChunks = [];
|
|
561
|
-
child.stderr?.setEncoding("utf8");
|
|
562
|
-
child.stderr?.on("data", (c) => stderrChunks.push(c));
|
|
563
|
-
const timer = setTimeout(() => {
|
|
564
|
-
child.kill("SIGTERM");
|
|
565
|
-
const stderrHint = stderrChunks.join("").trim().slice(0, 400);
|
|
566
|
-
reject(/* @__PURE__ */ new Error(`grader timed out after ${timeoutMs}ms` + (stderrHint ? ` (stderr: ${stderrHint})` : "")));
|
|
567
|
-
}, timeoutMs);
|
|
568
|
-
const finalize = (err) => {
|
|
569
|
-
clearTimeout(timer);
|
|
570
|
-
if (err) reject(err);
|
|
571
|
-
else resolve(chunks.join(""));
|
|
572
|
-
};
|
|
573
|
-
child.on("error", (err) => finalize(err));
|
|
574
|
-
child.on("close", (code) => {
|
|
575
|
-
if (code !== 0 && chunks.length === 0) finalize(/* @__PURE__ */ new Error(`grader exited ${code}: ${stderrChunks.join("").slice(0, 500)}`));
|
|
576
|
-
else finalize();
|
|
577
|
-
});
|
|
578
|
-
});
|
|
579
|
-
}
|
|
580
|
-
/**
|
|
581
|
-
* Build subprocess env, stripping CLAUDECODE to avoid nested-session guards.
|
|
582
|
-
*/
|
|
583
|
-
function buildChildEnv(extraEnv) {
|
|
584
|
-
const env = {
|
|
585
|
-
...process.env,
|
|
586
|
-
...extraEnv
|
|
587
|
-
};
|
|
588
|
-
delete env.CLAUDECODE;
|
|
589
|
-
return env;
|
|
590
|
-
}
|
|
591
571
|
//#endregion
|
|
592
572
|
//#region src/grader/expectations.ts
|
|
593
573
|
/**
|
|
@@ -663,6 +643,32 @@ function truncate(text) {
|
|
|
663
643
|
return `${text.slice(0, MAX_RESULT_CHARS)}… (truncated)`;
|
|
664
644
|
}
|
|
665
645
|
//#endregion
|
|
646
|
+
//#region src/eval-record/judge-metadata.ts
|
|
647
|
+
/** Map harness grading adapter id to a stable judge identifier. */
|
|
648
|
+
function judgeIdForAdapter(adapter) {
|
|
649
|
+
switch (adapter) {
|
|
650
|
+
case "codex": return "harness-eval/codex-grader";
|
|
651
|
+
case "claude-code": return "harness-eval/claude-grader";
|
|
652
|
+
default: return adapter ? `harness-eval/${adapter}-grader` : "harness-eval/claude-grader";
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
/** Build {@link JudgeInfo} from grading adapter and optional model override. */
|
|
656
|
+
function resolveJudgeInfo(options) {
|
|
657
|
+
const adapter = options.adapter ?? "claude-code";
|
|
658
|
+
return {
|
|
659
|
+
id: options.id ?? judgeIdForAdapter(adapter),
|
|
660
|
+
model: options.model,
|
|
661
|
+
adapter
|
|
662
|
+
};
|
|
663
|
+
}
|
|
664
|
+
/** Derive judge metadata from a parsed grading YAML config. */
|
|
665
|
+
function judgeInfoFromGradingConfig(config) {
|
|
666
|
+
return resolveJudgeInfo({
|
|
667
|
+
adapter: config.judge.adapter ?? "claude-code",
|
|
668
|
+
model: config.judge.model ?? config.judge.codex?.model ?? config.judge.claudeCode?.model
|
|
669
|
+
});
|
|
670
|
+
}
|
|
671
|
+
//#endregion
|
|
666
672
|
//#region src/grader/grade-report.ts
|
|
667
673
|
/**
|
|
668
674
|
* Grade a harness-eval SuiteReport with outcome expectations (LLM judge).
|
|
@@ -675,14 +681,28 @@ function truncate(text) {
|
|
|
675
681
|
*/
|
|
676
682
|
async function gradeReport(report, options = {}) {
|
|
677
683
|
const expectationsMap = options.expectationsPath ? await loadExpectationsMap(options.expectationsPath) : {};
|
|
678
|
-
const gradeFn = options.gradeFn ??
|
|
684
|
+
const gradeFn = options.gradeFn ?? (options.judgeAdapter === "codex" ? createCodexGrader({
|
|
685
|
+
binary: options.binary,
|
|
686
|
+
model: options.model,
|
|
687
|
+
timeoutMs: options.timeoutMs,
|
|
688
|
+
env: options.env,
|
|
689
|
+
cwd: options.cwd,
|
|
690
|
+
codex: options.codex
|
|
691
|
+
}) : options.judgeAdapter === "gemini-cli" ? createGeminiCliGrader({
|
|
692
|
+
binary: options.binary,
|
|
693
|
+
model: options.model,
|
|
694
|
+
timeoutMs: options.timeoutMs,
|
|
695
|
+
env: options.env,
|
|
696
|
+
cwd: options.cwd,
|
|
697
|
+
geminiCli: options.geminiCli
|
|
698
|
+
}) : createClaudeGrader({
|
|
679
699
|
binary: options.binary,
|
|
680
700
|
model: options.model,
|
|
681
701
|
timeoutMs: options.timeoutMs,
|
|
682
702
|
env: options.env,
|
|
683
703
|
cwd: options.cwd,
|
|
684
704
|
claudeCode: options.claudeCode
|
|
685
|
-
});
|
|
705
|
+
}));
|
|
686
706
|
const limit = createLimit(options.maxConcurrent ?? 2);
|
|
687
707
|
const tasks = [];
|
|
688
708
|
for (const cell of report.cells) {
|
|
@@ -787,6 +807,10 @@ async function gradeReport(report, options = {}) {
|
|
|
787
807
|
gradedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
788
808
|
sourceReport: options.sourceReport ?? "",
|
|
789
809
|
gradingConfigPath: options.gradingConfigPath,
|
|
810
|
+
judge: resolveJudgeInfo({
|
|
811
|
+
adapter: options.judgeAdapter ?? "claude-code",
|
|
812
|
+
model: options.model
|
|
813
|
+
}),
|
|
790
814
|
results,
|
|
791
815
|
summary: {
|
|
792
816
|
passed: passedExpectations,
|
|
@@ -809,10 +833,49 @@ async function loadSuiteReport(path) {
|
|
|
809
833
|
function resolveGradeOptions(fileConfig, cli = {}, configPath) {
|
|
810
834
|
const judge = fileConfig?.judge;
|
|
811
835
|
const adapter = judge?.adapter ?? "claude-code";
|
|
812
|
-
if (adapter !== "claude-code") throw new Error(`unsupported grading adapter "${adapter}" (only claude-code today)`);
|
|
813
836
|
const claudeCode = judge?.claudeCode ?? {};
|
|
814
|
-
const
|
|
815
|
-
const
|
|
837
|
+
const codex = judge?.codex ?? {};
|
|
838
|
+
const geminiCli = judge?.geminiCli ?? {};
|
|
839
|
+
const adapterBlock = adapter === "codex" ? codex : adapter === "gemini-cli" ? geminiCli : claudeCode;
|
|
840
|
+
const binary = cli.binary ?? adapterBlock.binary;
|
|
841
|
+
const model = cli.model ?? judge?.model ?? adapterBlock.model;
|
|
842
|
+
if (adapter === "codex") return {
|
|
843
|
+
sourceReport: cli.sourceReport,
|
|
844
|
+
expectationsPath: cli.expectationsPath,
|
|
845
|
+
model,
|
|
846
|
+
binary,
|
|
847
|
+
timeoutMs: cli.timeoutMs ?? judge?.timeoutMs,
|
|
848
|
+
maxConcurrent: cli.maxConcurrent ?? judge?.maxConcurrent,
|
|
849
|
+
systemInstruction: judge?.system_instruction,
|
|
850
|
+
env: judge?.env,
|
|
851
|
+
cwd: judge?.cwd,
|
|
852
|
+
judgeAdapter: "codex",
|
|
853
|
+
codex: {
|
|
854
|
+
...codex,
|
|
855
|
+
binary: void 0,
|
|
856
|
+
model: void 0
|
|
857
|
+
},
|
|
858
|
+
gradingConfigPath: configPath
|
|
859
|
+
};
|
|
860
|
+
if (adapter === "gemini-cli") return {
|
|
861
|
+
sourceReport: cli.sourceReport,
|
|
862
|
+
expectationsPath: cli.expectationsPath,
|
|
863
|
+
model,
|
|
864
|
+
binary,
|
|
865
|
+
timeoutMs: cli.timeoutMs ?? judge?.timeoutMs,
|
|
866
|
+
maxConcurrent: cli.maxConcurrent ?? judge?.maxConcurrent,
|
|
867
|
+
systemInstruction: judge?.system_instruction,
|
|
868
|
+
env: judge?.env,
|
|
869
|
+
cwd: judge?.cwd,
|
|
870
|
+
judgeAdapter: "gemini-cli",
|
|
871
|
+
geminiCli: {
|
|
872
|
+
...geminiCli,
|
|
873
|
+
binary: void 0,
|
|
874
|
+
model: void 0
|
|
875
|
+
},
|
|
876
|
+
gradingConfigPath: configPath
|
|
877
|
+
};
|
|
878
|
+
if (adapter !== "claude-code") throw new Error(`unsupported grading adapter "${adapter}" (supported: claude-code, codex, gemini-cli)`);
|
|
816
879
|
return {
|
|
817
880
|
sourceReport: cli.sourceReport,
|
|
818
881
|
expectationsPath: cli.expectationsPath,
|
|
@@ -823,6 +886,7 @@ function resolveGradeOptions(fileConfig, cli = {}, configPath) {
|
|
|
823
886
|
systemInstruction: judge?.system_instruction,
|
|
824
887
|
env: judge?.env,
|
|
825
888
|
cwd: judge?.cwd,
|
|
889
|
+
judgeAdapter: "claude-code",
|
|
826
890
|
claudeCode: {
|
|
827
891
|
...claudeCode,
|
|
828
892
|
binary: void 0,
|
|
@@ -870,168 +934,6 @@ function gradingReportPassed(report) {
|
|
|
870
934
|
return report.results.every((r) => !r.graderError && r.summary.failed === 0 && r.summary.total > 0);
|
|
871
935
|
}
|
|
872
936
|
//#endregion
|
|
873
|
-
//#region src/reporter/format-console.ts
|
|
874
|
-
const RESET = "\x1B[0m";
|
|
875
|
-
const GREEN = "\x1B[32m";
|
|
876
|
-
const RED = "\x1B[31m";
|
|
877
|
-
const YELLOW = "\x1B[33m";
|
|
878
|
-
/**
|
|
879
|
-
* Render renderable rows as ANSI-colored console output.
|
|
880
|
-
*
|
|
881
|
-
* @param color When false, emit plain text without escape codes.
|
|
882
|
-
*/
|
|
883
|
-
function formatConsole(rows, color = true) {
|
|
884
|
-
const lines = [];
|
|
885
|
-
for (const row of rows) {
|
|
886
|
-
const status = row.passed ? color ? `${GREEN}PASS${RESET}` : "PASS" : color ? `${RED}FAIL${RESET}` : "FAIL";
|
|
887
|
-
const crashNote = row.adapterErrors > 0 ? ` ${color ? YELLOW : ""}[${row.adapterErrors} adapter errors]${color ? RESET : ""}` : "";
|
|
888
|
-
lines.push(`${row.caseId} @ ${row.cellLabel} ${status}${crashNote}`);
|
|
889
|
-
if (row.category) lines.push(` category: ${row.category}`);
|
|
890
|
-
for (const stat of row.stats) {
|
|
891
|
-
const marker = stat.meetsThreshold ? color ? `${GREEN}✓${RESET}` : "✓" : color ? `${RED}✗${RESET}` : "✗";
|
|
892
|
-
const rateStr = formatRate$1(stat);
|
|
893
|
-
const thresholdPct = (stat.threshold * 100).toFixed(0);
|
|
894
|
-
let line = ` ├─ ${stat.description}: ${rateStr} [threshold ${thresholdPct}%] ${marker}`;
|
|
895
|
-
if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
|
|
896
|
-
const arrow = stat.delta >= 0 ? "↑" : "↓";
|
|
897
|
-
const basePct = (stat.baselinePassRate * 100).toFixed(0);
|
|
898
|
-
const curPct = (stat.passRate * 100).toFixed(0);
|
|
899
|
-
const deltaPct = (stat.delta * 100).toFixed(0);
|
|
900
|
-
line += ` (${basePct}% → ${curPct}% (${arrow}${deltaPct}%))`;
|
|
901
|
-
}
|
|
902
|
-
lines.push(line);
|
|
903
|
-
}
|
|
904
|
-
lines.push("");
|
|
905
|
-
}
|
|
906
|
-
return lines.join("\n").trimEnd();
|
|
907
|
-
}
|
|
908
|
-
/** Format pass rate for display, noting when all reps crashed. */
|
|
909
|
-
function formatRate$1(stat) {
|
|
910
|
-
if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
|
|
911
|
-
const pct = (stat.passRate * 100).toFixed(0);
|
|
912
|
-
return `${stat.passedCount}/${stat.evaluatedCount} (${pct}%)`;
|
|
913
|
-
}
|
|
914
|
-
//#endregion
|
|
915
|
-
//#region src/reporter/format-json.ts
|
|
916
|
-
/**
|
|
917
|
-
* Serialize a suite report as indented JSON (no transformation).
|
|
918
|
-
*
|
|
919
|
-
* Used by `--format json` and `--output` persistence.
|
|
920
|
-
*/
|
|
921
|
-
function formatJson(report) {
|
|
922
|
-
return JSON.stringify(report, null, 2);
|
|
923
|
-
}
|
|
924
|
-
//#endregion
|
|
925
|
-
//#region src/reporter/format-markdown.ts
|
|
926
|
-
/** Render renderable rows as a GitHub-flavored markdown report. */
|
|
927
|
-
function formatMarkdown(rows) {
|
|
928
|
-
const lines = ["# Harness Eval Report", ""];
|
|
929
|
-
for (const row of rows) {
|
|
930
|
-
const status = row.passed ? "PASS" : "FAIL";
|
|
931
|
-
const crashNote = row.adapterErrors > 0 ? ` (${row.adapterErrors} adapter errors)` : "";
|
|
932
|
-
lines.push(`## ${row.caseId} @ ${row.cellLabel} — ${status}${crashNote}`);
|
|
933
|
-
if (row.category) lines.push(`**Category:** ${row.category}`);
|
|
934
|
-
if (row.notes) lines.push("<details><summary>Notes</summary>", row.notes, "</details>");
|
|
935
|
-
lines.push("");
|
|
936
|
-
lines.push("| Assertion | Result | Threshold | Status |");
|
|
937
|
-
lines.push("| --- | --- | --- | --- |");
|
|
938
|
-
for (const stat of row.stats) {
|
|
939
|
-
const rateStr = formatRate(stat);
|
|
940
|
-
const threshold = `${(stat.threshold * 100).toFixed(0)}%`;
|
|
941
|
-
const statusCell = stat.meetsThreshold ? "✓" : "✗";
|
|
942
|
-
let result = rateStr;
|
|
943
|
-
if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
|
|
944
|
-
const base = (stat.baselinePassRate * 100).toFixed(0);
|
|
945
|
-
const cur = (stat.passRate * 100).toFixed(0);
|
|
946
|
-
const d = (stat.delta * 100).toFixed(0);
|
|
947
|
-
const sign = stat.delta >= 0 ? "+" : "";
|
|
948
|
-
result += ` (${base}% → ${cur}%, ${sign}${d}%)`;
|
|
949
|
-
}
|
|
950
|
-
lines.push(`| ${stat.description} | ${result} | ${threshold} | ${statusCell} |`);
|
|
951
|
-
}
|
|
952
|
-
lines.push("");
|
|
953
|
-
}
|
|
954
|
-
return lines.join("\n").trimEnd();
|
|
955
|
-
}
|
|
956
|
-
/** Format pass rate for markdown tables, noting when all reps crashed. */
|
|
957
|
-
function formatRate(stat) {
|
|
958
|
-
if (stat.evaluatedCount === 0) return `0/${stat.totalReps} (all reps crashed)`;
|
|
959
|
-
const pct = (stat.passRate * 100).toFixed(0);
|
|
960
|
-
return `${stat.passedCount}/${stat.evaluatedCount} (${pct}%)`;
|
|
961
|
-
}
|
|
962
|
-
//#endregion
|
|
963
|
-
//#region src/reporter/renderable.ts
|
|
964
|
-
/** Map a suite report to formatter-ready rows (one per cell). */
|
|
965
|
-
function toRenderableRows(report) {
|
|
966
|
-
return report.cells.map((cell) => cellToRow(cell));
|
|
967
|
-
}
|
|
968
|
-
/**
|
|
969
|
-
* Attach baseline pass-rate deltas to matching rows.
|
|
970
|
-
*
|
|
971
|
-
* Rows without a matching baseline cell are returned unchanged.
|
|
972
|
-
*/
|
|
973
|
-
function applyBaseline(rows, baseline) {
|
|
974
|
-
const baselineMap = new Map(baseline.cells.map((c) => [`${c.caseId}::${c.cell.label}`, c]));
|
|
975
|
-
return rows.map((row) => {
|
|
976
|
-
const baseCell = baselineMap.get(`${row.caseId}::${row.cellLabel}`);
|
|
977
|
-
if (!baseCell) return row;
|
|
978
|
-
const stats = row.stats.map((stat, i) => {
|
|
979
|
-
const baseStat = baseCell.assertionStats[i];
|
|
980
|
-
if (!baseStat) return stat;
|
|
981
|
-
const delta = stat.passRate - baseStat.passRate;
|
|
982
|
-
return {
|
|
983
|
-
...stat,
|
|
984
|
-
baselinePassRate: baseStat.passRate,
|
|
985
|
-
delta
|
|
986
|
-
};
|
|
987
|
-
});
|
|
988
|
-
return {
|
|
989
|
-
...row,
|
|
990
|
-
stats
|
|
991
|
-
};
|
|
992
|
-
});
|
|
993
|
-
}
|
|
994
|
-
/** Convert one {@link CellReport} to a {@link RenderableRow}. */
|
|
995
|
-
function cellToRow(cell) {
|
|
996
|
-
const totalReps = cell.repetitions.length;
|
|
997
|
-
const stats = cell.assertionStats.map((s) => ({
|
|
998
|
-
description: s.description,
|
|
999
|
-
threshold: s.threshold,
|
|
1000
|
-
passedCount: s.passedCount,
|
|
1001
|
-
evaluatedCount: s.evaluatedCount,
|
|
1002
|
-
totalReps,
|
|
1003
|
-
adapterErrors: cell.adapterErrors,
|
|
1004
|
-
passRate: s.passRate,
|
|
1005
|
-
meetsThreshold: s.meetsThreshold
|
|
1006
|
-
}));
|
|
1007
|
-
return {
|
|
1008
|
-
caseId: cell.caseId,
|
|
1009
|
-
category: cell.category,
|
|
1010
|
-
notes: cell.notes,
|
|
1011
|
-
cellLabel: cell.cell.label,
|
|
1012
|
-
passed: cell.passed,
|
|
1013
|
-
adapterErrors: cell.adapterErrors,
|
|
1014
|
-
totalReps,
|
|
1015
|
-
stats
|
|
1016
|
-
};
|
|
1017
|
-
}
|
|
1018
|
-
//#endregion
|
|
1019
|
-
//#region src/reporter/index.ts
|
|
1020
|
-
/**
|
|
1021
|
-
* Format a {@link SuiteReport} for console, markdown, or JSON output.
|
|
1022
|
-
*
|
|
1023
|
-
* JSON format bypasses the renderable intermediate model and serializes the
|
|
1024
|
-
* report directly. Console and markdown apply optional baseline deltas.
|
|
1025
|
-
*/
|
|
1026
|
-
function formatReport(report, options) {
|
|
1027
|
-
if (options.format === "json") return formatJson(report);
|
|
1028
|
-
let rows = toRenderableRows(report);
|
|
1029
|
-
if (options.baseline) rows = applyBaseline(rows, options.baseline);
|
|
1030
|
-
const useColor = options.color ?? options.format === "console";
|
|
1031
|
-
if (options.format === "markdown") return formatMarkdown(rows);
|
|
1032
|
-
return formatConsole(rows, useColor);
|
|
1033
|
-
}
|
|
1034
|
-
//#endregion
|
|
1035
937
|
//#region src/eval-interchange/normalize.ts
|
|
1036
938
|
/**
|
|
1037
939
|
* Serialize tool arguments to the Vertex wire string format.
|
|
@@ -1431,6 +1333,36 @@ function outcomePassForCell(_caseId, _cellLabel, repetitions) {
|
|
|
1431
1333
|
if (graded.length === 0) return void 0;
|
|
1432
1334
|
return graded.every((r) => r.outcomeGrades.error === void 0 && r.outcomeGrades.summary.failed === 0);
|
|
1433
1335
|
}
|
|
1336
|
+
/** Resolve judge metadata for envelope export (explicit options win). */
|
|
1337
|
+
async function resolveEnvelopeJudge(options) {
|
|
1338
|
+
if (options.grading?.judge) return options.grading.judge;
|
|
1339
|
+
if (options.gradingConfigPath) try {
|
|
1340
|
+
return judgeInfoFromGradingConfig(await loadGradingConfig(resolve(options.gradingConfigPath)));
|
|
1341
|
+
} catch {}
|
|
1342
|
+
return resolveJudgeInfo({ adapter: "claude-code" });
|
|
1343
|
+
}
|
|
1344
|
+
/** Path to pass to {@link loadSuite} (directory layout uses the suite folder). */
|
|
1345
|
+
async function resolveSuiteLoadPath(suitePath) {
|
|
1346
|
+
const abs = resolve(suitePath);
|
|
1347
|
+
if (basename(abs) === "suite.yaml") return dirname(abs);
|
|
1348
|
+
try {
|
|
1349
|
+
if ((await stat(abs)).isDirectory()) return abs;
|
|
1350
|
+
} catch {}
|
|
1351
|
+
return abs;
|
|
1352
|
+
}
|
|
1353
|
+
/** Read suite YAML bytes for content hashing. */
|
|
1354
|
+
async function readSuiteYamlContent(suitePath) {
|
|
1355
|
+
const loadPath = await resolveSuiteLoadPath(suitePath);
|
|
1356
|
+
return readFile(basename(resolve(suitePath)) === "suite.yaml" ? resolve(suitePath) : join(loadPath, "suite.yaml"), "utf8");
|
|
1357
|
+
}
|
|
1358
|
+
async function resolveEnvelopeHarnessAdapter(options) {
|
|
1359
|
+
if (options.harnessAdapter) return options.harnessAdapter;
|
|
1360
|
+
if (options.suitePath) try {
|
|
1361
|
+
const suite = await loadSuite(await resolveSuiteLoadPath(options.suitePath));
|
|
1362
|
+
if (suite.adapter) return suite.adapter;
|
|
1363
|
+
} catch {}
|
|
1364
|
+
return "claude-code";
|
|
1365
|
+
}
|
|
1434
1366
|
/**
|
|
1435
1367
|
* Convert a {@link SuiteReport} (and optional grading) into a versioned
|
|
1436
1368
|
* {@link EvalRunEnvelope} for storage or API handoff.
|
|
@@ -1442,7 +1374,7 @@ function outcomePassForCell(_caseId, _cellLabel, repetitions) {
|
|
|
1442
1374
|
function buildEvalRunEnvelope(report, options = {}) {
|
|
1443
1375
|
const includeTranscript = options.includeTranscript !== false;
|
|
1444
1376
|
const includeRaw = options.includeRawStreamEvents === true;
|
|
1445
|
-
const judge = options.grading?.judge ?? {
|
|
1377
|
+
const judge = options.grading?.judge ?? resolveJudgeInfo({ adapter: "claude-code" });
|
|
1446
1378
|
const cells = report.cells.map((cell) => {
|
|
1447
1379
|
const prompt = cell.prompt ?? "";
|
|
1448
1380
|
const referenceTrajectoryConfig = cell.reference_trajectory;
|
|
@@ -1529,111 +1461,925 @@ function buildEvalRunEnvelope(report, options = {}) {
|
|
|
1529
1461
|
};
|
|
1530
1462
|
}
|
|
1531
1463
|
/**
|
|
1532
|
-
* Build an envelope from on-disk runner and grader JSON artifacts.
|
|
1533
|
-
*
|
|
1534
|
-
* Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
|
|
1535
|
-
* outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
|
|
1536
|
-
* attaches suite URI and SHA-256 content hash for reproducibility.
|
|
1464
|
+
* Build an envelope from on-disk runner and grader JSON artifacts.
|
|
1465
|
+
*
|
|
1466
|
+
* Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
|
|
1467
|
+
* outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
|
|
1468
|
+
* attaches suite URI and SHA-256 content hash for reproducibility.
|
|
1469
|
+
*
|
|
1470
|
+
* @param reportPath - Path to the suite run report JSON from `harness-eval run`.
|
|
1471
|
+
* @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
|
|
1472
|
+
*/
|
|
1473
|
+
async function buildEvalRunEnvelopeFromFiles(reportPath, options = {}) {
|
|
1474
|
+
const reportText = await readFile(reportPath, "utf8");
|
|
1475
|
+
const report = JSON.parse(reportText);
|
|
1476
|
+
const harnessAdapter = await resolveEnvelopeHarnessAdapter({
|
|
1477
|
+
harnessAdapter: options.harness?.adapter,
|
|
1478
|
+
suitePath: options.suitePath
|
|
1479
|
+
});
|
|
1480
|
+
let grading = options.grading;
|
|
1481
|
+
if (options.gradingPath) {
|
|
1482
|
+
const gradingText = await readFile(options.gradingPath, "utf8");
|
|
1483
|
+
const parsed = JSON.parse(gradingText);
|
|
1484
|
+
const judge = parsed.judge ?? await resolveEnvelopeJudge({ gradingConfigPath: parsed.gradingConfigPath });
|
|
1485
|
+
grading = {
|
|
1486
|
+
gradedAt: parsed.gradedAt,
|
|
1487
|
+
sourceReport: parsed.sourceReport,
|
|
1488
|
+
results: parsed.results,
|
|
1489
|
+
judge
|
|
1490
|
+
};
|
|
1491
|
+
}
|
|
1492
|
+
let suite = options.suite;
|
|
1493
|
+
if (options.suitePath) {
|
|
1494
|
+
const content = await readSuiteYamlContent(options.suitePath);
|
|
1495
|
+
suite = {
|
|
1496
|
+
...suite,
|
|
1497
|
+
uri: options.suitePath,
|
|
1498
|
+
contentHash: createHash("sha256").update(content).digest("hex")
|
|
1499
|
+
};
|
|
1500
|
+
}
|
|
1501
|
+
return buildEvalRunEnvelope(report, {
|
|
1502
|
+
...options,
|
|
1503
|
+
suite,
|
|
1504
|
+
grading,
|
|
1505
|
+
harness: {
|
|
1506
|
+
...options.harness,
|
|
1507
|
+
adapter: harnessAdapter
|
|
1508
|
+
}
|
|
1509
|
+
});
|
|
1510
|
+
}
|
|
1511
|
+
//#endregion
|
|
1512
|
+
//#region src/eval-interchange/projections.ts
|
|
1513
|
+
/** Trajectory instance keys emitted in stable order for JSONL export. */
|
|
1514
|
+
const TRAJECTORY_INSTANCE_KEYS = [
|
|
1515
|
+
"exactMatch",
|
|
1516
|
+
"inOrderMatch",
|
|
1517
|
+
"anyOrderMatch",
|
|
1518
|
+
"precision",
|
|
1519
|
+
"recall",
|
|
1520
|
+
"singleToolUse"
|
|
1521
|
+
];
|
|
1522
|
+
/**
|
|
1523
|
+
* Flatten one repetition into a trajectory dataset row.
|
|
1524
|
+
*
|
|
1525
|
+
* Pulls prompt from the cell, response from evaluationInstance, and falls
|
|
1526
|
+
* back to duration-based latency when enrich did not set latencySeconds.
|
|
1527
|
+
*/
|
|
1528
|
+
function repetitionToDatasetRow(cell, repetition) {
|
|
1529
|
+
return {
|
|
1530
|
+
caseId: cell.caseId,
|
|
1531
|
+
repetitionIndex: repetition.repetitionIndex,
|
|
1532
|
+
prompt: cell.prompt,
|
|
1533
|
+
response: repetition.evaluationInstance?.response?.text,
|
|
1534
|
+
evaluationInstance: repetition.evaluationInstance,
|
|
1535
|
+
latencySeconds: repetition.latencySeconds ?? repetition.durationMs / 1e3,
|
|
1536
|
+
failure: repetition.failure ?? (repetition.trajectory?.success ? 0 : 1),
|
|
1537
|
+
humanRatings: cell.humanRatings
|
|
1538
|
+
};
|
|
1539
|
+
}
|
|
1540
|
+
/**
|
|
1541
|
+
* Expand one repetition into type-tagged instance rows for EvaluateInstances.
|
|
1542
|
+
*
|
|
1543
|
+
* Returns an empty array when the repetition has no reference trajectory
|
|
1544
|
+
* (and therefore no trajectoryInstances block).
|
|
1545
|
+
*/
|
|
1546
|
+
function repetitionToInstanceRows(cell, repetition) {
|
|
1547
|
+
if (!repetition.trajectoryInstances) return [];
|
|
1548
|
+
const rows = [];
|
|
1549
|
+
for (const key of TRAJECTORY_INSTANCE_KEYS) {
|
|
1550
|
+
const instance = repetition.trajectoryInstances[key];
|
|
1551
|
+
if (!instance) continue;
|
|
1552
|
+
rows.push({
|
|
1553
|
+
messageType: trajectoryInstanceMessageType(key),
|
|
1554
|
+
caseId: cell.caseId,
|
|
1555
|
+
repetitionIndex: repetition.repetitionIndex,
|
|
1556
|
+
instance
|
|
1557
|
+
});
|
|
1558
|
+
}
|
|
1559
|
+
return rows;
|
|
1560
|
+
}
|
|
1561
|
+
/**
|
|
1562
|
+
* Trajectory projection — all repetitions in the envelope as dataset rows.
|
|
1563
|
+
*/
|
|
1564
|
+
function toTrajectory(envelope) {
|
|
1565
|
+
const rows = [];
|
|
1566
|
+
for (const cell of envelope.cells) for (const repetition of cell.repetitions) rows.push(repetitionToDatasetRow(cell, repetition));
|
|
1567
|
+
return rows;
|
|
1568
|
+
}
|
|
1569
|
+
/**
|
|
1570
|
+
* Instances projection — all trajectory metric instances as JSONL rows.
|
|
1571
|
+
*/
|
|
1572
|
+
function toInstancesJsonl(envelope) {
|
|
1573
|
+
const rows = [];
|
|
1574
|
+
for (const cell of envelope.cells) for (const repetition of cell.repetitions) rows.push(...repetitionToInstanceRows(cell, repetition));
|
|
1575
|
+
return rows;
|
|
1576
|
+
}
|
|
1577
|
+
//#endregion
|
|
1578
|
+
//#region src/pipeline/resolve-inputs.ts
|
|
1579
|
+
/**
|
|
1580
|
+
* Resolve pipeline step inputs and outputs with precedence rules.
|
|
1581
|
+
*
|
|
1582
|
+
* Precedence: CLI override > explicit YAML > prior step in this run > default path on disk > error.
|
|
1583
|
+
*/
|
|
1584
|
+
/** Resolve absolute paths for enabled pipeline steps. */
|
|
1585
|
+
async function resolvePipelineInputs(options) {
|
|
1586
|
+
const { suitePath, suiteDir, pipeline, steps, overrides } = options;
|
|
1587
|
+
const executed = options.executed ?? {};
|
|
1588
|
+
const stepSet = new Set(steps);
|
|
1589
|
+
const resolved = { suitePath: resolve(suitePath) };
|
|
1590
|
+
const defaultRunOutput = resolve(suiteDir, pipeline.run?.output ?? DEFAULT_PIPELINE_OUTPUTS.run);
|
|
1591
|
+
const defaultGradeOutput = resolve(suiteDir, pipeline.grade?.output ?? DEFAULT_PIPELINE_OUTPUTS.grade);
|
|
1592
|
+
if (stepSet.has("run") && pipeline.run) resolved.run = {
|
|
1593
|
+
output: resolve(suiteDir, overrides?.run?.output ?? pipeline.run.output),
|
|
1594
|
+
maxConcurrent: overrides?.run?.maxConcurrent ?? pipeline.run.maxConcurrent
|
|
1595
|
+
};
|
|
1596
|
+
if (stepSet.has("grade") && pipeline.grade) resolved.grade = {
|
|
1597
|
+
input: await resolveReportPath({
|
|
1598
|
+
explicit: overrides?.grade?.input ?? pipeline.grade.input,
|
|
1599
|
+
executedOutput: executed.run?.output,
|
|
1600
|
+
defaultPath: defaultRunOutput,
|
|
1601
|
+
label: "grade input (report)"
|
|
1602
|
+
}),
|
|
1603
|
+
output: resolve(suiteDir, overrides?.grade?.output ?? pipeline.grade.output),
|
|
1604
|
+
maxConcurrent: overrides?.grade?.maxConcurrent ?? pipeline.grade.maxConcurrent
|
|
1605
|
+
};
|
|
1606
|
+
if (stepSet.has("envelope") && pipeline.envelope) resolved.envelope = {
|
|
1607
|
+
report: await resolveReportPath({
|
|
1608
|
+
explicit: overrides?.envelope?.report ?? pipeline.envelope.report,
|
|
1609
|
+
executedOutput: executed.run?.output,
|
|
1610
|
+
defaultPath: defaultRunOutput,
|
|
1611
|
+
label: "envelope report"
|
|
1612
|
+
}),
|
|
1613
|
+
grading: await resolveOptionalGradingPath({
|
|
1614
|
+
explicit: overrides?.envelope?.grading ?? pipeline.envelope.grading,
|
|
1615
|
+
executedOutput: executed.grade?.output,
|
|
1616
|
+
defaultPath: defaultGradeOutput
|
|
1617
|
+
}),
|
|
1618
|
+
output: resolve(suiteDir, overrides?.envelope?.output ?? pipeline.envelope.output),
|
|
1619
|
+
projection: overrides?.envelope?.projection ?? pipeline.envelope.projection ?? "envelope",
|
|
1620
|
+
includeRawStreamEvents: pipeline.envelope.includeRawStreamEvents ?? false,
|
|
1621
|
+
noTranscript: pipeline.envelope.noTranscript ?? false
|
|
1622
|
+
};
|
|
1623
|
+
return resolved;
|
|
1624
|
+
}
|
|
1625
|
+
/**
 * Pick the report file that a downstream pipeline step should read.
 *
 * Candidates are tried in order: an explicit path, the output recorded for a
 * step executed earlier in this invocation, then the default path if it
 * exists on disk.
 *
 * @param options - `explicit`, `executedOutput`, `defaultPath`, and a `label` used in the error text.
 * @returns The selected path; explicit and executed paths go through `resolve()`, the default is returned as given.
 * @throws ConfigError when no candidate is available.
 */
async function resolveReportPath(options) {
  const { explicit, executedOutput, defaultPath, label } = options;
  const preferred = explicit || executedOutput;
  if (preferred) return resolve(preferred);
  const defaultOnDisk = await pathExists(defaultPath);
  if (!defaultOnDisk) {
    throw new ConfigError(`pipeline: could not resolve ${label}; specify an explicit path or run the run step first`, defaultPath);
  }
  return defaultPath;
}
|
|
1635
|
+
/**
 * Pick the grading file for the envelope step, if there is one.
 *
 * Same precedence as the required-report lookup (explicit path, then the
 * output of an executed step, then the default path when it exists), but a
 * miss yields `undefined` instead of an error.
 */
async function resolveOptionalGradingPath(options) {
  const { explicit, executedOutput, defaultPath } = options;
  const preferred = explicit || executedOutput;
  if (preferred) return resolve(preferred);
  const defaultOnDisk = await pathExists(defaultPath);
  return defaultOnDisk ? defaultPath : void 0;
}
|
|
1641
|
+
/** Report whether `stat` succeeds for `filePath`; any failure counts as "does not exist". */
async function pathExists(filePath) {
  let found = true;
  try {
    await stat(filePath);
  } catch {
    found = false;
  }
  return found;
}
|
|
1649
|
+
/**
 * Resolve a grading artifact path from a unified suite's `pipeline:` block.
 *
 * Used by `harness-eval envelope --suite` when `--grading` is omitted (spec C-7).
 * Checks `pipeline.envelope.grading`, then the default `pipeline.grade.output`.
 *
 * The pipeline writes `pipeline.grade.output` relative to the suite directory,
 * so that location is checked first. The path as written (relative to the
 * current working directory) is still tried afterwards, so lookups that
 * succeeded before keep succeeding.
 *
 * @param suitePath - Path to the suite file.
 * @returns A path to an existing grading file, or `undefined` when the suite
 *   cannot be loaded, has no pipeline block, or no candidate exists on disk.
 */
async function resolveGradingArtifactFromSuite(suitePath) {
  let doc;
  try {
    doc = await loadSuiteDocument(suitePath);
  } catch {
    // Best-effort lookup: a suite that cannot be loaded means "no grading artifact".
    return;
  }
  if (!doc.pipeline) return void 0;
  const explicit = doc.pipeline.envelope?.grading;
  if (explicit && await pathExists(explicit)) return explicit;
  const defaultGrade = doc.pipeline.grade?.output;
  if (!defaultGrade) return void 0;
  // resolve() leaves an absolute defaultGrade untouched, so this is safe for both forms.
  const suiteRelative = resolve(suiteDirectoryFromPath(suitePath), defaultGrade);
  if (await pathExists(suiteRelative)) return suiteRelative;
  if (await pathExists(defaultGrade)) return defaultGrade;
}
|
|
1668
|
+
/**
 * Work out which pipeline steps to execute.
 *
 * @param pipeline - The suite's pipeline block; a step is "configured" when its key is not `undefined`.
 * @param stepsArg - Optional comma-separated list (e.g. `run,grade,envelope`); blank entries are ignored.
 * @returns Configured steps in run → grade → envelope order, narrowed to the requested ones when `stepsArg` is given.
 * @throws ConfigError when nothing is configured, a name is unknown, or a requested step is not configured.
 */
function parsePipelineSteps(pipeline, stepsArg) {
  const knownSteps = ["run", "grade", "envelope"];
  const configured = knownSteps.filter((name) => pipeline[name] !== void 0);
  if (configured.length === 0) throw new ConfigError("pipeline block has no steps configured");
  if (!stepsArg) return configured;
  const requested = [];
  for (const piece of stepsArg.split(",")) {
    const name = piece.trim();
    if (name) requested.push(name);
  }
  for (const name of requested) {
    if (!knownSteps.includes(name)) {
      throw new ConfigError(`unknown pipeline step "${name}"; valid steps are: run, grade, envelope`);
    }
    if (!configured.includes(name)) {
      throw new ConfigError(`pipeline step "${name}" is not configured in suite.yaml`);
    }
  }
  // Filter the configured list (not the requested one) so the canonical order is kept.
  return configured.filter((name) => requested.includes(name));
}
|
|
1689
|
+
/** Absolute directory that contains the suite file at `suitePath`. */
function suiteDirectoryFromPath(suitePath) {
  const absoluteSuitePath = resolve(suitePath);
  return dirname(absoluteSuitePath);
}
|
|
1693
|
+
//#endregion
|
|
1694
|
+
//#region src/cli/args.ts
|
|
1695
|
+
/**
 * Parse process argv into command, positional args, and options.
 *
 * - The first argument is the command unless it starts with `-`.
 * - `--key value` and `-k value` store a string; the value is consumed only
 *   when it is non-empty and does not start with `-`. Otherwise the option is
 *   stored as the boolean `true`.
 * - `--key=value` stores everything after the first `=` as the value, which
 *   also allows values that start with `-` (e.g. `--offset=-1`).
 * - Everything after a bare `--` is positional.
 * - Anything else (including `-` and multi-letter `-abc`) is positional.
 *
 * @param argv - Arguments after the executable/script name.
 * @returns `{ command, positional, options }`.
 */
function parseArgs(argv) {
  const positional = [];
  const options = {};
  const args = [...argv];
  let command;
  if (args.length > 0 && !args[0].startsWith("-")) command = args.shift();
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--") {
      positional.push(...args.slice(i + 1));
      break;
    }
    const isLong = arg.startsWith("--");
    const isShort = !isLong && arg.startsWith("-") && arg.length === 2;
    if (!isLong && !isShort) {
      positional.push(arg);
      continue;
    }
    const body = arg.slice(isLong ? 2 : 1);
    // Inline form: only for long options, and only when a key precedes the "=".
    const eq = isLong ? body.indexOf("=") : -1;
    if (eq > 0) {
      options[body.slice(0, eq)] = body.slice(eq + 1);
      continue;
    }
    const next = args[i + 1];
    if (next && !next.startsWith("-")) {
      options[body] = next;
      i++;
    } else {
      options[body] = true;
    }
  }
  return {
    command,
    positional,
    options
  };
}
|
|
1730
|
+
/** Look up `name` in `options`; only string values are returned, anything else yields `undefined`. */
function getOption(options, name) {
  const raw = options[name];
  if (typeof raw !== "string") return void 0;
  return raw;
}
|
|
1735
|
+
/**
 * Read option `name` as a base-10 integer.
 *
 * @returns The parsed integer, or `defaultValue` when the option is not a
 *   string or `Number.parseInt` does not produce a finite number.
 */
function getOptionInt(options, name, defaultValue) {
  const raw = options[name];
  // Boolean flags and missing options carry no value to parse.
  if (typeof raw !== "string") return defaultValue;
  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) ? parsed : defaultValue;
}
|
|
1743
|
+
/** True when option `name` is the boolean `true` or the exact string `"true"`. */
function hasOption(options, name) {
  const raw = options[name];
  if (raw === true) return true;
  return raw === "true";
}
|
|
1748
|
+
//#endregion
|
|
1749
|
+
//#region src/cli/commands/envelope.ts
|
|
1750
|
+
/**
|
|
1751
|
+
* `harness-eval envelope` — build EvalRunEnvelope and interchange projections.
|
|
1752
|
+
*
|
|
1753
|
+
* Reads a suite run report (and optional grading JSON), builds a versioned
|
|
1754
|
+
* {@link EvalRunEnvelope}, and serializes one of three projections:
|
|
1755
|
+
*
|
|
1756
|
+
* - `envelope` — full nested JSON document (default)
|
|
1757
|
+
* - `trajectory` — JSONL of {@link EvalDatasetRow} per repetition
|
|
1758
|
+
* - `instances` — JSONL of {@link InstancesJsonlRow} for Vertex batch upload
|
|
1759
|
+
*
|
|
1760
|
+
* Exit code 0 when behavioral pass, 1 when any cell failed assertions.
|
|
1761
|
+
*/
|
|
1762
|
+
/** Projection names accepted by `--projection`. */
const PROJECTIONS = /* @__PURE__ */ new Set(["envelope", "trajectory", "instances"]);
/**
 * Validate a `--projection` value.
 *
 * @param value - Raw option value, or `undefined` when the flag was omitted.
 * @returns `"envelope"` when omitted, the value itself when it is a known
 *   projection, and `undefined` for anything else.
 */
function parseEnvelopeProjection(value) {
  if (value === void 0) return "envelope";
  return PROJECTIONS.has(value) ? value : void 0;
}
|
|
1776
|
+
/**
 * Turn an envelope into the text written to stdout or a file.
 *
 * `trajectory` and `instances` produce one JSON object per line (NDJSON);
 * every other projection value produces the envelope as 2-space-indented JSON.
 * The result always ends with a newline.
 */
function serializeEnvelopeProjection(envelope, projection) {
  const toNdjson = (rows) => `${rows.map((row) => JSON.stringify(row)).join("\n")}\n`;
  if (projection === "trajectory") return toNdjson(toTrajectory(envelope));
  if (projection === "instances") return toNdjson(toInstancesJsonl(envelope));
  return `${JSON.stringify(envelope, null, 2)}\n`;
}
|
|
1788
|
+
/**
 * Read the package version used for the envelope's `harness.frameworkVersion`.
 *
 * Walks upward from this module's directory and returns the `version` of the
 * nearest `package.json` that has a string version. A fixed relative path is
 * not used because the number of directories between this module and the
 * package root differs between the source tree (`src/cli/commands/`) and the
 * bundled output (`dist/`).
 *
 * @returns The version string, or `undefined` when no readable `package.json`
 *   with a string `version` is found up to the filesystem root.
 */
async function readFrameworkVersion() {
  let dir;
  try {
    dir = dirname(fileURLToPath(import.meta.url));
  } catch {
    // Module URL is not a file URL; the version is optional metadata.
    return;
  }
  for (;;) {
    try {
      const text = await readFile(join(dir, "package.json"), "utf8");
      const version = JSON.parse(text)?.version;
      if (typeof version === "string") return version;
    } catch {
      // Missing or unparsable package.json at this level: keep walking up.
    }
    const parent = dirname(dir);
    // dirname() of the filesystem root is the root itself.
    if (parent === dir) return;
    dir = parent;
  }
}
|
|
1797
|
+
/**
 * CLI entry point for the `envelope` subcommand.
 *
 * Builds an envelope from the report given as the first positional argument
 * and writes the selected projection to `--output`, or to stdout when no
 * output path is given. When `--grading` is omitted and `--suite` is given,
 * the grading file is looked up from the suite's pipeline block.
 *
 * @param args - Parsed CLI arguments (`positional` and `options`).
 * @returns Process exit code: 0 when `envelope.summary.behavioralPass` is
 *   truthy, 1 when it is not, 2 on usage errors or when building or writing
 *   the envelope fails.
 */
async function envelopeCommand(args) {
  const reportPath = args.positional[0];
  if (!reportPath) {
    console.error("usage: harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances] [--include-raw-stream-events] [--no-transcript]");
    return 2;
  }
  const outputPath = getOption(args.options, "output");
  const suitePath = getOption(args.options, "suite");
  let gradingPath = getOption(args.options, "grading");
  if (!gradingPath && suitePath) gradingPath = await resolveGradingArtifactFromSuite(suitePath);
  const projection = parseEnvelopeProjection(getOption(args.options, "projection"));
  if (!projection) {
    console.error("invalid --projection; expected envelope, trajectory, or instances");
    return 2;
  }
  let envelope;
  try {
    const frameworkVersion = await readFrameworkVersion();
    envelope = await buildEvalRunEnvelopeFromFiles(reportPath, {
      gradingPath,
      suitePath,
      includeTranscript: !hasOption(args.options, "no-transcript"),
      includeRawStreamEvents: hasOption(args.options, "include-raw-stream-events"),
      harness: { frameworkVersion }
    });
  } catch (err) {
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  }
  const serialized = serializeEnvelopeProjection(envelope, projection);
  try {
    if (outputPath) await writeFile(outputPath, serialized, "utf8");
    else process.stdout.write(serialized);
  } catch (err) {
    // A failed write (e.g. missing directory, permissions) is reported as exit code 2
    // like every other error, instead of escaping as a rejected promise.
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  }
  return envelope.summary.behavioralPass ? 0 : 1;
}
|
|
1836
|
+
//#endregion
|
|
1837
|
+
//#region src/pipeline/run-pipeline.ts
|
|
1838
|
+
/**
|
|
1839
|
+
* Orchestrate run → grade → envelope pipeline steps.
|
|
1840
|
+
*/
|
|
1841
|
+
/**
 * Execute the selected pipeline steps in order and stop at the first step
 * whose result does not pass.
 *
 * Inputs are resolved separately for each step, immediately before it runs,
 * so a later step can pick up files written by an earlier one through
 * `executed`.
 *
 * @param doc - Loaded suite document; must contain a `pipeline` block.
 * @param options - Optional `steps`, `overrides`, `maxConcurrent`,
 *   `frameworkVersion`, and the `onRunProgress` / `onGradeProgress` callbacks.
 * @returns `{ exitCode, stepsRun, runReport }`; `exitCode` is 1 when a step's
 *   result did not pass (with `stepsRun` ending at that step) and 0 otherwise.
 * @throws ConfigError when the suite has no pipeline block, or the grade step
 *   runs without a `judge` block.
 */
async function runPipeline(doc, options = {}) {
  if (!doc.pipeline) throw new ConfigError("suite document has no pipeline block", doc.suitePath);
  const steps = parsePipelineSteps(doc.pipeline, options.steps);
  const suiteDir = suiteDirectoryFromPath(doc.suitePath);
  const executed = {};
  let runReport;
  // Result for a step that did not pass: it and every step before it count as run.
  const stoppedAt = (index) => ({
    exitCode: 1,
    stepsRun: steps.slice(0, index + 1),
    runReport
  });
  for (const [index, step] of steps.entries()) {
    const resolved = await resolvePipelineInputs({
      suitePath: doc.suitePath,
      suiteDir,
      pipeline: doc.pipeline,
      steps: [step],
      executed,
      overrides: options.overrides
    });
    if (step === "run" && resolved.run) {
      const runInputs = resolved.run;
      const adapter = getAdapter(doc.suite.adapter ?? "claude-code");
      runReport = await runSuite(doc.suite, {
        adapter,
        maxConcurrent: runInputs.maxConcurrent ?? options.maxConcurrent ?? 4,
        onProgress: options.onRunProgress
      });
      // The report is written before the pass check so it exists even when a cell failed.
      await writeFile(runInputs.output, JSON.stringify(runReport, null, 2), "utf8");
      executed.run = { output: runInputs.output };
      const allCellsPassed = runReport.cells.every((cell) => cell.passed);
      if (!allCellsPassed) return stoppedAt(index);
    } else if (step === "grade" && resolved.grade) {
      const gradeInputs = resolved.grade;
      if (!doc.judge) throw new ConfigError("grade step requires inline judge: block in suite.yaml", doc.suitePath);
      const gradeOptions = resolveGradeOptions({ judge: doc.judge }, {
        sourceReport: gradeInputs.input,
        maxConcurrent: gradeInputs.maxConcurrent
      }, doc.suitePath);
      const sourceReport = await loadSuiteReport(gradeInputs.input);
      const grading = await gradeReport(sourceReport, {
        ...gradeOptions,
        onProgress: options.onGradeProgress
      });
      await writeFile(gradeInputs.output, JSON.stringify(grading, null, 2), "utf8");
      executed.grade = {
        input: gradeInputs.input,
        output: gradeInputs.output
      };
      if (!gradingReportPassed(grading)) return stoppedAt(index);
    } else if (step === "envelope" && resolved.envelope) {
      const envelopeInputs = resolved.envelope;
      const envelope = await buildEvalRunEnvelopeFromFiles(envelopeInputs.report, {
        gradingPath: envelopeInputs.grading,
        suitePath: doc.suitePath,
        includeTranscript: !envelopeInputs.noTranscript,
        includeRawStreamEvents: envelopeInputs.includeRawStreamEvents,
        harness: { frameworkVersion: options.frameworkVersion }
      });
      const serialized = serializeEnvelopeProjection(envelope, envelopeInputs.projection);
      await writeFile(envelopeInputs.output, serialized, "utf8");
      const { behavioralPass, outcomePass } = envelope.summary;
      // An undefined outcomePass is not treated as a failure.
      const outcomeFail = outcomePass !== void 0 && !outcomePass;
      if (!behavioralPass || outcomeFail) return stoppedAt(index);
    }
  }
  return {
    exitCode: 0,
    stepsRun: steps,
    runReport
  };
}
|
|
1922
|
+
//#endregion
|
|
1923
|
+
//#region src/otel/attributes.ts
|
|
1924
|
+
/** Wrap `value` as an OTLP key/value attribute carrying a `stringValue`. */
function strAttr(key, value) {
  const attribute = { key, value: { stringValue: value } };
  return attribute;
}
|
|
1931
|
+
/** Wrap `value` as an OTLP key/value attribute whose `intValue` is the value converted with `String()`. */
function intAttr(key, value) {
  const intValue = String(value);
  return { key, value: { intValue } };
}
|
|
1938
|
+
/** Wrap `value` as an OTLP key/value attribute carrying a `boolValue`. */
function boolAttr(key, value) {
  const attribute = { key, value: { boolValue: value } };
  return attribute;
}
|
|
1945
|
+
/** Wrap `value` as an OTLP string attribute holding its `JSON.stringify` output. */
function jsonAttr(key, value) {
  const stringValue = JSON.stringify(value);
  return { key, value: { stringValue } };
}
|
|
1952
|
+
//#endregion
|
|
1953
|
+
//#region src/otel/messages.ts
|
|
1954
|
+
/**
 * Translate a harness stop reason into a GenAI semconv `finish_reason`.
 *
 * @returns `undefined` for a falsy reason, the mapped value for a known
 *   reason, and the reason itself for anything not in the table.
 */
function mapStopReason(reason) {
  if (!reason) return void 0;
  const finishReasons = new Map([
    ["end_turn", "stop"],
    ["tool_use", "tool_calls"],
    ["max_tokens", "length"],
    ["stop_sequence", "stop"]
  ]);
  return finishReasons.has(reason) ? finishReasons.get(reason) : reason;
}
|
|
1969
|
+
/** Describe a tool call as a `tool_call` message part; nullish `args` become an empty object. */
function toolCallPart(call) {
  const { callId, name, args } = call;
  return {
    type: "tool_call",
    id: callId,
    name,
    arguments: args ?? {}
  };
}
|
|
1978
|
+
/** Describe a tool call's result as a `tool_call_response` message part. */
function toolResponsePart(call) {
  const { callId, result } = call;
  return {
    type: "tool_call_response",
    id: callId,
    result
  };
}
|
|
1986
|
+
/**
 * Build the assistant message for one turn.
 *
 * Parts are the turn's text (when non-empty) followed by one `tool_call`
 * part per tool call. `finish_reason` is present only when the mapped stop
 * reason is truthy.
 */
function assistantMessageFromTurn(turn) {
  const textParts = turn.text ? [{ type: "text", content: turn.text }] : [];
  const callParts = [];
  for (const call of turn.toolCalls) callParts.push(toolCallPart(call));
  const message = {
    role: "assistant",
    parts: [...textParts, ...callParts]
  };
  const finish = mapStopReason(turn.stopReason);
  if (finish) message.finish_reason = finish;
  return message;
}
|
|
2001
|
+
/**
 * Collect the results of `calls` into one tool-role message.
 *
 * Calls whose `result` is exactly `null` are left out.
 *
 * @returns `{ role: "tool", parts }`, or `null` when no call contributes a part.
 */
function toolResultsMessage(calls) {
  const parts = [];
  for (const call of calls) {
    if (call.result === null) continue;
    parts.push(toolResponsePart(call));
  }
  return parts.length === 0 ? null : { role: "tool", parts };
}
|
|
2010
|
+
/**
 * Reconstruct the message history that precedes the turn at `turnIndex`.
 *
 * The history starts with the user prompt (when truthy). For each earlier
 * turn it then adds that turn's assistant message and, when the turn has
 * tool results, a tool-role message.
 *
 * @param view - Trajectory view; only `view.turns` is read.
 * @param turnIndex - Position in `view.turns`; turns at lower indices are included.
 * @param prompt - Optional user prompt text.
 */
function inputMessagesBeforeTurn(view, turnIndex, prompt) {
  const history = [];
  if (prompt) {
    const promptPart = { type: "text", content: prompt };
    history.push({ role: "user", parts: [promptPart] });
  }
  for (const [position, turn] of view.turns.entries()) {
    if (!(position < turnIndex)) break;
    if (!turn) continue;
    history.push(assistantMessageFromTurn(turn));
    const toolMsg = toolResultsMessage(turn.toolCalls);
    if (toolMsg) history.push(toolMsg);
  }
  return history;
}
|
|
2031
|
+
//#endregion
|
|
2032
|
+
//#region src/otel/types.ts
|
|
2033
|
+
/** Integer span-kind values written to each span's `kind` field. */
const SpanKind = { INTERNAL: 1, CLIENT: 2 };
/** Integer status codes written to each span's `status.code` field. */
const StatusCode = { UNSET: 0, OK: 1, ERROR: 2 };
|
|
2044
|
+
//#endregion
|
|
2045
|
+
//#region src/otel/emitter.ts
|
|
2046
|
+
/**
|
|
2047
|
+
* TrajectoryView → OTLP JSON export using OpenTelemetry GenAI semantic conventions.
|
|
2048
|
+
*
|
|
2049
|
+
* Produces an `ExportTraceServiceRequest` suitable for OTLP/HTTP JSON ingestion.
|
|
2050
|
+
* Assertions continue to use {@link TrajectoryView} directly; this is export-only.
|
|
2051
|
+
*/
|
|
2052
|
+
const INSTRUMENTATION_VERSION = "0.1.0";
|
|
2053
|
+
/**
 * Convert a trajectory view into an OTLP trace export object.
 *
 * Every chat and tool span is a direct child of the root `invoke_agent` span:
 * ```
 * invoke_agent
 * ├── chat {model}
 * ├── execute_tool {name}
 * ├── chat {model}
 * └── ...
 * ```
 *
 * The root span ends at `options.endTimeMs` (default: now) and starts
 * `view.usage.durationMs` earlier, with a 1 ms minimum. Child spans use the
 * timings from `buildSpanTimings`; all tool spans of one turn share a slot.
 *
 * @param view - Trajectory to export; reads `meta`, `usage`, `success`, and `turns`.
 * @param options - Optional `agentName`, `providerName`, `serviceName`,
 *   `instrumentationScope`, `endTimeMs`, and `prompt`.
 * @returns An object with a single `resourceSpans` entry holding all spans.
 */
function trajectoryToOtlp(view, options = {}) {
  const agentName = options.agentName ?? "claude-code";
  const providerName = options.providerName ?? "anthropic";
  const serviceName = options.serviceName ?? "harness-eval";
  const scopeName = options.instrumentationScope ?? "@alis-build/harness-eval";
  const { model, sessionId } = view.meta;
  const traceId = traceIdFromSession(sessionId);
  const rootSpanId = spanIdFromKey(traceId, "invoke_agent");
  const endMs = options.endTimeMs ?? Date.now();
  const startMs = endMs - Math.max(view.usage.durationMs, 1);
  const timings = buildSpanTimings(view, startMs, endMs);
  const rootSpan = {
    traceId,
    spanId: rootSpanId,
    name: "invoke_agent",
    kind: SpanKind.INTERNAL,
    startTimeUnixNano: msToNs(startMs),
    endTimeUnixNano: msToNs(endMs),
    attributes: [
      strAttr("gen_ai.operation.name", "invoke_agent"),
      strAttr("gen_ai.agent.name", agentName),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.conversation.id", sessionId),
      strAttr("gen_ai.request.model", model),
      strAttr("gen_ai.response.model", model),
      intAttr("gen_ai.usage.input_tokens", view.usage.inputTokens),
      intAttr("gen_ai.usage.output_tokens", view.usage.outputTokens),
      boolAttr("harness_eval.success", view.success)
    ],
    status: viewStatus(view)
  };
  const spans = [rootSpan];
  // Index into `timings`: one slot per chat, plus one per turn that has tool calls.
  let slot = 0;
  for (const turn of view.turns) {
    const chatTiming = timings[slot];
    slot += 1;
    const history = inputMessagesBeforeTurn(view, turn.turnIndex, options.prompt);
    const chatAttributes = [
      strAttr("gen_ai.operation.name", "chat"),
      strAttr("gen_ai.provider.name", providerName),
      strAttr("gen_ai.request.model", model),
      strAttr("gen_ai.response.model", model)
    ];
    if (history.length > 0) chatAttributes.push(jsonAttr("gen_ai.input.messages", history));
    chatAttributes.push(jsonAttr("gen_ai.output.messages", [assistantMessageFromTurn(turn)]));
    if (turn.stopReason) {
      const finishReason = mapStopReason(turn.stopReason) ?? turn.stopReason;
      chatAttributes.push(jsonAttr("gen_ai.response.finish_reasons", [finishReason]));
    }
    spans.push({
      traceId,
      spanId: spanIdFromKey(traceId, `chat:${turn.turnIndex}`),
      parentSpanId: rootSpanId,
      name: `chat ${model}`,
      kind: SpanKind.CLIENT,
      startTimeUnixNano: chatTiming.startNs,
      endTimeUnixNano: chatTiming.endNs,
      attributes: chatAttributes,
      status: { code: StatusCode.OK }
    });
    if (turn.toolCalls.length === 0) continue;
    const toolTiming = timings[slot];
    slot += 1;
    for (const call of turn.toolCalls) {
      const toolAttributes = [
        strAttr("gen_ai.operation.name", "execute_tool"),
        strAttr("gen_ai.provider.name", providerName),
        strAttr("gen_ai.tool.name", call.name),
        strAttr("gen_ai.tool.call.id", call.callId),
        jsonAttr("gen_ai.tool.call.arguments", call.args ?? {})
      ];
      if (call.result !== null) toolAttributes.push(jsonAttr("gen_ai.tool.call.result", call.result));
      if (call.namespace) toolAttributes.push(strAttr("harness_eval.tool.namespace", call.namespace));
      toolAttributes.push(boolAttr("harness_eval.tool.is_error", call.isError));
      const toolStatus = call.isError
        ? { code: StatusCode.ERROR, message: "tool reported error" }
        : { code: StatusCode.OK };
      spans.push({
        traceId,
        spanId: spanIdFromKey(traceId, `tool:${call.callId}`),
        parentSpanId: rootSpanId,
        name: `execute_tool ${call.name}`,
        kind: SpanKind.INTERNAL,
        startTimeUnixNano: toolTiming.startNs,
        endTimeUnixNano: toolTiming.endNs,
        attributes: toolAttributes,
        status: toolStatus
      });
    }
  }
  const resource = {
    attributes: [strAttr("service.name", serviceName), strAttr("gen_ai.agent.name", agentName)]
  };
  const scope = {
    name: scopeName,
    version: INSTRUMENTATION_VERSION
  };
  return { resourceSpans: [{ resource, scopeSpans: [{ scope, spans }] }] };
}
/** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
const emitOtel = trajectoryToOtlp;
|
|
2166
|
+
/** Span status for the root span: OK when `view.success` is truthy, otherwise ERROR with a fixed message. */
function viewStatus(view) {
  return view.success
    ? { code: StatusCode.OK }
    : { code: StatusCode.ERROR, message: "harness run did not complete successfully" };
}
|
|
2174
|
+
/**
 * Produce synthetic start/end timestamps for the chat and tool spans.
 *
 * Each turn takes one slot for its chat span and, when it has tool calls, a
 * second slot shared by all of them. The interval from `startMs` to `endMs`
 * (at least 1 ms) is split evenly across the slots, laid out back to back.
 *
 * @returns One `{ startNs, endNs }` entry per slot, in turn order; an empty
 *   array when the view has no turns.
 */
function buildSpanTimings(view, startMs, endMs) {
  let slotCount = 0;
  for (const turn of view.turns) slotCount += turn.toolCalls.length > 0 ? 2 : 1;
  if (slotCount === 0) return [];
  const slotMs = Math.max(endMs - startMs, 1) / slotCount;
  const timings = [];
  let cursor = startMs;
  for (let i = 0; i < slotCount; i++) {
    const slotEnd = cursor + slotMs;
    timings.push({
      startNs: msToNs(cursor),
      endNs: msToNs(slotEnd)
    });
    cursor = slotEnd;
  }
  return timings;
}
|
|
2202
|
+
/**
|
|
2203
|
+
* Derive a deterministic 128-bit trace id from the harness session id.
|
|
1537
2204
|
*
|
|
1538
|
-
*
|
|
1539
|
-
* @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
|
|
2205
|
+
* Uses SHA-256 truncation so the same session always maps to the same trace.
|
|
1540
2206
|
*/
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
};
|
|
1554
|
-
}
|
|
1555
|
-
let suite = options.suite;
|
|
1556
|
-
if (options.suitePath) {
|
|
1557
|
-
const content = await readFile(options.suitePath, "utf8");
|
|
1558
|
-
suite = {
|
|
1559
|
-
...suite,
|
|
1560
|
-
uri: options.suitePath,
|
|
1561
|
-
contentHash: createHash("sha256").update(content).digest("hex")
|
|
1562
|
-
};
|
|
1563
|
-
}
|
|
1564
|
-
return buildEvalRunEnvelope(report, {
|
|
1565
|
-
...options,
|
|
1566
|
-
suite,
|
|
1567
|
-
grading
|
|
1568
|
-
});
|
|
2207
|
+
function traceIdFromSession(sessionId) {
  // First 32 hex characters (128 bits) of the SHA-256 of a prefixed session id, upper-cased.
  const digest = createHash("sha256").update(`harness-eval:trace:${sessionId}`).digest("hex");
  return digest.slice(0, 32).toUpperCase();
}
|
|
2210
|
+
/**
 * Compute a span id from the trace id and a logical span key.
 *
 * @returns The first 16 hex characters (64 bits) of the SHA-256 of
 *   `${traceId}:span:${key}`, upper-cased; equal inputs give equal ids.
 */
function spanIdFromKey(traceId, key) {
  const digest = createHash("sha256").update(`${traceId}:span:${key}`).digest("hex");
  return digest.slice(0, 16).toUpperCase();
}
|
|
2216
|
+
/** Scale a millisecond value to nanoseconds, round to an integer, and return it as a string. */
function msToNs(ms) {
  const nanos = Math.round(ms * 1e6);
  return String(nanos);
}
|
|
1570
2220
|
//#endregion
|
|
1571
|
-
//#region src/
|
|
1572
|
-
|
|
1573
|
-
const
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
"anyOrderMatch",
|
|
1577
|
-
"precision",
|
|
1578
|
-
"recall",
|
|
1579
|
-
"singleToolUse"
|
|
1580
|
-
];
|
|
2221
|
+
//#region src/reporter/format-console.ts
|
|
2222
|
+
const RESET = "\x1B[0m";
|
|
2223
|
+
const GREEN = "\x1B[32m";
|
|
2224
|
+
const RED = "\x1B[31m";
|
|
2225
|
+
const YELLOW = "\x1B[33m";
|
|
1581
2226
|
/**
|
|
1582
|
-
*
|
|
2227
|
+
* Render renderable rows as ANSI-colored console output.
|
|
1583
2228
|
*
|
|
1584
|
-
*
|
|
1585
|
-
* back to duration-based latency when enrich did not set latencySeconds.
|
|
2229
|
+
* @param color When false, emit plain text without escape codes.
|
|
1586
2230
|
*/
|
|
1587
|
-
function
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
2231
|
+
function formatConsole(rows, color = true) {
  const lines = [];
  for (const row of rows) {
    // Header line: "<case> @ <cell> PASS|FAIL", plus an adapter-error note when there are any.
    let status = row.passed ? "PASS" : "FAIL";
    if (color) status = `${row.passed ? GREEN : RED}${status}${RESET}`;
    let crashNote = "";
    if (row.adapterErrors > 0) {
      const noteText = `[${row.adapterErrors} adapter errors]`;
      crashNote = color ? ` ${YELLOW}${noteText}${RESET}` : ` ${noteText}`;
    }
    lines.push(`${row.caseId} @ ${row.cellLabel} ${status}${crashNote}`);
    if (row.category) lines.push(` category: ${row.category}`);
    // One line per assertion stat.
    for (const stat of row.stats) {
      let marker = stat.meetsThreshold ? "✓" : "✗";
      if (color) marker = `${stat.meetsThreshold ? GREEN : RED}${marker}${RESET}`;
      const thresholdPct = (stat.threshold * 100).toFixed(0);
      const parts = [` ├─ ${stat.description}: ${formatRate$1(stat)} [threshold ${thresholdPct}%] ${marker}`];
      // The baseline comparison is shown only when both delta and baseline rate are present.
      if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
        const arrow = stat.delta >= 0 ? "↑" : "↓";
        const basePct = (stat.baselinePassRate * 100).toFixed(0);
        const curPct = (stat.passRate * 100).toFixed(0);
        const deltaPct = (stat.delta * 100).toFixed(0);
        parts.push(` (${basePct}% → ${curPct}% (${arrow}${deltaPct}%))`);
      }
      lines.push(parts.join(""));
    }
    // Blank separator between rows; the trailing one is trimmed below.
    lines.push("");
  }
  return lines.join("\n").trimEnd();
}
/** Pass rate as "passed/evaluated (pct%)"; when nothing was evaluated, reports 0 of the total reps as crashed. */
function formatRate$1(stat) {
  const { evaluatedCount, passedCount, passRate, totalReps } = stat;
  if (evaluatedCount === 0) return `0/${totalReps} (all reps crashed)`;
  return `${passedCount}/${evaluatedCount} (${(passRate * 100).toFixed(0)}%)`;
}
|
|
2262
|
+
//#endregion
|
|
2263
|
+
//#region src/reporter/format-json.ts
|
|
1599
2264
|
/**
|
|
1600
|
-
*
|
|
2265
|
+
* Serialize a suite report as indented JSON (no transformation).
|
|
1601
2266
|
*
|
|
1602
|
-
*
|
|
1603
|
-
* (and therefore no trajectoryInstances block).
|
|
2267
|
+
* Used by `--format json` and `--output` persistence.
|
|
1604
2268
|
*/
|
|
1605
|
-
function
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
});
|
|
2269
|
+
function formatJson(report) {
  // 2-space indentation; the report is passed to JSON.stringify as-is.
  const indent = 2;
  return JSON.stringify(report, null, indent);
}
|
|
2272
|
+
//#endregion
|
|
2273
|
+
//#region src/reporter/format-markdown.ts
|
|
2274
|
+
/**
 * Make a value safe to place inside a single GitHub-flavored markdown table cell.
 *
 * An unescaped `|` would start a new column and a line break would end the
 * table row, so pipes are backslash-escaped and line breaks become spaces.
 *
 * @param {*} value - Cell content; coerced to a string.
 * @returns {string} Text that stays within one table cell.
 */
function escapeMarkdownTableCell(value) {
	return String(value).replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
}
/**
 * Render renderable rows as a GitHub-flavored markdown report.
 *
 * Each row becomes a `##` section (case id, cell label, PASS/FAIL and an
 * adapter-error note when `adapterErrors > 0`), optional category and notes,
 * and a table with one line per assertion stat. When a stat carries both
 * `delta` and `baselinePassRate`, the result cell also shows the
 * baseline → current change.
 *
 * @param {Array<object>} rows - Renderable rows (one per cell).
 * @returns {string} Markdown text with trailing whitespace trimmed.
 */
function formatMarkdown(rows) {
	const lines = ["# Harness Eval Report", ""];
	for (const row of rows) {
		const status = row.passed ? "PASS" : "FAIL";
		const crashNote = row.adapterErrors > 0 ? ` (${row.adapterErrors} adapter errors)` : "";
		lines.push(`## ${row.caseId} @ ${row.cellLabel} — ${status}${crashNote}`);
		if (row.category) lines.push(`**Category:** ${row.category}`);
		if (row.notes) lines.push("<details><summary>Notes</summary>", row.notes, "</details>");
		lines.push("");
		lines.push("| Assertion | Result | Threshold | Status |");
		lines.push("| --- | --- | --- | --- |");
		for (const stat of row.stats) {
			const rateStr = formatRate(stat);
			const threshold = `${(stat.threshold * 100).toFixed(0)}%`;
			const statusCell = stat.meetsThreshold ? "✓" : "✗";
			let result = rateStr;
			if (stat.delta !== void 0 && stat.baselinePassRate !== void 0) {
				const base = (stat.baselinePassRate * 100).toFixed(0);
				const cur = (stat.passRate * 100).toFixed(0);
				const d = (stat.delta * 100).toFixed(0);
				// Negative deltas already carry their own "-" from toFixed.
				const sign = stat.delta >= 0 ? "+" : "";
				result += ` (${base}% → ${cur}%, ${sign}${d}%)`;
			}
			// Descriptions are free text; escape them so they cannot break the table row.
			const description = escapeMarkdownTableCell(stat.description);
			lines.push(`| ${description} | ${result} | ${threshold} | ${statusCell} |`);
		}
		lines.push("");
	}
	return lines.join("\n").trimEnd();
}
|
|
2304
|
+
/**
 * Build the pass-rate text shown in a markdown table cell.
 *
 * When no repetition was evaluated the cell reads `0/<totalReps> (all reps
 * crashed)`; otherwise it reads `<passed>/<evaluated> (<pct>%)` with the
 * percentage rounded to a whole number.
 *
 * @param {object} stat - Stat with `evaluatedCount`, `totalReps`, `passedCount` and `passRate`.
 * @returns {string} Pass-rate text for the table cell.
 */
function formatRate(stat) {
	const { evaluatedCount, passedCount, passRate, totalReps } = stat;
	if (evaluatedCount !== 0) {
		const percent = (passRate * 100).toFixed(0);
		return `${passedCount}/${evaluatedCount} (${percent}%)`;
	}
	return `0/${totalReps} (all reps crashed)`;
}
|
|
2310
|
+
//#endregion
|
|
2311
|
+
//#region src/reporter/renderable.ts
|
|
2312
|
+
/**
 * Convert every cell of a suite report into a formatter-ready row.
 *
 * @param {object} report - Report whose `cells` array is mapped through `cellToRow`.
 * @returns {Array<object>} One row per cell, in the same order as `report.cells`.
 */
function toRenderableRows(report) {
	const { cells } = report;
	return cells.map((entry) => cellToRow(entry));
}
|
|
1620
2316
|
/**
|
|
1621
|
-
*
|
|
2317
|
+
* Attach baseline pass-rate deltas to matching rows.
|
|
2318
|
+
*
|
|
2319
|
+
* Rows without a matching baseline cell are returned unchanged.
|
|
1622
2320
|
*/
|
|
1623
|
-
function
|
|
1624
|
-
const
|
|
1625
|
-
|
|
1626
|
-
|
|
2321
|
+
/**
 * Attach baseline pass-rate deltas to matching rows.
 *
 * Rows are matched to baseline cells by `caseId` plus cell label. Within a
 * matched cell, each stat is paired with the baseline stat that has the same
 * description: the stat at the same index is preferred, otherwise the first
 * baseline stat with an equal description is used. Pairing by description
 * keeps deltas attached to the right assertion when the baseline lists its
 * assertions in a different order or has a different set of them.
 *
 * Rows without a matching baseline cell, and stats without a matching
 * baseline stat, are returned unchanged (same object reference).
 *
 * @param {Array<object>} rows - Renderable rows for the current run.
 * @param {object} baseline - Baseline report; `baseline.cells` is read.
 * @returns {Array<object>} New array of rows; matched stats gain
 *   `baselinePassRate` and `delta` (current pass rate minus baseline).
 */
function applyBaseline(rows, baseline) {
	const baselineMap = new Map(baseline.cells.map((c) => [`${c.caseId}::${c.cell.label}`, c]));
	return rows.map((row) => {
		const baseCell = baselineMap.get(`${row.caseId}::${row.cellLabel}`);
		if (!baseCell) return row;
		// Tolerate a baseline cell that carries no assertion stats at all.
		const baseStats = baseCell.assertionStats ?? [];
		const stats = row.stats.map((stat, i) => {
			// Fast path: same position and same description, the common case.
			const samePosition = baseStats[i];
			const baseStat = samePosition?.description === stat.description ? samePosition : baseStats.find((candidate) => candidate?.description === stat.description);
			if (!baseStat) return stat;
			const delta = stat.passRate - baseStat.passRate;
			return {
				...stat,
				baselinePassRate: baseStat.passRate,
				delta
			};
		});
		return {
			...row,
			stats
		};
	});
}
|
|
2342
|
+
/**
 * Build the renderable row for a single cell report.
 *
 * The row copies the cell's identifying fields and pass flag, records the
 * number of repetitions as `totalReps`, and carries one stat entry per
 * assertion stat. Each stat entry repeats `totalReps` and the cell's
 * `adapterErrors` alongside its own counts.
 *
 * @param {object} cell - Cell report; reads `repetitions`, `assertionStats`,
 *   `adapterErrors`, `caseId`, `category`, `notes`, `cell.label` and `passed`.
 * @returns {object} Renderable row for the formatters.
 */
function cellToRow(cell) {
	const { adapterErrors, assertionStats, repetitions } = cell;
	const totalReps = repetitions.length;
	const toRenderableStat = ({ description, threshold, passedCount, evaluatedCount, passRate, meetsThreshold }) => ({
		description,
		threshold,
		passedCount,
		evaluatedCount,
		totalReps,
		adapterErrors,
		passRate,
		meetsThreshold
	});
	const stats = assertionStats.map((entry) => toRenderableStat(entry));
	const { caseId, category, notes, passed } = cell;
	return {
		caseId,
		category,
		notes,
		cellLabel: cell.cell.label,
		passed,
		adapterErrors,
		totalReps,
		stats
	};
}
|
|
2366
|
+
//#endregion
|
|
2367
|
+
//#region src/reporter/index.ts
|
|
1628
2368
|
/**
|
|
1629
|
-
*
|
|
2369
|
+
* Format a {@link SuiteReport} for console, markdown, or JSON output.
|
|
2370
|
+
*
|
|
2371
|
+
* JSON format bypasses the renderable intermediate model and serializes the
|
|
2372
|
+
* report directly. Console and markdown apply optional baseline deltas.
|
|
1630
2373
|
*/
|
|
1631
|
-
function
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
2374
|
+
/**
 * Render a suite report in the format named by `options.format`.
 *
 * `"json"` serializes the report directly. Every other format first converts
 * the report to renderable rows and, when `options.baseline` is set, attaches
 * baseline deltas. `"markdown"` then renders markdown; any other format goes
 * to the console formatter, with colour taken from `options.color` or, when
 * that is null/undefined, enabled only if the format is exactly `"console"`.
 *
 * @param {object} report - Suite report to format.
 * @param {object} options - Reads `format`, `baseline` and `color`.
 * @returns {string} Formatted report text.
 */
function formatReport(report, options) {
	const { format } = options;
	if (format === "json") return formatJson(report);
	const plainRows = toRenderableRows(report);
	const rows = options.baseline ? applyBaseline(plainRows, options.baseline) : plainRows;
	if (format === "markdown") return formatMarkdown(rows);
	// Explicit colour preference wins; otherwise colour only for the console format.
	const useColor = options.color ?? (format === "console");
	return formatConsole(rows, useColor);
}
|
|
1636
2382
|
//#endregion
|
|
1637
|
-
export {
|
|
2383
|
+
export { serializeToolInput as A, TRAJECTORY_SCHEMA_VERSION as B, trajectoryExactMatch as C, trajectorySingleToolUse as D, trajectoryRecall as E, loadSuiteReport as F, trajectoryToTranscript as I, createCodexGrader as L, gradingReportPassed as M, resolveGradeOptions as N, toEvaluationInstance as O, gradeReport as P, createClaudeGrader as R, trajectoryAnyOrderMatch as S, trajectoryPrecision as T, buildEvalRunEnvelopeFromFiles as _, envelopeCommand as a, computeTrajectoryMetrics as b, getOptionInt as c, resolveGradingArtifactFromSuite as d, resolvePipelineInputs as f, buildEvalRunEnvelope as g, toTrajectory as h, runPipeline as i, formatGradingConsole as j, toTrajectoryInstances as k, hasOption as l, toInstancesJsonl as m, emitOtel as n, parseEnvelopeProjection as o, suiteDirectoryFromPath as p, trajectoryToOtlp as r, getOption as s, formatReport as t, parseArgs as u, enrichRepetitionWithProtojson as v, trajectoryInOrderMatch as w, parseToolInput as x, toHarnessMetrics as y, EVAL_RUN_SCHEMA_VERSION as z };
|
|
1638
2384
|
|
|
1639
|
-
//# sourceMappingURL=
|
|
2385
|
+
//# sourceMappingURL=reporter-BKCJZRYr.js.map
|