agentv 4.18.0-next.1 → 4.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-WH3OE42V.js → artifact-writer-YATMDPWI.js} +6 -5
- package/dist/{chunk-MCBERRMC.js → chunk-62M5MR5K.js} +22 -20
- package/dist/chunk-62M5MR5K.js.map +1 -0
- package/dist/{chunk-VRPCMCLQ.js → chunk-IWI4AJRS.js} +80 -42
- package/dist/chunk-IWI4AJRS.js.map +1 -0
- package/dist/{chunk-HBDOJJFY.js → chunk-NL6P5MUH.js} +5 -3
- package/dist/{chunk-HBDOJJFY.js.map → chunk-NL6P5MUH.js.map} +1 -1
- package/dist/{chunk-RCOAXXHP.js → chunk-PTYQS37Y.js} +28906 -30884
- package/dist/chunk-PTYQS37Y.js.map +1 -0
- package/dist/chunk-R2QDYORI.js +2178 -0
- package/dist/chunk-R2QDYORI.js.map +1 -0
- package/dist/cli.js +7 -6
- package/dist/cli.js.map +1 -1
- package/dist/{dist-7W4OI3X2.js → dist-RTIUSC6L.js} +63 -59
- package/dist/index.js +7 -6
- package/dist/{interactive-J4QEU5FG.js → interactive-7AZMOH2V.js} +8 -7
- package/dist/{interactive-J4QEU5FG.js.map → interactive-7AZMOH2V.js.map} +1 -1
- package/dist/ts-eval-loader-XFQ6S4DT-S7P2UUBX.js +15 -0
- package/dist/ts-eval-loader-XFQ6S4DT-S7P2UUBX.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-MCBERRMC.js.map +0 -1
- package/dist/chunk-RCOAXXHP.js.map +0 -1
- package/dist/chunk-VRPCMCLQ.js.map +0 -1
- /package/dist/{artifact-writer-WH3OE42V.js.map → artifact-writer-YATMDPWI.js.map} +0 -0
- /package/dist/{dist-7W4OI3X2.js.map → dist-RTIUSC6L.js.map} +0 -0
|
@@ -0,0 +1,2178 @@
|
|
|
1
|
+
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
2
|
+
import {
|
|
3
|
+
external_exports,
|
|
4
|
+
generateText,
|
|
5
|
+
getAgentvConfigDir,
|
|
6
|
+
getAgentvHome,
|
|
7
|
+
interpolateEnv,
|
|
8
|
+
normalizeToolCall,
|
|
9
|
+
parseRepoCheckout,
|
|
10
|
+
parseRepoClone,
|
|
11
|
+
parseRepoSource,
|
|
12
|
+
toCamelCaseDeep,
|
|
13
|
+
toSnakeCaseDeep
|
|
14
|
+
} from "./chunk-PTYQS37Y.js";
|
|
15
|
+
|
|
16
|
+
// ../../packages/core/dist/index.js
|
|
17
|
+
import { readFileSync } from "node:fs";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { parse } from "yaml";
|
|
20
|
+
import { readFile } from "node:fs/promises";
|
|
21
|
+
import path2 from "node:path";
|
|
22
|
+
import { parse as parse2 } from "yaml";
|
|
23
|
+
import { mkdir, readFile as readFile2, writeFile } from "node:fs/promises";
|
|
24
|
+
import path3 from "node:path";
|
|
25
|
+
import { execFile } from "node:child_process";
|
|
26
|
+
import { existsSync, mkdirSync, readFileSync as readFileSync2, rmSync, writeFileSync } from "node:fs";
|
|
27
|
+
import { cp, mkdtemp, readdir, rm, stat } from "node:fs/promises";
|
|
28
|
+
import os from "node:os";
|
|
29
|
+
import path4 from "node:path";
|
|
30
|
+
import { promisify } from "node:util";
|
|
31
|
+
import {
|
|
32
|
+
copyFileSync,
|
|
33
|
+
existsSync as existsSync2,
|
|
34
|
+
mkdirSync as mkdirSync2,
|
|
35
|
+
readFileSync as readFileSync3,
|
|
36
|
+
readdirSync,
|
|
37
|
+
statSync,
|
|
38
|
+
writeFileSync as writeFileSync2
|
|
39
|
+
} from "node:fs";
|
|
40
|
+
import path5 from "node:path";
|
|
41
|
+
import { parse as parseYaml, stringify as stringifyYaml } from "yaml";
|
|
42
|
+
import { readdir as readdir2, stat as stat2 } from "node:fs/promises";
|
|
43
|
+
import { homedir } from "node:os";
|
|
44
|
+
import path6 from "node:path";
|
|
45
|
+
import { readdir as readdir3, stat as stat3 } from "node:fs/promises";
|
|
46
|
+
import { homedir as homedir2 } from "node:os";
|
|
47
|
+
import path7 from "node:path";
|
|
48
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
49
|
+
/**
 * Builds the natural-language instruction telling an agent how to run a
 * code grader through the `agentv eval assert` CLI and interpret its result.
 *
 * @param {string} graderName - Name passed to `agentv eval assert`.
 * @param {string} [description] - Optional grader description; when truthy it
 *   is inserted as an extra sentence after the first one.
 * @returns {string} The full instruction text.
 */
function codeGraderInstruction(graderName, description) {
  const invocation = `agentv eval assert ${graderName} --agent-output <agent_output> --agent-input <original_prompt>`;
  let graderNote = "";
  if (description) {
    graderNote = ` This grader: ${description}.`;
  }
  const sentences = [
    `Run \`${invocation}\` and check the result.${graderNote}`,
    " The command accepts --agent-output (the agent's full response text) and --agent-input (the original user prompt).",
    ' It returns JSON on stdout: {"score": 0-1, "reasoning": "..."}.',
    " A score >= 0.5 means pass (exit 0); below 0.5 means fail (exit 1)."
  ];
  return sentences.join("");
}
|
|
53
|
+
/**
 * Derives a grader name from a command array by taking the base name (without
 * extension) of the first argument that ends in `.ts`, `.js`, `.mts` or `.mjs`.
 *
 * Both `/` and `\` are treated as path separators so that Windows-style paths
 * such as `C:\graders\check.ts` yield `check` rather than the whole path.
 *
 * @param {unknown} command - Expected to be an array of command arguments.
 * @returns {string | undefined} The derived name, or `undefined` when
 *   `command` is not an array or no argument looks like a script file.
 */
function deriveGraderNameFromCommand(command) {
  if (!Array.isArray(command)) return void 0;
  for (const arg of command) {
    // Non-string arguments cannot be script paths; skip them.
    if (typeof arg !== "string") continue;
    const match = /([^/\\]+)\.(?:ts|js|mts|mjs)$/.exec(arg);
    if (match) return match[1];
  }
  return void 0;
}
|
|
62
|
+
/**
 * Converts one assertion entry into a single natural-language statement.
 *
 * Most types are accepted in both kebab-case and snake_case spellings.
 *
 * @param {object} entry - Assertion entry; `entry.type` selects the branch.
 * @returns {string | null} The statement, or `null` when the entry has no
 *   single-string representation (skill-trigger, rubrics without a string
 *   `criteria`, llm-grader with rubrics or without a string `prompt`, or an
 *   unknown entry with no usable field).
 */
function assertionToNaturalLanguage(entry) {
  const type = entry.type;
  switch (type) {
    // Skill triggers are not expressed as natural-language assertions here.
    case "skill-trigger":
      return null;
    case "rubrics": {
      if (typeof entry.criteria === "string") {
        return entry.criteria;
      }
      return null;
    }
    case "contains":
      return `Output contains '${entry.value}'`;
    case "contains-any":
    case "contains_any": {
      // Joining with "', '" makes each value appear individually quoted
      // once wrapped in the surrounding quotes below.
      const values = Array.isArray(entry.value) ? entry.value.join("', '") : entry.value;
      return `Output contains any of: '${values}'`;
    }
    case "contains-all":
    case "contains_all": {
      const values = Array.isArray(entry.value) ? entry.value.join("', '") : entry.value;
      return `Output contains all of: '${values}'`;
    }
    case "icontains":
      return `Output contains (case-insensitive) '${entry.value}'`;
    case "regex":
      return `Output matches regex: ${entry.value}`;
    case "equals":
      return `Output exactly equals: ${entry.value}`;
    case "is-json":
    case "is_json":
      return "Output is valid JSON";
    case "starts-with":
    case "starts_with":
      return `Output starts with '${entry.value}'`;
    case "ends-with":
    case "ends_with":
      return `Output ends with '${entry.value}'`;
    case "llm-grader":
    case "llm_grader": {
      // A non-empty rubrics array has no single-string form, so return null.
      if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
        return null;
      }
      return typeof entry.prompt === "string" ? entry.prompt : null;
    }
    case "tool-trajectory":
    case "tool_trajectory": {
      const expectedArr = Array.isArray(entry.expected) ? entry.expected : [];
      const tools = expectedArr.map((e) => e.tool).filter(Boolean).join(", ");
      return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
    }
    case "code-grader":
    case "code_grader": {
      // Name precedence: explicit name, then script file name, then a fixed default.
      const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
      const desc = typeof entry.description === "string" ? entry.description : void 0;
      return codeGraderInstruction(graderName, desc);
    }
    case "field-accuracy":
    case "field_accuracy": {
      const fieldPaths = Array.isArray(entry.fields) ? entry.fields.map((f) => f.path).filter(Boolean).join(", ") : "";
      return fieldPaths ? `Fields ${fieldPaths} match expected values` : "Fields match expected values";
    }
    case "latency":
      return typeof entry.threshold === "number" ? `Response time under ${entry.threshold}ms` : "Response time within threshold";
    case "cost":
      return typeof entry.budget === "number" ? `Cost under $${entry.budget}` : "Cost within budget";
    case "token-usage":
    case "token_usage":
      return "Token usage within limits";
    case "execution-metrics":
    case "execution_metrics":
      return "Execution within metric bounds";
    default: {
      // Unknown type: an entry with a command is treated as a code grader;
      // otherwise fall back to criteria, then prompt, then a generic label.
      if (entry.command !== void 0 && type) {
        return codeGraderInstruction(deriveGraderNameFromCommand(entry.command) ?? type);
      }
      if (typeof entry.criteria === "string") return entry.criteria;
      if (typeof entry.prompt === "string") return entry.prompt;
      return type ? `${type} assertion` : null;
    }
  }
}
|
|
144
|
+
/**
 * Converts one assertion entry into zero or more natural-language statements.
 *
 * An llm-grader entry with a non-empty `rubrics` array expands to one
 * statement per rubric (`outcome`, else `criteria`, else `id`), keeping only
 * string values. Every other entry yields at most one statement via
 * `assertionToNaturalLanguage`.
 *
 * @param {object} entry - Assertion entry.
 * @returns {string[]} Statements for the entry; possibly empty.
 */
function assertionToNaturalLanguageList(entry) {
  const isLlmGrader = entry.type === "llm-grader" || entry.type === "llm_grader";
  if (isLlmGrader && Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
    // Optional chaining tolerates null/undefined rubric items (for example an
    // empty YAML list item) instead of throwing; they are filtered out below.
    return entry.rubrics
      .map((r) => r?.outcome ?? r?.criteria ?? r?.id)
      .filter((s) => typeof s === "string");
  }
  const nl = assertionToNaturalLanguage(entry);
  return nl !== null ? [nl] : [];
}
|
|
153
|
+
/**
 * Returns the assertions whose `type` is exactly "skill-trigger".
 *
 * @param {object[]} assertions - Assertion entries.
 * @returns {object[]} The skill-trigger entries, in their original order.
 */
function extractTriggerAssertions(assertions) {
  const isSkillTrigger = (assertion) => assertion.type === "skill-trigger";
  return assertions.filter(isSkillTrigger);
}
|
|
156
|
+
/**
 * Picks the assertion list of a test case: `assertions` when it is an array,
 * otherwise `assert` when it is an array, otherwise an empty list.
 *
 * @param {object} rawCase - Raw test case.
 * @returns {unknown[]} The resolved assertion array.
 */
function resolveAssertions(rawCase) {
  const { assertions, assert: legacyAssert } = rawCase;
  if (Array.isArray(assertions)) {
    return assertions;
  }
  return Array.isArray(legacyAssert) ? legacyAssert : [];
}
|
|
161
|
+
/**
 * Picks the suite-level assertion list: `assertions` when it is an array,
 * otherwise `assert` when it is an array, otherwise an empty list.
 *
 * @param {object} suite - Raw suite object.
 * @returns {unknown[]} The resolved assertion array.
 */
function resolveSuiteAssertions(suite) {
  for (const key of ["assertions", "assert"]) {
    const candidate = suite[key];
    if (Array.isArray(candidate)) {
      return candidate;
    }
  }
  return [];
}
|
|
166
|
+
/**
 * Extracts the prompt text and input file list from a raw test case.
 *
 * - `input_files` contributes its string entries to `files`.
 * - A string `input` is the prompt.
 * - An array `input` is scanned for messages with `role === "user"`; the last
 *   string content or text block seen becomes the prompt, and file blocks are
 *   appended to `files`.
 *
 * Null or non-object messages and content blocks are skipped rather than
 * causing a TypeError.
 *
 * @param {object} rawCase - Raw test case.
 * @returns {{ prompt: string, files: string[] }} Prompt ("" when none found)
 *   and the collected file paths.
 */
function extractInput(rawCase) {
  const files = Array.isArray(rawCase.input_files)
    ? rawCase.input_files.filter((f) => typeof f === "string")
    : [];
  const input = rawCase.input;
  if (typeof input === "string") {
    return { prompt: input, files };
  }
  if (!Array.isArray(input)) {
    return { prompt: "", files };
  }
  let prompt = "";
  for (const msg of input) {
    // `?.` guards against null/undefined entries in the message list.
    if (msg?.role !== "user") continue;
    if (typeof msg.content === "string") {
      prompt = msg.content;
    } else if (Array.isArray(msg.content)) {
      for (const block of msg.content) {
        // Only blocks carrying a string value are meaningful here.
        if (typeof block?.value !== "string") continue;
        if (block.type === "text") prompt = block.value;
        else if (block.type === "file") files.push(block.value);
      }
    }
  }
  return { prompt, files };
}
|
|
190
|
+
/**
 * Reduces a raw `expected_output` value to a string.
 *
 * - `undefined`/`null` yields `undefined`.
 * - A string is returned unchanged.
 * - An array is scanned from the end and the first string `content` found is
 *   returned; `undefined` when none exists. Null or undefined entries are
 *   skipped rather than causing a TypeError.
 * - Anything else is serialized with `JSON.stringify`.
 *
 * @param {unknown} raw - Raw expected output.
 * @returns {string | undefined} The extracted text.
 */
function extractExpectedOutput(raw) {
  if (raw === void 0 || raw === null) return void 0;
  if (typeof raw === "string") return raw;
  if (Array.isArray(raw)) {
    for (let i = raw.length - 1; i >= 0; i--) {
      const content = raw[i]?.content;
      if (typeof content === "string") return content;
    }
    return void 0;
  }
  return JSON.stringify(raw);
}
|
|
202
|
+
/**
 * Transpiles a parsed EVAL.yaml suite into per-skill eval files.
 *
 * Each test becomes an eval case with a 1-based numeric id, a prompt,
 * optional expected output and files, and natural-language assertions (the
 * case's own criteria and assertions followed by the suite-level ones).
 * Cases are grouped by the skill named in their skill-trigger assertions;
 * cases without one go to the "_no-skill" group.
 *
 * @param {unknown} suite - Parsed YAML document.
 * @param {string} [source="EVAL.yaml"] - Name used in error messages.
 * @returns {{ files: Map<string, { skill_name: string, evals: object[] }>, warnings: string[] }}
 * @throws {Error} When `suite` is not an object or has no `tests` array.
 */
function transpileEvalYaml(suite, source = "EVAL.yaml") {
  const warnings = [];
  const files = /* @__PURE__ */ new Map();
  if (typeof suite !== "object" || suite === null) {
    throw new Error(`Invalid EVAL.yaml: expected an object in '${source}'`);
  }
  const rawSuite = suite;
  if (!Array.isArray(rawSuite.tests)) {
    throw new Error(`Invalid EVAL.yaml: missing 'tests' array in '${source}'`);
  }
  // Warn only when the legacy key is used without the new one.
  if (rawSuite.assert !== void 0 && rawSuite.assertions === void 0) {
    warnings.push("'assert' is deprecated at the suite level. Use 'assertions' instead.");
  }
  const suiteAssertions = resolveSuiteAssertions(rawSuite);
  const suiteNlAssertions = suiteAssertions.filter((a) => a.type !== "skill-trigger").flatMap(assertionToNaturalLanguageList);
  // Returns the file record for a skill, creating it on first use.
  function getSkillFile(skillName) {
    const existing = files.get(skillName);
    if (existing) return existing;
    const created = { skill_name: skillName, evals: [] };
    files.set(skillName, created);
    return created;
  }
  const tests = rawSuite.tests;
  for (let idx = 0; idx < tests.length; idx++) {
    const rawCase = tests[idx];
    const caseAssertions = resolveAssertions(rawCase);
    if (rawCase.assert !== void 0 && rawCase.assertions === void 0) {
      const caseId = rawCase.id ?? idx + 1;
      warnings.push(`Test '${caseId}': 'assert' is deprecated. Use 'assertions' instead.`);
    }
    // Order: case criteria, then case assertions, then suite assertions.
    const nlAssertions = [];
    if (typeof rawCase.criteria === "string" && rawCase.criteria.trim()) {
      nlAssertions.push(rawCase.criteria.trim());
    }
    for (const entry of caseAssertions) {
      if (entry.type !== "skill-trigger") {
        nlAssertions.push(...assertionToNaturalLanguageList(entry));
      }
    }
    nlAssertions.push(...suiteNlAssertions);
    const triggerJudges = extractTriggerAssertions(caseAssertions);
    const { prompt, files: inputFiles } = extractInput(rawCase);
    const expectedOutput = extractExpectedOutput(rawCase.expected_output);
    // The emitted id is always the 1-based position, not `rawCase.id`.
    const numericId = idx + 1;
    const baseCase = {
      id: numericId,
      prompt,
      ...expectedOutput !== void 0 && { expected_output: expectedOutput },
      ...inputFiles.length > 0 && { files: inputFiles },
      assertions: nlAssertions
    };
    if (triggerJudges.length === 0) {
      const noSkillFile2 = getSkillFile("_no-skill");
      noSkillFile2.evals.push({ ...baseCase });
    } else {
      // A case with several skill-trigger assertions is emitted once per trigger.
      for (const tj of triggerJudges) {
        const skillName = typeof tj.skill === "string" ? tj.skill : "_no-skill";
        // Anything other than an explicit `false` counts as "should trigger".
        const shouldTrigger = tj.should_trigger !== false;
        const skillFile = getSkillFile(skillName);
        skillFile.evals.push({ ...baseCase, should_trigger: shouldTrigger });
      }
    }
  }
  // Fold "_no-skill" cases into the skill with the most evals, if any skill
  // exists; ties keep the first such skill in Map iteration order.
  const noSkillFile = files.get("_no-skill");
  if (noSkillFile && noSkillFile.evals.length > 0) {
    let dominantSkill = null;
    let maxCount = 0;
    for (const [name, f] of files) {
      if (name !== "_no-skill" && f.evals.length > maxCount) {
        maxCount = f.evals.length;
        dominantSkill = name;
      }
    }
    if (dominantSkill) {
      const targetFile = getSkillFile(dominantSkill);
      for (const evalCase of noSkillFile.evals) {
        targetFile.evals.push(evalCase);
      }
      files.delete("_no-skill");
    }
  }
  return { files, warnings };
}
|
|
285
|
+
/**
 * Reads an EVAL.yaml file synchronously, parses it as YAML and transpiles it.
 * The file's base name is passed on as the source label for error messages.
 *
 * @param {string} evalYamlPath - Path to the YAML file.
 * @returns {ReturnType<typeof transpileEvalYaml>} Transpile result.
 */
function transpileEvalYamlFile(evalYamlPath) {
  const yamlText = readFileSync(evalYamlPath, "utf8");
  const suite = parse(yamlText);
  const sourceLabel = path.basename(evalYamlPath);
  return transpileEvalYaml(suite, sourceLabel);
}
|
|
290
|
+
/**
 * Maps each skill in a transpile result to an output filename.
 *
 * A single skill is written to `evals.json`. With several skills, each gets
 * `<safeName>.evals.json`, where characters outside `[a-zA-Z0-9_-]` are
 * replaced by `_`. If two skills sanitize to the same name (for example
 * `a/b` and `a_b`), later ones receive a numeric suffix (`a_b-2.evals.json`)
 * so that no two skills share an output file.
 *
 * @param {{ files: Map<string, unknown> }} result - Transpile result.
 * @returns {Map<string, string>} Skill name to filename.
 */
function getOutputFilenames(result) {
  const names = /* @__PURE__ */ new Map();
  if (result.files.size === 1) {
    for (const [skill] of result.files) {
      names.set(skill, "evals.json");
    }
    return names;
  }
  const taken = /* @__PURE__ */ new Set();
  for (const [skill] of result.files) {
    const safeName = skill.replace(/[^a-zA-Z0-9_-]/g, "_");
    let filename = `${safeName}.evals.json`;
    // Disambiguate collisions introduced by sanitization.
    for (let suffix = 2; taken.has(filename); suffix++) {
      filename = `${safeName}-${suffix}.evals.json`;
    }
    taken.add(filename);
    names.set(skill, filename);
  }
  return names;
}
|
|
304
|
+
/**
 * Validation schema for an AgentV configuration object. Every section and
 * every field inside it is optional.
 */
var AgentVConfigSchema = external_exports.object({
  /** Default execution settings */
  execution: external_exports.object({
    /** Number of parallel workers (default: 3) */
    workers: external_exports.number().int().min(1).max(50).optional(),
    /** Maximum retries on failure (default: 2) */
    maxRetries: external_exports.number().int().min(0).optional(),
    /** Agent timeout in milliseconds. No timeout if not set. */
    agentTimeoutMs: external_exports.number().int().min(0).optional(),
    /** Enable verbose logging */
    verbose: external_exports.boolean().optional(),
    /** Always keep temp workspaces after eval */
    keepWorkspaces: external_exports.boolean().optional(),
    /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
    otelFile: external_exports.string().optional()
  }).optional(),
  /** Output settings */
  output: external_exports.object({
    /** Output format */
    format: external_exports.enum(["jsonl", "yaml", "json", "xml"]).optional(),
    /** Output directory */
    dir: external_exports.string().optional()
  }).optional(),
  /** Response caching */
  cache: external_exports.object({
    /** Enable response caching */
    enabled: external_exports.boolean().optional(),
    /** Cache file path */
    path: external_exports.string().optional()
  }).optional(),
  /** Cost and duration limits */
  limits: external_exports.object({
    /** Maximum cost per run in USD */
    maxCostUsd: external_exports.number().min(0).optional(),
    /** Maximum duration per run in milliseconds */
    maxDurationMs: external_exports.number().int().min(0).optional()
  }).optional()
});
|
|
342
|
+
/**
 * Validates a configuration object against `AgentVConfigSchema`.
 *
 * @param {unknown} config - Candidate configuration.
 * @returns {object} The result of `AgentVConfigSchema.parse(config)`.
 */
function defineConfig(config) {
  const validated = AgentVConfigSchema.parse(config);
  return validated;
}
|
|
345
|
+
// Config file locations probed by loadTsConfig, relative to the project root.
// They are checked in this order and the first existing file is used.
var CONFIG_FILE_NAMES = [
  "agentv.config.ts",
  "agentv.config.js",
  "agentv.config.mts",
  "agentv.config.mjs",
  ".agentv/config.ts",
  ".agentv/config.js"
];
|
|
353
|
+
/**
 * Loads the first config file found under `projectRoot` (see
 * `CONFIG_FILE_NAMES`), imports it dynamically and validates its default
 * export (or the module namespace when there is no default export).
 *
 * @param {string} projectRoot - Directory to search.
 * @returns {Promise<object | null>} The validated config, or `null` when no
 *   config file exists.
 * @throws {Error} When the file cannot be imported or fails validation. The
 *   original error is attached as `cause` so its stack and details survive.
 */
async function loadTsConfig(projectRoot) {
  const { existsSync: existsSync3 } = await import("node:fs");
  const { pathToFileURL } = await import("node:url");
  const { join } = await import("node:path");
  for (const fileName of CONFIG_FILE_NAMES) {
    const filePath = join(projectRoot, fileName);
    if (!existsSync3(filePath)) {
      continue;
    }
    try {
      // import() needs a file:// URL for absolute paths to work on all platforms.
      const fileUrl = pathToFileURL(filePath).href;
      const mod = await import(fileUrl);
      const config = mod.default ?? mod;
      return AgentVConfigSchema.parse(config);
    } catch (error) {
      const msg = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to load config from ${filePath}: ${msg}`, { cause: error });
    }
  }
  return null;
}
|
|
374
|
+
// Schema for one generated rubric item; `weight` defaults to 1 and
// `required` defaults to true when omitted.
var rubricItemSchema = external_exports.object({
  id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
  outcome: external_exports.string().describe("Concrete expected outcome for this rubric item"),
  weight: external_exports.number().default(1).describe("Relative importance (default 1.0)"),
  required: external_exports.boolean().default(true).describe("Whether this is a mandatory requirement")
});
// Schema for the JSON object the model returns in generateRubrics.
var rubricGenerationSchema = external_exports.object({
  rubrics: external_exports.array(rubricItemSchema).describe("List of evaluation rubrics")
});
|
|
383
|
+
/**
 * Asks a language model to generate evaluation rubrics for the given criteria.
 *
 * The model response is stripped of Markdown code fences, parsed as JSON and
 * validated against `rubricGenerationSchema`. Generation is attempted up to
 * three times; any failure in a given attempt (model call, JSON parsing or
 * validation) triggers the next attempt.
 *
 * @param {object} options
 * @param {string} options.criteria - Expected outcome to build rubrics for.
 * @param {string} [options.question] - Optional question text.
 * @param {string} [options.referenceAnswer] - Optional reference answer.
 * @param {object} options.provider - Provider; its optional
 *   `asLanguageModel()` method must return a model.
 * @returns {Promise<object[]>} The validated rubric items.
 * @throws {Error} When the provider exposes no language model, or when all
 *   three attempts fail.
 */
async function generateRubrics(options) {
  const { criteria, question, referenceAnswer, provider } = options;
  const prompt = buildPrompt(criteria, question, referenceAnswer);
  const model = provider.asLanguageModel?.();
  if (!model) {
    throw new Error("Provider does not support language model interface");
  }
  const system = `You are an expert at creating evaluation rubrics.
You must return a valid JSON object matching this schema:
{
"rubrics": [
{
"id": "string (short identifier)",
"outcome": "string (concrete expected outcome for this rubric item)",
"weight": number (default 1.0),
"required": boolean (default true)
}
]
}`;
  let result;
  let lastError;
  for (let attempt = 1; attempt <= 3; attempt++) {
    try {
      const { text } = await generateText({
        model,
        system,
        prompt
      });
      // Remove ```json / ``` fences the model may wrap around the JSON.
      const cleaned = text.replace(/```json\n?|```/g, "").trim();
      result = rubricGenerationSchema.parse(JSON.parse(cleaned));
      break;
    } catch (e) {
      // Remember the failure and retry; only the last error is reported.
      lastError = e instanceof Error ? e : new Error(String(e));
    }
  }
  if (!result) {
    throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
  }
  return result.rubrics;
}
|
|
423
|
+
/**
 * Assembles the user prompt for rubric generation: fixed instructions, the
 * criteria section, and optional question / reference-answer sections that
 * are included only when the value is truthy and non-blank after trimming.
 *
 * @param {string} criteria - Expected outcome text.
 * @param {string} [question] - Optional question text.
 * @param {string} [referenceAnswer] - Optional reference answer text.
 * @returns {string} The prompt, with lines joined by "\n".
 */
function buildPrompt(criteria, question, referenceAnswer) {
  const hasText = (value) => Boolean(value) && value.trim().length > 0;
  const instructions = [
    "You are an expert at creating evaluation rubrics.",
    "Given the expected outcome (and optionally the question and reference answer),",
    "generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
    "",
    "Each rubric should:",
    "- Be specific and testable",
    "- Have a short, descriptive ID",
    "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
    "- Indicate if it is required (mandatory) or optional",
    "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
    "",
    "Generate 3-7 rubric items that comprehensively cover the expected outcome.",
    ""
  ];
  const sections = [...instructions, "[[ ## criteria ## ]]", criteria, ""];
  if (hasText(question)) {
    sections.push("[[ ## question ## ]]", question, "");
  }
  if (hasText(referenceAnswer)) {
    sections.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
  }
  return sections.join("\n");
}
|
|
450
|
+
/**
 * Normalizes a git URL so equivalent spellings compare equal.
 *
 * Trailing slashes are removed first and then a trailing `.git`, so
 * `https://host/org/repo.git/` and `https://host/org/repo.git` both normalize
 * to `https://host/org/repo`. When the result parses as a URL, its hostname is
 * lower-cased and the serialized form (minus one trailing slash) is used.
 *
 * @param {string} url - Git remote URL.
 * @returns {string} The normalized URL.
 */
function normalizeGitUrl(url) {
  let normalized = url.replace(/\/+$/, "").replace(/\.git$/, "");
  try {
    const parsed = new URL(normalized);
    parsed.hostname = parsed.hostname.toLowerCase();
    normalized = parsed.toString().replace(/\/$/, "");
  } catch {
    // Not parseable as a URL (for example scp-style `git@host:org/repo`);
    // keep the suffix-stripped string as is.
  }
  return normalized;
}
|
|
460
|
+
/**
 * Scans eval files for git repository dependencies and de-duplicates them by
 * normalized URL plus checkout ref.
 *
 * Files that fail to load or parse do not abort the scan; they are reported
 * in `errors`. Each file appears at most once in a dependency's `usedBy`
 * list, even when it references the same repo several times (for example at
 * suite level and again in a test).
 *
 * @param {Iterable<string>} evalFilePaths - Eval file paths to scan.
 * @returns {Promise<{ repos: object[], errors: { file: string, message: string }[] }>}
 */
async function scanRepoDeps(evalFilePaths) {
  const seen = /* @__PURE__ */ new Map();
  const errors = [];
  for (const filePath of evalFilePaths) {
    try {
      const repos = await extractReposFromEvalFile(filePath);
      for (const repo of repos) {
        if (!repo.source || repo.source.type !== "git") continue;
        const ref = repo.checkout?.ref;
        // NUL separates URL and ref in the key; the scan assumes neither contains it.
        const key = `${normalizeGitUrl(repo.source.url)}\0${ref ?? ""}`;
        const existing = seen.get(key);
        if (existing) {
          if (!existing.usedBy.includes(filePath)) {
            existing.usedBy.push(filePath);
          }
          continue;
        }
        // `ref` is reported separately; keep only the remaining checkout options.
        const { ref: _ref, ...checkoutRest } = repo.checkout ?? {};
        const hasCheckout = Object.keys(checkoutRest).length > 0;
        seen.set(key, {
          url: repo.source.url,
          ref,
          clone: repo.clone,
          checkout: hasCheckout ? checkoutRest : void 0,
          usedBy: [filePath]
        });
      }
    } catch (err) {
      errors.push({
        file: filePath,
        message: err instanceof Error ? err.message : String(err)
      });
    }
  }
  return { repos: [...seen.values()], errors };
}
|
|
494
|
+
/**
 * Reads an eval YAML file, interpolates environment variables, and collects
 * the repos declared in the suite-level `workspace` followed by those in each
 * test's `workspace`.
 *
 * @param {string} filePath - Path to the eval file.
 * @returns {Promise<object[]>} Collected repo entries; empty when the parsed
 *   document is not a plain object.
 */
async function extractReposFromEvalFile(filePath) {
  const yamlText = await readFile(filePath, "utf8");
  const doc = interpolateEnv(parse2(yamlText), process.env);
  const isPlainObject = doc && typeof doc === "object" && !Array.isArray(doc);
  if (!isPlainObject) {
    return [];
  }
  // Workspace file references are resolved relative to the eval file's directory.
  const baseDir = path2.dirname(path2.resolve(filePath));
  const collected = [...await extractReposFromWorkspaceRaw(doc.workspace, baseDir)];
  const testEntries = Array.isArray(doc.tests) ? doc.tests : [];
  for (const entry of testEntries) {
    if (!entry || typeof entry !== "object" || Array.isArray(entry)) {
      continue;
    }
    collected.push(...await extractReposFromWorkspaceRaw(entry.workspace, baseDir));
  }
  return collected;
}
|
|
513
|
+
/**
 * Extracts repo entries from a raw `workspace` value.
 *
 * A string is treated as a path (relative to `evalFileDir`) to a YAML
 * workspace file, which is read, env-interpolated and inspected. A plain
 * object is inspected directly. Anything else yields no repos.
 *
 * @param {unknown} raw - Raw workspace value.
 * @param {string} evalFileDir - Directory used to resolve a workspace path.
 * @returns {Promise<object[]>} Extracted repo entries.
 */
async function extractReposFromWorkspaceRaw(raw, evalFileDir) {
  const isPlainObject = (value) => Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (typeof raw === "string") {
    const workspaceFile = path2.resolve(evalFileDir, raw);
    const yamlText = await readFile(workspaceFile, "utf8");
    const workspaceDoc = interpolateEnv(parse2(yamlText), process.env);
    return isPlainObject(workspaceDoc) ? extractReposFromObject(workspaceDoc) : [];
  }
  return isPlainObject(raw) ? extractReposFromObject(raw) : [];
}
|
|
526
|
+
/**
 * Builds repo entries from the `repos` array of a workspace object.
 * Items that are not plain objects, or whose `source` does not parse, are
 * skipped.
 *
 * @param {object} obj - Workspace object.
 * @returns {{ source: object, checkout: unknown, clone: unknown }[]} Parsed
 *   repo entries, in input order.
 */
function extractReposFromObject(obj) {
  if (!Array.isArray(obj.repos)) {
    return [];
  }
  return obj.repos.flatMap((candidate) => {
    if (!candidate || typeof candidate !== "object" || Array.isArray(candidate)) {
      return [];
    }
    const source = parseRepoSource(candidate.source);
    if (!source) {
      return [];
    }
    return [
      {
        source,
        checkout: parseRepoCheckout(candidate.checkout),
        clone: parseRepoClone(candidate.clone)
      }
    ];
  });
}
|
|
542
|
+
// Cache directory used when none is supplied to ResponseCache.
var DEFAULT_CACHE_PATH = ".agentv/cache";
/**
 * File-backed JSON cache. Each entry is stored at
 * `<cachePath>/<first two characters of key>/<key>.json`.
 */
var ResponseCache = class {
  cachePath;
  /**
   * @param {string} [cachePath] - Cache root; falls back to
   *   `DEFAULT_CACHE_PATH` when null or undefined.
   */
  constructor(cachePath) {
    this.cachePath = cachePath ?? DEFAULT_CACHE_PATH;
  }
  /**
   * Reads a cached value.
   * @param {string} key - Cache key.
   * @returns {Promise<unknown>} The parsed value, or `undefined` when the
   *   entry cannot be read or parsed.
   */
  async get(key) {
    try {
      const serialized = await readFile2(this.keyToPath(key), "utf8");
      return JSON.parse(serialized);
    } catch {
      // Any read or parse failure is treated as a cache miss.
      return void 0;
    }
  }
  /**
   * Writes a value as pretty-printed JSON, creating directories as needed.
   * @param {string} key - Cache key.
   * @param {unknown} value - JSON-serializable value.
   * @returns {Promise<void>}
   */
  async set(key, value) {
    const target = this.keyToPath(key);
    await mkdir(path3.dirname(target), { recursive: true });
    const serialized = JSON.stringify(value, null, 2);
    await writeFile(target, serialized, "utf8");
  }
  /**
   * Maps a key to its file path under the cache root.
   * @param {string} key - Cache key.
   * @returns {string} File path for the entry.
   */
  keyToPath(key) {
    const shard = key.slice(0, 2);
    const fileName = `${key}.json`;
    return path3.join(this.cachePath, shard, fileName);
  }
};
|
|
568
|
+
/**
 * Decides whether response caching is enabled.
 * `cliNoCache` wins over everything; otherwise a truthy `cliCache` or a
 * `yamlCache` that is exactly `true` enables caching.
 *
 * @param {{ cliNoCache?: boolean, cliCache?: boolean, yamlCache?: unknown }} params
 * @returns {boolean} Whether to enable the cache (a truthy `cliCache` value
 *   is returned as given).
 */
function shouldEnableCache(params) {
  if (params.cliNoCache) {
    return false;
  }
  const { cliCache, yamlCache } = params;
  if (cliCache) {
    return cliCache;
  }
  return yamlCache === true;
}
|
|
572
|
+
/**
 * Reports whether caching should be skipped for a target because its
 * `temperature` is a number greater than zero.
 *
 * @param {{ temperature?: unknown }} targetConfig - Target configuration.
 * @returns {boolean} `true` when `temperature` is a positive number.
 */
function shouldSkipCacheForTemperature(targetConfig) {
  const { temperature } = targetConfig;
  return typeof temperature === "number" && temperature > 0;
}
|
|
579
|
+
// Promise-returning version of child_process.execFile, used by runCommand.
var execFileAsync = promisify(execFile);
|
|
580
|
+
/**
 * Turns a repo identifier into a directory-name-friendly slug: the input is
 * trimmed and every run of characters outside `[A-Za-z0-9._-]` becomes a
 * single `-`.
 *
 * @param {string} repo - Repo identifier or URL.
 * @returns {string} The slug.
 */
function sanitizeRepoSlug(repo) {
  const trimmed = repo.trim();
  const disallowedRun = /[^A-Za-z0-9._-]+/g;
  return trimmed.replace(disallowedRun, "-");
}
|
|
583
|
+
/**
 * Wraps an error in a new `Error`, appending a "gh auth login" hint when the
 * message looks like a git/GitHub authentication failure.
 *
 * The function is idempotent: a message that already carries the hint is not
 * extended again, so wrapping an already-wrapped error does not duplicate it.
 * The original value is attached as `cause` to preserve its stack and any
 * extra properties.
 *
 * @param {unknown} error - The caught value.
 * @returns {Error} A new error carrying the (possibly extended) message.
 */
function withFriendlyGitHubAuthError(error) {
  const hint = "Run 'gh auth login' to authenticate.";
  const authMarkers = [
    "authentication failed",
    "could not read username",
    "permission denied",
    "not logged into any github hosts"
  ];
  const message = error instanceof Error ? error.message : String(error);
  const lower = message.toLowerCase();
  const looksLikeAuthFailure = authMarkers.some((marker) => lower.includes(marker));
  if (looksLikeAuthFailure && !message.includes(hint)) {
    return new Error(`${message}. ${hint}`, { cause: error });
  }
  return new Error(message, { cause: error });
}
|
|
591
|
+
/**
 * Normalizes a results-export configuration.
 *
 * @param {{ repo: string, path: string, auto_push?: unknown, branch_prefix?: string }} config
 * @returns {{ repo: string, path: string, auto_push: boolean, branch_prefix: string }}
 *   `repo` trimmed; `path` trimmed with leading and trailing slashes removed;
 *   `auto_push` true only when given exactly `true`; `branch_prefix` trimmed,
 *   defaulting to "eval-results" when missing or blank.
 */
function normalizeResultsExportConfig(config) {
  const repo = config.repo.trim();
  const exportPath = config.path.trim().replace(/^\/+|\/+$/g, "");
  const autoPush = config.auto_push === true;
  const branchPrefix = config.branch_prefix?.trim() || "eval-results";
  return {
    repo,
    path: exportPath,
    auto_push: autoPush,
    branch_prefix: branchPrefix
  };
}
|
|
599
|
+
/**
 * Resolves a results-repo setting to a clone URL. Values containing "://" or
 * starting with "git@" are returned unchanged; anything else is used as the
 * path of a github.com HTTPS URL.
 *
 * @param {string} repo - Repo URL or shorthand.
 * @returns {string} Clone URL.
 */
function resolveResultsRepoUrl(repo) {
  const isFullUrl = repo.includes("://");
  const isSshRemote = repo.startsWith("git@");
  return isFullUrl || isSshRemote ? repo : `https://github.com/${repo}.git`;
}
|
|
605
|
+
/**
 * Computes the cache locations for a results repo under
 * `<agentv home>/cache/results-repo/<slug>`.
 *
 * @param {string} repo - Repo identifier; sanitized into the directory slug.
 * @returns {{ rootDir: string, repoDir: string, statusFile: string }}
 */
function getResultsRepoCachePaths(repo) {
  const slug = sanitizeRepoSlug(repo);
  const rootDir = path4.join(getAgentvHome(), "cache", "results-repo", slug);
  const repoDir = path4.join(rootDir, "repo");
  const statusFile = path4.join(rootDir, "status.json");
  return { rootDir, repoDir, statusFile };
}
|
|
613
|
+
/**
 * Reads the persisted status JSON.
 *
 * @param {string} statusFile - Path to the status file.
 * @returns {unknown} The parsed JSON, or `{}` when the file does not exist
 *   or cannot be read or parsed.
 */
function readPersistedStatus(statusFile) {
  let status = {};
  if (existsSync(statusFile)) {
    try {
      status = JSON.parse(readFileSync2(statusFile, "utf8"));
    } catch {
      // An unreadable or corrupt status file is treated as empty.
      status = {};
    }
  }
  return status;
}
|
|
623
|
+
/**
 * Writes the status object as pretty-printed JSON followed by a newline,
 * creating the parent directory first if needed.
 *
 * @param {string} statusFile - Destination path.
 * @param {unknown} status - Value to serialize.
 * @returns {void}
 */
function writePersistedStatus(statusFile, status) {
  const parentDir = path4.dirname(statusFile);
  mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(status, null, 2);
  writeFileSync(statusFile, `${serialized}\n`, "utf8");
}
|
|
628
|
+
/**
 * Runs an executable with the current process environment and returns its
 * captured output.
 *
 * @param {string} executable - Program to run.
 * @param {Iterable<string>} args - Arguments (copied before use).
 * @param {{ cwd?: string, check?: boolean }} [options] - `cwd` sets the
 *   working directory; `check: false` makes a failed run resolve with
 *   whatever output the error object carries instead of throwing.
 * @returns {Promise<{ stdout: string, stderr: string }>}
 * @throws {Error} On failure, unless `check` is `false` and the error is an
 *   object; the error is passed through `withFriendlyGitHubAuthError`.
 */
async function runCommand(executable, args, options) {
  try {
    const output = await execFileAsync(executable, [...args], {
      cwd: options?.cwd,
      env: process.env
    });
    return { stdout: output.stdout, stderr: output.stderr };
  } catch (error) {
    const tolerateFailure = options?.check === false;
    if (!tolerateFailure || !error || typeof error !== "object") {
      throw withFriendlyGitHubAuthError(error);
    }
    return {
      stdout: error.stdout ?? "",
      stderr: error.stderr ?? ""
    };
  }
}
|
|
646
|
+
/**
 * Runs `git` with the given arguments via `runCommand`.
 *
 * @param {Iterable<string>} args - git arguments.
 * @param {{ cwd?: string, check?: boolean }} [options] - Passed to `runCommand`.
 * @returns {Promise<{ stdout: string, stderr: string }>}
 */
async function runGit(args, options) {
  const output = await runCommand("git", args, options);
  return output;
}
|
|
649
|
+
/**
 * Runs the `gh` CLI with the given arguments via `runCommand`.
 *
 * @param {Iterable<string>} args - gh arguments.
 * @param {{ cwd?: string, check?: boolean }} [options] - Passed to `runCommand`.
 * @returns {Promise<{ stdout: string, stderr: string }>}
 */
async function runGh(args, options) {
  const output = await runCommand("gh", args, options);
  return output;
}
|
|
652
|
+
/**
 * Determines the default branch of the `origin` remote in a local clone.
 *
 * It first asks git for the target of `refs/remotes/origin/HEAD`. If that
 * fails or has an unexpected form, it probes `origin/main` and then
 * `origin/master`. If nothing can be verified it returns "main".
 *
 * @param {string} repoDir - Path of the local clone.
 * @returns {Promise<string>} The branch name.
 */
async function resolveDefaultBranch(repoDir) {
  const remotePrefix = "refs/remotes/origin/";
  try {
    const symbolic = await runGit(["symbolic-ref", "refs/remotes/origin/HEAD"], { cwd: repoDir });
    const headRef = symbolic.stdout.trim();
    if (headRef.startsWith(remotePrefix)) {
      return headRef.slice(remotePrefix.length);
    }
  } catch {
    // origin/HEAD could not be read; fall through to probing known names.
  }
  const fallbacks = ["main", "master"];
  for (let i = 0; i < fallbacks.length; i++) {
    const branch = fallbacks[i];
    try {
      await runGit(["rev-parse", "--verify", `origin/${branch}`], { cwd: repoDir });
      return branch;
    } catch {
      // This candidate could not be verified; try the next one.
    }
  }
  return "main";
}
|
|
671
|
+
/**
 * Brings the clone at `repoDir` up to date with `origin`: fetch with prune,
 * check out `baseBranch`, then fast-forward-only pull of `baseBranch`.
 * The steps run strictly one after another; the first failure rejects.
 *
 * @param repoDir Working directory of the clone.
 * @param baseBranch Branch to check out and pull.
 */
async function updateCacheRepo(repoDir, baseBranch) {
  const steps = [
    ["fetch", "origin", "--prune"],
    ["checkout", baseBranch],
    ["pull", "--ff-only", "origin", baseBranch]
  ];
  for (const gitArgs of steps) {
    await runGit(gitArgs, { cwd: repoDir });
  }
}
|
|
676
|
+
/**
 * Merges `patch` over the status currently persisted for `config.repo` and
 * writes the result back to the same status file. Keys in `patch` win over
 * previously stored keys.
 *
 * @param config Object whose `repo` selects the cache paths.
 * @param patch Fields to overlay on the stored status.
 */
function updateStatusFile(config, patch) {
  const { statusFile } = getResultsRepoCachePaths(config.repo);
  const previous = readPersistedStatus(statusFile);
  const next = { ...previous, ...patch };
  writePersistedStatus(statusFile, next);
}
|
|
684
|
+
/**
 * Makes sure a local clone of the configured results repo exists and returns
 * its directory.
 *
 * - When the cache directory is missing, the repo is cloned into it with
 *   `--filter=blob:none`. A clone failure is recorded as `last_error` in the
 *   status file and rethrown via `withFriendlyGitHubAuthError`.
 * - When the directory exists but has no `.git` entry, an error is thrown.
 *
 * @param config Results export configuration (normalized internally).
 * @returns Path of the cached clone.
 */
async function ensureResultsRepoClone(config) {
  const normalized = normalizeResultsExportConfig(config);
  const { rootDir, repoDir } = getResultsRepoCachePaths(normalized.repo);
  mkdirSync(rootDir, { recursive: true });

  if (existsSync(repoDir)) {
    const hasGitEntry = existsSync(path4.join(repoDir, ".git"));
    if (hasGitEntry) {
      return repoDir;
    }
    throw new Error(`Results repo cache is not a git repository: ${repoDir}`);
  }

  try {
    const cloneArgs = [
      "clone",
      "--filter=blob:none",
      resolveResultsRepoUrl(normalized.repo),
      repoDir
    ];
    await runGit(cloneArgs);
  } catch (error) {
    updateStatusFile(normalized, { last_error: withFriendlyGitHubAuthError(error).message });
    throw withFriendlyGitHubAuthError(error);
  }
  return repoDir;
}
|
|
707
|
+
/**
 * Builds a status snapshot for the results repo.
 *
 * Without a config, a fixed "not configured" record is returned. Otherwise the
 * record combines the normalized config, the cache directory, whether that
 * directory currently exists, and the `last_synced_at` / `last_error` values
 * read from the persisted status file.
 *
 * @param config Results export configuration, or a falsy value.
 * @returns The status record.
 */
function getResultsRepoStatus(config) {
  if (!config) {
    return { configured: false, available: false, repo: "", cache_dir: "" };
  }
  const normalized = normalizeResultsExportConfig(config);
  const { repoDir, statusFile } = getResultsRepoCachePaths(normalized.repo);
  const persisted = readPersistedStatus(statusFile);
  const available = existsSync(repoDir);
  return {
    configured: true,
    available,
    repo: normalized.repo,
    path: normalized.path,
    auto_push: normalized.auto_push,
    branch_prefix: normalized.branch_prefix,
    cache_dir: repoDir,
    last_synced_at: persisted.last_synced_at,
    last_error: persisted.last_error
  };
}
|
|
731
|
+
/**
 * Ensures the results repo clone exists, updates it to the latest default
 * branch, and records the outcome in the status file.
 *
 * On success `last_synced_at` is set to the current time and `last_error` is
 * cleared. On failure the friendly error message is stored as `last_error`
 * and the friendly error is rethrown.
 *
 * @param config Results export configuration (normalized internally).
 * @returns The status record from `getResultsRepoStatus` after the sync.
 */
async function syncResultsRepo(config) {
  const normalized = normalizeResultsExportConfig(config);
  const refreshClone = async () => {
    const repoDir = await ensureResultsRepoClone(normalized);
    const baseBranch = await resolveDefaultBranch(repoDir);
    await updateCacheRepo(repoDir, baseBranch);
    const syncedAt = new Date().toISOString();
    updateStatusFile(normalized, { last_synced_at: syncedAt, last_error: undefined });
  };
  try {
    await refreshClone();
  } catch (error) {
    const message = withFriendlyGitHubAuthError(error).message;
    updateStatusFile(normalized, { last_error: message });
    throw withFriendlyGitHubAuthError(error);
  }
  return getResultsRepoStatus(normalized);
}
|
|
749
|
+
/**
 * Updates the cached clone and (re)creates `branchName` in it, pointing at the
 * remote default branch, via `git checkout -B`. Clears `last_error` in the
 * status file once the checkout succeeded.
 *
 * @param config Results export configuration (normalized internally).
 * @param branchName Branch to create or reset.
 * @returns `{ branchName, baseBranch, repoDir }`.
 */
async function checkoutResultsRepoBranch(config, branchName) {
  const normalized = normalizeResultsExportConfig(config);
  const repoDir = await ensureResultsRepoClone(normalized);
  const baseBranch = await resolveDefaultBranch(repoDir);
  await updateCacheRepo(repoDir, baseBranch);

  const startPoint = `origin/${baseBranch}`;
  await runGit(["checkout", "-B", branchName, startPoint], { cwd: repoDir });

  updateStatusFile(normalized, { last_error: undefined });
  return { branchName, baseBranch, repoDir };
}
|
|
762
|
+
/**
 * Updates the cached clone and creates a separate git worktree for
 * `branchName` (created or reset with `-B` from the remote default branch)
 * inside a fresh temporary directory.
 *
 * @param config Results export configuration (normalized internally).
 * @param branchName Branch to create or reset in the worktree.
 * @returns `{ branchName, baseBranch, repoDir, cleanup }` where `repoDir` is
 *   the worktree directory and `cleanup()` removes the worktree and then the
 *   temporary directory (the directory removal is best-effort and always runs).
 * @throws Whatever the underlying git steps reject with. If `git worktree add`
 *   fails, the temporary directory is removed before the error propagates.
 */
async function prepareResultsRepoBranch(config, branchName) {
  const normalized = normalizeResultsExportConfig(config);
  const cloneDir = await ensureResultsRepoClone(normalized);
  const baseBranch = await resolveDefaultBranch(cloneDir);
  await updateCacheRepo(cloneDir, baseBranch);

  const worktreeRoot = await mkdtemp(path4.join(os.tmpdir(), "agentv-results-repo-"));
  const worktreeDir = path4.join(worktreeRoot, "repo");
  try {
    await runGit(["worktree", "add", "-B", branchName, worktreeDir, `origin/${baseBranch}`], {
      cwd: cloneDir
    });
  } catch (error) {
    // The caller never receives a cleanup handle on this path, so the freshly
    // created temp directory would otherwise be leaked.
    await rm(worktreeRoot, { recursive: true, force: true }).catch(() => void 0);
    throw error;
  }

  return {
    branchName,
    baseBranch,
    repoDir: worktreeDir,
    cleanup: async () => {
      try {
        await runGit(["worktree", "remove", "--force", worktreeDir], { cwd: cloneDir });
      } finally {
        // Best-effort: never let temp-dir removal mask the worktree result.
        await rm(worktreeRoot, { recursive: true, force: true }).catch(() => void 0);
      }
    }
  };
}
|
|
785
|
+
/**
 * Replaces `params.destinationDir` with a recursive copy of
 * `params.sourceDir`: the destination is removed first (if present), its
 * parent directory is created, and then the source tree is copied over.
 *
 * @param params `{ sourceDir, destinationDir }`.
 */
async function stageResultsArtifacts(params) {
  const { destinationDir, sourceDir } = params;
  rmSync(destinationDir, { recursive: true, force: true });
  const destinationParent = path4.dirname(destinationDir);
  mkdirSync(destinationParent, { recursive: true });
  await cp(sourceDir, destinationDir, { recursive: true });
}
|
|
790
|
+
/**
 * Resolves the directory inside the cached clone that corresponds to the
 * configured `path`. The configured path is split on `/` and joined onto the
 * clone directory with the platform path separator.
 *
 * @param config Results export configuration (normalized internally).
 * @returns The joined directory path.
 */
function resolveResultsRepoRunsDir(config) {
  const normalized = normalizeResultsExportConfig(config);
  const cloneDir = getResultsRepoCachePaths(normalized.repo).repoDir;
  const segments = normalized.path.split("/");
  return path4.join(cloneDir, ...segments);
}
|
|
797
|
+
/**
 * Computes the total size in bytes of `targetPath`.
 *
 * A regular file contributes its own `size`. Anything else is listed with
 * `readdir` and the sizes of its children are summed recursively, one child
 * at a time.
 *
 * @param targetPath File or directory to measure.
 * @returns Total size in bytes.
 */
async function directorySizeBytes(targetPath) {
  const info = await stat(targetPath);
  if (info.isFile()) {
    return info.size;
  }
  const children = await readdir(targetPath, { withFileTypes: true });
  let bytes = 0;
  for (const { name } of children) {
    bytes += await directorySizeBytes(path4.join(targetPath, name));
  }
  return bytes;
}
|
|
808
|
+
/**
 * Stages everything in `params.repoDir`, and if `git status --porcelain`
 * reports any output, commits with `params.commitMessage` and pushes
 * `params.branchName` to `origin` with upstream tracking.
 *
 * The status probe runs with `check: false`, so a failing probe is treated
 * like its (possibly empty) captured output rather than as an error.
 *
 * @param params `{ repoDir, commitMessage, branchName }`.
 * @returns `true` when a commit was created and pushed, `false` when the
 *   status output was empty.
 */
async function commitAndPushResultsBranch(params) {
  await runGit(["add", "--all"], { cwd: params.repoDir });

  const statusProbe = await runGit(["status", "--porcelain"], {
    cwd: params.repoDir,
    check: false
  });
  const hasChanges = statusProbe.stdout.trim().length > 0;
  if (!hasChanges) {
    return false;
  }

  await runGit(["commit", "-m", params.commitMessage], { cwd: params.repoDir });
  await runGit(["push", "-u", "origin", params.branchName], { cwd: params.repoDir });
  return true;
}
|
|
821
|
+
/**
 * Pushes `branchName` to `origin` (setting upstream) and, on success, stamps
 * `last_synced_at` with the current time and clears `last_error` in the status
 * file.
 *
 * @param config Results export configuration (normalized internally).
 * @param branchName Branch to push.
 * @param cwd Optional directory to push from; defaults to the cached clone
 *   directory when `null`/`undefined`.
 */
async function pushResultsRepoBranch(config, branchName, cwd) {
  const normalized = normalizeResultsExportConfig(config);
  const pushDir = cwd ?? getResultsRepoCachePaths(normalized.repo).repoDir;
  await runGit(["push", "-u", "origin", branchName], { cwd: pushDir });

  const syncedAt = new Date().toISOString();
  updateStatusFile(normalized, { last_synced_at: syncedAt, last_error: undefined });
}
|
|
831
|
+
/**
 * Opens a draft pull request with `gh pr create --draft`, run from
 * `params.repoDir`.
 *
 * @param params `{ repo, baseBranch, branchName, title, body, repoDir }`.
 * @returns The trimmed stdout of the `gh` invocation.
 */
async function createDraftResultsPr(params) {
  const ghArgs = [
    "pr",
    "create",
    "--draft",
    ...["--repo", params.repo],
    ...["--base", params.baseBranch],
    ...["--head", params.branchName],
    ...["--title", params.title],
    ...["--body", params.body]
  ];
  const response = await runGh(ghArgs, { cwd: params.repoDir });
  return response.stdout.trim();
}
|
|
852
|
+
/**
 * Returns the path of the benchmark registry file, `projects.yaml`, inside the
 * directory reported by `getAgentvConfigDir()`.
 */
function getBenchmarksRegistryPath() {
  const configDir = getAgentvConfigDir();
  return path5.join(configDir, "projects.yaml");
}
|
|
855
|
+
/**
 * Copies a `projects.yaml` found in the directory from `getAgentvHome()` to
 * `targetPath`. Nothing happens when that directory equals the config
 * directory or when no such file exists there. The source file is left in
 * place.
 *
 * @param targetPath Destination path for the copied registry.
 */
function migrateProjectsYaml(targetPath) {
  const dataHome = getAgentvHome();
  const configDir = getAgentvConfigDir();
  if (dataHome === configDir) {
    return;
  }
  const legacyFile = path5.join(dataHome, "projects.yaml");
  if (existsSync2(legacyFile)) {
    mkdirSync2(path5.dirname(targetPath), { recursive: true });
    copyFileSync(legacyFile, targetPath);
  }
}
|
|
864
|
+
/**
 * Loads the benchmark registry from disk.
 *
 * When the registry file is missing, a migration from the other home
 * directory is attempted first. An empty registry is returned when the file
 * is still missing, cannot be read or parsed, or does not contain a
 * `benchmarks` array.
 *
 * @returns `{ benchmarks }` where `benchmarks` is always an array.
 */
function loadBenchmarkRegistry() {
  const registryPath = getBenchmarksRegistryPath();
  if (!existsSync2(registryPath)) {
    migrateProjectsYaml(registryPath);
  }
  if (!existsSync2(registryPath)) {
    return { benchmarks: [] };
  }
  let parsed;
  try {
    parsed = parseYaml(readFileSync3(registryPath, "utf-8"));
  } catch {
    // Unreadable or malformed registry: treat it as empty.
    return { benchmarks: [] };
  }
  const hasBenchmarkList = Boolean(parsed) && Array.isArray(parsed.benchmarks);
  return { benchmarks: hasBenchmarkList ? parsed.benchmarks : [] };
}
|
|
883
|
+
/**
 * Writes `registry.benchmarks` to the registry file as YAML, creating the
 * containing directory when it is missing. Only the `benchmarks` key is
 * serialized.
 *
 * @param registry Object holding the `benchmarks` list to persist.
 */
function saveBenchmarkRegistry(registry) {
  const registryPath = getBenchmarksRegistryPath();
  const registryDir = path5.dirname(registryPath);
  if (!existsSync2(registryDir)) {
    mkdirSync2(registryDir, { recursive: true });
  }
  const yamlText = stringifyYaml({ benchmarks: registry.benchmarks });
  writeFileSync2(registryPath, yamlText, "utf-8");
}
|
|
891
|
+
/**
 * Derives a unique, URL-friendly id from the last segment of `dirPath`.
 *
 * The basename is lower-cased, every character outside `[a-z0-9-]` becomes
 * `-`, runs of `-` are collapsed, and a leading/trailing `-` is dropped. When
 * nothing is left, `"benchmark"` is used as the stem. If the stem is already
 * taken, `-2`, `-3`, ... is appended to the stem until the id is free.
 *
 * @param dirPath Directory path whose basename seeds the id.
 * @param existingIds Ids that are already in use.
 * @returns An id not contained in `existingIds`.
 */
function deriveBenchmarkId(dirPath, existingIds) {
  const slug = path5
    .basename(dirPath)
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
  // Suffixes must be built from the fallback stem as well; building them from
  // an empty slug would produce ids such as "-2".
  const stem = slug || "benchmark";
  const taken = new Set(existingIds);
  let candidate = stem;
  for (let suffix = 2; taken.has(candidate); suffix++) {
    candidate = `${stem}-${suffix}`;
  }
  return candidate;
}
|
|
901
|
+
/**
 * Registers the directory at `benchmarkPath` as a benchmark.
 *
 * The path is resolved to an absolute path and must exist and contain a
 * `.agentv` entry. If an entry with the same path is already registered it is
 * returned unchanged; otherwise a new entry is appended to the registry and
 * the registry is saved.
 *
 * @param benchmarkPath Directory to register.
 * @returns The existing or newly created registry entry.
 * @throws Error when the directory or its `.agentv` entry is missing.
 */
function addBenchmark(benchmarkPath) {
  const absPath = path5.resolve(benchmarkPath);
  if (!existsSync2(absPath)) {
    throw new Error(`Directory not found: ${absPath}`);
  }
  const markerPath = path5.join(absPath, ".agentv");
  if (!existsSync2(markerPath)) {
    throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
  }

  const registry = loadBenchmarkRegistry();
  const alreadyRegistered = registry.benchmarks.find((known) => known.path === absPath);
  if (alreadyRegistered) {
    return alreadyRegistered;
  }

  const timestamp = new Date().toISOString();
  const knownIds = registry.benchmarks.map((known) => known.id);
  const entry = {
    id: deriveBenchmarkId(absPath, knownIds),
    name: path5.basename(absPath),
    path: absPath,
    addedAt: timestamp,
    lastOpenedAt: timestamp
  };
  registry.benchmarks.push(entry);
  saveBenchmarkRegistry(registry);
  return entry;
}
|
|
929
|
+
/**
 * Removes the registry entry whose `id` equals `benchmarkId` and saves the
 * registry.
 *
 * @param benchmarkId Id of the entry to remove.
 * @returns `true` when an entry was removed, `false` when no entry matched
 *   (in which case nothing is written).
 */
function removeBenchmark(benchmarkId) {
  const registry = loadBenchmarkRegistry();
  const position = registry.benchmarks.findIndex((known) => known.id === benchmarkId);
  if (position === -1) {
    return false;
  }
  registry.benchmarks.splice(position, 1);
  saveBenchmarkRegistry(registry);
  return true;
}
|
|
937
|
+
/**
 * Looks up a registry entry by id.
 *
 * @param benchmarkId Id to search for.
 * @returns The matching entry, or `undefined` when none matches.
 */
function getBenchmark(benchmarkId) {
  const { benchmarks } = loadBenchmarkRegistry();
  return benchmarks.find((known) => known.id === benchmarkId);
}
|
|
940
|
+
/**
 * Sets `lastOpenedAt` of the entry with id `benchmarkId` to the current time
 * and saves the registry. Does nothing when no entry matches.
 *
 * @param benchmarkId Id of the entry to update.
 */
function touchBenchmark(benchmarkId) {
  const registry = loadBenchmarkRegistry();
  const target = registry.benchmarks.find((known) => known.id === benchmarkId);
  if (!target) {
    return;
  }
  target.lastOpenedAt = new Date().toISOString();
  saveBenchmarkRegistry(registry);
}
|
|
948
|
+
/**
 * Searches `rootDir` for directories that contain a `.agentv` entry.
 *
 * The walk is depth-first and starts at depth 0 for the root. A directory
 * that contains `.agentv` is reported and not descended into. Directories at
 * `maxDepth` are checked but their children are not visited. Child
 * directories whose name starts with `.` or equals `node_modules` are
 * skipped, and errors while listing a directory end the scan of that
 * directory only.
 *
 * @param rootDir Directory to start from.
 * @param maxDepth Deepest level that is still checked (default 2).
 * @returns Absolute paths of the discovered directories, or `[]` when
 *   `rootDir` is missing or not a directory.
 */
function discoverBenchmarks(rootDir, maxDepth = 2) {
  const absRoot = path5.resolve(rootDir);
  const rootIsDirectory = existsSync2(absRoot) && statSync(absRoot).isDirectory();
  if (!rootIsDirectory) {
    return [];
  }

  const found = [];
  const isSkippedName = (name) => name.startsWith(".") || name === "node_modules";

  const visit = (dir, depth) => {
    if (depth > maxDepth) return;
    const hasMarker = existsSync2(path5.join(dir, ".agentv"));
    if (hasMarker) {
      found.push(dir);
      return;
    }
    if (depth === maxDepth) return;
    try {
      for (const child of readdirSync(dir, { withFileTypes: true })) {
        if (child.isDirectory() && !isSkippedName(child.name)) {
          visit(path5.join(dir, child.name), depth + 1);
        }
      }
    } catch {
      // Unreadable directory: skip it and keep whatever was found so far.
    }
  };

  visit(absRoot, 0);
  return found;
}
|
|
974
|
+
// Top-level keys of an evaluation result that `trimBaselineResult` omits from
// its output.
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
  "requests",
  "trace",
  "workspacePath",
  "output",
  "beforeAllOutput",
  "beforeEachOutput",
  "afterAllOutput",
  "afterEachOutput",
  "fileChanges",
  // Promoted execution metrics (debug, not needed for regression comparison)
  "tokenUsage",
  "costUsd",
  "durationMs",
  "startTime",
  "endTime"
]);
|
|
991
|
+
// Keys removed from every evaluator (grader) result, at any nesting level.
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "input"]);
/**
 * Returns a shallow copy of an evaluator result without the keys listed in
 * `STRIPPED_EVALUATOR_FIELDS`. When `scores` is an array, each element is
 * trimmed the same way, recursively; every other value is copied as-is.
 *
 * @param result Evaluator result object.
 * @returns A new object; `result` itself is not modified.
 */
function trimEvaluatorResult(result) {
  const trimmed = {};
  for (const [field, fieldValue] of Object.entries(result)) {
    if (STRIPPED_EVALUATOR_FIELDS.has(field)) {
      continue;
    }
    const isNestedScores = field === "scores" && Array.isArray(fieldValue);
    trimmed[field] = isNestedScores ? fieldValue.map(trimEvaluatorResult) : fieldValue;
  }
  return trimmed;
}
|
|
1004
|
+
/**
 * Returns a shallow copy of an evaluation result without the keys listed in
 * `STRIPPED_TOP_LEVEL_FIELDS`. When `scores` is an array, each element is
 * passed through `trimEvaluatorResult`; every other value is copied as-is.
 *
 * @param result Evaluation result object.
 * @returns A new object; `result` itself is not modified.
 */
function trimBaselineResult(result) {
  const trimmed = {};
  for (const [field, fieldValue] of Object.entries(result)) {
    if (STRIPPED_TOP_LEVEL_FIELDS.has(field)) {
      continue;
    }
    const isScoreList = field === "scores" && Array.isArray(fieldValue);
    trimmed[field] = isScoreList ? fieldValue.map(trimEvaluatorResult) : fieldValue;
  }
  return trimmed;
}
|
|
1016
|
+
// Category used when a path has no usable directory component.
var DEFAULT_CATEGORY = "Uncategorized";
/**
 * Derives a category from the directory part of `relativePath`.
 *
 * The path is split on `/` and `\`, the final segment (the file name) is
 * dropped, and every segment equal to `evals` is removed. The remaining
 * segments are joined with `/`. `DEFAULT_CATEGORY` is returned when the path
 * has no directory part or nothing remains after filtering.
 *
 * @param relativePath Path of an eval file.
 * @returns The derived category.
 */
function deriveCategory(relativePath) {
  const segments = relativePath.split(/[/\\]/);
  if (segments.length < 2) {
    return DEFAULT_CATEGORY;
  }
  const folders = segments.slice(0, -1).filter((segment) => segment !== "evals");
  if (folders.length === 0) {
    return DEFAULT_CATEGORY;
  }
  return folders.join("/");
}
|
|
1025
|
+
/**
 * Builds the Langfuse OTLP traces URL for `host`.
 *
 * Trailing slashes on `host` are removed so that a value such as
 * `https://example.com/` does not produce a `//api/...` path. When `host` is
 * unset or empty, the Langfuse Cloud URL is used.
 *
 * @param host Base URL of the Langfuse instance, if any.
 * @returns The full traces endpoint URL.
 */
function resolveLangfuseOtelEndpoint(host) {
  const base = host ? host.replace(/\/+$/, "") : "https://cloud.langfuse.com";
  return `${base}/api/public/otel/v1/traces`;
}
/**
 * Built-in OTLP backend presets, keyed by backend name. Each preset carries
 * the traces `endpoint` and a `headers(env)` function that derives the
 * request headers from an environment-like object.
 */
var OTEL_BACKEND_PRESETS = {
  langfuse: {
    name: "langfuse",
    // Evaluated once, when this module is loaded.
    endpoint: resolveLangfuseOtelEndpoint(process.env.LANGFUSE_HOST),
    // HTTP Basic auth built from "<public key>:<secret key>".
    headers: (env) => {
      const pub = env.LANGFUSE_PUBLIC_KEY ?? "";
      const secret = env.LANGFUSE_SECRET_KEY ?? "";
      return { Authorization: `Basic ${Buffer.from(`${pub}:${secret}`).toString("base64")}` };
    }
  },
  braintrust: {
    name: "braintrust",
    endpoint: "https://api.braintrust.dev/otel/v1/traces",
    headers: (env) => {
      const headers = {
        Authorization: `Bearer ${env.BRAINTRUST_API_KEY ?? ""}`
      };
      // Precedence: explicit parent, then project id, then project name.
      const parent = env.BRAINTRUST_PARENT ?? (env.BRAINTRUST_PROJECT_ID ? `project_id:${env.BRAINTRUST_PROJECT_ID}` : void 0) ?? (env.BRAINTRUST_PROJECT ? `project_name:${env.BRAINTRUST_PROJECT}` : void 0);
      if (parent) {
        headers["x-bt-parent"] = parent;
      }
      return headers;
    }
  },
  confident: {
    name: "confident",
    endpoint: "https://otel.confident-ai.com/v1/traces",
    headers: (env) => ({
      "x-confident-api-key": env.CONFIDENT_API_KEY ?? ""
    })
  }
};
|
|
1057
|
+
/**
 * Exports evaluation results as OpenTelemetry traces.
 *
 * The OTel modules are imported lazily in `init()`. Until `init()` has
 * returned `true`, `exportResult()` is a no-op and
 * `createStreamingObserver()` returns `null`.
 *
 * Options read by this class: `serviceName`, `endpoint`, `headers`,
 * `otlpFilePath`, `captureContent`, `groupTurns`.
 */
var OtelTraceExporter = class {
  constructor(options) {
    this.options = options;
  }
  provider = null;
  tracer = null;
  api = null;
  // biome-ignore lint/suspicious/noExplicitAny: OTel types loaded dynamically
  W3CPropagator = null;
  /**
   * Initialize the OTel SDK.
   *
   * Adds one span processor for `options.endpoint` (OTLP over HTTP) and one
   * for `options.otlpFilePath` (JSON file), registers the provider and obtains
   * the tracer.
   *
   * @returns `true` when a provider was registered; `false` when neither
   *   target is configured or when loading/setting up OTel throws.
   */
  async init() {
    try {
      // The last import is optional: without it, TRACEPARENT propagation is
      // simply unavailable.
      const [sdkTraceNode, resourcesMod, semconvMod, api, coreMod] = await Promise.all([
        import("./src-PXDA7QIS.js"),
        import("./esm-UYZ3HJBU.js"),
        import("./esm-RVQPUGWH.js"),
        import("./esm-R77SNOF5.js"),
        import("./esm-ZADQ4XQH-5LX2IKZV.js").catch(() => null)
      ]);
      const { NodeTracerProvider: Provider, SimpleSpanProcessor } = sdkTraceNode;
      const { resourceFromAttributes } = resourcesMod;
      const { ATTR_SERVICE_NAME } = semconvMod;
      const resource = resourceFromAttributes({
        [ATTR_SERVICE_NAME]: this.options.serviceName ?? "agentv"
      });
      const processors = [];
      if (this.options.endpoint) {
        const otlpHttp = await import("./esm-QNEMCJPL.js");
        const { OTLPTraceExporter } = otlpHttp;
        const exporter = new OTLPTraceExporter({
          url: this.options.endpoint,
          headers: this.options.headers
        });
        processors.push(new SimpleSpanProcessor(exporter));
      }
      if (this.options.otlpFilePath) {
        const { OtlpJsonFileExporter: OtlpJsonFileExporter2 } = await import("./otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js");
        processors.push(
          new SimpleSpanProcessor(new OtlpJsonFileExporter2(this.options.otlpFilePath))
        );
      }
      if (processors.length === 0) {
        return false;
      }
      this.provider = new Provider({
        resource,
        spanProcessors: processors
      });
      this.provider.register();
      this.api = api;
      this.tracer = api.trace.getTracer("agentv", "1.0.0");
      this.W3CPropagator = coreMod?.W3CTraceContextPropagator ?? null;
      return true;
    } catch {
      return false;
    }
  }
  /**
   * Export a single evaluation result as an OTel trace.
   *
   * Creates an `agentv.eval` root span carrying the result attributes, one
   * child span per output message (optionally grouped under `agentv.turn.N`
   * spans when `options.groupTurns` is set and more than one turn is found),
   * and one span event per grader score. Does nothing before a successful
   * `init()`.
   */
  async exportResult(result) {
    if (!this.tracer || !this.api) return;
    const api = this.api;
    const tracer = this.tracer;
    const captureContent = this.options.captureContent ?? false;
    const startHr = toHrTime(result.startTime ?? result.timestamp);
    const endHr = toHrTime(result.endTime ?? result.timestamp);
    const parentCtx = this.extractParentContext() ?? api.ROOT_CONTEXT;
    tracer.startActiveSpan(
      "agentv.eval",
      { startTime: startHr },
      parentCtx,
      (rootSpan) => {
        rootSpan.setAttribute("gen_ai.operation.name", "evaluate");
        rootSpan.setAttribute("gen_ai.system", "agentv");
        rootSpan.setAttribute("agentv.test_id", result.testId);
        rootSpan.setAttribute("agentv.target", result.target);
        if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
        rootSpan.setAttribute("agentv.score", result.score);
        // `output` is treated as optional further down, so it must be guarded
        // here as well; otherwise the callback throws before the span ends.
        if (captureContent && result.output && result.output.length > 0) {
          const lastMsg = result.output[result.output.length - 1];
          const text = typeof lastMsg.content === "string" ? lastMsg.content : JSON.stringify(lastMsg.content);
          rootSpan.setAttribute("agentv.output_text", text);
        }
        if (result.durationMs != null)
          rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
        if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
        if (result.tokenUsage) {
          if (result.tokenUsage.input != null) {
            rootSpan.setAttribute("agentv.trace.token_input", result.tokenUsage.input);
          }
          if (result.tokenUsage.output != null) {
            rootSpan.setAttribute("agentv.trace.token_output", result.tokenUsage.output);
          }
          if (result.tokenUsage.cached != null) {
            rootSpan.setAttribute("agentv.trace.token_cached", result.tokenUsage.cached);
          }
        }
        if (result.trace) {
          const t = result.trace;
          rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
          rootSpan.setAttribute(
            "agentv.trace.tool_names",
            Object.keys(t.toolCalls).sort().join(",")
          );
          if (t.llmCallCount != null)
            rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
        }
        if (result.output) {
          const rootCtx = api.trace.setSpan(api.context.active(), rootSpan);
          // Turn grouping only applies when it is enabled AND yields more than
          // one turn; in every other case messages hang directly off the root.
          const turns = this.options.groupTurns ? groupMessagesIntoTurns(result.output) : [];
          if (turns.length > 1) {
            for (const [i, turn] of turns.entries()) {
              api.context.with(rootCtx, () => {
                tracer.startActiveSpan(
                  `agentv.turn.${i + 1}`,
                  {},
                  (turnSpan) => {
                    const turnCtx = api.trace.setSpan(api.context.active(), turnSpan);
                    for (const msg of turn.messages) {
                      this.exportMessage(tracer, api, turnCtx, msg, captureContent);
                    }
                    turnSpan.end();
                  }
                );
              });
            }
          } else {
            for (const msg of result.output) {
              this.exportMessage(tracer, api, rootCtx, msg, captureContent);
            }
          }
        }
        if (result.scores) {
          for (const score of result.scores) {
            rootSpan.addEvent(`agentv.grader.${score.name}`, {
              "agentv.grader.score": score.score,
              "agentv.grader.type": score.type,
              ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
            });
          }
        }
        if (result.error) {
          rootSpan.setStatus({ code: api.SpanStatusCode.ERROR, message: result.error });
        } else {
          rootSpan.setStatus({ code: api.SpanStatusCode.OK });
        }
        rootSpan.end(endHr);
      }
    );
  }
  /** Flush pending spans and shut down. */
  async shutdown() {
    await this.provider?.shutdown();
  }
  /**
   * Create a streaming observer for real-time span export.
   *
   * @returns An `OtelStreamingObserver`, or `null` before a successful
   *   `init()`.
   */
  createStreamingObserver() {
    if (!this.tracer || !this.api) return null;
    const parentCtx = this.extractParentContext();
    return new OtelStreamingObserver(
      this.tracer,
      this.api,
      this.options.captureContent ?? false,
      parentCtx
    );
  }
  // -----------------------------------------------------------------------
  // Private helpers
  // -----------------------------------------------------------------------
  /**
   * Extracts a parent context from the `TRACEPARENT` / `TRACESTATE`
   * environment variables.
   *
   * @returns The extracted context, or `undefined` when `TRACEPARENT` is not
   *   set, the propagator is unavailable, or extraction throws.
   */
  extractParentContext() {
    const traceparent = process.env.TRACEPARENT;
    if (!traceparent || !this.W3CPropagator) return void 0;
    try {
      const propagator = new this.W3CPropagator();
      return propagator.extract(
        this.api.ROOT_CONTEXT,
        { traceparent, tracestate: process.env.TRACESTATE ?? "" },
        {
          get: (carrier, key) => carrier[key],
          keys: (carrier) => Object.keys(carrier)
        }
      );
    } catch {
      // A malformed TRACEPARENT must not prevent exporting; callers fall back
      // to their own default context.
      return void 0;
    }
  }
  /**
   * Emits one span for `msg` under `parentCtx`, plus one `execute_tool` child
   * span per entry in `msg.toolCalls`. Message content and tool
   * arguments/results are attached only when `captureContent` is true.
   */
  exportMessage(tracer, api, parentCtx, msg, captureContent) {
    const isAssistant = msg.role === "assistant";
    const model = msg.metadata?.model ? String(msg.metadata.model) : void 0;
    const spanName = isAssistant ? `chat ${model ?? "unknown"}` : `gen_ai.message.${msg.role}`;
    const startHr = toHrTime(msg.startTime);
    const endHr = toHrTime(msg.endTime);
    api.context.with(parentCtx, () => {
      tracer.startActiveSpan(
        spanName,
        { startTime: startHr },
        parentCtx,
        (span) => {
          if (isAssistant) {
            span.setAttribute("gen_ai.operation.name", "chat");
          }
          if (model) {
            span.setAttribute("gen_ai.request.model", model);
            span.setAttribute("gen_ai.response.model", model);
          }
          if (msg.tokenUsage) {
            if (msg.tokenUsage.input != null) {
              span.setAttribute("gen_ai.usage.input_tokens", msg.tokenUsage.input);
            }
            if (msg.tokenUsage.output != null) {
              span.setAttribute("gen_ai.usage.output_tokens", msg.tokenUsage.output);
            }
            if (msg.tokenUsage.cached != null) {
              span.setAttribute("gen_ai.usage.cache_read.input_tokens", msg.tokenUsage.cached);
            }
          }
          if (captureContent && msg.content != null) {
            span.setAttribute(
              "gen_ai.output.messages",
              typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content)
            );
          }
          if (msg.toolCalls) {
            const msgCtx = api.trace.setSpan(api.context.active(), span);
            for (const tc of msg.toolCalls) {
              api.context.with(msgCtx, () => {
                tracer.startActiveSpan(
                  `execute_tool ${tc.tool}`,
                  {},
                  msgCtx,
                  (toolSpan) => {
                    toolSpan.setAttribute("gen_ai.tool.name", tc.tool);
                    if (tc.id) toolSpan.setAttribute("gen_ai.tool.call.id", tc.id);
                    if (captureContent) {
                      if (tc.input != null) {
                        toolSpan.setAttribute(
                          "gen_ai.tool.call.arguments",
                          typeof tc.input === "string" ? tc.input : JSON.stringify(tc.input)
                        );
                      }
                      if (tc.output != null) {
                        toolSpan.setAttribute(
                          "gen_ai.tool.call.result",
                          typeof tc.output === "string" ? tc.output : JSON.stringify(tc.output)
                        );
                      }
                    }
                    toolSpan.end();
                  }
                );
              });
            }
          }
          span.end(endHr);
        }
      );
    });
  }
};
|
|
1333
|
+
var OtelStreamingObserver = class {
|
|
1334
|
+
constructor(tracer, api, captureContent, parentCtx) {
|
|
1335
|
+
this.tracer = tracer;
|
|
1336
|
+
this.api = api;
|
|
1337
|
+
this.captureContent = captureContent;
|
|
1338
|
+
this.parentCtx = parentCtx;
|
|
1339
|
+
}
|
|
1340
|
+
// biome-ignore lint/suspicious/noExplicitAny: OTel span type loaded dynamically
|
|
1341
|
+
rootSpan = null;
|
|
1342
|
+
// biome-ignore lint/suspicious/noExplicitAny: OTel context loaded dynamically
|
|
1343
|
+
rootCtx = null;
|
|
1344
|
+
observedChildSpans = false;
|
|
1345
|
+
pendingMetrics = null;
|
|
1346
|
+
/** Create root eval span immediately (visible in backend right away) */
|
|
1347
|
+
startEvalCase(testId, target, evalSet) {
|
|
1348
|
+
this.pendingMetrics = null;
|
|
1349
|
+
this.observedChildSpans = false;
|
|
1350
|
+
const ctx = this.parentCtx ?? this.api.context.active();
|
|
1351
|
+
this.rootSpan = this.tracer.startSpan("agentv.eval", void 0, ctx);
|
|
1352
|
+
this.rootSpan.setAttribute("gen_ai.operation.name", "evaluate");
|
|
1353
|
+
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
1354
|
+
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
1355
|
+
this.rootSpan.setAttribute("agentv.target", target);
|
|
1356
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
1357
|
+
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
1358
|
+
}
|
|
1359
|
+
/** Create and immediately export a tool span */
|
|
1360
|
+
onToolCall(name, input, output, _durationMs, toolCallId) {
|
|
1361
|
+
if (!this.rootCtx) return;
|
|
1362
|
+
this.observedChildSpans = true;
|
|
1363
|
+
this.api.context.with(this.rootCtx, () => {
|
|
1364
|
+
const span = this.tracer.startSpan(`execute_tool ${name}`, void 0, this.rootCtx);
|
|
1365
|
+
span.setAttribute("gen_ai.tool.name", name);
|
|
1366
|
+
if (toolCallId) span.setAttribute("gen_ai.tool.call.id", toolCallId);
|
|
1367
|
+
if (this.captureContent) {
|
|
1368
|
+
if (input != null)
|
|
1369
|
+
span.setAttribute(
|
|
1370
|
+
"gen_ai.tool.call.arguments",
|
|
1371
|
+
typeof input === "string" ? input : JSON.stringify(input)
|
|
1372
|
+
);
|
|
1373
|
+
if (output != null)
|
|
1374
|
+
span.setAttribute(
|
|
1375
|
+
"gen_ai.tool.call.result",
|
|
1376
|
+
typeof output === "string" ? output : JSON.stringify(output)
|
|
1377
|
+
);
|
|
1378
|
+
}
|
|
1379
|
+
span.end();
|
|
1380
|
+
});
|
|
1381
|
+
}
|
|
1382
|
+
/** Create and immediately export an LLM span */
|
|
1383
|
+
onLlmCall(model, tokenUsage) {
|
|
1384
|
+
if (!this.rootCtx) return;
|
|
1385
|
+
this.observedChildSpans = true;
|
|
1386
|
+
this.api.context.with(this.rootCtx, () => {
|
|
1387
|
+
const span = this.tracer.startSpan(`chat ${model}`, void 0, this.rootCtx);
|
|
1388
|
+
span.setAttribute("gen_ai.operation.name", "chat");
|
|
1389
|
+
span.setAttribute("gen_ai.request.model", model);
|
|
1390
|
+
span.setAttribute("gen_ai.response.model", model);
|
|
1391
|
+
if (tokenUsage) {
|
|
1392
|
+
if (tokenUsage.input != null)
|
|
1393
|
+
span.setAttribute("gen_ai.usage.input_tokens", tokenUsage.input);
|
|
1394
|
+
if (tokenUsage.output != null)
|
|
1395
|
+
span.setAttribute("gen_ai.usage.output_tokens", tokenUsage.output);
|
|
1396
|
+
if (tokenUsage.cached != null)
|
|
1397
|
+
span.setAttribute("gen_ai.usage.cache_read.input_tokens", tokenUsage.cached);
|
|
1398
|
+
}
|
|
1399
|
+
span.end();
|
|
1400
|
+
});
|
|
1401
|
+
}
|
|
1402
|
+
/** Record final execution metrics before the root span is finalized. */
|
|
1403
|
+
recordEvalMetrics(result) {
|
|
1404
|
+
this.pendingMetrics = result;
|
|
1405
|
+
}
|
|
1406
|
+
/** Finalize root span with score/verdict after evaluation completes */
|
|
1407
|
+
finalizeEvalCase(score, error) {
|
|
1408
|
+
if (!this.rootSpan) return;
|
|
1409
|
+
this.rootSpan.setAttribute("agentv.score", score);
|
|
1410
|
+
if (this.pendingMetrics?.durationMs != null) {
|
|
1411
|
+
this.rootSpan.setAttribute("agentv.trace.duration_ms", this.pendingMetrics.durationMs);
|
|
1412
|
+
}
|
|
1413
|
+
if (this.pendingMetrics?.costUsd != null) {
|
|
1414
|
+
this.rootSpan.setAttribute("agentv.trace.cost_usd", this.pendingMetrics.costUsd);
|
|
1415
|
+
}
|
|
1416
|
+
if (this.pendingMetrics?.tokenUsage) {
|
|
1417
|
+
if (this.pendingMetrics.tokenUsage.input != null) {
|
|
1418
|
+
this.rootSpan.setAttribute(
|
|
1419
|
+
"agentv.trace.token_input",
|
|
1420
|
+
this.pendingMetrics.tokenUsage.input
|
|
1421
|
+
);
|
|
1422
|
+
}
|
|
1423
|
+
if (this.pendingMetrics.tokenUsage.output != null) {
|
|
1424
|
+
this.rootSpan.setAttribute(
|
|
1425
|
+
"agentv.trace.token_output",
|
|
1426
|
+
this.pendingMetrics.tokenUsage.output
|
|
1427
|
+
);
|
|
1428
|
+
}
|
|
1429
|
+
if (this.pendingMetrics.tokenUsage.cached != null) {
|
|
1430
|
+
this.rootSpan.setAttribute(
|
|
1431
|
+
"agentv.trace.token_cached",
|
|
1432
|
+
this.pendingMetrics.tokenUsage.cached
|
|
1433
|
+
);
|
|
1434
|
+
}
|
|
1435
|
+
}
|
|
1436
|
+
if (this.pendingMetrics?.trace) {
|
|
1437
|
+
this.rootSpan.setAttribute("agentv.trace.event_count", this.pendingMetrics.trace.eventCount);
|
|
1438
|
+
this.rootSpan.setAttribute(
|
|
1439
|
+
"agentv.trace.tool_names",
|
|
1440
|
+
Object.keys(this.pendingMetrics.trace.toolCalls).sort().join(",")
|
|
1441
|
+
);
|
|
1442
|
+
if (this.pendingMetrics.trace.llmCallCount != null) {
|
|
1443
|
+
this.rootSpan.setAttribute(
|
|
1444
|
+
"agentv.trace.llm_call_count",
|
|
1445
|
+
this.pendingMetrics.trace.llmCallCount
|
|
1446
|
+
);
|
|
1447
|
+
}
|
|
1448
|
+
}
|
|
1449
|
+
if (error) {
|
|
1450
|
+
this.rootSpan.setStatus({ code: this.api.SpanStatusCode.ERROR, message: error });
|
|
1451
|
+
} else {
|
|
1452
|
+
this.rootSpan.setStatus({ code: this.api.SpanStatusCode.OK });
|
|
1453
|
+
}
|
|
1454
|
+
this.rootSpan.end();
|
|
1455
|
+
this.rootSpan = null;
|
|
1456
|
+
this.rootCtx = null;
|
|
1457
|
+
this.observedChildSpans = false;
|
|
1458
|
+
this.pendingMetrics = null;
|
|
1459
|
+
}
|
|
1460
|
+
/** Backfill child spans from the completed result when the provider emitted no live callbacks. */
|
|
1461
|
+
completeFromResult(result) {
|
|
1462
|
+
this.recordEvalMetrics({
|
|
1463
|
+
durationMs: result.durationMs,
|
|
1464
|
+
costUsd: result.costUsd,
|
|
1465
|
+
tokenUsage: result.tokenUsage,
|
|
1466
|
+
trace: result.trace
|
|
1467
|
+
});
|
|
1468
|
+
if (this.observedChildSpans || !this.rootCtx) {
|
|
1469
|
+
return;
|
|
1470
|
+
}
|
|
1471
|
+
const model = result.output.find((msg) => msg.role === "assistant")?.metadata?.model ?? result.target ?? "unknown";
|
|
1472
|
+
this.onLlmCall(String(model), result.tokenUsage);
|
|
1473
|
+
for (const message of result.output) {
|
|
1474
|
+
for (const toolCall of message.toolCalls ?? []) {
|
|
1475
|
+
this.onToolCall(
|
|
1476
|
+
toolCall.tool,
|
|
1477
|
+
toolCall.input,
|
|
1478
|
+
toolCall.output,
|
|
1479
|
+
toolCall.durationMs ?? 0,
|
|
1480
|
+
toolCall.id
|
|
1481
|
+
);
|
|
1482
|
+
}
|
|
1483
|
+
}
|
|
1484
|
+
}
|
|
1485
|
+
/** Return the active eval span's trace ID and span ID for Braintrust trace bridging */
|
|
1486
|
+
getActiveSpanIds() {
|
|
1487
|
+
if (!this.rootSpan) return null;
|
|
1488
|
+
try {
|
|
1489
|
+
const spanCtx = this.rootSpan.spanContext?.() ?? this.rootSpan._spanContext;
|
|
1490
|
+
if (!spanCtx?.traceId || !spanCtx?.spanId) return null;
|
|
1491
|
+
return { parentSpanId: spanCtx.spanId, rootSpanId: spanCtx.traceId };
|
|
1492
|
+
} catch {
|
|
1493
|
+
return null;
|
|
1494
|
+
}
|
|
1495
|
+
}
|
|
1496
|
+
/** Get ProviderStreamCallbacks for passing to providers */
|
|
1497
|
+
getStreamCallbacks() {
|
|
1498
|
+
return {
|
|
1499
|
+
onToolCallEnd: (name, input, output, durationMs, toolCallId) => this.onToolCall(name, input, output, durationMs, toolCallId),
|
|
1500
|
+
onLlmCallEnd: (model, tokenUsage) => this.onLlmCall(model, tokenUsage),
|
|
1501
|
+
getActiveSpanIds: () => this.getActiveSpanIds()
|
|
1502
|
+
};
|
|
1503
|
+
}
|
|
1504
|
+
};
|
|
1505
|
+
/**
 * Split a flat message list into turns. A new turn starts at every "user" message
 * (except when nothing has been collected yet); empty turns are never emitted.
 * @param messages iterable of objects with a `role` field
 * @returns array of `{ messages }` groups in original order
 */
function groupMessagesIntoTurns(messages) {
  const turns = [];
  let bucket = [];
  // Close the turn being collected, if it has any messages.
  const flush = () => {
    if (bucket.length === 0) {
      return;
    }
    turns.push({ messages: bucket });
    bucket = [];
  };
  for (const message of messages) {
    if (message.role === "user") {
      flush();
    }
    bucket.push(message);
  }
  flush();
  return turns;
}
|
|
1518
|
+
/**
 * Convert a timestamp string to epoch milliseconds.
 * @param iso timestamp string accepted by `Date`; may be empty or undefined
 * @returns epoch milliseconds, or undefined when `iso` is falsy or cannot be parsed
 */
function toHrTime(iso) {
  if (!iso) return void 0;
  const epochMs = new Date(iso).getTime();
  // An unparseable timestamp yields NaN; report it as "no time" rather than leaking NaN to callers.
  return Number.isNaN(epochMs) ? void 0 : epochMs;
}
|
|
1522
|
+
// Event `type` values that parseClaudeSession ignores (after their timestamp has been recorded).
var SKIPPED_TYPES = /* @__PURE__ */ new Set(["progress", "system", "file-history-snapshot"]);
|
|
1523
|
+
/**
 * Parse a Claude session log (one JSON event per line) into normalized messages
 * plus session metadata.
 *
 * - Lines that are not valid JSON, or that have no `type`, are skipped.
 * - Events whose type is in SKIPPED_TYPES and sidechain events are skipped, but
 *   their timestamps still count toward the session duration.
 * - `tool_result` blocks in user events are attached as `output` to the matching
 *   earlier assistant tool call.
 * - Consecutive assistant events sharing a `requestId` collapse into one message;
 *   the latest event that carries text or tool calls wins.
 *
 * @param jsonl full text of the session file
 * @returns `{ messages, source, tokenUsage, durationMs, costUsd }`; `tokenUsage` is
 *   undefined when no usage was seen, and `costUsd` is always null
 */
function parseClaudeSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let projectPath;
  let model;
  let startTimestamp;
  let endTimestamp;
  // Keyed by requestId so repeated events of one request are counted once.
  const usageByRequestId = new Map();
  let lastAssistantRequestId;
  let lastAssistantIdx = -1;
  // tool_use id -> position of the tool call awaiting its result.
  const pendingToolCalls = new Map();
  const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      continue;
    }
    // `?.` also guards against a line that parses to JSON `null`.
    if (!event?.type) continue;
    if (event.timestamp) {
      if (!startTimestamp) startTimestamp = event.timestamp;
      endTimestamp = event.timestamp;
    }
    if (SKIPPED_TYPES.has(event.type)) continue;
    if (event.isSidechain) continue;
    if (!sessionId && event.sessionId) {
      sessionId = event.sessionId;
    }
    if (!projectPath && event.cwd) {
      projectPath = event.cwd;
    }
    switch (event.type) {
      case "user": {
        const msg = event.message;
        if (!msg) break;
        const contentArr = msg.content;
        if (Array.isArray(contentArr)) {
          for (const block of contentArr) {
            if (block.type === "tool_result" && block.tool_use_id) {
              const pending = pendingToolCalls.get(block.tool_use_id);
              if (pending) {
                const existingMsg = messages[pending.msgIdx];
                const existingCalls = [...existingMsg.toolCalls ?? []];
                existingCalls[pending.toolIdx] = {
                  ...existingCalls[pending.toolIdx],
                  output: extractToolResultContent(block.content)
                };
                messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
                pendingToolCalls.delete(block.tool_use_id);
              }
            }
          }
        }
        const text = extractTextContent(contentArr);
        if (text !== void 0) {
          messages.push({ role: "user", content: text });
        }
        break;
      }
      case "assistant": {
        const msg = event.message;
        if (!msg) break;
        if (!model && msg.model) {
          model = msg.model;
        }
        if (msg.usage && event.requestId) {
          usageByRequestId.set(event.requestId, msg.usage);
        }
        const { text, toolCalls } = extractAssistantContent(msg.content);
        const hasContent = Boolean(text) || toolCalls.length > 0;
        const sameRequest = Boolean(event.requestId) && event.requestId === lastAssistantRequestId;
        if (!sameRequest) {
          // A new request must never reuse the previous request's slot, even when
          // this first event carries nothing worth storing.
          lastAssistantIdx = -1;
        }
        // Content-less events neither create a message nor wipe one already captured.
        if (hasContent) {
          const assistantMessage = {
            role: "assistant",
            content: text || void 0,
            toolCalls: toolCalls.length > 0 ? toolCalls : void 0
          };
          if (lastAssistantIdx >= 0) {
            messages[lastAssistantIdx] = assistantMessage;
          } else {
            lastAssistantIdx = messages.length;
            messages.push(assistantMessage);
          }
          registerPendingToolCalls(toolCalls, lastAssistantIdx, pendingToolCalls);
        }
        lastAssistantRequestId = event.requestId;
        break;
      }
    }
  }
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  for (const usage of usageByRequestId.values()) {
    totalInputTokens += Number(usage.input_tokens ?? 0);
    totalOutputTokens += Number(usage.output_tokens ?? 0);
  }
  const hasUsage = usageByRequestId.size > 0;
  let durationMs;
  if (startTimestamp && endTimestamp) {
    durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
  }
  const source = {
    provider: "claude",
    sessionId,
    projectPath,
    startedAt: startTimestamp,
    model
  };
  return {
    messages,
    source,
    tokenUsage: hasUsage ? { input: totalInputTokens, output: totalOutputTokens } : void 0,
    durationMs,
    costUsd: null
  };
}
|
|
1642
|
+
/**
 * Remember where each identified tool call lives so a later result can be attached.
 * @param toolCalls tool calls belonging to one message
 * @param msgIdx index of that message in the message list
 * @param pending Map from tool-call id to `{ msgIdx, toolIdx }`; mutated in place
 */
function registerPendingToolCalls(toolCalls, msgIdx, pending) {
  for (const [toolIdx, call] of toolCalls.entries()) {
    const callId = call.id;
    if (!callId) {
      continue; // calls without an id cannot be matched to a result
    }
    pending.set(callId, { msgIdx, toolIdx });
  }
}
|
|
1650
|
+
/**
 * Flatten message content to plain text.
 * @param content a string, an iterable of content blocks, or null/undefined
 * @returns the string itself, the concatenated `text` of all non-empty "text" blocks,
 *   or undefined when there is no content or no such block
 */
function extractTextContent(content) {
  if (content == null) {
    return void 0;
  }
  if (typeof content === "string") {
    return content;
  }
  const textParts = [...content]
    .filter((block) => block.type === "text" && block.text)
    .map((block) => block.text);
  if (textParts.length === 0) {
    return void 0;
  }
  return textParts.join("");
}
|
|
1661
|
+
/**
 * Split assistant message content into plain text and normalized tool calls.
 * @param content a string, an iterable of content blocks, or null/undefined
 * @returns `{ text, toolCalls }`: `text` is the concatenation of non-empty "text" blocks
 *   (undefined when there are none); `toolCalls` holds one normalized entry per named
 *   "tool_use" block. Other block types are ignored.
 */
function extractAssistantContent(content) {
  if (content == null) {
    return { text: void 0, toolCalls: [] };
  }
  if (typeof content === "string") {
    return { text: content, toolCalls: [] };
  }
  const textParts = [];
  const toolCalls = [];
  for (const block of content) {
    if (block.type === "text") {
      if (block.text) {
        textParts.push(block.text);
      }
    } else if (block.type === "tool_use" && block.name) {
      const normalized = normalizeToolCall("claude", {
        tool: block.name,
        input: block.input,
        id: block.id
      });
      toolCalls.push(normalized);
    }
  }
  const text = textParts.length > 0 ? textParts.join("") : void 0;
  return { text, toolCalls };
}
|
|
1693
|
+
/**
 * Reduce a tool result payload to plain text.
 * @param content a string, an iterable of content blocks, or null/undefined
 * @returns the string itself, the concatenated text of non-empty "text" blocks,
 *   or undefined when nothing textual is present
 */
function extractToolResultContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (content === void 0 || content === null) {
    return void 0;
  }
  const pieces = [];
  for (const block of content) {
    const isUsableText = block.type === "text" && Boolean(block.text);
    if (isUsableText) {
      pieces.push(block.text);
    }
  }
  return pieces.length === 0 ? void 0 : pieces.join("");
}
|
|
1704
|
+
/**
 * Read the raw input of a Codex tool-call item.
 * `arguments` is preferred; `input` is used as a fallback for items that carry their
 * payload there. String payloads are JSON-decoded when possible, otherwise kept verbatim.
 */
function parseCodexToolInput(payload) {
  const raw = payload.arguments ?? payload.input;
  if (typeof raw !== "string") return raw;
  try {
    return JSON.parse(raw);
  } catch {
    return raw;
  }
}

/**
 * Parse a Codex rollout log (one JSON entry per line) into normalized messages
 * plus session metadata.
 *
 * - Lines that are not valid JSON, or that have no `type`, are skipped.
 * - `session_meta` / `turn_context` entries supply session id, cwd, CLI version and model.
 * - `response_item` entries become messages: user/assistant text, or one assistant
 *   message per tool call. "developer" messages and "reasoning" items are dropped.
 * - Tool-call output items are attached as `output` to the call with the same `call_id`.
 *
 * @param jsonl full text of the rollout file
 * @returns `{ messages, source, tokenUsage, durationMs, costUsd }`; `tokenUsage` is always
 *   undefined and `costUsd` always null
 */
function parseCodexSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;
  // call_id -> position of the tool call awaiting its output.
  const pendingCalls = new Map();
  const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
  for (const line of lines) {
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // `?.` also guards against a line that parses to JSON `null`.
    if (!entry?.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            if (role === "developer") break;
            const content = extractResponseItemContent(payload.content);
            if (role === "user" && content) {
              messages.push({ role: "user", content });
            } else if (role === "assistant" && content) {
              messages.push({ role: "assistant", content });
            }
            break;
          }
          // Both call kinds are recorded the same way; only the field holding
          // the input can differ, which parseCodexToolInput absorbs.
          case "function_call":
          case "custom_tool_call": {
            const toolName = String(payload.name ?? "");
            const callId = String(payload.call_id ?? "");
            const toolCall = normalizeToolCall("codex", {
              tool: toolName,
              input: parseCodexToolInput(payload),
              id: callId
            });
            const msgIdx = messages.length;
            messages.push({
              role: "assistant",
              toolCalls: [toolCall]
            });
            if (callId) {
              pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
            }
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            const callId = String(payload.call_id ?? "");
            const pending = pendingCalls.get(callId);
            if (pending) {
              const existingMsg = messages[pending.msgIdx];
              const existingCalls = [...existingMsg.toolCalls ?? []];
              existingCalls[pending.toolIdx] = {
                ...existingCalls[pending.toolIdx],
                output: payload.output
              };
              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
              pendingCalls.delete(callId);
            }
            break;
          }
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }
  let durationMs;
  if (startTimestamp && endTimestamp) {
    durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
  }
  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
|
|
1861
|
+
/**
 * Flatten the `content` of a response item to plain text.
 * @param content a string, an array of blocks, or anything else
 * @returns the string itself; for arrays, the concatenation of every block's string
 *   `text` field (undefined when no block has one); undefined for any other input
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const hasStringText = (block) =>
    typeof block === "object" && block !== null && typeof block.text === "string";
  const texts = content.filter(hasStringText).map((block) => block.text);
  if (texts.length === 0) {
    return void 0;
  }
  return texts.join("");
}
|
|
1875
|
+
// Default Codex sessions directory: <home>/.codex/sessions. Evaluated lazily on each call.
var DEFAULT_SESSIONS_DIR = () => path6.join(homedir(), ".codex", "sessions");
|
|
1876
|
+
/**
 * Find Codex rollout files under `<sessionsDir>/<year>/<month>/<day>/rollout-*.jsonl`.
 *
 * @param opts optional settings:
 *   - `sessionsDir`: root to scan (defaults to DEFAULT_SESSIONS_DIR())
 *   - `date`: only scan the day directory whose `year-month-day` equals this string
 *   - `latest`: when truthy, return at most one session
 *   - `limit`: maximum number of sessions (default 10; ignored when `latest` is set)
 * @returns sessions `{ sessionId, filePath, filename, updatedAt }`, newest first.
 *   Unreadable directories are skipped; an unreadable root yields an empty list.
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;

  // Directory listing that reports failure as null instead of throwing.
  const listDir = async (dir) => {
    try {
      return await readdir2(dir);
    } catch {
      return null;
    }
  };
  // Modification time, falling back to the epoch when the file cannot be stat'ed.
  const mtimeOf = async (file) => {
    try {
      const info = await stat2(file);
      return info.mtime;
    } catch {
      return /* @__PURE__ */ new Date(0);
    }
  };

  const yearDirs = await listDir(sessionsDir);
  if (yearDirs === null) {
    return [];
  }

  const sessions = [];
  for (const year of yearDirs) {
    const yearPath = path6.join(sessionsDir, year);
    const monthDirs = await listDir(yearPath);
    if (monthDirs === null) continue;

    for (const month of monthDirs) {
      const monthPath = path6.join(yearPath, month);
      const dayDirs = await listDir(monthPath);
      if (dayDirs === null) continue;

      for (const day of dayDirs) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;

        const dayPath = path6.join(monthPath, day);
        const files = await listDir(dayPath);
        if (files === null) continue;

        for (const file of files) {
          const isRollout = file.startsWith("rollout-") && file.endsWith(".jsonl");
          if (!isRollout) continue;

          const filePath = path6.join(dayPath, file);
          const stem = file.replace(/\.jsonl$/, "");
          const segments = stem.split("-");
          // With six or more dash-separated segments, the last five form the session id.
          const sessionId = segments.length >= 6 ? segments.slice(-5).join("-") : stem;
          const updatedAt = await mtimeOf(filePath);
          sessions.push({ sessionId, filePath, filename: file, updatedAt });
        }
      }
    }
  }

  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
|
|
1935
|
+
// Default Claude projects directory: <home>/.claude/projects. Evaluated lazily on each call.
var DEFAULT_PROJECTS_DIR = () => path7.join(homedir2(), ".claude", "projects");
|
|
1936
|
+
/**
 * Turn a project path into the directory-name form used for lookup
 * by replacing every "/" with "-".
 * @param projectPath path string
 * @returns the path with all forward slashes replaced by dashes
 */
function encodeProjectPath(projectPath) {
  const segments = projectPath.split("/");
  return segments.join("-");
}
|
|
1939
|
+
/**
 * Find Claude session files (`<projectsDir>/<projectDir>/<sessionId>.jsonl`).
 *
 * @param opts optional settings:
 *   - `projectsDir`: root to scan (defaults to DEFAULT_PROJECTS_DIR())
 *   - `projectPath`: keep only project directories whose name equals or contains the encoded path
 *   - `sessionId`: keep only the session file with this id
 *   - `latest`: when truthy, return at most one session
 *   - `limit`: maximum number of sessions (default 10; ignored when `latest` is set)
 * @returns sessions `{ sessionId, filePath, projectDir, updatedAt }`, newest first.
 *   Unreadable project directories are skipped; an unreadable root yields an empty list.
 */
async function discoverClaudeSessions(opts) {
  const projectsDir = opts?.projectsDir ?? DEFAULT_PROJECTS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;

  let projectDirs;
  try {
    projectDirs = await readdir3(projectsDir);
  } catch {
    return [];
  }

  if (opts?.projectPath) {
    const encoded = encodeProjectPath(opts.projectPath);
    const matchesProject = (dir) => dir === encoded || dir.includes(encoded);
    projectDirs = projectDirs.filter(matchesProject);
  }

  const sessions = [];
  for (const projectDir of projectDirs) {
    const dirPath = path7.join(projectsDir, projectDir);

    let entries;
    try {
      entries = await readdir3(dirPath);
    } catch {
      continue;
    }

    const sessionFiles = entries.filter((name) => name.endsWith(".jsonl"));
    for (const fileName of sessionFiles) {
      const sessionId = fileName.replace(/\.jsonl$/, "");
      const isWanted = !opts?.sessionId || sessionId === opts.sessionId;
      if (!isWanted) continue;

      const filePath = path7.join(dirPath, fileName);
      // A file that cannot be stat'ed is still listed, dated at the epoch.
      let updatedAt = /* @__PURE__ */ new Date(0);
      try {
        updatedAt = (await stat3(filePath)).mtime;
      } catch {
      }
      sessions.push({ sessionId, filePath, projectDir, updatedAt });
    }
  }

  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
|
|
1984
|
+
/**
 * Expand one transcript entry into one snake_case JSONL record per message.
 * Every record repeats the entry-level source, token usage, duration and cost.
 *
 * @param entry `{ messages, source, tokenUsage?, durationMs, costUsd }`
 * @param options optional `{ testId, target }` overrides; they default to the
 *   source's session id and provider respectively
 * @returns array of line objects, in message order
 */
function toTranscriptJsonLines(entry, options) {
  const origin = entry.source;
  const source = {
    provider: origin.provider,
    session_id: origin.sessionId,
    model: origin.model,
    timestamp: origin.startedAt,
    git_branch: origin.gitBranch,
    cwd: origin.cwd ?? origin.projectPath,
    version: origin.version
  };

  let transcriptTokenUsage = void 0;
  if (entry.tokenUsage) {
    const { input, output, cached, reasoning } = entry.tokenUsage;
    transcriptTokenUsage = { input, output, cached, reasoning };
  }

  const testId = options?.testId ?? origin.sessionId;
  const target = options?.target ?? origin.provider;

  const toLine = (message, index) => ({
    test_id: testId,
    target,
    message_index: index,
    ...toSnakeCaseDeep(message),
    transcript_token_usage: transcriptTokenUsage,
    transcript_duration_ms: entry.durationMs,
    transcript_cost_usd: entry.costUsd,
    source
  });
  return entry.messages.map(toLine);
}
|
|
2013
|
+
/**
 * Rebuild a replayable message from one transcript JSONL record: camelCase the
 * record, then keep only the message-level fields (transcript/source fields are dropped).
 * @param line one parsed transcript record
 * @returns message object with role, name, content, toolCalls, timing, metadata and tokenUsage
 */
function buildReplayMessage(line) {
  const {
    role,
    name,
    content,
    toolCalls,
    startTime,
    endTime,
    durationMs,
    metadata,
    tokenUsage
  } = toCamelCaseDeep(line);
  return { role, name, content, toolCalls, startTime, endTime, durationMs, metadata, tokenUsage };
}
|
|
2027
|
+
/**
 * Group transcript JSONL records back into one entry per `test_id`.
 *
 * Entry-level fields (target, token usage, duration, cost, source) come from the
 * first record seen for each test id. Messages are ordered by `message_index`.
 * Entries are returned in order of first appearance.
 *
 * @param lines parsed transcript records
 * @returns array of `{ testId, target, tokenUsage, durationMs, costUsd, source, messages }`
 */
function groupTranscriptJsonLines(lines) {
  const grouped = new Map();
  for (const line of lines) {
    const replayItem = { index: line.message_index, message: buildReplayMessage(line) };
    const existing = grouped.get(line.test_id);
    if (existing) {
      existing.messages.push(replayItem);
      continue;
    }

    // Entry-level metadata is only needed for the first record of each test id, so it
    // is built here rather than on every line. `?.` keeps a record with no `source`
    // from aborting the whole load.
    const recordSource = line.source;
    const usage = line.transcript_token_usage;
    grouped.set(line.test_id, {
      target: line.target,
      tokenUsage: usage ? {
        input: usage.input,
        output: usage.output,
        cached: usage.cached,
        reasoning: usage.reasoning
      } : void 0,
      durationMs: line.transcript_duration_ms,
      costUsd: line.transcript_cost_usd,
      source: {
        provider: recordSource?.provider,
        sessionId: recordSource?.session_id,
        startedAt: recordSource?.timestamp,
        model: recordSource?.model,
        gitBranch: recordSource?.git_branch,
        cwd: recordSource?.cwd,
        version: recordSource?.version
      },
      messages: [replayItem]
    });
  }
  return [...grouped.entries()].map(([testId, entry]) => ({
    testId,
    target: entry.target,
    tokenUsage: entry.tokenUsage,
    durationMs: entry.durationMs,
    costUsd: entry.costUsd,
    source: entry.source,
    messages: entry.messages.sort((first, second) => first.index - second.index).map((item) => item.message)
  }));
}
|
|
2069
|
+
/**
 * Read a JSONL file and parse every non-blank line as JSON.
 * @param filePath path of the UTF-8 file to read
 * @returns array with one parsed value per non-blank line, in file order
 * @throws whatever the file read or `JSON.parse` throws
 */
async function readTranscriptJsonl(filePath) {
  const raw = await readFile3(filePath, "utf8");
  const records = [];
  for (const rawLine of raw.split("\n")) {
    if (rawLine.trim().length === 0) {
      continue; // blank or whitespace-only line
    }
    records.push(JSON.parse(rawLine));
  }
  return records;
}
|
|
2073
|
+
/**
 * Read a transcript file as UTF-8 text.
 * @param filePath path of the file to read
 * @returns the file contents as a string
 */
async function readTranscriptFile(filePath) {
  const contents = await readFile3(filePath, "utf8");
  return contents;
}
|
|
2076
|
+
/**
 * Provider that replays pre-recorded transcript entries instead of calling a model.
 * Each `invoke` call returns the next entry in order; the request itself is ignored.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  entries;
  // Index of the next entry to hand out.
  cursor = 0;

  /**
   * @param targetName name used for `targetName` and for the `transcript:<name>` id
   * @param entries grouped transcript entries, replayed in array order
   */
  constructor(targetName, entries) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.entries = entries;
  }

  /**
   * Create a TranscriptProvider from a JSONL file path.
   * The target name is the first entry's source provider, or "transcript" when absent.
   * @throws Error when the file contains no records
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    const isEmpty = lines.length === 0;
    if (isEmpty) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    const entries = groupTranscriptJsonLines(lines);
    const [firstEntry] = entries;
    const providerName = firstEntry?.source.provider ?? "transcript";
    return new _TranscriptProvider(providerName, entries);
  }

  /** Number of transcript entries held by this provider. */
  get lineCount() {
    return this.entries.length;
  }

  /**
   * Return the next recorded entry as a provider response.
   * @throws Error once every entry has been consumed
   */
  async invoke(_request) {
    const available = this.entries.length;
    if (this.cursor >= available) {
      const noun = `entr${available === 1 ? "y" : "ies"}`;
      throw new Error(
        `Transcript exhausted: ${available} ${noun} available but ${this.cursor + 1} invocations attempted. Each transcript entry maps to one test case.`
      );
    }
    const entry = this.entries[this.cursor];
    this.cursor += 1;

    let tokenUsage = void 0;
    if (entry.tokenUsage) {
      const { input, output, cached, reasoning } = entry.tokenUsage;
      tokenUsage = { input, output, cached, reasoning };
    }
    return {
      output: entry.messages,
      tokenUsage,
      durationMs: entry.durationMs,
      // A null cost is reported as undefined.
      costUsd: entry.costUsd ?? void 0,
      startTime: entry.source.startedAt
    };
  }
};
|
|
2123
|
+
/**
 * Placeholder factory: returns a fresh object whose only field is `status: "stub"`.
 */
function createAgentKernel() {
  const kernel = { status: "stub" };
  return kernel;
}
|
|
2126
|
+
|
|
2127
|
+
export {
|
|
2128
|
+
transpileEvalYaml,
|
|
2129
|
+
transpileEvalYamlFile,
|
|
2130
|
+
getOutputFilenames,
|
|
2131
|
+
defineConfig,
|
|
2132
|
+
loadTsConfig,
|
|
2133
|
+
generateRubrics,
|
|
2134
|
+
scanRepoDeps,
|
|
2135
|
+
ResponseCache,
|
|
2136
|
+
shouldEnableCache,
|
|
2137
|
+
shouldSkipCacheForTemperature,
|
|
2138
|
+
normalizeResultsExportConfig,
|
|
2139
|
+
resolveResultsRepoUrl,
|
|
2140
|
+
getResultsRepoCachePaths,
|
|
2141
|
+
ensureResultsRepoClone,
|
|
2142
|
+
getResultsRepoStatus,
|
|
2143
|
+
syncResultsRepo,
|
|
2144
|
+
checkoutResultsRepoBranch,
|
|
2145
|
+
prepareResultsRepoBranch,
|
|
2146
|
+
stageResultsArtifacts,
|
|
2147
|
+
resolveResultsRepoRunsDir,
|
|
2148
|
+
directorySizeBytes,
|
|
2149
|
+
commitAndPushResultsBranch,
|
|
2150
|
+
pushResultsRepoBranch,
|
|
2151
|
+
createDraftResultsPr,
|
|
2152
|
+
getBenchmarksRegistryPath,
|
|
2153
|
+
loadBenchmarkRegistry,
|
|
2154
|
+
saveBenchmarkRegistry,
|
|
2155
|
+
deriveBenchmarkId,
|
|
2156
|
+
addBenchmark,
|
|
2157
|
+
removeBenchmark,
|
|
2158
|
+
getBenchmark,
|
|
2159
|
+
touchBenchmark,
|
|
2160
|
+
discoverBenchmarks,
|
|
2161
|
+
trimBaselineResult,
|
|
2162
|
+
DEFAULT_CATEGORY,
|
|
2163
|
+
deriveCategory,
|
|
2164
|
+
OTEL_BACKEND_PRESETS,
|
|
2165
|
+
OtelTraceExporter,
|
|
2166
|
+
OtelStreamingObserver,
|
|
2167
|
+
parseClaudeSession,
|
|
2168
|
+
parseCodexSession,
|
|
2169
|
+
discoverCodexSessions,
|
|
2170
|
+
discoverClaudeSessions,
|
|
2171
|
+
toTranscriptJsonLines,
|
|
2172
|
+
groupTranscriptJsonLines,
|
|
2173
|
+
readTranscriptJsonl,
|
|
2174
|
+
readTranscriptFile,
|
|
2175
|
+
TranscriptProvider,
|
|
2176
|
+
createAgentKernel
|
|
2177
|
+
};
|
|
2178
|
+
//# sourceMappingURL=chunk-R2QDYORI.js.map
|