agentv 4.17.1 → 4.18.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,689 @@
1
+ import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
+ import {
3
+ DEFAULT_THRESHOLD,
4
+ toTranscriptJsonLines
5
+ } from "./chunk-RCOAXXHP.js";
6
+
7
+ // src/commands/eval/artifact-writer.ts
8
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
9
+ import path2 from "node:path";
10
+
11
+ // src/utils/case-conversion.ts
12
/**
 * Converts a camelCase identifier to snake_case.
 *
 * Identifiers that start with an uppercase ASCII letter are returned
 * unchanged. Otherwise every uppercase ASCII letter is replaced by an
 * underscore followed by its lowercase form.
 *
 * @param str Identifier to convert.
 * @returns The converted identifier.
 */
function toSnakeCase(str) {
  const startsWithCapital = /^[A-Z]/.test(str);
  const underscoreLower = (upper) => "_" + upper.toLowerCase();
  return startsWithCapital ? str : str.replace(/[A-Z]/g, underscoreLower);
}
18
/**
 * Recursively rewrites object keys with `toSnakeCase`.
 *
 * `null`, `undefined` and non-object values are returned as-is. Arrays are
 * mapped element by element. Any other object is rebuilt as a new plain
 * object from its own enumerable string keys.
 *
 * @param obj Value to convert.
 * @returns A converted copy (or the original primitive / nullish value).
 */
function toSnakeCaseDeep(obj) {
  if (obj == null) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  for (const key of Object.keys(obj)) {
    converted[toSnakeCase(key)] = toSnakeCaseDeep(obj[key]);
  }
  return converted;
}
35
+
36
+ // src/commands/eval/result-layout.ts
37
+ import { existsSync, statSync } from "node:fs";
38
+ import path from "node:path";
39
/** File name of the JSONL manifest written into each run directory. */
var RESULT_INDEX_FILENAME = "index.jsonl";
/** Directory name, below `.agentv/results`, that holds run directories. */
var RESULT_RUNS_DIRNAME = "runs";
/** Experiment name used when the caller supplies none (or only whitespace). */
var DEFAULT_EXPERIMENT_NAME = "default";
42
/**
 * Validates an experiment name so it can be used as a single directory name.
 *
 * @param experiment Raw experiment name; may be undefined.
 * @returns The trimmed name, or `DEFAULT_EXPERIMENT_NAME` when the input is
 *   missing or blank.
 * @throws {Error} When the name contains characters other than letters,
 *   digits, ".", "_" and "-", or consists only of dots.
 */
function normalizeExperimentName(experiment) {
  const trimmed = experiment?.trim();
  if (!trimmed) {
    return DEFAULT_EXPERIMENT_NAME;
  }
  if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
    throw new Error(
      `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
    );
  }
  // "." and ".." satisfy the character class above but are navigation
  // segments: joined into a path they would resolve to the runs directory
  // itself or to its parent instead of naming a sub-directory.
  if (/^\.+$/.test(trimmed)) {
    throw new Error(
      `Invalid experiment name "${trimmed}". A name made only of "." characters is not allowed.`
    );
  }
  return trimmed;
}
54
/**
 * Builds a run directory name from a timestamp.
 *
 * The ISO-8601 string has every ":" and "." replaced with "-".
 *
 * @param timestamp Date to encode; defaults to the current time.
 * @returns For example "1970-01-01T00-00-00-000Z".
 */
function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
  const iso = timestamp.toISOString();
  return iso.split(/[:.]/).join("-");
}
57
/**
 * Computes the default run directory:
 * `<cwd>/.agentv/results/<runs dir>/<experiment>/<timestamp dir>`.
 *
 * @param cwd Base directory.
 * @param experiment Experiment name; validated by `normalizeExperimentName`.
 * @param timestamp Date used for the final segment; defaults to now.
 * @returns The joined path.
 * @throws {Error} When the experiment name is rejected.
 */
function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
  const experimentDir = normalizeExperimentName(experiment);
  const runDirName = createRunDirName(timestamp);
  const segments = [cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, experimentDir, runDirName];
  return path.join(...segments);
}
67
/**
 * Returns the path of the index manifest inside a run directory.
 *
 * @param runDir Run directory.
 * @returns `runDir` joined with `RESULT_INDEX_FILENAME`.
 */
function resolveRunIndexPath(runDir) {
  const manifestPath = path.join(runDir, RESULT_INDEX_FILENAME);
  return manifestPath;
}
70
/**
 * Tells whether a path's final component is the index manifest file name.
 *
 * @param filePath Path to inspect (not required to exist).
 * @returns `true` when the base name equals `RESULT_INDEX_FILENAME`.
 */
function isRunManifestPath(filePath) {
  const fileName = path.basename(filePath);
  return fileName === RESULT_INDEX_FILENAME;
}
73
/**
 * Returns the index manifest path of a run directory when that file exists.
 *
 * @param runDir Run directory.
 * @returns The manifest path, or `undefined` when it is not on disk.
 */
function resolveExistingRunPrimaryPath(runDir) {
  const candidate = resolveRunIndexPath(runDir);
  return existsSync(candidate) ? candidate : void 0;
}
80
/**
 * Tells whether a path refers to a directory.
 *
 * @param filePath Path to check.
 * @returns `true` for a directory; `false` otherwise, including when
 *   `statSync` throws (for example because the path does not exist).
 */
function isDirectoryPath(filePath) {
  let stats;
  try {
    stats = statSync(filePath);
  } catch {
    return false;
  }
  return stats.isDirectory();
}
87
/**
 * Resolves a path that is either a run workspace directory or a file.
 *
 * Anything that is not a directory is returned unchanged. A directory is
 * resolved to the index manifest it contains.
 *
 * @param filePath File path or workspace directory.
 * @returns The file path, or the manifest path inside the directory.
 * @throws {Error} When the directory has no index manifest.
 */
function resolveWorkspaceOrFilePath(filePath) {
  if (isDirectoryPath(filePath)) {
    const manifest = resolveExistingRunPrimaryPath(filePath);
    if (manifest) {
      return manifest;
    }
    throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
  }
  return filePath;
}
97
/**
 * Resolves a run workspace directory or manifest path to the manifest path.
 *
 * @param filePath Workspace directory, or a path whose base name is the
 *   index manifest file name.
 * @returns The manifest path.
 * @throws {Error} When a directory has no manifest, or when a non-directory
 *   path is not named like the manifest.
 */
function resolveRunManifestPath(filePath) {
  if (isDirectoryPath(filePath)) {
    return resolveWorkspaceOrFilePath(filePath);
  }
  if (isRunManifestPath(filePath)) {
    return filePath;
  }
  throw new Error(
    `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
  );
}
108
+
109
+ // src/commands/eval/artifact-writer.ts
110
/**
 * Builds the "<testId>::<target>" key that identifies one result.
 *
 * @param testId Test id; `null`/`undefined` becomes "unknown".
 * @param target Target name; `null`/`undefined` becomes "unknown".
 * @returns The composite key.
 */
function buildTestTargetKey(testId, target) {
  const idPart = testId ?? "unknown";
  const targetPart = target ?? "unknown";
  return [idPart, targetPart].join("::");
}
113
/**
 * Removes results that are superseded by a later result with the same
 * test id / target key.
 *
 * Only the last occurrence of each key is kept. The survivors keep their
 * relative order from the input.
 *
 * @param results Result records, in the order they were recorded.
 * @returns A new array holding the last result for each key.
 */
function deduplicateByTestIdTarget(results) {
  const keys = results.map((entry) => buildTestTargetKey(entry.testId, entry.target));
  const lastIndexByKey = /* @__PURE__ */ new Map();
  keys.forEach((key, index) => {
    // Later entries overwrite earlier ones, so the map ends up holding the
    // index of the final occurrence of every key.
    lastIndexByKey.set(key, index);
  });
  return results.filter((_entry, index) => lastIndexByKey.get(keys[index]) === index);
}
127
/**
 * Rebuilds the run-level `timing.json` and `benchmark.json` from the index
 * manifest stored in a run directory.
 *
 * The manifest is parsed and de-duplicated per test id / target before
 * aggregation.
 *
 * @param runDir Run directory containing the index manifest.
 * @param options Optional `evalFile` and `experiment`, forwarded to the
 *   benchmark artifact.
 * @returns Paths of the two written files plus the number of de-duplicated
 *   results (`testCount`) and of distinct targets (`targetCount`).
 */
async function aggregateRunDir(runDir, options) {
  const toPrettyJson = (value) => JSON.stringify(value, null, 2) + "\n";
  const manifestText = await readFile(path2.join(runDir, RESULT_INDEX_FILENAME), "utf8");
  const results = deduplicateByTestIdTarget(parseJsonlResults(manifestText));

  const timingPath = path2.join(runDir, "timing.json");
  await writeFile(timingPath, toPrettyJson(buildTimingArtifact(results)), "utf8");

  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
  const benchmarkPath = path2.join(runDir, "benchmark.json");
  await writeFile(benchmarkPath, toPrettyJson(benchmark), "utf8");

  const distinctTargets = /* @__PURE__ */ new Set();
  for (const entry of results) {
    distinctTargets.add(entry.target ?? "unknown");
  }
  return {
    benchmarkPath,
    timingPath,
    testCount: results.length,
    targetCount: distinctTargets.size
  };
}
143
/**
 * Computes the mean and population standard deviation of a list of numbers.
 *
 * Both figures are rounded to three decimal places.
 *
 * @param values Numbers to summarize.
 * @returns `{ mean, stddev }`; both are 0 for an empty list.
 */
function computeStats(values) {
  const count = values.length;
  if (count === 0) {
    return { mean: 0, stddev: 0 };
  }
  const roundTo3 = (x) => Math.round(x * 1e3) / 1e3;
  let total = 0;
  for (const v of values) {
    total += v;
  }
  const mean = total / count;
  let squaredDeviationSum = 0;
  for (const v of values) {
    squaredDeviationSum += (v - mean) ** 2;
  }
  // Divides by N (not N - 1): population variance.
  const variance = squaredDeviationSum / count;
  return { mean: roundTo3(mean), stddev: roundTo3(Math.sqrt(variance)) };
}
154
/**
 * Computes the pass rate of one result.
 *
 * With a non-empty `scores` list, the rate is the fraction of entries whose
 * `score` reaches `DEFAULT_THRESHOLD`. Otherwise it is 1 or 0 depending on
 * whether `result.score` (0 when absent) reaches the threshold.
 *
 * @param result Result record.
 * @returns A number between 0 and 1.
 */
function computePassRate(result) {
  const meetsThreshold = (value) => value >= DEFAULT_THRESHOLD;
  const graderScores = result.scores;
  if (!graderScores || !(graderScores.length > 0)) {
    return meetsThreshold(result.score ?? 0) ? 1 : 0;
  }
  let passing = 0;
  for (const entry of graderScores) {
    if (meetsThreshold(entry.score)) {
      passing += 1;
    }
  }
  return passing / graderScores.length;
}
162
/**
 * Counts tool invocations in a result's trace.
 *
 * A step counts when it has a truthy `toolName` or its `type` is "tool".
 * Steps without a `toolName` are tallied under "unknown".
 *
 * @param result Result record; `trace` and `trace.steps` may be absent.
 * @returns `toolCalls` (count per tool name) and the overall `total`.
 */
function countToolCalls(result) {
  const toolCalls = {};
  let total = 0;
  const steps = result.trace?.steps;
  if (!steps) {
    return { toolCalls, total };
  }
  for (const { toolName, type } of steps) {
    const isToolStep = toolName || type === "tool";
    if (!isToolStep) {
      continue;
    }
    const name = toolName ?? "unknown";
    toolCalls[name] = (toolCalls[name] ?? 0) + 1;
    total += 1;
  }
  return { toolCalls, total };
}
177
/**
 * Summarizes a diff text describing workspace file changes.
 *
 * Lines starting with "--- /dev/null" count as created files and lines
 * starting with "--- a/" as modified files. The summary is the whole text,
 * or the first 20 lines plus a "... (N more lines)" trailer when longer.
 *
 * @param fileChanges Diff text; may be empty or undefined.
 * @returns `undefined` for missing/empty input, otherwise the counts and
 *   the summary text.
 */
function parseWorkspaceChanges(fileChanges) {
  if (!fileChanges) {
    return void 0;
  }
  const lines = fileChanges.split("\n");
  const countStartingWith = (prefix) => lines.filter((line) => line.startsWith(prefix)).length;
  const filesCreated = countStartingWith("--- /dev/null");
  const filesModified = countStartingWith("--- a/");

  let diffSummary = fileChanges;
  if (lines.length > 20) {
    const hiddenLines = lines.length - 20;
    diffSummary = lines.slice(0, 20).join("\n") + "\n... (" + hiddenLines + " more lines)";
  }
  return {
    files_modified: filesModified,
    files_created: filesCreated,
    diff_summary: diffSummary
  };
}
200
/**
 * Projects a result's assertions onto `{ text, passed, evidence }` records.
 *
 * @param result Result record; `assertions` may be absent.
 * @returns One record per assertion, with `evidence` defaulting to "";
 *   an empty array when the result has no assertions.
 */
function buildAssertions(result) {
  const source = result.assertions;
  if (!source) {
    return [];
  }
  return source.map(({ text, passed, evidence }) => ({
    text,
    passed,
    evidence: evidence ?? ""
  }));
}
208
/**
 * Converts per-grader score entries into the grader records stored in a
 * grading artifact.
 *
 * @param scores Score entries; may be absent or empty.
 * @returns `undefined` when there are no scores; otherwise one record per
 *   entry. `reasoning` is always written as an empty string.
 */
function buildEvaluators(scores) {
  if (!scores || scores.length === 0) {
    return void 0;
  }
  return scores.map(({ name, type, score, weight, verdict, assertions, details }) => ({
    name,
    type,
    score,
    reasoning: "",
    weight,
    verdict,
    assertions,
    details
  }));
}
223
/**
 * Builds the per-test grading artifact (the content of `grading.json`).
 *
 * Sections: the assertions, a pass/fail summary (pass rate rounded to three
 * decimals, 0 when there are no assertions), execution metrics (tool call
 * counts and whether `result.error` is set), grader records, a workspace
 * change summary, and conversation info when `conversationId` is present.
 *
 * @param result Result record.
 * @returns The grading artifact object.
 */
function buildGradingArtifact(result) {
  const assertions = buildAssertions(result);
  const total = assertions.length;
  let passed = 0;
  for (const assertion of assertions) {
    if (assertion.passed) {
      passed += 1;
    }
  }
  const failed = total - passed;
  const toolStats = countToolCalls(result);

  let conversation;
  if (result.conversationId) {
    conversation = {
      // Number of trace steps; 0 when there is no trace or no step list.
      turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
      conversation_id: result.conversationId
    };
  }

  return {
    assertions,
    summary: {
      passed,
      failed,
      total,
      pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
    },
    execution_metrics: {
      tool_calls: toolStats.toolCalls,
      total_tool_calls: toolStats.total,
      errors_encountered: result.error ? 1 : 0
    },
    graders: buildEvaluators(result.scores),
    workspace_changes: parseWorkspaceChanges(result.fileChanges),
    conversation
  };
}
251
/**
 * Sums token usage and duration over a list of results.
 *
 * `total_tokens` is input plus output tokens; reasoning tokens are reported
 * separately under `token_usage.reasoning`.
 *
 * @param results Result records; `tokenUsage` and `durationMs` may be absent.
 * @returns The timing artifact object.
 */
function buildTimingArtifact(results) {
  const tokens = { input: 0, output: 0, reasoning: 0 };
  let durationMs = 0;
  for (const { tokenUsage, durationMs: elapsed } of results) {
    if (tokenUsage) {
      tokens.input += tokenUsage.input ?? 0;
      tokens.output += tokenUsage.output ?? 0;
      tokens.reasoning += tokenUsage.reasoning ?? 0;
    }
    if (elapsed != null) {
      durationMs += elapsed;
    }
  }
  return {
    total_tokens: tokens.input + tokens.output,
    duration_ms: durationMs,
    total_duration_seconds: Math.round(durationMs / 1e3 * 1e3) / 1e3,
    token_usage: tokens
  };
}
278
/**
 * Builds the run-level benchmark artifact (the content of `benchmark.json`).
 *
 * For every target it reports mean/stddev of pass rate, duration in seconds
 * and token count (input + output), plus tool calls when any result made one
 * and cost when any result carries `costUsd`. It also aggregates scores per
 * grader, keyed "<name>:<type>".
 *
 * Results without a `target` are grouped under "unknown", the same label
 * used in `metadata.targets`, so their statistics are not lost.
 *
 * @param results Result records.
 * @param evalFile Eval file path recorded in the metadata; defaults to "".
 * @param experiment Experiment name recorded in the metadata.
 * @returns The benchmark artifact object.
 */
function buildBenchmarkArtifact(results, evalFile = "", experiment) {
  // Group once so each target's results are found without rescanning the
  // full list, and so the grouping key matches the reported target label.
  const resultsByTarget = /* @__PURE__ */ new Map();
  const testIdSet = /* @__PURE__ */ new Set();
  for (const result of results) {
    const target = result.target ?? "unknown";
    const bucket = resultsByTarget.get(target);
    if (bucket) {
      bucket.push(result);
    } else {
      resultsByTarget.set(target, [result]);
    }
    testIdSet.add(result.testId ?? "unknown");
  }
  const targets = [...resultsByTarget.keys()].sort();
  const testIds = [...testIdSet].sort();
  const runSummary = {};
  const notes = [];
  for (const target of targets) {
    const targetResults = resultsByTarget.get(target) ?? [];
    const passRates = targetResults.map(computePassRate);
    const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
    const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
      const usage = r.tokenUsage;
      return (usage.input ?? 0) + (usage.output ?? 0);
    });
    const entry = {
      pass_rate: computeStats(passRates),
      time_seconds: computeStats(timings),
      tokens: computeStats(tokens)
    };
    const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
    if (toolCallCounts.some((c) => c > 0)) {
      entry.tool_calls = computeStats(toolCallCounts);
    }
    const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
    if (costs.length > 0) {
      entry.cost_usd = computeStats(costs);
    }
    runSummary[target] = entry;
  }
  const evaluatorScores = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (!result.scores) {
      continue;
    }
    for (const score of result.scores) {
      const key = `${score.name}:${score.type}`;
      const collected = evaluatorScores.get(key);
      if (collected) {
        collected.push(score.score);
      } else {
        evaluatorScores.set(key, [score.score]);
      }
    }
  }
  // Left undefined (rather than {}) when no result carries grader scores.
  let perEvaluatorSummary;
  if (evaluatorScores.size > 0) {
    perEvaluatorSummary = {};
    for (const [key, scores] of evaluatorScores) {
      perEvaluatorSummary[key] = computeStats(scores);
    }
  }
  const errorCount = results.filter((r) => r.executionStatus === "execution_error").length;
  if (errorCount > 0) {
    notes.push(
      `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
    );
  }
  if (results.length === 0) {
    notes.push("No results to summarize");
  }
  // The run timestamp is taken from the first result; "now" when there is none.
  const firstResult = results[0];
  const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
  return {
    metadata: {
      eval_file: evalFile,
      timestamp,
      targets,
      tests_run: testIds,
      experiment
    },
    run_summary: runSummary,
    per_grader_summary: perEvaluatorSummary,
    notes
  };
}
357
/**
 * Flattens the assertions of all results into one grading summary.
 *
 * Each assertion record carries the `test_id` of its result ("unknown"
 * when the result has no `testId`). Results without assertions are skipped.
 *
 * @param results Result records.
 * @returns The flattened assertions and a summary whose pass rate is
 *   rounded to three decimals (0 when there are no assertions).
 */
function buildAggregateGradingArtifact(results) {
  const assertions = results.flatMap((result) => {
    if (!result.assertions) {
      return [];
    }
    const ownerId = result.testId ?? "unknown";
    return result.assertions.map((a) => ({
      test_id: ownerId,
      text: a.text,
      passed: a.passed,
      evidence: a.evidence ?? ""
    }));
  });
  const total = assertions.length;
  const passed = assertions.reduce((count, a) => (a.passed ? count + 1 : count), 0);
  const failed = total - passed;
  const passRate = total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0;
  return {
    assertions,
    summary: { passed, failed, total, pass_rate: passRate }
  };
}
384
/**
 * Turns an arbitrary string into a single safe path segment.
 *
 * The characters / \ : * ? " < > | are replaced with "_". A value made
 * only of dots (such as "." or "..") has its dots replaced too, because
 * such a segment would otherwise navigate to the current or parent
 * directory when joined into a path.
 *
 * @param value Raw segment; may be undefined.
 * @param fallback Returned when `value` is missing or blank.
 * @returns The sanitized segment.
 */
function safeArtifactPathSegment(value, fallback) {
  const trimmed = value?.trim();
  if (!trimmed) {
    return fallback;
  }
  const sanitized = trimmed.replace(/[/\\:*?"<>|]/g, "_");
  // Separator replacement cannot create new dots, so a dot-only result
  // means the input itself was a navigation segment.
  return /^\.+$/.test(sanitized) ? sanitized.replace(/\./g, "_") : sanitized;
}
391
/**
 * Sanitizes a test id for use as a directory name.
 *
 * @param testId Raw test id; may be undefined.
 * @returns The sanitized id, or "unknown" when it is missing or blank.
 */
function safeTestId(testId) {
  const fallbackName = "unknown";
  return safeArtifactPathSegment(testId, fallbackName);
}
394
/**
 * Reads the suite name of a result.
 *
 * @param result Result record.
 * @returns `result.suite` (possibly undefined).
 */
function getSuite(result) {
  const { suite } = result;
  return suite;
}
397
/**
 * Computes the POSIX-style sub-directory holding one result's artifacts:
 * "<suite>/<testId>" when the result has a suite, otherwise "<testId>".
 *
 * Both segments are sanitized before joining.
 *
 * @param result Result record.
 * @returns The relative sub-directory, using "/" separators.
 */
function buildArtifactSubdir(result) {
  const suite = getSuite(result);
  const testSegment = safeTestId(result.testId);
  if (!suite) {
    return path2.posix.join(testSegment);
  }
  return path2.posix.join(safeArtifactPathSegment(suite, "default"), testSegment);
}
406
+ function formatOutputMarkdown(output) {
407
+ return output.map((msg) => `@[${msg.role}]:
408
+ ${String(msg.content ?? "")}`).join("\n\n");
409
+ }
410
/**
 * Extracts a result's input as text.
 *
 * @param result Result record whose `input` is a string or message array.
 * @returns The string itself, the rendered messages for a non-empty array,
 *   or `null` for anything else (missing, empty string, empty array).
 */
function extractInput(result) {
  const { input } = result;
  if (typeof input === "string" && input) {
    return input;
  }
  if (Array.isArray(input) && input.length > 0) {
    return formatOutputMarkdown(input);
  }
  return null;
}
419
/**
 * Expresses a file path relative to the output directory, using "/" as the
 * separator regardless of platform.
 *
 * @param outputDir Base directory.
 * @param filePath Path to make relative.
 * @returns The relative path with forward slashes.
 */
function toRelativeArtifactPath(outputDir, filePath) {
  const relative = path2.relative(outputDir, filePath);
  const segments = relative.split(path2.sep);
  return segments.join("/");
}
422
/**
 * Builds one index manifest record from a result and explicit artifact
 * file paths.
 *
 * All artifact paths are stored relative to `options.outputDir` with "/"
 * separators. `output_path` and `input_path` are undefined when the
 * corresponding option is not given.
 *
 * @param result Result record.
 * @param options `outputDir`, `gradingPath`, `timingPath`, and optionally
 *   `outputPath` / `inputPath`.
 * @returns The index record with snake_case keys.
 */
function buildIndexArtifactEntry(result, options) {
  const { outputDir, gradingPath, timingPath, outputPath, inputPath } = options;
  const relativeTo = (target) => toRelativeArtifactPath(outputDir, target);
  const optionalRelative = (target) => (target ? relativeTo(target) : void 0);
  const scores = result.scores ? toSnakeCaseDeep(result.scores) : void 0;
  return {
    timestamp: result.timestamp,
    test_id: result.testId ?? "unknown",
    suite: getSuite(result),
    category: result.category,
    conversation_id: result.conversationId,
    score: result.score,
    target: result.target ?? "unknown",
    scores,
    execution_status: result.executionStatus,
    error: result.error,
    failure_stage: result.failureStage,
    failure_reason_code: result.failureReasonCode,
    workspace_path: result.workspacePath,
    grading_path: relativeTo(gradingPath),
    timing_path: relativeTo(timingPath),
    output_path: optionalRelative(outputPath),
    input_path: optionalRelative(inputPath)
  };
}
443
/**
 * Builds one index manifest record from a result, deriving the artifact
 * paths from the result's artifact sub-directory.
 *
 * `input_path` is set only when the result has extractable input.
 * `output_path` and `response_path` carry the same value and are set only
 * when `result.output` is a non-empty array.
 *
 * @param result Result record.
 * @returns The index record with snake_case keys and "/"-separated paths.
 */
function buildResultIndexArtifact(result) {
  const artifactSubdir = buildArtifactSubdir(result);
  const inSubdir = (...parts) => path2.posix.join(artifactSubdir, ...parts);
  const hasInput = Boolean(extractInput(result));
  const hasResponse = Array.isArray(result.output) && result.output.length > 0;
  const responsePath = hasResponse ? inSubdir("outputs", "response.md") : void 0;
  const scores = result.scores ? toSnakeCaseDeep(result.scores) : void 0;
  return {
    timestamp: result.timestamp,
    test_id: result.testId ?? "unknown",
    suite: getSuite(result),
    category: result.category,
    conversation_id: result.conversationId,
    score: result.score,
    target: result.target ?? "unknown",
    scores,
    execution_status: result.executionStatus,
    error: result.error,
    failure_stage: result.failureStage,
    failure_reason_code: result.failureReasonCode,
    workspace_path: result.workspacePath,
    grading_path: inSubdir("grading.json"),
    timing_path: inSubdir("timing.json"),
    input_path: hasInput ? inSubdir("input.md") : void 0,
    output_path: responsePath,
    response_path: responsePath
  };
}
468
/**
 * Writes records as JSON Lines, converting keys to snake_case first.
 *
 * Each record becomes one line and the file ends with a newline. An empty
 * record list produces an empty file.
 *
 * @param filePath Destination file.
 * @param records Records to serialize.
 */
async function writeJsonlFile(filePath, records) {
  let content = "";
  if (records.length !== 0) {
    const serialized = records.map((record) => JSON.stringify(toSnakeCaseDeep(record)));
    content = serialized.join("\n") + "\n";
  }
  await writeFile(filePath, content, "utf8");
}
473
/**
 * Converts a snake_case identifier to camelCase.
 *
 * Every underscore that is directly followed by a lowercase ASCII letter is
 * removed and that letter is upper-cased. Other underscores are kept.
 *
 * @param str Identifier to convert.
 * @returns The converted identifier.
 */
function toCamelCase(str) {
  const upperAfterUnderscore = (_match, letter) => letter.toUpperCase();
  return str.replace(/_([a-z])/g, upperAfterUnderscore);
}
476
/**
 * Recursively rewrites object keys with `toCamelCase`.
 *
 * `null`, `undefined` and non-object values are returned as-is. Arrays are
 * mapped element by element. Any other object is rebuilt as a new plain
 * object from its own enumerable string keys.
 *
 * @param obj Value to convert.
 * @returns A converted copy (or the original primitive / nullish value).
 */
function toCamelCaseDeep(obj) {
  if (obj == null) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toCamelCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  for (const key of Object.keys(obj)) {
    converted[toCamelCase(key)] = toCamelCaseDeep(obj[key]);
  }
  return converted;
}
492
/** Execution status values accepted when normalizing parsed result records. */
var EXECUTION_STATUSES = /* @__PURE__ */ new Set(["ok", "quality_failure", "execution_error"]);
497
/**
 * Checks that a value has the shape of an assertion record.
 *
 * @param value Candidate value.
 * @returns `true` for a non-array object with a string `text`, a boolean
 *   `passed`, and an `evidence` that is either undefined or a string.
 */
function isAssertionEntry(value) {
  const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isRecord) {
    return false;
  }
  const { text, passed, evidence } = value;
  if (typeof text !== "string") {
    return false;
  }
  if (typeof passed !== "boolean") {
    return false;
  }
  return evidence === void 0 || typeof evidence === "string";
}
504
/**
 * Checks that a value has the minimal shape of a message.
 *
 * @param value Candidate value.
 * @returns `true` for a non-array object whose `role` is a string.
 */
function isOutputMessage(value) {
  const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  return isRecord ? typeof value.role === "string" : false;
}
511
/**
 * Checks that a value is one of the known execution status strings.
 *
 * @param value Candidate value.
 * @returns `true` when `value` is a string contained in `EXECUTION_STATUSES`.
 */
function isExecutionStatus(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EXECUTION_STATUSES.has(value);
}
514
/**
 * Coerces a parsed (camelCased) manifest record into a result with all
 * core fields present.
 *
 * Every other property is copied through unchanged. Defaults applied when a
 * field is missing or of the wrong type: `timestamp` becomes the Unix epoch
 * in ISO form, `testId` and `target` become "unknown", `score` becomes 0,
 * `executionStatus` becomes "ok". `assertions` and `output` become arrays
 * containing only well-formed entries.
 *
 * @param value Parsed record.
 * @returns The normalized result, or `undefined` when `value` is not a
 *   non-array object.
 */
function normalizeParsedResult(value) {
  const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isRecord) {
    return void 0;
  }
  const { timestamp, testId, score, assertions, target, output, executionStatus } = value;
  const stringOr = (candidate, fallback) => (typeof candidate === "string" ? candidate : fallback);
  return {
    ...value,
    timestamp: stringOr(timestamp, (/* @__PURE__ */ new Date(0)).toISOString()),
    testId: stringOr(testId, "unknown"),
    score: typeof score === "number" ? score : 0,
    assertions: Array.isArray(assertions) ? assertions.filter(isAssertionEntry) : [],
    target: stringOr(target, "unknown"),
    output: Array.isArray(output) ? output.filter(isOutputMessage) : [],
    executionStatus: isExecutionStatus(executionStatus) ? executionStatus : "ok"
  };
}
530
/**
 * Parses JSON Lines manifest text into normalized result records.
 *
 * Keys are converted to camelCase before normalization. Blank lines, lines
 * that fail to parse, and lines whose JSON value is not an object are
 * skipped without raising an error.
 *
 * @param content JSONL text.
 * @returns The normalized results, in file order.
 */
function parseJsonlResults(content) {
  const results = [];
  for (const rawLine of content.split("\n")) {
    const line = rawLine.trim();
    if (line === "") {
      continue;
    }
    let normalized;
    try {
      normalized = normalizeParsedResult(toCamelCaseDeep(JSON.parse(line)));
    } catch {
      // Unparseable lines are dropped; parsing continues with the next line.
      continue;
    }
    if (normalized) {
      results.push(normalized);
    }
  }
  return results;
}
550
/**
 * Reads a JSONL results file and writes the full artifact set for it.
 *
 * @param jsonlPath Path of the JSONL results file.
 * @param outputDir Directory that receives the artifacts.
 * @param options Forwarded to `writeArtifactsFromResults`.
 * @returns Whatever `writeArtifactsFromResults` returns.
 */
async function writeArtifacts(jsonlPath, outputDir, options) {
  const raw = await readFile(jsonlPath, "utf8");
  const parsedResults = parseJsonlResults(raw);
  return writeArtifactsFromResults(parsedResults, outputDir, options);
}
555
/**
 * Serializes the transcripts of all results as JSON Lines text.
 *
 * For each result the input and output messages are concatenated and passed
 * to `toTranscriptJsonLines` together with source, usage and identity
 * metadata; every returned entry becomes one JSON line.
 *
 * A result's `input` may be a string or a message array and `output` may be
 * absent, so both are normalized to arrays first. Spreading them directly
 * would split a string into single characters and throw on `undefined`.
 *
 * @param results Result records.
 * @returns JSONL text ending with a newline, or "" when no lines result.
 */
function buildTranscriptMessageLines(results) {
  const lines = [];
  for (const result of results) {
    const { input, output } = result;
    let inputMessages = [];
    if (typeof input === "string") {
      if (input.length > 0) {
        // NOTE(review): a plain-string input is wrapped as one "user"
        // message; confirm the role expected by toTranscriptJsonLines.
        inputMessages = [{ role: "user", content: input }];
      }
    } else if (Array.isArray(input)) {
      inputMessages = input;
    }
    const outputMessages = Array.isArray(output) ? output : [];
    const transcriptLines = toTranscriptJsonLines(
      {
        messages: [...inputMessages, ...outputMessages],
        source: {
          provider: result.target,
          sessionId: result.conversationId ?? result.testId,
          startedAt: result.timestamp
        },
        tokenUsage: result.tokenUsage,
        durationMs: result.durationMs,
        costUsd: result.costUsd
      },
      {
        testId: result.testId,
        target: result.target
      }
    );
    lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
  }
  return lines.length > 0 ? `${lines.join("\n")}\n` : "";
}
580
/**
 * Writes the per-test artifact files for every result.
 *
 * For each result, inside `<outputDir>/<artifact subdir>`:
 * `grading.json`, `timing.json`, `input.md` (only when the result has
 * extractable input) and `outputs/response.md` (only when the result has a
 * non-empty `output`). Directories are created as needed.
 *
 * @param results Result records.
 * @param outputDir Root directory for the artifacts.
 * @param options Accepted for signature compatibility; not read here.
 */
async function writePerTestArtifacts(results, outputDir, options) {
  const toPrettyJson = (value) => JSON.stringify(value, null, 2) + "\n";
  await mkdir(outputDir, { recursive: true });
  for (const result of results) {
    const gradingJson = toPrettyJson(buildGradingArtifact(result));
    const timingJson = toPrettyJson(buildTimingArtifact([result]));
    const testDir = path2.join(outputDir, buildArtifactSubdir(result));
    await mkdir(testDir, { recursive: true });
    await writeFile(path2.join(testDir, "grading.json"), gradingJson, "utf8");
    await writeFile(path2.join(testDir, "timing.json"), timingJson, "utf8");

    const inputText = extractInput(result);
    if (inputText) {
      await writeFile(path2.join(testDir, "input.md"), inputText, "utf8");
    }

    const messages = result.output;
    if (!messages || !(messages.length > 0)) {
      continue;
    }
    const outputsDir = path2.join(testDir, "outputs");
    await mkdir(outputsDir, { recursive: true });
    await writeFile(path2.join(outputsDir, "response.md"), formatOutputMarkdown(messages), "utf8");
  }
}
615
/**
 * Writes the complete artifact set of a run into `outputDir`.
 *
 * Output: the per-test files (via `writePerTestArtifacts`), the run-level
 * `timing.json` and `benchmark.json`, the index manifest with one record
 * per result, and `transcript.jsonl`.
 *
 * @param results Result records.
 * @param outputDir Directory that receives the artifacts; created if needed.
 * @param options Optional `evalFile` and `experiment`. The experiment is
 *   also stamped on every index record.
 * @returns `testArtifactDir` (same as `outputDir`) and the paths of the
 *   timing, benchmark and index files.
 */
async function writeArtifactsFromResults(results, outputDir, options) {
  const toPrettyJson = (value) => JSON.stringify(value, null, 2) + "\n";
  const testArtifactDir = outputDir;
  const timingPath = path2.join(outputDir, "timing.json");
  const benchmarkPath = path2.join(outputDir, "benchmark.json");
  const indexPath = path2.join(outputDir, RESULT_INDEX_FILENAME);
  await mkdir(outputDir, { recursive: true });

  // The per-test files are exactly what writePerTestArtifacts produces, so
  // delegate instead of keeping a second copy of that loop in sync.
  await writePerTestArtifacts(results, outputDir, options);

  const indexRecords = results.map((result) => ({
    ...buildResultIndexArtifact(result),
    experiment: options?.experiment
  }));

  await writeFile(timingPath, toPrettyJson(buildTimingArtifact(results)), "utf8");
  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
  await writeFile(benchmarkPath, toPrettyJson(benchmark), "utf8");
  await writeJsonlFile(indexPath, indexRecords);
  const transcriptPath = path2.join(outputDir, "transcript.jsonl");
  await writeFile(transcriptPath, buildTranscriptMessageLines(results), "utf8");
  return { testArtifactDir, timingPath, benchmarkPath, indexPath };
}
663
+
664
+ export {
665
+ toSnakeCaseDeep,
666
+ RESULT_INDEX_FILENAME,
667
+ RESULT_RUNS_DIRNAME,
668
+ normalizeExperimentName,
669
+ buildDefaultRunDir,
670
+ resolveRunIndexPath,
671
+ resolveExistingRunPrimaryPath,
672
+ isDirectoryPath,
673
+ resolveWorkspaceOrFilePath,
674
+ resolveRunManifestPath,
675
+ buildTestTargetKey,
676
+ deduplicateByTestIdTarget,
677
+ aggregateRunDir,
678
+ buildGradingArtifact,
679
+ buildTimingArtifact,
680
+ buildBenchmarkArtifact,
681
+ buildAggregateGradingArtifact,
682
+ buildIndexArtifactEntry,
683
+ buildResultIndexArtifact,
684
+ parseJsonlResults,
685
+ writeArtifacts,
686
+ writePerTestArtifacts,
687
+ writeArtifactsFromResults
688
+ };
689
+ //# sourceMappingURL=chunk-HBDOJJFY.js.map