agentv 4.9.1 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/dist/{chunk-XOSNETAV.js → chunk-BAUNAXHT.js} +1 -1
  2. package/dist/chunk-BPGJ4HBU.js +183 -0
  3. package/dist/chunk-BPGJ4HBU.js.map +1 -0
  4. package/dist/{chunk-2IKIOZ4Z.js → chunk-FH24D7XW.js} +1090 -303
  5. package/dist/chunk-FH24D7XW.js.map +1 -0
  6. package/dist/{chunk-RHAXSXIY.js → chunk-FQGY6QXQ.js} +1360 -653
  7. package/dist/chunk-FQGY6QXQ.js.map +1 -0
  8. package/dist/chunk-NPVGBFF6.js +151 -0
  9. package/dist/chunk-NPVGBFF6.js.map +1 -0
  10. package/dist/{chunk-2JW4HVCX.js → chunk-QRYAMYT7.js} +1120 -731
  11. package/dist/chunk-QRYAMYT7.js.map +1 -0
  12. package/dist/cli.js +6 -4
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{dist-DDFE3W2A.js → dist-HNSXNRVK.js} +36 -3
  15. package/dist/docker-workspace-RPPXBT27-B4AQHVWA.js +11 -0
  16. package/dist/{esm-CZAWIY6F.js → esm-UYZ3HJBU.js} +2 -2
  17. package/dist/esm-UYZ3HJBU.js.map +1 -0
  18. package/dist/exec-AR6JUUN5-6MBPURPR.js +11 -0
  19. package/dist/exec-AR6JUUN5-6MBPURPR.js.map +1 -0
  20. package/dist/index.js +6 -4
  21. package/dist/{interactive-VMDBXBRL.js → interactive-SIOZB665.js} +6 -4
  22. package/dist/{interactive-VMDBXBRL.js.map → interactive-SIOZB665.js.map} +1 -1
  23. package/dist/{src-ML4D2MC2.js → src-PXDA7QIS.js} +2 -2
  24. package/dist/studio/assets/index-Bi-KHfNm.js +65 -0
  25. package/dist/studio/assets/index-D_j-w4UO.css +1 -0
  26. package/dist/studio/assets/{index-DcwjOyrk.js → index-VyDFrnoK.js} +1 -1
  27. package/dist/studio/index.html +2 -2
  28. package/package.json +1 -1
  29. package/dist/chunk-2IKIOZ4Z.js.map +0 -1
  30. package/dist/chunk-2JW4HVCX.js.map +0 -1
  31. package/dist/chunk-RHAXSXIY.js.map +0 -1
  32. package/dist/studio/assets/index-DHxVz6M9.css +0 -1
  33. package/dist/studio/assets/index-Y5InSvcS.js +0 -65
  34. /package/dist/{chunk-XOSNETAV.js.map → chunk-BAUNAXHT.js.map} +0 -0
  35. /package/dist/{dist-DDFE3W2A.js.map → dist-HNSXNRVK.js.map} +0 -0
  36. /package/dist/{esm-CZAWIY6F.js.map → docker-workspace-RPPXBT27-B4AQHVWA.js.map} +0 -0
  37. /package/dist/{src-ML4D2MC2.js.map → src-PXDA7QIS.js.map} +0 -0
@@ -9,10 +9,14 @@ import {
9
9
  ResponseCache,
10
10
  buildDirectoryChain,
11
11
  buildSearchRoots,
12
+ commitAndPushResultsBranch,
13
+ createDraftResultsPr,
12
14
  deriveCategory,
15
+ directorySizeBytes,
13
16
  ensureVSCodeSubagents,
14
17
  findDeprecatedCamelCaseTargetWarnings,
15
18
  findGitRoot,
19
+ getResultsRepoStatus,
16
20
  interpolateEnv,
17
21
  isEvaluatorKind,
18
22
  listTargetNames,
@@ -21,23 +25,28 @@ import {
21
25
  loadTestSuite,
22
26
  loadTsConfig,
23
27
  normalizeLineEndings,
28
+ prepareResultsRepoBranch,
24
29
  readTargetDefinitions,
25
30
  readTestSuiteMetadata,
26
31
  resolveFileReference,
32
+ resolveResultsRepoRunsDir,
27
33
  resolveTargetDefinition,
28
34
  runEvaluation,
29
35
  shouldEnableCache,
30
36
  shouldSkipCacheForTemperature,
37
+ stageResultsArtifacts,
31
38
  subscribeToCodexLogEntries,
32
39
  subscribeToCopilotCliLogEntries,
33
40
  subscribeToCopilotSdkLogEntries,
34
- subscribeToPiLogEntries
35
- } from "./chunk-RHAXSXIY.js";
41
+ subscribeToPiLogEntries,
42
+ syncResultsRepo,
43
+ toCamelCaseDeep
44
+ } from "./chunk-FQGY6QXQ.js";
36
45
 
37
46
  // package.json
38
47
  var package_default = {
39
48
  name: "agentv",
40
- version: "4.9.1",
49
+ version: "4.11.0",
41
50
  description: "CLI entry point for AgentV",
42
51
  type: "module",
43
52
  repository: {
@@ -249,7 +258,7 @@ async function discoverTargetsFile(options) {
249
258
  // src/commands/eval/run-eval.ts
250
259
  import { constants as constants4, mkdirSync } from "node:fs";
251
260
  import { access as access4 } from "node:fs/promises";
252
- import path15 from "node:path";
261
+ import path17 from "node:path";
253
262
  import { pathToFileURL } from "node:url";
254
263
 
255
264
  // src/version-check.ts
@@ -306,45 +315,43 @@ async function promptContinue() {
306
315
  return confirm({ message: "Continue anyway?", default: false });
307
316
  }
308
317
 
309
- // src/commands/eval/artifact-writer.ts
310
- import { mkdir, readFile, writeFile } from "node:fs/promises";
311
- import path4 from "node:path";
318
+ // src/commands/results/remote.ts
319
+ import path6 from "node:path";
312
320
 
313
- // src/utils/case-conversion.ts
314
- function toSnakeCase(str) {
315
- if (/^[A-Z]/.test(str)) {
316
- return str;
317
- }
318
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
319
- }
320
- function toSnakeCaseDeep(obj) {
321
- if (obj === null || obj === void 0) {
322
- return obj;
323
- }
324
- if (Array.isArray(obj)) {
325
- return obj.map((item) => toSnakeCaseDeep(item));
326
- }
327
- if (typeof obj === "object") {
328
- const result = {};
329
- for (const [key, value] of Object.entries(obj)) {
330
- const snakeKey = toSnakeCase(key);
331
- result[snakeKey] = toSnakeCaseDeep(value);
332
- }
333
- return result;
334
- }
335
- return obj;
336
- }
321
+ // src/commands/inspect/utils.ts
322
+ import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
323
+ import path5 from "node:path";
337
324
 
338
325
  // src/commands/eval/result-layout.ts
339
326
  import { existsSync, statSync } from "node:fs";
340
327
  import path3 from "node:path";
341
328
  var RESULT_INDEX_FILENAME = "index.jsonl";
342
329
  var RESULT_RUNS_DIRNAME = "runs";
330
+ var DEFAULT_EXPERIMENT_NAME = "default";
331
+ function normalizeExperimentName(experiment) {
332
+ const trimmed = experiment?.trim();
333
+ if (!trimmed) {
334
+ return DEFAULT_EXPERIMENT_NAME;
335
+ }
336
+ if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
337
+ throw new Error(
338
+ `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
339
+ );
340
+ }
341
+ return trimmed;
342
+ }
343
343
  function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
344
344
  return timestamp.toISOString().replace(/[:.]/g, "-");
345
345
  }
346
- function buildDefaultRunDir(cwd) {
347
- return path3.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, createRunDirName());
346
+ function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
347
+ return path3.join(
348
+ cwd,
349
+ ".agentv",
350
+ "results",
351
+ RESULT_RUNS_DIRNAME,
352
+ normalizeExperimentName(experiment),
353
+ createRunDirName(timestamp)
354
+ );
348
355
  }
349
356
  function resolveRunIndexPath(runDir) {
350
357
  return path3.join(runDir, RESULT_INDEX_FILENAME);
@@ -366,26 +373,794 @@ function isDirectoryPath(filePath) {
366
373
  return false;
367
374
  }
368
375
  }
369
- function resolveWorkspaceOrFilePath(filePath) {
370
- if (!isDirectoryPath(filePath)) {
371
- return filePath;
372
- }
373
- const existing = resolveExistingRunPrimaryPath(filePath);
374
- if (!existing) {
375
- throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
376
+ function resolveWorkspaceOrFilePath(filePath) {
377
+ if (!isDirectoryPath(filePath)) {
378
+ return filePath;
379
+ }
380
+ const existing = resolveExistingRunPrimaryPath(filePath);
381
+ if (!existing) {
382
+ throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
383
+ }
384
+ return existing;
385
+ }
386
+ function resolveRunManifestPath(filePath) {
387
+ if (isDirectoryPath(filePath)) {
388
+ return resolveWorkspaceOrFilePath(filePath);
389
+ }
390
+ if (!isRunManifestPath(filePath)) {
391
+ throw new Error(
392
+ `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
393
+ );
394
+ }
395
+ return filePath;
396
+ }
397
+
398
+ // src/commands/results/manifest.ts
399
+ import { existsSync as existsSync2, readFileSync } from "node:fs";
400
+ import path4 from "node:path";
401
+ function parseJsonlLines(content) {
402
+ return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
403
+ }
404
+ function parseMarkdownMessages(content) {
405
+ const trimmed = content.trim();
406
+ if (!trimmed.startsWith("@[")) {
407
+ return [];
408
+ }
409
+ const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
410
+ return matches.map((match) => ({
411
+ role: match[1],
412
+ content: match[2].trimEnd()
413
+ }));
414
+ }
415
+ function readOptionalText(baseDir, relativePath) {
416
+ if (!relativePath) {
417
+ return void 0;
418
+ }
419
+ const absolutePath = path4.join(baseDir, relativePath);
420
+ if (!existsSync2(absolutePath)) {
421
+ return void 0;
422
+ }
423
+ return readFileSync(absolutePath, "utf8");
424
+ }
425
+ function readOptionalJson(baseDir, relativePath) {
426
+ const text = readOptionalText(baseDir, relativePath);
427
+ if (!text) {
428
+ return void 0;
429
+ }
430
+ try {
431
+ return JSON.parse(text);
432
+ } catch {
433
+ return void 0;
434
+ }
435
+ }
436
+ function hydrateInput(baseDir, record) {
437
+ const inputText = readOptionalText(baseDir, record.input_path);
438
+ if (!inputText) {
439
+ return void 0;
440
+ }
441
+ const messages = parseMarkdownMessages(inputText);
442
+ return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
443
+ }
444
+ function hydrateOutput(baseDir, record) {
445
+ const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
446
+ if (!responseText) {
447
+ return void 0;
448
+ }
449
+ const messages = parseMarkdownMessages(responseText);
450
+ if (messages.length > 0) {
451
+ return messages.map((message) => ({
452
+ role: message.role,
453
+ content: message.content
454
+ }));
455
+ }
456
+ return [{ role: "assistant", content: responseText.trimEnd() }];
457
+ }
458
+ function hydrateManifestRecord(baseDir, record) {
459
+ const grading = readOptionalJson(baseDir, record.grading_path);
460
+ const timing = readOptionalJson(baseDir, record.timing_path);
461
+ const testId = record.test_id ?? "unknown";
462
+ return {
463
+ timestamp: record.timestamp,
464
+ testId,
465
+ suite: record.suite,
466
+ category: record.category,
467
+ target: record.target,
468
+ score: record.score,
469
+ executionStatus: record.execution_status,
470
+ error: record.error,
471
+ assertions: grading?.assertions.map((assertion) => ({
472
+ text: assertion.text,
473
+ passed: assertion.passed,
474
+ evidence: assertion.evidence
475
+ })),
476
+ scores: grading?.evaluators?.map((evaluator) => ({
477
+ name: evaluator.name,
478
+ type: evaluator.type,
479
+ score: evaluator.score,
480
+ assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
481
+ text: String(assertion.text ?? ""),
482
+ passed: Boolean(assertion.passed),
483
+ evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
484
+ })) : void 0,
485
+ weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
486
+ verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
487
+ details: evaluator.details
488
+ })) ?? record.scores,
489
+ tokenUsage: timing?.token_usage ? {
490
+ input: timing.token_usage.input,
491
+ output: timing.token_usage.output,
492
+ reasoning: timing.token_usage.reasoning
493
+ } : record.token_usage,
494
+ durationMs: timing?.duration_ms ?? record.duration_ms,
495
+ costUsd: record.cost_usd,
496
+ input: hydrateInput(baseDir, record),
497
+ output: hydrateOutput(baseDir, record)
498
+ };
499
+ }
500
+ function parseResultManifest(content) {
501
+ return parseJsonlLines(content);
502
+ }
503
+ function resolveResultSourcePath(source, cwd) {
504
+ const resolved = path4.isAbsolute(source) ? source : path4.resolve(cwd ?? process.cwd(), source);
505
+ if (isDirectoryPath(resolved) || path4.basename(resolved) === RESULT_INDEX_FILENAME) {
506
+ return resolveRunManifestPath(resolved);
507
+ }
508
+ return resolved;
509
+ }
510
+ function loadManifestResults(sourceFile) {
511
+ const resolvedSourceFile = resolveRunManifestPath(sourceFile);
512
+ const content = readFileSync(resolvedSourceFile, "utf8");
513
+ const records = parseResultManifest(content);
514
+ const baseDir = path4.dirname(resolvedSourceFile);
515
+ return records.map((record) => hydrateManifestRecord(baseDir, record));
516
+ }
517
+ function loadLightweightResults(sourceFile) {
518
+ const resolvedSourceFile = resolveRunManifestPath(sourceFile);
519
+ const content = readFileSync(resolvedSourceFile, "utf8");
520
+ return parseResultManifest(content).map((record) => ({
521
+ testId: record.test_id ?? "unknown",
522
+ suite: record.suite,
523
+ target: record.target,
524
+ experiment: record.experiment,
525
+ score: record.score,
526
+ scores: record.scores,
527
+ executionStatus: record.execution_status,
528
+ error: record.error,
529
+ timestamp: record.timestamp
530
+ }));
531
+ }
532
+
533
+ // src/commands/inspect/utils.ts
534
+ var colors = {
535
+ reset: "\x1B[0m",
536
+ bold: "\x1B[1m",
537
+ dim: "\x1B[2m",
538
+ green: "\x1B[32m",
539
+ red: "\x1B[31m",
540
+ yellow: "\x1B[33m",
541
+ cyan: "\x1B[36m",
542
+ gray: "\x1B[90m"
543
+ };
544
+ var noColor = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
545
+ var c = noColor ? Object.fromEntries(Object.keys(colors).map((k) => [k, ""])) : colors;
546
+ var ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
547
+ function stripAnsi(str) {
548
+ return str.replace(ansiPattern, "");
549
+ }
550
+ function padRight(str, len) {
551
+ const plainLen = stripAnsi(str).length;
552
+ return str + " ".repeat(Math.max(0, len - plainLen));
553
+ }
554
+ function padLeft(str, len) {
555
+ const plainLen = stripAnsi(str).length;
556
+ return " ".repeat(Math.max(0, len - plainLen)) + str;
557
+ }
558
+ function loadResultFile(filePath) {
559
+ const resolvedFilePath = resolveTraceResultPath(filePath);
560
+ if (path5.extname(resolvedFilePath) === ".json") {
561
+ return loadOtlpTraceFile(resolvedFilePath);
562
+ }
563
+ if (path5.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
564
+ return loadManifestAsRawResults(resolvedFilePath);
565
+ }
566
+ return loadJsonlRecords(resolvedFilePath);
567
+ }
568
+ function resolveTraceResultPath(filePath) {
569
+ return resolveWorkspaceOrFilePath(filePath);
570
+ }
571
+ function loadJsonlRecords(filePath) {
572
+ const content = readFileSync2(filePath, "utf8");
573
+ const lines = content.trim().split("\n").filter((line) => line.trim());
574
+ return lines.map((line, i) => {
575
+ const record = JSON.parse(line);
576
+ if (typeof record.score !== "number") {
577
+ throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`);
578
+ }
579
+ return record;
580
+ });
581
+ }
582
+ function loadManifestAsRawResults(filePath) {
583
+ return loadManifestResults(filePath).map(toRawResult);
584
+ }
585
+ function toRawResult(result) {
586
+ return {
587
+ timestamp: result.timestamp,
588
+ test_id: result.testId,
589
+ suite: result.suite,
590
+ conversation_id: result.conversationId,
591
+ score: result.score,
592
+ assertions: result.assertions?.map((assertion) => ({
593
+ text: assertion.text,
594
+ passed: assertion.passed,
595
+ evidence: assertion.evidence
596
+ })),
597
+ target: result.target,
598
+ error: result.error,
599
+ scores: result.scores?.map((score) => ({
600
+ name: score.name,
601
+ type: score.type,
602
+ score: score.score,
603
+ assertions: score.assertions?.map((assertion) => ({
604
+ text: assertion.text,
605
+ passed: assertion.passed,
606
+ evidence: assertion.evidence
607
+ })),
608
+ weight: score.weight
609
+ })),
610
+ token_usage: result.tokenUsage ? {
611
+ input: result.tokenUsage.input,
612
+ output: result.tokenUsage.output,
613
+ cached: result.tokenUsage.cached
614
+ } : void 0,
615
+ cost_usd: result.costUsd,
616
+ duration_ms: result.durationMs,
617
+ start_time: result.startTime,
618
+ end_time: result.endTime,
619
+ input: result.input,
620
+ output: result.output,
621
+ file_changes: result.fileChanges
622
+ };
623
+ }
624
+ function loadOtlpTraceFile(filePath) {
625
+ const parsed = JSON.parse(readFileSync2(filePath, "utf8"));
626
+ const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
627
+ if (!spans || spans.length === 0) {
628
+ return [];
629
+ }
630
+ const spanMap = /* @__PURE__ */ new Map();
631
+ const childMap = /* @__PURE__ */ new Map();
632
+ for (const span of spans) {
633
+ if (!span.spanId) continue;
634
+ spanMap.set(span.spanId, span);
635
+ if (span.parentSpanId) {
636
+ const siblings = childMap.get(span.parentSpanId) ?? [];
637
+ siblings.push(span);
638
+ childMap.set(span.parentSpanId, siblings);
639
+ }
640
+ }
641
+ const roots = spans.filter((span) => !span.parentSpanId || !spanMap.has(span.parentSpanId));
642
+ const supportedRoots = roots.filter(isAgentvEvalRoot);
643
+ const candidateRoots = supportedRoots.length > 0 ? supportedRoots : roots;
644
+ return candidateRoots.map((root, index) => {
645
+ const descendants = collectChildSpans(root.spanId, childMap);
646
+ const rootAttrs = parseOtlpAttributes(root.attributes);
647
+ const parsedDescendants = descendants.map((span) => ({
648
+ ...span,
649
+ parsedAttributes: parseOtlpAttributes(span.attributes)
650
+ }));
651
+ const toolSpans = parsedDescendants.filter(
652
+ (span) => typeof span.parsedAttributes.gen_ai_tool_name === "string"
653
+ );
654
+ const llmSpans = parsedDescendants.filter(
655
+ (span) => span.parsedAttributes.gen_ai_operation_name === "chat" || typeof span.name === "string" && span.name.startsWith("chat ")
656
+ );
657
+ const tokenUsage = descendants.reduce(
658
+ (acc, span) => {
659
+ const attrs = parseOtlpAttributes(span.attributes);
660
+ acc.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0;
661
+ acc.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0;
662
+ const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens);
663
+ if (cached !== void 0 && cached > 0) {
664
+ acc.cached = (acc.cached ?? 0) + cached;
665
+ }
666
+ return acc;
667
+ },
668
+ { input: 0, output: 0, cached: void 0 }
669
+ );
670
+ const traceSummary = buildDerivedTraceSummary({
671
+ trace: {
672
+ event_count: numberAttr(rootAttrs.agentv_trace_event_count) ?? (toolSpans.length > 0 ? toolSpans.length : void 0),
673
+ tool_calls: countRawSpanNames(
674
+ toolSpans.map((span) => ({
675
+ type: "tool",
676
+ name: String(span.parsedAttributes.gen_ai_tool_name)
677
+ }))
678
+ ),
679
+ error_count: descendants.filter((span) => span.status?.code === 2).length || void 0,
680
+ llm_call_count: numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? (llmSpans.length > 0 ? llmSpans.length : void 0)
681
+ },
682
+ spans: [
683
+ ...llmSpans.map((span) => ({
684
+ type: "llm",
685
+ name: span.name ?? "chat",
686
+ duration_ms: durationFromSpan(span)
687
+ })),
688
+ ...toolSpans.map((span) => ({
689
+ type: "tool",
690
+ name: String(span.parsedAttributes.gen_ai_tool_name),
691
+ duration_ms: durationFromSpan(span)
692
+ }))
693
+ ],
694
+ duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root),
695
+ cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd),
696
+ token_usage: tokenUsage.input || tokenUsage.output || tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_input) || numberAttr(rootAttrs.agentv_trace_token_output) || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
697
+ input: tokenUsage.input || numberAttr(rootAttrs.agentv_trace_token_input) || 0,
698
+ output: tokenUsage.output || numberAttr(rootAttrs.agentv_trace_token_output) || 0,
699
+ ...tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
700
+ cached: tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) || 0
701
+ } : {}
702
+ } : void 0
703
+ });
704
+ const score = numberAttr(rootAttrs.agentv_score);
705
+ if (score === void 0) {
706
+ throw new Error(
707
+ `Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`
708
+ );
709
+ }
710
+ return {
711
+ test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
712
+ suite: stringAttr(rootAttrs.agentv_suite),
713
+ target: stringAttr(rootAttrs.agentv_target),
714
+ score,
715
+ error: root.status?.code === 2 ? root.status.message : void 0,
716
+ cost_usd: traceSummary?.cost_usd,
717
+ duration_ms: traceSummary?.duration_ms,
718
+ token_usage: traceSummary?.token_usage,
719
+ trace: traceSummary ? {
720
+ event_count: traceSummary.event_count,
721
+ tool_calls: traceSummary.tool_calls,
722
+ error_count: traceSummary.error_count,
723
+ tool_durations: traceSummary.tool_durations,
724
+ llm_call_count: traceSummary.llm_call_count,
725
+ token_usage: traceSummary.token_usage,
726
+ cost_usd: traceSummary.cost_usd,
727
+ duration_ms: traceSummary.duration_ms
728
+ } : void 0,
729
+ spans: traceSummary?.spans,
730
+ output: stringAttr(rootAttrs.agentv_output_text),
731
+ scores: root.events?.filter(
732
+ (event) => event.name?.startsWith("agentv.grader.") || event.name?.startsWith("agentv.evaluator.")
733
+ ).map((event) => {
734
+ const attrs = parseOtlpAttributes(event.attributes);
735
+ const name = event.name?.replace(/^agentv\.grader\./, "").replace(/^agentv\.evaluator\./, "") ?? "unknown";
736
+ return {
737
+ name,
738
+ type: stringAttr(attrs.agentv_grader_type) ?? stringAttr(attrs.agentv_evaluator_type) ?? "unknown",
739
+ score: numberAttr(attrs.agentv_grader_score) ?? numberAttr(attrs.agentv_evaluator_score) ?? 0
740
+ };
741
+ })
742
+ };
743
+ });
744
+ }
745
+ function isAgentvEvalRoot(span) {
746
+ const attrs = parseOtlpAttributes(span.attributes);
747
+ return span.name === "agentv.eval" || numberAttr(attrs.agentv_score) !== void 0 || typeof stringAttr(attrs.agentv_test_id) === "string";
748
+ }
749
+ function collectChildSpans(spanId, childMap) {
750
+ if (!spanId) return [];
751
+ const direct = childMap.get(spanId) ?? [];
752
+ const all = [...direct];
753
+ for (const child of direct) {
754
+ all.push(...collectChildSpans(child.spanId, childMap));
755
+ }
756
+ return all;
757
+ }
758
+ function parseOtlpAttributes(attributes) {
759
+ const parsed = {};
760
+ for (const attribute of attributes ?? []) {
761
+ parsed[attribute.key.replace(/\./g, "_")] = parseOtlpValue(attribute.value);
762
+ }
763
+ return parsed;
764
+ }
765
+ function parseOtlpValue(value) {
766
+ if (!value) return void 0;
767
+ if ("stringValue" in value && value.stringValue !== void 0) return value.stringValue;
768
+ if ("intValue" in value && value.intValue !== void 0) return Number(value.intValue);
769
+ if ("doubleValue" in value && value.doubleValue !== void 0) return value.doubleValue;
770
+ if ("boolValue" in value && value.boolValue !== void 0) return value.boolValue;
771
+ if ("arrayValue" in value)
772
+ return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry));
773
+ return void 0;
774
+ }
775
+ function durationFromSpan(span) {
776
+ const start = Number(span.startTimeUnixNano);
777
+ const end = Number(span.endTimeUnixNano);
778
+ if (!Number.isFinite(start) || !Number.isFinite(end)) return void 0;
779
+ return Math.round((end - start) / 1e6);
780
+ }
781
+ function stringAttr(value) {
782
+ return typeof value === "string" ? value : void 0;
783
+ }
784
+ function numberAttr(value) {
785
+ return typeof value === "number" && Number.isFinite(value) ? value : void 0;
786
+ }
787
+ function buildDerivedTraceSummary(result) {
788
+ const toolSpans = (result.spans ?? []).filter((span) => span.type === "tool");
789
+ const llmSpans = (result.spans ?? []).filter((span) => span.type === "llm");
790
+ const toolCalls = result.trace?.tool_calls ?? countRawSpanNames(toolSpans);
791
+ const toolDurations = result.trace?.tool_durations ?? groupRawSpanDurations(toolSpans);
792
+ const hasSpanData = (result.spans?.length ?? 0) > 0;
793
+ const eventCount = result.trace?.event_count ?? (hasSpanData ? toolSpans.length : void 0);
794
+ const llmCallCount = result.trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : void 0);
795
+ if (!result.trace && !result.spans?.length && result.token_usage === void 0 && result.cost_usd === void 0 && result.duration_ms === void 0) {
796
+ return void 0;
797
+ }
798
+ return {
799
+ event_count: eventCount,
800
+ tool_calls: toolCalls,
801
+ error_count: result.trace?.error_count,
802
+ tool_durations: toolDurations,
803
+ llm_call_count: llmCallCount,
804
+ token_usage: result.trace?.token_usage ?? result.token_usage,
805
+ cost_usd: result.trace?.cost_usd ?? result.cost_usd,
806
+ duration_ms: result.trace?.duration_ms ?? result.duration_ms,
807
+ spans: result.spans
808
+ };
809
+ }
810
+ function countRawSpanNames(spans) {
811
+ const counts = {};
812
+ for (const span of spans) {
813
+ counts[span.name] = (counts[span.name] ?? 0) + 1;
814
+ }
815
+ return Object.keys(counts).length > 0 ? counts : void 0;
816
+ }
817
+ function groupRawSpanDurations(spans) {
818
+ const grouped = {};
819
+ for (const span of spans) {
820
+ if (span.duration_ms === void 0) continue;
821
+ const existing = grouped[span.name] ?? [];
822
+ existing.push(span.duration_ms);
823
+ grouped[span.name] = existing;
824
+ }
825
+ return Object.keys(grouped).length > 0 ? grouped : void 0;
826
+ }
827
+ function getTraceSummary(result) {
828
+ const derived = buildDerivedTraceSummary(result);
829
+ if (!derived) return void 0;
830
+ const { spans: _spans, ...trace } = derived;
831
+ return trace;
832
+ }
833
+ function getTraceSpans(result) {
834
+ return buildDerivedTraceSummary(result)?.spans ?? [];
835
+ }
836
+ function toTraceSummary(result) {
837
+ const rawTrace = getTraceSummary(result);
838
+ if (!rawTrace) return void 0;
839
+ return toCamelCaseDeep(rawTrace);
840
+ }
841
+ function buildRunId(relativeRunPath) {
842
+ const normalized = relativeRunPath.split(path5.sep).join("/");
843
+ const segments = normalized.split("/").filter(Boolean);
844
+ if (segments.length >= 2) {
845
+ const experiment = segments.slice(0, -1).join("/");
846
+ const timestamp = segments.at(-1);
847
+ if (experiment === "default") {
848
+ return timestamp ?? normalized;
849
+ }
850
+ return `${experiment}::${timestamp}`;
851
+ }
852
+ return segments[0];
853
+ }
854
+ function collectRunManifestPaths(runsDir, currentDir, files) {
855
+ const primaryPath = resolveExistingRunPrimaryPath(currentDir);
856
+ if (primaryPath) {
857
+ const relativeRunPath = path5.relative(runsDir, currentDir);
858
+ files.push({
859
+ filePath: primaryPath,
860
+ displayName: path5.basename(currentDir),
861
+ runId: buildRunId(relativeRunPath)
862
+ });
863
+ return;
864
+ }
865
+ const entries = readdirSync(currentDir, { withFileTypes: true });
866
+ for (const entry of entries) {
867
+ if (entry.isDirectory()) {
868
+ collectRunManifestPaths(runsDir, path5.join(currentDir, entry.name), files);
869
+ }
870
+ }
871
+ }
872
+ function listResultFilesFromRunsDir(runsDir, limit) {
873
+ const files = [];
874
+ try {
875
+ const entries = readdirSync(runsDir, { withFileTypes: true });
876
+ for (const entry of entries) {
877
+ if (entry.isDirectory()) {
878
+ collectRunManifestPaths(runsDir, path5.join(runsDir, entry.name), files);
879
+ }
880
+ }
881
+ } catch {
882
+ }
883
+ files.sort((a, b) => b.displayName.localeCompare(a.displayName));
884
+ const limited = limit !== void 0 && limit > 0 ? files.slice(0, limit) : files;
885
+ const metas = [];
886
+ for (const { filePath, displayName, runId } of limited) {
887
+ try {
888
+ const fileStat = statSync2(filePath);
889
+ const results = loadResultFile(filePath);
890
+ const testCount = results.length;
891
+ const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
892
+ const passRate = testCount > 0 ? passCount / testCount : 0;
893
+ const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0;
894
+ const filenameTimestamp = extractTimestampFromFilename(displayName);
895
+ const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? "unknown";
896
+ metas.push({
897
+ path: filePath,
898
+ filename: runId,
899
+ displayName,
900
+ timestamp,
901
+ testCount,
902
+ passRate,
903
+ avgScore,
904
+ sizeBytes: fileStat.size
905
+ });
906
+ } catch {
907
+ }
908
+ }
909
+ return metas;
910
+ }
911
+ function listResultFiles(cwd, limit) {
912
+ return listResultFilesFromRunsDir(
913
+ path5.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
914
+ limit
915
+ );
916
+ }
917
+ function extractTimestampFromFilename(filename) {
918
+ const match = filename.match(
919
+ /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/
920
+ );
921
+ if (!match) return void 0;
922
+ return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ":$1:$2.$3Z");
923
+ }
924
+ function formatNumber(n) {
925
+ return n.toLocaleString();
926
+ }
927
+ function formatDuration(ms) {
928
+ if (ms < 1e3) return `${Math.round(ms)}ms`;
929
+ if (ms < 6e4) return `${(ms / 1e3).toFixed(1)}s`;
930
+ const minutes = Math.floor(ms / 6e4);
931
+ const seconds = (ms % 6e4 / 1e3).toFixed(0);
932
+ return `${minutes}m${seconds}s`;
933
+ }
934
+ function formatCost(usd) {
935
+ if (usd < 0.01) return `$${usd.toFixed(4)}`;
936
+ return `$${usd.toFixed(3)}`;
937
+ }
938
+ function formatSize(bytes) {
939
+ if (bytes < 1024) return `${bytes}B`;
940
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}KB`;
941
+ return `${(bytes / (1024 * 1024)).toFixed(1)}MB`;
942
+ }
943
+ function formatScore(score) {
944
+ return `${(score * 100).toFixed(0)}%`;
945
+ }
946
+
947
+ // src/commands/results/remote.ts
948
+ var REMOTE_RUN_PREFIX = "remote::";
949
+ var SIZE_WARNING_BYTES = 10 * 1024 * 1024;
950
+ function getStatusMessage(error) {
951
+ return error instanceof Error ? error.message : String(error);
952
+ }
953
+ function normalizeResultsExportConfig(config) {
954
+ return {
955
+ repo: config.repo,
956
+ path: config.path,
957
+ auto_push: config.auto_push === true,
958
+ branch_prefix: config.branch_prefix?.trim() || "eval-results"
959
+ };
960
+ }
961
+ function slugify(value) {
962
+ return value.trim().replace(/[^A-Za-z0-9._/-]+/g, "-").replace(/\/+/g, "/").replace(/^-+|-+$/g, "").slice(0, 120);
963
+ }
964
+ function getRelativeRunPath(cwd, runDir) {
965
+ const relative = path6.relative(path6.join(cwd, ".agentv", "results", "runs"), runDir);
966
+ if (!relative.startsWith("..") && !path6.isAbsolute(relative)) {
967
+ return relative;
968
+ }
969
+ const experiment = path6.basename(path6.dirname(runDir));
970
+ const runName = path6.basename(runDir);
971
+ return experiment && experiment !== runName ? path6.join(experiment, runName) : runName;
972
+ }
973
+ function buildBranchName(config, payload) {
974
+ const timestamp = path6.basename(payload.run_dir);
975
+ const evalStem = payload.test_files.length === 1 ? path6.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
976
+ const experiment = slugify(payload.experiment ?? "default");
977
+ const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
978
+ return `${config.branch_prefix}/${branchLeaf}`;
979
+ }
980
/**
 * Build the one-line commit / PR title summarizing a run.
 *
 * @param {{results: Array<{score: number}>, experiment?: string}} payload
 * @returns {string} `feat(results): <experiment> - <passed>/<total> PASS (<avg>)`
 *   where a result passes when its score is at least `DEFAULT_THRESHOLD` and
 *   the average is 0 for an empty result list.
 */
function buildCommitTitle(payload) {
  const { results } = payload;
  const total = results.length;
  let passed = 0;
  let scoreSum = 0;
  results.forEach((result) => {
    if (result.score >= DEFAULT_THRESHOLD) {
      passed += 1;
    }
    scoreSum += result.score;
  });
  const avgScore = total > 0 ? scoreSum / total : 0;
  const experiment = payload.experiment ?? "default";
  return `feat(results): ${experiment} - ${passed}/${total} PASS (${avgScore.toFixed(3)})`;
}
986
/**
 * Render the Markdown body of the draft results pull request.
 *
 * One section per entry in `payload.eval_summaries` (heading, pass summary and
 * a per-test table), followed by the run name, experiment and eval file list.
 *
 * Cell text is escaped so a `|` or a line break inside a test id or status
 * cannot split the table row.
 *
 * @param {object} payload - Export payload with `eval_summaries`, `run_dir`,
 *   `test_files` and an optional `experiment`.
 * @returns {string} Markdown text.
 */
function buildPrBody(payload) {
  // Keep arbitrary text inside a single table cell.
  const escapeCell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const sections = payload.eval_summaries.map((summary) => {
    const table = summary.results.map(
      (result) => `| ${escapeCell(result.test_id)} | ${result.score.toFixed(3)} | ${escapeCell(result.status)} |`
    ).join("\n");
    return [
      `### ${summary.eval_file}`,
      "",
      `Summary: ${summary.passed}/${summary.total} PASS (${summary.avg_score.toFixed(3)})`,
      "",
      "| Test | Score | Status |",
      "|---|---|---|",
      // An eval file without results still gets a placeholder row.
      table || "| (no results) | 0.000 | ERROR |"
    ].join("\n");
  }).join("\n\n");
  return [
    "## Results",
    "",
    sections,
    "",
    `Run: ${path6.basename(payload.run_dir)}`,
    `Experiment: ${payload.experiment ?? "default"}`,
    `Eval Files: ${payload.test_files.join(", ")}`
  ].join("\n");
}
1009
/**
 * Print a warning when a run directory is larger than `SIZE_WARNING_BYTES`.
 * The export is not blocked; this only informs the user.
 *
 * @param {string} runDir - Directory whose total size is measured.
 * @returns {Promise<void>}
 */
async function maybeWarnLargeArtifact(runDir) {
  const totalBytes = await directorySizeBytes(runDir);
  const exceedsLimit = totalBytes > SIZE_WARNING_BYTES;
  if (!exceedsLimit) {
    return;
  }
  const megabytes = (totalBytes / (1024 * 1024)).toFixed(1);
  console.warn(`Warning: run artifacts total ${megabytes}MB. Export will continue.`);
}
1017
/**
 * Load the project config for `cwd` and return its normalized
 * `results.export` section.
 *
 * @param {string} cwd - Directory to load configuration for.
 * @returns {Promise<object|undefined>} Normalized export config, or
 *   `undefined` when no `results.export` section is configured.
 */
async function loadNormalizedResultsConfig(cwd) {
  const detectedRoot = await findRepoRoot(cwd);
  const repoRoot = detectedRoot ?? cwd;
  // NOTE(review): "_" looks like a placeholder file name so that loadConfig,
  // which takes a file path, starts from cwd itself — confirm against loadConfig.
  const anchorPath = path6.join(cwd, "_");
  const config = await loadConfig(anchorPath, repoRoot);
  const exportSection = config?.results?.export;
  if (!exportSection) {
    return void 0;
  }
  return normalizeResultsExportConfig(exportSection);
}
1025
/**
 * Build the run identifier used for a run that comes from the remote results
 * repository by prepending `REMOTE_RUN_PREFIX` to its file name.
 *
 * @param {string} filename - File name of the remote run.
 * @returns {string} Prefixed identifier.
 */
function encodeRemoteRunId(filename) {
  const prefix = REMOTE_RUN_PREFIX;
  return `${prefix}${filename}`;
}
1028
/**
 * Report the state of the remote results repository together with the number
 * of runs found in it.
 *
 * @param {string} cwd - Project working directory.
 * @returns {Promise<object>} The repo status plus `run_count` (0 when no
 *   export config exists or the repo is not available).
 */
async function getRemoteResultsStatus(cwd) {
  const config = await loadNormalizedResultsConfig(cwd);
  const status = getResultsRepoStatus(config);
  let runCount = 0;
  if (config && status.available) {
    const runsDir = resolveResultsRepoRunsDir(config);
    runCount = listResultFilesFromRunsDir(runsDir).length;
  }
  return { ...status, run_count: runCount };
}
1037
/**
 * Synchronize the remote results repository and report its status.
 *
 * - No export config: returns the default status with `run_count: 0`.
 * - Sync failure: returns the configured status with `run_count: 0` and the
 *   failure message in `last_error` (the error is not rethrown).
 * - Success: returns the fresh status from `getRemoteResultsStatus`.
 *
 * @param {string} cwd - Project working directory.
 * @returns {Promise<object>} Status object including `run_count`.
 */
async function syncRemoteResults(cwd) {
  const config = await loadNormalizedResultsConfig(cwd);
  if (!config) {
    const unconfiguredStatus = getResultsRepoStatus();
    return { ...unconfiguredStatus, run_count: 0 };
  }
  try {
    await syncResultsRepo(config);
  } catch (error) {
    const failedStatus = getResultsRepoStatus(config);
    return {
      ...failedStatus,
      run_count: 0,
      last_error: getStatusMessage(error)
    };
  }
  return getRemoteResultsStatus(cwd);
}
1056
/**
 * List local runs merged with runs from the remote results repository,
 * newest first.
 *
 * Local entries are tagged `source: "local"`; remote entries are tagged
 * `source: "remote"` and get their `filename` replaced by the encoded remote
 * run id. Both keep the untouched file name in `raw_filename`. When there is
 * no export config or the remote repo is unavailable, only local runs are
 * returned (in the order `listResultFiles` produced them).
 *
 * @param {string} cwd - Project working directory.
 * @param {number} [limit] - Maximum number of runs; ignored unless positive.
 * @returns {Promise<{runs: object[], remote_status: object}>}
 */
async function listMergedResultFiles(cwd, limit) {
  const applyLimit = (runs) => limit !== void 0 && limit > 0 ? runs.slice(0, limit) : runs;
  const tagLocal = (meta) => ({
    ...meta,
    source: "local",
    raw_filename: meta.filename
  });
  const tagRemote = (meta) => ({
    ...meta,
    filename: encodeRemoteRunId(meta.filename),
    raw_filename: meta.filename,
    source: "remote"
  });

  const localRuns = listResultFiles(cwd).map((meta) => tagLocal(meta));
  const remoteStatus = await getRemoteResultsStatus(cwd);
  const config = await loadNormalizedResultsConfig(cwd);
  if (!config || !remoteStatus.available) {
    return { runs: applyLimit(localRuns), remote_status: remoteStatus };
  }

  const remoteRunsDir = resolveResultsRepoRunsDir(config);
  const remoteRuns = listResultFilesFromRunsDir(remoteRunsDir).map((meta) => tagRemote(meta));
  const merged = localRuns.concat(remoteRuns);
  // Descending by timestamp string.
  merged.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
  return { runs: applyLimit(merged), remote_status: remoteStatus };
}
1088
/**
 * Look up a run (local or remote) by its `filename` identifier.
 *
 * @param {string} cwd - Project working directory.
 * @param {string} runId - Identifier to match against each run's `filename`.
 * @returns {Promise<object|undefined>} The first matching run, if any.
 */
async function findRunById(cwd, runId) {
  const listing = await listMergedResultFiles(cwd);
  for (const run of listing.runs) {
    if (run.filename === runId) {
      return run;
    }
  }
  return void 0;
}
1092
/**
 * Export a finished run to the results repository when `auto_push` is enabled.
 *
 * Steps: warn about large artifacts, prepare a branch in the results repo,
 * stage the run directory under `<config.path>/<relative run path>`, commit
 * and push, then open a draft PR. The prepared repo is always cleaned up.
 * Any failure is reported as a warning and never rethrown, so a failed export
 * does not fail the surrounding command.
 *
 * @param {object} payload - Run payload (`cwd`, `run_dir`, results, ...).
 * @returns {Promise<void>}
 */
async function maybeAutoExportRunArtifacts(payload) {
  const config = await loadNormalizedResultsConfig(payload.cwd);
  if (!config?.auto_push) {
    return;
  }
  try {
    await maybeWarnLargeArtifact(payload.run_dir);
    const branchName = buildBranchName(config, payload);
    const workspace = await prepareResultsRepoBranch(config, branchName);
    try {
      const runSubpath = getRelativeRunPath(payload.cwd, payload.run_dir);
      const destinationDir = path6.join(workspace.repoDir, config.path, runSubpath);
      await stageResultsArtifacts({
        repoDir: workspace.repoDir,
        sourceDir: payload.run_dir,
        destinationDir
      });
      // The commit subject doubles as the PR title.
      const title = buildCommitTitle(payload);
      const pushed = await commitAndPushResultsBranch({
        repoDir: workspace.repoDir,
        branchName,
        commitMessage: title
      });
      if (pushed) {
        const prUrl = await createDraftResultsPr({
          repo: config.repo,
          repoDir: workspace.repoDir,
          baseBranch: workspace.baseBranch,
          branchName,
          title,
          body: buildPrBody(payload)
        });
        console.log(`Remote results draft PR created: ${prUrl}`);
      } else {
        console.warn("Warning: results export produced no git changes. Skipping PR creation.");
      }
    } finally {
      await workspace.cleanup();
    }
  } catch (error) {
    console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
    console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
  }
}
1136
+
1137
+ // src/commands/eval/artifact-writer.ts
1138
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
1139
+ import path7 from "node:path";
1140
+
1141
+ // src/utils/case-conversion.ts
1142
/**
 * Convert a camelCase identifier to snake_case.
 *
 * Strings that begin with an uppercase ASCII letter are returned unchanged;
 * otherwise every uppercase ASCII letter becomes `_` plus its lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  if (startsUppercase) {
    return str;
  }
  const underscoreLower = (upper) => "_".concat(upper.toLowerCase());
  return str.replace(/[A-Z]/g, underscoreLower);
}
379
- function resolveRunManifestPath(filePath) {
380
- if (isDirectoryPath(filePath)) {
381
- return resolveWorkspaceOrFilePath(filePath);
1148
/**
 * Recursively convert the keys of a value to snake_case via `toSnakeCase`.
 *
 * - `null` / `undefined` and primitives are returned unchanged.
 * - Arrays are mapped element by element.
 * - `Date` instances are returned as-is; they have no enumerable own keys, so
 *   treating them as generic objects would replace them with `{}`.
 * - Any other object becomes a new plain object built from its own
 *   enumerable entries with converted keys.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} The converted copy (or the original primitive/Date).
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (obj instanceof Date) {
    return obj;
  }
  if (typeof obj === "object") {
    // Object.fromEntries defines own data properties, so a "__proto__" key
    // becomes ordinary data instead of going through the prototype setter.
    return Object.fromEntries(
      Object.entries(obj).map(([key, value]) => [toSnakeCase(key), toSnakeCaseDeep(value)])
    );
  }
  return obj;
}
390
1165
 
391
1166
  // src/commands/eval/artifact-writer.ts
@@ -524,7 +1299,7 @@ function buildTimingArtifact(results) {
524
1299
  }
525
1300
  };
526
1301
  }
527
- function buildBenchmarkArtifact(results, evalFile = "") {
1302
+ function buildBenchmarkArtifact(results, evalFile = "", experiment) {
528
1303
  const targetSet = /* @__PURE__ */ new Set();
529
1304
  const testIdSet = /* @__PURE__ */ new Set();
530
1305
  for (const result of results) {
@@ -549,7 +1324,7 @@ function buildBenchmarkArtifact(results, evalFile = "") {
549
1324
  tokens: computeStats(tokens)
550
1325
  };
551
1326
  const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
552
- if (toolCallCounts.some((c) => c > 0)) {
1327
+ if (toolCallCounts.some((c2) => c2 > 0)) {
553
1328
  entry.tool_calls = computeStats(toolCallCounts);
554
1329
  }
555
1330
  const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
@@ -595,7 +1370,8 @@ function buildBenchmarkArtifact(results, evalFile = "") {
595
1370
  eval_file: evalFile,
596
1371
  timestamp,
597
1372
  targets,
598
- tests_run: testIds
1373
+ tests_run: testIds,
1374
+ experiment
599
1375
  },
600
1376
  run_summary: runSummary,
601
1377
  per_grader_summary: perEvaluatorSummary,
@@ -622,7 +1398,7 @@ function buildArtifactSubdir(result) {
622
1398
  segments.push(safeArtifactPathSegment(evalSet, "default"));
623
1399
  }
624
1400
  segments.push(safeTestId(result.testId));
625
- return path4.posix.join(...segments);
1401
+ return path7.posix.join(...segments);
626
1402
  }
627
1403
  function formatOutputMarkdown(output) {
628
1404
  return output.map((msg) => `@[${msg.role}]:
@@ -655,11 +1431,11 @@ function buildResultIndexArtifact(result) {
655
1431
  failure_stage: result.failureStage,
656
1432
  failure_reason_code: result.failureReasonCode,
657
1433
  workspace_path: result.workspacePath,
658
- grading_path: path4.posix.join(artifactSubdir, "grading.json"),
659
- timing_path: path4.posix.join(artifactSubdir, "timing.json"),
660
- input_path: input ? path4.posix.join(artifactSubdir, "input.md") : void 0,
661
- output_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
662
- response_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0
1434
+ grading_path: path7.posix.join(artifactSubdir, "grading.json"),
1435
+ timing_path: path7.posix.join(artifactSubdir, "timing.json"),
1436
+ input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
1437
+ output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
1438
+ response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
663
1439
  };
664
1440
  }
665
1441
  async function writeJsonlFile(filePath, records) {
@@ -669,18 +1445,18 @@ async function writeJsonlFile(filePath, records) {
669
1445
  }
670
1446
  async function writeArtifactsFromResults(results, outputDir, options) {
671
1447
  const testArtifactDir = outputDir;
672
- const timingPath = path4.join(outputDir, "timing.json");
673
- const benchmarkPath = path4.join(outputDir, "benchmark.json");
674
- const indexPath = path4.join(outputDir, RESULT_INDEX_FILENAME);
1448
+ const timingPath = path7.join(outputDir, "timing.json");
1449
+ const benchmarkPath = path7.join(outputDir, "benchmark.json");
1450
+ const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
675
1451
  await mkdir(outputDir, { recursive: true });
676
1452
  const indexRecords = [];
677
1453
  for (const result of results) {
678
1454
  const grading = buildGradingArtifact(result);
679
1455
  const timing2 = buildTimingArtifact([result]);
680
1456
  const artifactSubdir = buildArtifactSubdir(result);
681
- const testDir = path4.join(outputDir, artifactSubdir);
682
- const gradingPath = path4.join(testDir, "grading.json");
683
- const perTestTimingPath = path4.join(testDir, "timing.json");
1457
+ const testDir = path7.join(outputDir, artifactSubdir);
1458
+ const gradingPath = path7.join(testDir, "grading.json");
1459
+ const perTestTimingPath = path7.join(testDir, "timing.json");
684
1460
  await mkdir(testDir, { recursive: true });
685
1461
  await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
686
1462
  `, "utf8");
@@ -688,23 +1464,26 @@ async function writeArtifactsFromResults(results, outputDir, options) {
688
1464
  `, "utf8");
689
1465
  const input = extractInput(result);
690
1466
  if (input) {
691
- await writeFile(path4.join(testDir, "input.md"), input, "utf8");
1467
+ await writeFile(path7.join(testDir, "input.md"), input, "utf8");
692
1468
  }
693
1469
  if (result.output && result.output.length > 0) {
694
- const outputsDir = path4.join(testDir, "outputs");
1470
+ const outputsDir = path7.join(testDir, "outputs");
695
1471
  await mkdir(outputsDir, { recursive: true });
696
1472
  await writeFile(
697
- path4.join(outputsDir, "response.md"),
1473
+ path7.join(outputsDir, "response.md"),
698
1474
  formatOutputMarkdown(result.output),
699
1475
  "utf8"
700
1476
  );
701
1477
  }
702
- indexRecords.push(buildResultIndexArtifact(result));
1478
+ indexRecords.push({
1479
+ ...buildResultIndexArtifact(result),
1480
+ experiment: options?.experiment
1481
+ });
703
1482
  }
704
1483
  const timing = buildTimingArtifact(results);
705
1484
  await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
706
1485
  `, "utf8");
707
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
1486
+ const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
708
1487
  await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
709
1488
  `, "utf8");
710
1489
  await writeJsonlFile(indexPath, indexRecords);
@@ -758,13 +1537,13 @@ async function writeBenchmarkJson(outputPath, results) {
758
1537
  // src/commands/eval/env.ts
759
1538
  import { constants as constants3 } from "node:fs";
760
1539
  import { access as access3 } from "node:fs/promises";
761
- import path5 from "node:path";
1540
+ import path8 from "node:path";
762
1541
  import { config as loadDotenv } from "dotenv";
763
1542
  function uniqueDirs(directories) {
764
1543
  const seen = /* @__PURE__ */ new Set();
765
1544
  const result = [];
766
1545
  for (const dir of directories) {
767
- const absolute = path5.resolve(dir);
1546
+ const absolute = path8.resolve(dir);
768
1547
  if (seen.has(absolute)) {
769
1548
  continue;
770
1549
  }
@@ -783,14 +1562,14 @@ async function fileExists2(filePath) {
783
1562
  }
784
1563
  function collectAncestorDirectories(start, boundary) {
785
1564
  const directories = [];
786
- const boundaryDir = path5.resolve(boundary);
787
- let current = path5.resolve(start);
1565
+ const boundaryDir = path8.resolve(boundary);
1566
+ let current = path8.resolve(start);
788
1567
  while (current !== void 0) {
789
1568
  directories.push(current);
790
1569
  if (current === boundaryDir) {
791
1570
  break;
792
1571
  }
793
- const parent = path5.dirname(current);
1572
+ const parent = path8.dirname(current);
794
1573
  if (parent === current) {
795
1574
  break;
796
1575
  }
@@ -800,12 +1579,12 @@ function collectAncestorDirectories(start, boundary) {
800
1579
  }
801
1580
  async function loadEnvFromHierarchy(options) {
802
1581
  const { testFilePath, repoRoot, verbose } = options;
803
- const testDir = path5.dirname(path5.resolve(testFilePath));
1582
+ const testDir = path8.dirname(path8.resolve(testFilePath));
804
1583
  const cwd = process.cwd();
805
1584
  const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
806
1585
  const envFiles = [];
807
1586
  for (const dir of searchDirs) {
808
- const candidate = path5.join(dir, ".env");
1587
+ const candidate = path8.join(dir, ".env");
809
1588
  if (await fileExists2(candidate)) {
810
1589
  envFiles.push(candidate);
811
1590
  }
@@ -827,11 +1606,11 @@ async function loadEnvFromHierarchy(options) {
827
1606
  }
828
1607
 
829
1608
  // src/commands/eval/output-writer.ts
830
- import path11 from "node:path";
1609
+ import path14 from "node:path";
831
1610
 
832
1611
  // src/commands/eval/html-writer.ts
833
1612
  import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
834
- import path6 from "node:path";
1613
+ import path9 from "node:path";
835
1614
 
836
1615
  // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
837
1616
  var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
@@ -1050,7 +1829,7 @@ var HtmlWriter = class _HtmlWriter {
1050
1829
  this.filePath = filePath;
1051
1830
  }
1052
1831
  static async open(filePath) {
1053
- await mkdir2(path6.dirname(filePath), { recursive: true });
1832
+ await mkdir2(path9.dirname(filePath), { recursive: true });
1054
1833
  const writer = new _HtmlWriter(filePath);
1055
1834
  await writer.writeHtml();
1056
1835
  return writer;
@@ -1561,7 +2340,7 @@ var SCRIPT = `
1561
2340
 
1562
2341
  // src/commands/eval/json-writer.ts
1563
2342
  import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
1564
- import path7 from "node:path";
2343
+ import path10 from "node:path";
1565
2344
  var JsonWriter = class _JsonWriter {
1566
2345
  filePath;
1567
2346
  results = [];
@@ -1570,7 +2349,7 @@ var JsonWriter = class _JsonWriter {
1570
2349
  this.filePath = filePath;
1571
2350
  }
1572
2351
  static async open(filePath) {
1573
- await mkdir3(path7.dirname(filePath), { recursive: true });
2352
+ await mkdir3(path10.dirname(filePath), { recursive: true });
1574
2353
  return new _JsonWriter(filePath);
1575
2354
  }
1576
2355
  async append(result) {
@@ -1605,7 +2384,7 @@ var JsonWriter = class _JsonWriter {
1605
2384
  // src/commands/eval/jsonl-writer.ts
1606
2385
  import { createWriteStream } from "node:fs";
1607
2386
  import { mkdir as mkdir4 } from "node:fs/promises";
1608
- import path8 from "node:path";
2387
+ import path11 from "node:path";
1609
2388
  import { finished } from "node:stream/promises";
1610
2389
  var JsonlWriter = class _JsonlWriter {
1611
2390
  stream;
@@ -1615,7 +2394,7 @@ var JsonlWriter = class _JsonlWriter {
1615
2394
  this.stream = stream;
1616
2395
  }
1617
2396
  static async open(filePath) {
1618
- await mkdir4(path8.dirname(filePath), { recursive: true });
2397
+ await mkdir4(path11.dirname(filePath), { recursive: true });
1619
2398
  const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
1620
2399
  return new _JsonlWriter(stream);
1621
2400
  }
@@ -1647,7 +2426,7 @@ var JsonlWriter = class _JsonlWriter {
1647
2426
 
1648
2427
  // src/commands/eval/junit-writer.ts
1649
2428
  import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
1650
- import path9 from "node:path";
2429
+ import path12 from "node:path";
1651
2430
  function escapeXml(str) {
1652
2431
  return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
1653
2432
  }
@@ -1661,7 +2440,7 @@ var JunitWriter = class _JunitWriter {
1661
2440
  this.threshold = options?.threshold ?? 0.5;
1662
2441
  }
1663
2442
  static async open(filePath, options) {
1664
- await mkdir5(path9.dirname(filePath), { recursive: true });
2443
+ await mkdir5(path12.dirname(filePath), { recursive: true });
1665
2444
  return new _JunitWriter(filePath, options);
1666
2445
  }
1667
2446
  async append(result) {
@@ -1737,7 +2516,7 @@ ${suiteXmls.join("\n")}
1737
2516
  // src/commands/eval/yaml-writer.ts
1738
2517
  import { createWriteStream as createWriteStream2 } from "node:fs";
1739
2518
  import { mkdir as mkdir6 } from "node:fs/promises";
1740
- import path10 from "node:path";
2519
+ import path13 from "node:path";
1741
2520
  import { finished as finished2 } from "node:stream/promises";
1742
2521
  import { stringify as stringifyYaml } from "yaml";
1743
2522
  var YamlWriter = class _YamlWriter {
@@ -1749,7 +2528,7 @@ var YamlWriter = class _YamlWriter {
1749
2528
  this.stream = stream;
1750
2529
  }
1751
2530
  static async open(filePath) {
1752
- await mkdir6(path10.dirname(filePath), { recursive: true });
2531
+ await mkdir6(path13.dirname(filePath), { recursive: true });
1753
2532
  const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
1754
2533
  return new _YamlWriter(stream);
1755
2534
  }
@@ -1805,7 +2584,7 @@ async function createOutputWriter(filePath, format) {
1805
2584
  }
1806
2585
  var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
1807
2586
  function createWriterFromPath(filePath, options) {
1808
- const ext = path11.extname(filePath).toLowerCase();
2587
+ const ext = path14.extname(filePath).toLowerCase();
1809
2588
  switch (ext) {
1810
2589
  case ".jsonl":
1811
2590
  return JsonlWriter.open(filePath);
@@ -1838,10 +2617,10 @@ function useColors() {
1838
2617
  }
1839
2618
  function formatVerdict(score, verdict) {
1840
2619
  if (verdict === void 0) return "";
1841
- const colors = useColors();
2620
+ const colors2 = useColors();
1842
2621
  const scoreStr = score !== void 0 ? score.toFixed(3) : "";
1843
2622
  const verdictLabel = verdict === "ERROR" ? "ERROR" : `${scoreStr} ${verdict}`;
1844
- if (!colors) return ` | ${verdictLabel}`;
2623
+ if (!colors2) return ` | ${verdictLabel}`;
1845
2624
  const color = verdict === "PASS" ? ANSI_GREEN : verdict === "FAIL" ? ANSI_RED2 : ANSI_YELLOW2;
1846
2625
  return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET2}`;
1847
2626
  }
@@ -1901,12 +2680,12 @@ var ProgressDisplay = class {
1901
2680
  }
1902
2681
  addLogPaths(paths, provider) {
1903
2682
  const newPaths = [];
1904
- for (const path17 of paths) {
1905
- if (this.logPathSet.has(path17)) {
2683
+ for (const path19 of paths) {
2684
+ if (this.logPathSet.has(path19)) {
1906
2685
  continue;
1907
2686
  }
1908
- this.logPathSet.add(path17);
1909
- newPaths.push(path17);
2687
+ this.logPathSet.add(path19);
2688
+ newPaths.push(path19);
1910
2689
  }
1911
2690
  if (newPaths.length === 0) {
1912
2691
  return;
@@ -1919,8 +2698,8 @@ var ProgressDisplay = class {
1919
2698
  this.hasPrintedLogHeader = true;
1920
2699
  }
1921
2700
  const startIndex = this.logPaths.length - newPaths.length;
1922
- newPaths.forEach((path17, offset) => {
1923
- console.log(`${startIndex + offset + 1}. ${path17}`);
2701
+ newPaths.forEach((path19, offset) => {
2702
+ console.log(`${startIndex + offset + 1}. ${path19}`);
1924
2703
  });
1925
2704
  }
1926
2705
  finish() {
@@ -1931,149 +2710,34 @@ var ProgressDisplay = class {
1931
2710
  }
1932
2711
  };
1933
2712
 
1934
- // src/commands/results/manifest.ts
1935
- import { existsSync as existsSync2, readFileSync } from "node:fs";
1936
- import path12 from "node:path";
1937
- function parseJsonlLines(content) {
1938
- return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
1939
- }
1940
- function parseMarkdownMessages(content) {
1941
- const trimmed = content.trim();
1942
- if (!trimmed.startsWith("@[")) {
1943
- return [];
1944
- }
1945
- const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
1946
- return matches.map((match) => ({
1947
- role: match[1],
1948
- content: match[2].trimEnd()
1949
- }));
1950
- }
1951
- function readOptionalText(baseDir, relativePath) {
1952
- if (!relativePath) {
1953
- return void 0;
1954
- }
1955
- const absolutePath = path12.join(baseDir, relativePath);
1956
- if (!existsSync2(absolutePath)) {
1957
- return void 0;
1958
- }
1959
- return readFileSync(absolutePath, "utf8");
1960
- }
1961
- function readOptionalJson(baseDir, relativePath) {
1962
- const text = readOptionalText(baseDir, relativePath);
1963
- if (!text) {
1964
- return void 0;
1965
- }
1966
- try {
1967
- return JSON.parse(text);
1968
- } catch {
1969
- return void 0;
1970
- }
1971
- }
1972
- function hydrateInput(baseDir, record) {
1973
- const inputText = readOptionalText(baseDir, record.input_path);
1974
- if (!inputText) {
1975
- return void 0;
1976
- }
1977
- const messages = parseMarkdownMessages(inputText);
1978
- return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
1979
- }
1980
- function hydrateOutput(baseDir, record) {
1981
- const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
1982
- if (!responseText) {
1983
- return void 0;
1984
- }
1985
- const messages = parseMarkdownMessages(responseText);
1986
- if (messages.length > 0) {
1987
- return messages.map((message) => ({
1988
- role: message.role,
1989
- content: message.content
1990
- }));
1991
- }
1992
- return [{ role: "assistant", content: responseText.trimEnd() }];
1993
- }
1994
- function hydrateManifestRecord(baseDir, record) {
1995
- const grading = readOptionalJson(baseDir, record.grading_path);
1996
- const timing = readOptionalJson(baseDir, record.timing_path);
1997
- const testId = record.test_id ?? "unknown";
1998
- return {
1999
- timestamp: record.timestamp,
2000
- testId,
2001
- suite: record.suite,
2002
- category: record.category,
2003
- target: record.target,
2004
- score: record.score,
2005
- executionStatus: record.execution_status,
2006
- error: record.error,
2007
- assertions: grading?.assertions.map((assertion) => ({
2008
- text: assertion.text,
2009
- passed: assertion.passed,
2010
- evidence: assertion.evidence
2011
- })),
2012
- scores: grading?.evaluators?.map((evaluator) => ({
2013
- name: evaluator.name,
2014
- type: evaluator.type,
2015
- score: evaluator.score,
2016
- assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
2017
- text: String(assertion.text ?? ""),
2018
- passed: Boolean(assertion.passed),
2019
- evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
2020
- })) : void 0,
2021
- weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
2022
- verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
2023
- details: evaluator.details
2024
- })) ?? record.scores,
2025
- tokenUsage: timing?.token_usage ? {
2026
- input: timing.token_usage.input,
2027
- output: timing.token_usage.output,
2028
- reasoning: timing.token_usage.reasoning
2029
- } : record.token_usage,
2030
- durationMs: timing?.duration_ms ?? record.duration_ms,
2031
- costUsd: record.cost_usd,
2032
- input: hydrateInput(baseDir, record),
2033
- output: hydrateOutput(baseDir, record)
2034
- };
2035
- }
2036
- function parseResultManifest(content) {
2037
- return parseJsonlLines(content);
2038
- }
2039
- function resolveResultSourcePath(source, cwd) {
2040
- const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
2041
- if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
2042
- return resolveRunManifestPath(resolved);
2043
- }
2044
- return resolved;
2045
- }
2046
- function loadManifestResults(sourceFile) {
2047
- const resolvedSourceFile = resolveRunManifestPath(sourceFile);
2048
- const content = readFileSync(resolvedSourceFile, "utf8");
2049
- const records = parseResultManifest(content);
2050
- const baseDir = path12.dirname(resolvedSourceFile);
2051
- return records.map((record) => hydrateManifestRecord(baseDir, record));
2052
- }
2053
- function loadLightweightResults(sourceFile) {
2054
- const resolvedSourceFile = resolveRunManifestPath(sourceFile);
2055
- const content = readFileSync(resolvedSourceFile, "utf8");
2056
- return parseResultManifest(content).map((record) => ({
2057
- testId: record.test_id ?? "unknown",
2058
- suite: record.suite,
2059
- target: record.target,
2060
- experiment: record.experiment,
2061
- score: record.score,
2062
- scores: record.scores,
2063
- executionStatus: record.execution_status,
2064
- error: record.error,
2065
- timestamp: record.timestamp
2066
- }));
2067
- }
2068
-
2069
2713
  // src/commands/eval/retry-errors.ts
2070
2714
  async function loadRetrySourceResults(jsonlPath) {
2071
2715
  return loadManifestResults(resolveResultSourcePath(jsonlPath));
2072
2716
  }
2717
/**
 * Backslash-escape glob metacharacters in a test id so it can be embedded in
 * a filter pattern literally.
 *
 * Escaped characters: `* ? [ ] { } ( ) ! @ # + |` and the backslash itself.
 *
 * @param {string} id - Raw test id.
 * @returns {string} The id with each special character preceded by `\`.
 */
function escapeGlob(id) {
  const globSpecials = /[*?[\]{}()!@#+|\\]/g;
  return id.replace(globSpecials, (special) => `\\${special}`);
}
2073
2720
  async function loadErrorTestIds(jsonlPath) {
2074
2721
  const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
2075
2722
  return [...new Set(ids)];
2076
2723
  }
2724
/**
 * Collect the ids of tests that completed without any execution error.
 *
 * A test id is excluded as soon as any of its results has
 * `executionStatus === "execution_error"`. Results without a test id are
 * ignored.
 *
 * @param {string} jsonlPath - Result source of the previous run.
 * @returns {Promise<string[]>} Unique test ids in first-seen order.
 */
async function loadFullyCompletedTestIds(jsonlPath) {
  const results = await loadRetrySourceResults(jsonlPath);
  // testId -> whether any of its results errored; Map keeps first-seen order.
  const erroredById = /* @__PURE__ */ new Map();
  for (const result of results) {
    const { testId } = result;
    if (!testId) continue;
    const sawError = erroredById.get(testId) === true;
    erroredById.set(testId, sawError || result.executionStatus === "execution_error");
  }
  const completed = [];
  for (const [testId, errored] of erroredById) {
    if (!errored) {
      completed.push(testId);
    }
  }
  return completed;
}
2737
/**
 * Build a negated glob that excludes the given test ids.
 *
 * Each id is escaped with `escapeGlob`. A single id yields `!<id>`; any other
 * count yields a negated brace list `!{<id1>,<id2>,...}`.
 *
 * @param {string[]} completedIds - Test ids to exclude.
 * @returns {string} The exclusion pattern.
 */
function buildExclusionFilter(completedIds) {
  const patterns = completedIds.map((id) => escapeGlob(id));
  if (patterns.length === 1) {
    return `!${patterns[0]}`;
  }
  const alternatives = patterns.join(",");
  return `!{${alternatives}}`;
}
2077
2741
  async function loadNonErrorResults(jsonlPath) {
2078
2742
  return (await loadRetrySourceResults(jsonlPath)).filter(
2079
2743
  (result) => result.testId && result.executionStatus !== "execution_error"
@@ -2082,7 +2746,7 @@ async function loadNonErrorResults(jsonlPath) {
2082
2746
 
2083
2747
  // src/commands/eval/run-cache.ts
2084
2748
  import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
2085
- import path13 from "node:path";
2749
+ import path15 from "node:path";
2086
2750
  var CACHE_FILENAME = "cache.json";
2087
2751
  function resolveRunCacheFile(cache) {
2088
2752
  if (cache.lastRunDir) {
@@ -2091,7 +2755,7 @@ function resolveRunCacheFile(cache) {
2091
2755
  return "";
2092
2756
  }
2093
2757
  function cachePath(cwd) {
2094
- return path13.join(cwd, ".agentv", CACHE_FILENAME);
2758
+ return path15.join(cwd, ".agentv", CACHE_FILENAME);
2095
2759
  }
2096
2760
  async function loadRunCache(cwd) {
2097
2761
  try {
@@ -2102,13 +2766,13 @@ async function loadRunCache(cwd) {
2102
2766
  }
2103
2767
  }
2104
2768
  async function saveRunCache(cwd, resultPath) {
2105
- if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2769
+ if (path15.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2106
2770
  return;
2107
2771
  }
2108
- const dir = path13.join(cwd, ".agentv");
2772
+ const dir = path15.join(cwd, ".agentv");
2109
2773
  await mkdir7(dir, { recursive: true });
2110
2774
  const cache = {
2111
- lastRunDir: path13.dirname(resultPath),
2775
+ lastRunDir: path15.dirname(resultPath),
2112
2776
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
2113
2777
  };
2114
2778
  await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
@@ -2233,7 +2897,7 @@ function calculateEvaluationSummary(results, options) {
2233
2897
  byFailureReason
2234
2898
  };
2235
2899
  }
2236
- function formatScore(value) {
2900
/**
 * Render a score with exactly three decimal places.
 *
 * @param {number} value - Score to format.
 * @returns {string} Fixed-point text, e.g. `0.500`.
 */
function formatScore2(value) {
  const decimals = 3;
  return value.toFixed(decimals);
}
2239
2903
  function formatEvaluationSummary(summary, options) {
@@ -2261,13 +2925,13 @@ function formatEvaluationSummary(summary, options) {
2261
2925
  let verdictColor;
2262
2926
  let verdictText;
2263
2927
  if (allExecutionErrors) {
2264
- overallVerdict = "INCONCLUSIVE";
2928
+ overallVerdict = "ERROR";
2265
2929
  verdictColor = "\x1B[33m";
2266
- verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
2930
+ verdictText = `RESULT: ERROR (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
2267
2931
  } else {
2268
2932
  overallVerdict = overallPassed ? "PASS" : "FAIL";
2269
2933
  verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2270
- verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
2934
+ verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} scored >= ${threshold}, mean: ${formatScore2(summary.mean)})`;
2271
2935
  }
2272
2936
  lines.push("\n==================================================");
2273
2937
  if (useColor) {
@@ -2290,16 +2954,16 @@ function formatEvaluationSummary(summary, options) {
2290
2954
  if (summary.executionErrorCount > 0) {
2291
2955
  const qualityCount = summary.total - summary.executionErrorCount;
2292
2956
  lines.push(
2293
- `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
2957
+ `Mean score: ${formatScore2(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
2294
2958
  );
2295
2959
  } else {
2296
- lines.push(`Mean score: ${formatScore(summary.mean)}`);
2960
+ lines.push(`Mean score: ${formatScore2(summary.mean)}`);
2297
2961
  }
2298
- lines.push(`Median score: ${formatScore(summary.median)}`);
2299
- lines.push(`Min score: ${formatScore(summary.min)}`);
2300
- lines.push(`Max score: ${formatScore(summary.max)}`);
2962
+ lines.push(`Median score: ${formatScore2(summary.median)}`);
2963
+ lines.push(`Min score: ${formatScore2(summary.min)}`);
2964
+ lines.push(`Max score: ${formatScore2(summary.max)}`);
2301
2965
  if (typeof summary.standardDeviation === "number") {
2302
- lines.push(`Std deviation: ${formatScore(summary.standardDeviation)}`);
2966
+ lines.push(`Std deviation: ${formatScore2(summary.standardDeviation)}`);
2303
2967
  }
2304
2968
  lines.push("\nScore distribution:");
2305
2969
  for (const bin of summary.histogram) {
@@ -2308,11 +2972,11 @@ function formatEvaluationSummary(summary, options) {
2308
2972
  }
2309
2973
  lines.push("\nTop performing tests:");
2310
2974
  summary.topResults.forEach((result, index) => {
2311
- lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
2975
+ lines.push(` ${index + 1}. ${result.testId}: ${formatScore2(result.score)}`);
2312
2976
  });
2313
2977
  lines.push("\nLowest performing tests:");
2314
2978
  summary.bottomResults.forEach((result, index) => {
2315
- lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
2979
+ lines.push(` ${index + 1}. ${result.testId}: ${formatScore2(result.score)}`);
2316
2980
  });
2317
2981
  const failureStageEntries = Object.entries(summary.byFailureStage);
2318
2982
  if (failureStageEntries.length > 0) {
@@ -2361,7 +3025,7 @@ function formatMatrixSummary(results) {
2361
3025
  for (const testId of testIds) {
2362
3026
  const cells = targets.map((target) => {
2363
3027
  const score = scoreMap.get(testId)?.get(target);
2364
- return score !== void 0 ? formatScore(score).padEnd(targetColWidth) : "-".padEnd(targetColWidth);
3028
+ return score !== void 0 ? formatScore2(score).padEnd(targetColWidth) : "-".padEnd(targetColWidth);
2365
3029
  });
2366
3030
  lines.push(`${testId.padEnd(testIdColWidth)} ${cells.join(" ")}`);
2367
3031
  }
@@ -2369,7 +3033,7 @@ function formatMatrixSummary(results) {
2369
3033
  const avgCells = targets.map((target) => {
2370
3034
  const scores = results.filter((r) => r.target === target).map((r) => r.score);
2371
3035
  const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
2372
- return formatScore(avg).padEnd(targetColWidth);
3036
+ return formatScore2(avg).padEnd(targetColWidth);
2373
3037
  });
2374
3038
  lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
2375
3039
  return lines.join("\n");
@@ -2377,7 +3041,7 @@ function formatMatrixSummary(results) {
2377
3041
 
2378
3042
  // ../../packages/core/dist/evaluation/validation/index.js
2379
3043
  import { readFile as readFile3 } from "node:fs/promises";
2380
- import path14 from "node:path";
3044
+ import path16 from "node:path";
2381
3045
  import { parse } from "yaml";
2382
3046
  import { readFile as readFile22 } from "node:fs/promises";
2383
3047
  import path22 from "node:path";
@@ -2420,8 +3084,8 @@ async function detectFileType(filePath) {
2420
3084
  }
2421
3085
  }
2422
3086
  function inferFileTypeFromPath(filePath) {
2423
- const normalized = path14.normalize(filePath).replace(/\\/g, "/");
2424
- const basename = path14.basename(filePath);
3087
+ const normalized = path16.normalize(filePath).replace(/\\/g, "/");
3088
+ const basename = path16.basename(filePath);
2425
3089
  if (normalized.includes("/.agentv/")) {
2426
3090
  if (basename === "config.yaml" || basename === "config.yml") {
2427
3091
  return "config";
@@ -2747,12 +3411,21 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
2747
3411
  const hooks = workspace.hooks;
2748
3412
  const afterEachHook = isObject(hooks) ? hooks.after_each : void 0;
2749
3413
  const isolation = workspace.isolation;
3414
+ const docker = workspace.docker;
2750
3415
  if (Array.isArray(repos)) {
2751
3416
  for (const repo of repos) {
2752
3417
  if (!isObject(repo)) continue;
2753
3418
  const source = repo.source;
2754
3419
  const checkout = repo.checkout;
2755
3420
  const clone = repo.clone;
3421
+ if (!isObject(source) && !isObject(docker)) {
3422
+ errors.push({
3423
+ severity: "error",
3424
+ filePath,
3425
+ location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
3426
+ message: "repos[].source is required for non-Docker workspaces. Source-less repos are only valid when workspace.docker is configured (repo exists inside the container)."
3427
+ });
3428
+ }
2756
3429
  if (isObject(source) && isObject(checkout)) {
2757
3430
  const sourceType = source.type;
2758
3431
  const resolve = checkout.resolve;
@@ -2760,8 +3433,8 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
2760
3433
  errors.push({
2761
3434
  severity: "warning",
2762
3435
  filePath,
2763
- location: `workspace.repos[path=${repo.path}]`,
2764
- message: "checkout.resolve has no effect for a local source. Use source.type to choose where the repo comes from; keep checkout.ref or checkout.ancestor only when pinning a local source."
3436
+ location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
3437
+ message: "checkout.resolve has no effect for a local source. Use source.type to choose where the repo comes from; keep checkout.ref, checkout.base_commit, or checkout.ancestor only when pinning a local source."
2765
3438
  });
2766
3439
  }
2767
3440
  }
@@ -2772,7 +3445,7 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
2772
3445
  errors.push({
2773
3446
  severity: "warning",
2774
3447
  filePath,
2775
- location: `workspace.repos[path=${repo.path}]`,
3448
+ location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
2776
3449
  message: `clone.depth (${depth}) may be insufficient for checkout.ancestor (${ancestor}). Recommend depth >= ${ancestor + 1}.`
2777
3450
  });
2778
3451
  }
@@ -3522,11 +4195,69 @@ async function validateConfigFile(filePath) {
3522
4195
  });
3523
4196
  }
3524
4197
  }
4198
+ const results = config.results;
4199
+ if (results !== void 0) {
4200
+ if (typeof results !== "object" || results === null || Array.isArray(results)) {
4201
+ errors.push({
4202
+ severity: "error",
4203
+ filePath,
4204
+ location: "results",
4205
+ message: "Field 'results' must be an object"
4206
+ });
4207
+ } else {
4208
+ const exportConfig = results.export;
4209
+ if (exportConfig !== void 0) {
4210
+ if (typeof exportConfig !== "object" || exportConfig === null || Array.isArray(exportConfig)) {
4211
+ errors.push({
4212
+ severity: "error",
4213
+ filePath,
4214
+ location: "results.export",
4215
+ message: "Field 'results.export' must be an object"
4216
+ });
4217
+ } else {
4218
+ const exportRecord = exportConfig;
4219
+ if (typeof exportRecord.repo !== "string" || exportRecord.repo.trim().length === 0) {
4220
+ errors.push({
4221
+ severity: "error",
4222
+ filePath,
4223
+ location: "results.export.repo",
4224
+ message: "Field 'results.export.repo' must be a non-empty string"
4225
+ });
4226
+ }
4227
+ if (typeof exportRecord.path !== "string" || exportRecord.path.trim().length === 0) {
4228
+ errors.push({
4229
+ severity: "error",
4230
+ filePath,
4231
+ location: "results.export.path",
4232
+ message: "Field 'results.export.path' must be a non-empty string"
4233
+ });
4234
+ }
4235
+ if (exportRecord.auto_push !== void 0 && typeof exportRecord.auto_push !== "boolean") {
4236
+ errors.push({
4237
+ severity: "error",
4238
+ filePath,
4239
+ location: "results.export.auto_push",
4240
+ message: "Field 'results.export.auto_push' must be a boolean"
4241
+ });
4242
+ }
4243
+ if (exportRecord.branch_prefix !== void 0 && (typeof exportRecord.branch_prefix !== "string" || exportRecord.branch_prefix.trim().length === 0)) {
4244
+ errors.push({
4245
+ severity: "error",
4246
+ filePath,
4247
+ location: "results.export.branch_prefix",
4248
+ message: "Field 'results.export.branch_prefix' must be a non-empty string"
4249
+ });
4250
+ }
4251
+ }
4252
+ }
4253
+ }
4254
+ }
3525
4255
  const allowedFields = /* @__PURE__ */ new Set([
3526
4256
  "$schema",
3527
4257
  "eval_patterns",
3528
4258
  "required_version",
3529
4259
  "execution",
4260
+ "results",
3530
4261
  "studio"
3531
4262
  ]);
3532
4263
  const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key));
@@ -4086,7 +4817,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4086
4817
  threshold: normalizeOptionalNumber(rawOptions.threshold),
4087
4818
  tags: normalizeStringArray(rawOptions.tag),
4088
4819
  excludeTags: normalizeStringArray(rawOptions.excludeTag),
4089
- transcript: normalizeString(rawOptions.transcript)
4820
+ transcript: normalizeString(rawOptions.transcript),
4821
+ experiment: normalizeString(rawOptions.experiment)
4090
4822
  };
4091
4823
  }
4092
4824
  async function ensureFileExists(filePath, description) {
@@ -4096,10 +4828,10 @@ async function ensureFileExists(filePath, description) {
4096
4828
  throw new Error(`${description} not found: ${filePath}`);
4097
4829
  }
4098
4830
  }
4099
- function buildDefaultOutputPath(cwd) {
4100
- const runDir = buildDefaultRunDir(cwd);
4831
+ function buildDefaultOutputPathForExperiment(cwd, experiment) {
4832
+ const runDir = buildDefaultRunDir(cwd, experiment);
4101
4833
  mkdirSync(runDir, { recursive: true });
4102
- return path15.join(runDir, "index.jsonl");
4834
+ return path17.join(runDir, "index.jsonl");
4103
4835
  }
4104
4836
  function createProgressReporter(maxWorkers, options) {
4105
4837
  const display = new ProgressDisplay(maxWorkers, options);
@@ -4113,7 +4845,7 @@ function createProgressReporter(maxWorkers, options) {
4113
4845
  };
4114
4846
  }
4115
4847
  function makeTestCaseKey(testFilePath, testId) {
4116
- return `${path15.resolve(testFilePath)}::${testId}`;
4848
+ return `${path17.resolve(testFilePath)}::${testId}`;
4117
4849
  }
4118
4850
  function createDisplayIdTracker() {
4119
4851
  const map = /* @__PURE__ */ new Map();
@@ -4169,7 +4901,7 @@ async function prepareFileMetadata(params) {
4169
4901
  repoRoot,
4170
4902
  verbose: options.verbose
4171
4903
  });
4172
- const relativePath = path15.relative(cwd, testFilePath);
4904
+ const relativePath = path17.relative(cwd, testFilePath);
4173
4905
  const category = deriveCategory(relativePath);
4174
4906
  const suite = await loadTestSuite(testFilePath, repoRoot, {
4175
4907
  verbose: options.verbose,
@@ -4194,7 +4926,7 @@ async function prepareFileMetadata(params) {
4194
4926
  selections = [
4195
4927
  {
4196
4928
  selection: transcriptSelection,
4197
- inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
4929
+ inlineTargetLabel: `transcript (${path17.basename(options.transcript)})`
4198
4930
  }
4199
4931
  ];
4200
4932
  } else {
@@ -4430,32 +5162,36 @@ async function runEvalCommand(input) {
4430
5162
  );
4431
5163
  }
4432
5164
  const repoRoot = await findRepoRoot(cwd);
4433
- const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
5165
+ const yamlConfig = await loadConfig(path17.join(cwd, "_"), repoRoot);
4434
5166
  if (yamlConfig?.required_version) {
4435
5167
  await enforceRequiredVersion(yamlConfig.required_version, {
4436
5168
  strict: normalizeBoolean(input.rawOptions.strict)
4437
5169
  });
4438
5170
  }
4439
5171
  let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
5172
+ if (!process.env.AGENTV_EXPERIMENT) {
5173
+ process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
5174
+ }
4440
5175
  if (options.graderTarget === "agentv" && !options.model) {
4441
5176
  throw new Error("--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)");
4442
5177
  }
4443
5178
  let retryNonErrorResults;
4444
5179
  if (options.retryErrors) {
4445
- const retryPath = path15.resolve(options.retryErrors);
5180
+ const retryPath = path17.resolve(options.retryErrors);
4446
5181
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
5182
+ const completedIds = await loadFullyCompletedTestIds(retryPath);
4447
5183
  const errorIds = await loadErrorTestIds(retryPath);
4448
- if (errorIds.length === 0) {
4449
- console.log("No execution errors found in the previous output. Nothing to retry.");
4450
- return;
4451
- }
4452
- console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
4453
- const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
4454
- options = { ...options, filter: filterPattern };
4455
5184
  retryNonErrorResults = await loadNonErrorResults(retryPath);
5185
+ if (errorIds.length > 0) {
5186
+ console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
5187
+ }
5188
+ if (completedIds.length > 0) {
5189
+ options = { ...options, filter: buildExclusionFilter(completedIds) };
5190
+ console.log(`Skipping ${completedIds.length} already-completed test(s).`);
5191
+ }
4456
5192
  }
4457
5193
  if (options.workspacePath) {
4458
- const resolvedWorkspace = path15.resolve(options.workspacePath);
5194
+ const resolvedWorkspace = path17.resolve(options.workspacePath);
4459
5195
  try {
4460
5196
  const { stat: stat2 } = await import("node:fs/promises");
4461
5197
  const stats = await stat2(resolvedWorkspace);
@@ -4496,25 +5232,25 @@ async function runEvalCommand(input) {
4496
5232
  let outputPath;
4497
5233
  let usesDefaultArtifactWorkspace;
4498
5234
  if (explicitDir) {
4499
- runDir = path15.resolve(explicitDir);
5235
+ runDir = path17.resolve(explicitDir);
4500
5236
  mkdirSync(runDir, { recursive: true });
4501
- outputPath = path15.join(runDir, "index.jsonl");
5237
+ outputPath = path17.join(runDir, "index.jsonl");
4502
5238
  usesDefaultArtifactWorkspace = true;
4503
5239
  } else if (options.outPath) {
4504
- outputPath = path15.resolve(options.outPath);
4505
- runDir = path15.dirname(outputPath);
5240
+ outputPath = path17.resolve(options.outPath);
5241
+ runDir = path17.dirname(outputPath);
4506
5242
  mkdirSync(runDir, { recursive: true });
4507
5243
  usesDefaultArtifactWorkspace = false;
4508
5244
  } else {
4509
- outputPath = buildDefaultOutputPath(cwd);
4510
- runDir = path15.dirname(outputPath);
5245
+ outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
5246
+ runDir = path17.dirname(outputPath);
4511
5247
  usesDefaultArtifactWorkspace = true;
4512
5248
  }
4513
5249
  let otelExporter = null;
4514
5250
  const useFileExport = !!options.otelFile;
4515
5251
  if (options.exportOtel || useFileExport) {
4516
5252
  try {
4517
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-DDFE3W2A.js");
5253
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-HNSXNRVK.js");
4518
5254
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4519
5255
  let headers = {};
4520
5256
  if (options.otelBackend) {
@@ -4538,7 +5274,7 @@ async function runEvalCommand(input) {
4538
5274
  headers,
4539
5275
  captureContent,
4540
5276
  groupTurns: options.otelGroupTurns,
4541
- otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0
5277
+ otlpFilePath: options.otelFile ? path17.resolve(options.otelFile) : void 0
4542
5278
  });
4543
5279
  const initialized = await otelExporter.init();
4544
5280
  if (!initialized) {
@@ -4555,7 +5291,7 @@ async function runEvalCommand(input) {
4555
5291
  }
4556
5292
  }
4557
5293
  const primaryWritePath = outputPath;
4558
- const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
5294
+ const resolvedExportPaths = options.exportPaths.map((p) => path17.resolve(p));
4559
5295
  console.log(`Artifact directory: ${runDir}`);
4560
5296
  if (resolvedExportPaths.length > 0) {
4561
5297
  console.log("Export files:");
@@ -4563,12 +5299,13 @@ async function runEvalCommand(input) {
4563
5299
  console.log(` ${p}`);
4564
5300
  }
4565
5301
  }
4566
- const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
5302
+ const resolvedTestFiles = input.testFiles.map((file) => path17.resolve(file));
4567
5303
  if (options.otelFile) {
4568
- console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
5304
+ console.log(`OTLP JSON file: ${path17.resolve(options.otelFile)}`);
4569
5305
  }
4570
5306
  const evaluationRunner = await resolveEvaluationRunner();
4571
5307
  const allResults = [];
5308
+ const remoteEvalSummaries = [];
4572
5309
  const seenTestCases = /* @__PURE__ */ new Set();
4573
5310
  const displayIdTracker = createDisplayIdTracker();
4574
5311
  const totalWorkers = options.workers ?? DEFAULT_WORKERS;
@@ -4609,7 +5346,7 @@ async function runEvalCommand(input) {
4609
5346
  for (const [testFilePath, meta] of fileMetadata.entries()) {
4610
5347
  if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
4611
5348
  fileMetadata.delete(testFilePath);
4612
- skippedFiles.push(path15.relative(cwd, testFilePath));
5349
+ skippedFiles.push(path17.relative(cwd, testFilePath));
4613
5350
  }
4614
5351
  }
4615
5352
  if (skippedFiles.length > 0 && options.verbose) {
@@ -4630,7 +5367,7 @@ async function runEvalCommand(input) {
4630
5367
  cliNoCache: options.noCache,
4631
5368
  yamlCache: yamlCacheEnabled
4632
5369
  });
4633
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
5370
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path17.resolve(yamlCachePath) : void 0) : void 0;
4634
5371
  if (cacheEnabled) {
4635
5372
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
4636
5373
  }
@@ -4651,6 +5388,10 @@ async function runEvalCommand(input) {
4651
5388
  }
4652
5389
  }
4653
5390
  if (totalEvalCount === 0) {
5391
+ if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
5392
+ console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
5393
+ return;
5394
+ }
4654
5395
  throw new Error("No tests matched the provided filters.");
4655
5396
  }
4656
5397
  const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
@@ -4708,7 +5449,7 @@ async function runEvalCommand(input) {
4708
5449
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
4709
5450
  let transcriptProviderFactory;
4710
5451
  if (options.transcript) {
4711
- const { TranscriptProvider } = await import("./dist-DDFE3W2A.js");
5452
+ const { TranscriptProvider } = await import("./dist-HNSXNRVK.js");
4712
5453
  const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
4713
5454
  const totalTests = [...fileMetadata.values()].reduce(
4714
5455
  (sum, meta) => sum + meta.testCases.length,
@@ -4767,11 +5508,23 @@ async function runEvalCommand(input) {
4767
5508
  threshold: resolvedThreshold,
4768
5509
  providerFactory: transcriptProviderFactory
4769
5510
  });
5511
+ const evalFile = path17.relative(cwd, testFilePath);
5512
+ const existingSummary = remoteEvalSummaries.find(
5513
+ (summary2) => summary2.evalFile === evalFile
5514
+ );
5515
+ if (existingSummary) {
5516
+ existingSummary.results.push(...result.results);
5517
+ } else {
5518
+ remoteEvalSummaries.push({
5519
+ evalFile,
5520
+ results: [...result.results]
5521
+ });
5522
+ }
4770
5523
  return result.results;
4771
5524
  } catch (fileError) {
4772
5525
  const message = fileError instanceof Error ? fileError.message : String(fileError);
4773
5526
  console.error(`
4774
- \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
5527
+ \u26A0 Eval file failed: ${path17.basename(testFilePath)} \u2014 ${message}
4775
5528
  `);
4776
5529
  const errorResults = applicableTestCases.map((testCase) => ({
4777
5530
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -4818,7 +5571,7 @@ async function runEvalCommand(input) {
4818
5571
  console.log(formatMatrixSummary(allResults));
4819
5572
  }
4820
5573
  if (options.benchmarkJson && allResults.length > 0) {
4821
- const benchmarkPath = path15.resolve(options.benchmarkJson);
5574
+ const benchmarkPath = path17.resolve(options.benchmarkJson);
4822
5575
  await writeBenchmarkJson(benchmarkPath, allResults);
4823
5576
  console.log(`Benchmark written to: ${benchmarkPath}`);
4824
5577
  }
@@ -4830,7 +5583,8 @@ async function runEvalCommand(input) {
4830
5583
  benchmarkPath: workspaceBenchmarkPath,
4831
5584
  indexPath
4832
5585
  } = await writeArtifactsFromResults(allResults, runDir, {
4833
- evalFile
5586
+ evalFile,
5587
+ experiment: normalizeExperimentName(options.experiment)
4834
5588
  });
4835
5589
  console.log(`Artifact workspace written to: ${runDir}`);
4836
5590
  console.log(` Index: ${indexPath}`);
@@ -4849,7 +5603,7 @@ async function runEvalCommand(input) {
4849
5603
  await writer.close();
4850
5604
  }
4851
5605
  console.log(
4852
- `Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
5606
+ `Export file(s) written: ${resolvedExportPaths.map((p) => path17.relative(cwd, p)).join(", ")}`
4853
5607
  );
4854
5608
  }
4855
5609
  const failedWithWorkspaces = allResults.filter(
@@ -4865,11 +5619,29 @@ async function runEvalCommand(input) {
4865
5619
  console.log(`
4866
5620
  Results written to: ${outputPath}`);
4867
5621
  await saveRunCache(cwd, outputPath).catch(() => void 0);
5622
+ await maybeAutoExportRunArtifacts({
5623
+ cwd,
5624
+ run_dir: runDir,
5625
+ test_files: activeTestFiles,
5626
+ results: allResults,
5627
+ eval_summaries: remoteEvalSummaries.map((summary2) => ({
5628
+ eval_file: summary2.evalFile,
5629
+ total: summary2.results.length,
5630
+ passed: summary2.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
5631
+ avg_score: summary2.results.length > 0 ? summary2.results.reduce((sum, result) => sum + result.score, 0) / summary2.results.length : 0,
5632
+ results: summary2.results.map((result) => ({
5633
+ test_id: result.testId,
5634
+ score: result.score,
5635
+ status: result.executionStatus === "execution_error" || result.error ? "ERROR" : result.score >= DEFAULT_THRESHOLD ? "PASS" : "FAIL"
5636
+ }))
5637
+ })),
5638
+ experiment: normalizeExperimentName(options.experiment)
5639
+ });
4868
5640
  }
4869
5641
  if (summary.executionErrorCount > 0 && !options.retryErrors) {
4870
- const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
5642
+ const evalFileArgs = activeTestFiles.map((f) => path17.relative(cwd, f)).join(" ");
4871
5643
  const targetFlag = options.target ? ` --target ${options.target}` : "";
4872
- const relativeOutputPath = path15.relative(cwd, outputPath);
5644
+ const relativeOutputPath = path17.relative(cwd, outputPath);
4873
5645
  console.log(
4874
5646
  `
4875
5647
  Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
@@ -4903,7 +5675,7 @@ async function resolveEvaluationRunner() {
4903
5675
  if (!overridePath) {
4904
5676
  return runEvaluation;
4905
5677
  }
4906
- const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
5678
+ const resolved = path17.isAbsolute(overridePath) ? overridePath : path17.resolve(process.cwd(), overridePath);
4907
5679
  const moduleUrl = pathToFileURL(resolved).href;
4908
5680
  const mod = await import(moduleUrl);
4909
5681
  const candidate = mod.runEvaluation;
@@ -4916,11 +5688,11 @@ async function resolveEvaluationRunner() {
4916
5688
  }
4917
5689
 
4918
5690
  // src/commands/eval/discover.ts
4919
- import path16 from "node:path";
5691
+ import path18 from "node:path";
4920
5692
  import fg2 from "fast-glob";
4921
5693
  async function discoverEvalFiles(cwd) {
4922
5694
  const repoRoot = await findRepoRoot(cwd);
4923
- const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
5695
+ const config = await loadConfig(path18.join(cwd, "_"), repoRoot);
4924
5696
  const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
4925
5697
  const ignore = ["**/node_modules/**", "**/dist/**"];
4926
5698
  const matches = await fg2(patterns, {
@@ -4932,7 +5704,7 @@ async function discoverEvalFiles(cwd) {
4932
5704
  caseSensitiveMatch: false
4933
5705
  });
4934
5706
  const evalFiles = matches.map((absPath) => {
4935
- const relativePath = path16.relative(cwd, absPath);
5707
+ const relativePath = path18.relative(cwd, absPath);
4936
5708
  const category = deriveCategory(relativePath);
4937
5709
  return { path: absPath, relativePath, category };
4938
5710
  });
@@ -4956,21 +5728,36 @@ export {
4956
5728
  package_default,
4957
5729
  toSnakeCaseDeep,
4958
5730
  RESULT_INDEX_FILENAME,
4959
- RESULT_RUNS_DIRNAME,
4960
5731
  buildDefaultRunDir,
4961
- resolveExistingRunPrimaryPath,
4962
- resolveWorkspaceOrFilePath,
4963
5732
  resolveRunManifestPath,
4964
5733
  parseResultManifest,
4965
5734
  resolveResultSourcePath,
4966
5735
  loadManifestResults,
4967
5736
  loadLightweightResults,
4968
5737
  HtmlWriter,
5738
+ resolveEvalPaths,
5739
+ findRepoRoot,
5740
+ c,
5741
+ padRight,
5742
+ padLeft,
5743
+ loadResultFile,
5744
+ getTraceSummary,
5745
+ getTraceSpans,
5746
+ toTraceSummary,
5747
+ listResultFiles,
5748
+ formatNumber,
5749
+ formatDuration,
5750
+ formatCost,
5751
+ formatSize,
5752
+ formatScore,
5753
+ getRemoteResultsStatus,
5754
+ syncRemoteResults,
5755
+ listMergedResultFiles,
5756
+ findRunById,
5757
+ maybeAutoExportRunArtifacts,
4969
5758
  writeArtifactsFromResults,
4970
5759
  resolveRunCacheFile,
4971
5760
  loadRunCache,
4972
- resolveEvalPaths,
4973
- findRepoRoot,
4974
5761
  detectFileType,
4975
5762
  validateEvalFile,
4976
5763
  validateTargetsFile,
@@ -4984,4 +5771,4 @@ export {
4984
5771
  getCategories,
4985
5772
  filterByCategory
4986
5773
  };
4987
- //# sourceMappingURL=chunk-2IKIOZ4Z.js.map
5774
+ //# sourceMappingURL=chunk-FH24D7XW.js.map