agentv 4.10.0 → 4.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XOSNETAV.js → chunk-BAUNAXHT.js} +1 -1
- package/dist/chunk-BPGJ4HBU.js +183 -0
- package/dist/chunk-BPGJ4HBU.js.map +1 -0
- package/dist/{chunk-KF6BABQ5.js → chunk-FH24D7XW.js} +1090 -303
- package/dist/chunk-FH24D7XW.js.map +1 -0
- package/dist/{chunk-SE73HJZG.js → chunk-FQGY6QXQ.js} +780 -346
- package/dist/chunk-FQGY6QXQ.js.map +1 -0
- package/dist/chunk-NPVGBFF6.js +151 -0
- package/dist/chunk-NPVGBFF6.js.map +1 -0
- package/dist/{chunk-VA64NETD.js → chunk-QRYAMYT7.js} +1120 -731
- package/dist/chunk-QRYAMYT7.js.map +1 -0
- package/dist/cli.js +6 -4
- package/dist/cli.js.map +1 -1
- package/dist/{dist-XDNB4WDT.js → dist-HNSXNRVK.js} +36 -3
- package/dist/docker-workspace-RPPXBT27-B4AQHVWA.js +11 -0
- package/dist/{esm-CZAWIY6F.js → esm-UYZ3HJBU.js} +2 -2
- package/dist/esm-UYZ3HJBU.js.map +1 -0
- package/dist/exec-AR6JUUN5-6MBPURPR.js +11 -0
- package/dist/exec-AR6JUUN5-6MBPURPR.js.map +1 -0
- package/dist/index.js +6 -4
- package/dist/{interactive-SNKK6VCV.js → interactive-SIOZB665.js} +6 -4
- package/dist/{interactive-SNKK6VCV.js.map → interactive-SIOZB665.js.map} +1 -1
- package/dist/{src-ML4D2MC2.js → src-PXDA7QIS.js} +2 -2
- package/dist/studio/assets/index-Bi-KHfNm.js +65 -0
- package/dist/studio/assets/index-D_j-w4UO.css +1 -0
- package/dist/studio/assets/{index-DcwjOyrk.js → index-VyDFrnoK.js} +1 -1
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-KF6BABQ5.js.map +0 -1
- package/dist/chunk-SE73HJZG.js.map +0 -1
- package/dist/chunk-VA64NETD.js.map +0 -1
- package/dist/studio/assets/index-DHxVz6M9.css +0 -1
- package/dist/studio/assets/index-Y5InSvcS.js +0 -65
- /package/dist/{chunk-XOSNETAV.js.map → chunk-BAUNAXHT.js.map} +0 -0
- /package/dist/{dist-XDNB4WDT.js.map → dist-HNSXNRVK.js.map} +0 -0
- /package/dist/{esm-CZAWIY6F.js.map → docker-workspace-RPPXBT27-B4AQHVWA.js.map} +0 -0
- /package/dist/{src-ML4D2MC2.js.map → src-PXDA7QIS.js.map} +0 -0
|
@@ -9,10 +9,14 @@ import {
|
|
|
9
9
|
ResponseCache,
|
|
10
10
|
buildDirectoryChain,
|
|
11
11
|
buildSearchRoots,
|
|
12
|
+
commitAndPushResultsBranch,
|
|
13
|
+
createDraftResultsPr,
|
|
12
14
|
deriveCategory,
|
|
15
|
+
directorySizeBytes,
|
|
13
16
|
ensureVSCodeSubagents,
|
|
14
17
|
findDeprecatedCamelCaseTargetWarnings,
|
|
15
18
|
findGitRoot,
|
|
19
|
+
getResultsRepoStatus,
|
|
16
20
|
interpolateEnv,
|
|
17
21
|
isEvaluatorKind,
|
|
18
22
|
listTargetNames,
|
|
@@ -21,23 +25,28 @@ import {
|
|
|
21
25
|
loadTestSuite,
|
|
22
26
|
loadTsConfig,
|
|
23
27
|
normalizeLineEndings,
|
|
28
|
+
prepareResultsRepoBranch,
|
|
24
29
|
readTargetDefinitions,
|
|
25
30
|
readTestSuiteMetadata,
|
|
26
31
|
resolveFileReference,
|
|
32
|
+
resolveResultsRepoRunsDir,
|
|
27
33
|
resolveTargetDefinition,
|
|
28
34
|
runEvaluation,
|
|
29
35
|
shouldEnableCache,
|
|
30
36
|
shouldSkipCacheForTemperature,
|
|
37
|
+
stageResultsArtifacts,
|
|
31
38
|
subscribeToCodexLogEntries,
|
|
32
39
|
subscribeToCopilotCliLogEntries,
|
|
33
40
|
subscribeToCopilotSdkLogEntries,
|
|
34
|
-
subscribeToPiLogEntries
|
|
35
|
-
|
|
41
|
+
subscribeToPiLogEntries,
|
|
42
|
+
syncResultsRepo,
|
|
43
|
+
toCamelCaseDeep
|
|
44
|
+
} from "./chunk-FQGY6QXQ.js";
|
|
36
45
|
|
|
37
46
|
// package.json
|
|
38
47
|
var package_default = {
|
|
39
48
|
name: "agentv",
|
|
40
|
-
version: "4.
|
|
49
|
+
version: "4.11.0",
|
|
41
50
|
description: "CLI entry point for AgentV",
|
|
42
51
|
type: "module",
|
|
43
52
|
repository: {
|
|
@@ -249,7 +258,7 @@ async function discoverTargetsFile(options) {
|
|
|
249
258
|
// src/commands/eval/run-eval.ts
|
|
250
259
|
import { constants as constants4, mkdirSync } from "node:fs";
|
|
251
260
|
import { access as access4 } from "node:fs/promises";
|
|
252
|
-
import
|
|
261
|
+
import path17 from "node:path";
|
|
253
262
|
import { pathToFileURL } from "node:url";
|
|
254
263
|
|
|
255
264
|
// src/version-check.ts
|
|
@@ -306,45 +315,43 @@ async function promptContinue() {
|
|
|
306
315
|
return confirm({ message: "Continue anyway?", default: false });
|
|
307
316
|
}
|
|
308
317
|
|
|
309
|
-
// src/commands/
|
|
310
|
-
import
|
|
311
|
-
import path4 from "node:path";
|
|
318
|
+
// src/commands/results/remote.ts
|
|
319
|
+
import path6 from "node:path";
|
|
312
320
|
|
|
313
|
-
// src/utils
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
return str;
|
|
317
|
-
}
|
|
318
|
-
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
319
|
-
}
|
|
320
|
-
function toSnakeCaseDeep(obj) {
|
|
321
|
-
if (obj === null || obj === void 0) {
|
|
322
|
-
return obj;
|
|
323
|
-
}
|
|
324
|
-
if (Array.isArray(obj)) {
|
|
325
|
-
return obj.map((item) => toSnakeCaseDeep(item));
|
|
326
|
-
}
|
|
327
|
-
if (typeof obj === "object") {
|
|
328
|
-
const result = {};
|
|
329
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
330
|
-
const snakeKey = toSnakeCase(key);
|
|
331
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
332
|
-
}
|
|
333
|
-
return result;
|
|
334
|
-
}
|
|
335
|
-
return obj;
|
|
336
|
-
}
|
|
321
|
+
// src/commands/inspect/utils.ts
|
|
322
|
+
import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
|
|
323
|
+
import path5 from "node:path";
|
|
337
324
|
|
|
338
325
|
// src/commands/eval/result-layout.ts
|
|
339
326
|
import { existsSync, statSync } from "node:fs";
|
|
340
327
|
import path3 from "node:path";
|
|
341
328
|
// File name of a run's manifest; it is read as JSONL (one JSON record per line).
var RESULT_INDEX_FILENAME = "index.jsonl";
// Directory name, below ".agentv/results", under which run workspaces are created.
var RESULT_RUNS_DIRNAME = "runs";
// Experiment name substituted when the caller supplies none (or only whitespace).
var DEFAULT_EXPERIMENT_NAME = "default";
|
|
331
|
+
/**
 * Validate an experiment name so it can be used as a single directory name.
 *
 * @param {string | null | undefined} experiment - Name supplied by the caller.
 * @returns {string} The trimmed name, or DEFAULT_EXPERIMENT_NAME when the input
 *   is missing, empty, or whitespace only.
 * @throws {Error} When the name contains characters other than letters, digits,
 *   ".", "_" and "-", or when it is exactly "." or "..".
 */
function normalizeExperimentName(experiment) {
  const trimmed = experiment?.trim();
  if (!trimmed) {
    return DEFAULT_EXPERIMENT_NAME;
  }
  // "." and ".." satisfy the character whitelist but are directory references:
  // joined into a run path they would resolve to the parent instead of naming
  // a folder, so they are rejected explicitly.
  if (trimmed === "." || trimmed === "..") {
    throw new Error(
      `Invalid experiment name "${trimmed}". "." and ".." are not allowed as experiment names.`
    );
  }
  if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
    throw new Error(
      `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
    );
  }
  return trimmed;
}
|
|
343
343
|
/**
 * Build a directory name from a timestamp.
 *
 * The ISO-8601 string is used with every ":" and "." replaced by "-".
 *
 * @param {Date} [timestamp] - Moment to encode; defaults to the current time.
 * @returns {string} For example "1970-01-01T00-00-00-000Z".
 */
function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
  const isoText = timestamp.toISOString();
  return isoText.replace(/[:.]/g, "-");
}
|
|
346
|
-
function buildDefaultRunDir(cwd) {
|
|
347
|
-
return path3.join(
|
|
346
|
+
/**
 * Compute the default workspace directory for a run:
 * `<cwd>/.agentv/results/<runs dir>/<experiment>/<timestamp>`.
 *
 * @param {string} cwd - Directory that contains the ".agentv" folder.
 * @param {string} [experiment] - Experiment name; validated by normalizeExperimentName.
 * @param {Date} [timestamp] - Moment used for the final path segment; defaults to now.
 * @returns {string} The joined path.
 * @throws {Error} Whatever normalizeExperimentName throws for an invalid name.
 */
function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
  const experimentDir = normalizeExperimentName(experiment);
  const runDirName = createRunDirName(timestamp);
  return path3.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, experimentDir, runDirName);
}
|
|
349
356
|
function resolveRunIndexPath(runDir) {
|
|
350
357
|
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
@@ -366,26 +373,794 @@ function isDirectoryPath(filePath) {
|
|
|
366
373
|
return false;
|
|
367
374
|
}
|
|
368
375
|
}
|
|
369
|
-
function resolveWorkspaceOrFilePath(filePath) {
|
|
370
|
-
if (!isDirectoryPath(filePath)) {
|
|
371
|
-
return filePath;
|
|
372
|
-
}
|
|
373
|
-
const existing = resolveExistingRunPrimaryPath(filePath);
|
|
374
|
-
if (!existing) {
|
|
375
|
-
throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
|
|
376
|
+
/**
 * Map a run workspace directory to its primary result file; pass anything
 * that is not a directory through untouched.
 *
 * @param {string} filePath - A directory or a file path.
 * @returns {string} The primary file found in the directory, or `filePath` itself.
 * @throws {Error} When `filePath` is a directory with no primary result file.
 */
function resolveWorkspaceOrFilePath(filePath) {
  if (isDirectoryPath(filePath)) {
    const primaryPath = resolveExistingRunPrimaryPath(filePath);
    if (primaryPath) {
      return primaryPath;
    }
    throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
  }
  return filePath;
}
|
|
386
|
+
/**
 * Resolve a run reference to a manifest path.
 *
 * @param {string} filePath - A run workspace directory or a manifest file.
 * @returns {string} The manifest path.
 * @throws {Error} When `filePath` is neither a directory nor accepted by
 *   isRunManifestPath, or when the directory has no primary result file.
 */
function resolveRunManifestPath(filePath) {
  if (isDirectoryPath(filePath)) {
    return resolveWorkspaceOrFilePath(filePath);
  }
  if (isRunManifestPath(filePath)) {
    return filePath;
  }
  throw new Error(
    `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
  );
}
|
|
397
|
+
|
|
398
|
+
// src/commands/results/manifest.ts
|
|
399
|
+
import { existsSync as existsSync2, readFileSync } from "node:fs";
|
|
400
|
+
import path4 from "node:path";
|
|
401
|
+
/**
 * Parse JSONL text: one JSON value per non-blank line.
 *
 * Lines are split on LF or CRLF and trimmed before parsing.
 *
 * @param {string} content - Raw file contents.
 * @returns {unknown[]} Parsed values in file order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
function parseJsonlLines(content) {
  const records = [];
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length > 0) {
      records.push(JSON.parse(line));
    }
  }
  return records;
}
|
|
404
|
+
/**
 * Split a transcript written as `@[role]:` header lines followed by message
 * bodies into `{ role, content }` objects.
 *
 * @param {string} content - Transcript text.
 * @returns {{ role: string, content: string }[]} One entry per header, in
 *   order; an empty array when the trimmed text does not start with "@[".
 */
function parseMarkdownMessages(content) {
  const trimmed = content.trim();
  if (!trimmed.startsWith("@[")) {
    return [];
  }
  // A body runs until the next header line or the end of the whole text.
  // `(?![\s\S])` asserts end of input: a plain `$` would, under the `m` flag,
  // match at the end of every line and cut each body after its first line.
  const messagePattern = /^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[.+?\]:\n|\s*(?![\s\S]))/gm;
  return [...trimmed.matchAll(messagePattern)].map((match) => ({
    role: match[1],
    content: match[2].trimEnd()
  }));
}
|
|
415
|
+
/**
 * Read a UTF-8 text file addressed relative to `baseDir`, if it exists.
 *
 * @param {string} baseDir - Directory the relative path is joined onto.
 * @param {string | undefined} relativePath - Path below `baseDir`; falsy means "none".
 * @returns {string | undefined} File contents, or undefined when no path was
 *   given or the file does not exist.
 */
function readOptionalText(baseDir, relativePath) {
  if (relativePath) {
    const target = path4.join(baseDir, relativePath);
    if (existsSync2(target)) {
      return readFileSync(target, "utf8");
    }
  }
  return void 0;
}
|
|
425
|
+
/**
 * Read and parse an optional JSON file addressed relative to `baseDir`.
 *
 * @param {string} baseDir - Directory the relative path is joined onto.
 * @param {string | undefined} relativePath - Path below `baseDir`.
 * @returns {unknown} The parsed value, or undefined when the file is absent,
 *   empty, or not valid JSON.
 */
function readOptionalJson(baseDir, relativePath) {
  const raw = readOptionalText(baseDir, relativePath);
  if (raw) {
    try {
      return JSON.parse(raw);
    } catch {
      // Malformed JSON is deliberately treated the same as a missing file.
    }
  }
  return void 0;
}
|
|
436
|
+
/**
 * Load the input transcript referenced by `record.input_path`.
 *
 * @param {string} baseDir - Directory containing the manifest.
 * @param {{ input_path?: string }} record - Manifest record.
 * @returns {{ role: string, content: string }[] | undefined} Parsed messages;
 *   a single "user" message holding the whole text when it has no `@[role]:`
 *   headers; undefined when the file is missing or empty.
 */
function hydrateInput(baseDir, record) {
  const inputText = readOptionalText(baseDir, record.input_path);
  if (!inputText) {
    return void 0;
  }
  const parsed = parseMarkdownMessages(inputText);
  if (parsed.length > 0) {
    return parsed;
  }
  return [{ role: "user", content: inputText.trimEnd() }];
}
|
|
444
|
+
/**
 * Load the response transcript referenced by `record.output_path`, falling
 * back to `record.response_path`.
 *
 * @param {string} baseDir - Directory containing the manifest.
 * @param {{ output_path?: string, response_path?: string }} record - Manifest record.
 * @returns {{ role: string, content: string }[] | undefined} Parsed messages;
 *   a single "assistant" message holding the whole text when it has no
 *   `@[role]:` headers; undefined when the file is missing or empty.
 */
function hydrateOutput(baseDir, record) {
  const responsePath = record.output_path ?? record.response_path;
  const responseText = readOptionalText(baseDir, responsePath);
  if (!responseText) {
    return void 0;
  }
  const parsed = parseMarkdownMessages(responseText);
  if (parsed.length === 0) {
    return [{ role: "assistant", content: responseText.trimEnd() }];
  }
  return parsed.map(({ role, content }) => ({ role, content }));
}
|
|
458
|
+
/**
 * Expand one manifest record into a full result object by reading the side
 * files it references (grading, timing, input, output) from `baseDir`.
 *
 * Values from the side files win; when a side file is missing or lacks a
 * field, the inline value on the record is used where one exists.
 *
 * @param {string} baseDir - Directory containing the manifest.
 * @param {object} record - One parsed manifest line (snake_case keys).
 * @returns {object} Result with camelCase keys.
 */
function hydrateManifestRecord(baseDir, record) {
  const grading = readOptionalJson(baseDir, record.grading_path);
  const timing = readOptionalJson(baseDir, record.timing_path);
  const testId = record.test_id ?? "unknown";
  // A grading file may omit these arrays (or hold something else there);
  // that must not abort loading the whole manifest.
  const gradingAssertions = Array.isArray(grading?.assertions) ? grading.assertions : void 0;
  const gradingEvaluators = Array.isArray(grading?.evaluators) ? grading.evaluators : void 0;
  return {
    timestamp: record.timestamp,
    testId,
    suite: record.suite,
    category: record.category,
    target: record.target,
    score: record.score,
    executionStatus: record.execution_status,
    error: record.error,
    assertions: gradingAssertions?.map((assertion) => ({
      text: assertion.text,
      passed: assertion.passed,
      evidence: assertion.evidence
    })),
    // Per-evaluator scores come from the grading file, else from the record.
    scores: gradingEvaluators?.map((evaluator) => ({
      name: evaluator.name,
      type: evaluator.type,
      score: evaluator.score,
      assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
        text: String(assertion.text ?? ""),
        passed: Boolean(assertion.passed),
        evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
      })) : void 0,
      weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
      verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
      details: evaluator.details
    })) ?? record.scores,
    tokenUsage: timing?.token_usage ? {
      input: timing.token_usage.input,
      output: timing.token_usage.output,
      reasoning: timing.token_usage.reasoning
    } : record.token_usage,
    durationMs: timing?.duration_ms ?? record.duration_ms,
    costUsd: record.cost_usd,
    input: hydrateInput(baseDir, record),
    output: hydrateOutput(baseDir, record)
  };
}
|
|
500
|
+
/**
 * Parse the text of a result manifest (JSONL) into its records.
 *
 * @param {string} content - Manifest file contents.
 * @returns {object[]} One record per non-blank line.
 */
function parseResultManifest(content) {
  const records = parseJsonlLines(content);
  return records;
}
|
|
503
|
+
/**
 * Turn a user-supplied result source into an absolute path.
 *
 * Relative sources are resolved against `cwd` (or the process working
 * directory). Directories and files named like the manifest are then
 * resolved through resolveRunManifestPath; other files are returned as-is.
 *
 * @param {string} source - Path given by the user.
 * @param {string} [cwd] - Base directory for relative sources.
 * @returns {string} Absolute path of the file to load.
 */
function resolveResultSourcePath(source, cwd) {
  let resolved = source;
  if (!path4.isAbsolute(source)) {
    resolved = path4.resolve(cwd ?? process.cwd(), source);
  }
  const refersToRun = isDirectoryPath(resolved) || path4.basename(resolved) === RESULT_INDEX_FILENAME;
  return refersToRun ? resolveRunManifestPath(resolved) : resolved;
}
|
|
510
|
+
/**
 * Load every record of a run manifest, hydrated with its side files.
 *
 * @param {string} sourceFile - Run workspace directory or manifest path.
 * @returns {object[]} Hydrated results in manifest order.
 */
function loadManifestResults(sourceFile) {
  const manifestPath = resolveRunManifestPath(sourceFile);
  const records = parseResultManifest(readFileSync(manifestPath, "utf8"));
  // Side-file paths inside the records are relative to the manifest's folder.
  const baseDir = path4.dirname(manifestPath);
  const hydrated = [];
  for (const record of records) {
    hydrated.push(hydrateManifestRecord(baseDir, record));
  }
  return hydrated;
}
|
|
517
|
+
/**
 * Load a run manifest without touching any side files, keeping only the
 * fields stored inline on each record.
 *
 * @param {string} sourceFile - Run workspace directory or manifest path.
 * @returns {object[]} One summary object (camelCase keys) per manifest record.
 */
function loadLightweightResults(sourceFile) {
  const manifestPath = resolveRunManifestPath(sourceFile);
  const records = parseResultManifest(readFileSync(manifestPath, "utf8"));
  return records.map((record) => {
    const {
      test_id,
      suite,
      target,
      experiment,
      score,
      scores,
      execution_status,
      error,
      timestamp
    } = record;
    return {
      testId: test_id ?? "unknown",
      suite,
      target,
      experiment,
      score,
      scores,
      executionStatus: execution_status,
      error,
      timestamp
    };
  });
}
|
|
532
|
+
|
|
533
|
+
// src/commands/inspect/utils.ts
|
|
534
|
+
// ANSI SGR escape sequences, keyed by style name.
var colors = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  dim: "\x1B[2m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  gray: "\x1B[90m"
};
// Styling is off when stdout is not a TTY or the NO_COLOR variable is set
// (to any value, including the empty string).
var noColor = !process.stdout.isTTY || process.env.NO_COLOR !== void 0;
// Palette to use for output: same keys as `colors`, every value "" when styling is off.
var c = noColor ? Object.fromEntries(Object.keys(colors).map((name) => [name, ""])) : colors;
// Matches one SGR sequence (ESC "[" digits/semicolons "m"). ESC is produced with
// fromCharCode so the pattern source holds no literal control character.
var ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
|
|
547
|
+
/**
 * Remove every sequence matched by `ansiPattern` from a string.
 *
 * @param {string} str - Text that may contain escape sequences.
 * @returns {string} The text without them.
 */
function stripAnsi(str) {
  const plain = str.replace(ansiPattern, "");
  return plain;
}
|
|
550
|
+
/**
 * Pad with trailing spaces up to `len` visible characters; the width is
 * measured after stripAnsi so escape sequences do not count.
 *
 * @param {string} str - Text to pad.
 * @param {number} len - Target visible width.
 * @returns {string} `str`, unchanged if already at least `len` wide.
 */
function padRight(str, len) {
  const visibleWidth = stripAnsi(str).length;
  const missing = Math.max(0, len - visibleWidth);
  return str + " ".repeat(missing);
}
|
|
554
|
+
/**
 * Pad with leading spaces up to `len` visible characters; the width is
 * measured after stripAnsi so escape sequences do not count.
 *
 * @param {string} str - Text to pad.
 * @param {number} len - Target visible width.
 * @returns {string} `str`, unchanged if already at least `len` wide.
 */
function padLeft(str, len) {
  const visibleWidth = stripAnsi(str).length;
  const missing = Math.max(0, len - visibleWidth);
  return " ".repeat(missing) + str;
}
|
|
558
|
+
/**
 * Load raw results from a path, choosing the loader by file name:
 * ".json" → OTLP trace file, the manifest file name → hydrated manifest,
 * anything else → flat JSONL records.
 *
 * @param {string} filePath - Result file or run workspace directory.
 * @returns {object[]} Raw (snake_case) result records.
 */
function loadResultFile(filePath) {
  const target = resolveTraceResultPath(filePath);
  if (path5.extname(target) === ".json") {
    return loadOtlpTraceFile(target);
  }
  const isManifest = path5.basename(target) === RESULT_INDEX_FILENAME;
  return isManifest ? loadManifestAsRawResults(target) : loadJsonlRecords(target);
}
|
|
568
|
+
/**
 * Resolve a trace/result reference: a workspace directory becomes its primary
 * result file, any other path is returned unchanged.
 *
 * @param {string} filePath - File or directory path.
 * @returns {string} Path of the file to load.
 */
function resolveTraceResultPath(filePath) {
  const resolved = resolveWorkspaceOrFilePath(filePath);
  return resolved;
}
|
|
571
|
+
/**
 * Read a flat JSONL result file and check that every record has a numeric score.
 *
 * @param {string} filePath - Path of the JSONL file.
 * @returns {object[]} Parsed records in file order.
 * @throws {SyntaxError} When a line is not valid JSON.
 * @throws {Error} When a record's `score` is not a number. The reported line
 *   number counts non-blank lines only.
 */
function loadJsonlRecords(filePath) {
  const text = readFileSync2(filePath, "utf8");
  const nonBlankLines = text.trim().split("\n").filter((line) => line.trim());
  const records = [];
  nonBlankLines.forEach((line, index) => {
    const record = JSON.parse(line);
    if (typeof record.score !== "number") {
      throw new Error(`Missing or invalid score in result at line ${index + 1}: ${line.slice(0, 100)}`);
    }
    records.push(record);
  });
  return records;
}
|
|
582
|
+
/**
 * Load a manifest and convert each hydrated result to the raw snake_case shape.
 *
 * @param {string} filePath - Manifest path or run workspace directory.
 * @returns {object[]} Raw result records.
 */
function loadManifestAsRawResults(filePath) {
  const hydrated = loadManifestResults(filePath);
  return hydrated.map(toRawResult);
}
|
|
585
|
+
/**
 * Convert a camelCase result into the raw snake_case record shape.
 *
 * Only the fields listed here are carried over. Assertions are copied down to
 * `text`, `passed` and `evidence`. Token usage is copied as `input`, `output`
 * and `cached`.
 *
 * @param {object} result - Result with camelCase keys.
 * @returns {object} Record with snake_case keys.
 */
function toRawResult(result) {
  const copyAssertion = ({ text, passed, evidence }) => ({ text, passed, evidence });
  const usage = result.tokenUsage;
  return {
    timestamp: result.timestamp,
    test_id: result.testId,
    suite: result.suite,
    conversation_id: result.conversationId,
    score: result.score,
    assertions: result.assertions?.map((assertion) => copyAssertion(assertion)),
    target: result.target,
    error: result.error,
    scores: result.scores?.map((entry) => ({
      name: entry.name,
      type: entry.type,
      score: entry.score,
      assertions: entry.assertions?.map((assertion) => copyAssertion(assertion)),
      weight: entry.weight
    })),
    token_usage: usage ? { input: usage.input, output: usage.output, cached: usage.cached } : void 0,
    cost_usd: result.costUsd,
    duration_ms: result.durationMs,
    start_time: result.startTime,
    end_time: result.endTime,
    input: result.input,
    output: result.output,
    file_changes: result.fileChanges
  };
}
|
|
624
|
+
/**
 * Load an OTLP JSON trace export and derive one raw result per root span.
 *
 * Root spans are spans without a parent, or whose parent is not in the file.
 * When at least one root satisfies isAgentvEvalRoot only those roots are
 * used; otherwise every root is used.
 *
 * @param {string} filePath - Path of the OTLP JSON file.
 * @returns {object[]} Raw result records; empty when the file has no spans.
 * @throws {Error} When a selected root span has no numeric `agentv.score` attribute.
 */
function loadOtlpTraceFile(filePath) {
  const document = JSON.parse(readFileSync2(filePath, "utf8"));
  const allSpans = document.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
  if (!allSpans || allSpans.length === 0) {
    return [];
  }
  // Index spans by id and group them by parent id.
  const spansById = /* @__PURE__ */ new Map();
  const childrenByParent = /* @__PURE__ */ new Map();
  for (const span of allSpans) {
    if (!span.spanId) continue;
    spansById.set(span.spanId, span);
    if (!span.parentSpanId) continue;
    const siblings = childrenByParent.get(span.parentSpanId) ?? [];
    siblings.push(span);
    childrenByParent.set(span.parentSpanId, siblings);
  }
  const rootSpans = allSpans.filter((span) => !(span.parentSpanId && spansById.has(span.parentSpanId)));
  const agentvRoots = rootSpans.filter(isAgentvEvalRoot);
  const selectedRoots = agentvRoots.length > 0 ? agentvRoots : rootSpans;
  return selectedRoots.map((root, index) => {
    const descendants = collectChildSpans(root.spanId, childrenByParent);
    const rootAttrs = parseOtlpAttributes(root.attributes);
    const parsedDescendants = descendants.map((span) => ({
      ...span,
      parsedAttributes: parseOtlpAttributes(span.attributes)
    }));
    const toolSpans = parsedDescendants.filter(
      (span) => typeof span.parsedAttributes.gen_ai_tool_name === "string"
    );
    const llmSpans = parsedDescendants.filter(
      (span) => span.parsedAttributes.gen_ai_operation_name === "chat" || typeof span.name === "string" && span.name.startsWith("chat ")
    );
    // Sum token counts over all descendants; `cached` stays undefined unless
    // some span reports a positive cache-read count.
    const tokenUsage = { input: 0, output: 0, cached: void 0 };
    for (const span of descendants) {
      const attrs = parseOtlpAttributes(span.attributes);
      tokenUsage.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0;
      tokenUsage.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0;
      const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens);
      if (cached !== void 0 && cached > 0) {
        tokenUsage.cached = (tokenUsage.cached ?? 0) + cached;
      }
    }
    // Totals recorded on the root span are used where the descendants give nothing.
    const rootTokenInput = numberAttr(rootAttrs.agentv_trace_token_input);
    const rootTokenOutput = numberAttr(rootAttrs.agentv_trace_token_output);
    const rootTokenCached = numberAttr(rootAttrs.agentv_trace_token_cached);
    const hasTokenUsage = tokenUsage.input || tokenUsage.output || tokenUsage.cached || rootTokenInput || rootTokenOutput || rootTokenCached;
    const toolSpanNames = toolSpans.map((span) => ({
      type: "tool",
      name: String(span.parsedAttributes.gen_ai_tool_name)
    }));
    const traceSummary = buildDerivedTraceSummary({
      trace: {
        event_count: numberAttr(rootAttrs.agentv_trace_event_count) ?? (toolSpans.length > 0 ? toolSpans.length : void 0),
        tool_calls: countRawSpanNames(toolSpanNames),
        // OTLP status code 2 marks an error.
        error_count: descendants.filter((span) => span.status?.code === 2).length || void 0,
        llm_call_count: numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? (llmSpans.length > 0 ? llmSpans.length : void 0)
      },
      spans: [
        ...llmSpans.map((span) => ({
          type: "llm",
          name: span.name ?? "chat",
          duration_ms: durationFromSpan(span)
        })),
        ...toolSpans.map((span) => ({
          type: "tool",
          name: String(span.parsedAttributes.gen_ai_tool_name),
          duration_ms: durationFromSpan(span)
        }))
      ],
      duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root),
      cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd),
      token_usage: hasTokenUsage ? {
        input: tokenUsage.input || rootTokenInput || 0,
        output: tokenUsage.output || rootTokenOutput || 0,
        ...tokenUsage.cached || rootTokenCached ? {
          cached: tokenUsage.cached || rootTokenCached || 0
        } : {}
      } : void 0
    });
    const score = numberAttr(rootAttrs.agentv_score);
    if (score === void 0) {
      throw new Error(
        `Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`
      );
    }
    // Per-grader scores are carried as events on the root span.
    const graderScores = root.events?.filter(
      (event) => event.name?.startsWith("agentv.grader.") || event.name?.startsWith("agentv.evaluator.")
    ).map((event) => {
      const attrs = parseOtlpAttributes(event.attributes);
      const name = event.name?.replace(/^agentv\.grader\./, "").replace(/^agentv\.evaluator\./, "") ?? "unknown";
      return {
        name,
        type: stringAttr(attrs.agentv_grader_type) ?? stringAttr(attrs.agentv_evaluator_type) ?? "unknown",
        score: numberAttr(attrs.agentv_grader_score) ?? numberAttr(attrs.agentv_evaluator_score) ?? 0
      };
    });
    return {
      test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
      suite: stringAttr(rootAttrs.agentv_suite),
      target: stringAttr(rootAttrs.agentv_target),
      score,
      error: root.status?.code === 2 ? root.status.message : void 0,
      cost_usd: traceSummary?.cost_usd,
      duration_ms: traceSummary?.duration_ms,
      token_usage: traceSummary?.token_usage,
      trace: traceSummary ? {
        event_count: traceSummary.event_count,
        tool_calls: traceSummary.tool_calls,
        error_count: traceSummary.error_count,
        tool_durations: traceSummary.tool_durations,
        llm_call_count: traceSummary.llm_call_count,
        token_usage: traceSummary.token_usage,
        cost_usd: traceSummary.cost_usd,
        duration_ms: traceSummary.duration_ms
      } : void 0,
      spans: traceSummary?.spans,
      output: stringAttr(rootAttrs.agentv_output_text),
      scores: graderScores
    };
  });
}
|
|
745
|
+
/**
 * Decide whether a span looks like an AgentV eval root: it is named
 * "agentv.eval", or has a numeric `agentv.score` attribute, or has a string
 * `agentv.test_id` attribute.
 *
 * @param {object} span - OTLP span.
 * @returns {boolean} True when any of the three conditions holds.
 */
function isAgentvEvalRoot(span) {
  const attrs = parseOtlpAttributes(span.attributes);
  if (span.name === "agentv.eval") {
    return true;
  }
  if (numberAttr(attrs.agentv_score) !== void 0) {
    return true;
  }
  return typeof stringAttr(attrs.agentv_test_id) === "string";
}
|
|
749
|
+
/**
 * Collect every descendant of a span from a parent-id → children map.
 *
 * The direct children come first, followed by each child's own descendants
 * in child order.
 *
 * @param {string | undefined} spanId - Id of the span whose descendants are wanted.
 * @param {Map<string, object[]>} childMap - Children grouped by parent span id.
 * @returns {object[]} Descendant spans; empty when `spanId` is falsy or has no children.
 */
function collectChildSpans(spanId, childMap) {
  if (!spanId) return [];
  const children = childMap.get(spanId) ?? [];
  const deeper = children.flatMap((child) => collectChildSpans(child.spanId, childMap));
  return [...children, ...deeper];
}
|
|
758
|
+
/**
 * Flatten an OTLP attribute list into a plain object.
 *
 * Every "." in a key becomes "_" (so "agentv.score" is read as
 * `agentv_score`); values are unwrapped with parseOtlpValue.
 *
 * @param {{ key: string, value: object }[] | undefined} attributes - OTLP attributes.
 * @returns {Record<string, unknown>} Parsed attributes; empty for a missing list.
 */
function parseOtlpAttributes(attributes) {
  const parsed = {};
  (attributes ?? []).forEach((attribute) => {
    const flatKey = attribute.key.replace(/\./g, "_");
    parsed[flatKey] = parseOtlpValue(attribute.value);
  });
  return parsed;
}
|
|
765
|
+
/**
 * Unwrap an OTLP "AnyValue" object into a plain JavaScript value.
 *
 * Handles stringValue, intValue (converted with Number), doubleValue,
 * boolValue and arrayValue (recursively). Any other shape yields undefined.
 *
 * @param {object | undefined} value - OTLP value wrapper.
 * @returns {unknown} The unwrapped value, or undefined.
 */
function parseOtlpValue(value) {
  if (!value) return void 0;
  const keepAsIs = (inner) => inner;
  // Checked in this order; the first field that is present and defined wins.
  const scalarFields = [
    ["stringValue", keepAsIs],
    ["intValue", Number],
    ["doubleValue", keepAsIs],
    ["boolValue", keepAsIs]
  ];
  for (const [field, convert] of scalarFields) {
    if (field in value && value[field] !== void 0) {
      return convert(value[field]);
    }
  }
  if ("arrayValue" in value) {
    return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry));
  }
  return void 0;
}
|
|
775
|
+
/**
 * Compute a span's duration in whole milliseconds from its nanosecond
 * start and end timestamps.
 *
 * @param {{ startTimeUnixNano?: string | number, endTimeUnixNano?: string | number }} span
 * @returns {number | undefined} Rounded duration, or undefined when either
 *   timestamp does not convert to a finite number.
 */
function durationFromSpan(span) {
  const startNs = Number(span.startTimeUnixNano);
  const endNs = Number(span.endTimeUnixNano);
  const bothFinite = Number.isFinite(startNs) && Number.isFinite(endNs);
  return bothFinite ? Math.round((endNs - startNs) / 1e6) : void 0;
}
|
|
781
|
+
/**
 * Narrow a parsed attribute to a string.
 *
 * @param {unknown} value - Parsed attribute value.
 * @returns {string | undefined} The value if it is a string, else undefined.
 */
function stringAttr(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
|
|
784
|
+
/**
 * Narrow a parsed attribute to a finite number.
 *
 * @param {unknown} value - Parsed attribute value.
 * @returns {number | undefined} The value if it is a finite number, else
 *   undefined (NaN, infinities and non-numbers are rejected).
 */
function numberAttr(value) {
  if (typeof value !== "number") {
    return void 0;
  }
  return Number.isFinite(value) ? value : void 0;
}
|
|
787
|
+
/**
 * Build a trace summary for a raw result, preferring values stored under
 * `result.trace` and deriving the rest from `result.spans` and the
 * top-level usage fields.
 *
 * @param {object} result - Raw result (snake_case keys).
 * @returns {object | undefined} Summary including `spans`, or undefined when
 *   the result has no trace, no spans, and no token/cost/duration data.
 */
function buildDerivedTraceSummary(result) {
  const { trace, spans } = result;
  const spanList = spans ?? [];
  const toolSpans = spanList.filter((span) => span.type === "tool");
  const llmSpans = spanList.filter((span) => span.type === "llm");
  const toolCalls = trace?.tool_calls ?? countRawSpanNames(toolSpans);
  const toolDurations = trace?.tool_durations ?? groupRawSpanDurations(toolSpans);
  // Counts are only derived from spans when there are spans to count.
  const hasSpanData = (spans?.length ?? 0) > 0;
  const eventCount = trace?.event_count ?? (hasSpanData ? toolSpans.length : void 0);
  const llmCallCount = trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : void 0);
  const hasNothingToSummarize = !trace && !spans?.length && result.token_usage === void 0 && result.cost_usd === void 0 && result.duration_ms === void 0;
  if (hasNothingToSummarize) {
    return void 0;
  }
  return {
    event_count: eventCount,
    tool_calls: toolCalls,
    error_count: trace?.error_count,
    tool_durations: toolDurations,
    llm_call_count: llmCallCount,
    token_usage: trace?.token_usage ?? result.token_usage,
    cost_usd: trace?.cost_usd ?? result.cost_usd,
    duration_ms: trace?.duration_ms ?? result.duration_ms,
    spans
  };
}
|
|
810
|
+
/**
 * Count how many spans carry each name.
 *
 * @param {{ name: string }[]} spans - Spans to tally.
 * @returns {Record<string, number> | undefined} Name → occurrence count, or
 *   undefined when `spans` is empty.
 */
function countRawSpanNames(spans) {
  // Tally in a Map: indexing a plain object would pick up inherited members,
  // so a span named "toString" or "constructor" would start from a function
  // instead of 0 and produce a garbage string.
  const tally = /* @__PURE__ */ new Map();
  for (const span of spans) {
    const name = String(span.name);
    tally.set(name, (tally.get(name) ?? 0) + 1);
  }
  return tally.size > 0 ? Object.fromEntries(tally) : void 0;
}
|
|
817
|
+
/**
 * Group span durations by span name.
 *
 * Spans whose `duration_ms` is undefined are skipped.
 *
 * @param {{ name: string, duration_ms?: number }[]} spans - Spans to group.
 * @returns {Record<string, number[]> | undefined} Name → durations in input
 *   order, or undefined when no span has a duration.
 */
function groupRawSpanDurations(spans) {
  // Group in a Map: indexing a plain object would pick up inherited members,
  // so a span named "constructor" would find a function instead of an array
  // and the push below would throw.
  const grouped = /* @__PURE__ */ new Map();
  for (const span of spans) {
    if (span.duration_ms === void 0) continue;
    const name = String(span.name);
    const durations = grouped.get(name) ?? [];
    durations.push(span.duration_ms);
    grouped.set(name, durations);
  }
  return grouped.size > 0 ? Object.fromEntries(grouped) : void 0;
}
|
|
827
|
+
/**
 * Return the derived trace summary of a raw result without its `spans` list.
 *
 * @param {object} result - Raw result.
 * @returns {object | undefined} Summary fields, or undefined when nothing
 *   could be derived.
 */
function getTraceSummary(result) {
  const summary = buildDerivedTraceSummary(result);
  if (!summary) return void 0;
  const withoutSpans = Object.entries(summary).filter(([key]) => key !== "spans");
  return Object.fromEntries(withoutSpans);
}
|
|
833
|
+
/**
 * Return the spans attached to a raw result's derived trace summary.
 *
 * @param {object} result - Raw result.
 * @returns {object[]} The spans, or an empty array when there are none.
 */
function getTraceSpans(result) {
  const summary = buildDerivedTraceSummary(result);
  return summary?.spans ?? [];
}
|
|
836
|
+
/**
 * Return a raw result's trace summary passed through toCamelCaseDeep.
 *
 * @param {object} result - Raw result.
 * @returns {object | undefined} Converted summary, or undefined when the
 *   result has no trace data.
 */
function toTraceSummary(result) {
  const rawTrace = getTraceSummary(result);
  return rawTrace ? toCamelCaseDeep(rawTrace) : void 0;
}
|
|
841
|
+
/**
 * Derive a run id from a run directory path relative to the runs folder.
 *
 * - "<timestamp>"                → "<timestamp>"
 * - "default/<timestamp>"        → "<timestamp>"
 * - "<experiment...>/<timestamp>" → "<experiment...>::<timestamp>"
 *
 * @param {string} relativeRunPath - Path using the platform separator or "/".
 * @returns {string | undefined} The run id; undefined for an empty path.
 */
function buildRunId(relativeRunPath) {
  const normalized = relativeRunPath.split(path5.sep).join("/");
  const segments = normalized.split("/").filter(Boolean);
  if (segments.length < 2) {
    return segments[0];
  }
  const timestamp = segments.at(-1);
  const experiment = segments.slice(0, -1).join("/");
  return experiment === "default" ? timestamp ?? normalized : `${experiment}::${timestamp}`;
}
|
|
854
|
+
/**
 * Walk `currentDir` looking for run workspaces and append one entry per
 * workspace to `files`.
 *
 * A directory with a primary result file is recorded and not descended into;
 * any other directory has its subdirectories searched recursively.
 *
 * @param {string} runsDir - Root of the runs tree, used to derive run ids.
 * @param {string} currentDir - Directory to inspect.
 * @param {{ filePath: string, displayName: string, runId: string }[]} files -
 *   Output array, mutated in place.
 * @returns {void}
 */
function collectRunManifestPaths(runsDir, currentDir, files) {
  const manifestPath = resolveExistingRunPrimaryPath(currentDir);
  if (!manifestPath) {
    for (const entry of readdirSync(currentDir, { withFileTypes: true })) {
      if (!entry.isDirectory()) continue;
      collectRunManifestPaths(runsDir, path5.join(currentDir, entry.name), files);
    }
    return;
  }
  files.push({
    filePath: manifestPath,
    displayName: path5.basename(currentDir),
    runId: buildRunId(path5.relative(runsDir, currentDir))
  });
}
|
|
872
|
+
/**
 * List the runs stored under `runsDir` with summary statistics for each.
 *
 * Runs are ordered by directory name, descending. When `limit` is a positive
 * number only the first `limit` runs are loaded.
 *
 * @param {string} runsDir - Root of the runs tree.
 * @param {number} [limit] - Maximum number of runs to return.
 * @returns {object[]} One metadata object per run that could be loaded.
 */
function listResultFilesFromRunsDir(runsDir, limit) {
  const discovered = [];
  try {
    for (const entry of readdirSync(runsDir, { withFileTypes: true })) {
      if (!entry.isDirectory()) continue;
      collectRunManifestPaths(runsDir, path5.join(runsDir, entry.name), discovered);
    }
  } catch {
    // Best effort: on any error while scanning, continue with whatever was
    // collected before it (nothing, if the runs directory itself is missing).
  }
  discovered.sort((a, b) => b.displayName.localeCompare(a.displayName));
  const selected = limit !== void 0 && limit > 0 ? discovered.slice(0, limit) : discovered;
  const metas = [];
  for (const run of selected) {
    const { filePath, displayName, runId } = run;
    try {
      const fileStat = statSync2(filePath);
      const results = loadResultFile(filePath);
      const testCount = results.length;
      const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
      const scoreTotal = results.reduce((sum, r) => sum + r.score, 0);
      // Prefer the timestamp encoded in the directory name, then the first record's.
      const timestamp = extractTimestampFromFilename(displayName) ?? results[0]?.timestamp ?? "unknown";
      metas.push({
        path: filePath,
        filename: runId,
        displayName,
        timestamp,
        testCount,
        passRate: testCount > 0 ? passCount / testCount : 0,
        avgScore: testCount > 0 ? scoreTotal / testCount : 0,
        sizeBytes: fileStat.size
      });
    } catch {
      // A run that cannot be read or parsed is left out of the listing.
    }
  }
  return metas;
}
|
|
911
|
+
/**
 * Lists run summaries stored under `<cwd>/.agentv/results/<RESULT_RUNS_DIRNAME>`.
 *
 * @param {string} cwd - Project directory that contains the `.agentv` folder.
 * @param {number} [limit] - Forwarded unchanged to `listResultFilesFromRunsDir`.
 * @returns {Array<object>} Whatever `listResultFilesFromRunsDir` returns for that directory.
 */
function listResultFiles(cwd, limit) {
  const localRunsDir = path5.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME);
  return listResultFilesFromRunsDir(localRunsDir, limit);
}
|
|
917
|
+
/**
 * Pulls the run timestamp out of a result file or directory name.
 *
 * The name must end with `YYYY-MM-DDTHH-MM-SS-mmmZ` (optionally followed by
 * `.jsonl`), and that stamp must sit at the very start of the name or directly
 * after `eval_`. The dash-separated time part is rewritten to ISO form
 * (`HH:MM:SS.mmmZ`).
 *
 * @param {string} filename - Name to inspect.
 * @returns {string | undefined} ISO-8601 timestamp, or `undefined` when the name does not match.
 */
function extractTimestampFromFilename(filename) {
  const namePattern = /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/;
  const timeTailPattern = /-(\d{2})-(\d{2})-(\d{3})Z$/;
  const found = filename.match(namePattern);
  if (found) {
    const dashedStamp = found[1];
    return dashedStamp.replace(timeTailPattern, ":$1:$2.$3Z");
  }
  return void 0;
}
|
|
924
|
+
/**
 * Formats a number with the runtime's default locale conventions.
 *
 * @param {number} n - Value to format.
 * @returns {string} Result of `n.toLocaleString()`.
 */
function formatNumber(n) {
  const localized = n.toLocaleString();
  return localized;
}
|
|
927
|
+
/**
 * Renders a duration in milliseconds as a short human-readable string.
 *
 * - below one second: whole milliseconds, e.g. `250ms`
 * - below one minute: seconds with one decimal, e.g. `1.5s`
 * - otherwise: minutes and whole seconds, e.g. `1m30s`
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Formatted duration.
 */
function formatDuration(ms) {
  if (ms < 1e3) return `${Math.round(ms)}ms`;
  if (ms < 6e4) return `${(ms / 1e3).toFixed(1)}s`;
  // Round the total first and split afterwards; rounding only the remainder
  // could produce "60" seconds (e.g. 119999 ms -> "1m60s").
  const totalSeconds = Math.round(ms / 1e3);
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}m${seconds}s`;
}
|
|
934
|
+
/**
 * Formats a US-dollar amount with a leading `$`.
 *
 * Amounts below one cent get four decimals; everything else gets three.
 *
 * @param {number} usd - Cost in US dollars.
 * @returns {string} Formatted amount, e.g. `$0.0050` or `$1.500`.
 */
function formatCost(usd) {
  const decimals = usd < 0.01 ? 4 : 3;
  return `$${usd.toFixed(decimals)}`;
}
|
|
938
|
+
/**
 * Formats a byte count using binary units (1 KB = 1024 B, 1 MB = 1024 KB).
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} `<n>B` below 1024, otherwise one-decimal `KB` or `MB`.
 */
function formatSize(bytes) {
  const kib = 1024;
  const mib = kib * kib;
  if (bytes < kib) {
    return `${bytes}B`;
  }
  const [divisor, unit] = bytes < mib ? [kib, "KB"] : [mib, "MB"];
  return `${(bytes / divisor).toFixed(1)}${unit}`;
}
|
|
943
|
+
/**
 * Formats a 0..1 score as a whole-number percentage.
 *
 * @param {number} score - Fractional score.
 * @returns {string} Percentage with no decimals, e.g. `75%`.
 */
function formatScore(score) {
  const percent = score * 100;
  return `${percent.toFixed(0)}%`;
}
|
|
946
|
+
|
|
947
|
+
// src/commands/results/remote.ts
|
|
948
|
+
// Prefix that `encodeRemoteRunId` puts in front of a run's filename.
var REMOTE_RUN_PREFIX = "remote::";
// Run-directory size (10 MiB) above which `maybeWarnLargeArtifact` prints a warning.
var SIZE_WARNING_BYTES = 10 * 2 ** 20;
|
|
950
|
+
/**
 * Turns a caught value into a printable message.
 *
 * @param {unknown} error - Anything that was thrown.
 * @returns {string} `error.message` for `Error` instances, otherwise `String(error)`.
 */
function getStatusMessage(error) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
|
953
|
+
/**
 * Applies defaults to a results-export configuration.
 *
 * @param {object} config - Raw export configuration.
 * @returns {{repo: *, path: *, auto_push: boolean, branch_prefix: string}}
 *   `repo` and `path` are copied as-is; `auto_push` is `true` only when the
 *   input is exactly `true`; `branch_prefix` is trimmed and falls back to
 *   `"eval-results"` when missing or blank.
 */
function normalizeResultsExportConfig(config) {
  const {
    repo,
    path: exportPath,
    auto_push: autoPush,
    branch_prefix: branchPrefix
  } = config;
  const trimmedPrefix = branchPrefix?.trim();
  return {
    repo,
    path: exportPath,
    auto_push: autoPush === true,
    branch_prefix: trimmedPrefix || "eval-results"
  };
}
|
|
961
|
+
/**
 * Reduces arbitrary text to a slug.
 *
 * Steps, in order: trim; replace every run of characters outside
 * `A-Z a-z 0-9 . _ / -` with a single `-`; collapse repeated `/`; strip
 * leading and trailing `-`; cut to at most 120 characters.
 *
 * @param {string} value - Text to slugify.
 * @returns {string} The slug (possibly empty).
 */
function slugify(value) {
  const trimmed = value.trim();
  const dashed = trimmed.replace(/[^A-Za-z0-9._/-]+/g, "-");
  const singleSlashes = dashed.replace(/\/+/g, "/");
  const withoutEdgeDashes = singleSlashes.replace(/^-+|-+$/g, "");
  return withoutEdgeDashes.slice(0, 120);
}
|
|
964
|
+
/**
 * Computes the path of a run directory relative to
 * `<cwd>/.agentv/results/runs`.
 *
 * When `runDir` lies outside that root (the relative path climbs out with a
 * `..` segment, or is absolute), the result falls back to
 * `<parent directory name>/<run directory name>`, or just the run directory
 * name when both names are equal or the parent name is empty.
 *
 * @param {string} cwd - Project directory containing `.agentv`.
 * @param {string} runDir - Run directory to express relatively.
 * @returns {string} Relative run path.
 */
function getRelativeRunPath(cwd, runDir) {
  const runsRoot = path6.join(cwd, ".agentv", "results", "runs");
  const relative = path6.relative(runsRoot, runDir);
  // Only a real ".." segment leaves the root; a plain prefix test would also
  // reject ordinary directory names that merely begin with two dots ("..a").
  const escapesRoot = relative === ".." || relative.startsWith(`..${path6.sep}`);
  if (!escapesRoot && !path6.isAbsolute(relative)) {
    return relative;
  }
  const experiment = path6.basename(path6.dirname(runDir));
  const runName = path6.basename(runDir);
  return experiment && experiment !== runName ? path6.join(experiment, runName) : runName;
}
|
|
973
|
+
/**
 * Builds the git branch name used for exporting one run.
 *
 * Shape: `<config.branch_prefix>/<slug of "experiment-evalStem-timestamp">`,
 * where `timestamp` is the base name of `payload.run_dir`, `experiment`
 * defaults to `"default"`, and `evalStem` is the single eval file's name
 * without its extension (or `<n>-evals` when there is not exactly one file).
 * If the slug comes out empty the raw timestamp is used instead.
 *
 * @param {{branch_prefix: string}} config - Normalised export configuration.
 * @param {{run_dir: string, test_files: string[], experiment?: string}} payload - Run description.
 * @returns {string} Branch name.
 */
function buildBranchName(config, payload) {
  const timestamp = path6.basename(payload.run_dir);
  let evalStem;
  if (payload.test_files.length === 1) {
    const fileName = path6.basename(payload.test_files[0]);
    const withoutEvalSuffix = fileName.replace(/\.eval\.ya?ml$/i, "");
    // Strip a generic extension only when the ".eval.yaml" suffix was absent;
    // doing both turned "suite.v2.eval.yaml" into "suite" instead of "suite.v2".
    evalStem = withoutEvalSuffix !== fileName ? withoutEvalSuffix : fileName.replace(/\.[^.]+$/i, "");
  } else {
    evalStem = `${payload.test_files.length}-evals`;
  }
  const experiment = slugify(payload.experiment ?? "default");
  const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
  return `${config.branch_prefix}/${branchLeaf}`;
}
|
|
980
|
+
/**
 * Builds the commit (and PR) title for an exported run.
 *
 * Format: `feat(results): <experiment> - <passed>/<total> PASS (<avg>)`, where
 * a result counts as passed when its score is at least `DEFAULT_THRESHOLD`,
 * `<avg>` is the mean score with three decimals (0 for an empty run) and the
 * experiment defaults to `"default"`.
 *
 * @param {{results: Array<{score: number}>, experiment?: string}} payload - Run description.
 * @returns {string} Commit title.
 */
function buildCommitTitle(payload) {
  const { results } = payload;
  const total = results.length;
  let passed = 0;
  let scoreSum = 0;
  for (const result of results) {
    if (result.score >= DEFAULT_THRESHOLD) {
      passed += 1;
    }
    scoreSum += result.score;
  }
  const avgScore = total > 0 ? scoreSum / total : 0;
  const experiment = payload.experiment ?? "default";
  return `feat(results): ${experiment} - ${passed}/${total} PASS (${avgScore.toFixed(3)})`;
}
|
|
986
|
+
/**
 * Builds the Markdown body of the draft pull request for an exported run.
 *
 * The body has a `## Results` heading, one section per entry of
 * `payload.eval_summaries` (heading, pass summary and a Test/Score/Status
 * table), and a footer naming the run, the experiment (default `"default"`)
 * and the eval files.
 *
 * @param {object} payload - Run description with `eval_summaries`, `run_dir`,
 *   `experiment` and `test_files`.
 * @returns {string} Markdown text.
 */
function buildPrBody(payload) {
  // Table cells must not contain raw pipes or line breaks, otherwise the row
  // is split into extra columns or cut short.
  const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const sections = payload.eval_summaries.map((summary) => {
    const table = summary.results.map((result) => `| ${cell(result.test_id)} | ${result.score.toFixed(3)} | ${cell(result.status)} |`).join("\n");
    return [
      `### ${summary.eval_file}`,
      "",
      `Summary: ${summary.passed}/${summary.total} PASS (${summary.avg_score.toFixed(3)})`,
      "",
      "| Test | Score | Status |",
      "|---|---|---|",
      // An empty result list still yields a well-formed table.
      table || "| (no results) | 0.000 | ERROR |"
    ].join("\n");
  }).join("\n\n");
  return [
    "## Results",
    "",
    sections,
    "",
    `Run: ${path6.basename(payload.run_dir)}`,
    `Experiment: ${payload.experiment ?? "default"}`,
    `Eval Files: ${payload.test_files.join(", ")}`
  ].join("\n");
}
|
|
1009
|
+
/**
 * Prints a warning when a run directory is larger than `SIZE_WARNING_BYTES`.
 * The export itself is not blocked.
 *
 * @param {string} runDir - Run directory measured with `directorySizeBytes`.
 * @returns {Promise<void>}
 */
async function maybeWarnLargeArtifact(runDir) {
  const totalBytes = await directorySizeBytes(runDir);
  if (!(totalBytes > SIZE_WARNING_BYTES)) {
    return;
  }
  const megabytes = (totalBytes / (1024 * 1024)).toFixed(1);
  console.warn(`Warning: run artifacts total ${megabytes}MB. Export will continue.`);
}
|
|
1017
|
+
/**
 * Loads the project configuration and returns its normalised
 * `results.export` section.
 *
 * @param {string} cwd - Directory to load configuration for; also used as the
 *   repository root when `findRepoRoot` returns null/undefined.
 * @returns {Promise<object | undefined>} Normalised export configuration, or
 *   `undefined` when no `results.export` section is configured.
 */
async function loadNormalizedResultsConfig(cwd) {
  const repoRoot = await findRepoRoot(cwd) ?? cwd;
  // NOTE(review): "_" looks like a placeholder file name inside `cwd` so that
  // `loadConfig` starts from that directory - confirm against `loadConfig`.
  const loaded = await loadConfig(path6.join(cwd, "_"), repoRoot);
  const exportSection = loaded?.results?.export;
  return exportSection ? normalizeResultsExportConfig(exportSection) : void 0;
}
|
|
1025
|
+
/**
 * Builds the id used for a remote run by prefixing its filename with
 * `REMOTE_RUN_PREFIX`.
 *
 * @param {string} filename - Run filename as listed in the results repository.
 * @returns {string} Prefixed run id.
 */
function encodeRemoteRunId(filename) {
  const remoteRunId = `${REMOTE_RUN_PREFIX}${filename}`;
  return remoteRunId;
}
|
|
1028
|
+
/**
 * Reports the state of the remote results repository plus the number of runs
 * it currently holds.
 *
 * @param {string} cwd - Project directory whose configuration is consulted.
 * @returns {Promise<object>} The fields of `getResultsRepoStatus(config)` with
 *   an added `run_count`; the count is 0 when export is not configured or the
 *   status is not `available`.
 */
async function getRemoteResultsStatus(cwd) {
  const config = await loadNormalizedResultsConfig(cwd);
  const status = getResultsRepoStatus(config);
  let runCount = 0;
  if (config && status.available) {
    const remoteRunsDir = resolveResultsRepoRunsDir(config);
    runCount = listResultFilesFromRunsDir(remoteRunsDir).length;
  }
  return { ...status, run_count: runCount };
}
|
|
1037
|
+
/**
 * Synchronises the remote results repository and reports its status.
 *
 * @param {string} cwd - Project directory whose configuration is consulted.
 * @returns {Promise<object>} Status object with `run_count`. Without an export
 *   configuration the status of `getResultsRepoStatus()` is returned with a
 *   count of 0; when `syncResultsRepo` fails, the status additionally carries
 *   `last_error` and a count of 0; otherwise the result of
 *   `getRemoteResultsStatus(cwd)`.
 */
async function syncRemoteResults(cwd) {
  const config = await loadNormalizedResultsConfig(cwd);
  if (!config) {
    const unconfigured = getResultsRepoStatus();
    return { ...unconfigured, run_count: 0 };
  }
  try {
    await syncResultsRepo(config);
  } catch (error) {
    // A failed sync is reported through the status object instead of being thrown.
    const failed = getResultsRepoStatus(config);
    return { ...failed, run_count: 0, last_error: getStatusMessage(error) };
  }
  return getRemoteResultsStatus(cwd);
}
|
|
1056
|
+
/**
 * Lists local and remote runs as one collection, newest timestamp first.
 *
 * Local entries get `source: "local"`; remote entries get `source: "remote"`
 * and a `filename` encoded with `encodeRemoteRunId`. Both keep the unencoded
 * name in `raw_filename`. Remote runs are included only when export is
 * configured and the results repository status is `available`.
 *
 * The configuration is loaded once and the remote runs directory is scanned
 * once; `remote_status.run_count` is taken from that single listing rather
 * than from a second scan.
 *
 * @param {string} cwd - Project directory.
 * @param {number} [limit] - Maximum number of runs to return; ignored unless > 0.
 * @returns {Promise<{runs: Array<object>, remote_status: object}>}
 */
async function listMergedResultFiles(cwd, limit) {
  const applyLimit = (runs) => limit !== void 0 && limit > 0 ? runs.slice(0, limit) : runs;
  const localRuns = listResultFiles(cwd).map(
    (meta) => ({
      ...meta,
      source: "local",
      raw_filename: meta.filename
    })
  );
  const config = await loadNormalizedResultsConfig(cwd);
  const repoStatus = getResultsRepoStatus(config);
  if (!config || !repoStatus.available) {
    return {
      runs: applyLimit(localRuns),
      remote_status: { ...repoStatus, run_count: 0 }
    };
  }
  const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
    (meta) => ({
      ...meta,
      filename: encodeRemoteRunId(meta.filename),
      raw_filename: meta.filename,
      source: "remote"
    })
  );
  const merged = [...localRuns, ...remoteRuns].sort(
    (a, b) => b.timestamp.localeCompare(a.timestamp)
  );
  return {
    runs: applyLimit(merged),
    remote_status: { ...repoStatus, run_count: remoteRuns.length }
  };
}
|
|
1088
|
+
/**
 * Looks up one run, local or remote, by its id.
 *
 * @param {string} cwd - Project directory.
 * @param {string} runId - Value compared against each run's `filename`.
 * @returns {Promise<object | undefined>} The first matching run, if any.
 */
async function findRunById(cwd, runId) {
  const merged = await listMergedResultFiles(cwd);
  for (const candidate of merged.runs) {
    if (candidate.filename === runId) {
      return candidate;
    }
  }
  return void 0;
}
|
|
1092
|
+
/**
 * Exports a finished run to the results repository when `auto_push` is
 * enabled in the export configuration.
 *
 * The run directory is staged on a dedicated branch, committed and pushed,
 * and a draft pull request is opened. Any failure is downgraded to console
 * warnings, so this function does not reject because of export problems.
 *
 * @param {object} payload - Run description; reads `cwd`, `run_dir` and the
 *   fields used by `buildBranchName`, `buildCommitTitle` and `buildPrBody`.
 * @returns {Promise<void>}
 */
async function maybeAutoExportRunArtifacts(payload) {
  const config = await loadNormalizedResultsConfig(payload.cwd);
  if (!config?.auto_push) {
    return;
  }
  try {
    await maybeWarnLargeArtifact(payload.run_dir);
    const branchName = buildBranchName(config, payload);
    const workspace = await prepareResultsRepoBranch(config, branchName);
    try {
      const runPath = getRelativeRunPath(payload.cwd, payload.run_dir);
      await stageResultsArtifacts({
        repoDir: workspace.repoDir,
        sourceDir: payload.run_dir,
        destinationDir: path6.join(workspace.repoDir, config.path, runPath)
      });
      const title = buildCommitTitle(payload);
      const pushed = await commitAndPushResultsBranch({
        repoDir: workspace.repoDir,
        branchName,
        commitMessage: title
      });
      if (pushed) {
        const prUrl = await createDraftResultsPr({
          repo: config.repo,
          repoDir: workspace.repoDir,
          baseBranch: workspace.baseBranch,
          branchName,
          title,
          body: buildPrBody(payload)
        });
        console.log(`Remote results draft PR created: ${prUrl}`);
      } else {
        console.warn("Warning: results export produced no git changes. Skipping PR creation.");
      }
    } finally {
      // Always release the prepared checkout, whether or not the export succeeded.
      await workspace.cleanup();
    }
  } catch (error) {
    console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
    console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
  }
}
|
|
1136
|
+
|
|
1137
|
+
// src/commands/eval/artifact-writer.ts
|
|
1138
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
1139
|
+
import path7 from "node:path";
|
|
1140
|
+
|
|
1141
|
+
// src/utils/case-conversion.ts
|
|
1142
|
+
/**
 * Converts a camelCase identifier to snake_case.
 *
 * Strings that start with an uppercase ASCII letter are returned unchanged;
 * otherwise every uppercase ASCII letter becomes `_` plus its lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} Converted identifier.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase ? str : str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
}
|
|
379
|
-
function
|
|
380
|
-
if (
|
|
381
|
-
return
|
|
1148
|
+
/**
 * Recursively rewrites object keys with `toSnakeCase`.
 *
 * `null`, `undefined` and non-object values are returned as they are; arrays
 * are mapped element by element; any other object is rebuilt from its own
 * enumerable string-keyed entries with converted keys and recursively
 * converted values.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} Converted copy (or the original primitive).
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  Object.entries(obj).forEach(([key, value]) => {
    converted[toSnakeCase(key)] = toSnakeCaseDeep(value);
  });
  return converted;
}
|
|
390
1165
|
|
|
391
1166
|
// src/commands/eval/artifact-writer.ts
|
|
@@ -524,7 +1299,7 @@ function buildTimingArtifact(results) {
|
|
|
524
1299
|
}
|
|
525
1300
|
};
|
|
526
1301
|
}
|
|
527
|
-
function buildBenchmarkArtifact(results, evalFile = "") {
|
|
1302
|
+
function buildBenchmarkArtifact(results, evalFile = "", experiment) {
|
|
528
1303
|
const targetSet = /* @__PURE__ */ new Set();
|
|
529
1304
|
const testIdSet = /* @__PURE__ */ new Set();
|
|
530
1305
|
for (const result of results) {
|
|
@@ -549,7 +1324,7 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
549
1324
|
tokens: computeStats(tokens)
|
|
550
1325
|
};
|
|
551
1326
|
const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
|
|
552
|
-
if (toolCallCounts.some((
|
|
1327
|
+
if (toolCallCounts.some((c2) => c2 > 0)) {
|
|
553
1328
|
entry.tool_calls = computeStats(toolCallCounts);
|
|
554
1329
|
}
|
|
555
1330
|
const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
|
|
@@ -595,7 +1370,8 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
595
1370
|
eval_file: evalFile,
|
|
596
1371
|
timestamp,
|
|
597
1372
|
targets,
|
|
598
|
-
tests_run: testIds
|
|
1373
|
+
tests_run: testIds,
|
|
1374
|
+
experiment
|
|
599
1375
|
},
|
|
600
1376
|
run_summary: runSummary,
|
|
601
1377
|
per_grader_summary: perEvaluatorSummary,
|
|
@@ -622,7 +1398,7 @@ function buildArtifactSubdir(result) {
|
|
|
622
1398
|
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
623
1399
|
}
|
|
624
1400
|
segments.push(safeTestId(result.testId));
|
|
625
|
-
return
|
|
1401
|
+
return path7.posix.join(...segments);
|
|
626
1402
|
}
|
|
627
1403
|
function formatOutputMarkdown(output) {
|
|
628
1404
|
return output.map((msg) => `@[${msg.role}]:
|
|
@@ -655,11 +1431,11 @@ function buildResultIndexArtifact(result) {
|
|
|
655
1431
|
failure_stage: result.failureStage,
|
|
656
1432
|
failure_reason_code: result.failureReasonCode,
|
|
657
1433
|
workspace_path: result.workspacePath,
|
|
658
|
-
grading_path:
|
|
659
|
-
timing_path:
|
|
660
|
-
input_path: input ?
|
|
661
|
-
output_path: hasResponse ?
|
|
662
|
-
response_path: hasResponse ?
|
|
1434
|
+
grading_path: path7.posix.join(artifactSubdir, "grading.json"),
|
|
1435
|
+
timing_path: path7.posix.join(artifactSubdir, "timing.json"),
|
|
1436
|
+
input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
|
|
1437
|
+
output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
|
|
1438
|
+
response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
|
|
663
1439
|
};
|
|
664
1440
|
}
|
|
665
1441
|
async function writeJsonlFile(filePath, records) {
|
|
@@ -669,18 +1445,18 @@ async function writeJsonlFile(filePath, records) {
|
|
|
669
1445
|
}
|
|
670
1446
|
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
671
1447
|
const testArtifactDir = outputDir;
|
|
672
|
-
const timingPath =
|
|
673
|
-
const benchmarkPath =
|
|
674
|
-
const indexPath =
|
|
1448
|
+
const timingPath = path7.join(outputDir, "timing.json");
|
|
1449
|
+
const benchmarkPath = path7.join(outputDir, "benchmark.json");
|
|
1450
|
+
const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
|
|
675
1451
|
await mkdir(outputDir, { recursive: true });
|
|
676
1452
|
const indexRecords = [];
|
|
677
1453
|
for (const result of results) {
|
|
678
1454
|
const grading = buildGradingArtifact(result);
|
|
679
1455
|
const timing2 = buildTimingArtifact([result]);
|
|
680
1456
|
const artifactSubdir = buildArtifactSubdir(result);
|
|
681
|
-
const testDir =
|
|
682
|
-
const gradingPath =
|
|
683
|
-
const perTestTimingPath =
|
|
1457
|
+
const testDir = path7.join(outputDir, artifactSubdir);
|
|
1458
|
+
const gradingPath = path7.join(testDir, "grading.json");
|
|
1459
|
+
const perTestTimingPath = path7.join(testDir, "timing.json");
|
|
684
1460
|
await mkdir(testDir, { recursive: true });
|
|
685
1461
|
await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
686
1462
|
`, "utf8");
|
|
@@ -688,23 +1464,26 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
688
1464
|
`, "utf8");
|
|
689
1465
|
const input = extractInput(result);
|
|
690
1466
|
if (input) {
|
|
691
|
-
await writeFile(
|
|
1467
|
+
await writeFile(path7.join(testDir, "input.md"), input, "utf8");
|
|
692
1468
|
}
|
|
693
1469
|
if (result.output && result.output.length > 0) {
|
|
694
|
-
const outputsDir =
|
|
1470
|
+
const outputsDir = path7.join(testDir, "outputs");
|
|
695
1471
|
await mkdir(outputsDir, { recursive: true });
|
|
696
1472
|
await writeFile(
|
|
697
|
-
|
|
1473
|
+
path7.join(outputsDir, "response.md"),
|
|
698
1474
|
formatOutputMarkdown(result.output),
|
|
699
1475
|
"utf8"
|
|
700
1476
|
);
|
|
701
1477
|
}
|
|
702
|
-
indexRecords.push(
|
|
1478
|
+
indexRecords.push({
|
|
1479
|
+
...buildResultIndexArtifact(result),
|
|
1480
|
+
experiment: options?.experiment
|
|
1481
|
+
});
|
|
703
1482
|
}
|
|
704
1483
|
const timing = buildTimingArtifact(results);
|
|
705
1484
|
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
706
1485
|
`, "utf8");
|
|
707
|
-
const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
|
|
1486
|
+
const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
|
|
708
1487
|
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
709
1488
|
`, "utf8");
|
|
710
1489
|
await writeJsonlFile(indexPath, indexRecords);
|
|
@@ -758,13 +1537,13 @@ async function writeBenchmarkJson(outputPath, results) {
|
|
|
758
1537
|
// src/commands/eval/env.ts
|
|
759
1538
|
import { constants as constants3 } from "node:fs";
|
|
760
1539
|
import { access as access3 } from "node:fs/promises";
|
|
761
|
-
import
|
|
1540
|
+
import path8 from "node:path";
|
|
762
1541
|
import { config as loadDotenv } from "dotenv";
|
|
763
1542
|
function uniqueDirs(directories) {
|
|
764
1543
|
const seen = /* @__PURE__ */ new Set();
|
|
765
1544
|
const result = [];
|
|
766
1545
|
for (const dir of directories) {
|
|
767
|
-
const absolute =
|
|
1546
|
+
const absolute = path8.resolve(dir);
|
|
768
1547
|
if (seen.has(absolute)) {
|
|
769
1548
|
continue;
|
|
770
1549
|
}
|
|
@@ -783,14 +1562,14 @@ async function fileExists2(filePath) {
|
|
|
783
1562
|
}
|
|
784
1563
|
function collectAncestorDirectories(start, boundary) {
|
|
785
1564
|
const directories = [];
|
|
786
|
-
const boundaryDir =
|
|
787
|
-
let current =
|
|
1565
|
+
const boundaryDir = path8.resolve(boundary);
|
|
1566
|
+
let current = path8.resolve(start);
|
|
788
1567
|
while (current !== void 0) {
|
|
789
1568
|
directories.push(current);
|
|
790
1569
|
if (current === boundaryDir) {
|
|
791
1570
|
break;
|
|
792
1571
|
}
|
|
793
|
-
const parent =
|
|
1572
|
+
const parent = path8.dirname(current);
|
|
794
1573
|
if (parent === current) {
|
|
795
1574
|
break;
|
|
796
1575
|
}
|
|
@@ -800,12 +1579,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
800
1579
|
}
|
|
801
1580
|
async function loadEnvFromHierarchy(options) {
|
|
802
1581
|
const { testFilePath, repoRoot, verbose } = options;
|
|
803
|
-
const testDir =
|
|
1582
|
+
const testDir = path8.dirname(path8.resolve(testFilePath));
|
|
804
1583
|
const cwd = process.cwd();
|
|
805
1584
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
806
1585
|
const envFiles = [];
|
|
807
1586
|
for (const dir of searchDirs) {
|
|
808
|
-
const candidate =
|
|
1587
|
+
const candidate = path8.join(dir, ".env");
|
|
809
1588
|
if (await fileExists2(candidate)) {
|
|
810
1589
|
envFiles.push(candidate);
|
|
811
1590
|
}
|
|
@@ -827,11 +1606,11 @@ async function loadEnvFromHierarchy(options) {
|
|
|
827
1606
|
}
|
|
828
1607
|
|
|
829
1608
|
// src/commands/eval/output-writer.ts
|
|
830
|
-
import
|
|
1609
|
+
import path14 from "node:path";
|
|
831
1610
|
|
|
832
1611
|
// src/commands/eval/html-writer.ts
|
|
833
1612
|
import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
834
|
-
import
|
|
1613
|
+
import path9 from "node:path";
|
|
835
1614
|
|
|
836
1615
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
837
1616
|
var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
|
|
@@ -1050,7 +1829,7 @@ var HtmlWriter = class _HtmlWriter {
|
|
|
1050
1829
|
this.filePath = filePath;
|
|
1051
1830
|
}
|
|
1052
1831
|
static async open(filePath) {
|
|
1053
|
-
await mkdir2(
|
|
1832
|
+
await mkdir2(path9.dirname(filePath), { recursive: true });
|
|
1054
1833
|
const writer = new _HtmlWriter(filePath);
|
|
1055
1834
|
await writer.writeHtml();
|
|
1056
1835
|
return writer;
|
|
@@ -1561,7 +2340,7 @@ var SCRIPT = `
|
|
|
1561
2340
|
|
|
1562
2341
|
// src/commands/eval/json-writer.ts
|
|
1563
2342
|
import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
1564
|
-
import
|
|
2343
|
+
import path10 from "node:path";
|
|
1565
2344
|
var JsonWriter = class _JsonWriter {
|
|
1566
2345
|
filePath;
|
|
1567
2346
|
results = [];
|
|
@@ -1570,7 +2349,7 @@ var JsonWriter = class _JsonWriter {
|
|
|
1570
2349
|
this.filePath = filePath;
|
|
1571
2350
|
}
|
|
1572
2351
|
static async open(filePath) {
|
|
1573
|
-
await mkdir3(
|
|
2352
|
+
await mkdir3(path10.dirname(filePath), { recursive: true });
|
|
1574
2353
|
return new _JsonWriter(filePath);
|
|
1575
2354
|
}
|
|
1576
2355
|
async append(result) {
|
|
@@ -1605,7 +2384,7 @@ var JsonWriter = class _JsonWriter {
|
|
|
1605
2384
|
// src/commands/eval/jsonl-writer.ts
|
|
1606
2385
|
import { createWriteStream } from "node:fs";
|
|
1607
2386
|
import { mkdir as mkdir4 } from "node:fs/promises";
|
|
1608
|
-
import
|
|
2387
|
+
import path11 from "node:path";
|
|
1609
2388
|
import { finished } from "node:stream/promises";
|
|
1610
2389
|
var JsonlWriter = class _JsonlWriter {
|
|
1611
2390
|
stream;
|
|
@@ -1615,7 +2394,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
1615
2394
|
this.stream = stream;
|
|
1616
2395
|
}
|
|
1617
2396
|
static async open(filePath) {
|
|
1618
|
-
await mkdir4(
|
|
2397
|
+
await mkdir4(path11.dirname(filePath), { recursive: true });
|
|
1619
2398
|
const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
|
|
1620
2399
|
return new _JsonlWriter(stream);
|
|
1621
2400
|
}
|
|
@@ -1647,7 +2426,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
1647
2426
|
|
|
1648
2427
|
// src/commands/eval/junit-writer.ts
|
|
1649
2428
|
import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
|
|
1650
|
-
import
|
|
2429
|
+
import path12 from "node:path";
|
|
1651
2430
|
function escapeXml(str) {
|
|
1652
2431
|
return str.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
|
|
1653
2432
|
}
|
|
@@ -1661,7 +2440,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1661
2440
|
this.threshold = options?.threshold ?? 0.5;
|
|
1662
2441
|
}
|
|
1663
2442
|
static async open(filePath, options) {
|
|
1664
|
-
await mkdir5(
|
|
2443
|
+
await mkdir5(path12.dirname(filePath), { recursive: true });
|
|
1665
2444
|
return new _JunitWriter(filePath, options);
|
|
1666
2445
|
}
|
|
1667
2446
|
async append(result) {
|
|
@@ -1737,7 +2516,7 @@ ${suiteXmls.join("\n")}
|
|
|
1737
2516
|
// src/commands/eval/yaml-writer.ts
|
|
1738
2517
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
1739
2518
|
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
1740
|
-
import
|
|
2519
|
+
import path13 from "node:path";
|
|
1741
2520
|
import { finished as finished2 } from "node:stream/promises";
|
|
1742
2521
|
import { stringify as stringifyYaml } from "yaml";
|
|
1743
2522
|
var YamlWriter = class _YamlWriter {
|
|
@@ -1749,7 +2528,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
1749
2528
|
this.stream = stream;
|
|
1750
2529
|
}
|
|
1751
2530
|
static async open(filePath) {
|
|
1752
|
-
await mkdir6(
|
|
2531
|
+
await mkdir6(path13.dirname(filePath), { recursive: true });
|
|
1753
2532
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
1754
2533
|
return new _YamlWriter(stream);
|
|
1755
2534
|
}
|
|
@@ -1805,7 +2584,7 @@ async function createOutputWriter(filePath, format) {
|
|
|
1805
2584
|
}
|
|
1806
2585
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
1807
2586
|
function createWriterFromPath(filePath, options) {
|
|
1808
|
-
const ext =
|
|
2587
|
+
const ext = path14.extname(filePath).toLowerCase();
|
|
1809
2588
|
switch (ext) {
|
|
1810
2589
|
case ".jsonl":
|
|
1811
2590
|
return JsonlWriter.open(filePath);
|
|
@@ -1838,10 +2617,10 @@ function useColors() {
|
|
|
1838
2617
|
}
|
|
1839
2618
|
function formatVerdict(score, verdict) {
|
|
1840
2619
|
if (verdict === void 0) return "";
|
|
1841
|
-
const
|
|
2620
|
+
const colors2 = useColors();
|
|
1842
2621
|
const scoreStr = score !== void 0 ? score.toFixed(3) : "";
|
|
1843
2622
|
const verdictLabel = verdict === "ERROR" ? "ERROR" : `${scoreStr} ${verdict}`;
|
|
1844
|
-
if (!
|
|
2623
|
+
if (!colors2) return ` | ${verdictLabel}`;
|
|
1845
2624
|
const color = verdict === "PASS" ? ANSI_GREEN : verdict === "FAIL" ? ANSI_RED2 : ANSI_YELLOW2;
|
|
1846
2625
|
return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET2}`;
|
|
1847
2626
|
}
|
|
@@ -1901,12 +2680,12 @@ var ProgressDisplay = class {
|
|
|
1901
2680
|
}
|
|
1902
2681
|
addLogPaths(paths, provider) {
|
|
1903
2682
|
const newPaths = [];
|
|
1904
|
-
for (const
|
|
1905
|
-
if (this.logPathSet.has(
|
|
2683
|
+
for (const path19 of paths) {
|
|
2684
|
+
if (this.logPathSet.has(path19)) {
|
|
1906
2685
|
continue;
|
|
1907
2686
|
}
|
|
1908
|
-
this.logPathSet.add(
|
|
1909
|
-
newPaths.push(
|
|
2687
|
+
this.logPathSet.add(path19);
|
|
2688
|
+
newPaths.push(path19);
|
|
1910
2689
|
}
|
|
1911
2690
|
if (newPaths.length === 0) {
|
|
1912
2691
|
return;
|
|
@@ -1919,8 +2698,8 @@ var ProgressDisplay = class {
|
|
|
1919
2698
|
this.hasPrintedLogHeader = true;
|
|
1920
2699
|
}
|
|
1921
2700
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
1922
|
-
newPaths.forEach((
|
|
1923
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
2701
|
+
newPaths.forEach((path19, offset) => {
|
|
2702
|
+
console.log(`${startIndex + offset + 1}. ${path19}`);
|
|
1924
2703
|
});
|
|
1925
2704
|
}
|
|
1926
2705
|
finish() {
|
|
@@ -1931,149 +2710,34 @@ var ProgressDisplay = class {
|
|
|
1931
2710
|
}
|
|
1932
2711
|
};
|
|
1933
2712
|
|
|
1934
|
-
// src/commands/results/manifest.ts
|
|
1935
|
-
import { existsSync as existsSync2, readFileSync } from "node:fs";
|
|
1936
|
-
import path12 from "node:path";
|
|
1937
|
-
function parseJsonlLines(content) {
|
|
1938
|
-
return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
|
|
1939
|
-
}
|
|
1940
|
-
function parseMarkdownMessages(content) {
|
|
1941
|
-
const trimmed = content.trim();
|
|
1942
|
-
if (!trimmed.startsWith("@[")) {
|
|
1943
|
-
return [];
|
|
1944
|
-
}
|
|
1945
|
-
const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
|
|
1946
|
-
return matches.map((match) => ({
|
|
1947
|
-
role: match[1],
|
|
1948
|
-
content: match[2].trimEnd()
|
|
1949
|
-
}));
|
|
1950
|
-
}
|
|
1951
|
-
function readOptionalText(baseDir, relativePath) {
|
|
1952
|
-
if (!relativePath) {
|
|
1953
|
-
return void 0;
|
|
1954
|
-
}
|
|
1955
|
-
const absolutePath = path12.join(baseDir, relativePath);
|
|
1956
|
-
if (!existsSync2(absolutePath)) {
|
|
1957
|
-
return void 0;
|
|
1958
|
-
}
|
|
1959
|
-
return readFileSync(absolutePath, "utf8");
|
|
1960
|
-
}
|
|
1961
|
-
function readOptionalJson(baseDir, relativePath) {
|
|
1962
|
-
const text = readOptionalText(baseDir, relativePath);
|
|
1963
|
-
if (!text) {
|
|
1964
|
-
return void 0;
|
|
1965
|
-
}
|
|
1966
|
-
try {
|
|
1967
|
-
return JSON.parse(text);
|
|
1968
|
-
} catch {
|
|
1969
|
-
return void 0;
|
|
1970
|
-
}
|
|
1971
|
-
}
|
|
1972
|
-
function hydrateInput(baseDir, record) {
|
|
1973
|
-
const inputText = readOptionalText(baseDir, record.input_path);
|
|
1974
|
-
if (!inputText) {
|
|
1975
|
-
return void 0;
|
|
1976
|
-
}
|
|
1977
|
-
const messages = parseMarkdownMessages(inputText);
|
|
1978
|
-
return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
|
|
1979
|
-
}
|
|
1980
|
-
function hydrateOutput(baseDir, record) {
|
|
1981
|
-
const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
|
|
1982
|
-
if (!responseText) {
|
|
1983
|
-
return void 0;
|
|
1984
|
-
}
|
|
1985
|
-
const messages = parseMarkdownMessages(responseText);
|
|
1986
|
-
if (messages.length > 0) {
|
|
1987
|
-
return messages.map((message) => ({
|
|
1988
|
-
role: message.role,
|
|
1989
|
-
content: message.content
|
|
1990
|
-
}));
|
|
1991
|
-
}
|
|
1992
|
-
return [{ role: "assistant", content: responseText.trimEnd() }];
|
|
1993
|
-
}
|
|
1994
|
-
function hydrateManifestRecord(baseDir, record) {
|
|
1995
|
-
const grading = readOptionalJson(baseDir, record.grading_path);
|
|
1996
|
-
const timing = readOptionalJson(baseDir, record.timing_path);
|
|
1997
|
-
const testId = record.test_id ?? "unknown";
|
|
1998
|
-
return {
|
|
1999
|
-
timestamp: record.timestamp,
|
|
2000
|
-
testId,
|
|
2001
|
-
suite: record.suite,
|
|
2002
|
-
category: record.category,
|
|
2003
|
-
target: record.target,
|
|
2004
|
-
score: record.score,
|
|
2005
|
-
executionStatus: record.execution_status,
|
|
2006
|
-
error: record.error,
|
|
2007
|
-
assertions: grading?.assertions.map((assertion) => ({
|
|
2008
|
-
text: assertion.text,
|
|
2009
|
-
passed: assertion.passed,
|
|
2010
|
-
evidence: assertion.evidence
|
|
2011
|
-
})),
|
|
2012
|
-
scores: grading?.evaluators?.map((evaluator) => ({
|
|
2013
|
-
name: evaluator.name,
|
|
2014
|
-
type: evaluator.type,
|
|
2015
|
-
score: evaluator.score,
|
|
2016
|
-
assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
|
|
2017
|
-
text: String(assertion.text ?? ""),
|
|
2018
|
-
passed: Boolean(assertion.passed),
|
|
2019
|
-
evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
|
|
2020
|
-
})) : void 0,
|
|
2021
|
-
weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
|
|
2022
|
-
verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
|
|
2023
|
-
details: evaluator.details
|
|
2024
|
-
})) ?? record.scores,
|
|
2025
|
-
tokenUsage: timing?.token_usage ? {
|
|
2026
|
-
input: timing.token_usage.input,
|
|
2027
|
-
output: timing.token_usage.output,
|
|
2028
|
-
reasoning: timing.token_usage.reasoning
|
|
2029
|
-
} : record.token_usage,
|
|
2030
|
-
durationMs: timing?.duration_ms ?? record.duration_ms,
|
|
2031
|
-
costUsd: record.cost_usd,
|
|
2032
|
-
input: hydrateInput(baseDir, record),
|
|
2033
|
-
output: hydrateOutput(baseDir, record)
|
|
2034
|
-
};
|
|
2035
|
-
}
|
|
2036
|
-
function parseResultManifest(content) {
|
|
2037
|
-
return parseJsonlLines(content);
|
|
2038
|
-
}
|
|
2039
|
-
function resolveResultSourcePath(source, cwd) {
|
|
2040
|
-
const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
|
|
2041
|
-
if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
|
|
2042
|
-
return resolveRunManifestPath(resolved);
|
|
2043
|
-
}
|
|
2044
|
-
return resolved;
|
|
2045
|
-
}
|
|
2046
|
-
function loadManifestResults(sourceFile) {
|
|
2047
|
-
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
2048
|
-
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2049
|
-
const records = parseResultManifest(content);
|
|
2050
|
-
const baseDir = path12.dirname(resolvedSourceFile);
|
|
2051
|
-
return records.map((record) => hydrateManifestRecord(baseDir, record));
|
|
2052
|
-
}
|
|
2053
|
-
function loadLightweightResults(sourceFile) {
|
|
2054
|
-
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
2055
|
-
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2056
|
-
return parseResultManifest(content).map((record) => ({
|
|
2057
|
-
testId: record.test_id ?? "unknown",
|
|
2058
|
-
suite: record.suite,
|
|
2059
|
-
target: record.target,
|
|
2060
|
-
experiment: record.experiment,
|
|
2061
|
-
score: record.score,
|
|
2062
|
-
scores: record.scores,
|
|
2063
|
-
executionStatus: record.execution_status,
|
|
2064
|
-
error: record.error,
|
|
2065
|
-
timestamp: record.timestamp
|
|
2066
|
-
}));
|
|
2067
|
-
}
|
|
2068
|
-
|
|
2069
2713
|
// src/commands/eval/retry-errors.ts
|
|
2070
2714
|
async function loadRetrySourceResults(jsonlPath) {
|
|
2071
2715
|
return loadManifestResults(resolveResultSourcePath(jsonlPath));
|
|
2072
2716
|
}
|
|
2717
|
+
function escapeGlob(id) {
|
|
2718
|
+
return id.replace(/[*?[\]{}()!@#+|\\]/g, "\\$&");
|
|
2719
|
+
}
|
|
2073
2720
|
async function loadErrorTestIds(jsonlPath) {
|
|
2074
2721
|
const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
|
|
2075
2722
|
return [...new Set(ids)];
|
|
2076
2723
|
}
|
|
2724
|
+
async function loadFullyCompletedTestIds(jsonlPath) {
|
|
2725
|
+
const results = await loadRetrySourceResults(jsonlPath);
|
|
2726
|
+
const allIds = /* @__PURE__ */ new Set();
|
|
2727
|
+
const errorIds = /* @__PURE__ */ new Set();
|
|
2728
|
+
for (const result of results) {
|
|
2729
|
+
if (!result.testId) continue;
|
|
2730
|
+
allIds.add(result.testId);
|
|
2731
|
+
if (result.executionStatus === "execution_error") {
|
|
2732
|
+
errorIds.add(result.testId);
|
|
2733
|
+
}
|
|
2734
|
+
}
|
|
2735
|
+
return [...allIds].filter((id) => !errorIds.has(id));
|
|
2736
|
+
}
|
|
2737
|
+
function buildExclusionFilter(completedIds) {
|
|
2738
|
+
const escaped = completedIds.map(escapeGlob);
|
|
2739
|
+
return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(",")}}`;
|
|
2740
|
+
}
|
|
2077
2741
|
async function loadNonErrorResults(jsonlPath) {
|
|
2078
2742
|
return (await loadRetrySourceResults(jsonlPath)).filter(
|
|
2079
2743
|
(result) => result.testId && result.executionStatus !== "execution_error"
|
|
@@ -2082,7 +2746,7 @@ async function loadNonErrorResults(jsonlPath) {
|
|
|
2082
2746
|
|
|
2083
2747
|
// src/commands/eval/run-cache.ts
|
|
2084
2748
|
import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
|
|
2085
|
-
import
|
|
2749
|
+
import path15 from "node:path";
|
|
2086
2750
|
var CACHE_FILENAME = "cache.json";
|
|
2087
2751
|
function resolveRunCacheFile(cache) {
|
|
2088
2752
|
if (cache.lastRunDir) {
|
|
@@ -2091,7 +2755,7 @@ function resolveRunCacheFile(cache) {
|
|
|
2091
2755
|
return "";
|
|
2092
2756
|
}
|
|
2093
2757
|
function cachePath(cwd) {
|
|
2094
|
-
return
|
|
2758
|
+
return path15.join(cwd, ".agentv", CACHE_FILENAME);
|
|
2095
2759
|
}
|
|
2096
2760
|
async function loadRunCache(cwd) {
|
|
2097
2761
|
try {
|
|
@@ -2102,13 +2766,13 @@ async function loadRunCache(cwd) {
|
|
|
2102
2766
|
}
|
|
2103
2767
|
}
|
|
2104
2768
|
async function saveRunCache(cwd, resultPath) {
|
|
2105
|
-
if (
|
|
2769
|
+
if (path15.basename(resultPath) !== RESULT_INDEX_FILENAME) {
|
|
2106
2770
|
return;
|
|
2107
2771
|
}
|
|
2108
|
-
const dir =
|
|
2772
|
+
const dir = path15.join(cwd, ".agentv");
|
|
2109
2773
|
await mkdir7(dir, { recursive: true });
|
|
2110
2774
|
const cache = {
|
|
2111
|
-
lastRunDir:
|
|
2775
|
+
lastRunDir: path15.dirname(resultPath),
|
|
2112
2776
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2113
2777
|
};
|
|
2114
2778
|
await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
@@ -2233,7 +2897,7 @@ function calculateEvaluationSummary(results, options) {
|
|
|
2233
2897
|
byFailureReason
|
|
2234
2898
|
};
|
|
2235
2899
|
}
|
|
2236
|
-
function
|
|
2900
|
+
function formatScore2(value) {
|
|
2237
2901
|
return value.toFixed(3);
|
|
2238
2902
|
}
|
|
2239
2903
|
function formatEvaluationSummary(summary, options) {
|
|
@@ -2261,13 +2925,13 @@ function formatEvaluationSummary(summary, options) {
|
|
|
2261
2925
|
let verdictColor;
|
|
2262
2926
|
let verdictText;
|
|
2263
2927
|
if (allExecutionErrors) {
|
|
2264
|
-
overallVerdict = "
|
|
2928
|
+
overallVerdict = "ERROR";
|
|
2265
2929
|
verdictColor = "\x1B[33m";
|
|
2266
|
-
verdictText = `RESULT:
|
|
2930
|
+
verdictText = `RESULT: ERROR (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
|
|
2267
2931
|
} else {
|
|
2268
2932
|
overallVerdict = overallPassed ? "PASS" : "FAIL";
|
|
2269
2933
|
verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
|
|
2270
|
-
verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${
|
|
2934
|
+
verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} scored >= ${threshold}, mean: ${formatScore2(summary.mean)})`;
|
|
2271
2935
|
}
|
|
2272
2936
|
lines.push("\n==================================================");
|
|
2273
2937
|
if (useColor) {
|
|
@@ -2290,16 +2954,16 @@ function formatEvaluationSummary(summary, options) {
|
|
|
2290
2954
|
if (summary.executionErrorCount > 0) {
|
|
2291
2955
|
const qualityCount = summary.total - summary.executionErrorCount;
|
|
2292
2956
|
lines.push(
|
|
2293
|
-
`Mean score: ${
|
|
2957
|
+
`Mean score: ${formatScore2(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
|
|
2294
2958
|
);
|
|
2295
2959
|
} else {
|
|
2296
|
-
lines.push(`Mean score: ${
|
|
2960
|
+
lines.push(`Mean score: ${formatScore2(summary.mean)}`);
|
|
2297
2961
|
}
|
|
2298
|
-
lines.push(`Median score: ${
|
|
2299
|
-
lines.push(`Min score: ${
|
|
2300
|
-
lines.push(`Max score: ${
|
|
2962
|
+
lines.push(`Median score: ${formatScore2(summary.median)}`);
|
|
2963
|
+
lines.push(`Min score: ${formatScore2(summary.min)}`);
|
|
2964
|
+
lines.push(`Max score: ${formatScore2(summary.max)}`);
|
|
2301
2965
|
if (typeof summary.standardDeviation === "number") {
|
|
2302
|
-
lines.push(`Std deviation: ${
|
|
2966
|
+
lines.push(`Std deviation: ${formatScore2(summary.standardDeviation)}`);
|
|
2303
2967
|
}
|
|
2304
2968
|
lines.push("\nScore distribution:");
|
|
2305
2969
|
for (const bin of summary.histogram) {
|
|
@@ -2308,11 +2972,11 @@ function formatEvaluationSummary(summary, options) {
|
|
|
2308
2972
|
}
|
|
2309
2973
|
lines.push("\nTop performing tests:");
|
|
2310
2974
|
summary.topResults.forEach((result, index) => {
|
|
2311
|
-
lines.push(` ${index + 1}. ${result.testId}: ${
|
|
2975
|
+
lines.push(` ${index + 1}. ${result.testId}: ${formatScore2(result.score)}`);
|
|
2312
2976
|
});
|
|
2313
2977
|
lines.push("\nLowest performing tests:");
|
|
2314
2978
|
summary.bottomResults.forEach((result, index) => {
|
|
2315
|
-
lines.push(` ${index + 1}. ${result.testId}: ${
|
|
2979
|
+
lines.push(` ${index + 1}. ${result.testId}: ${formatScore2(result.score)}`);
|
|
2316
2980
|
});
|
|
2317
2981
|
const failureStageEntries = Object.entries(summary.byFailureStage);
|
|
2318
2982
|
if (failureStageEntries.length > 0) {
|
|
@@ -2361,7 +3025,7 @@ function formatMatrixSummary(results) {
|
|
|
2361
3025
|
for (const testId of testIds) {
|
|
2362
3026
|
const cells = targets.map((target) => {
|
|
2363
3027
|
const score = scoreMap.get(testId)?.get(target);
|
|
2364
|
-
return score !== void 0 ?
|
|
3028
|
+
return score !== void 0 ? formatScore2(score).padEnd(targetColWidth) : "-".padEnd(targetColWidth);
|
|
2365
3029
|
});
|
|
2366
3030
|
lines.push(`${testId.padEnd(testIdColWidth)} ${cells.join(" ")}`);
|
|
2367
3031
|
}
|
|
@@ -2369,7 +3033,7 @@ function formatMatrixSummary(results) {
|
|
|
2369
3033
|
const avgCells = targets.map((target) => {
|
|
2370
3034
|
const scores = results.filter((r) => r.target === target).map((r) => r.score);
|
|
2371
3035
|
const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
|
|
2372
|
-
return
|
|
3036
|
+
return formatScore2(avg).padEnd(targetColWidth);
|
|
2373
3037
|
});
|
|
2374
3038
|
lines.push(`${"Average".padEnd(testIdColWidth)} ${avgCells.join(" ")}`);
|
|
2375
3039
|
return lines.join("\n");
|
|
@@ -2377,7 +3041,7 @@ function formatMatrixSummary(results) {
|
|
|
2377
3041
|
|
|
2378
3042
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
2379
3043
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
2380
|
-
import
|
|
3044
|
+
import path16 from "node:path";
|
|
2381
3045
|
import { parse } from "yaml";
|
|
2382
3046
|
import { readFile as readFile22 } from "node:fs/promises";
|
|
2383
3047
|
import path22 from "node:path";
|
|
@@ -2420,8 +3084,8 @@ async function detectFileType(filePath) {
|
|
|
2420
3084
|
}
|
|
2421
3085
|
}
|
|
2422
3086
|
function inferFileTypeFromPath(filePath) {
|
|
2423
|
-
const normalized =
|
|
2424
|
-
const basename =
|
|
3087
|
+
const normalized = path16.normalize(filePath).replace(/\\/g, "/");
|
|
3088
|
+
const basename = path16.basename(filePath);
|
|
2425
3089
|
if (normalized.includes("/.agentv/")) {
|
|
2426
3090
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
2427
3091
|
return "config";
|
|
@@ -2747,12 +3411,21 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
|
|
|
2747
3411
|
const hooks = workspace.hooks;
|
|
2748
3412
|
const afterEachHook = isObject(hooks) ? hooks.after_each : void 0;
|
|
2749
3413
|
const isolation = workspace.isolation;
|
|
3414
|
+
const docker = workspace.docker;
|
|
2750
3415
|
if (Array.isArray(repos)) {
|
|
2751
3416
|
for (const repo of repos) {
|
|
2752
3417
|
if (!isObject(repo)) continue;
|
|
2753
3418
|
const source = repo.source;
|
|
2754
3419
|
const checkout = repo.checkout;
|
|
2755
3420
|
const clone = repo.clone;
|
|
3421
|
+
if (!isObject(source) && !isObject(docker)) {
|
|
3422
|
+
errors.push({
|
|
3423
|
+
severity: "error",
|
|
3424
|
+
filePath,
|
|
3425
|
+
location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
|
|
3426
|
+
message: "repos[].source is required for non-Docker workspaces. Source-less repos are only valid when workspace.docker is configured (repo exists inside the container)."
|
|
3427
|
+
});
|
|
3428
|
+
}
|
|
2756
3429
|
if (isObject(source) && isObject(checkout)) {
|
|
2757
3430
|
const sourceType = source.type;
|
|
2758
3431
|
const resolve = checkout.resolve;
|
|
@@ -2760,8 +3433,8 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
|
|
|
2760
3433
|
errors.push({
|
|
2761
3434
|
severity: "warning",
|
|
2762
3435
|
filePath,
|
|
2763
|
-
location: `workspace.repos[path=${repo.path}]`,
|
|
2764
|
-
message: "checkout.resolve has no effect for a local source. Use source.type to choose where the repo comes from; keep checkout.ref or checkout.ancestor only when pinning a local source."
|
|
3436
|
+
location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
|
|
3437
|
+
message: "checkout.resolve has no effect for a local source. Use source.type to choose where the repo comes from; keep checkout.ref, checkout.base_commit, or checkout.ancestor only when pinning a local source."
|
|
2765
3438
|
});
|
|
2766
3439
|
}
|
|
2767
3440
|
}
|
|
@@ -2772,7 +3445,7 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
|
|
|
2772
3445
|
errors.push({
|
|
2773
3446
|
severity: "warning",
|
|
2774
3447
|
filePath,
|
|
2775
|
-
location: `workspace.repos[path=${repo.path}]`,
|
|
3448
|
+
location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
|
|
2776
3449
|
message: `clone.depth (${depth}) may be insufficient for checkout.ancestor (${ancestor}). Recommend depth >= ${ancestor + 1}.`
|
|
2777
3450
|
});
|
|
2778
3451
|
}
|
|
@@ -3522,11 +4195,69 @@ async function validateConfigFile(filePath) {
|
|
|
3522
4195
|
});
|
|
3523
4196
|
}
|
|
3524
4197
|
}
|
|
4198
|
+
const results = config.results;
|
|
4199
|
+
if (results !== void 0) {
|
|
4200
|
+
if (typeof results !== "object" || results === null || Array.isArray(results)) {
|
|
4201
|
+
errors.push({
|
|
4202
|
+
severity: "error",
|
|
4203
|
+
filePath,
|
|
4204
|
+
location: "results",
|
|
4205
|
+
message: "Field 'results' must be an object"
|
|
4206
|
+
});
|
|
4207
|
+
} else {
|
|
4208
|
+
const exportConfig = results.export;
|
|
4209
|
+
if (exportConfig !== void 0) {
|
|
4210
|
+
if (typeof exportConfig !== "object" || exportConfig === null || Array.isArray(exportConfig)) {
|
|
4211
|
+
errors.push({
|
|
4212
|
+
severity: "error",
|
|
4213
|
+
filePath,
|
|
4214
|
+
location: "results.export",
|
|
4215
|
+
message: "Field 'results.export' must be an object"
|
|
4216
|
+
});
|
|
4217
|
+
} else {
|
|
4218
|
+
const exportRecord = exportConfig;
|
|
4219
|
+
if (typeof exportRecord.repo !== "string" || exportRecord.repo.trim().length === 0) {
|
|
4220
|
+
errors.push({
|
|
4221
|
+
severity: "error",
|
|
4222
|
+
filePath,
|
|
4223
|
+
location: "results.export.repo",
|
|
4224
|
+
message: "Field 'results.export.repo' must be a non-empty string"
|
|
4225
|
+
});
|
|
4226
|
+
}
|
|
4227
|
+
if (typeof exportRecord.path !== "string" || exportRecord.path.trim().length === 0) {
|
|
4228
|
+
errors.push({
|
|
4229
|
+
severity: "error",
|
|
4230
|
+
filePath,
|
|
4231
|
+
location: "results.export.path",
|
|
4232
|
+
message: "Field 'results.export.path' must be a non-empty string"
|
|
4233
|
+
});
|
|
4234
|
+
}
|
|
4235
|
+
if (exportRecord.auto_push !== void 0 && typeof exportRecord.auto_push !== "boolean") {
|
|
4236
|
+
errors.push({
|
|
4237
|
+
severity: "error",
|
|
4238
|
+
filePath,
|
|
4239
|
+
location: "results.export.auto_push",
|
|
4240
|
+
message: "Field 'results.export.auto_push' must be a boolean"
|
|
4241
|
+
});
|
|
4242
|
+
}
|
|
4243
|
+
if (exportRecord.branch_prefix !== void 0 && (typeof exportRecord.branch_prefix !== "string" || exportRecord.branch_prefix.trim().length === 0)) {
|
|
4244
|
+
errors.push({
|
|
4245
|
+
severity: "error",
|
|
4246
|
+
filePath,
|
|
4247
|
+
location: "results.export.branch_prefix",
|
|
4248
|
+
message: "Field 'results.export.branch_prefix' must be a non-empty string"
|
|
4249
|
+
});
|
|
4250
|
+
}
|
|
4251
|
+
}
|
|
4252
|
+
}
|
|
4253
|
+
}
|
|
4254
|
+
}
|
|
3525
4255
|
const allowedFields = /* @__PURE__ */ new Set([
|
|
3526
4256
|
"$schema",
|
|
3527
4257
|
"eval_patterns",
|
|
3528
4258
|
"required_version",
|
|
3529
4259
|
"execution",
|
|
4260
|
+
"results",
|
|
3530
4261
|
"studio"
|
|
3531
4262
|
]);
|
|
3532
4263
|
const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key));
|
|
@@ -4086,7 +4817,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4086
4817
|
threshold: normalizeOptionalNumber(rawOptions.threshold),
|
|
4087
4818
|
tags: normalizeStringArray(rawOptions.tag),
|
|
4088
4819
|
excludeTags: normalizeStringArray(rawOptions.excludeTag),
|
|
4089
|
-
transcript: normalizeString(rawOptions.transcript)
|
|
4820
|
+
transcript: normalizeString(rawOptions.transcript),
|
|
4821
|
+
experiment: normalizeString(rawOptions.experiment)
|
|
4090
4822
|
};
|
|
4091
4823
|
}
|
|
4092
4824
|
async function ensureFileExists(filePath, description) {
|
|
@@ -4096,10 +4828,10 @@ async function ensureFileExists(filePath, description) {
|
|
|
4096
4828
|
throw new Error(`${description} not found: ${filePath}`);
|
|
4097
4829
|
}
|
|
4098
4830
|
}
|
|
4099
|
-
function
|
|
4100
|
-
const runDir = buildDefaultRunDir(cwd);
|
|
4831
|
+
function buildDefaultOutputPathForExperiment(cwd, experiment) {
|
|
4832
|
+
const runDir = buildDefaultRunDir(cwd, experiment);
|
|
4101
4833
|
mkdirSync(runDir, { recursive: true });
|
|
4102
|
-
return
|
|
4834
|
+
return path17.join(runDir, "index.jsonl");
|
|
4103
4835
|
}
|
|
4104
4836
|
function createProgressReporter(maxWorkers, options) {
|
|
4105
4837
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -4113,7 +4845,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
4113
4845
|
};
|
|
4114
4846
|
}
|
|
4115
4847
|
function makeTestCaseKey(testFilePath, testId) {
|
|
4116
|
-
return `${
|
|
4848
|
+
return `${path17.resolve(testFilePath)}::${testId}`;
|
|
4117
4849
|
}
|
|
4118
4850
|
function createDisplayIdTracker() {
|
|
4119
4851
|
const map = /* @__PURE__ */ new Map();
|
|
@@ -4169,7 +4901,7 @@ async function prepareFileMetadata(params) {
|
|
|
4169
4901
|
repoRoot,
|
|
4170
4902
|
verbose: options.verbose
|
|
4171
4903
|
});
|
|
4172
|
-
const relativePath =
|
|
4904
|
+
const relativePath = path17.relative(cwd, testFilePath);
|
|
4173
4905
|
const category = deriveCategory(relativePath);
|
|
4174
4906
|
const suite = await loadTestSuite(testFilePath, repoRoot, {
|
|
4175
4907
|
verbose: options.verbose,
|
|
@@ -4194,7 +4926,7 @@ async function prepareFileMetadata(params) {
|
|
|
4194
4926
|
selections = [
|
|
4195
4927
|
{
|
|
4196
4928
|
selection: transcriptSelection,
|
|
4197
|
-
inlineTargetLabel: `transcript (${
|
|
4929
|
+
inlineTargetLabel: `transcript (${path17.basename(options.transcript)})`
|
|
4198
4930
|
}
|
|
4199
4931
|
];
|
|
4200
4932
|
} else {
|
|
@@ -4430,32 +5162,36 @@ async function runEvalCommand(input) {
|
|
|
4430
5162
|
);
|
|
4431
5163
|
}
|
|
4432
5164
|
const repoRoot = await findRepoRoot(cwd);
|
|
4433
|
-
const yamlConfig = await loadConfig(
|
|
5165
|
+
const yamlConfig = await loadConfig(path17.join(cwd, "_"), repoRoot);
|
|
4434
5166
|
if (yamlConfig?.required_version) {
|
|
4435
5167
|
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
4436
5168
|
strict: normalizeBoolean(input.rawOptions.strict)
|
|
4437
5169
|
});
|
|
4438
5170
|
}
|
|
4439
5171
|
let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
|
|
5172
|
+
if (!process.env.AGENTV_EXPERIMENT) {
|
|
5173
|
+
process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
|
|
5174
|
+
}
|
|
4440
5175
|
if (options.graderTarget === "agentv" && !options.model) {
|
|
4441
5176
|
throw new Error("--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)");
|
|
4442
5177
|
}
|
|
4443
5178
|
let retryNonErrorResults;
|
|
4444
5179
|
if (options.retryErrors) {
|
|
4445
|
-
const retryPath =
|
|
5180
|
+
const retryPath = path17.resolve(options.retryErrors);
|
|
4446
5181
|
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
5182
|
+
const completedIds = await loadFullyCompletedTestIds(retryPath);
|
|
4447
5183
|
const errorIds = await loadErrorTestIds(retryPath);
|
|
4448
|
-
if (errorIds.length === 0) {
|
|
4449
|
-
console.log("No execution errors found in the previous output. Nothing to retry.");
|
|
4450
|
-
return;
|
|
4451
|
-
}
|
|
4452
|
-
console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
|
|
4453
|
-
const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
|
|
4454
|
-
options = { ...options, filter: filterPattern };
|
|
4455
5184
|
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
5185
|
+
if (errorIds.length > 0) {
|
|
5186
|
+
console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
|
|
5187
|
+
}
|
|
5188
|
+
if (completedIds.length > 0) {
|
|
5189
|
+
options = { ...options, filter: buildExclusionFilter(completedIds) };
|
|
5190
|
+
console.log(`Skipping ${completedIds.length} already-completed test(s).`);
|
|
5191
|
+
}
|
|
4456
5192
|
}
|
|
4457
5193
|
if (options.workspacePath) {
|
|
4458
|
-
const resolvedWorkspace =
|
|
5194
|
+
const resolvedWorkspace = path17.resolve(options.workspacePath);
|
|
4459
5195
|
try {
|
|
4460
5196
|
const { stat: stat2 } = await import("node:fs/promises");
|
|
4461
5197
|
const stats = await stat2(resolvedWorkspace);
|
|
@@ -4496,25 +5232,25 @@ async function runEvalCommand(input) {
|
|
|
4496
5232
|
let outputPath;
|
|
4497
5233
|
let usesDefaultArtifactWorkspace;
|
|
4498
5234
|
if (explicitDir) {
|
|
4499
|
-
runDir =
|
|
5235
|
+
runDir = path17.resolve(explicitDir);
|
|
4500
5236
|
mkdirSync(runDir, { recursive: true });
|
|
4501
|
-
outputPath =
|
|
5237
|
+
outputPath = path17.join(runDir, "index.jsonl");
|
|
4502
5238
|
usesDefaultArtifactWorkspace = true;
|
|
4503
5239
|
} else if (options.outPath) {
|
|
4504
|
-
outputPath =
|
|
4505
|
-
runDir =
|
|
5240
|
+
outputPath = path17.resolve(options.outPath);
|
|
5241
|
+
runDir = path17.dirname(outputPath);
|
|
4506
5242
|
mkdirSync(runDir, { recursive: true });
|
|
4507
5243
|
usesDefaultArtifactWorkspace = false;
|
|
4508
5244
|
} else {
|
|
4509
|
-
outputPath =
|
|
4510
|
-
runDir =
|
|
5245
|
+
outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
|
|
5246
|
+
runDir = path17.dirname(outputPath);
|
|
4511
5247
|
usesDefaultArtifactWorkspace = true;
|
|
4512
5248
|
}
|
|
4513
5249
|
let otelExporter = null;
|
|
4514
5250
|
const useFileExport = !!options.otelFile;
|
|
4515
5251
|
if (options.exportOtel || useFileExport) {
|
|
4516
5252
|
try {
|
|
4517
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
5253
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-HNSXNRVK.js");
|
|
4518
5254
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4519
5255
|
let headers = {};
|
|
4520
5256
|
if (options.otelBackend) {
|
|
@@ -4538,7 +5274,7 @@ async function runEvalCommand(input) {
|
|
|
4538
5274
|
headers,
|
|
4539
5275
|
captureContent,
|
|
4540
5276
|
groupTurns: options.otelGroupTurns,
|
|
4541
|
-
otlpFilePath: options.otelFile ?
|
|
5277
|
+
otlpFilePath: options.otelFile ? path17.resolve(options.otelFile) : void 0
|
|
4542
5278
|
});
|
|
4543
5279
|
const initialized = await otelExporter.init();
|
|
4544
5280
|
if (!initialized) {
|
|
@@ -4555,7 +5291,7 @@ async function runEvalCommand(input) {
|
|
|
4555
5291
|
}
|
|
4556
5292
|
}
|
|
4557
5293
|
const primaryWritePath = outputPath;
|
|
4558
|
-
const resolvedExportPaths = options.exportPaths.map((p) =>
|
|
5294
|
+
const resolvedExportPaths = options.exportPaths.map((p) => path17.resolve(p));
|
|
4559
5295
|
console.log(`Artifact directory: ${runDir}`);
|
|
4560
5296
|
if (resolvedExportPaths.length > 0) {
|
|
4561
5297
|
console.log("Export files:");
|
|
@@ -4563,12 +5299,13 @@ async function runEvalCommand(input) {
|
|
|
4563
5299
|
console.log(` ${p}`);
|
|
4564
5300
|
}
|
|
4565
5301
|
}
|
|
4566
|
-
const resolvedTestFiles = input.testFiles.map((file) =>
|
|
5302
|
+
const resolvedTestFiles = input.testFiles.map((file) => path17.resolve(file));
|
|
4567
5303
|
if (options.otelFile) {
|
|
4568
|
-
console.log(`OTLP JSON file: ${
|
|
5304
|
+
console.log(`OTLP JSON file: ${path17.resolve(options.otelFile)}`);
|
|
4569
5305
|
}
|
|
4570
5306
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
4571
5307
|
const allResults = [];
|
|
5308
|
+
const remoteEvalSummaries = [];
|
|
4572
5309
|
const seenTestCases = /* @__PURE__ */ new Set();
|
|
4573
5310
|
const displayIdTracker = createDisplayIdTracker();
|
|
4574
5311
|
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
|
|
@@ -4609,7 +5346,7 @@ async function runEvalCommand(input) {
|
|
|
4609
5346
|
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
4610
5347
|
if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
|
|
4611
5348
|
fileMetadata.delete(testFilePath);
|
|
4612
|
-
skippedFiles.push(
|
|
5349
|
+
skippedFiles.push(path17.relative(cwd, testFilePath));
|
|
4613
5350
|
}
|
|
4614
5351
|
}
|
|
4615
5352
|
if (skippedFiles.length > 0 && options.verbose) {
|
|
@@ -4630,7 +5367,7 @@ async function runEvalCommand(input) {
|
|
|
4630
5367
|
cliNoCache: options.noCache,
|
|
4631
5368
|
yamlCache: yamlCacheEnabled
|
|
4632
5369
|
});
|
|
4633
|
-
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ?
|
|
5370
|
+
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path17.resolve(yamlCachePath) : void 0) : void 0;
|
|
4634
5371
|
if (cacheEnabled) {
|
|
4635
5372
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
4636
5373
|
}
|
|
@@ -4651,6 +5388,10 @@ async function runEvalCommand(input) {
|
|
|
4651
5388
|
}
|
|
4652
5389
|
}
|
|
4653
5390
|
if (totalEvalCount === 0) {
|
|
5391
|
+
if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
|
|
5392
|
+
console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
|
|
5393
|
+
return;
|
|
5394
|
+
}
|
|
4654
5395
|
throw new Error("No tests matched the provided filters.");
|
|
4655
5396
|
}
|
|
4656
5397
|
const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
|
|
@@ -4708,7 +5449,7 @@ async function runEvalCommand(input) {
|
|
|
4708
5449
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
4709
5450
|
let transcriptProviderFactory;
|
|
4710
5451
|
if (options.transcript) {
|
|
4711
|
-
const { TranscriptProvider } = await import("./dist-
|
|
5452
|
+
const { TranscriptProvider } = await import("./dist-HNSXNRVK.js");
|
|
4712
5453
|
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
4713
5454
|
const totalTests = [...fileMetadata.values()].reduce(
|
|
4714
5455
|
(sum, meta) => sum + meta.testCases.length,
|
|
@@ -4767,11 +5508,23 @@ async function runEvalCommand(input) {
|
|
|
4767
5508
|
threshold: resolvedThreshold,
|
|
4768
5509
|
providerFactory: transcriptProviderFactory
|
|
4769
5510
|
});
|
|
5511
|
+
const evalFile = path17.relative(cwd, testFilePath);
|
|
5512
|
+
const existingSummary = remoteEvalSummaries.find(
|
|
5513
|
+
(summary2) => summary2.evalFile === evalFile
|
|
5514
|
+
);
|
|
5515
|
+
if (existingSummary) {
|
|
5516
|
+
existingSummary.results.push(...result.results);
|
|
5517
|
+
} else {
|
|
5518
|
+
remoteEvalSummaries.push({
|
|
5519
|
+
evalFile,
|
|
5520
|
+
results: [...result.results]
|
|
5521
|
+
});
|
|
5522
|
+
}
|
|
4770
5523
|
return result.results;
|
|
4771
5524
|
} catch (fileError) {
|
|
4772
5525
|
const message = fileError instanceof Error ? fileError.message : String(fileError);
|
|
4773
5526
|
console.error(`
|
|
4774
|
-
\u26A0 Eval file failed: ${
|
|
5527
|
+
\u26A0 Eval file failed: ${path17.basename(testFilePath)} \u2014 ${message}
|
|
4775
5528
|
`);
|
|
4776
5529
|
const errorResults = applicableTestCases.map((testCase) => ({
|
|
4777
5530
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -4818,7 +5571,7 @@ async function runEvalCommand(input) {
|
|
|
4818
5571
|
console.log(formatMatrixSummary(allResults));
|
|
4819
5572
|
}
|
|
4820
5573
|
if (options.benchmarkJson && allResults.length > 0) {
|
|
4821
|
-
const benchmarkPath =
|
|
5574
|
+
const benchmarkPath = path17.resolve(options.benchmarkJson);
|
|
4822
5575
|
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
4823
5576
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
4824
5577
|
}
|
|
@@ -4830,7 +5583,8 @@ async function runEvalCommand(input) {
|
|
|
4830
5583
|
benchmarkPath: workspaceBenchmarkPath,
|
|
4831
5584
|
indexPath
|
|
4832
5585
|
} = await writeArtifactsFromResults(allResults, runDir, {
|
|
4833
|
-
evalFile
|
|
5586
|
+
evalFile,
|
|
5587
|
+
experiment: normalizeExperimentName(options.experiment)
|
|
4834
5588
|
});
|
|
4835
5589
|
console.log(`Artifact workspace written to: ${runDir}`);
|
|
4836
5590
|
console.log(` Index: ${indexPath}`);
|
|
@@ -4849,7 +5603,7 @@ async function runEvalCommand(input) {
|
|
|
4849
5603
|
await writer.close();
|
|
4850
5604
|
}
|
|
4851
5605
|
console.log(
|
|
4852
|
-
`Export file(s) written: ${resolvedExportPaths.map((p) =>
|
|
5606
|
+
`Export file(s) written: ${resolvedExportPaths.map((p) => path17.relative(cwd, p)).join(", ")}`
|
|
4853
5607
|
);
|
|
4854
5608
|
}
|
|
4855
5609
|
const failedWithWorkspaces = allResults.filter(
|
|
@@ -4865,11 +5619,29 @@ async function runEvalCommand(input) {
|
|
|
4865
5619
|
console.log(`
|
|
4866
5620
|
Results written to: ${outputPath}`);
|
|
4867
5621
|
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
5622
|
+
await maybeAutoExportRunArtifacts({
|
|
5623
|
+
cwd,
|
|
5624
|
+
run_dir: runDir,
|
|
5625
|
+
test_files: activeTestFiles,
|
|
5626
|
+
results: allResults,
|
|
5627
|
+
eval_summaries: remoteEvalSummaries.map((summary2) => ({
|
|
5628
|
+
eval_file: summary2.evalFile,
|
|
5629
|
+
total: summary2.results.length,
|
|
5630
|
+
passed: summary2.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
|
|
5631
|
+
avg_score: summary2.results.length > 0 ? summary2.results.reduce((sum, result) => sum + result.score, 0) / summary2.results.length : 0,
|
|
5632
|
+
results: summary2.results.map((result) => ({
|
|
5633
|
+
test_id: result.testId,
|
|
5634
|
+
score: result.score,
|
|
5635
|
+
status: result.executionStatus === "execution_error" || result.error ? "ERROR" : result.score >= DEFAULT_THRESHOLD ? "PASS" : "FAIL"
|
|
5636
|
+
}))
|
|
5637
|
+
})),
|
|
5638
|
+
experiment: normalizeExperimentName(options.experiment)
|
|
5639
|
+
});
|
|
4868
5640
|
}
|
|
4869
5641
|
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
4870
|
-
const evalFileArgs = activeTestFiles.map((f) =>
|
|
5642
|
+
const evalFileArgs = activeTestFiles.map((f) => path17.relative(cwd, f)).join(" ");
|
|
4871
5643
|
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
4872
|
-
const relativeOutputPath =
|
|
5644
|
+
const relativeOutputPath = path17.relative(cwd, outputPath);
|
|
4873
5645
|
console.log(
|
|
4874
5646
|
`
|
|
4875
5647
|
Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
|
|
@@ -4903,7 +5675,7 @@ async function resolveEvaluationRunner() {
|
|
|
4903
5675
|
if (!overridePath) {
|
|
4904
5676
|
return runEvaluation;
|
|
4905
5677
|
}
|
|
4906
|
-
const resolved =
|
|
5678
|
+
const resolved = path17.isAbsolute(overridePath) ? overridePath : path17.resolve(process.cwd(), overridePath);
|
|
4907
5679
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
4908
5680
|
const mod = await import(moduleUrl);
|
|
4909
5681
|
const candidate = mod.runEvaluation;
|
|
@@ -4916,11 +5688,11 @@ async function resolveEvaluationRunner() {
|
|
|
4916
5688
|
}
|
|
4917
5689
|
|
|
4918
5690
|
// src/commands/eval/discover.ts
|
|
4919
|
-
import
|
|
5691
|
+
import path18 from "node:path";
|
|
4920
5692
|
import fg2 from "fast-glob";
|
|
4921
5693
|
async function discoverEvalFiles(cwd) {
|
|
4922
5694
|
const repoRoot = await findRepoRoot(cwd);
|
|
4923
|
-
const config = await loadConfig(
|
|
5695
|
+
const config = await loadConfig(path18.join(cwd, "_"), repoRoot);
|
|
4924
5696
|
const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
|
|
4925
5697
|
const ignore = ["**/node_modules/**", "**/dist/**"];
|
|
4926
5698
|
const matches = await fg2(patterns, {
|
|
@@ -4932,7 +5704,7 @@ async function discoverEvalFiles(cwd) {
|
|
|
4932
5704
|
caseSensitiveMatch: false
|
|
4933
5705
|
});
|
|
4934
5706
|
const evalFiles = matches.map((absPath) => {
|
|
4935
|
-
const relativePath =
|
|
5707
|
+
const relativePath = path18.relative(cwd, absPath);
|
|
4936
5708
|
const category = deriveCategory(relativePath);
|
|
4937
5709
|
return { path: absPath, relativePath, category };
|
|
4938
5710
|
});
|
|
@@ -4956,21 +5728,36 @@ export {
|
|
|
4956
5728
|
package_default,
|
|
4957
5729
|
toSnakeCaseDeep,
|
|
4958
5730
|
RESULT_INDEX_FILENAME,
|
|
4959
|
-
RESULT_RUNS_DIRNAME,
|
|
4960
5731
|
buildDefaultRunDir,
|
|
4961
|
-
resolveExistingRunPrimaryPath,
|
|
4962
|
-
resolveWorkspaceOrFilePath,
|
|
4963
5732
|
resolveRunManifestPath,
|
|
4964
5733
|
parseResultManifest,
|
|
4965
5734
|
resolveResultSourcePath,
|
|
4966
5735
|
loadManifestResults,
|
|
4967
5736
|
loadLightweightResults,
|
|
4968
5737
|
HtmlWriter,
|
|
5738
|
+
resolveEvalPaths,
|
|
5739
|
+
findRepoRoot,
|
|
5740
|
+
c,
|
|
5741
|
+
padRight,
|
|
5742
|
+
padLeft,
|
|
5743
|
+
loadResultFile,
|
|
5744
|
+
getTraceSummary,
|
|
5745
|
+
getTraceSpans,
|
|
5746
|
+
toTraceSummary,
|
|
5747
|
+
listResultFiles,
|
|
5748
|
+
formatNumber,
|
|
5749
|
+
formatDuration,
|
|
5750
|
+
formatCost,
|
|
5751
|
+
formatSize,
|
|
5752
|
+
formatScore,
|
|
5753
|
+
getRemoteResultsStatus,
|
|
5754
|
+
syncRemoteResults,
|
|
5755
|
+
listMergedResultFiles,
|
|
5756
|
+
findRunById,
|
|
5757
|
+
maybeAutoExportRunArtifacts,
|
|
4969
5758
|
writeArtifactsFromResults,
|
|
4970
5759
|
resolveRunCacheFile,
|
|
4971
5760
|
loadRunCache,
|
|
4972
|
-
resolveEvalPaths,
|
|
4973
|
-
findRepoRoot,
|
|
4974
5761
|
detectFileType,
|
|
4975
5762
|
validateEvalFile,
|
|
4976
5763
|
validateTargetsFile,
|
|
@@ -4984,4 +5771,4 @@ export {
|
|
|
4984
5771
|
getCategories,
|
|
4985
5772
|
filterByCategory
|
|
4986
5773
|
};
|
|
4987
|
-
//# sourceMappingURL=chunk-
|
|
5774
|
+
//# sourceMappingURL=chunk-FH24D7XW.js.map
|