@agentv/core 4.31.4-next.1 → 4.33.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-575K7WRM.js → chunk-7QB53OPK.js} +1319 -303
- package/dist/chunk-7QB53OPK.js.map +1 -0
- package/dist/{chunk-5RQMJZDJ.js → chunk-EW5X2RGJ.js} +110 -50
- package/dist/chunk-EW5X2RGJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +196 -87
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +3 -1
- package/dist/evaluation/validation/index.d.ts +3 -1
- package/dist/evaluation/validation/index.js +170 -75
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +2462 -963
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1843 -67
- package/dist/index.d.ts +1843 -67
- package/dist/index.js +625 -196
- package/dist/index.js.map +1 -1
- package/dist/{ts-eval-loader-FRQF6KHR.js → ts-eval-loader-EQJX3OLT.js} +3 -3
- package/package.json +2 -2
- package/dist/chunk-575K7WRM.js.map +0 -1
- package/dist/chunk-5RQMJZDJ.js.map +0 -1
- /package/dist/{ts-eval-loader-FRQF6KHR.js.map → ts-eval-loader-EQJX3OLT.js.map} +0 -0
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
import {
|
|
2
2
|
LLM_GRADER_CAPABLE_KINDS,
|
|
3
|
+
RUBRIC_OPERATOR_VALUES,
|
|
3
4
|
buildDirectoryChain,
|
|
4
5
|
expandFileReferences,
|
|
5
6
|
extractLastAssistantContent,
|
|
6
7
|
fileExists,
|
|
7
8
|
findGitRoot,
|
|
9
|
+
getAgentvConfigDir,
|
|
10
|
+
getAgentvDataDir,
|
|
11
|
+
getSubagentsRoot,
|
|
12
|
+
getWorkspacePoolRoot,
|
|
13
|
+
getWorkspacesRoot,
|
|
8
14
|
interpolateEnv,
|
|
9
15
|
interpolateTemplateVars,
|
|
10
16
|
isAgentProvider,
|
|
@@ -18,7 +24,7 @@ import {
|
|
|
18
24
|
readTextFile,
|
|
19
25
|
resolveDelegatedTargetDefinition,
|
|
20
26
|
resolveTargetDefinition
|
|
21
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-EW5X2RGJ.js";
|
|
22
28
|
import {
|
|
23
29
|
execFileWithStdin,
|
|
24
30
|
execShellWithStdin
|
|
@@ -41,6 +47,49 @@ import { existsSync as existsSync6 } from "node:fs";
|
|
|
41
47
|
import path45 from "node:path";
|
|
42
48
|
import micromatch4 from "micromatch";
|
|
43
49
|
|
|
50
|
+
// src/evaluation/cache/response-cache.ts
|
|
51
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
52
|
+
import path from "node:path";
|
|
53
|
+
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
54
|
+
var ResponseCache = class {
|
|
55
|
+
cachePath;
|
|
56
|
+
constructor(cachePath) {
|
|
57
|
+
this.cachePath = cachePath ?? DEFAULT_CACHE_PATH;
|
|
58
|
+
}
|
|
59
|
+
async get(key) {
|
|
60
|
+
const filePath = this.keyToPath(key);
|
|
61
|
+
try {
|
|
62
|
+
const data = await readFile(filePath, "utf8");
|
|
63
|
+
return JSON.parse(data);
|
|
64
|
+
} catch {
|
|
65
|
+
return void 0;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
async set(key, value) {
|
|
69
|
+
const filePath = this.keyToPath(key);
|
|
70
|
+
const dir = path.dirname(filePath);
|
|
71
|
+
await mkdir(dir, { recursive: true });
|
|
72
|
+
await writeFile(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
73
|
+
}
|
|
74
|
+
keyToPath(key) {
|
|
75
|
+
const prefix = key.slice(0, 2);
|
|
76
|
+
return path.join(this.cachePath, prefix, `${key}.json`);
|
|
77
|
+
}
|
|
78
|
+
};
|
|
79
|
+
function shouldEnableCache(params) {
|
|
80
|
+
if (params.cliNoCache) return false;
|
|
81
|
+
if (params.cliCache) return true;
|
|
82
|
+
if (params.yamlCache !== void 0) return params.yamlCache;
|
|
83
|
+
return params.tsConfigCache === true;
|
|
84
|
+
}
|
|
85
|
+
function shouldSkipCacheForTemperature(targetConfig) {
|
|
86
|
+
const temp = targetConfig.temperature;
|
|
87
|
+
if (typeof temp === "number" && temp > 0) {
|
|
88
|
+
return true;
|
|
89
|
+
}
|
|
90
|
+
return false;
|
|
91
|
+
}
|
|
92
|
+
|
|
44
93
|
// src/evaluation/graders/scoring.ts
|
|
45
94
|
var DEFAULT_THRESHOLD = 0.8;
|
|
46
95
|
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
@@ -133,7 +182,7 @@ function negateScore(score) {
|
|
|
133
182
|
import { execFile as execFile3 } from "node:child_process";
|
|
134
183
|
import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
|
|
135
184
|
import { existsSync as existsSync5 } from "node:fs";
|
|
136
|
-
import { copyFile as copyFile2, mkdir as
|
|
185
|
+
import { copyFile as copyFile2, mkdir as mkdir15, readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
137
186
|
import path44 from "node:path";
|
|
138
187
|
import { promisify as promisify7 } from "node:util";
|
|
139
188
|
import micromatch3 from "micromatch";
|
|
@@ -277,39 +326,8 @@ function validateConcurrency(concurrency) {
|
|
|
277
326
|
}
|
|
278
327
|
}
|
|
279
328
|
|
|
280
|
-
// src/paths.ts
|
|
281
|
-
import os from "node:os";
|
|
282
|
-
import path from "node:path";
|
|
283
|
-
var logged = false;
|
|
284
|
-
function getAgentvConfigDir() {
|
|
285
|
-
return path.join(os.homedir(), ".agentv");
|
|
286
|
-
}
|
|
287
|
-
function getAgentvHome() {
|
|
288
|
-
const envHome = process.env.AGENTV_HOME;
|
|
289
|
-
if (envHome && envHome !== "undefined") {
|
|
290
|
-
if (!logged) {
|
|
291
|
-
logged = true;
|
|
292
|
-
console.log(`Using AGENTV_HOME: ${envHome}`);
|
|
293
|
-
}
|
|
294
|
-
return envHome;
|
|
295
|
-
}
|
|
296
|
-
return path.join(os.homedir(), ".agentv");
|
|
297
|
-
}
|
|
298
|
-
function getWorkspacesRoot() {
|
|
299
|
-
return path.join(getAgentvHome(), "workspaces");
|
|
300
|
-
}
|
|
301
|
-
function getSubagentsRoot() {
|
|
302
|
-
return path.join(getAgentvHome(), "subagents");
|
|
303
|
-
}
|
|
304
|
-
function getTraceStateRoot() {
|
|
305
|
-
return path.join(getAgentvHome(), "trace-state");
|
|
306
|
-
}
|
|
307
|
-
function getWorkspacePoolRoot() {
|
|
308
|
-
return path.join(getAgentvHome(), "workspace-pool");
|
|
309
|
-
}
|
|
310
|
-
|
|
311
329
|
// src/evaluation/graders/code-grader.ts
|
|
312
|
-
import { mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
330
|
+
import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
|
|
313
331
|
import { tmpdir } from "node:os";
|
|
314
332
|
import { dirname, join } from "node:path";
|
|
315
333
|
|
|
@@ -643,7 +661,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
|
|
|
643
661
|
const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
|
|
644
662
|
const dir = await getWorkDir();
|
|
645
663
|
const filePath = join(dir, `img-${counter++}.${ext}`);
|
|
646
|
-
await
|
|
664
|
+
await writeFile2(filePath, Buffer.from(base64Data, "base64"));
|
|
647
665
|
blocks.push({ type: "image", media_type: img.media_type, path: filePath });
|
|
648
666
|
} else {
|
|
649
667
|
blocks.push({ type: "image", media_type: img.media_type, path: img.source });
|
|
@@ -686,7 +704,7 @@ var CodeGrader = class {
|
|
|
686
704
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
687
705
|
const tmpDir = await mkdtemp(join(tmpdir(), "agentv-grader-"));
|
|
688
706
|
outputPath = join(tmpDir, "output.json");
|
|
689
|
-
await
|
|
707
|
+
await writeFile2(outputPath, serialized);
|
|
690
708
|
outputForPayload = null;
|
|
691
709
|
}
|
|
692
710
|
}
|
|
@@ -703,6 +721,7 @@ var CodeGrader = class {
|
|
|
703
721
|
context.evalCase.input,
|
|
704
722
|
getImageDir
|
|
705
723
|
),
|
|
724
|
+
metadata: context.evalCase.metadata ?? null,
|
|
706
725
|
trace: context.trace ?? null,
|
|
707
726
|
tokenUsage: context.tokenUsage ?? null,
|
|
708
727
|
costUsd: context.costUsd ?? null,
|
|
@@ -875,7 +894,7 @@ import path3 from "node:path";
|
|
|
875
894
|
import { z } from "zod";
|
|
876
895
|
|
|
877
896
|
// src/evaluation/content-preprocessor.ts
|
|
878
|
-
import { readFile } from "node:fs/promises";
|
|
897
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
879
898
|
import path2 from "node:path";
|
|
880
899
|
import { fileURLToPath } from "node:url";
|
|
881
900
|
var MIME_TYPE_ALIASES = {
|
|
@@ -944,7 +963,7 @@ async function preprocessContentFile(block, preprocessors, basePath) {
|
|
|
944
963
|
return runContentPreprocessor(block, resolvedPath, preprocessor);
|
|
945
964
|
}
|
|
946
965
|
try {
|
|
947
|
-
const buffer = await
|
|
966
|
+
const buffer = await readFile2(resolvedPath);
|
|
948
967
|
const text = buffer.toString("utf8").replace(/\r\n/g, "\n");
|
|
949
968
|
if (buffer.includes(0) || text.includes(REPLACEMENT_CHAR)) {
|
|
950
969
|
return {
|
|
@@ -1040,6 +1059,10 @@ ${text}`;
|
|
|
1040
1059
|
var TEMPLATE_VARIABLES = {
|
|
1041
1060
|
EXPECTED_OUTPUT: "expected_output",
|
|
1042
1061
|
CRITERIA: "criteria",
|
|
1062
|
+
METADATA: "metadata",
|
|
1063
|
+
METADATA_JSON: "metadata_json",
|
|
1064
|
+
RUBRICS: "rubrics",
|
|
1065
|
+
RUBRICS_JSON: "rubrics_json",
|
|
1043
1066
|
INPUT: "input",
|
|
1044
1067
|
OUTPUT: "output",
|
|
1045
1068
|
FILE_CHANGES: "file_changes",
|
|
@@ -1062,6 +1085,27 @@ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
|
|
|
1062
1085
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
|
|
1063
1086
|
]);
|
|
1064
1087
|
|
|
1088
|
+
// src/evaluation/graders/rubric-operators.ts
|
|
1089
|
+
var OPERATOR_GUIDANCE = {
|
|
1090
|
+
correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
|
|
1091
|
+
contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
|
|
1092
|
+
};
|
|
1093
|
+
function formatRubricOperatorLabel(operator) {
|
|
1094
|
+
return operator ? ` (operator: ${operator})` : "";
|
|
1095
|
+
}
|
|
1096
|
+
function formatRubricOperatorGuidance(rubrics) {
|
|
1097
|
+
const operators = /* @__PURE__ */ new Set();
|
|
1098
|
+
for (const rubric of rubrics) {
|
|
1099
|
+
if (rubric.operator) {
|
|
1100
|
+
operators.add(rubric.operator);
|
|
1101
|
+
}
|
|
1102
|
+
}
|
|
1103
|
+
if (operators.size === 0) {
|
|
1104
|
+
return [];
|
|
1105
|
+
}
|
|
1106
|
+
return [...operators].map((operator) => OPERATOR_GUIDANCE[operator]);
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1065
1109
|
// src/evaluation/graders/llm-grader.ts
|
|
1066
1110
|
var DEFAULT_MAX_STEPS = 10;
|
|
1067
1111
|
var MAX_STEPS_LIMIT = 50;
|
|
@@ -1144,6 +1188,32 @@ var scoreRangeEvaluationSchema = z.object({
|
|
|
1144
1188
|
checks: z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
1145
1189
|
overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
1146
1190
|
});
|
|
1191
|
+
function stringifyPretty(value) {
|
|
1192
|
+
return value === void 0 ? "" : JSON.stringify(value, null, 2);
|
|
1193
|
+
}
|
|
1194
|
+
function stringifyCompact(value) {
|
|
1195
|
+
return value === void 0 ? "" : JSON.stringify(value);
|
|
1196
|
+
}
|
|
1197
|
+
function buildTemplateVariables(context) {
|
|
1198
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1199
|
+
const rubrics = context.evaluator?.type === "llm-grader" ? context.evaluator.rubrics : void 0;
|
|
1200
|
+
return {
|
|
1201
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
1202
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
1203
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1204
|
+
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
1205
|
+
[TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata),
|
|
1206
|
+
[TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata),
|
|
1207
|
+
[TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
|
|
1208
|
+
[TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
|
|
1209
|
+
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1210
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1211
|
+
// Deprecated aliases — same values as the primary variables above
|
|
1212
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1213
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1214
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1215
|
+
};
|
|
1216
|
+
}
|
|
1147
1217
|
function resolveContentBasePath(context) {
|
|
1148
1218
|
if (context.workspacePath) {
|
|
1149
1219
|
return context.workspacePath;
|
|
@@ -1215,19 +1285,7 @@ var LlmGrader = class {
|
|
|
1215
1285
|
// LLM mode (existing)
|
|
1216
1286
|
// ---------------------------------------------------------------------------
|
|
1217
1287
|
async evaluateFreeform(context, graderProvider) {
|
|
1218
|
-
const
|
|
1219
|
-
const variables = {
|
|
1220
|
-
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
1221
|
-
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
1222
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1223
|
-
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
1224
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1225
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1226
|
-
// Deprecated aliases — same values as the primary variables above
|
|
1227
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1228
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1229
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1230
|
-
};
|
|
1288
|
+
const variables = buildTemplateVariables(context);
|
|
1231
1289
|
const systemPrompt = buildOutputSchema();
|
|
1232
1290
|
const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
1233
1291
|
warnDeprecatedTemplateVars(graderTemplate);
|
|
@@ -1294,7 +1352,7 @@ ${context.toolCalls}`;
|
|
|
1294
1352
|
if (hasScoreRanges) {
|
|
1295
1353
|
return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
|
|
1296
1354
|
}
|
|
1297
|
-
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
1355
|
+
const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildRubricPrompt(context, rubrics);
|
|
1298
1356
|
const systemPrompt = buildRubricOutputSchema();
|
|
1299
1357
|
const graderRawRequest = {
|
|
1300
1358
|
userPrompt: prompt,
|
|
@@ -1339,7 +1397,7 @@ ${context.toolCalls}`;
|
|
|
1339
1397
|
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
1340
1398
|
*/
|
|
1341
1399
|
async evaluateWithScoreRanges(context, graderProvider, rubrics) {
|
|
1342
|
-
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
1400
|
+
const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildScoreRangePrompt(context, rubrics);
|
|
1343
1401
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
1344
1402
|
const graderRawRequest = {
|
|
1345
1403
|
userPrompt: prompt,
|
|
@@ -1558,21 +1616,11 @@ ${context.toolCalls}`;
|
|
|
1558
1616
|
*/
|
|
1559
1617
|
buildAgentUserPrompt(context) {
|
|
1560
1618
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1561
|
-
const variables =
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1567
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1568
|
-
// Deprecated aliases
|
|
1569
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1570
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1571
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1572
|
-
};
|
|
1573
|
-
if (this.graderTemplate) {
|
|
1574
|
-
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
1575
|
-
return substituteVariables(this.graderTemplate, variables);
|
|
1619
|
+
const variables = buildTemplateVariables(context);
|
|
1620
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
1621
|
+
if (template) {
|
|
1622
|
+
warnDeprecatedTemplateVars(template);
|
|
1623
|
+
return substituteVariables(template, variables);
|
|
1576
1624
|
}
|
|
1577
1625
|
const config = context.evaluator;
|
|
1578
1626
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
@@ -1622,21 +1670,11 @@ ${context.toolCalls}`;
|
|
|
1622
1670
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1623
1671
|
const config = context.evaluator;
|
|
1624
1672
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1631
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1632
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1633
|
-
// Deprecated aliases
|
|
1634
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1635
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1636
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1637
|
-
};
|
|
1638
|
-
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
1639
|
-
const customPrompt = substituteVariables(this.graderTemplate, variables);
|
|
1673
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
1674
|
+
if (template) {
|
|
1675
|
+
const variables = buildTemplateVariables(context);
|
|
1676
|
+
warnDeprecatedTemplateVars(template);
|
|
1677
|
+
const customPrompt = substituteVariables(template, variables);
|
|
1640
1678
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
1641
1679
|
return `${customPrompt}
|
|
1642
1680
|
|
|
@@ -1762,6 +1800,9 @@ ${outputSchema}`;
|
|
|
1762
1800
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
1763
1801
|
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
1764
1802
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
1803
|
+
if (rubric.operator) {
|
|
1804
|
+
parts.push(`Operator: ${rubric.operator}`);
|
|
1805
|
+
}
|
|
1765
1806
|
if (rubric.outcome) {
|
|
1766
1807
|
parts.push(`Description: ${rubric.outcome}`);
|
|
1767
1808
|
}
|
|
@@ -1774,12 +1815,21 @@ ${outputSchema}`;
|
|
|
1774
1815
|
}
|
|
1775
1816
|
}
|
|
1776
1817
|
}
|
|
1818
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
1819
|
+
if (operatorGuidance.length > 0) {
|
|
1820
|
+
parts.push("", ...operatorGuidance);
|
|
1821
|
+
}
|
|
1777
1822
|
parts.push(
|
|
1778
1823
|
"",
|
|
1779
1824
|
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
1780
1825
|
);
|
|
1781
1826
|
return parts.join("\n");
|
|
1782
1827
|
}
|
|
1828
|
+
buildCustomPrompt(context) {
|
|
1829
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
|
|
1830
|
+
warnDeprecatedTemplateVars(template);
|
|
1831
|
+
return substituteVariables(template, buildTemplateVariables(context));
|
|
1832
|
+
}
|
|
1783
1833
|
buildRubricPrompt(context, rubrics) {
|
|
1784
1834
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1785
1835
|
const parts = [
|
|
@@ -1803,10 +1853,21 @@ ${outputSchema}`;
|
|
|
1803
1853
|
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
1804
1854
|
}
|
|
1805
1855
|
parts.push("[[ ## rubrics ## ]]");
|
|
1856
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
1857
|
+
if (operatorGuidance.length > 0) {
|
|
1858
|
+
parts.push("", "Operator guidance:");
|
|
1859
|
+
for (const guidance of operatorGuidance) {
|
|
1860
|
+
parts.push(`- ${guidance}`);
|
|
1861
|
+
}
|
|
1862
|
+
parts.push("");
|
|
1863
|
+
}
|
|
1806
1864
|
for (const rubric of rubrics) {
|
|
1807
1865
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
1808
1866
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
1809
|
-
|
|
1867
|
+
const operatorLabel = formatRubricOperatorLabel(rubric.operator);
|
|
1868
|
+
parts.push(
|
|
1869
|
+
`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`
|
|
1870
|
+
);
|
|
1810
1871
|
}
|
|
1811
1872
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
1812
1873
|
return parts.join("\n");
|
|
@@ -2538,6 +2599,385 @@ var CostGrader = class {
|
|
|
2538
2599
|
};
|
|
2539
2600
|
|
|
2540
2601
|
// src/evaluation/trace.ts
|
|
2602
|
+
import { z as z2 } from "zod";
|
|
2603
|
+
var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trace.v1";
|
|
2604
|
+
var NORMALIZED_TRACE_SOURCE_KINDS = [
|
|
2605
|
+
"agentv_run",
|
|
2606
|
+
"otlp",
|
|
2607
|
+
"phoenix",
|
|
2608
|
+
"langfuse",
|
|
2609
|
+
"pi_session",
|
|
2610
|
+
"imported_transcript",
|
|
2611
|
+
"compact_transcript"
|
|
2612
|
+
];
|
|
2613
|
+
var NORMALIZED_TRACE_EVENT_TYPES = [
|
|
2614
|
+
"message",
|
|
2615
|
+
"model_turn",
|
|
2616
|
+
"tool_call",
|
|
2617
|
+
"tool_result"
|
|
2618
|
+
];
|
|
2619
|
+
var NORMALIZED_TOOL_STATUSES = ["ok", "error", "timeout", "cancelled", "unknown"];
|
|
2620
|
+
var NORMALIZED_REDACTION_LEVELS = ["none", "partial", "full"];
|
|
2621
|
+
function omitUndefinedProperties(value) {
|
|
2622
|
+
return Object.fromEntries(
|
|
2623
|
+
Object.entries(value).filter(([, property]) => property !== void 0)
|
|
2624
|
+
);
|
|
2625
|
+
}
|
|
2626
|
+
var MetadataWireSchema = z2.record(z2.string(), z2.unknown());
|
|
2627
|
+
var TokenUsageWireSchema = z2.object({
|
|
2628
|
+
input: z2.number(),
|
|
2629
|
+
output: z2.number(),
|
|
2630
|
+
cached: z2.number().optional(),
|
|
2631
|
+
reasoning: z2.number().optional()
|
|
2632
|
+
});
|
|
2633
|
+
var NormalizedRedactionStateWireSchema = z2.object({
|
|
2634
|
+
level: z2.enum(NORMALIZED_REDACTION_LEVELS),
|
|
2635
|
+
fields: z2.array(z2.string()).optional(),
|
|
2636
|
+
reason: z2.string().optional()
|
|
2637
|
+
});
|
|
2638
|
+
var NormalizedTraceErrorWireSchema = z2.object({
|
|
2639
|
+
message: z2.string(),
|
|
2640
|
+
name: z2.string().optional(),
|
|
2641
|
+
code: z2.string().optional(),
|
|
2642
|
+
stack: z2.string().optional(),
|
|
2643
|
+
metadata: MetadataWireSchema.optional()
|
|
2644
|
+
});
|
|
2645
|
+
var NormalizedTraceSourceWireSchema = z2.object({
|
|
2646
|
+
kind: z2.enum(NORMALIZED_TRACE_SOURCE_KINDS),
|
|
2647
|
+
path: z2.string().optional(),
|
|
2648
|
+
url: z2.string().optional(),
|
|
2649
|
+
provider: z2.string().optional(),
|
|
2650
|
+
format: z2.string().optional(),
|
|
2651
|
+
version: z2.string().optional(),
|
|
2652
|
+
metadata: MetadataWireSchema.optional()
|
|
2653
|
+
});
|
|
2654
|
+
var NormalizedTraceSessionWireSchema = z2.object({
|
|
2655
|
+
session_id: z2.string().optional(),
|
|
2656
|
+
conversation_id: z2.string().optional(),
|
|
2657
|
+
cwd: z2.string().optional(),
|
|
2658
|
+
started_at: z2.string().optional(),
|
|
2659
|
+
ended_at: z2.string().optional(),
|
|
2660
|
+
metadata: MetadataWireSchema.optional()
|
|
2661
|
+
});
|
|
2662
|
+
var NormalizedTraceBranchWireSchema = z2.object({
|
|
2663
|
+
selected_leaf_id: z2.string().optional(),
|
|
2664
|
+
selected_path_ids: z2.array(z2.string()).optional(),
|
|
2665
|
+
included_event_ids: z2.array(z2.string()).optional(),
|
|
2666
|
+
omitted_event_ids: z2.array(z2.string()).optional(),
|
|
2667
|
+
selection_reason: z2.string().optional()
|
|
2668
|
+
});
|
|
2669
|
+
var NormalizedTraceSourceRefWireSchema = z2.object({
|
|
2670
|
+
event_id: z2.string().optional(),
|
|
2671
|
+
message_id: z2.string().optional(),
|
|
2672
|
+
span_id: z2.string().optional(),
|
|
2673
|
+
trace_id: z2.string().optional(),
|
|
2674
|
+
raw_kind: z2.string().optional(),
|
|
2675
|
+
path: z2.string().optional(),
|
|
2676
|
+
line: z2.number().int().nonnegative().optional(),
|
|
2677
|
+
metadata: MetadataWireSchema.optional()
|
|
2678
|
+
});
|
|
2679
|
+
var NormalizedRawEvidenceWireSchema = z2.object({
|
|
2680
|
+
kind: z2.string(),
|
|
2681
|
+
ref: z2.string().optional(),
|
|
2682
|
+
media_type: z2.string().optional(),
|
|
2683
|
+
content: z2.unknown().optional(),
|
|
2684
|
+
redacted: z2.boolean().optional(),
|
|
2685
|
+
metadata: MetadataWireSchema.optional()
|
|
2686
|
+
});
|
|
2687
|
+
var NormalizedTraceMessageWireSchema = z2.object({
|
|
2688
|
+
role: z2.string(),
|
|
2689
|
+
name: z2.string().optional(),
|
|
2690
|
+
content: z2.unknown().optional(),
|
|
2691
|
+
redaction: NormalizedRedactionStateWireSchema.optional(),
|
|
2692
|
+
token_usage: TokenUsageWireSchema.optional(),
|
|
2693
|
+
metadata: MetadataWireSchema.optional()
|
|
2694
|
+
});
|
|
2695
|
+
var NormalizedTraceModelWireSchema = z2.object({
|
|
2696
|
+
provider: z2.string().optional(),
|
|
2697
|
+
name: z2.string().optional(),
|
|
2698
|
+
invocation_id: z2.string().optional(),
|
|
2699
|
+
token_usage: TokenUsageWireSchema.optional(),
|
|
2700
|
+
metadata: MetadataWireSchema.optional()
|
|
2701
|
+
});
|
|
2702
|
+
var NormalizedTraceToolWireSchema = z2.object({
|
|
2703
|
+
name: z2.string(),
|
|
2704
|
+
call_id: z2.string().optional(),
|
|
2705
|
+
input: z2.unknown().optional(),
|
|
2706
|
+
output: z2.unknown().optional(),
|
|
2707
|
+
status: z2.enum(NORMALIZED_TOOL_STATUSES).optional(),
|
|
2708
|
+
error: NormalizedTraceErrorWireSchema.optional(),
|
|
2709
|
+
redaction: NormalizedRedactionStateWireSchema.optional(),
|
|
2710
|
+
metadata: MetadataWireSchema.optional()
|
|
2711
|
+
});
|
|
2712
|
+
var NormalizedTraceEventWireSchema = z2.object({
|
|
2713
|
+
event_id: z2.string(),
|
|
2714
|
+
parent_event_id: z2.string().optional(),
|
|
2715
|
+
ordinal: z2.number().int().nonnegative(),
|
|
2716
|
+
type: z2.enum(NORMALIZED_TRACE_EVENT_TYPES),
|
|
2717
|
+
timestamp: z2.string().optional(),
|
|
2718
|
+
duration_ms: z2.number().nonnegative().optional(),
|
|
2719
|
+
duration_inferred: z2.boolean().optional(),
|
|
2720
|
+
turn_index: z2.number().int().nonnegative().optional(),
|
|
2721
|
+
message: NormalizedTraceMessageWireSchema.optional(),
|
|
2722
|
+
model: NormalizedTraceModelWireSchema.optional(),
|
|
2723
|
+
tool: NormalizedTraceToolWireSchema.optional(),
|
|
2724
|
+
source_ref: NormalizedTraceSourceRefWireSchema.optional(),
|
|
2725
|
+
raw_evidence: z2.array(NormalizedRawEvidenceWireSchema).optional(),
|
|
2726
|
+
redaction: NormalizedRedactionStateWireSchema.optional(),
|
|
2727
|
+
metadata: MetadataWireSchema.optional()
|
|
2728
|
+
});
|
|
2729
|
+
var NormalizedTrajectoryWireSchema = z2.object({
|
|
2730
|
+
schema_version: z2.literal(NORMALIZED_TRAJECTORY_SCHEMA_VERSION),
|
|
2731
|
+
source: NormalizedTraceSourceWireSchema,
|
|
2732
|
+
session: NormalizedTraceSessionWireSchema,
|
|
2733
|
+
branch: NormalizedTraceBranchWireSchema.optional(),
|
|
2734
|
+
events: z2.array(NormalizedTraceEventWireSchema),
|
|
2735
|
+
token_usage: TokenUsageWireSchema.optional(),
|
|
2736
|
+
cost_usd: z2.number().optional(),
|
|
2737
|
+
duration_ms: z2.number().optional(),
|
|
2738
|
+
started_at: z2.string().optional(),
|
|
2739
|
+
ended_at: z2.string().optional(),
|
|
2740
|
+
metadata: MetadataWireSchema.optional()
|
|
2741
|
+
});
|
|
2742
|
+
function toNormalizedTrajectoryWire(trajectory) {
|
|
2743
|
+
return NormalizedTrajectoryWireSchema.parse(
|
|
2744
|
+
omitUndefinedProperties({
|
|
2745
|
+
schema_version: trajectory.schemaVersion,
|
|
2746
|
+
source: toNormalizedTraceSourceWire(trajectory.source),
|
|
2747
|
+
session: toNormalizedTraceSessionWire(trajectory.session),
|
|
2748
|
+
branch: trajectory.branch ? toNormalizedTraceBranchWire(trajectory.branch) : void 0,
|
|
2749
|
+
events: trajectory.events.map(toNormalizedTraceEventWire),
|
|
2750
|
+
token_usage: trajectory.tokenUsage,
|
|
2751
|
+
cost_usd: trajectory.costUsd,
|
|
2752
|
+
duration_ms: trajectory.durationMs,
|
|
2753
|
+
started_at: trajectory.startedAt,
|
|
2754
|
+
ended_at: trajectory.endedAt,
|
|
2755
|
+
metadata: trajectory.metadata
|
|
2756
|
+
})
|
|
2757
|
+
);
|
|
2758
|
+
}
|
|
2759
|
+
function fromNormalizedTrajectoryWire(input) {
|
|
2760
|
+
const wire = NormalizedTrajectoryWireSchema.parse(input);
|
|
2761
|
+
return {
|
|
2762
|
+
schemaVersion: wire.schema_version,
|
|
2763
|
+
source: fromNormalizedTraceSourceWire(wire.source),
|
|
2764
|
+
session: fromNormalizedTraceSessionWire(wire.session),
|
|
2765
|
+
branch: wire.branch ? fromNormalizedTraceBranchWire(wire.branch) : void 0,
|
|
2766
|
+
events: wire.events.map(fromNormalizedTraceEventWire),
|
|
2767
|
+
tokenUsage: wire.token_usage,
|
|
2768
|
+
costUsd: wire.cost_usd,
|
|
2769
|
+
durationMs: wire.duration_ms,
|
|
2770
|
+
startedAt: wire.started_at,
|
|
2771
|
+
endedAt: wire.ended_at,
|
|
2772
|
+
metadata: wire.metadata
|
|
2773
|
+
};
|
|
2774
|
+
}
|
|
2775
|
+
function toNormalizedTraceSourceWire(source) {
|
|
2776
|
+
return omitUndefinedProperties({
|
|
2777
|
+
kind: source.kind,
|
|
2778
|
+
path: source.path,
|
|
2779
|
+
url: source.url,
|
|
2780
|
+
provider: source.provider,
|
|
2781
|
+
format: source.format,
|
|
2782
|
+
version: source.version,
|
|
2783
|
+
metadata: source.metadata
|
|
2784
|
+
});
|
|
2785
|
+
}
|
|
2786
|
+
function fromNormalizedTraceSourceWire(source) {
|
|
2787
|
+
return {
|
|
2788
|
+
kind: source.kind,
|
|
2789
|
+
path: source.path,
|
|
2790
|
+
url: source.url,
|
|
2791
|
+
provider: source.provider,
|
|
2792
|
+
format: source.format,
|
|
2793
|
+
version: source.version,
|
|
2794
|
+
metadata: source.metadata
|
|
2795
|
+
};
|
|
2796
|
+
}
|
|
2797
|
+
function toNormalizedTraceSessionWire(session) {
|
|
2798
|
+
return omitUndefinedProperties({
|
|
2799
|
+
session_id: session.sessionId,
|
|
2800
|
+
conversation_id: session.conversationId,
|
|
2801
|
+
cwd: session.cwd,
|
|
2802
|
+
started_at: session.startedAt,
|
|
2803
|
+
ended_at: session.endedAt,
|
|
2804
|
+
metadata: session.metadata
|
|
2805
|
+
});
|
|
2806
|
+
}
|
|
2807
|
+
function fromNormalizedTraceSessionWire(session) {
|
|
2808
|
+
return {
|
|
2809
|
+
sessionId: session.session_id,
|
|
2810
|
+
conversationId: session.conversation_id,
|
|
2811
|
+
cwd: session.cwd,
|
|
2812
|
+
startedAt: session.started_at,
|
|
2813
|
+
endedAt: session.ended_at,
|
|
2814
|
+
metadata: session.metadata
|
|
2815
|
+
};
|
|
2816
|
+
}
|
|
2817
|
+
function toNormalizedTraceBranchWire(branch) {
|
|
2818
|
+
return omitUndefinedProperties({
|
|
2819
|
+
selected_leaf_id: branch.selectedLeafId,
|
|
2820
|
+
selected_path_ids: branch.selectedPathIds,
|
|
2821
|
+
included_event_ids: branch.includedEventIds,
|
|
2822
|
+
omitted_event_ids: branch.omittedEventIds,
|
|
2823
|
+
selection_reason: branch.selectionReason
|
|
2824
|
+
});
|
|
2825
|
+
}
|
|
2826
|
+
function fromNormalizedTraceBranchWire(branch) {
|
|
2827
|
+
return {
|
|
2828
|
+
selectedLeafId: branch.selected_leaf_id,
|
|
2829
|
+
selectedPathIds: branch.selected_path_ids,
|
|
2830
|
+
includedEventIds: branch.included_event_ids,
|
|
2831
|
+
omittedEventIds: branch.omitted_event_ids,
|
|
2832
|
+
selectionReason: branch.selection_reason
|
|
2833
|
+
};
|
|
2834
|
+
}
|
|
2835
|
+
/**
 * Serialize one trace event into its snake_case wire form and run the result
 * through `NormalizedTraceEventWireSchema.parse`.
 *
 * Nested `message`, `model`, `tool` and `sourceRef` values are converted with
 * their own wire mappers only when truthy; `rawEvidence` entries are mapped
 * one by one when the array is present.
 *
 * @param {object} event - In-memory (camelCase) trace event.
 * @returns {object} The value returned by `NormalizedTraceEventWireSchema.parse`.
 */
function toNormalizedTraceEventWire(event) {
  const { message, model, tool, sourceRef, rawEvidence } = event;
  const wireEvent = omitUndefinedProperties({
    event_id: event.eventId,
    parent_event_id: event.parentEventId,
    ordinal: event.ordinal,
    type: event.type,
    timestamp: event.timestamp,
    duration_ms: event.durationMs,
    duration_inferred: event.durationInferred,
    turn_index: event.turnIndex,
    message: message ? toNormalizedTraceMessageWire(message) : undefined,
    model: model ? toNormalizedTraceModelWire(model) : undefined,
    tool: tool ? toNormalizedTraceToolWire(tool) : undefined,
    source_ref: sourceRef ? toNormalizedTraceSourceRefWire(sourceRef) : undefined,
    raw_evidence: rawEvidence?.map(toNormalizedRawEvidenceWire),
    redaction: event.redaction,
    metadata: event.metadata
  });
  return NormalizedTraceEventWireSchema.parse(wireEvent);
}
|
|
2856
|
+
/**
 * Rebuild the in-memory (camelCase) trace event from its snake_case wire form.
 *
 * Nested `message`, `model`, `tool` and `source_ref` values are converted with
 * their own mappers only when truthy; `raw_evidence` entries are mapped one by
 * one when the array is present. Every key is always present on the result,
 * with `undefined` for absent wire fields.
 *
 * @param {object} event - Wire-format trace event.
 * @returns {object} The camelCase event record.
 */
function fromNormalizedTraceEventWire(event) {
  const { message, model, tool, source_ref: sourceRef, raw_evidence: rawEvidence } = event;
  return {
    eventId: event.event_id,
    parentEventId: event.parent_event_id,
    ordinal: event.ordinal,
    type: event.type,
    timestamp: event.timestamp,
    durationMs: event.duration_ms,
    durationInferred: event.duration_inferred,
    turnIndex: event.turn_index,
    message: message ? fromNormalizedTraceMessageWire(message) : undefined,
    model: model ? fromNormalizedTraceModelWire(model) : undefined,
    tool: tool ? fromNormalizedTraceToolWire(tool) : undefined,
    sourceRef: sourceRef ? fromNormalizedTraceSourceRefWire(sourceRef) : undefined,
    rawEvidence: rawEvidence?.map(fromNormalizedRawEvidenceWire),
    redaction: event.redaction,
    metadata: event.metadata
  };
}
|
|
2875
|
+
/**
 * Serialize a trace message into its wire form.
 *
 * Only `tokenUsage` is renamed (to `token_usage`); the other fields keep their
 * names. The mapped object is handed to `omitUndefinedProperties` (defined
 * elsewhere in this file; presumably drops keys whose value is `undefined`).
 *
 * @param {object} message - In-memory message record.
 * @returns {object} Whatever `omitUndefinedProperties` returns for the mapped record.
 */
function toNormalizedTraceMessageWire(message) {
  const { role, name, content, redaction, tokenUsage, metadata } = message;
  const wireMessage = { role, name, content, redaction, token_usage: tokenUsage, metadata };
  return omitUndefinedProperties(wireMessage);
}
|
|
2885
|
+
/**
 * Rebuild the in-memory message record from its wire form.
 *
 * Only `token_usage` is renamed (to `tokenUsage`). All six keys are always
 * present on the result, with `undefined` for absent wire fields.
 *
 * @param {object} message - Wire-format message record.
 * @returns {object} Record with `role`, `name`, `content`, `redaction`,
 *   `tokenUsage` and `metadata`.
 */
function fromNormalizedTraceMessageWire(message) {
  const { role, name, content, redaction, token_usage: tokenUsage, metadata } = message;
  return { role, name, content, redaction, tokenUsage, metadata };
}
|
|
2895
|
+
/**
 * Serialize model info into its snake_case wire form.
 *
 * The mapped object is handed to `omitUndefinedProperties` (defined elsewhere
 * in this file; presumably drops keys whose value is `undefined`).
 *
 * @param {object} model - In-memory (camelCase) model record.
 * @returns {object} Whatever `omitUndefinedProperties` returns for the mapped record.
 */
function toNormalizedTraceModelWire(model) {
  const { provider, name, invocationId, tokenUsage, metadata } = model;
  const wireModel = {
    provider,
    name,
    invocation_id: invocationId,
    token_usage: tokenUsage,
    metadata
  };
  return omitUndefinedProperties(wireModel);
}
|
|
2904
|
+
/**
 * Rebuild the in-memory (camelCase) model record from its snake_case wire form.
 *
 * All five keys are always present on the result, with `undefined` for absent
 * wire fields.
 *
 * @param {object} model - Wire-format model record.
 * @returns {object} Record with `provider`, `name`, `invocationId`,
 *   `tokenUsage` and `metadata`.
 */
function fromNormalizedTraceModelWire(model) {
  const { provider, name, invocation_id: invocationId, token_usage: tokenUsage, metadata } = model;
  return { provider, name, invocationId, tokenUsage, metadata };
}
|
|
2913
|
+
/**
 * Serialize tool-call info into its wire form.
 *
 * Only `callId` is renamed (to `call_id`). The mapped object is handed to
 * `omitUndefinedProperties` (defined elsewhere in this file; presumably drops
 * keys whose value is `undefined`).
 *
 * @param {object} tool - In-memory tool record.
 * @returns {object} Whatever `omitUndefinedProperties` returns for the mapped record.
 */
function toNormalizedTraceToolWire(tool) {
  const { name, callId, input, output, status, error, redaction, metadata } = tool;
  const wireTool = {
    name,
    call_id: callId,
    input,
    output,
    status,
    error,
    redaction,
    metadata
  };
  return omitUndefinedProperties(wireTool);
}
|
|
2925
|
+
/**
 * Rebuild the in-memory tool record from its wire form.
 *
 * Only `call_id` is renamed (to `callId`). All eight keys are always present
 * on the result, with `undefined` for absent wire fields.
 *
 * @param {object} tool - Wire-format tool record.
 * @returns {object} Record with `name`, `callId`, `input`, `output`, `status`,
 *   `error`, `redaction` and `metadata`.
 */
function fromNormalizedTraceToolWire(tool) {
  const { name, call_id: callId, input, output, status, error, redaction, metadata } = tool;
  return { name, callId, input, output, status, error, redaction, metadata };
}
|
|
2937
|
+
/**
 * Serialize a source reference (pointer back into the raw trace) into its
 * snake_case wire form.
 *
 * The mapped object is handed to `omitUndefinedProperties` (defined elsewhere
 * in this file; presumably drops keys whose value is `undefined`).
 *
 * @param {object} sourceRef - In-memory (camelCase) source reference.
 * @returns {object} Whatever `omitUndefinedProperties` returns for the mapped record.
 */
function toNormalizedTraceSourceRefWire(sourceRef) {
  const { eventId, messageId, spanId, traceId, rawKind, path, line, metadata } = sourceRef;
  const wireRef = {
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  };
  return omitUndefinedProperties(wireRef);
}
|
|
2949
|
+
/**
 * Rebuild the in-memory (camelCase) source reference from its snake_case wire form.
 *
 * All eight keys are always present on the result, with `undefined` for absent
 * wire fields.
 *
 * @param {object} sourceRef - Wire-format source reference.
 * @returns {object} Record with `eventId`, `messageId`, `spanId`, `traceId`,
 *   `rawKind`, `path`, `line` and `metadata`.
 */
function fromNormalizedTraceSourceRefWire(sourceRef) {
  const {
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  } = sourceRef;
  return { eventId, messageId, spanId, traceId, rawKind, path, line, metadata };
}
|
|
2961
|
+
/**
 * Serialize one raw-evidence entry into its wire form.
 *
 * Only `mediaType` is renamed (to `media_type`). The mapped object is handed
 * to `omitUndefinedProperties` (defined elsewhere in this file; presumably
 * drops keys whose value is `undefined`).
 *
 * @param {object} evidence - In-memory raw-evidence entry.
 * @returns {object} Whatever `omitUndefinedProperties` returns for the mapped record.
 */
function toNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, mediaType, content, redacted, metadata } = evidence;
  const wireEvidence = { kind, ref, media_type: mediaType, content, redacted, metadata };
  return omitUndefinedProperties(wireEvidence);
}
|
|
2971
|
+
/**
 * Rebuild the in-memory raw-evidence entry from its wire form.
 *
 * Only `media_type` is renamed (to `mediaType`). All six keys are always
 * present on the result, with `undefined` for absent wire fields.
 *
 * @param {object} evidence - Wire-format raw-evidence entry.
 * @returns {object} Record with `kind`, `ref`, `mediaType`, `content`,
 *   `redacted` and `metadata`.
 */
function fromNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, media_type: mediaType, content, redacted, metadata } = evidence;
  return { kind, ref, mediaType, content, redacted, metadata };
}
|
|
2541
2981
|
function computeTraceSummary(messages) {
|
|
2542
2982
|
const toolCallCounts = {};
|
|
2543
2983
|
const toolDurations = {};
|
|
@@ -2605,6 +3045,82 @@ function computeTraceSummary(messages) {
|
|
|
2605
3045
|
endTime: latestEnd?.toISOString()
|
|
2606
3046
|
};
|
|
2607
3047
|
}
|
|
3048
|
+
/**
 * Return the events that belong to the trajectory's selected branch.
 *
 * With no `branch.includedEventIds` (missing or empty) the original
 * `trajectory.events` array is returned as-is. Otherwise the result is a new
 * array, in `trajectory.events` order, holding only events whose `eventId` is
 * listed.
 *
 * @param {object} trajectory - Trajectory with `events` and an optional `branch`.
 * @returns {Array<object>} The selected events.
 */
function getSelectedTrajectoryEvents(trajectory) {
  const includedEventIds = trajectory.branch?.includedEventIds;
  const hasSelection = Boolean(includedEventIds) && includedEventIds.length !== 0;
  if (!hasSelection) {
    return trajectory.events;
  }
  const wanted = new Set(includedEventIds);
  return trajectory.events.filter(({ eventId }) => wanted.has(eventId));
}
|
|
3055
|
+
/**
 * Build a trace summary from a normalized trajectory.
 *
 * Only the events returned by `getSelectedTrajectoryEvents` are considered.
 *
 * - LLM calls: every `model_turn` event counts. If the selection contains no
 *   `model_turn` events at all, assistant `message` events are counted instead.
 * - Tool calls: every `tool_call` event that carries a `tool` is tallied per
 *   tool name; `eventCount` is the total number of such tool calls.
 * - Time window: `startTime` / `endTime` prefer the trajectory's own
 *   `startedAt` / `endedAt`, falling back to the earliest event start and the
 *   latest derived event end.
 *
 * @param {object} trajectory - Normalized trajectory (`events`, optional
 *   `branch`, `tokenUsage`, `costUsd`, `durationMs`, `startedAt`, `endedAt`).
 * @returns {object} `{ trace, tokenUsage, costUsd, durationMs, startTime, endTime }`
 *   where `trace.toolDurations` is present only when at least one tool call
 *   reported a duration.
 */
function computeTraceSummaryFromTrajectory(trajectory) {
  const selectedEvents = getSelectedTrajectoryEvents(trajectory);
  const hasModelTurnEvents = selectedEvents.some((event) => event.type === "model_turn");
  // Tally in Maps rather than plain objects: indexing `{}` by tool name would
  // pick up inherited Object.prototype members for names such as
  // "constructor" or "toString", corrupting the count and crashing on `.push`.
  const toolCallCounts = new Map();
  const toolDurations = new Map();
  let totalToolCalls = 0;
  let errorCount = 0;
  let llmCallCount = 0;
  let earliestStart;
  let latestEnd;
  for (const event of selectedEvents) {
    if (event.type === "model_turn" || !hasModelTurnEvents && event.type === "message" && event.message?.role === "assistant") {
      llmCallCount++;
    }
    const eventStart = parseTimestamp(event.timestamp);
    if (eventStart && (!earliestStart || eventStart < earliestStart)) {
      earliestStart = eventStart;
    }
    const eventEnd = deriveEventEnd(eventStart, event.durationMs);
    if (eventEnd && (!latestEnd || eventEnd > latestEnd)) {
      latestEnd = eventEnd;
    }
    if (event.type !== "tool_call" || !event.tool) {
      continue;
    }
    const toolName = event.tool.name;
    toolCallCounts.set(toolName, (toolCallCounts.get(toolName) ?? 0) + 1);
    totalToolCalls++;
    if (isErrorToolEvent(event)) {
      errorCount++;
    }
    if (event.durationMs !== void 0) {
      const durations = toolDurations.get(toolName);
      if (durations) {
        durations.push(event.durationMs);
      } else {
        toolDurations.set(toolName, [event.durationMs]);
      }
    }
  }
  return {
    trace: {
      eventCount: totalToolCalls,
      // Object.fromEntries defines own data properties, so every tool name
      // (even "__proto__") ends up as a regular key on a plain object.
      toolCalls: Object.fromEntries(toolCallCounts),
      errorCount,
      llmCallCount,
      ...toolDurations.size > 0 ? { toolDurations: Object.fromEntries(toolDurations) } : {}
    },
    tokenUsage: trajectory.tokenUsage,
    costUsd: trajectory.costUsd,
    durationMs: trajectory.durationMs,
    startTime: trajectory.startedAt ?? earliestStart?.toISOString(),
    endTime: trajectory.endedAt ?? latestEnd?.toISOString()
  };
}
|
|
3109
|
+
/**
 * Parse a timestamp into a `Date`.
 *
 * @param {*} timestamp - Value accepted by the `Date` constructor.
 * @returns {Date|undefined} The parsed date, or `undefined` when the input is
 *   falsy or does not produce a valid date.
 */
function parseTimestamp(timestamp) {
  if (timestamp) {
    const parsed = new Date(timestamp);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed;
    }
  }
  return undefined;
}
|
|
3114
|
+
/**
 * Work out when an event ended from its start time and duration.
 *
 * @param {Date|undefined} start - Event start; without it no end can be derived.
 * @param {number|undefined} durationMs - Duration in milliseconds.
 * @returns {Date|undefined} `undefined` when `start` is missing, the same
 *   `start` object when no duration is given, otherwise a new `Date` offset
 *   from `start` by `durationMs`.
 */
function deriveEventEnd(start, durationMs) {
  if (!start) {
    return undefined;
  }
  return durationMs === undefined ? start : new Date(start.getTime() + durationMs);
}
|
|
3119
|
+
/**
 * Decide whether a tool event counts as a failure.
 *
 * An event is a failure when its tool has a truthy `error`, or its `status`
 * is `"error"`, `"timeout"` or `"cancelled"`.
 *
 * @param {object} event - Trace event, possibly without a `tool`.
 * @returns {boolean} `true` for failed tool events, `false` otherwise.
 */
function isErrorToolEvent(event) {
  const tool = event.tool;
  if (tool === undefined || tool === null) {
    return false;
  }
  if (tool.error) {
    return true;
  }
  return tool.status === "error" || tool.status === "timeout" || tool.status === "cancelled";
}
|
|
2608
3124
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
2609
3125
|
"read",
|
|
2610
3126
|
"grep",
|
|
@@ -3401,6 +3917,30 @@ var SkillTriggerGrader = class {
|
|
|
3401
3917
|
};
|
|
3402
3918
|
|
|
3403
3919
|
// src/evaluation/graders/llm-grader-prompt.ts
|
|
3920
|
+
/**
 * Render a value as 2-space-indented JSON for prompt templates.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Empty string for `undefined`, otherwise the result of
 *   `JSON.stringify(value, null, 2)`.
 */
function stringifyPretty2(value) {
  if (value === undefined) {
    return "";
  }
  return JSON.stringify(value, null, 2);
}
|
|
3923
|
+
/**
 * Render a value as single-line JSON for prompt templates.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Empty string for `undefined`, otherwise the result of
 *   `JSON.stringify(value)`.
 */
function stringifyCompact2(value) {
  if (value === undefined) {
    return "";
  }
  return JSON.stringify(value);
}
|
|
3926
|
+
/**
 * Build the variable map used to fill an LLM-grader prompt template.
 *
 * The question comes from `promptInputs.question` when that is non-blank,
 * otherwise from `evalCase.question`. Question, candidate, reference answer
 * and criteria are trimmed; metadata and rubrics are exposed both
 * pretty-printed and as compact JSON.
 *
 * @param {object} input
 * @param {object} input.evalCase - Supplies `question`, `reference_answer`,
 *   `criteria` and `metadata`.
 * @param {string} input.candidate - Candidate output being graded.
 * @param {object} input.promptInputs - May override the question.
 * @param {*} [input.rubrics] - Rubrics to expose to the template.
 * @param {string} [input.fileChanges] - Defaults to `""` when nullish.
 * @param {string} [input.toolCalls] - Defaults to `""` when nullish.
 * @returns {object} Map keyed by the `TEMPLATE_VARIABLES` names.
 */
function buildTemplateVariables2(input) {
  const { promptInputs, evalCase, candidate, rubrics, fileChanges, toolCalls } = input;
  const hasPromptQuestion = promptInputs.question && promptInputs.question.trim().length > 0;
  const question = (hasPromptQuestion ? promptInputs.question : evalCase.question).trim();
  const output = candidate.trim();
  const expectedOutput = (evalCase.reference_answer ?? "").trim();
  return {
    [TEMPLATE_VARIABLES.INPUT]: question,
    [TEMPLATE_VARIABLES.OUTPUT]: output,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedOutput,
    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty2(evalCase.metadata),
    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact2(evalCase.metadata),
    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(rubrics),
    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(rubrics),
    [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
    [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
    // The *_TEXT entries repeat INPUT / OUTPUT / EXPECTED_OUTPUT under alias names.
    [TEMPLATE_VARIABLES.INPUT_TEXT]: question,
    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: output,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedOutput
  };
}
|
|
3404
3944
|
function assembleLlmGraderPrompt(input) {
|
|
3405
3945
|
const {
|
|
3406
3946
|
evalCase,
|
|
@@ -3413,6 +3953,17 @@ function assembleLlmGraderPrompt(input) {
|
|
|
3413
3953
|
} = input;
|
|
3414
3954
|
const rubrics = evaluatorConfig?.rubrics;
|
|
3415
3955
|
if (rubrics && rubrics.length > 0) {
|
|
3956
|
+
if (graderTemplateOverride) {
|
|
3957
|
+
return assembleCustom(
|
|
3958
|
+
evalCase,
|
|
3959
|
+
candidate,
|
|
3960
|
+
promptInputs,
|
|
3961
|
+
rubrics,
|
|
3962
|
+
fileChanges,
|
|
3963
|
+
toolCalls,
|
|
3964
|
+
graderTemplateOverride
|
|
3965
|
+
);
|
|
3966
|
+
}
|
|
3416
3967
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
3417
3968
|
if (hasScoreRanges) {
|
|
3418
3969
|
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
@@ -3429,19 +3980,13 @@ function assembleLlmGraderPrompt(input) {
|
|
|
3429
3980
|
);
|
|
3430
3981
|
}
|
|
3431
3982
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
|
|
3432
|
-
const
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3436
|
-
|
|
3437
|
-
|
|
3438
|
-
|
|
3439
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
|
|
3440
|
-
// Deprecated aliases
|
|
3441
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
3442
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
3443
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
3444
|
-
};
|
|
3983
|
+
const variables = buildTemplateVariables2({
|
|
3984
|
+
evalCase,
|
|
3985
|
+
candidate,
|
|
3986
|
+
promptInputs,
|
|
3987
|
+
fileChanges,
|
|
3988
|
+
toolCalls
|
|
3989
|
+
});
|
|
3445
3990
|
const systemPrompt = buildOutputSchema();
|
|
3446
3991
|
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
|
|
3447
3992
|
let userPrompt = substituteVariables(template, variables);
|
|
@@ -3464,6 +4009,27 @@ ${toolCalls}`;
|
|
|
3464
4009
|
mode: "freeform"
|
|
3465
4010
|
};
|
|
3466
4011
|
}
|
|
4012
|
+
/**
 * Assemble a grader prompt from a user-supplied template when rubrics are
 * configured.
 *
 * The output schema (used as both system prompt and `responseSchema`) is the
 * score-range schema if any rubric defines non-empty `score_ranges`, otherwise
 * the checklist rubric schema. The user prompt is the override template with
 * the standard template variables substituted in.
 *
 * @param {object} evalCase - Eval case being graded.
 * @param {string} candidate - Candidate output.
 * @param {object} promptInputs - Prompt inputs (may override the question).
 * @param {Array<object>} rubrics - Rubrics configured on the evaluator.
 * @param {string} [fileChanges] - File-change summary, if any.
 * @param {string} [toolCalls] - Tool-call summary, if any.
 * @param {string} graderTemplateOverride - Custom prompt template.
 * @returns {{systemPrompt: *, userPrompt: *, responseSchema: *, mode: string}}
 *   Prompt parts with `mode` set to `"score_range"` or `"checklist"`.
 */
function assembleCustom(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls, graderTemplateOverride) {
  const usesScoreRanges = rubrics.some((rubric) => rubric.score_ranges && rubric.score_ranges.length > 0);
  let systemPrompt;
  let mode;
  if (usesScoreRanges) {
    systemPrompt = buildScoreRangeOutputSchema();
    mode = "score_range";
  } else {
    systemPrompt = buildRubricOutputSchema();
    mode = "checklist";
  }
  const variables = buildTemplateVariables2({
    evalCase,
    candidate,
    promptInputs,
    rubrics,
    fileChanges,
    toolCalls
  });
  const userPrompt = substituteVariables(graderTemplateOverride, variables);
  return { systemPrompt, userPrompt, responseSchema: systemPrompt, mode };
}
|
|
3467
4033
|
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
3468
4034
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
3469
4035
|
const parts = [
|
|
@@ -3487,10 +4053,19 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
3487
4053
|
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
3488
4054
|
}
|
|
3489
4055
|
parts.push("[[ ## rubrics ## ]]");
|
|
4056
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
4057
|
+
if (operatorGuidance.length > 0) {
|
|
4058
|
+
parts.push("", "Operator guidance:");
|
|
4059
|
+
for (const guidance of operatorGuidance) {
|
|
4060
|
+
parts.push(`- ${guidance}`);
|
|
4061
|
+
}
|
|
4062
|
+
parts.push("");
|
|
4063
|
+
}
|
|
3490
4064
|
for (const rubric of rubrics) {
|
|
3491
4065
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
3492
4066
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3493
|
-
|
|
4067
|
+
const operatorLabel = formatRubricOperatorLabel(rubric.operator);
|
|
4068
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`);
|
|
3494
4069
|
}
|
|
3495
4070
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
3496
4071
|
const systemPrompt = buildRubricOutputSchema();
|
|
@@ -3530,6 +4105,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
3530
4105
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3531
4106
|
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
3532
4107
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
4108
|
+
if (rubric.operator) {
|
|
4109
|
+
parts.push(`Operator: ${rubric.operator}`);
|
|
4110
|
+
}
|
|
3533
4111
|
if (rubric.outcome) {
|
|
3534
4112
|
parts.push(`Description: ${rubric.outcome}`);
|
|
3535
4113
|
}
|
|
@@ -3542,6 +4120,10 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
3542
4120
|
}
|
|
3543
4121
|
}
|
|
3544
4122
|
}
|
|
4123
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
4124
|
+
if (operatorGuidance.length > 0) {
|
|
4125
|
+
parts.push("", ...operatorGuidance);
|
|
4126
|
+
}
|
|
3545
4127
|
parts.push(
|
|
3546
4128
|
"",
|
|
3547
4129
|
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
@@ -4260,7 +4842,7 @@ function runEqualsAssertion(output, value) {
|
|
|
4260
4842
|
import { spawn } from "node:child_process";
|
|
4261
4843
|
import { randomUUID } from "node:crypto";
|
|
4262
4844
|
import { createWriteStream } from "node:fs";
|
|
4263
|
-
import { mkdir } from "node:fs/promises";
|
|
4845
|
+
import { mkdir as mkdir2 } from "node:fs/promises";
|
|
4264
4846
|
import path5 from "node:path";
|
|
4265
4847
|
|
|
4266
4848
|
// src/runtime/child-tracker.ts
|
|
@@ -4760,7 +5342,7 @@ var ClaudeCliProvider = class {
|
|
|
4760
5342
|
return void 0;
|
|
4761
5343
|
}
|
|
4762
5344
|
try {
|
|
4763
|
-
await
|
|
5345
|
+
await mkdir2(logDir, { recursive: true });
|
|
4764
5346
|
} catch (error) {
|
|
4765
5347
|
const message = error instanceof Error ? error.message : String(error);
|
|
4766
5348
|
console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -5070,7 +5652,7 @@ function tryParseJson(line) {
|
|
|
5070
5652
|
// src/evaluation/providers/claude-sdk.ts
|
|
5071
5653
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
5072
5654
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
5073
|
-
import { mkdir as
|
|
5655
|
+
import { mkdir as mkdir3 } from "node:fs/promises";
|
|
5074
5656
|
import path6 from "node:path";
|
|
5075
5657
|
var claudeSdkModule = null;
|
|
5076
5658
|
async function loadClaudeSdk() {
|
|
@@ -5255,7 +5837,7 @@ var ClaudeSdkProvider = class {
|
|
|
5255
5837
|
return void 0;
|
|
5256
5838
|
}
|
|
5257
5839
|
try {
|
|
5258
|
-
await
|
|
5840
|
+
await mkdir3(logDir, { recursive: true });
|
|
5259
5841
|
} catch (error) {
|
|
5260
5842
|
const message = error instanceof Error ? error.message : String(error);
|
|
5261
5843
|
console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -5450,44 +6032,44 @@ function formatElapsed2(startedAt) {
|
|
|
5450
6032
|
// src/evaluation/providers/cli.ts
|
|
5451
6033
|
import { exec as execWithCallback } from "node:child_process";
|
|
5452
6034
|
import fs2 from "node:fs/promises";
|
|
5453
|
-
import
|
|
6035
|
+
import os from "node:os";
|
|
5454
6036
|
import path7 from "node:path";
|
|
5455
6037
|
import { promisify } from "node:util";
|
|
5456
|
-
import { z as
|
|
5457
|
-
var ToolCallSchema =
|
|
5458
|
-
tool:
|
|
5459
|
-
input:
|
|
5460
|
-
output:
|
|
5461
|
-
id:
|
|
5462
|
-
start_time:
|
|
5463
|
-
end_time:
|
|
5464
|
-
duration_ms:
|
|
6038
|
+
import { z as z3 } from "zod";
|
|
6039
|
+
var ToolCallSchema = z3.object({
|
|
6040
|
+
tool: z3.string(),
|
|
6041
|
+
input: z3.unknown().optional(),
|
|
6042
|
+
output: z3.unknown().optional(),
|
|
6043
|
+
id: z3.string().optional(),
|
|
6044
|
+
start_time: z3.string().optional(),
|
|
6045
|
+
end_time: z3.string().optional(),
|
|
6046
|
+
duration_ms: z3.number().optional()
|
|
5465
6047
|
});
|
|
5466
|
-
var MessageInputSchema =
|
|
5467
|
-
role:
|
|
5468
|
-
name:
|
|
5469
|
-
content:
|
|
5470
|
-
tool_calls:
|
|
5471
|
-
start_time:
|
|
5472
|
-
end_time:
|
|
5473
|
-
duration_ms:
|
|
5474
|
-
metadata:
|
|
6048
|
+
var MessageInputSchema = z3.object({
|
|
6049
|
+
role: z3.string(),
|
|
6050
|
+
name: z3.string().optional(),
|
|
6051
|
+
content: z3.unknown().optional(),
|
|
6052
|
+
tool_calls: z3.array(ToolCallSchema).optional(),
|
|
6053
|
+
start_time: z3.string().optional(),
|
|
6054
|
+
end_time: z3.string().optional(),
|
|
6055
|
+
duration_ms: z3.number().optional(),
|
|
6056
|
+
metadata: z3.record(z3.unknown()).optional()
|
|
5475
6057
|
});
|
|
5476
|
-
var TokenUsageSchema =
|
|
5477
|
-
input:
|
|
5478
|
-
output:
|
|
5479
|
-
cached:
|
|
6058
|
+
var TokenUsageSchema = z3.object({
|
|
6059
|
+
input: z3.number(),
|
|
6060
|
+
output: z3.number(),
|
|
6061
|
+
cached: z3.number().optional()
|
|
5480
6062
|
});
|
|
5481
|
-
var CliOutputSchema =
|
|
5482
|
-
text:
|
|
5483
|
-
output:
|
|
5484
|
-
output_messages:
|
|
6063
|
+
var CliOutputSchema = z3.object({
|
|
6064
|
+
text: z3.unknown().optional(),
|
|
6065
|
+
output: z3.array(MessageInputSchema).optional(),
|
|
6066
|
+
output_messages: z3.array(MessageInputSchema).optional(),
|
|
5485
6067
|
token_usage: TokenUsageSchema.optional(),
|
|
5486
|
-
cost_usd:
|
|
5487
|
-
duration_ms:
|
|
6068
|
+
cost_usd: z3.number().optional(),
|
|
6069
|
+
duration_ms: z3.number().optional()
|
|
5488
6070
|
});
|
|
5489
6071
|
var CliJsonlRecordSchema = CliOutputSchema.extend({
|
|
5490
|
-
id:
|
|
6072
|
+
id: z3.string().min(1)
|
|
5491
6073
|
});
|
|
5492
6074
|
function validateMetrics(costUsd, durationMs, context) {
|
|
5493
6075
|
let validCostUsd = costUsd;
|
|
@@ -5992,7 +6574,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
5992
6574
|
const safeEvalId = evalCaseId || "unknown";
|
|
5993
6575
|
const timestamp = Date.now();
|
|
5994
6576
|
const random = Math.random().toString(36).substring(2, 9);
|
|
5995
|
-
return path7.join(
|
|
6577
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
5996
6578
|
}
|
|
5997
6579
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
5998
6580
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -6005,7 +6587,7 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
6005
6587
|
// src/evaluation/providers/codex.ts
|
|
6006
6588
|
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
6007
6589
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
6008
|
-
import { mkdir as
|
|
6590
|
+
import { mkdir as mkdir4 } from "node:fs/promises";
|
|
6009
6591
|
import path8 from "node:path";
|
|
6010
6592
|
|
|
6011
6593
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -6098,6 +6680,9 @@ var CodexProvider = class {
|
|
|
6098
6680
|
const startMs = Date.now();
|
|
6099
6681
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
6100
6682
|
const codexOptions = {};
|
|
6683
|
+
if (this.config.executable) {
|
|
6684
|
+
codexOptions.codexPathOverride = this.config.executable;
|
|
6685
|
+
}
|
|
6101
6686
|
if (this.config.model) {
|
|
6102
6687
|
codexOptions.config = { model: this.config.model };
|
|
6103
6688
|
}
|
|
@@ -6109,6 +6694,9 @@ var CodexProvider = class {
|
|
|
6109
6694
|
if (cwd) {
|
|
6110
6695
|
threadOptions.workingDirectory = cwd;
|
|
6111
6696
|
}
|
|
6697
|
+
if (this.config.modelReasoningEffort) {
|
|
6698
|
+
threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
|
|
6699
|
+
}
|
|
6112
6700
|
const thread = codex.startThread(threadOptions);
|
|
6113
6701
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
6114
6702
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
@@ -6256,7 +6844,7 @@ ${basePrompt}` : basePrompt;
|
|
|
6256
6844
|
}
|
|
6257
6845
|
resolveLogDirectory() {
|
|
6258
6846
|
const disabled = isCodexLogStreamingDisabled();
|
|
6259
|
-
if (disabled) {
|
|
6847
|
+
if (disabled || this.config.streamLog === false) {
|
|
6260
6848
|
return void 0;
|
|
6261
6849
|
}
|
|
6262
6850
|
if (this.config.logDir) {
|
|
@@ -6270,7 +6858,7 @@ ${basePrompt}` : basePrompt;
|
|
|
6270
6858
|
return void 0;
|
|
6271
6859
|
}
|
|
6272
6860
|
try {
|
|
6273
|
-
await
|
|
6861
|
+
await mkdir4(logDir, { recursive: true });
|
|
6274
6862
|
} catch (error) {
|
|
6275
6863
|
const message = error instanceof Error ? error.message : String(error);
|
|
6276
6864
|
console.warn(`Skipping Codex SDK stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -6283,7 +6871,7 @@ ${basePrompt}` : basePrompt;
|
|
|
6283
6871
|
targetName: this.targetName,
|
|
6284
6872
|
evalCaseId: request.evalCaseId,
|
|
6285
6873
|
attempt: request.attempt,
|
|
6286
|
-
format: this.config.
|
|
6874
|
+
format: this.config.streamLog === "raw" ? "json" : "summary"
|
|
6287
6875
|
});
|
|
6288
6876
|
recordCodexLogEntry({
|
|
6289
6877
|
filePath,
|
|
@@ -6419,7 +7007,7 @@ function formatElapsed3(startedAt) {
|
|
|
6419
7007
|
|
|
6420
7008
|
// src/evaluation/providers/copilot-cli.ts
|
|
6421
7009
|
import { randomUUID as randomUUID5 } from "node:crypto";
|
|
6422
|
-
import { mkdir as
|
|
7010
|
+
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
6423
7011
|
import { homedir as homedir2 } from "node:os";
|
|
6424
7012
|
import path11 from "node:path";
|
|
6425
7013
|
import { Readable, Writable } from "node:stream";
|
|
@@ -6429,7 +7017,7 @@ import * as acp from "@agentclientprotocol/sdk";
|
|
|
6429
7017
|
// src/evaluation/workspace/file-changes.ts
|
|
6430
7018
|
import { exec as execCallback } from "node:child_process";
|
|
6431
7019
|
import { readdirSync, statSync } from "node:fs";
|
|
6432
|
-
import { readFile as
|
|
7020
|
+
import { readFile as readFile3, readdir, stat } from "node:fs/promises";
|
|
6433
7021
|
import path9 from "node:path";
|
|
6434
7022
|
import { promisify as promisify2 } from "node:util";
|
|
6435
7023
|
var execAsync2 = promisify2(execCallback);
|
|
@@ -6504,7 +7092,7 @@ async function walkDir(rootDir, currentDir, snapshot) {
|
|
|
6504
7092
|
if (fileStat.size > SNAPSHOT_MAX_FILE_BYTES) continue;
|
|
6505
7093
|
let content;
|
|
6506
7094
|
try {
|
|
6507
|
-
content = await
|
|
7095
|
+
content = await readFile3(fullPath, "utf8");
|
|
6508
7096
|
if (content.includes("\0")) continue;
|
|
6509
7097
|
} catch {
|
|
6510
7098
|
continue;
|
|
@@ -6597,7 +7185,7 @@ import { arch, homedir, platform } from "node:os";
|
|
|
6597
7185
|
import path10 from "node:path";
|
|
6598
7186
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
6599
7187
|
function resolvePlatformCliPath() {
|
|
6600
|
-
const
|
|
7188
|
+
const os2 = platform();
|
|
6601
7189
|
const cpu = arch();
|
|
6602
7190
|
const platformMap = {
|
|
6603
7191
|
linux: "linux",
|
|
@@ -6608,13 +7196,13 @@ function resolvePlatformCliPath() {
|
|
|
6608
7196
|
x64: "x64",
|
|
6609
7197
|
arm64: "arm64"
|
|
6610
7198
|
};
|
|
6611
|
-
const osPart = platformMap[
|
|
7199
|
+
const osPart = platformMap[os2];
|
|
6612
7200
|
const archPart = archMap[cpu];
|
|
6613
7201
|
if (!osPart || !archPart) {
|
|
6614
7202
|
return void 0;
|
|
6615
7203
|
}
|
|
6616
7204
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
6617
|
-
const binaryName =
|
|
7205
|
+
const binaryName = os2 === "win32" ? "copilot.exe" : "copilot";
|
|
6618
7206
|
try {
|
|
6619
7207
|
const resolved = import.meta.resolve(`${packageName}/package.json`);
|
|
6620
7208
|
const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
|
|
@@ -6682,9 +7270,9 @@ function resolvePlatformCliPath() {
|
|
|
6682
7270
|
}
|
|
6683
7271
|
function globalNpmRoots() {
|
|
6684
7272
|
const roots = [];
|
|
6685
|
-
const
|
|
7273
|
+
const os2 = platform();
|
|
6686
7274
|
const home = homedir();
|
|
6687
|
-
if (
|
|
7275
|
+
if (os2 === "win32") {
|
|
6688
7276
|
if (process.env.APPDATA) {
|
|
6689
7277
|
roots.push(path10.join(process.env.APPDATA, "npm", "node_modules"));
|
|
6690
7278
|
}
|
|
@@ -6699,7 +7287,7 @@ function globalNpmRoots() {
|
|
|
6699
7287
|
if (process.env.npm_config_prefix) {
|
|
6700
7288
|
const prefix = process.env.npm_config_prefix;
|
|
6701
7289
|
roots.push(
|
|
6702
|
-
|
|
7290
|
+
os2 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
|
|
6703
7291
|
);
|
|
6704
7292
|
}
|
|
6705
7293
|
return Array.from(new Set(roots));
|
|
@@ -7120,7 +7708,7 @@ var CopilotCliProvider = class {
|
|
|
7120
7708
|
return void 0;
|
|
7121
7709
|
}
|
|
7122
7710
|
try {
|
|
7123
|
-
await
|
|
7711
|
+
await mkdir5(logDir, { recursive: true });
|
|
7124
7712
|
} catch (error) {
|
|
7125
7713
|
const message = error instanceof Error ? error.message : String(error);
|
|
7126
7714
|
console.warn(`Skipping Copilot CLI stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -7228,7 +7816,7 @@ function summarizeAcpEvent(eventType, data) {
|
|
|
7228
7816
|
}
|
|
7229
7817
|
|
|
7230
7818
|
// src/evaluation/providers/copilot-log.ts
|
|
7231
|
-
import { readFile as
|
|
7819
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
7232
7820
|
import { homedir as homedir4 } from "node:os";
|
|
7233
7821
|
import path13 from "node:path";
|
|
7234
7822
|
|
|
@@ -7364,7 +7952,7 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
7364
7952
|
}
|
|
7365
7953
|
|
|
7366
7954
|
// src/evaluation/providers/copilot-session-discovery.ts
|
|
7367
|
-
import { readFile as
|
|
7955
|
+
import { readFile as readFile4, readdir as readdir2, stat as stat2 } from "node:fs/promises";
|
|
7368
7956
|
import { homedir as homedir3 } from "node:os";
|
|
7369
7957
|
import path12 from "node:path";
|
|
7370
7958
|
var DEFAULT_SESSION_STATE_DIR = () => path12.join(homedir3(), ".copilot", "session-state");
|
|
@@ -7383,7 +7971,7 @@ async function discoverCopilotSessions(opts) {
|
|
|
7383
7971
|
const workspacePath = path12.join(sessionDir, "workspace.yaml");
|
|
7384
7972
|
const eventsPath = path12.join(sessionDir, "events.jsonl");
|
|
7385
7973
|
try {
|
|
7386
|
-
const workspaceContent = await
|
|
7974
|
+
const workspaceContent = await readFile4(workspacePath, "utf8");
|
|
7387
7975
|
const workspace = parseYamlValue(workspaceContent) ?? {};
|
|
7388
7976
|
const cwd = String(workspace.cwd ?? "");
|
|
7389
7977
|
let updatedAt;
|
|
@@ -7445,7 +8033,7 @@ var CopilotLogProvider = class {
|
|
|
7445
8033
|
const eventsPath = path13.join(sessionDir, "events.jsonl");
|
|
7446
8034
|
let eventsContent;
|
|
7447
8035
|
try {
|
|
7448
|
-
eventsContent = await
|
|
8036
|
+
eventsContent = await readFile5(eventsPath, "utf8");
|
|
7449
8037
|
} catch (err) {
|
|
7450
8038
|
throw new Error(
|
|
7451
8039
|
`Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
|
|
@@ -7492,7 +8080,7 @@ var CopilotLogProvider = class {
|
|
|
7492
8080
|
// src/evaluation/providers/copilot-sdk.ts
|
|
7493
8081
|
import { randomUUID as randomUUID6 } from "node:crypto";
|
|
7494
8082
|
import { existsSync as existsSync2 } from "node:fs";
|
|
7495
|
-
import { mkdir as
|
|
8083
|
+
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
7496
8084
|
import path14 from "node:path";
|
|
7497
8085
|
|
|
7498
8086
|
// src/evaluation/providers/copilot-sdk-log-tracker.ts
|
|
@@ -7832,7 +8420,7 @@ var CopilotSdkProvider = class {
|
|
|
7832
8420
|
return void 0;
|
|
7833
8421
|
}
|
|
7834
8422
|
try {
|
|
7835
|
-
await
|
|
8423
|
+
await mkdir6(logDir, { recursive: true });
|
|
7836
8424
|
} catch (error) {
|
|
7837
8425
|
const message = error instanceof Error ? error.message : String(error);
|
|
7838
8426
|
console.warn(`Skipping Copilot SDK stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -7958,7 +8546,7 @@ var MockProvider = class {
|
|
|
7958
8546
|
import { execSync, spawn as spawn3 } from "node:child_process";
|
|
7959
8547
|
import { randomUUID as randomUUID7 } from "node:crypto";
|
|
7960
8548
|
import { accessSync, createWriteStream as createWriteStream5, readFileSync } from "node:fs";
|
|
7961
|
-
import { mkdir as
|
|
8549
|
+
import { mkdir as mkdir7, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile3 } from "node:fs/promises";
|
|
7962
8550
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
7963
8551
|
import path15 from "node:path";
|
|
7964
8552
|
|
|
@@ -8167,7 +8755,7 @@ var PiCliProvider = class {
|
|
|
8167
8755
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
8168
8756
|
try {
|
|
8169
8757
|
const promptFile = path15.join(cwd, PROMPT_FILENAME);
|
|
8170
|
-
await
|
|
8758
|
+
await writeFile3(promptFile, request.question, "utf8");
|
|
8171
8759
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
8172
8760
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
8173
8761
|
if (result.timedOut) {
|
|
@@ -8358,7 +8946,7 @@ ${prompt}` : prompt;
|
|
|
8358
8946
|
return void 0;
|
|
8359
8947
|
}
|
|
8360
8948
|
try {
|
|
8361
|
-
await
|
|
8949
|
+
await mkdir7(logDir, { recursive: true });
|
|
8362
8950
|
} catch (error) {
|
|
8363
8951
|
const message = error instanceof Error ? error.message : String(error);
|
|
8364
8952
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -8921,7 +9509,7 @@ async function defaultPiRunner(options) {
|
|
|
8921
9509
|
import { execSync as execSync2 } from "node:child_process";
|
|
8922
9510
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
8923
9511
|
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
8924
|
-
import { mkdir as
|
|
9512
|
+
import { mkdir as mkdir8 } from "node:fs/promises";
|
|
8925
9513
|
import path16 from "node:path";
|
|
8926
9514
|
import { createInterface } from "node:readline";
|
|
8927
9515
|
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
@@ -8943,7 +9531,7 @@ async function promptInstall() {
|
|
|
8943
9531
|
}
|
|
8944
9532
|
}
|
|
8945
9533
|
function findManagedSdkInstallRoot() {
|
|
8946
|
-
return path16.join(
|
|
9534
|
+
return path16.join(getAgentvDataDir(), "deps", "pi-sdk");
|
|
8947
9535
|
}
|
|
8948
9536
|
function resolveGlobalNpmRoot() {
|
|
8949
9537
|
try {
|
|
@@ -9358,7 +9946,7 @@ ${fileList}`;
|
|
|
9358
9946
|
return void 0;
|
|
9359
9947
|
}
|
|
9360
9948
|
try {
|
|
9361
|
-
await
|
|
9949
|
+
await mkdir8(logDir, { recursive: true });
|
|
9362
9950
|
} catch (error) {
|
|
9363
9951
|
const message = error instanceof Error ? error.message : String(error);
|
|
9364
9952
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -9583,12 +10171,12 @@ import path27 from "node:path";
|
|
|
9583
10171
|
import { promisify as promisify4 } from "node:util";
|
|
9584
10172
|
|
|
9585
10173
|
// src/evaluation/providers/vscode/dispatch/agentDispatch.ts
|
|
9586
|
-
import { stat as stat5, writeFile as
|
|
10174
|
+
import { stat as stat5, writeFile as writeFile6 } from "node:fs/promises";
|
|
9587
10175
|
import path25 from "node:path";
|
|
9588
10176
|
|
|
9589
10177
|
// src/evaluation/providers/vscode/utils/fs.ts
|
|
9590
10178
|
import { constants } from "node:fs";
|
|
9591
|
-
import { access, mkdir as
|
|
10179
|
+
import { access, mkdir as mkdir9, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
|
|
9592
10180
|
import path17 from "node:path";
|
|
9593
10181
|
async function pathExists(target) {
|
|
9594
10182
|
try {
|
|
@@ -9599,7 +10187,7 @@ async function pathExists(target) {
|
|
|
9599
10187
|
}
|
|
9600
10188
|
}
|
|
9601
10189
|
async function ensureDir(target) {
|
|
9602
|
-
await
|
|
10190
|
+
await mkdir9(target, { recursive: true });
|
|
9603
10191
|
}
|
|
9604
10192
|
async function readDirEntries(target) {
|
|
9605
10193
|
const entries = await readdir3(target, { withFileTypes: true });
|
|
@@ -9732,7 +10320,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
9732
10320
|
}
|
|
9733
10321
|
|
|
9734
10322
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
9735
|
-
import { readFile as
|
|
10323
|
+
import { readFile as readFile6 } from "node:fs/promises";
|
|
9736
10324
|
import path20 from "node:path";
|
|
9737
10325
|
|
|
9738
10326
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
@@ -9771,7 +10359,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
9771
10359
|
const maxAttempts = 10;
|
|
9772
10360
|
while (attempts < maxAttempts) {
|
|
9773
10361
|
try {
|
|
9774
|
-
const content = await
|
|
10362
|
+
const content = await readFile6(responseFileFinal, { encoding: "utf8" });
|
|
9775
10363
|
if (!silent) {
|
|
9776
10364
|
process.stdout.write(`${content}
|
|
9777
10365
|
`);
|
|
@@ -9828,7 +10416,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
9828
10416
|
const maxAttempts = 10;
|
|
9829
10417
|
while (attempts < maxAttempts) {
|
|
9830
10418
|
try {
|
|
9831
|
-
const content = await
|
|
10419
|
+
const content = await readFile6(file, { encoding: "utf8" });
|
|
9832
10420
|
if (!silent) {
|
|
9833
10421
|
process.stdout.write(`${content}
|
|
9834
10422
|
`);
|
|
@@ -9851,7 +10439,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
9851
10439
|
|
|
9852
10440
|
// src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
|
|
9853
10441
|
import { exec, spawn as spawn4 } from "node:child_process";
|
|
9854
|
-
import { mkdir as
|
|
10442
|
+
import { mkdir as mkdir10, writeFile as writeFile4 } from "node:fs/promises";
|
|
9855
10443
|
import path22 from "node:path";
|
|
9856
10444
|
import { promisify as promisify3 } from "node:util";
|
|
9857
10445
|
|
|
@@ -9932,9 +10520,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
9932
10520
|
const aliveFile = path22.join(subagentDir, DEFAULT_ALIVE_FILENAME);
|
|
9933
10521
|
await removeIfExists(aliveFile);
|
|
9934
10522
|
const githubAgentsDir = path22.join(subagentDir, ".github", "agents");
|
|
9935
|
-
await
|
|
10523
|
+
await mkdir10(githubAgentsDir, { recursive: true });
|
|
9936
10524
|
const wakeupDst = path22.join(githubAgentsDir, "wakeup.md");
|
|
9937
|
-
await
|
|
10525
|
+
await writeFile4(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
|
|
9938
10526
|
const workspaceChild = spawnVsCode(vscodeCmd, [workspacePath], {
|
|
9939
10527
|
label: "open-workspace"
|
|
9940
10528
|
});
|
|
@@ -9963,9 +10551,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
9963
10551
|
async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
|
|
9964
10552
|
const workspacePath = path22.join(subagentDir, `${path22.basename(subagentDir)}.code-workspace`);
|
|
9965
10553
|
const messagesDir = path22.join(subagentDir, "messages");
|
|
9966
|
-
await
|
|
10554
|
+
await mkdir10(messagesDir, { recursive: true });
|
|
9967
10555
|
const reqFile = path22.join(messagesDir, `${timestamp}_req.md`);
|
|
9968
|
-
await
|
|
10556
|
+
await writeFile4(reqFile, requestInstructions, { encoding: "utf8" });
|
|
9969
10557
|
const reqUri = pathToFileUri2(reqFile);
|
|
9970
10558
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
9971
10559
|
for (const attachment of attachmentPaths) {
|
|
@@ -9991,7 +10579,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
9991
10579
|
async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
|
|
9992
10580
|
const workspacePath = path22.join(subagentDir, `${path22.basename(subagentDir)}.code-workspace`);
|
|
9993
10581
|
const messagesDir = path22.join(subagentDir, "messages");
|
|
9994
|
-
await
|
|
10582
|
+
await mkdir10(messagesDir, { recursive: true });
|
|
9995
10583
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
9996
10584
|
for (const attachment of attachmentPaths) {
|
|
9997
10585
|
chatArgs.push("-a", attachment);
|
|
@@ -10014,7 +10602,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
|
|
|
10014
10602
|
}
|
|
10015
10603
|
|
|
10016
10604
|
// src/evaluation/providers/vscode/dispatch/workspaceManager.ts
|
|
10017
|
-
import { copyFile, mkdir as
|
|
10605
|
+
import { copyFile, mkdir as mkdir11, readFile as readFile7, readdir as readdir4, stat as stat4, writeFile as writeFile5 } from "node:fs/promises";
|
|
10018
10606
|
import path24 from "node:path";
|
|
10019
10607
|
|
|
10020
10608
|
// src/evaluation/providers/vscode/utils/workspace.ts
|
|
@@ -10131,7 +10719,7 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
|
|
|
10131
10719
|
if (!stats.isFile()) {
|
|
10132
10720
|
throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
|
|
10133
10721
|
}
|
|
10134
|
-
const templateText = await
|
|
10722
|
+
const templateText = await readFile7(workspaceSrc, "utf8");
|
|
10135
10723
|
workspaceContent = JSON.parse(templateText);
|
|
10136
10724
|
} else {
|
|
10137
10725
|
workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
|
|
@@ -10150,9 +10738,9 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
|
|
|
10150
10738
|
transformedContent = JSON.stringify(parsed, null, 2);
|
|
10151
10739
|
}
|
|
10152
10740
|
}
|
|
10153
|
-
await
|
|
10741
|
+
await writeFile5(workspaceDst, transformedContent, "utf8");
|
|
10154
10742
|
const messagesDir = path24.join(subagentDir, "messages");
|
|
10155
|
-
await
|
|
10743
|
+
await mkdir11(messagesDir, { recursive: true });
|
|
10156
10744
|
return { workspace: workspaceDst, messagesDir };
|
|
10157
10745
|
}
|
|
10158
10746
|
async function createSubagentLock(subagentDir) {
|
|
@@ -10175,7 +10763,7 @@ async function createSubagentLock(subagentDir) {
|
|
|
10175
10763
|
);
|
|
10176
10764
|
}
|
|
10177
10765
|
const lockFile = path24.join(subagentDir, DEFAULT_LOCK_NAME);
|
|
10178
|
-
await
|
|
10766
|
+
await writeFile5(lockFile, "", { encoding: "utf8" });
|
|
10179
10767
|
return lockFile;
|
|
10180
10768
|
}
|
|
10181
10769
|
async function removeSubagentLock(subagentDir) {
|
|
@@ -10200,7 +10788,7 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
|
|
|
10200
10788
|
}
|
|
10201
10789
|
if (promptFile) {
|
|
10202
10790
|
const githubAgentsDir = path24.join(subagentDir, ".github", "agents");
|
|
10203
|
-
await
|
|
10791
|
+
await mkdir11(githubAgentsDir, { recursive: true });
|
|
10204
10792
|
const agentFile = path24.join(githubAgentsDir, `${chatId}.md`);
|
|
10205
10793
|
try {
|
|
10206
10794
|
await copyFile(promptFile, agentFile);
|
|
@@ -10461,7 +11049,7 @@ async function dispatchBatchAgent(options) {
|
|
|
10461
11049
|
const reqFile = requestFiles[index];
|
|
10462
11050
|
const tmpFile = responseTmpFiles[index];
|
|
10463
11051
|
const finalFile = responseFilesFinal[index];
|
|
10464
|
-
return
|
|
11052
|
+
return writeFile6(
|
|
10465
11053
|
reqFile,
|
|
10466
11054
|
createBatchRequestPrompt(query, tmpFile, finalFile, batchRequestTemplateContent),
|
|
10467
11055
|
{ encoding: "utf8" }
|
|
@@ -10473,7 +11061,7 @@ async function dispatchBatchAgent(options) {
|
|
|
10473
11061
|
responseFilesFinal,
|
|
10474
11062
|
orchestratorTemplateContent
|
|
10475
11063
|
);
|
|
10476
|
-
await
|
|
11064
|
+
await writeFile6(orchestratorFile, orchestratorContent, { encoding: "utf8" });
|
|
10477
11065
|
}
|
|
10478
11066
|
const chatAttachments = [orchestratorFile, ...attachments];
|
|
10479
11067
|
const orchestratorUri = pathToFileUri2(orchestratorFile);
|
|
@@ -10539,7 +11127,7 @@ async function dispatchBatchAgent(options) {
|
|
|
10539
11127
|
}
|
|
10540
11128
|
|
|
10541
11129
|
// src/evaluation/providers/vscode/dispatch/provision.ts
|
|
10542
|
-
import { writeFile as
|
|
11130
|
+
import { writeFile as writeFile7 } from "node:fs/promises";
|
|
10543
11131
|
import path26 from "node:path";
|
|
10544
11132
|
var DEFAULT_WORKSPACE_TEMPLATE2 = {
|
|
10545
11133
|
folders: [
|
|
@@ -10620,8 +11208,8 @@ async function provisionSubagents(options) {
|
|
|
10620
11208
|
if (!dryRun) {
|
|
10621
11209
|
await removeIfExists(lockFile);
|
|
10622
11210
|
await ensureDir(githubAgentsDir);
|
|
10623
|
-
await
|
|
10624
|
-
await
|
|
11211
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11212
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10625
11213
|
}
|
|
10626
11214
|
created.push(subagentDir);
|
|
10627
11215
|
lockedSubagents.delete(subagentDir);
|
|
@@ -10631,8 +11219,8 @@ async function provisionSubagents(options) {
|
|
|
10631
11219
|
if (!isLocked && force) {
|
|
10632
11220
|
if (!dryRun) {
|
|
10633
11221
|
await ensureDir(githubAgentsDir);
|
|
10634
|
-
await
|
|
10635
|
-
await
|
|
11222
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11223
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10636
11224
|
}
|
|
10637
11225
|
created.push(subagentDir);
|
|
10638
11226
|
subagentsProvisioned += 1;
|
|
@@ -10640,8 +11228,8 @@ async function provisionSubagents(options) {
|
|
|
10640
11228
|
}
|
|
10641
11229
|
if (!dryRun && !await pathExists(workspaceDst)) {
|
|
10642
11230
|
await ensureDir(githubAgentsDir);
|
|
10643
|
-
await
|
|
10644
|
-
await
|
|
11231
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11232
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10645
11233
|
}
|
|
10646
11234
|
skippedExisting.push(subagentDir);
|
|
10647
11235
|
subagentsProvisioned += 1;
|
|
@@ -10656,8 +11244,8 @@ async function provisionSubagents(options) {
|
|
|
10656
11244
|
if (!dryRun) {
|
|
10657
11245
|
await ensureDir(subagentDir);
|
|
10658
11246
|
await ensureDir(githubAgentsDir);
|
|
10659
|
-
await
|
|
10660
|
-
await
|
|
11247
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11248
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10661
11249
|
}
|
|
10662
11250
|
created.push(subagentDir);
|
|
10663
11251
|
subagentsProvisioned += 1;
|
|
@@ -10982,7 +11570,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
10982
11570
|
|
|
10983
11571
|
// src/evaluation/providers/targets-file.ts
|
|
10984
11572
|
import { constants as constants3 } from "node:fs";
|
|
10985
|
-
import { access as access3, readFile as
|
|
11573
|
+
import { access as access3, readFile as readFile8 } from "node:fs/promises";
|
|
10986
11574
|
import path28 from "node:path";
|
|
10987
11575
|
function isRecord(value) {
|
|
10988
11576
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -11026,7 +11614,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
11026
11614
|
if (!await fileExists2(absolutePath)) {
|
|
11027
11615
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
11028
11616
|
}
|
|
11029
|
-
const raw = await
|
|
11617
|
+
const raw = await readFile8(absolutePath, "utf8");
|
|
11030
11618
|
const parsed = parseYamlValue(raw);
|
|
11031
11619
|
if (!isRecord(parsed)) {
|
|
11032
11620
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -11217,6 +11805,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
|
11217
11805
|
output: context.output ?? null,
|
|
11218
11806
|
inputFiles: context.evalCase.file_paths,
|
|
11219
11807
|
input: context.evalCase.input,
|
|
11808
|
+
metadata: context.evalCase.metadata ?? null,
|
|
11220
11809
|
trace: context.trace ?? null,
|
|
11221
11810
|
fileChanges: context.fileChanges ?? null,
|
|
11222
11811
|
workspacePath: context.workspacePath ?? null,
|
|
@@ -11734,7 +12323,7 @@ function getTCritical(df) {
|
|
|
11734
12323
|
}
|
|
11735
12324
|
|
|
11736
12325
|
// src/evaluation/workspace/manager.ts
|
|
11737
|
-
import { cp, mkdir as
|
|
12326
|
+
import { cp, mkdir as mkdir13, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
|
|
11738
12327
|
import path33 from "node:path";
|
|
11739
12328
|
var TemplateNotFoundError = class extends Error {
|
|
11740
12329
|
constructor(templatePath) {
|
|
@@ -11768,7 +12357,7 @@ function getWorkspacePath(evalRunId, caseId, workspaceRoot) {
|
|
|
11768
12357
|
return path33.join(root, evalRunId, caseId);
|
|
11769
12358
|
}
|
|
11770
12359
|
async function copyDirectoryRecursive(src, dest) {
|
|
11771
|
-
await
|
|
12360
|
+
await mkdir13(dest, { recursive: true });
|
|
11772
12361
|
const entries = await readdir5(src, { withFileTypes: true });
|
|
11773
12362
|
for (const entry of entries) {
|
|
11774
12363
|
const srcPath = path33.join(src, entry.name);
|
|
@@ -11843,7 +12432,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
11843
12432
|
import { execFile } from "node:child_process";
|
|
11844
12433
|
import { createHash } from "node:crypto";
|
|
11845
12434
|
import { existsSync as existsSync3 } from "node:fs";
|
|
11846
|
-
import { cp as cp2, mkdir as
|
|
12435
|
+
import { cp as cp2, mkdir as mkdir14, readFile as readFile9, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile8 } from "node:fs/promises";
|
|
11847
12436
|
import path34 from "node:path";
|
|
11848
12437
|
import { promisify as promisify5 } from "node:util";
|
|
11849
12438
|
var execFileAsync = promisify5(execFile);
|
|
@@ -11897,7 +12486,7 @@ function computeWorkspaceFingerprint(repos) {
|
|
|
11897
12486
|
return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
11898
12487
|
}
|
|
11899
12488
|
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
11900
|
-
await
|
|
12489
|
+
await mkdir14(dest, { recursive: true });
|
|
11901
12490
|
const entries = await readdir6(src, { withFileTypes: true });
|
|
11902
12491
|
for (const entry of entries) {
|
|
11903
12492
|
const srcPath = path34.join(src, entry.name);
|
|
@@ -11935,7 +12524,7 @@ var WorkspacePoolManager = class {
|
|
|
11935
12524
|
const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
|
|
11936
12525
|
const fingerprint = computeWorkspaceFingerprint(repos);
|
|
11937
12526
|
const poolDir = path34.join(this.poolRoot, fingerprint);
|
|
11938
|
-
await
|
|
12527
|
+
await mkdir14(poolDir, { recursive: true });
|
|
11939
12528
|
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
11940
12529
|
if (drifted) {
|
|
11941
12530
|
console.warn(
|
|
@@ -11962,7 +12551,7 @@ var WorkspacePoolManager = class {
|
|
|
11962
12551
|
poolDir
|
|
11963
12552
|
};
|
|
11964
12553
|
}
|
|
11965
|
-
await
|
|
12554
|
+
await mkdir14(slotPath, { recursive: true });
|
|
11966
12555
|
if (templatePath) {
|
|
11967
12556
|
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
11968
12557
|
}
|
|
@@ -11999,14 +12588,14 @@ var WorkspacePoolManager = class {
|
|
|
11999
12588
|
async tryLock(lockPath) {
|
|
12000
12589
|
for (let attempt = 0; attempt < 3; attempt++) {
|
|
12001
12590
|
try {
|
|
12002
|
-
await
|
|
12591
|
+
await writeFile8(lockPath, String(process.pid), { flag: "wx" });
|
|
12003
12592
|
return true;
|
|
12004
12593
|
} catch (err) {
|
|
12005
12594
|
if (err.code !== "EEXIST") {
|
|
12006
12595
|
throw err;
|
|
12007
12596
|
}
|
|
12008
12597
|
try {
|
|
12009
|
-
const pidStr = await
|
|
12598
|
+
const pidStr = await readFile9(lockPath, "utf-8");
|
|
12010
12599
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
12011
12600
|
if (!Number.isNaN(pid)) {
|
|
12012
12601
|
try {
|
|
@@ -12033,7 +12622,7 @@ var WorkspacePoolManager = class {
|
|
|
12033
12622
|
async checkDrift(poolDir, fingerprint) {
|
|
12034
12623
|
const metadataPath = path34.join(poolDir, "metadata.json");
|
|
12035
12624
|
try {
|
|
12036
|
-
const raw = await
|
|
12625
|
+
const raw = await readFile9(metadataPath, "utf-8");
|
|
12037
12626
|
const metadata = JSON.parse(raw);
|
|
12038
12627
|
return metadata.fingerprint !== fingerprint;
|
|
12039
12628
|
} catch {
|
|
@@ -12048,7 +12637,7 @@ var WorkspacePoolManager = class {
|
|
|
12048
12637
|
repos,
|
|
12049
12638
|
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
12050
12639
|
};
|
|
12051
|
-
await
|
|
12640
|
+
await writeFile8(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
12052
12641
|
}
|
|
12053
12642
|
/** Remove all slot directories and their lock files from a pool directory. */
|
|
12054
12643
|
async removeAllSlots(poolDir) {
|
|
@@ -12058,7 +12647,7 @@ var WorkspacePoolManager = class {
|
|
|
12058
12647
|
const lockPath = path34.join(poolDir, `${entry}.lock`);
|
|
12059
12648
|
if (existsSync3(lockPath)) {
|
|
12060
12649
|
try {
|
|
12061
|
-
const pidStr = await
|
|
12650
|
+
const pidStr = await readFile9(lockPath, "utf-8");
|
|
12062
12651
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
12063
12652
|
if (!Number.isNaN(pid)) {
|
|
12064
12653
|
try {
|
|
@@ -12417,9 +13006,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12417
13006
|
}
|
|
12418
13007
|
|
|
12419
13008
|
// src/evaluation/yaml-parser.ts
|
|
12420
|
-
import { readFile as
|
|
13009
|
+
import { readFile as readFile16, stat as stat8 } from "node:fs/promises";
|
|
12421
13010
|
import path43 from "node:path";
|
|
12422
13011
|
import micromatch2 from "micromatch";
|
|
13012
|
+
import { stringify as stringifyYaml } from "yaml";
|
|
12423
13013
|
|
|
12424
13014
|
// src/evaluation/input-message-utils.ts
|
|
12425
13015
|
function flattenInputMessages(messages) {
|
|
@@ -12486,7 +13076,7 @@ function cloneJsonValue(value) {
|
|
|
12486
13076
|
}
|
|
12487
13077
|
|
|
12488
13078
|
// src/evaluation/loaders/agent-skills-parser.ts
|
|
12489
|
-
import { readFile as
|
|
13079
|
+
import { readFile as readFile10 } from "node:fs/promises";
|
|
12490
13080
|
import path37 from "node:path";
|
|
12491
13081
|
var ANSI_RED = "\x1B[31m";
|
|
12492
13082
|
var ANSI_RESET2 = "\x1B[0m";
|
|
@@ -12499,7 +13089,7 @@ function isAgentSkillsFormat(parsed) {
|
|
|
12499
13089
|
return Array.isArray(obj.evals);
|
|
12500
13090
|
}
|
|
12501
13091
|
async function loadTestsFromAgentSkills(filePath) {
|
|
12502
|
-
const raw = await
|
|
13092
|
+
const raw = await readFile10(filePath, "utf8");
|
|
12503
13093
|
let parsed;
|
|
12504
13094
|
try {
|
|
12505
13095
|
parsed = JSON.parse(raw);
|
|
@@ -12566,7 +13156,7 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
|
12566
13156
|
}
|
|
12567
13157
|
|
|
12568
13158
|
// src/evaluation/loaders/config-loader.ts
|
|
12569
|
-
import { readFile as
|
|
13159
|
+
import { readFile as readFile11 } from "node:fs/promises";
|
|
12570
13160
|
import path39 from "node:path";
|
|
12571
13161
|
|
|
12572
13162
|
// src/evaluation/loaders/file-resolver.ts
|
|
@@ -12680,53 +13270,59 @@ var DEFAULT_EVAL_PATTERNS = [
|
|
|
12680
13270
|
];
|
|
12681
13271
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
12682
13272
|
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
13273
|
+
const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
|
|
12683
13274
|
for (const directory of directories) {
|
|
12684
13275
|
const configPath = path39.join(directory, ".agentv", "config.yaml");
|
|
12685
13276
|
if (!await fileExists3(configPath)) {
|
|
12686
13277
|
continue;
|
|
12687
13278
|
}
|
|
12688
|
-
|
|
12689
|
-
|
|
12690
|
-
|
|
12691
|
-
if (!isJsonObject(parsed)) {
|
|
12692
|
-
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
12693
|
-
continue;
|
|
12694
|
-
}
|
|
12695
|
-
const config = parsed;
|
|
12696
|
-
const requiredVersion = parsed.required_version;
|
|
12697
|
-
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
12698
|
-
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
12699
|
-
continue;
|
|
12700
|
-
}
|
|
12701
|
-
const evalPatterns = config.eval_patterns;
|
|
12702
|
-
if (evalPatterns !== void 0 && !Array.isArray(evalPatterns)) {
|
|
12703
|
-
logWarning(`Invalid eval_patterns in ${configPath}, expected array`);
|
|
12704
|
-
continue;
|
|
12705
|
-
}
|
|
12706
|
-
if (Array.isArray(evalPatterns) && !evalPatterns.every((p) => typeof p === "string")) {
|
|
12707
|
-
logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
|
|
12708
|
-
continue;
|
|
12709
|
-
}
|
|
12710
|
-
const executionDefaults = parseExecutionDefaults(
|
|
12711
|
-
parsed.execution,
|
|
12712
|
-
configPath
|
|
12713
|
-
);
|
|
12714
|
-
const results = parseResultsConfig(parsed.results, configPath);
|
|
12715
|
-
const hooks = parseHooksConfig(parsed.hooks, configPath);
|
|
12716
|
-
return {
|
|
12717
|
-
required_version: requiredVersion,
|
|
12718
|
-
eval_patterns: evalPatterns,
|
|
12719
|
-
execution: executionDefaults,
|
|
12720
|
-
results,
|
|
12721
|
-
...hooks && { hooks }
|
|
12722
|
-
};
|
|
12723
|
-
} catch (error) {
|
|
12724
|
-
logWarning(
|
|
12725
|
-
`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
|
|
12726
|
-
);
|
|
13279
|
+
const config = await readConfigFile(configPath);
|
|
13280
|
+
if (config) {
|
|
13281
|
+
return config;
|
|
12727
13282
|
}
|
|
12728
13283
|
}
|
|
12729
|
-
return null;
|
|
13284
|
+
return await fileExists3(globalConfigPath) ? readConfigFile(globalConfigPath) : null;
|
|
13285
|
+
}
|
|
13286
|
+
async function readConfigFile(configPath) {
|
|
13287
|
+
try {
|
|
13288
|
+
const rawConfig = await readFile11(configPath, "utf8");
|
|
13289
|
+
const parsed = interpolateEnv(parseYamlValue(rawConfig), process.env);
|
|
13290
|
+
if (!isJsonObject(parsed)) {
|
|
13291
|
+
logWarning(`Invalid config.yaml format at ${configPath}`);
|
|
13292
|
+
return null;
|
|
13293
|
+
}
|
|
13294
|
+
const config = parsed;
|
|
13295
|
+
const requiredVersion = parsed.required_version;
|
|
13296
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
13297
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
13298
|
+
return null;
|
|
13299
|
+
}
|
|
13300
|
+
const evalPatterns = config.eval_patterns;
|
|
13301
|
+
if (evalPatterns !== void 0 && !Array.isArray(evalPatterns)) {
|
|
13302
|
+
logWarning(`Invalid eval_patterns in ${configPath}, expected array`);
|
|
13303
|
+
return null;
|
|
13304
|
+
}
|
|
13305
|
+
if (Array.isArray(evalPatterns) && !evalPatterns.every((p) => typeof p === "string")) {
|
|
13306
|
+
logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
|
|
13307
|
+
return null;
|
|
13308
|
+
}
|
|
13309
|
+
const executionDefaults = parseExecutionDefaults(
|
|
13310
|
+
parsed.execution,
|
|
13311
|
+
configPath
|
|
13312
|
+
);
|
|
13313
|
+
const results = parseResultsConfig(parsed.results, configPath);
|
|
13314
|
+
const hooks = parseHooksConfig(parsed.hooks, configPath);
|
|
13315
|
+
return {
|
|
13316
|
+
required_version: requiredVersion,
|
|
13317
|
+
eval_patterns: evalPatterns,
|
|
13318
|
+
execution: executionDefaults,
|
|
13319
|
+
results,
|
|
13320
|
+
...hooks && { hooks }
|
|
13321
|
+
};
|
|
13322
|
+
} catch (error) {
|
|
13323
|
+
logWarning(`Could not read config.yaml at ${configPath}: ${error.message}`);
|
|
13324
|
+
return null;
|
|
13325
|
+
}
|
|
12730
13326
|
}
|
|
12731
13327
|
function extractTargetFromSuite(suite) {
|
|
12732
13328
|
const execution = suite.execution;
|
|
@@ -12902,7 +13498,10 @@ function extractCacheConfig(suite) {
|
|
|
12902
13498
|
logWarning(`Invalid execution.cache: ${cache}. Must be a boolean. Ignoring.`);
|
|
12903
13499
|
return void 0;
|
|
12904
13500
|
}
|
|
12905
|
-
|
|
13501
|
+
if (executionObj.cachePath !== void 0) {
|
|
13502
|
+
logWarning("Invalid execution.cachePath: use snake_case execution.cache_path in YAML.");
|
|
13503
|
+
}
|
|
13504
|
+
const cachePath = executionObj.cache_path;
|
|
12906
13505
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
12907
13506
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
12908
13507
|
}
|
|
@@ -13071,6 +13670,12 @@ function parseResultsConfig(raw, configPath) {
|
|
|
13071
13670
|
...branchPrefix && { branch_prefix: branchPrefix }
|
|
13072
13671
|
};
|
|
13073
13672
|
}
|
|
13673
|
+
function resolveResultsConfigForProject(config, _projectId) {
|
|
13674
|
+
if (!config) {
|
|
13675
|
+
return void 0;
|
|
13676
|
+
}
|
|
13677
|
+
return config.results;
|
|
13678
|
+
}
|
|
13074
13679
|
function parseHooksConfig(raw, configPath) {
|
|
13075
13680
|
if (raw === void 0 || raw === null) {
|
|
13076
13681
|
return void 0;
|
|
@@ -13095,15 +13700,15 @@ function logWarning(message) {
|
|
|
13095
13700
|
}
|
|
13096
13701
|
|
|
13097
13702
|
// src/evaluation/loaders/grader-parser.ts
|
|
13098
|
-
import { readFile as
|
|
13703
|
+
import { readFile as readFile13 } from "node:fs/promises";
|
|
13099
13704
|
import path40 from "node:path";
|
|
13100
13705
|
|
|
13101
13706
|
// src/evaluation/validation/prompt-validator.ts
|
|
13102
|
-
import { readFile as
|
|
13707
|
+
import { readFile as readFile12 } from "node:fs/promises";
|
|
13103
13708
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
13104
13709
|
var ANSI_RESET4 = "\x1B[0m";
|
|
13105
13710
|
async function validateCustomPromptContent(promptPath) {
|
|
13106
|
-
const content = await
|
|
13711
|
+
const content = await readFile12(promptPath, "utf8");
|
|
13107
13712
|
validateTemplateVariables(content, promptPath);
|
|
13108
13713
|
}
|
|
13109
13714
|
function validateTemplateVariables(content, source) {
|
|
@@ -13235,7 +13840,7 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
13235
13840
|
const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
|
|
13236
13841
|
throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
|
|
13237
13842
|
}
|
|
13238
|
-
const content = await
|
|
13843
|
+
const content = await readFile13(resolved.resolvedPath, "utf8");
|
|
13239
13844
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
13240
13845
|
if (!isJsonObject2(parsed)) {
|
|
13241
13846
|
throw new Error(
|
|
@@ -13282,6 +13887,103 @@ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, inc
|
|
|
13282
13887
|
}
|
|
13283
13888
|
return expanded;
|
|
13284
13889
|
}
|
|
13890
|
+
async function collectAssertionTemplateSourceReferences(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
13891
|
+
const execution = rawEvalCase.execution;
|
|
13892
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
13893
|
+
const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators;
|
|
13894
|
+
const skipDefaults = executionObject?.skip_defaults === true;
|
|
13895
|
+
const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
|
|
13896
|
+
return [
|
|
13897
|
+
...await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId),
|
|
13898
|
+
...await collectAssertionTemplateReferencesFromValue(rootEvaluators, searchRoots, evalId)
|
|
13899
|
+
];
|
|
13900
|
+
}
|
|
13901
|
+
/**
 * Walks a grader/assertion value and returns one source reference per `include`
 * entry reachable from it, following includes into the referenced template files.
 *
 * - `undefined` and values that are neither arrays nor JSON objects yield `[]`.
 * - A JSON object is scanned through its nested grader lists.
 * - In an array, include entries are resolved (and recursed into); other JSON
 *   objects are scanned for nested grader lists; anything else is ignored.
 *
 * @param {unknown} value - Grader list, single grader object, or undefined.
 * @param {string[]} searchRoots - Directories used to resolve include targets.
 * @param {string} evalId - Test id used in error messages.
 * @param {{depth: number, chain: string[]}} [includeContext] - Current include depth
 *   and the resolved paths of the templates being expanded.
 * @returns {Promise<object[]>} References in discovery order.
 * @throws {Error} When the include depth limit is exceeded or an include cycle is found.
 */
async function collectAssertionTemplateReferencesFromValue(value, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
  if (value === void 0) {
    return [];
  }
  if (!Array.isArray(value)) {
    if (!isJsonObject2(value)) {
      return [];
    }
    return [
      ...await collectAssertionTemplateReferencesFromObject(value, searchRoots, evalId, includeContext)
    ];
  }
  const references = [];
  for (const item of value) {
    if (isIncludeEntry(item)) {
      references.push(
        ...await collectAssertionTemplateIncludeReferences(item, searchRoots, evalId, includeContext)
      );
    } else if (isJsonObject2(item)) {
      references.push(
        ...await collectAssertionTemplateReferencesFromObject(item, searchRoots, evalId, includeContext)
      );
    }
  }
  return references;
}

/**
 * Resolves a single include entry into its own reference plus the references
 * found inside the included template (when it can be read and has `assertions`).
 */
async function collectAssertionTemplateIncludeReferences(entry, searchRoots, evalId, includeContext) {
  const { depth, chain } = includeContext;
  const nextDepth = depth + 1;
  if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
    const includeChain = [...chain, entry.include].join(" -> ");
    throw new Error(
      `Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${includeChain}`
    );
  }
  const { displayPath, resolvedPath } = await resolveAssertionTemplateReference(entry.include, searchRoots);
  const ownReference = { kind: "assertion_template", displayPath };
  if (resolvedPath) {
    ownReference.resolvedPath = path40.resolve(resolvedPath);
  }
  const collected = [ownReference];
  if (!resolvedPath) {
    // Unresolvable includes are still reported, just without a resolved path.
    return collected;
  }
  if (chain.includes(resolvedPath)) {
    const cycle = [...chain, resolvedPath].join(" -> ");
    throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
  }
  const content = await readFile13(resolvedPath, "utf8");
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
  if (!isJsonObject2(parsed) || !Array.isArray(parsed.assertions)) {
    return collected;
  }
  // Nested includes are resolved relative to the template's own directory first.
  const templateDir = path40.dirname(resolvedPath);
  const nestedSearchRoots = [
    templateDir,
    ...searchRoots.filter((root) => path40.resolve(root) !== templateDir)
  ];
  const nestedReferences = await collectAssertionTemplateReferencesFromValue(
    parsed.assertions,
    nestedSearchRoots,
    evalId,
    { depth: nextDepth, chain: [...chain, resolvedPath] }
  );
  collected.push(...nestedReferences);
  return collected;
}
|
|
13973
|
+
/**
 * Collects assertion-template references from the nested grader lists of one
 * object, checking its `assertions`, `assert` and `evaluators` properties in
 * that order.
 *
 * @param {object} value - Object that may carry nested grader lists.
 * @param {string[]} searchRoots - Directories used to resolve include targets.
 * @param {string} evalId - Test id used in error messages.
 * @param {{depth: number, chain: string[]}} includeContext - Include depth/chain so far.
 * @returns {Promise<object[]>} References found under the three properties, concatenated.
 */
async function collectAssertionTemplateReferencesFromObject(value, searchRoots, evalId, includeContext) {
  const nestedListKeys = ["assertions", "assert", "evaluators"];
  const collected = [];
  for (const listKey of nestedListKeys) {
    const nested = await collectAssertionTemplateReferencesFromValue(
      value[listKey],
      searchRoots,
      evalId,
      includeContext
    );
    collected.push(...nested);
  }
  return collected;
}
|
|
13285
13987
|
async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
|
|
13286
13988
|
const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
|
|
13287
13989
|
if (!expandedEvaluators) {
|
|
@@ -13408,6 +14110,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
13408
14110
|
continue;
|
|
13409
14111
|
}
|
|
13410
14112
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
14113
|
+
const resolvedScriptPath = await resolveOptionalCommandSource(command, searchRoots);
|
|
13411
14114
|
const cwd = asString(rawEvaluator.cwd);
|
|
13412
14115
|
let resolvedCwd;
|
|
13413
14116
|
if (cwd) {
|
|
@@ -13473,6 +14176,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
13473
14176
|
name,
|
|
13474
14177
|
type: "code-grader",
|
|
13475
14178
|
command,
|
|
14179
|
+
...resolvedScriptPath ? { resolvedScriptPath } : {},
|
|
13476
14180
|
cwd,
|
|
13477
14181
|
resolvedCwd,
|
|
13478
14182
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -14540,6 +15244,17 @@ function asStringArray(value, description) {
|
|
|
14540
15244
|
}
|
|
14541
15245
|
return result;
|
|
14542
15246
|
}
|
|
15247
|
+
/**
 * Tries to resolve the last element of a grader command to a file on disk.
 *
 * Only the final argument is considered, and only when it looks like a file path.
 *
 * @param {string[]} command - Command argv.
 * @param {string[]} searchRoots - Directories used to resolve the file reference.
 * @returns {Promise<string|undefined>} Absolute path of the resolved file, or
 *   undefined when the last argument is missing, not path-like, or unresolved.
 */
async function resolveOptionalCommandSource(command, searchRoots) {
  const lastArgument = command.at(-1);
  const isCandidate = Boolean(lastArgument) && looksLikeFilePath(lastArgument);
  if (!isCandidate) {
    return void 0;
  }
  const { resolvedPath } = await resolveFileReference(lastArgument, searchRoots);
  if (!resolvedPath) {
    return void 0;
  }
  return path40.resolve(resolvedPath);
}
|
|
15255
|
+
/**
 * Heuristic check for whether a command argument is a file path.
 *
 * True when the value is absolute, starts with a dot, contains a `/` or `\`
 * separator, or ends with one of the recognised script extensions
 * (js/ts variants, py, sh, bash, rb, go, rs; case-insensitive).
 *
 * @param {string} value - Command argument to classify.
 * @returns {boolean} Whether the argument looks like a file path.
 */
function looksLikeFilePath(value) {
  if (path40.isAbsolute(value) || value.startsWith(".")) {
    return true;
  }
  if (value.includes("/") || value.includes("\\")) {
    return true;
  }
  return /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
}
|
|
14543
15258
|
function parseCommandToArgv(command) {
|
|
14544
15259
|
if (process.platform === "win32") {
|
|
14545
15260
|
return ["cmd.exe", "/c", command];
|
|
@@ -14608,6 +15323,19 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
14608
15323
|
function isValidFieldAggregationType(value) {
|
|
14609
15324
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
14610
15325
|
}
|
|
15326
|
+
var VALID_RUBRIC_OPERATORS = new Set(RUBRIC_OPERATOR_VALUES);
|
|
15327
|
+
/**
 * Validates the optional `operator` of a rubric item.
 *
 * @param {unknown} value - Raw operator value from the rubric definition.
 * @param {string} rubricId - Rubric id, used in the warning text.
 * @param {string} evaluatorName - Owning evaluator name, used in the warning text.
 * @param {string} evalId - Test id, used in the warning text.
 * @returns {string|undefined} The operator when it is one of the allowed values;
 *   undefined when it is absent, or when it is invalid (a warning is logged).
 */
function parseRubricOperator(value, rubricId, evaluatorName, evalId) {
  if (value === void 0) {
    return void 0;
  }
  const isAllowed = typeof value === "string" && VALID_RUBRIC_OPERATORS.has(value);
  if (!isAllowed) {
    // Invalid operators are dropped rather than rejected, so the rubric still loads.
    logWarning2(
      `Ignoring invalid operator for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be one of ${RUBRIC_OPERATOR_VALUES.join(", ")}`
    );
    return void 0;
  }
  return value;
}
|
|
14611
15339
|
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
14612
15340
|
const items = [];
|
|
14613
15341
|
for (const [index, rawRubric] of rawRubrics.entries()) {
|
|
@@ -14618,7 +15346,8 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
14618
15346
|
continue;
|
|
14619
15347
|
}
|
|
14620
15348
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
14621
|
-
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
15349
|
+
const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? "";
|
|
15350
|
+
const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
|
|
14622
15351
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
14623
15352
|
let minScore;
|
|
14624
15353
|
let requiredMinScore;
|
|
@@ -14662,6 +15391,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
14662
15391
|
id,
|
|
14663
15392
|
weight,
|
|
14664
15393
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
15394
|
+
...operator !== void 0 ? { operator } : {},
|
|
14665
15395
|
...required !== void 0 ? { required } : {},
|
|
14666
15396
|
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
14667
15397
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
@@ -14677,6 +15407,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
14677
15407
|
items.push({
|
|
14678
15408
|
id,
|
|
14679
15409
|
outcome: expectedOutcome,
|
|
15410
|
+
...operator !== void 0 ? { operator } : {},
|
|
14680
15411
|
weight,
|
|
14681
15412
|
// Default to required: true if not specified (backward compatibility)
|
|
14682
15413
|
required: required ?? true,
|
|
@@ -14799,6 +15530,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
14799
15530
|
};
|
|
14800
15531
|
}
|
|
14801
15532
|
const expectedOutcome = asString(rubric.outcome) ?? "";
|
|
15533
|
+
const id = asString(rubric.id) ?? `rubric-${index + 1}`;
|
|
15534
|
+
const operator = parseRubricOperator(rubric.operator, id, "rubrics", "<inline>");
|
|
14802
15535
|
const rawScoreRanges = rubric.score_ranges;
|
|
14803
15536
|
const normalizedScoreRanges = rawScoreRanges !== void 0 ? normalizeScoreRangesShorthand(rawScoreRanges) : void 0;
|
|
14804
15537
|
const scoreRanges = Array.isArray(normalizedScoreRanges) && normalizedScoreRanges.length > 0 ? normalizedScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
|
|
@@ -14806,7 +15539,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
14806
15539
|
outcome: asString(range.outcome) ?? ""
|
|
14807
15540
|
})).filter((r) => r.outcome.length > 0) : void 0;
|
|
14808
15541
|
const baseRubric = {
|
|
14809
|
-
id
|
|
15542
|
+
id,
|
|
15543
|
+
...operator !== void 0 ? { operator } : {},
|
|
14810
15544
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
14811
15545
|
};
|
|
14812
15546
|
let inlineMinScore;
|
|
@@ -14847,12 +15581,12 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
14847
15581
|
}
|
|
14848
15582
|
|
|
14849
15583
|
// src/evaluation/loaders/jsonl-parser.ts
|
|
14850
|
-
import { readFile as
|
|
15584
|
+
import { readFile as readFile15 } from "node:fs/promises";
|
|
14851
15585
|
import path42 from "node:path";
|
|
14852
15586
|
import micromatch from "micromatch";
|
|
14853
15587
|
|
|
14854
15588
|
// src/evaluation/loaders/message-processor.ts
|
|
14855
|
-
import { readFile as
|
|
15589
|
+
import { readFile as readFile14 } from "node:fs/promises";
|
|
14856
15590
|
import path41 from "node:path";
|
|
14857
15591
|
|
|
14858
15592
|
// src/evaluation/formatting/segment-formatter.ts
|
|
@@ -14979,7 +15713,7 @@ async function processMessages(options) {
|
|
|
14979
15713
|
continue;
|
|
14980
15714
|
}
|
|
14981
15715
|
try {
|
|
14982
|
-
const fileContent = (await
|
|
15716
|
+
const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
14983
15717
|
processedContent.push({
|
|
14984
15718
|
...cloneJsonObject(rawSegment),
|
|
14985
15719
|
path: displayPath,
|
|
@@ -15020,7 +15754,7 @@ async function processMessages(options) {
|
|
|
15020
15754
|
continue;
|
|
15021
15755
|
}
|
|
15022
15756
|
try {
|
|
15023
|
-
const imageBuffer = await
|
|
15757
|
+
const imageBuffer = await readFile14(resolvedPath);
|
|
15024
15758
|
const base64 = imageBuffer.toString("base64");
|
|
15025
15759
|
processedContent.push({
|
|
15026
15760
|
type: "image",
|
|
@@ -15103,7 +15837,7 @@ async function processExpectedMessages(options) {
|
|
|
15103
15837
|
continue;
|
|
15104
15838
|
}
|
|
15105
15839
|
try {
|
|
15106
|
-
const fileContent = (await
|
|
15840
|
+
const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
15107
15841
|
processedContent.push({
|
|
15108
15842
|
type: "file",
|
|
15109
15843
|
path: displayPath,
|
|
@@ -15143,7 +15877,7 @@ async function processExpectedMessages(options) {
|
|
|
15143
15877
|
continue;
|
|
15144
15878
|
}
|
|
15145
15879
|
try {
|
|
15146
|
-
const imageBuffer = await
|
|
15880
|
+
const imageBuffer = await readFile14(resolvedPath);
|
|
15147
15881
|
const base64 = imageBuffer.toString("base64");
|
|
15148
15882
|
processedContent.push({
|
|
15149
15883
|
type: "image",
|
|
@@ -15185,6 +15919,12 @@ function expandInputShorthand(value) {
|
|
|
15185
15919
|
if (typeof value === "string") {
|
|
15186
15920
|
return [{ role: "user", content: value }];
|
|
15187
15921
|
}
|
|
15922
|
+
if (isJsonObject(value)) {
|
|
15923
|
+
if ("role" in value) {
|
|
15924
|
+
return isTestMessage(value) ? [value] : void 0;
|
|
15925
|
+
}
|
|
15926
|
+
return [{ role: "user", content: value }];
|
|
15927
|
+
}
|
|
15188
15928
|
if (Array.isArray(value)) {
|
|
15189
15929
|
const messages = value.filter((msg) => isTestMessage(msg));
|
|
15190
15930
|
return messages.length > 0 ? messages : void 0;
|
|
@@ -15272,7 +16012,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
|
15272
16012
|
return {};
|
|
15273
16013
|
}
|
|
15274
16014
|
try {
|
|
15275
|
-
const content = await
|
|
16015
|
+
const content = await readFile15(sidecarPath, "utf8");
|
|
15276
16016
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
15277
16017
|
if (!isJsonObject(parsed)) {
|
|
15278
16018
|
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
@@ -15317,7 +16057,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
15317
16057
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
15318
16058
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
15319
16059
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
15320
|
-
const rawFile = await
|
|
16060
|
+
const rawFile = await readFile15(absoluteTestPath, "utf8");
|
|
15321
16061
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
15322
16062
|
const fallbackSuiteName = path42.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
15323
16063
|
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
@@ -15454,16 +16194,16 @@ ${detailBlock}${ANSI_RESET7}`);
|
|
|
15454
16194
|
}
|
|
15455
16195
|
|
|
15456
16196
|
// src/evaluation/metadata.ts
|
|
15457
|
-
import { z as
|
|
15458
|
-
var MetadataSchema =
|
|
15459
|
-
name:
|
|
15460
|
-
description:
|
|
15461
|
-
version:
|
|
15462
|
-
author:
|
|
15463
|
-
tags:
|
|
15464
|
-
license:
|
|
15465
|
-
requires:
|
|
15466
|
-
agentv:
|
|
16197
|
+
import { z as z4 } from "zod";
|
|
16198
|
+
// Schema for the optional suite metadata fields. Every field is optional.
var MetadataSchema = z4.object({
  // 1-64 characters, restricted to lowercase letters, digits and hyphens.
  name: z4
    .string()
    .min(1)
    .max(64)
    .regex(/^[a-z0-9-]+$/)
    .optional(),
  // 1-1024 characters of free text.
  description: z4
    .string()
    .min(1)
    .max(1024)
    .optional(),
  version: z4.string().optional(),
  author: z4.string().optional(),
  tags: z4.array(z4.string()).optional(),
  license: z4.string().optional(),
  // Requirements block; currently only an optional `agentv` string.
  requires: z4
    .object({
      agentv: z4.string().optional()
    })
    .optional()
});
|
|
15469
16209
|
function parseMetadata(suite) {
|
|
@@ -15735,7 +16475,7 @@ function interpolateRawEvalCase(raw, vars) {
|
|
|
15735
16475
|
async function readTestSuiteMetadata(testFilePath) {
|
|
15736
16476
|
try {
|
|
15737
16477
|
const absolutePath = path43.resolve(testFilePath);
|
|
15738
|
-
const content = await
|
|
16478
|
+
const content = await readFile16(absolutePath, "utf8");
|
|
15739
16479
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
15740
16480
|
if (!isJsonObject(parsed)) {
|
|
15741
16481
|
return {};
|
|
@@ -15759,7 +16499,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
15759
16499
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
15760
16500
|
}
|
|
15761
16501
|
if (format === "typescript") {
|
|
15762
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
16502
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT.js");
|
|
15763
16503
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
15764
16504
|
}
|
|
15765
16505
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -15794,7 +16534,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
15794
16534
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
15795
16535
|
}
|
|
15796
16536
|
if (format === "typescript") {
|
|
15797
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
16537
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT.js");
|
|
15798
16538
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
15799
16539
|
return suite.tests;
|
|
15800
16540
|
}
|
|
@@ -15809,8 +16549,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15809
16549
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
15810
16550
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
15811
16551
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
15812
|
-
const rawFile = await
|
|
15813
|
-
const
|
|
16552
|
+
const rawFile = await readFile16(absoluteTestPath, "utf8");
|
|
16553
|
+
const rawParsed = parseYamlValue(rawFile);
|
|
16554
|
+
const rawCaseSnapshots = buildRawInlineTestSnapshots(rawParsed);
|
|
16555
|
+
const interpolated = interpolateEnv(rawParsed, process.env);
|
|
15814
16556
|
if (!isJsonObject(interpolated)) {
|
|
15815
16557
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
15816
16558
|
}
|
|
@@ -15847,7 +16589,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15847
16589
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
15848
16590
|
}
|
|
15849
16591
|
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
15850
|
-
const
|
|
16592
|
+
const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
|
|
15851
16593
|
const rawSuiteInput = suite.input;
|
|
15852
16594
|
const rawSuiteInputFiles = suite.input_files;
|
|
15853
16595
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
@@ -15949,6 +16691,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15949
16691
|
logError3(`Skipping test '${id}': ${message}`);
|
|
15950
16692
|
continue;
|
|
15951
16693
|
}
|
|
16694
|
+
const assertionTemplateReferences = await collectAssertionTemplateSourceReferences(
|
|
16695
|
+
renderedCase,
|
|
16696
|
+
globalExecution,
|
|
16697
|
+
searchRoots,
|
|
16698
|
+
id ?? "unknown"
|
|
16699
|
+
);
|
|
15952
16700
|
const inlineRubrics = renderedCase.rubrics;
|
|
15953
16701
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
15954
16702
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
@@ -15961,8 +16709,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15961
16709
|
const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir);
|
|
15962
16710
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
15963
16711
|
const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? renderedCase.metadata : void 0;
|
|
15964
|
-
const
|
|
15965
|
-
const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload);
|
|
16712
|
+
const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
|
|
15966
16713
|
const caseTargets = extractTargetsFromTestCase(renderedCase);
|
|
15967
16714
|
const dependsOn = Array.isArray(renderedCase.depends_on) ? renderedCase.depends_on.filter(
|
|
15968
16715
|
(v) => typeof v === "string"
|
|
@@ -16001,12 +16748,245 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16001
16748
|
...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
|
|
16002
16749
|
...windowSize !== void 0 ? { window_size: windowSize } : {},
|
|
16003
16750
|
...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
|
|
16004
|
-
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
|
|
16751
|
+
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {},
|
|
16752
|
+
source: buildEvalTestSource({
|
|
16753
|
+
evalFilePath,
|
|
16754
|
+
absoluteTestPath,
|
|
16755
|
+
repoRootPath,
|
|
16756
|
+
id,
|
|
16757
|
+
renderedCase,
|
|
16758
|
+
rawCaseSnapshots,
|
|
16759
|
+
inputMessages,
|
|
16760
|
+
evaluators,
|
|
16761
|
+
assertionTemplateReferences
|
|
16762
|
+
})
|
|
16005
16763
|
};
|
|
16006
16764
|
results.push(testCase);
|
|
16007
16765
|
}
|
|
16008
16766
|
return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
|
|
16009
16767
|
}
|
|
16768
|
+
var SOURCE_SECRET_KEY_PATTERN = /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i;
|
|
16769
|
+
var REDACTED_SOURCE_VALUE = "[redacted]";
|
|
16770
|
+
/**
 * Builds a map from test id to a sanitized YAML snapshot of the test as it
 * appears in the parsed (not yet env-interpolated) eval file.
 *
 * Tests are read from `tests`, `eval_cases` or `evalcases` (first one that is
 * not null/undefined). Entries that are not objects or have no string `id` are
 * skipped. When an id repeats, the later entry's snapshot wins.
 *
 * @param {unknown} rawParsed - Parsed eval file content.
 * @returns {Map<string, string>} Snapshot YAML keyed by test id; empty when the
 *   file has no inline test array.
 */
function buildRawInlineTestSnapshots(rawParsed) {
  const rawTests = isJsonObject(rawParsed) ? rawParsed.tests ?? rawParsed.eval_cases ?? rawParsed.evalcases : void 0;
  const snapshots = /* @__PURE__ */ new Map();
  if (!Array.isArray(rawTests)) {
    return snapshots;
  }
  for (const rawTest of rawTests) {
    const hasStringId = isJsonObject(rawTest) && typeof rawTest.id === "string";
    if (hasStringId) {
      snapshots.set(rawTest.id, stringifySourceYaml(rawTest));
    }
  }
  return snapshots;
}
|
|
16787
|
+
/**
 * Assembles the `source` descriptor attached to a loaded test case.
 *
 * @param {object} params
 * @param {string} params.evalFilePath - Eval file path as given by the caller.
 * @param {string} params.absoluteTestPath - Absolute path of the eval file.
 * @param {string} params.repoRootPath - Repository root used for the repo-relative path.
 * @param {string} params.id - Test id.
 * @param {object} params.renderedCase - Test case used for the snapshot when no raw snapshot exists for `id`.
 * @param {Map<string, string>} params.rawCaseSnapshots - Raw YAML snapshots keyed by test id.
 * @param {object[]} params.inputMessages - Processed input messages.
 * @param {object[]|undefined} params.evaluators - Parsed graders.
 * @param {object[]} params.assertionTemplateReferences - References to included assertion templates.
 * @returns {object} Source descriptor holding file paths, the snapshot YAML, grader
 *   definitions and the de-duplicated list of referenced files/commands.
 */
function buildEvalTestSource(params) {
  const {
    evalFilePath,
    absoluteTestPath,
    repoRootPath,
    id,
    renderedCase,
    rawCaseSnapshots,
    inputMessages,
    evaluators,
    assertionTemplateReferences
  } = params;
  const evalFileRepoPath = toPortableRelativePath(repoRootPath, absoluteTestPath);
  // Prefer the raw snapshot; fall back to the rendered case when the id has none.
  const testSnapshotYaml = rawCaseSnapshots.get(id) ?? stringifySourceYaml(renderedCase);
  const evaluatorReferences = collectGraderSourceReferences(evaluators);
  const inputReferences = collectInputSourceReferences(inputMessages);
  const references = dedupeSourceReferences([
    ...inputReferences,
    ...evaluatorReferences,
    ...assertionTemplateReferences
  ]);
  const source = {
    evalFilePath,
    evalFileAbsolutePath: absoluteTestPath
  };
  if (evalFileRepoPath) {
    source.evalFileRepoPath = evalFileRepoPath;
  }
  source.testId = id;
  source.testSnapshotYaml = testSnapshotYaml;
  source.graderDefinitions = buildGraderSourceDefinitions(evaluators);
  source.references = references;
  return source;
}
|
|
16807
|
+
/**
 * Serializes a value to YAML after passing it through `sanitizeSourceValue`.
 * Line wrapping is disabled (`lineWidth: 0`) and trailing whitespace is trimmed.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} YAML text without trailing whitespace.
 */
function stringifySourceYaml(value) {
  const sanitized = sanitizeSourceValue(value);
  const yamlText = stringifyYaml(sanitized, { lineWidth: 0 });
  return yamlText.trimEnd();
}
|
|
16810
|
+
/**
 * Produces a plain copy of a value that is safe to persist, replacing the
 * value of any secret-looking key with the redaction placeholder.
 *
 * - If `keyHint` matches `SOURCE_SECRET_KEY_PATTERN`, the whole value is
 *   replaced by `REDACTED_SOURCE_VALUE`, whatever its type.
 * - `null`, strings, numbers and booleans are returned unchanged.
 * - Arrays are sanitized element by element (elements carry no key hint).
 * - Objects are rebuilt from their own enumerable entries, each sanitized with
 *   its key as the hint. Entries whose value is `undefined` are omitted, as
 *   `JSON.stringify` does. Previously they fell through to `String(value)` and
 *   were emitted as the literal string "undefined" (or "[redacted]" for
 *   secret-like keys), e.g. for a grader config with no `cwd`.
 * - Any other value is converted with `String(value)`.
 *
 * NOTE(review): the key pattern is an unanchored substring match, so keys such
 * as `max_tokens` are redacted too — confirm that is intended.
 *
 * @param {unknown} value - Value to sanitize.
 * @param {string} [keyHint] - Name of the property that holds `value`, if any.
 * @returns {unknown} Sanitized copy of the value.
 */
function sanitizeSourceValue(value, keyHint) {
  if (keyHint && SOURCE_SECRET_KEY_PATTERN.test(keyHint)) {
    return REDACTED_SOURCE_VALUE;
  }
  if (value === null || typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
    return value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeSourceValue(item));
  }
  if (typeof value === "object") {
    const entries = Object.entries(value)
      // Absent optional fields must not appear in the output at all.
      .filter(([, entryValue]) => entryValue !== void 0)
      .map(([key, entryValue]) => [key, sanitizeSourceValue(entryValue, key)]);
    return Object.fromEntries(entries);
  }
  return String(value);
}
|
|
16832
|
+
/**
 * Summarises each grader for the test's source descriptor.
 *
 * Every entry carries `name`, `type` and a sanitized `definition`; `weight`,
 * `required` and `minScore` (taken from `min_score`) are included only when defined.
 *
 * @param {object[]|undefined|null} evaluators - Parsed graders.
 * @returns {object[]} One summary per grader; empty when `evaluators` is null/undefined.
 */
function buildGraderSourceDefinitions(evaluators) {
  const describeGrader = (evaluator) => {
    const summary = { name: evaluator.name, type: evaluator.type };
    if (evaluator.weight !== void 0) {
      summary.weight = evaluator.weight;
    }
    if (evaluator.required !== void 0) {
      summary.required = evaluator.required;
    }
    if ("min_score" in evaluator && evaluator.min_score !== void 0) {
      summary.minScore = evaluator.min_score;
    }
    summary.definition = sanitizeGraderDefinition(evaluator);
    return summary;
  };
  const graders = evaluators ?? [];
  return graders.map((evaluator) => describeGrader(evaluator));
}
|
|
16842
|
+
/**
 * Returns a grader definition with secret-looking values redacted and
 * runtime-resolution fields removed.
 *
 * @param {object} evaluator - Parsed grader configuration.
 * @returns {object} Sanitized copy of the grader configuration.
 */
function sanitizeGraderDefinition(evaluator) {
  // Redact first, then drop the runtime-only keys from the sanitized copy.
  return stripRuntimeResolutionFields(sanitizeSourceValue(evaluator));
}
|
|
16846
|
+
/**
 * Returns a copy of an object without the runtime-resolution keys
 * (`resolvedPromptPath`, `promptPath`, `resolvedPromptScript`,
 * `resolvedScriptPath`, `resolvedCwd`, `resolvedCommand`).
 *
 * Nested objects, and objects that are direct elements of an array value, are
 * stripped recursively; all other values are copied by reference.
 *
 * @param {object} value - Object to copy.
 * @returns {object} Copy without the runtime-resolution keys.
 */
function stripRuntimeResolutionFields(value) {
  const runtimeOnlyKeys = [
    "resolvedPromptPath",
    "promptPath",
    "resolvedPromptScript",
    "resolvedScriptPath",
    "resolvedCwd",
    "resolvedCommand"
  ];
  const stripIfObject = (candidate) => isJsonObject(candidate) ? stripRuntimeResolutionFields(candidate) : candidate;
  const stripped = {};
  for (const [key, entryValue] of Object.entries(value)) {
    if (runtimeOnlyKeys.includes(key)) {
      continue;
    }
    stripped[key] = Array.isArray(entryValue) ? entryValue.map((item) => stripIfObject(item)) : stripIfObject(entryValue);
  }
  return stripped;
}
|
|
16864
|
+
/**
 * Lists the file segments referenced by a test's input messages.
 *
 * Only messages whose `content` is an array are inspected, and only segments
 * that are objects with `type === "file"` produce a reference.
 *
 * @param {object[]} inputMessages - Processed input messages.
 * @returns {object[]} `input_file` references. `displayPath` is the segment's
 *   `path`, else its `value`, else the literal "input file"; `resolvedPath` is
 *   present (made absolute) only when the segment carries a string `resolvedPath`.
 */
function collectInputSourceReferences(inputMessages) {
  const references = [];
  for (const message of inputMessages) {
    const segments = message.content;
    if (!Array.isArray(segments)) {
      continue;
    }
    for (const segment of segments) {
      const isFileSegment = isJsonObject(segment) && segment.type === "file";
      if (!isFileSegment) {
        continue;
      }
      let displayPath = "input file";
      if (typeof segment.path === "string") {
        displayPath = segment.path;
      } else if (typeof segment.value === "string") {
        displayPath = segment.value;
      }
      const reference = { kind: "input_file", displayPath };
      if (typeof segment.resolvedPath === "string") {
        reference.resolvedPath = path43.resolve(segment.resolvedPath);
      }
      references.push(reference);
    }
  }
  return references;
}
|
|
16884
|
+
/**
 * Collects the source references of every grader in a list, in list order.
 *
 * @param {object[]|undefined|null} evaluators - Parsed graders.
 * @returns {object[]} Concatenated references; empty when `evaluators` is null/undefined.
 */
function collectGraderSourceReferences(evaluators) {
  const collected = [];
  const graders = evaluators ?? [];
  for (const grader of graders) {
    const graderReferences = collectSingleGraderSourceReferences(grader);
    collected.push(...graderReferences);
  }
  return collected;
}
|
|
16891
|
+
/**
 * Lists the files and commands one grader depends on.
 *
 * References are produced in this order:
 * 1. `code-grader`: the command (with the resolved script path when known),
 *    then the working directory when `resolvedCwd` is set.
 * 2. `llm-grader`: the prompt file (when a prompt path is known), then the
 *    prompt script (when `resolvedPromptScript` is non-empty).
 * 3. Any grader with `preprocessors`: one entry per preprocessor that has a
 *    non-empty `resolvedCommand`.
 * 4. `composite`: the references of each member grader (recursively), then the
 *    aggregator's script (`code-grader`) or prompt file (`llm-grader` with a
 *    `promptPath`).
 *
 * @param {object} evaluator - Parsed grader configuration.
 * @returns {object[]} Source references for this grader.
 */
function collectSingleGraderSourceReferences(evaluator) {
  const references = [];
  const graderName = evaluator.name;

  if (evaluator.type === "code-grader") {
    const command = evaluator.command ?? evaluator.script ?? [];
    const scriptPath = evaluator.resolvedScriptPath;
    const commandReference = {
      kind: "code_grader_command",
      // Fall back to the joined argv when no script path was resolved.
      displayPath: scriptPath ?? command.join(" ")
    };
    if (scriptPath) {
      commandReference.resolvedPath = scriptPath;
    }
    commandReference.graderName = graderName;
    commandReference.command = command;
    references.push(commandReference);
    if (evaluator.resolvedCwd) {
      references.push({
        kind: "code_grader_cwd",
        displayPath: evaluator.cwd ?? evaluator.resolvedCwd,
        resolvedPath: evaluator.resolvedCwd,
        graderName
      });
    }
  }

  if (evaluator.type === "llm-grader") {
    const promptPath = evaluator.resolvedPromptPath ?? evaluator.promptPath;
    if (promptPath) {
      references.push({
        kind: "llm_grader_prompt",
        displayPath: typeof evaluator.prompt === "string" ? evaluator.prompt : promptPath,
        resolvedPath: promptPath,
        graderName
      });
    }
    const promptScript = evaluator.resolvedPromptScript;
    if (promptScript && promptScript.length > 0) {
      const promptScriptPath = promptScript.at(-1);
      references.push({
        kind: "prompt_script",
        displayPath: promptScriptPath ?? graderName,
        resolvedPath: promptScriptPath,
        graderName,
        command: promptScript
      });
    }
  }

  const preprocessors = "preprocessors" in evaluator ? evaluator.preprocessors : void 0;
  for (const preprocessor of preprocessors ?? []) {
    const preprocessorCommand = preprocessor.resolvedCommand;
    if (!preprocessorCommand || preprocessorCommand.length === 0) {
      continue;
    }
    const preprocessorPath = preprocessorCommand.at(-1);
    references.push({
      kind: "preprocessor_command",
      displayPath: preprocessorPath ?? preprocessor.type,
      resolvedPath: preprocessorPath,
      graderName,
      command: preprocessorCommand
    });
  }

  if (evaluator.type === "composite") {
    for (const member of evaluator.assertions) {
      references.push(...collectSingleGraderSourceReferences(member));
    }
    const aggregator = evaluator.aggregator;
    if (aggregator.type === "code-grader") {
      references.push({
        kind: "code_grader_command",
        displayPath: aggregator.path,
        resolvedPath: path43.resolve(aggregator.cwd ?? "", aggregator.path),
        graderName
      });
    } else if (aggregator.type === "llm-grader" && aggregator.promptPath) {
      references.push({
        kind: "llm_grader_prompt",
        displayPath: aggregator.prompt ?? aggregator.promptPath,
        resolvedPath: aggregator.promptPath,
        graderName
      });
    }
  }

  return references;
}
|
|
16965
|
+
/**
 * Removes duplicate source references, keeping the first occurrence of each.
 *
 * Two references are duplicates when they share the same `kind`, the same
 * path (`resolvedPath`, or `displayPath` when there is no resolved path), the
 * same `graderName` and the same `command`.
 *
 * @param {object[]} references - References in discovery order.
 * @returns {object[]} New array without duplicates, original order preserved.
 */
function dedupeSourceReferences(references) {
  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const reference of references) {
    const { kind, resolvedPath, displayPath, graderName, command } = reference;
    const identity = JSON.stringify([
      kind,
      resolvedPath ?? displayPath,
      graderName ?? "",
      command?.join("\0") ?? ""
    ]);
    if (!seenKeys.has(identity)) {
      seenKeys.add(identity);
      unique.push(reference);
    }
  }
  return unique;
}
|
|
16983
|
+
/**
 * Expresses `candidate` relative to `root` using forward slashes.
 *
 * @param {string} root - Directory the result should be relative to.
 * @param {string} candidate - Path to convert.
 * @returns {string|undefined} The `/`-separated relative path, or undefined when
 *   `candidate` equals `root`, lies outside `root`, or cannot be expressed
 *   relatively (`path.relative` returned an absolute path).
 */
function toPortableRelativePath(root, candidate) {
  const relative = path43.relative(root, candidate);
  // Only a leading ".." *segment* means the path escapes root. A plain prefix
  // test would also reject in-root names such as "..evals/a.yaml".
  const escapesRoot = relative === ".." || relative.startsWith(`..${path43.sep}`);
  if (!relative || escapesRoot || path43.isAbsolute(relative)) {
    return void 0;
  }
  return relative.split(path43.sep).join("/");
}
|
|
16010
16990
|
async function loadTestById(evalFilePath, repoRoot, evalId) {
|
|
16011
16991
|
const tests = await loadTests(evalFilePath, repoRoot);
|
|
16012
16992
|
const match = tests.find((c) => c.id === evalId);
|
|
@@ -16099,7 +17079,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
16099
17079
|
const workspaceFilePath = path43.resolve(evalFileDir, raw);
|
|
16100
17080
|
let content;
|
|
16101
17081
|
try {
|
|
16102
|
-
content = await
|
|
17082
|
+
content = await readFile16(workspaceFilePath, "utf8");
|
|
16103
17083
|
} catch {
|
|
16104
17084
|
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
16105
17085
|
}
|
|
@@ -16223,19 +17203,18 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
16223
17203
|
function asString5(value) {
|
|
16224
17204
|
return typeof value === "string" ? value : void 0;
|
|
16225
17205
|
}
|
|
16226
|
-
function
|
|
17206
|
+
/**
 * Builds the suite-level metadata payload that is merged into each test case.
 *
 * The payload starts as a shallow copy of `suite.metadata` (when that is an
 * object). A top-level `suite.governance` object then replaces any
 * `governance` key coming from the metadata; otherwise the metadata's own
 * `governance` value, if any, is kept as copied.
 *
 * The former `else` branch re-assigned `payload.governance` to itself and has
 * been removed; results are unchanged.
 *
 * @param {object} suite - Parsed suite object.
 * @returns {object|undefined} The payload, or undefined when it would be empty.
 */
function extractSuiteMetadataPayload(suite) {
  const payload = isJsonObject(suite.metadata) ? { ...suite.metadata } : {};
  const topLevelGovernance = suite.governance;
  if (isJsonObject(topLevelGovernance)) {
    payload.governance = topLevelGovernance;
  }
  return Object.keys(payload).length > 0 ? payload : void 0;
}
|
|
16240
17219
|
function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
|
|
16241
17220
|
if (!suitePayload) return caseMetadata;
|
|
@@ -16726,7 +17705,7 @@ async function runEvaluation(options) {
|
|
|
16726
17705
|
const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
|
|
16727
17706
|
if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
|
|
16728
17707
|
if (!dirExists) {
|
|
16729
|
-
await
|
|
17708
|
+
await mkdir15(configuredStaticPath, { recursive: true });
|
|
16730
17709
|
}
|
|
16731
17710
|
if (workspaceTemplate) {
|
|
16732
17711
|
await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
|
|
@@ -16771,7 +17750,7 @@ async function runEvaluation(options) {
|
|
|
16771
17750
|
}
|
|
16772
17751
|
} else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
|
|
16773
17752
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
16774
|
-
await
|
|
17753
|
+
await mkdir15(sharedWorkspacePath, { recursive: true });
|
|
16775
17754
|
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
16776
17755
|
}
|
|
16777
17756
|
try {
|
|
@@ -17621,7 +18600,7 @@ async function runEvalCase(options) {
|
|
|
17621
18600
|
}
|
|
17622
18601
|
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
17623
18602
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
17624
|
-
await
|
|
18603
|
+
await mkdir15(workspacePath, { recursive: true });
|
|
17625
18604
|
}
|
|
17626
18605
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
17627
18606
|
const localPathErrors = RepoManager.validateLocalPaths(evalCase.workspace.repos);
|
|
@@ -17676,7 +18655,7 @@ async function runEvalCase(options) {
|
|
|
17676
18655
|
const srcPath = path44.resolve(baseDir, relPath);
|
|
17677
18656
|
const destPath = path44.resolve(workspacePath, relPath);
|
|
17678
18657
|
try {
|
|
17679
|
-
await
|
|
18658
|
+
await mkdir15(path44.dirname(destPath), { recursive: true });
|
|
17680
18659
|
await copyFile2(srcPath, destPath);
|
|
17681
18660
|
} catch (error) {
|
|
17682
18661
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -19244,6 +20223,12 @@ async function evaluate(config) {
|
|
|
19244
20223
|
resolvedTarget = resolveTargetDefinition(targetDef);
|
|
19245
20224
|
}
|
|
19246
20225
|
const collectedResults = [];
|
|
20226
|
+
const cacheEnabled = shouldEnableCache({
|
|
20227
|
+
cliCache: config.cache === true,
|
|
20228
|
+
cliNoCache: false,
|
|
20229
|
+
yamlCache: config.cache === void 0 ? materialized.cache : void 0
|
|
20230
|
+
});
|
|
20231
|
+
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path45.resolve(materialized.cachePath) : void 0) : void 0;
|
|
19247
20232
|
const results = await runEvaluation({
|
|
19248
20233
|
testFilePath,
|
|
19249
20234
|
repoRoot,
|
|
@@ -19256,6 +20241,8 @@ async function evaluate(config) {
|
|
|
19256
20241
|
filter: config.filter,
|
|
19257
20242
|
threshold: config.threshold,
|
|
19258
20243
|
evalCases: materialized.tests,
|
|
20244
|
+
cache,
|
|
20245
|
+
useCache: !!cache && !shouldSkipCacheForTemperature(resolvedTarget.config),
|
|
19259
20246
|
...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
|
|
19260
20247
|
onResult: async (result) => {
|
|
19261
20248
|
collectedResults.push(result);
|
|
@@ -19286,6 +20273,7 @@ async function materializeEvalConfig(config, options) {
|
|
|
19286
20273
|
tests: tests2,
|
|
19287
20274
|
workers: config.workers ?? suite.workers,
|
|
19288
20275
|
cache: config.cache ?? suite.cacheConfig?.enabled,
|
|
20276
|
+
cachePath: config.cachePath ?? suite.cacheConfig?.cachePath,
|
|
19289
20277
|
budgetUsd: config.budgetUsd ?? suite.budgetUsd,
|
|
19290
20278
|
threshold: config.threshold ?? suite.threshold,
|
|
19291
20279
|
metadata: config.metadata ?? suite.metadata,
|
|
@@ -19304,6 +20292,7 @@ async function materializeEvalConfig(config, options) {
|
|
|
19304
20292
|
tests,
|
|
19305
20293
|
workers: config.workers,
|
|
19306
20294
|
cache: config.cache,
|
|
20295
|
+
cachePath: config.cachePath,
|
|
19307
20296
|
budgetUsd: config.budgetUsd,
|
|
19308
20297
|
threshold: config.threshold,
|
|
19309
20298
|
metadata: config.metadata,
|
|
@@ -19421,9 +20410,11 @@ function mapAssertionType(type) {
|
|
|
19421
20410
|
}
|
|
19422
20411
|
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
19423
20412
|
const total = results.length;
|
|
20413
|
+
const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
|
|
20414
|
+
const executionErrors = total - qualityResults.length;
|
|
19424
20415
|
let passed = 0;
|
|
19425
20416
|
let scoreSum = 0;
|
|
19426
|
-
for (const r of
|
|
20417
|
+
for (const r of qualityResults) {
|
|
19427
20418
|
scoreSum += r.score;
|
|
19428
20419
|
if (r.score >= threshold) {
|
|
19429
20420
|
passed++;
|
|
@@ -19432,9 +20423,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
|
19432
20423
|
return {
|
|
19433
20424
|
total,
|
|
19434
20425
|
passed,
|
|
19435
|
-
failed:
|
|
20426
|
+
failed: qualityResults.length - passed,
|
|
20427
|
+
executionErrors,
|
|
19436
20428
|
durationMs,
|
|
19437
|
-
meanScore:
|
|
20429
|
+
meanScore: qualityResults.length > 0 ? scoreSum / qualityResults.length : 0
|
|
19438
20430
|
};
|
|
19439
20431
|
}
|
|
19440
20432
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
@@ -19517,7 +20509,12 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
|
|
|
19517
20509
|
return {
|
|
19518
20510
|
tests: materialized.tests,
|
|
19519
20511
|
...materialized.workers !== void 0 && { workers: materialized.workers },
|
|
19520
|
-
...materialized.cache !== void 0 && {
|
|
20512
|
+
...materialized.cache !== void 0 && {
|
|
20513
|
+
cacheConfig: {
|
|
20514
|
+
enabled: materialized.cache,
|
|
20515
|
+
...materialized.cachePath !== void 0 && { cachePath: materialized.cachePath }
|
|
20516
|
+
}
|
|
20517
|
+
},
|
|
19521
20518
|
...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
|
|
19522
20519
|
...materialized.threshold !== void 0 && { threshold: materialized.threshold },
|
|
19523
20520
|
...materialized.metadata !== void 0 && { metadata: materialized.metadata },
|
|
@@ -19540,7 +20537,28 @@ function isEvalConfigLike(value) {
|
|
|
19540
20537
|
}
|
|
19541
20538
|
|
|
19542
20539
|
export {
|
|
20540
|
+
NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
|
|
20541
|
+
NORMALIZED_TRACE_SOURCE_KINDS,
|
|
20542
|
+
NORMALIZED_TRACE_EVENT_TYPES,
|
|
20543
|
+
NORMALIZED_TOOL_STATUSES,
|
|
20544
|
+
NORMALIZED_REDACTION_LEVELS,
|
|
20545
|
+
NormalizedRedactionStateWireSchema,
|
|
20546
|
+
NormalizedTraceErrorWireSchema,
|
|
20547
|
+
NormalizedTraceSourceWireSchema,
|
|
20548
|
+
NormalizedTraceSessionWireSchema,
|
|
20549
|
+
NormalizedTraceBranchWireSchema,
|
|
20550
|
+
NormalizedTraceSourceRefWireSchema,
|
|
20551
|
+
NormalizedRawEvidenceWireSchema,
|
|
20552
|
+
NormalizedTraceMessageWireSchema,
|
|
20553
|
+
NormalizedTraceModelWireSchema,
|
|
20554
|
+
NormalizedTraceToolWireSchema,
|
|
20555
|
+
NormalizedTraceEventWireSchema,
|
|
20556
|
+
NormalizedTrajectoryWireSchema,
|
|
20557
|
+
toNormalizedTrajectoryWire,
|
|
20558
|
+
fromNormalizedTrajectoryWire,
|
|
19543
20559
|
computeTraceSummary,
|
|
20560
|
+
getSelectedTrajectoryEvents,
|
|
20561
|
+
computeTraceSummaryFromTrajectory,
|
|
19544
20562
|
DEFAULT_EXPLORATION_TOOLS,
|
|
19545
20563
|
explorationRatio,
|
|
19546
20564
|
tokensPerTool,
|
|
@@ -19559,11 +20577,15 @@ export {
|
|
|
19559
20577
|
extractCacheConfig,
|
|
19560
20578
|
extractFailOnError,
|
|
19561
20579
|
extractThreshold,
|
|
20580
|
+
resolveResultsConfigForProject,
|
|
19562
20581
|
detectFormat,
|
|
19563
20582
|
parseRepoSource,
|
|
19564
20583
|
parseRepoCheckout,
|
|
19565
20584
|
parseRepoClone,
|
|
19566
20585
|
buildPromptInputs,
|
|
20586
|
+
ResponseCache,
|
|
20587
|
+
shouldEnableCache,
|
|
20588
|
+
shouldSkipCacheForTemperature,
|
|
19567
20589
|
DEFAULT_THRESHOLD,
|
|
19568
20590
|
PASS_THRESHOLD,
|
|
19569
20591
|
scoreToVerdict,
|
|
@@ -19574,12 +20596,6 @@ export {
|
|
|
19574
20596
|
parseJsonSafe,
|
|
19575
20597
|
deepEqual,
|
|
19576
20598
|
negateScore,
|
|
19577
|
-
getAgentvConfigDir,
|
|
19578
|
-
getAgentvHome,
|
|
19579
|
-
getWorkspacesRoot,
|
|
19580
|
-
getSubagentsRoot,
|
|
19581
|
-
getTraceStateRoot,
|
|
19582
|
-
getWorkspacePoolRoot,
|
|
19583
20599
|
toSnakeCaseDeep,
|
|
19584
20600
|
toCamelCaseDeep,
|
|
19585
20601
|
CodeGrader,
|
|
@@ -19672,4 +20688,4 @@ export {
|
|
|
19672
20688
|
loadTestById,
|
|
19673
20689
|
loadEvalCaseById
|
|
19674
20690
|
};
|
|
19675
|
-
//# sourceMappingURL=chunk-
|
|
20691
|
+
//# sourceMappingURL=chunk-7QB53OPK.js.map
|