@agentv/core 4.32.0-next.1 → 4.33.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-N5EU446L.js → chunk-7QB53OPK.js} +1277 -265
- package/dist/chunk-7QB53OPK.js.map +1 -0
- package/dist/{chunk-5RQMJZDJ.js → chunk-EW5X2RGJ.js} +110 -50
- package/dist/chunk-EW5X2RGJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +196 -87
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +3 -1
- package/dist/evaluation/validation/index.d.ts +3 -1
- package/dist/evaluation/validation/index.js +170 -75
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +2346 -853
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1826 -62
- package/dist/index.d.ts +1826 -62
- package/dist/index.js +624 -197
- package/dist/index.js.map +1 -1
- package/dist/{ts-eval-loader-Z6IUSDNA.js → ts-eval-loader-EQJX3OLT.js} +3 -3
- package/package.json +2 -2
- package/dist/chunk-5RQMJZDJ.js.map +0 -1
- package/dist/chunk-N5EU446L.js.map +0 -1
- /package/dist/{ts-eval-loader-Z6IUSDNA.js.map → ts-eval-loader-EQJX3OLT.js.map} +0 -0
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
import {
|
|
2
2
|
LLM_GRADER_CAPABLE_KINDS,
|
|
3
|
+
RUBRIC_OPERATOR_VALUES,
|
|
3
4
|
buildDirectoryChain,
|
|
4
5
|
expandFileReferences,
|
|
5
6
|
extractLastAssistantContent,
|
|
6
7
|
fileExists,
|
|
7
8
|
findGitRoot,
|
|
9
|
+
getAgentvConfigDir,
|
|
10
|
+
getAgentvDataDir,
|
|
11
|
+
getSubagentsRoot,
|
|
12
|
+
getWorkspacePoolRoot,
|
|
13
|
+
getWorkspacesRoot,
|
|
8
14
|
interpolateEnv,
|
|
9
15
|
interpolateTemplateVars,
|
|
10
16
|
isAgentProvider,
|
|
@@ -18,7 +24,7 @@ import {
|
|
|
18
24
|
readTextFile,
|
|
19
25
|
resolveDelegatedTargetDefinition,
|
|
20
26
|
resolveTargetDefinition
|
|
21
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-EW5X2RGJ.js";
|
|
22
28
|
import {
|
|
23
29
|
execFileWithStdin,
|
|
24
30
|
execShellWithStdin
|
|
@@ -41,6 +47,49 @@ import { existsSync as existsSync6 } from "node:fs";
|
|
|
41
47
|
import path45 from "node:path";
|
|
42
48
|
import micromatch4 from "micromatch";
|
|
43
49
|
|
|
50
|
+
// src/evaluation/cache/response-cache.ts
|
|
51
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
52
|
+
import path from "node:path";
|
|
53
|
+
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
54
|
+
/**
 * File-backed cache of JSON-serializable values.
 *
 * Each entry is stored at `<cachePath>/<first two characters of key>/<key>.json`.
 */
var ResponseCache = class {
  /** Root directory under which cache entries are stored. */
  cachePath;
  /**
   * @param {string} [cachePath] - Cache root; DEFAULT_CACHE_PATH is used when
   *   the argument is null or undefined.
   */
  constructor(cachePath) {
    this.cachePath = cachePath ?? DEFAULT_CACHE_PATH;
  }
  /**
   * Look up a cached entry.
   *
   * @param {string} key - Cache key.
   * @returns {Promise<unknown>} The parsed JSON value, or `undefined` when the
   *   entry file cannot be read or parsed.
   */
  async get(key) {
    // Resolved outside the try so a malformed key still rejects instead of
    // being reported as a cache miss.
    const entryPath = this.keyToPath(key);
    try {
      return JSON.parse(await readFile(entryPath, "utf8"));
    } catch {
      return void 0;
    }
  }
  /**
   * Store a value, creating the entry's parent directory when needed.
   *
   * @param {string} key - Cache key.
   * @param {unknown} value - Value to serialize as 2-space-indented JSON.
   * @returns {Promise<void>}
   */
  async set(key, value) {
    const entryPath = this.keyToPath(key);
    await mkdir(path.dirname(entryPath), { recursive: true });
    const serialized = JSON.stringify(value, null, 2);
    await writeFile(entryPath, serialized, "utf8");
  }
  /**
   * Map a cache key to its on-disk location.
   *
   * @param {string} key - Cache key.
   * @returns {string} Path of the JSON file backing this key.
   */
  keyToPath(key) {
    const shard = key.slice(0, 2);
    const fileName = `${key}.json`;
    return path.join(this.cachePath, shard, fileName);
  }
};
|
|
79
|
+
/**
 * Decide whether response caching is enabled.
 *
 * Precedence, highest first: `cliNoCache`, `cliCache`, `yamlCache` (when
 * defined), then `tsConfigCache` (which must be exactly `true`).
 *
 * @param {{cliNoCache?: boolean, cliCache?: boolean, yamlCache?: boolean, tsConfigCache?: boolean}} params
 * @returns {boolean} The resolved setting; a defined `yamlCache` is returned as given.
 */
function shouldEnableCache(params) {
  const { cliNoCache, cliCache, yamlCache, tsConfigCache } = params;
  if (cliNoCache) {
    return false;
  }
  if (cliCache) {
    return true;
  }
  return yamlCache !== void 0 ? yamlCache : tsConfigCache === true;
}
|
|
85
|
+
/**
 * Report whether a target's temperature setting should bypass the cache.
 *
 * @param {{temperature?: unknown}} targetConfig - Target configuration.
 * @returns {boolean} `true` only when `temperature` is a number greater than 0.
 */
function shouldSkipCacheForTemperature(targetConfig) {
  const { temperature } = targetConfig;
  return typeof temperature === "number" && temperature > 0;
}
|
|
92
|
+
|
|
44
93
|
// src/evaluation/graders/scoring.ts
|
|
45
94
|
var DEFAULT_THRESHOLD = 0.8;
|
|
46
95
|
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
@@ -133,7 +182,7 @@ function negateScore(score) {
|
|
|
133
182
|
import { execFile as execFile3 } from "node:child_process";
|
|
134
183
|
import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
|
|
135
184
|
import { existsSync as existsSync5 } from "node:fs";
|
|
136
|
-
import { copyFile as copyFile2, mkdir as
|
|
185
|
+
import { copyFile as copyFile2, mkdir as mkdir15, readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
137
186
|
import path44 from "node:path";
|
|
138
187
|
import { promisify as promisify7 } from "node:util";
|
|
139
188
|
import micromatch3 from "micromatch";
|
|
@@ -277,38 +326,8 @@ function validateConcurrency(concurrency) {
|
|
|
277
326
|
}
|
|
278
327
|
}
|
|
279
328
|
|
|
280
|
-
// src/paths.ts
|
|
281
|
-
import os from "node:os";
|
|
282
|
-
import path from "node:path";
|
|
283
|
-
function readEnvPath(name) {
|
|
284
|
-
const value = process.env[name];
|
|
285
|
-
if (!value || value === "undefined") return void 0;
|
|
286
|
-
return value;
|
|
287
|
-
}
|
|
288
|
-
function getAgentvConfigDir() {
|
|
289
|
-
return readEnvPath("AGENTV_HOME") ?? path.join(os.homedir(), ".agentv");
|
|
290
|
-
}
|
|
291
|
-
function getAgentvHome() {
|
|
292
|
-
return getAgentvConfigDir();
|
|
293
|
-
}
|
|
294
|
-
function getAgentvDataDir() {
|
|
295
|
-
return readEnvPath("AGENTV_DATA_DIR") ?? getAgentvConfigDir();
|
|
296
|
-
}
|
|
297
|
-
function getWorkspacesRoot() {
|
|
298
|
-
return path.join(getAgentvDataDir(), "workspaces");
|
|
299
|
-
}
|
|
300
|
-
function getSubagentsRoot() {
|
|
301
|
-
return path.join(getAgentvDataDir(), "subagents");
|
|
302
|
-
}
|
|
303
|
-
function getTraceStateRoot() {
|
|
304
|
-
return path.join(getAgentvDataDir(), "trace-state");
|
|
305
|
-
}
|
|
306
|
-
function getWorkspacePoolRoot() {
|
|
307
|
-
return path.join(getAgentvDataDir(), "workspace-pool");
|
|
308
|
-
}
|
|
309
|
-
|
|
310
329
|
// src/evaluation/graders/code-grader.ts
|
|
311
|
-
import { mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
330
|
+
import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
|
|
312
331
|
import { tmpdir } from "node:os";
|
|
313
332
|
import { dirname, join } from "node:path";
|
|
314
333
|
|
|
@@ -642,7 +661,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
|
|
|
642
661
|
const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
|
|
643
662
|
const dir = await getWorkDir();
|
|
644
663
|
const filePath = join(dir, `img-${counter++}.${ext}`);
|
|
645
|
-
await
|
|
664
|
+
await writeFile2(filePath, Buffer.from(base64Data, "base64"));
|
|
646
665
|
blocks.push({ type: "image", media_type: img.media_type, path: filePath });
|
|
647
666
|
} else {
|
|
648
667
|
blocks.push({ type: "image", media_type: img.media_type, path: img.source });
|
|
@@ -685,7 +704,7 @@ var CodeGrader = class {
|
|
|
685
704
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
686
705
|
const tmpDir = await mkdtemp(join(tmpdir(), "agentv-grader-"));
|
|
687
706
|
outputPath = join(tmpDir, "output.json");
|
|
688
|
-
await
|
|
707
|
+
await writeFile2(outputPath, serialized);
|
|
689
708
|
outputForPayload = null;
|
|
690
709
|
}
|
|
691
710
|
}
|
|
@@ -702,6 +721,7 @@ var CodeGrader = class {
|
|
|
702
721
|
context.evalCase.input,
|
|
703
722
|
getImageDir
|
|
704
723
|
),
|
|
724
|
+
metadata: context.evalCase.metadata ?? null,
|
|
705
725
|
trace: context.trace ?? null,
|
|
706
726
|
tokenUsage: context.tokenUsage ?? null,
|
|
707
727
|
costUsd: context.costUsd ?? null,
|
|
@@ -874,7 +894,7 @@ import path3 from "node:path";
|
|
|
874
894
|
import { z } from "zod";
|
|
875
895
|
|
|
876
896
|
// src/evaluation/content-preprocessor.ts
|
|
877
|
-
import { readFile } from "node:fs/promises";
|
|
897
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
878
898
|
import path2 from "node:path";
|
|
879
899
|
import { fileURLToPath } from "node:url";
|
|
880
900
|
var MIME_TYPE_ALIASES = {
|
|
@@ -943,7 +963,7 @@ async function preprocessContentFile(block, preprocessors, basePath) {
|
|
|
943
963
|
return runContentPreprocessor(block, resolvedPath, preprocessor);
|
|
944
964
|
}
|
|
945
965
|
try {
|
|
946
|
-
const buffer = await
|
|
966
|
+
const buffer = await readFile2(resolvedPath);
|
|
947
967
|
const text = buffer.toString("utf8").replace(/\r\n/g, "\n");
|
|
948
968
|
if (buffer.includes(0) || text.includes(REPLACEMENT_CHAR)) {
|
|
949
969
|
return {
|
|
@@ -1039,6 +1059,10 @@ ${text}`;
|
|
|
1039
1059
|
var TEMPLATE_VARIABLES = {
|
|
1040
1060
|
EXPECTED_OUTPUT: "expected_output",
|
|
1041
1061
|
CRITERIA: "criteria",
|
|
1062
|
+
METADATA: "metadata",
|
|
1063
|
+
METADATA_JSON: "metadata_json",
|
|
1064
|
+
RUBRICS: "rubrics",
|
|
1065
|
+
RUBRICS_JSON: "rubrics_json",
|
|
1042
1066
|
INPUT: "input",
|
|
1043
1067
|
OUTPUT: "output",
|
|
1044
1068
|
FILE_CHANGES: "file_changes",
|
|
@@ -1061,6 +1085,27 @@ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
|
|
|
1061
1085
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
|
|
1062
1086
|
]);
|
|
1063
1087
|
|
|
1088
|
+
// src/evaluation/graders/rubric-operators.ts
|
|
1089
|
+
// Grader-prompt guidance sentence for each rubric operator that has one.
var OPERATOR_GUIDANCE = {
  correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
  contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
};
/**
 * Format the inline operator suffix for a rubric line.
 *
 * @param {string | undefined} operator - Rubric operator, if any.
 * @returns {string} ` (operator: <operator>)`, or an empty string when the operator is falsy.
 */
function formatRubricOperatorLabel(operator) {
  return operator ? ` (operator: ${operator})` : "";
}
/**
 * Collect the guidance sentences for the distinct operators used by a rubric list.
 *
 * Guidance is returned in first-seen operator order, one entry per operator.
 * Operators without an own entry in OPERATOR_GUIDANCE are skipped, so the
 * result never contains `undefined` or an inherited prototype member.
 *
 * @param {Iterable<{operator?: string}>} rubrics - Rubric items to scan.
 * @returns {string[]} Guidance sentences; empty when no known operator is used.
 */
function formatRubricOperatorGuidance(rubrics) {
  const seenOperators = new Set();
  const guidance = [];
  for (const rubric of rubrics) {
    const operator = rubric.operator;
    if (!operator || seenOperators.has(operator)) {
      continue;
    }
    seenOperators.add(operator);
    // Own-property check: a plain lookup would resolve names such as
    // "constructor" through Object.prototype and return undefined for typos.
    if (Object.hasOwn(OPERATOR_GUIDANCE, operator)) {
      guidance.push(OPERATOR_GUIDANCE[operator]);
    }
  }
  return guidance;
}
|
|
1108
|
+
|
|
1064
1109
|
// src/evaluation/graders/llm-grader.ts
|
|
1065
1110
|
var DEFAULT_MAX_STEPS = 10;
|
|
1066
1111
|
var MAX_STEPS_LIMIT = 50;
|
|
@@ -1143,6 +1188,32 @@ var scoreRangeEvaluationSchema = z.object({
|
|
|
1143
1188
|
checks: z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
1144
1189
|
overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
1145
1190
|
});
|
|
1191
|
+
/**
 * Serialize a value as 2-space-indented JSON for template substitution.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Indented JSON, or an empty string when `value` is undefined.
 */
function stringifyPretty(value) {
  if (value === void 0) {
    return "";
  }
  return JSON.stringify(value, null, 2);
}
|
|
1194
|
+
/**
 * Serialize a value as single-line JSON for template substitution.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Compact JSON, or an empty string when `value` is undefined.
 */
function stringifyCompact(value) {
  if (value === void 0) {
    return "";
  }
  return JSON.stringify(value);
}
|
|
1197
|
+
/**
 * Build the substitution map used to render grader prompt templates.
 *
 * The question comes from `context.promptInputs.question` when it is non-blank,
 * otherwise from `context.evalCase.question`. Rubrics are included only when
 * the evaluator type is "llm-grader". Metadata and rubrics are each exposed
 * twice: indented JSON and compact JSON.
 *
 * @param {object} context - Evaluation context (promptInputs, evalCase,
 *   evaluator, candidate, fileChanges, toolCalls).
 * @returns {Record<string, string>} Values keyed by TEMPLATE_VARIABLES names,
 *   including the deprecated `*_TEXT` aliases.
 */
function buildTemplateVariables(context) {
  const { promptInputs, evalCase, evaluator } = context;
  const promptQuestion = promptInputs.question;
  const question = promptQuestion && promptQuestion.trim().length > 0 ? promptQuestion : evalCase.question;
  const rubrics = evaluator?.type === "llm-grader" ? evaluator.rubrics : void 0;

  // Computed once; the deprecated aliases below reuse the same strings.
  const input = question.trim();
  const output = context.candidate.trim();
  const expectedOutput = (evalCase.reference_answer ?? "").trim();
  const metadata = evalCase.metadata;

  return {
    [TEMPLATE_VARIABLES.INPUT]: input,
    [TEMPLATE_VARIABLES.OUTPUT]: output,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedOutput,
    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(metadata),
    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(metadata),
    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
    [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
    [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
    // Deprecated aliases — same values as the primary variables above
    [TEMPLATE_VARIABLES.INPUT_TEXT]: input,
    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: output,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedOutput
  };
}
|
|
1146
1217
|
function resolveContentBasePath(context) {
|
|
1147
1218
|
if (context.workspacePath) {
|
|
1148
1219
|
return context.workspacePath;
|
|
@@ -1214,19 +1285,7 @@ var LlmGrader = class {
|
|
|
1214
1285
|
// LLM mode (existing)
|
|
1215
1286
|
// ---------------------------------------------------------------------------
|
|
1216
1287
|
async evaluateFreeform(context, graderProvider) {
|
|
1217
|
-
const
|
|
1218
|
-
const variables = {
|
|
1219
|
-
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
1220
|
-
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
1221
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1222
|
-
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
1223
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1224
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1225
|
-
// Deprecated aliases — same values as the primary variables above
|
|
1226
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1227
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1228
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1229
|
-
};
|
|
1288
|
+
const variables = buildTemplateVariables(context);
|
|
1230
1289
|
const systemPrompt = buildOutputSchema();
|
|
1231
1290
|
const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
1232
1291
|
warnDeprecatedTemplateVars(graderTemplate);
|
|
@@ -1293,7 +1352,7 @@ ${context.toolCalls}`;
|
|
|
1293
1352
|
if (hasScoreRanges) {
|
|
1294
1353
|
return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
|
|
1295
1354
|
}
|
|
1296
|
-
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
1355
|
+
const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildRubricPrompt(context, rubrics);
|
|
1297
1356
|
const systemPrompt = buildRubricOutputSchema();
|
|
1298
1357
|
const graderRawRequest = {
|
|
1299
1358
|
userPrompt: prompt,
|
|
@@ -1338,7 +1397,7 @@ ${context.toolCalls}`;
|
|
|
1338
1397
|
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
1339
1398
|
*/
|
|
1340
1399
|
async evaluateWithScoreRanges(context, graderProvider, rubrics) {
|
|
1341
|
-
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
1400
|
+
const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildScoreRangePrompt(context, rubrics);
|
|
1342
1401
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
1343
1402
|
const graderRawRequest = {
|
|
1344
1403
|
userPrompt: prompt,
|
|
@@ -1557,21 +1616,11 @@ ${context.toolCalls}`;
|
|
|
1557
1616
|
*/
|
|
1558
1617
|
buildAgentUserPrompt(context) {
|
|
1559
1618
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1560
|
-
const variables =
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1566
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1567
|
-
// Deprecated aliases
|
|
1568
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1569
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1570
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1571
|
-
};
|
|
1572
|
-
if (this.graderTemplate) {
|
|
1573
|
-
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
1574
|
-
return substituteVariables(this.graderTemplate, variables);
|
|
1619
|
+
const variables = buildTemplateVariables(context);
|
|
1620
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
1621
|
+
if (template) {
|
|
1622
|
+
warnDeprecatedTemplateVars(template);
|
|
1623
|
+
return substituteVariables(template, variables);
|
|
1575
1624
|
}
|
|
1576
1625
|
const config = context.evaluator;
|
|
1577
1626
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
@@ -1621,21 +1670,11 @@ ${context.toolCalls}`;
|
|
|
1621
1670
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1622
1671
|
const config = context.evaluator;
|
|
1623
1672
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1630
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1631
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1632
|
-
// Deprecated aliases
|
|
1633
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1634
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
1635
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
1636
|
-
};
|
|
1637
|
-
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
1638
|
-
const customPrompt = substituteVariables(this.graderTemplate, variables);
|
|
1673
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
1674
|
+
if (template) {
|
|
1675
|
+
const variables = buildTemplateVariables(context);
|
|
1676
|
+
warnDeprecatedTemplateVars(template);
|
|
1677
|
+
const customPrompt = substituteVariables(template, variables);
|
|
1639
1678
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
1640
1679
|
return `${customPrompt}
|
|
1641
1680
|
|
|
@@ -1761,6 +1800,9 @@ ${outputSchema}`;
|
|
|
1761
1800
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
1762
1801
|
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
1763
1802
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
1803
|
+
if (rubric.operator) {
|
|
1804
|
+
parts.push(`Operator: ${rubric.operator}`);
|
|
1805
|
+
}
|
|
1764
1806
|
if (rubric.outcome) {
|
|
1765
1807
|
parts.push(`Description: ${rubric.outcome}`);
|
|
1766
1808
|
}
|
|
@@ -1773,12 +1815,21 @@ ${outputSchema}`;
|
|
|
1773
1815
|
}
|
|
1774
1816
|
}
|
|
1775
1817
|
}
|
|
1818
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
1819
|
+
if (operatorGuidance.length > 0) {
|
|
1820
|
+
parts.push("", ...operatorGuidance);
|
|
1821
|
+
}
|
|
1776
1822
|
parts.push(
|
|
1777
1823
|
"",
|
|
1778
1824
|
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
1779
1825
|
);
|
|
1780
1826
|
return parts.join("\n");
|
|
1781
1827
|
}
|
|
1828
|
+
buildCustomPrompt(context) {
|
|
1829
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
|
|
1830
|
+
warnDeprecatedTemplateVars(template);
|
|
1831
|
+
return substituteVariables(template, buildTemplateVariables(context));
|
|
1832
|
+
}
|
|
1782
1833
|
buildRubricPrompt(context, rubrics) {
|
|
1783
1834
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
1784
1835
|
const parts = [
|
|
@@ -1802,10 +1853,21 @@ ${outputSchema}`;
|
|
|
1802
1853
|
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
1803
1854
|
}
|
|
1804
1855
|
parts.push("[[ ## rubrics ## ]]");
|
|
1856
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
1857
|
+
if (operatorGuidance.length > 0) {
|
|
1858
|
+
parts.push("", "Operator guidance:");
|
|
1859
|
+
for (const guidance of operatorGuidance) {
|
|
1860
|
+
parts.push(`- ${guidance}`);
|
|
1861
|
+
}
|
|
1862
|
+
parts.push("");
|
|
1863
|
+
}
|
|
1805
1864
|
for (const rubric of rubrics) {
|
|
1806
1865
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
1807
1866
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
1808
|
-
|
|
1867
|
+
const operatorLabel = formatRubricOperatorLabel(rubric.operator);
|
|
1868
|
+
parts.push(
|
|
1869
|
+
`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`
|
|
1870
|
+
);
|
|
1809
1871
|
}
|
|
1810
1872
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
1811
1873
|
return parts.join("\n");
|
|
@@ -2537,6 +2599,385 @@ var CostGrader = class {
|
|
|
2537
2599
|
};
|
|
2538
2600
|
|
|
2539
2601
|
// src/evaluation/trace.ts
|
|
2602
|
+
import { z as z2 } from "zod";
|
|
2603
|
+
// Value required in the `schema_version` field of a normalized trajectory.
var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trace.v1";

// Allowed values for a trace source's `kind`.
var NORMALIZED_TRACE_SOURCE_KINDS = ["agentv_run", "otlp", "phoenix", "langfuse", "pi_session", "imported_transcript", "compact_transcript"];

// Allowed values for a trace event's `type`.
var NORMALIZED_TRACE_EVENT_TYPES = ["message", "model_turn", "tool_call", "tool_result"];

// Allowed values for a tool record's `status`.
var NORMALIZED_TOOL_STATUSES = [
  "ok",
  "error",
  "timeout",
  "cancelled",
  "unknown"
];

// Allowed values for a redaction state's `level`.
var NORMALIZED_REDACTION_LEVELS = [
  "none",
  "partial",
  "full"
];
|
|
2621
|
+
/**
 * Copy an object's own enumerable properties, leaving out those whose value
 * is `undefined`. Other falsy values (null, 0, "", false) are kept.
 *
 * @param {object} value - Source object; not modified.
 * @returns {object} A new object without the undefined-valued properties.
 */
function omitUndefinedProperties(value) {
  const definedEntries = Object.entries(value).filter((entry) => entry[1] !== void 0);
  return Object.fromEntries(definedEntries);
}
|
|
2626
|
+
// ---------------------------------------------------------------------------
// Wire schemas for normalized trajectories. Field names are snake_case; the
// to*/from* converters in this module map them to and from camelCase objects.
// ---------------------------------------------------------------------------

// Free-form metadata bag: string keys, values of any type.
var MetadataWireSchema = z2.record(z2.string(), z2.unknown());

// Token counts; `input` and `output` are required.
var TokenUsageWireSchema = z2.object({
  input: z2.number(),
  output: z2.number(),
  cached: z2.number().optional(),
  reasoning: z2.number().optional()
});

// Redaction state; `level` must be one of NORMALIZED_REDACTION_LEVELS.
var NormalizedRedactionStateWireSchema = z2.object({
  level: z2.enum(NORMALIZED_REDACTION_LEVELS),
  fields: z2.array(z2.string()).optional(),
  reason: z2.string().optional()
});

// Error record; only `message` is required.
var NormalizedTraceErrorWireSchema = z2.object({
  message: z2.string(),
  name: z2.string().optional(),
  code: z2.string().optional(),
  stack: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});

// Trace source; `kind` must be one of NORMALIZED_TRACE_SOURCE_KINDS.
var NormalizedTraceSourceWireSchema = z2.object({
  kind: z2.enum(NORMALIZED_TRACE_SOURCE_KINDS),
  path: z2.string().optional(),
  url: z2.string().optional(),
  provider: z2.string().optional(),
  format: z2.string().optional(),
  version: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});

// Session record; every field is optional.
var NormalizedTraceSessionWireSchema = z2.object({
  session_id: z2.string().optional(),
  conversation_id: z2.string().optional(),
  cwd: z2.string().optional(),
  started_at: z2.string().optional(),
  ended_at: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});

// Branch selection record; every field is optional.
var NormalizedTraceBranchWireSchema = z2.object({
  selected_leaf_id: z2.string().optional(),
  selected_path_ids: z2.array(z2.string()).optional(),
  included_event_ids: z2.array(z2.string()).optional(),
  omitted_event_ids: z2.array(z2.string()).optional(),
  selection_reason: z2.string().optional()
});

// Source reference for an event; `line`, when present, is a non-negative integer.
var NormalizedTraceSourceRefWireSchema = z2.object({
  event_id: z2.string().optional(),
  message_id: z2.string().optional(),
  span_id: z2.string().optional(),
  trace_id: z2.string().optional(),
  raw_kind: z2.string().optional(),
  path: z2.string().optional(),
  line: z2.number().int().nonnegative().optional(),
  metadata: MetadataWireSchema.optional()
});

// Raw evidence item; only `kind` is required.
var NormalizedRawEvidenceWireSchema = z2.object({
  kind: z2.string(),
  ref: z2.string().optional(),
  media_type: z2.string().optional(),
  content: z2.unknown().optional(),
  redacted: z2.boolean().optional(),
  metadata: MetadataWireSchema.optional()
});

// Message payload of an event; only `role` is required.
var NormalizedTraceMessageWireSchema = z2.object({
  role: z2.string(),
  name: z2.string().optional(),
  content: z2.unknown().optional(),
  redaction: NormalizedRedactionStateWireSchema.optional(),
  token_usage: TokenUsageWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

// Model payload of an event; every field is optional.
var NormalizedTraceModelWireSchema = z2.object({
  provider: z2.string().optional(),
  name: z2.string().optional(),
  invocation_id: z2.string().optional(),
  token_usage: TokenUsageWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

// Tool payload of an event; `status`, when present, must be one of NORMALIZED_TOOL_STATUSES.
var NormalizedTraceToolWireSchema = z2.object({
  name: z2.string(),
  call_id: z2.string().optional(),
  input: z2.unknown().optional(),
  output: z2.unknown().optional(),
  status: z2.enum(NORMALIZED_TOOL_STATUSES).optional(),
  error: NormalizedTraceErrorWireSchema.optional(),
  redaction: NormalizedRedactionStateWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

// One trace event. `event_id`, `ordinal` (non-negative integer) and `type`
// (one of NORMALIZED_TRACE_EVENT_TYPES) are required.
var NormalizedTraceEventWireSchema = z2.object({
  event_id: z2.string(),
  parent_event_id: z2.string().optional(),
  ordinal: z2.number().int().nonnegative(),
  type: z2.enum(NORMALIZED_TRACE_EVENT_TYPES),
  timestamp: z2.string().optional(),
  duration_ms: z2.number().nonnegative().optional(),
  duration_inferred: z2.boolean().optional(),
  turn_index: z2.number().int().nonnegative().optional(),
  message: NormalizedTraceMessageWireSchema.optional(),
  model: NormalizedTraceModelWireSchema.optional(),
  tool: NormalizedTraceToolWireSchema.optional(),
  source_ref: NormalizedTraceSourceRefWireSchema.optional(),
  raw_evidence: z2.array(NormalizedRawEvidenceWireSchema).optional(),
  redaction: NormalizedRedactionStateWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

// Top-level trajectory document. `schema_version` must equal
// NORMALIZED_TRAJECTORY_SCHEMA_VERSION; `source`, `session` and `events` are required.
var NormalizedTrajectoryWireSchema = z2.object({
  schema_version: z2.literal(NORMALIZED_TRAJECTORY_SCHEMA_VERSION),
  source: NormalizedTraceSourceWireSchema,
  session: NormalizedTraceSessionWireSchema,
  branch: NormalizedTraceBranchWireSchema.optional(),
  events: z2.array(NormalizedTraceEventWireSchema),
  token_usage: TokenUsageWireSchema.optional(),
  cost_usd: z2.number().optional(),
  duration_ms: z2.number().optional(),
  started_at: z2.string().optional(),
  ended_at: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});
|
|
2742
|
+
/**
 * Convert a camelCase trajectory into its snake_case wire form.
 *
 * Undefined-valued properties are dropped, then the result is passed through
 * `NormalizedTrajectoryWireSchema.parse`.
 *
 * @param {object} trajectory - Normalized trajectory with camelCase fields.
 * @returns {object} The parsed wire-format trajectory.
 */
function toNormalizedTrajectoryWire(trajectory) {
  const { branch } = trajectory;
  const candidate = omitUndefinedProperties({
    schema_version: trajectory.schemaVersion,
    source: toNormalizedTraceSourceWire(trajectory.source),
    session: toNormalizedTraceSessionWire(trajectory.session),
    branch: branch ? toNormalizedTraceBranchWire(branch) : void 0,
    events: trajectory.events.map(toNormalizedTraceEventWire),
    token_usage: trajectory.tokenUsage,
    cost_usd: trajectory.costUsd,
    duration_ms: trajectory.durationMs,
    started_at: trajectory.startedAt,
    ended_at: trajectory.endedAt,
    metadata: trajectory.metadata
  });
  return NormalizedTrajectoryWireSchema.parse(candidate);
}
|
|
2759
|
+
/**
 * Parse a snake_case wire trajectory and convert it to camelCase form.
 *
 * The input is first passed through `NormalizedTrajectoryWireSchema.parse`.
 *
 * @param {unknown} input - Candidate wire-format trajectory.
 * @returns {object} Trajectory with camelCase fields; optional fields absent
 *   from the input are present with value `undefined`.
 */
function fromNormalizedTrajectoryWire(input) {
  const {
    schema_version: schemaVersion,
    source,
    session,
    branch,
    events,
    token_usage: tokenUsage,
    cost_usd: costUsd,
    duration_ms: durationMs,
    started_at: startedAt,
    ended_at: endedAt,
    metadata
  } = NormalizedTrajectoryWireSchema.parse(input);
  return {
    schemaVersion,
    source: fromNormalizedTraceSourceWire(source),
    session: fromNormalizedTraceSessionWire(session),
    branch: branch ? fromNormalizedTraceBranchWire(branch) : void 0,
    events: events.map(fromNormalizedTraceEventWire),
    tokenUsage,
    costUsd,
    durationMs,
    startedAt,
    endedAt,
    metadata
  };
}
|
|
2775
|
+
/**
 * Convert a trace source to wire form, dropping undefined-valued properties.
 * The source field names are the same in both forms.
 *
 * @param {object} source - Trace source.
 * @returns {object} Wire-format source with only its defined fields.
 */
function toNormalizedTraceSourceWire(source) {
  const { kind, path, url, provider, format, version, metadata } = source;
  return omitUndefinedProperties({ kind, path, url, provider, format, version, metadata });
}
|
|
2786
|
+
/**
 * Convert a wire-format trace source to its in-memory form.
 * The field names are the same in both forms; only the known fields are copied.
 *
 * @param {object} source - Wire-format trace source.
 * @returns {object} A new object with kind, path, url, provider, format,
 *   version and metadata (missing ones are `undefined`).
 */
function fromNormalizedTraceSourceWire(source) {
  const { kind, path, url, provider, format, version, metadata } = source;
  return { kind, path, url, provider, format, version, metadata };
}
|
|
2797
|
+
/**
 * Convert a camelCase trace session to snake_case wire form, dropping
 * undefined-valued properties.
 *
 * @param {object} session - Trace session with camelCase fields.
 * @returns {object} Wire-format session with only its defined fields.
 */
function toNormalizedTraceSessionWire(session) {
  const { sessionId, conversationId, cwd, startedAt, endedAt, metadata } = session;
  return omitUndefinedProperties({
    session_id: sessionId,
    conversation_id: conversationId,
    cwd,
    started_at: startedAt,
    ended_at: endedAt,
    metadata
  });
}
|
|
2807
|
+
/**
 * Convert a snake_case wire session to camelCase form.
 *
 * @param {object} session - Wire-format trace session.
 * @returns {object} A new object with sessionId, conversationId, cwd,
 *   startedAt, endedAt and metadata (missing ones are `undefined`).
 */
function fromNormalizedTraceSessionWire(session) {
  const {
    session_id: sessionId,
    conversation_id: conversationId,
    cwd,
    started_at: startedAt,
    ended_at: endedAt,
    metadata
  } = session;
  return { sessionId, conversationId, cwd, startedAt, endedAt, metadata };
}
|
|
2817
|
+
/**
 * Convert a camelCase branch selection to snake_case wire form, dropping
 * undefined-valued properties.
 *
 * @param {object} branch - Branch selection with camelCase fields.
 * @returns {object} Wire-format branch with only its defined fields.
 */
function toNormalizedTraceBranchWire(branch) {
  const { selectedLeafId, selectedPathIds, includedEventIds, omittedEventIds, selectionReason } = branch;
  return omitUndefinedProperties({
    selected_leaf_id: selectedLeafId,
    selected_path_ids: selectedPathIds,
    included_event_ids: includedEventIds,
    omitted_event_ids: omittedEventIds,
    selection_reason: selectionReason
  });
}
|
|
2826
|
+
/**
 * Convert a snake_case wire branch selection to camelCase form.
 *
 * @param {object} branch - Wire-format branch selection.
 * @returns {object} A new object with selectedLeafId, selectedPathIds,
 *   includedEventIds, omittedEventIds and selectionReason (missing ones are
 *   `undefined`).
 */
function fromNormalizedTraceBranchWire(branch) {
  const {
    selected_leaf_id: selectedLeafId,
    selected_path_ids: selectedPathIds,
    included_event_ids: includedEventIds,
    omitted_event_ids: omittedEventIds,
    selection_reason: selectionReason
  } = branch;
  return { selectedLeafId, selectedPathIds, includedEventIds, omittedEventIds, selectionReason };
}
|
|
2835
|
+
/**
 * Converts an in-memory trace event into its snake_case wire form and runs the
 * result through NormalizedTraceEventWireSchema.parse.
 *
 * Nested message/model/tool/sourceRef values are converted only when truthy;
 * rawEvidence entries are converted one by one when the array is present.
 *
 * @param {object} event - In-memory trace event (camelCase fields).
 * @returns {object} Whatever NormalizedTraceEventWireSchema.parse returns for the mapped record.
 */
function toNormalizedTraceEventWire(event) {
  const { message, model, tool, sourceRef, rawEvidence } = event;
  const candidate = omitUndefinedProperties({
    event_id: event.eventId,
    parent_event_id: event.parentEventId,
    ordinal: event.ordinal,
    type: event.type,
    timestamp: event.timestamp,
    duration_ms: event.durationMs,
    duration_inferred: event.durationInferred,
    turn_index: event.turnIndex,
    message: message ? toNormalizedTraceMessageWire(message) : undefined,
    model: model ? toNormalizedTraceModelWire(model) : undefined,
    tool: tool ? toNormalizedTraceToolWire(tool) : undefined,
    source_ref: sourceRef ? toNormalizedTraceSourceRefWire(sourceRef) : undefined,
    raw_evidence: rawEvidence?.map(toNormalizedRawEvidenceWire),
    redaction: event.redaction,
    metadata: event.metadata
  });
  return NormalizedTraceEventWireSchema.parse(candidate);
}
|
|
2856
|
+
/**
 * Converts a snake_case wire-format trace event into its camelCase in-memory form.
 *
 * Nested message/model/tool/source_ref values are converted only when truthy;
 * raw_evidence entries are converted one by one when the array is present.
 *
 * @param {object} event - Wire-format trace event.
 * @returns {object} New in-memory event; absent fields are present as undefined.
 */
function fromNormalizedTraceEventWire(event) {
  const {
    event_id: eventId,
    parent_event_id: parentEventId,
    ordinal,
    type,
    timestamp,
    duration_ms: durationMs,
    duration_inferred: durationInferred,
    turn_index: turnIndex,
    message,
    model,
    tool,
    source_ref: sourceRef,
    raw_evidence: rawEvidence,
    redaction,
    metadata
  } = event;
  return {
    eventId,
    parentEventId,
    ordinal,
    type,
    timestamp,
    durationMs,
    durationInferred,
    turnIndex,
    message: message ? fromNormalizedTraceMessageWire(message) : undefined,
    model: model ? fromNormalizedTraceModelWire(model) : undefined,
    tool: tool ? fromNormalizedTraceToolWire(tool) : undefined,
    sourceRef: sourceRef ? fromNormalizedTraceSourceRefWire(sourceRef) : undefined,
    rawEvidence: rawEvidence?.map(fromNormalizedRawEvidenceWire),
    redaction,
    metadata
  };
}
|
|
2875
|
+
/**
 * Converts an in-memory trace message into its wire form (tokenUsage becomes token_usage).
 * The mapped object is handed to omitUndefinedProperties before being returned.
 *
 * @param {object} message - In-memory message.
 * @returns {object} Result of omitUndefinedProperties applied to the wire record.
 */
function toNormalizedTraceMessageWire(message) {
  const { role, name, content, redaction, tokenUsage, metadata } = message;
  const wire = { role, name, content, redaction, token_usage: tokenUsage, metadata };
  return omitUndefinedProperties(wire);
}
|
|
2885
|
+
/**
 * Converts a wire-format trace message into its in-memory form (token_usage becomes tokenUsage).
 *
 * @param {object} message - Wire-format message.
 * @returns {object} New object with role, name, content, redaction, tokenUsage and metadata.
 */
function fromNormalizedTraceMessageWire(message) {
  const { role, name, content, redaction, token_usage: tokenUsage, metadata } = message;
  return { role, name, content, redaction, tokenUsage, metadata };
}
|
|
2895
|
+
/**
 * Converts in-memory model info into its snake_case wire form.
 * The mapped object is handed to omitUndefinedProperties before being returned.
 *
 * @param {object} model - In-memory model record.
 * @returns {object} Result of omitUndefinedProperties applied to the wire record.
 */
function toNormalizedTraceModelWire(model) {
  const { provider, name, invocationId, tokenUsage, metadata } = model;
  const wire = {
    provider,
    name,
    invocation_id: invocationId,
    token_usage: tokenUsage,
    metadata
  };
  return omitUndefinedProperties(wire);
}
|
|
2904
|
+
/**
 * Converts wire-format model info into its camelCase in-memory form.
 *
 * @param {object} model - Wire-format model record.
 * @returns {object} New object with provider, name, invocationId, tokenUsage and metadata.
 */
function fromNormalizedTraceModelWire(model) {
  const { provider, name, invocation_id: invocationId, token_usage: tokenUsage, metadata } = model;
  return { provider, name, invocationId, tokenUsage, metadata };
}
|
|
2913
|
+
/**
 * Converts in-memory tool-call info into its wire form (callId becomes call_id).
 * The mapped object is handed to omitUndefinedProperties before being returned.
 *
 * @param {object} tool - In-memory tool record.
 * @returns {object} Result of omitUndefinedProperties applied to the wire record.
 */
function toNormalizedTraceToolWire(tool) {
  const { name, callId, input, output, status, error, redaction, metadata } = tool;
  const wire = { name, call_id: callId, input, output, status, error, redaction, metadata };
  return omitUndefinedProperties(wire);
}
|
|
2925
|
+
/**
 * Converts wire-format tool-call info into its in-memory form (call_id becomes callId).
 *
 * @param {object} tool - Wire-format tool record.
 * @returns {object} New object with name, callId, input, output, status, error, redaction and metadata.
 */
function fromNormalizedTraceToolWire(tool) {
  const { name, call_id: callId, input, output, status, error, redaction, metadata } = tool;
  return { name, callId, input, output, status, error, redaction, metadata };
}
|
|
2937
|
+
/**
 * Converts an in-memory source reference into its snake_case wire form.
 * The mapped object is handed to omitUndefinedProperties before being returned.
 *
 * @param {object} sourceRef - In-memory source reference.
 * @returns {object} Result of omitUndefinedProperties applied to the wire record.
 */
function toNormalizedTraceSourceRefWire(sourceRef) {
  const { eventId, messageId, spanId, traceId, rawKind, path, line, metadata } = sourceRef;
  const wire = {
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  };
  return omitUndefinedProperties(wire);
}
|
|
2949
|
+
/**
 * Converts a snake_case wire-format source reference into its camelCase in-memory form.
 *
 * @param {object} sourceRef - Wire-format source reference.
 * @returns {object} New object with eventId, messageId, spanId, traceId, rawKind, path, line and metadata.
 */
function fromNormalizedTraceSourceRefWire(sourceRef) {
  const {
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  } = sourceRef;
  return { eventId, messageId, spanId, traceId, rawKind, path, line, metadata };
}
|
|
2961
|
+
/**
 * Converts an in-memory raw-evidence entry into its wire form (mediaType becomes media_type).
 * The mapped object is handed to omitUndefinedProperties before being returned.
 *
 * @param {object} evidence - In-memory raw-evidence entry.
 * @returns {object} Result of omitUndefinedProperties applied to the wire record.
 */
function toNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, mediaType, content, redacted, metadata } = evidence;
  const wire = { kind, ref, media_type: mediaType, content, redacted, metadata };
  return omitUndefinedProperties(wire);
}
|
|
2971
|
+
/**
 * Converts a wire-format raw-evidence entry into its in-memory form (media_type becomes mediaType).
 *
 * @param {object} evidence - Wire-format raw-evidence entry.
 * @returns {object} New object with kind, ref, mediaType, content, redacted and metadata.
 */
function fromNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, media_type: mediaType, content, redacted, metadata } = evidence;
  return { kind, ref, mediaType, content, redacted, metadata };
}
|
|
2540
2981
|
function computeTraceSummary(messages) {
|
|
2541
2982
|
const toolCallCounts = {};
|
|
2542
2983
|
const toolDurations = {};
|
|
@@ -2604,6 +3045,82 @@ function computeTraceSummary(messages) {
|
|
|
2604
3045
|
endTime: latestEnd?.toISOString()
|
|
2605
3046
|
};
|
|
2606
3047
|
}
|
|
3048
|
+
/**
 * Returns the events of a trajectory that belong to the selected branch.
 *
 * When the trajectory has no branch, or the branch lists no included event ids,
 * the full `trajectory.events` array is returned as-is. Otherwise the events are
 * filtered (original order kept) to those whose `eventId` is listed.
 *
 * @param {object} trajectory - Trajectory with `events` and an optional `branch.includedEventIds`.
 * @returns {Array<object>} The selected events.
 */
function getSelectedTrajectoryEvents(trajectory) {
  const includedEventIds = trajectory.branch?.includedEventIds;
  const hasSelection = Boolean(includedEventIds) && includedEventIds.length !== 0;
  if (!hasSelection) {
    return trajectory.events;
  }
  const wanted = new Set(includedEventIds);
  return trajectory.events.filter(({ eventId }) => wanted.has(eventId));
}
|
|
3055
|
+
/**
 * Builds a trace summary (tool-call counts, error count, LLM call count,
 * optional per-tool durations, and overall timing) from a trajectory.
 *
 * Only events selected by getSelectedTrajectoryEvents are considered.
 * An LLM call is counted for every "model_turn" event; when the selection has
 * no "model_turn" events at all, assistant "message" events are counted instead.
 *
 * @param {object} trajectory - Trajectory with `events` and optional `branch`,
 *   `tokenUsage`, `costUsd`, `durationMs`, `startedAt`, `endedAt`.
 * @returns {object} Summary with `trace`, `tokenUsage`, `costUsd`, `durationMs`,
 *   `startTime` and `endTime`. Explicit `startedAt`/`endedAt` on the trajectory
 *   take precedence over times derived from event timestamps.
 */
function computeTraceSummaryFromTrajectory(trajectory) {
  const selectedEvents = getSelectedTrajectoryEvents(trajectory);
  const hasModelTurnEvents = selectedEvents.some((event) => event.type === "model_turn");
  // Accumulate in Maps rather than plain objects: tool names come from recorded
  // traces, and a name such as "constructor" or "toString" would otherwise hit
  // inherited Object.prototype members (corrupting counts, or throwing when
  // `.push` is called on a non-array), while "__proto__" would be dropped.
  const toolCallCounts = new Map();
  const toolDurations = new Map();
  let totalToolCalls = 0;
  let errorCount = 0;
  let llmCallCount = 0;
  let earliestStart;
  let latestEnd;
  let hasAnyDuration = false;
  for (const event of selectedEvents) {
    const isAssistantMessage = event.type === "message" && event.message?.role === "assistant";
    if (event.type === "model_turn" || (!hasModelTurnEvents && isAssistantMessage)) {
      llmCallCount++;
    }
    // Timing is tracked for every selected event, not just tool calls.
    const eventStart = parseTimestamp(event.timestamp);
    if (eventStart && (!earliestStart || eventStart < earliestStart)) {
      earliestStart = eventStart;
    }
    const eventEnd = deriveEventEnd(eventStart, event.durationMs);
    if (eventEnd && (!latestEnd || eventEnd > latestEnd)) {
      latestEnd = eventEnd;
    }
    if (event.type !== "tool_call" || !event.tool) {
      continue;
    }
    // String() mirrors the key coercion a plain-object accumulator would apply.
    const toolName = String(event.tool.name);
    toolCallCounts.set(toolName, (toolCallCounts.get(toolName) ?? 0) + 1);
    totalToolCalls++;
    if (isErrorToolEvent(event)) {
      errorCount++;
    }
    if (event.durationMs !== void 0) {
      hasAnyDuration = true;
      const durations = toolDurations.get(toolName);
      if (durations) {
        durations.push(event.durationMs);
      } else {
        toolDurations.set(toolName, [event.durationMs]);
      }
    }
  }
  return {
    trace: {
      // eventCount reports the number of tool-call events, not all selected events.
      eventCount: totalToolCalls,
      // Object.fromEntries defines own data properties, so every name is kept verbatim.
      toolCalls: Object.fromEntries(toolCallCounts),
      errorCount,
      llmCallCount,
      ...(hasAnyDuration ? { toolDurations: Object.fromEntries(toolDurations) } : {})
    },
    tokenUsage: trajectory.tokenUsage,
    costUsd: trajectory.costUsd,
    durationMs: trajectory.durationMs,
    startTime: trajectory.startedAt ?? earliestStart?.toISOString(),
    endTime: trajectory.endedAt ?? latestEnd?.toISOString()
  };
}
|
|
3109
|
+
/**
 * Parses a timestamp into a Date.
 *
 * @param {*} timestamp - Value accepted by the Date constructor; falsy values count as missing.
 * @returns {Date|undefined} The parsed Date, or undefined when the input is
 *   falsy or does not produce a valid date.
 */
function parseTimestamp(timestamp) {
  if (timestamp) {
    const parsed = new Date(timestamp);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed;
    }
  }
  return undefined;
}
|
|
3114
|
+
/**
 * Derives the end instant of an event from its start and duration.
 *
 * A missing, non-numeric, or non-finite duration is treated as "no duration"
 * and the start itself is returned. Without this guard, adding NaN, Infinity
 * or a string to the start time yields an Invalid Date, and calling
 * `toISOString()` on that later throws a RangeError.
 *
 * @param {Date|undefined} start - Event start; when falsy there is no end either.
 * @param {number|undefined} durationMs - Event duration in milliseconds.
 * @returns {Date|undefined} The end instant, `start` when no usable duration is
 *   given, or undefined when `start` is missing.
 */
function deriveEventEnd(start, durationMs) {
  if (!start) return undefined;
  if (typeof durationMs !== "number" || !Number.isFinite(durationMs)) return start;
  const end = new Date(start.getTime() + durationMs);
  // A finite but huge duration can still overflow the representable Date range.
  return Number.isNaN(end.getTime()) ? start : end;
}
|
|
3119
|
+
/**
 * Tells whether a tool event counts as a failure.
 *
 * An event is a failure when its tool carries a truthy `error`, or when the
 * tool status is "error", "timeout" or "cancelled".
 *
 * @param {object} event - Trace event with an optional `tool`.
 * @returns {boolean} True for failed tool events, false otherwise.
 */
function isErrorToolEvent(event) {
  const tool = event.tool;
  if (!tool) {
    return false;
  }
  if (tool.error) {
    return true;
  }
  const { status } = tool;
  return status === "error" || status === "timeout" || status === "cancelled";
}
|
|
2607
3124
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
2608
3125
|
"read",
|
|
2609
3126
|
"grep",
|
|
@@ -3400,6 +3917,30 @@ var SkillTriggerGrader = class {
|
|
|
3400
3917
|
};
|
|
3401
3918
|
|
|
3402
3919
|
// src/evaluation/graders/llm-grader-prompt.ts
|
|
3920
|
+
/**
 * Serializes a value as 2-space-indented JSON for prompt templates.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Indented JSON, or an empty string when `value` is undefined.
 */
function stringifyPretty2(value) {
  if (value === undefined) {
    return "";
  }
  return JSON.stringify(value, null, 2);
}
|
|
3923
|
+
/**
 * Serializes a value as single-line JSON for prompt templates.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Compact JSON, or an empty string when `value` is undefined.
 */
function stringifyCompact2(value) {
  if (value === undefined) {
    return "";
  }
  return JSON.stringify(value);
}
|
|
3926
|
+
/**
 * Builds the variable map substituted into an LLM grader prompt template.
 *
 * The question comes from `promptInputs.question` when it is non-blank,
 * otherwise from `evalCase.question`. Metadata and rubrics are exposed both
 * pretty-printed and as compact JSON.
 *
 * @param {object} input
 * @param {object} input.evalCase - Eval case (question, reference_answer, criteria, metadata).
 * @param {string} input.candidate - Candidate output being graded.
 * @param {object} input.promptInputs - Prompt inputs; `question` overrides the eval case question.
 * @param {Array<object>} [input.rubrics] - Rubrics, when grading against rubrics.
 * @param {string} [input.fileChanges] - File changes text; defaults to "".
 * @param {string} [input.toolCalls] - Tool calls text; defaults to "".
 * @returns {object} Map keyed by TEMPLATE_VARIABLES entries.
 */
function buildTemplateVariables2(input) {
  const { evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls } = input;
  const overrideQuestion = promptInputs.question;
  const hasOverride = overrideQuestion && overrideQuestion.trim().length > 0;
  const questionText = (hasOverride ? overrideQuestion : evalCase.question).trim();
  const outputText = candidate.trim();
  const expectedText = (evalCase.reference_answer ?? "").trim();
  return {
    [TEMPLATE_VARIABLES.INPUT]: questionText,
    [TEMPLATE_VARIABLES.OUTPUT]: outputText,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedText,
    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty2(evalCase.metadata),
    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact2(evalCase.metadata),
    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(rubrics),
    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(rubrics),
    [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
    [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
    // The *_TEXT entries carry the same values as INPUT / OUTPUT / EXPECTED_OUTPUT.
    [TEMPLATE_VARIABLES.INPUT_TEXT]: questionText,
    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: outputText,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedText
  };
}
|
|
3403
3944
|
function assembleLlmGraderPrompt(input) {
|
|
3404
3945
|
const {
|
|
3405
3946
|
evalCase,
|
|
@@ -3412,6 +3953,17 @@ function assembleLlmGraderPrompt(input) {
|
|
|
3412
3953
|
} = input;
|
|
3413
3954
|
const rubrics = evaluatorConfig?.rubrics;
|
|
3414
3955
|
if (rubrics && rubrics.length > 0) {
|
|
3956
|
+
if (graderTemplateOverride) {
|
|
3957
|
+
return assembleCustom(
|
|
3958
|
+
evalCase,
|
|
3959
|
+
candidate,
|
|
3960
|
+
promptInputs,
|
|
3961
|
+
rubrics,
|
|
3962
|
+
fileChanges,
|
|
3963
|
+
toolCalls,
|
|
3964
|
+
graderTemplateOverride
|
|
3965
|
+
);
|
|
3966
|
+
}
|
|
3415
3967
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
3416
3968
|
if (hasScoreRanges) {
|
|
3417
3969
|
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
@@ -3428,19 +3980,13 @@ function assembleLlmGraderPrompt(input) {
|
|
|
3428
3980
|
);
|
|
3429
3981
|
}
|
|
3430
3982
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
|
|
3431
|
-
const
|
|
3432
|
-
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3436
|
-
|
|
3437
|
-
|
|
3438
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
|
|
3439
|
-
// Deprecated aliases
|
|
3440
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
3441
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
3442
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
3443
|
-
};
|
|
3983
|
+
const variables = buildTemplateVariables2({
|
|
3984
|
+
evalCase,
|
|
3985
|
+
candidate,
|
|
3986
|
+
promptInputs,
|
|
3987
|
+
fileChanges,
|
|
3988
|
+
toolCalls
|
|
3989
|
+
});
|
|
3444
3990
|
const systemPrompt = buildOutputSchema();
|
|
3445
3991
|
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
|
|
3446
3992
|
let userPrompt = substituteVariables(template, variables);
|
|
@@ -3463,6 +4009,27 @@ ${toolCalls}`;
|
|
|
3463
4009
|
mode: "freeform"
|
|
3464
4010
|
};
|
|
3465
4011
|
}
|
|
4012
|
+
/**
 * Assembles a grader prompt from a user-supplied template when rubrics are present.
 *
 * The system prompt (also returned as the response schema) is the score-range
 * output schema when any rubric defines non-empty `score_ranges`, otherwise the
 * rubric output schema. The user prompt is the override template with the
 * standard template variables substituted.
 *
 * @param {object} evalCase - Eval case being graded.
 * @param {string} candidate - Candidate output.
 * @param {object} promptInputs - Prompt inputs (may override the question).
 * @param {Array<object>} rubrics - Rubrics to grade against.
 * @param {string} [fileChanges] - File changes text.
 * @param {string} [toolCalls] - Tool calls text.
 * @param {string} graderTemplateOverride - Custom user prompt template.
 * @returns {{systemPrompt: string, userPrompt: string, responseSchema: string, mode: string}}
 *   Prompt parts; `mode` is "score_range" or "checklist".
 */
function assembleCustom(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls, graderTemplateOverride) {
  const usesScoreRanges = rubrics.some((rubric) => rubric.score_ranges && rubric.score_ranges.length > 0);
  let systemPrompt;
  let mode;
  if (usesScoreRanges) {
    systemPrompt = buildScoreRangeOutputSchema();
    mode = "score_range";
  } else {
    systemPrompt = buildRubricOutputSchema();
    mode = "checklist";
  }
  const variables = buildTemplateVariables2({
    evalCase,
    candidate,
    promptInputs,
    rubrics,
    fileChanges,
    toolCalls
  });
  const userPrompt = substituteVariables(graderTemplateOverride, variables);
  return {
    systemPrompt,
    userPrompt,
    responseSchema: systemPrompt,
    mode
  };
}
|
|
3466
4033
|
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
3467
4034
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
3468
4035
|
const parts = [
|
|
@@ -3486,10 +4053,19 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
3486
4053
|
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
3487
4054
|
}
|
|
3488
4055
|
parts.push("[[ ## rubrics ## ]]");
|
|
4056
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
4057
|
+
if (operatorGuidance.length > 0) {
|
|
4058
|
+
parts.push("", "Operator guidance:");
|
|
4059
|
+
for (const guidance of operatorGuidance) {
|
|
4060
|
+
parts.push(`- ${guidance}`);
|
|
4061
|
+
}
|
|
4062
|
+
parts.push("");
|
|
4063
|
+
}
|
|
3489
4064
|
for (const rubric of rubrics) {
|
|
3490
4065
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
3491
4066
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3492
|
-
|
|
4067
|
+
const operatorLabel = formatRubricOperatorLabel(rubric.operator);
|
|
4068
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`);
|
|
3493
4069
|
}
|
|
3494
4070
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
3495
4071
|
const systemPrompt = buildRubricOutputSchema();
|
|
@@ -3529,6 +4105,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
3529
4105
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3530
4106
|
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
3531
4107
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
4108
|
+
if (rubric.operator) {
|
|
4109
|
+
parts.push(`Operator: ${rubric.operator}`);
|
|
4110
|
+
}
|
|
3532
4111
|
if (rubric.outcome) {
|
|
3533
4112
|
parts.push(`Description: ${rubric.outcome}`);
|
|
3534
4113
|
}
|
|
@@ -3541,6 +4120,10 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
3541
4120
|
}
|
|
3542
4121
|
}
|
|
3543
4122
|
}
|
|
4123
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
4124
|
+
if (operatorGuidance.length > 0) {
|
|
4125
|
+
parts.push("", ...operatorGuidance);
|
|
4126
|
+
}
|
|
3544
4127
|
parts.push(
|
|
3545
4128
|
"",
|
|
3546
4129
|
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
@@ -4259,7 +4842,7 @@ function runEqualsAssertion(output, value) {
|
|
|
4259
4842
|
import { spawn } from "node:child_process";
|
|
4260
4843
|
import { randomUUID } from "node:crypto";
|
|
4261
4844
|
import { createWriteStream } from "node:fs";
|
|
4262
|
-
import { mkdir } from "node:fs/promises";
|
|
4845
|
+
import { mkdir as mkdir2 } from "node:fs/promises";
|
|
4263
4846
|
import path5 from "node:path";
|
|
4264
4847
|
|
|
4265
4848
|
// src/runtime/child-tracker.ts
|
|
@@ -4759,7 +5342,7 @@ var ClaudeCliProvider = class {
|
|
|
4759
5342
|
return void 0;
|
|
4760
5343
|
}
|
|
4761
5344
|
try {
|
|
4762
|
-
await
|
|
5345
|
+
await mkdir2(logDir, { recursive: true });
|
|
4763
5346
|
} catch (error) {
|
|
4764
5347
|
const message = error instanceof Error ? error.message : String(error);
|
|
4765
5348
|
console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -5069,7 +5652,7 @@ function tryParseJson(line) {
|
|
|
5069
5652
|
// src/evaluation/providers/claude-sdk.ts
|
|
5070
5653
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
5071
5654
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
5072
|
-
import { mkdir as
|
|
5655
|
+
import { mkdir as mkdir3 } from "node:fs/promises";
|
|
5073
5656
|
import path6 from "node:path";
|
|
5074
5657
|
var claudeSdkModule = null;
|
|
5075
5658
|
async function loadClaudeSdk() {
|
|
@@ -5254,7 +5837,7 @@ var ClaudeSdkProvider = class {
|
|
|
5254
5837
|
return void 0;
|
|
5255
5838
|
}
|
|
5256
5839
|
try {
|
|
5257
|
-
await
|
|
5840
|
+
await mkdir3(logDir, { recursive: true });
|
|
5258
5841
|
} catch (error) {
|
|
5259
5842
|
const message = error instanceof Error ? error.message : String(error);
|
|
5260
5843
|
console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -5449,44 +6032,44 @@ function formatElapsed2(startedAt) {
|
|
|
5449
6032
|
// src/evaluation/providers/cli.ts
|
|
5450
6033
|
import { exec as execWithCallback } from "node:child_process";
|
|
5451
6034
|
import fs2 from "node:fs/promises";
|
|
5452
|
-
import
|
|
6035
|
+
import os from "node:os";
|
|
5453
6036
|
import path7 from "node:path";
|
|
5454
6037
|
import { promisify } from "node:util";
|
|
5455
|
-
import { z as
|
|
5456
|
-
var ToolCallSchema =
|
|
5457
|
-
tool:
|
|
5458
|
-
input:
|
|
5459
|
-
output:
|
|
5460
|
-
id:
|
|
5461
|
-
start_time:
|
|
5462
|
-
end_time:
|
|
5463
|
-
duration_ms:
|
|
6038
|
+
import { z as z3 } from "zod";
|
|
6039
|
+
var ToolCallSchema = z3.object({
|
|
6040
|
+
tool: z3.string(),
|
|
6041
|
+
input: z3.unknown().optional(),
|
|
6042
|
+
output: z3.unknown().optional(),
|
|
6043
|
+
id: z3.string().optional(),
|
|
6044
|
+
start_time: z3.string().optional(),
|
|
6045
|
+
end_time: z3.string().optional(),
|
|
6046
|
+
duration_ms: z3.number().optional()
|
|
5464
6047
|
});
|
|
5465
|
-
var MessageInputSchema =
|
|
5466
|
-
role:
|
|
5467
|
-
name:
|
|
5468
|
-
content:
|
|
5469
|
-
tool_calls:
|
|
5470
|
-
start_time:
|
|
5471
|
-
end_time:
|
|
5472
|
-
duration_ms:
|
|
5473
|
-
metadata:
|
|
6048
|
+
var MessageInputSchema = z3.object({
|
|
6049
|
+
role: z3.string(),
|
|
6050
|
+
name: z3.string().optional(),
|
|
6051
|
+
content: z3.unknown().optional(),
|
|
6052
|
+
tool_calls: z3.array(ToolCallSchema).optional(),
|
|
6053
|
+
start_time: z3.string().optional(),
|
|
6054
|
+
end_time: z3.string().optional(),
|
|
6055
|
+
duration_ms: z3.number().optional(),
|
|
6056
|
+
metadata: z3.record(z3.unknown()).optional()
|
|
5474
6057
|
});
|
|
5475
|
-
var TokenUsageSchema =
|
|
5476
|
-
input:
|
|
5477
|
-
output:
|
|
5478
|
-
cached:
|
|
6058
|
+
var TokenUsageSchema = z3.object({
|
|
6059
|
+
input: z3.number(),
|
|
6060
|
+
output: z3.number(),
|
|
6061
|
+
cached: z3.number().optional()
|
|
5479
6062
|
});
|
|
5480
|
-
var CliOutputSchema =
|
|
5481
|
-
text:
|
|
5482
|
-
output:
|
|
5483
|
-
output_messages:
|
|
6063
|
+
var CliOutputSchema = z3.object({
|
|
6064
|
+
text: z3.unknown().optional(),
|
|
6065
|
+
output: z3.array(MessageInputSchema).optional(),
|
|
6066
|
+
output_messages: z3.array(MessageInputSchema).optional(),
|
|
5484
6067
|
token_usage: TokenUsageSchema.optional(),
|
|
5485
|
-
cost_usd:
|
|
5486
|
-
duration_ms:
|
|
6068
|
+
cost_usd: z3.number().optional(),
|
|
6069
|
+
duration_ms: z3.number().optional()
|
|
5487
6070
|
});
|
|
5488
6071
|
var CliJsonlRecordSchema = CliOutputSchema.extend({
|
|
5489
|
-
id:
|
|
6072
|
+
id: z3.string().min(1)
|
|
5490
6073
|
});
|
|
5491
6074
|
function validateMetrics(costUsd, durationMs, context) {
|
|
5492
6075
|
let validCostUsd = costUsd;
|
|
@@ -5991,7 +6574,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
5991
6574
|
const safeEvalId = evalCaseId || "unknown";
|
|
5992
6575
|
const timestamp = Date.now();
|
|
5993
6576
|
const random = Math.random().toString(36).substring(2, 9);
|
|
5994
|
-
return path7.join(
|
|
6577
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
5995
6578
|
}
|
|
5996
6579
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
5997
6580
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -6004,7 +6587,7 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
6004
6587
|
// src/evaluation/providers/codex.ts
|
|
6005
6588
|
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
6006
6589
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
6007
|
-
import { mkdir as
|
|
6590
|
+
import { mkdir as mkdir4 } from "node:fs/promises";
|
|
6008
6591
|
import path8 from "node:path";
|
|
6009
6592
|
|
|
6010
6593
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -6097,6 +6680,9 @@ var CodexProvider = class {
|
|
|
6097
6680
|
const startMs = Date.now();
|
|
6098
6681
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
6099
6682
|
const codexOptions = {};
|
|
6683
|
+
if (this.config.executable) {
|
|
6684
|
+
codexOptions.codexPathOverride = this.config.executable;
|
|
6685
|
+
}
|
|
6100
6686
|
if (this.config.model) {
|
|
6101
6687
|
codexOptions.config = { model: this.config.model };
|
|
6102
6688
|
}
|
|
@@ -6108,6 +6694,9 @@ var CodexProvider = class {
|
|
|
6108
6694
|
if (cwd) {
|
|
6109
6695
|
threadOptions.workingDirectory = cwd;
|
|
6110
6696
|
}
|
|
6697
|
+
if (this.config.modelReasoningEffort) {
|
|
6698
|
+
threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
|
|
6699
|
+
}
|
|
6111
6700
|
const thread = codex.startThread(threadOptions);
|
|
6112
6701
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
6113
6702
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
@@ -6255,7 +6844,7 @@ ${basePrompt}` : basePrompt;
|
|
|
6255
6844
|
}
|
|
6256
6845
|
resolveLogDirectory() {
|
|
6257
6846
|
const disabled = isCodexLogStreamingDisabled();
|
|
6258
|
-
if (disabled) {
|
|
6847
|
+
if (disabled || this.config.streamLog === false) {
|
|
6259
6848
|
return void 0;
|
|
6260
6849
|
}
|
|
6261
6850
|
if (this.config.logDir) {
|
|
@@ -6269,7 +6858,7 @@ ${basePrompt}` : basePrompt;
|
|
|
6269
6858
|
return void 0;
|
|
6270
6859
|
}
|
|
6271
6860
|
try {
|
|
6272
|
-
await
|
|
6861
|
+
await mkdir4(logDir, { recursive: true });
|
|
6273
6862
|
} catch (error) {
|
|
6274
6863
|
const message = error instanceof Error ? error.message : String(error);
|
|
6275
6864
|
console.warn(`Skipping Codex SDK stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -6282,7 +6871,7 @@ ${basePrompt}` : basePrompt;
|
|
|
6282
6871
|
targetName: this.targetName,
|
|
6283
6872
|
evalCaseId: request.evalCaseId,
|
|
6284
6873
|
attempt: request.attempt,
|
|
6285
|
-
format: this.config.
|
|
6874
|
+
format: this.config.streamLog === "raw" ? "json" : "summary"
|
|
6286
6875
|
});
|
|
6287
6876
|
recordCodexLogEntry({
|
|
6288
6877
|
filePath,
|
|
@@ -6418,7 +7007,7 @@ function formatElapsed3(startedAt) {
|
|
|
6418
7007
|
|
|
6419
7008
|
// src/evaluation/providers/copilot-cli.ts
|
|
6420
7009
|
import { randomUUID as randomUUID5 } from "node:crypto";
|
|
6421
|
-
import { mkdir as
|
|
7010
|
+
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
6422
7011
|
import { homedir as homedir2 } from "node:os";
|
|
6423
7012
|
import path11 from "node:path";
|
|
6424
7013
|
import { Readable, Writable } from "node:stream";
|
|
@@ -6428,7 +7017,7 @@ import * as acp from "@agentclientprotocol/sdk";
|
|
|
6428
7017
|
// src/evaluation/workspace/file-changes.ts
|
|
6429
7018
|
import { exec as execCallback } from "node:child_process";
|
|
6430
7019
|
import { readdirSync, statSync } from "node:fs";
|
|
6431
|
-
import { readFile as
|
|
7020
|
+
import { readFile as readFile3, readdir, stat } from "node:fs/promises";
|
|
6432
7021
|
import path9 from "node:path";
|
|
6433
7022
|
import { promisify as promisify2 } from "node:util";
|
|
6434
7023
|
var execAsync2 = promisify2(execCallback);
|
|
@@ -6503,7 +7092,7 @@ async function walkDir(rootDir, currentDir, snapshot) {
|
|
|
6503
7092
|
if (fileStat.size > SNAPSHOT_MAX_FILE_BYTES) continue;
|
|
6504
7093
|
let content;
|
|
6505
7094
|
try {
|
|
6506
|
-
content = await
|
|
7095
|
+
content = await readFile3(fullPath, "utf8");
|
|
6507
7096
|
if (content.includes("\0")) continue;
|
|
6508
7097
|
} catch {
|
|
6509
7098
|
continue;
|
|
@@ -6596,7 +7185,7 @@ import { arch, homedir, platform } from "node:os";
|
|
|
6596
7185
|
import path10 from "node:path";
|
|
6597
7186
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
6598
7187
|
function resolvePlatformCliPath() {
|
|
6599
|
-
const
|
|
7188
|
+
const os2 = platform();
|
|
6600
7189
|
const cpu = arch();
|
|
6601
7190
|
const platformMap = {
|
|
6602
7191
|
linux: "linux",
|
|
@@ -6607,13 +7196,13 @@ function resolvePlatformCliPath() {
|
|
|
6607
7196
|
x64: "x64",
|
|
6608
7197
|
arm64: "arm64"
|
|
6609
7198
|
};
|
|
6610
|
-
const osPart = platformMap[
|
|
7199
|
+
const osPart = platformMap[os2];
|
|
6611
7200
|
const archPart = archMap[cpu];
|
|
6612
7201
|
if (!osPart || !archPart) {
|
|
6613
7202
|
return void 0;
|
|
6614
7203
|
}
|
|
6615
7204
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
6616
|
-
const binaryName =
|
|
7205
|
+
const binaryName = os2 === "win32" ? "copilot.exe" : "copilot";
|
|
6617
7206
|
try {
|
|
6618
7207
|
const resolved = import.meta.resolve(`${packageName}/package.json`);
|
|
6619
7208
|
const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
|
|
@@ -6681,9 +7270,9 @@ function resolvePlatformCliPath() {
|
|
|
6681
7270
|
}
|
|
6682
7271
|
function globalNpmRoots() {
|
|
6683
7272
|
const roots = [];
|
|
6684
|
-
const
|
|
7273
|
+
const os2 = platform();
|
|
6685
7274
|
const home = homedir();
|
|
6686
|
-
if (
|
|
7275
|
+
if (os2 === "win32") {
|
|
6687
7276
|
if (process.env.APPDATA) {
|
|
6688
7277
|
roots.push(path10.join(process.env.APPDATA, "npm", "node_modules"));
|
|
6689
7278
|
}
|
|
@@ -6698,7 +7287,7 @@ function globalNpmRoots() {
|
|
|
6698
7287
|
if (process.env.npm_config_prefix) {
|
|
6699
7288
|
const prefix = process.env.npm_config_prefix;
|
|
6700
7289
|
roots.push(
|
|
6701
|
-
|
|
7290
|
+
os2 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
|
|
6702
7291
|
);
|
|
6703
7292
|
}
|
|
6704
7293
|
return Array.from(new Set(roots));
|
|
@@ -7119,7 +7708,7 @@ var CopilotCliProvider = class {
|
|
|
7119
7708
|
return void 0;
|
|
7120
7709
|
}
|
|
7121
7710
|
try {
|
|
7122
|
-
await
|
|
7711
|
+
await mkdir5(logDir, { recursive: true });
|
|
7123
7712
|
} catch (error) {
|
|
7124
7713
|
const message = error instanceof Error ? error.message : String(error);
|
|
7125
7714
|
console.warn(`Skipping Copilot CLI stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -7227,7 +7816,7 @@ function summarizeAcpEvent(eventType, data) {
|
|
|
7227
7816
|
}
|
|
7228
7817
|
|
|
7229
7818
|
// src/evaluation/providers/copilot-log.ts
|
|
7230
|
-
import { readFile as
|
|
7819
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
7231
7820
|
import { homedir as homedir4 } from "node:os";
|
|
7232
7821
|
import path13 from "node:path";
|
|
7233
7822
|
|
|
@@ -7363,7 +7952,7 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
7363
7952
|
}
|
|
7364
7953
|
|
|
7365
7954
|
// src/evaluation/providers/copilot-session-discovery.ts
|
|
7366
|
-
import { readFile as
|
|
7955
|
+
import { readFile as readFile4, readdir as readdir2, stat as stat2 } from "node:fs/promises";
|
|
7367
7956
|
import { homedir as homedir3 } from "node:os";
|
|
7368
7957
|
import path12 from "node:path";
|
|
7369
7958
|
var DEFAULT_SESSION_STATE_DIR = () => path12.join(homedir3(), ".copilot", "session-state");
|
|
@@ -7382,7 +7971,7 @@ async function discoverCopilotSessions(opts) {
|
|
|
7382
7971
|
const workspacePath = path12.join(sessionDir, "workspace.yaml");
|
|
7383
7972
|
const eventsPath = path12.join(sessionDir, "events.jsonl");
|
|
7384
7973
|
try {
|
|
7385
|
-
const workspaceContent = await
|
|
7974
|
+
const workspaceContent = await readFile4(workspacePath, "utf8");
|
|
7386
7975
|
const workspace = parseYamlValue(workspaceContent) ?? {};
|
|
7387
7976
|
const cwd = String(workspace.cwd ?? "");
|
|
7388
7977
|
let updatedAt;
|
|
@@ -7444,7 +8033,7 @@ var CopilotLogProvider = class {
|
|
|
7444
8033
|
const eventsPath = path13.join(sessionDir, "events.jsonl");
|
|
7445
8034
|
let eventsContent;
|
|
7446
8035
|
try {
|
|
7447
|
-
eventsContent = await
|
|
8036
|
+
eventsContent = await readFile5(eventsPath, "utf8");
|
|
7448
8037
|
} catch (err) {
|
|
7449
8038
|
throw new Error(
|
|
7450
8039
|
`Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
|
|
@@ -7491,7 +8080,7 @@ var CopilotLogProvider = class {
|
|
|
7491
8080
|
// src/evaluation/providers/copilot-sdk.ts
|
|
7492
8081
|
import { randomUUID as randomUUID6 } from "node:crypto";
|
|
7493
8082
|
import { existsSync as existsSync2 } from "node:fs";
|
|
7494
|
-
import { mkdir as
|
|
8083
|
+
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
7495
8084
|
import path14 from "node:path";
|
|
7496
8085
|
|
|
7497
8086
|
// src/evaluation/providers/copilot-sdk-log-tracker.ts
|
|
@@ -7831,7 +8420,7 @@ var CopilotSdkProvider = class {
|
|
|
7831
8420
|
return void 0;
|
|
7832
8421
|
}
|
|
7833
8422
|
try {
|
|
7834
|
-
await
|
|
8423
|
+
await mkdir6(logDir, { recursive: true });
|
|
7835
8424
|
} catch (error) {
|
|
7836
8425
|
const message = error instanceof Error ? error.message : String(error);
|
|
7837
8426
|
console.warn(`Skipping Copilot SDK stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -7957,7 +8546,7 @@ var MockProvider = class {
|
|
|
7957
8546
|
import { execSync, spawn as spawn3 } from "node:child_process";
|
|
7958
8547
|
import { randomUUID as randomUUID7 } from "node:crypto";
|
|
7959
8548
|
import { accessSync, createWriteStream as createWriteStream5, readFileSync } from "node:fs";
|
|
7960
|
-
import { mkdir as
|
|
8549
|
+
import { mkdir as mkdir7, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile3 } from "node:fs/promises";
|
|
7961
8550
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
7962
8551
|
import path15 from "node:path";
|
|
7963
8552
|
|
|
@@ -8166,7 +8755,7 @@ var PiCliProvider = class {
|
|
|
8166
8755
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
8167
8756
|
try {
|
|
8168
8757
|
const promptFile = path15.join(cwd, PROMPT_FILENAME);
|
|
8169
|
-
await
|
|
8758
|
+
await writeFile3(promptFile, request.question, "utf8");
|
|
8170
8759
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
8171
8760
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
8172
8761
|
if (result.timedOut) {
|
|
@@ -8357,7 +8946,7 @@ ${prompt}` : prompt;
|
|
|
8357
8946
|
return void 0;
|
|
8358
8947
|
}
|
|
8359
8948
|
try {
|
|
8360
|
-
await
|
|
8949
|
+
await mkdir7(logDir, { recursive: true });
|
|
8361
8950
|
} catch (error) {
|
|
8362
8951
|
const message = error instanceof Error ? error.message : String(error);
|
|
8363
8952
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -8920,7 +9509,7 @@ async function defaultPiRunner(options) {
|
|
|
8920
9509
|
import { execSync as execSync2 } from "node:child_process";
|
|
8921
9510
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
8922
9511
|
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
8923
|
-
import { mkdir as
|
|
9512
|
+
import { mkdir as mkdir8 } from "node:fs/promises";
|
|
8924
9513
|
import path16 from "node:path";
|
|
8925
9514
|
import { createInterface } from "node:readline";
|
|
8926
9515
|
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
@@ -9357,7 +9946,7 @@ ${fileList}`;
|
|
|
9357
9946
|
return void 0;
|
|
9358
9947
|
}
|
|
9359
9948
|
try {
|
|
9360
|
-
await
|
|
9949
|
+
await mkdir8(logDir, { recursive: true });
|
|
9361
9950
|
} catch (error) {
|
|
9362
9951
|
const message = error instanceof Error ? error.message : String(error);
|
|
9363
9952
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -9582,12 +10171,12 @@ import path27 from "node:path";
|
|
|
9582
10171
|
import { promisify as promisify4 } from "node:util";
|
|
9583
10172
|
|
|
9584
10173
|
// src/evaluation/providers/vscode/dispatch/agentDispatch.ts
|
|
9585
|
-
import { stat as stat5, writeFile as
|
|
10174
|
+
import { stat as stat5, writeFile as writeFile6 } from "node:fs/promises";
|
|
9586
10175
|
import path25 from "node:path";
|
|
9587
10176
|
|
|
9588
10177
|
// src/evaluation/providers/vscode/utils/fs.ts
|
|
9589
10178
|
import { constants } from "node:fs";
|
|
9590
|
-
import { access, mkdir as
|
|
10179
|
+
import { access, mkdir as mkdir9, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
|
|
9591
10180
|
import path17 from "node:path";
|
|
9592
10181
|
async function pathExists(target) {
|
|
9593
10182
|
try {
|
|
@@ -9598,7 +10187,7 @@ async function pathExists(target) {
|
|
|
9598
10187
|
}
|
|
9599
10188
|
}
|
|
9600
10189
|
async function ensureDir(target) {
|
|
9601
|
-
await
|
|
10190
|
+
await mkdir9(target, { recursive: true });
|
|
9602
10191
|
}
|
|
9603
10192
|
async function readDirEntries(target) {
|
|
9604
10193
|
const entries = await readdir3(target, { withFileTypes: true });
|
|
@@ -9731,7 +10320,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
9731
10320
|
}
|
|
9732
10321
|
|
|
9733
10322
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
9734
|
-
import { readFile as
|
|
10323
|
+
import { readFile as readFile6 } from "node:fs/promises";
|
|
9735
10324
|
import path20 from "node:path";
|
|
9736
10325
|
|
|
9737
10326
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
@@ -9770,7 +10359,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
9770
10359
|
const maxAttempts = 10;
|
|
9771
10360
|
while (attempts < maxAttempts) {
|
|
9772
10361
|
try {
|
|
9773
|
-
const content = await
|
|
10362
|
+
const content = await readFile6(responseFileFinal, { encoding: "utf8" });
|
|
9774
10363
|
if (!silent) {
|
|
9775
10364
|
process.stdout.write(`${content}
|
|
9776
10365
|
`);
|
|
@@ -9827,7 +10416,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
9827
10416
|
const maxAttempts = 10;
|
|
9828
10417
|
while (attempts < maxAttempts) {
|
|
9829
10418
|
try {
|
|
9830
|
-
const content = await
|
|
10419
|
+
const content = await readFile6(file, { encoding: "utf8" });
|
|
9831
10420
|
if (!silent) {
|
|
9832
10421
|
process.stdout.write(`${content}
|
|
9833
10422
|
`);
|
|
@@ -9850,7 +10439,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
9850
10439
|
|
|
9851
10440
|
// src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
|
|
9852
10441
|
import { exec, spawn as spawn4 } from "node:child_process";
|
|
9853
|
-
import { mkdir as
|
|
10442
|
+
import { mkdir as mkdir10, writeFile as writeFile4 } from "node:fs/promises";
|
|
9854
10443
|
import path22 from "node:path";
|
|
9855
10444
|
import { promisify as promisify3 } from "node:util";
|
|
9856
10445
|
|
|
@@ -9931,9 +10520,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
9931
10520
|
const aliveFile = path22.join(subagentDir, DEFAULT_ALIVE_FILENAME);
|
|
9932
10521
|
await removeIfExists(aliveFile);
|
|
9933
10522
|
const githubAgentsDir = path22.join(subagentDir, ".github", "agents");
|
|
9934
|
-
await
|
|
10523
|
+
await mkdir10(githubAgentsDir, { recursive: true });
|
|
9935
10524
|
const wakeupDst = path22.join(githubAgentsDir, "wakeup.md");
|
|
9936
|
-
await
|
|
10525
|
+
await writeFile4(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
|
|
9937
10526
|
const workspaceChild = spawnVsCode(vscodeCmd, [workspacePath], {
|
|
9938
10527
|
label: "open-workspace"
|
|
9939
10528
|
});
|
|
@@ -9962,9 +10551,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
9962
10551
|
async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
|
|
9963
10552
|
const workspacePath = path22.join(subagentDir, `${path22.basename(subagentDir)}.code-workspace`);
|
|
9964
10553
|
const messagesDir = path22.join(subagentDir, "messages");
|
|
9965
|
-
await
|
|
10554
|
+
await mkdir10(messagesDir, { recursive: true });
|
|
9966
10555
|
const reqFile = path22.join(messagesDir, `${timestamp}_req.md`);
|
|
9967
|
-
await
|
|
10556
|
+
await writeFile4(reqFile, requestInstructions, { encoding: "utf8" });
|
|
9968
10557
|
const reqUri = pathToFileUri2(reqFile);
|
|
9969
10558
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
9970
10559
|
for (const attachment of attachmentPaths) {
|
|
@@ -9990,7 +10579,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
9990
10579
|
async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
|
|
9991
10580
|
const workspacePath = path22.join(subagentDir, `${path22.basename(subagentDir)}.code-workspace`);
|
|
9992
10581
|
const messagesDir = path22.join(subagentDir, "messages");
|
|
9993
|
-
await
|
|
10582
|
+
await mkdir10(messagesDir, { recursive: true });
|
|
9994
10583
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
9995
10584
|
for (const attachment of attachmentPaths) {
|
|
9996
10585
|
chatArgs.push("-a", attachment);
|
|
@@ -10013,7 +10602,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
|
|
|
10013
10602
|
}
|
|
10014
10603
|
|
|
10015
10604
|
// src/evaluation/providers/vscode/dispatch/workspaceManager.ts
|
|
10016
|
-
import { copyFile, mkdir as
|
|
10605
|
+
import { copyFile, mkdir as mkdir11, readFile as readFile7, readdir as readdir4, stat as stat4, writeFile as writeFile5 } from "node:fs/promises";
|
|
10017
10606
|
import path24 from "node:path";
|
|
10018
10607
|
|
|
10019
10608
|
// src/evaluation/providers/vscode/utils/workspace.ts
|
|
@@ -10130,7 +10719,7 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
|
|
|
10130
10719
|
if (!stats.isFile()) {
|
|
10131
10720
|
throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
|
|
10132
10721
|
}
|
|
10133
|
-
const templateText = await
|
|
10722
|
+
const templateText = await readFile7(workspaceSrc, "utf8");
|
|
10134
10723
|
workspaceContent = JSON.parse(templateText);
|
|
10135
10724
|
} else {
|
|
10136
10725
|
workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
|
|
@@ -10149,9 +10738,9 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
|
|
|
10149
10738
|
transformedContent = JSON.stringify(parsed, null, 2);
|
|
10150
10739
|
}
|
|
10151
10740
|
}
|
|
10152
|
-
await
|
|
10741
|
+
await writeFile5(workspaceDst, transformedContent, "utf8");
|
|
10153
10742
|
const messagesDir = path24.join(subagentDir, "messages");
|
|
10154
|
-
await
|
|
10743
|
+
await mkdir11(messagesDir, { recursive: true });
|
|
10155
10744
|
return { workspace: workspaceDst, messagesDir };
|
|
10156
10745
|
}
|
|
10157
10746
|
async function createSubagentLock(subagentDir) {
|
|
@@ -10174,7 +10763,7 @@ async function createSubagentLock(subagentDir) {
|
|
|
10174
10763
|
);
|
|
10175
10764
|
}
|
|
10176
10765
|
const lockFile = path24.join(subagentDir, DEFAULT_LOCK_NAME);
|
|
10177
|
-
await
|
|
10766
|
+
await writeFile5(lockFile, "", { encoding: "utf8" });
|
|
10178
10767
|
return lockFile;
|
|
10179
10768
|
}
|
|
10180
10769
|
async function removeSubagentLock(subagentDir) {
|
|
@@ -10199,7 +10788,7 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
|
|
|
10199
10788
|
}
|
|
10200
10789
|
if (promptFile) {
|
|
10201
10790
|
const githubAgentsDir = path24.join(subagentDir, ".github", "agents");
|
|
10202
|
-
await
|
|
10791
|
+
await mkdir11(githubAgentsDir, { recursive: true });
|
|
10203
10792
|
const agentFile = path24.join(githubAgentsDir, `${chatId}.md`);
|
|
10204
10793
|
try {
|
|
10205
10794
|
await copyFile(promptFile, agentFile);
|
|
@@ -10460,7 +11049,7 @@ async function dispatchBatchAgent(options) {
|
|
|
10460
11049
|
const reqFile = requestFiles[index];
|
|
10461
11050
|
const tmpFile = responseTmpFiles[index];
|
|
10462
11051
|
const finalFile = responseFilesFinal[index];
|
|
10463
|
-
return
|
|
11052
|
+
return writeFile6(
|
|
10464
11053
|
reqFile,
|
|
10465
11054
|
createBatchRequestPrompt(query, tmpFile, finalFile, batchRequestTemplateContent),
|
|
10466
11055
|
{ encoding: "utf8" }
|
|
@@ -10472,7 +11061,7 @@ async function dispatchBatchAgent(options) {
|
|
|
10472
11061
|
responseFilesFinal,
|
|
10473
11062
|
orchestratorTemplateContent
|
|
10474
11063
|
);
|
|
10475
|
-
await
|
|
11064
|
+
await writeFile6(orchestratorFile, orchestratorContent, { encoding: "utf8" });
|
|
10476
11065
|
}
|
|
10477
11066
|
const chatAttachments = [orchestratorFile, ...attachments];
|
|
10478
11067
|
const orchestratorUri = pathToFileUri2(orchestratorFile);
|
|
@@ -10538,7 +11127,7 @@ async function dispatchBatchAgent(options) {
|
|
|
10538
11127
|
}
|
|
10539
11128
|
|
|
10540
11129
|
// src/evaluation/providers/vscode/dispatch/provision.ts
|
|
10541
|
-
import { writeFile as
|
|
11130
|
+
import { writeFile as writeFile7 } from "node:fs/promises";
|
|
10542
11131
|
import path26 from "node:path";
|
|
10543
11132
|
var DEFAULT_WORKSPACE_TEMPLATE2 = {
|
|
10544
11133
|
folders: [
|
|
@@ -10619,8 +11208,8 @@ async function provisionSubagents(options) {
|
|
|
10619
11208
|
if (!dryRun) {
|
|
10620
11209
|
await removeIfExists(lockFile);
|
|
10621
11210
|
await ensureDir(githubAgentsDir);
|
|
10622
|
-
await
|
|
10623
|
-
await
|
|
11211
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11212
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10624
11213
|
}
|
|
10625
11214
|
created.push(subagentDir);
|
|
10626
11215
|
lockedSubagents.delete(subagentDir);
|
|
@@ -10630,8 +11219,8 @@ async function provisionSubagents(options) {
|
|
|
10630
11219
|
if (!isLocked && force) {
|
|
10631
11220
|
if (!dryRun) {
|
|
10632
11221
|
await ensureDir(githubAgentsDir);
|
|
10633
|
-
await
|
|
10634
|
-
await
|
|
11222
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11223
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10635
11224
|
}
|
|
10636
11225
|
created.push(subagentDir);
|
|
10637
11226
|
subagentsProvisioned += 1;
|
|
@@ -10639,8 +11228,8 @@ async function provisionSubagents(options) {
|
|
|
10639
11228
|
}
|
|
10640
11229
|
if (!dryRun && !await pathExists(workspaceDst)) {
|
|
10641
11230
|
await ensureDir(githubAgentsDir);
|
|
10642
|
-
await
|
|
10643
|
-
await
|
|
11231
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11232
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10644
11233
|
}
|
|
10645
11234
|
skippedExisting.push(subagentDir);
|
|
10646
11235
|
subagentsProvisioned += 1;
|
|
@@ -10655,8 +11244,8 @@ async function provisionSubagents(options) {
|
|
|
10655
11244
|
if (!dryRun) {
|
|
10656
11245
|
await ensureDir(subagentDir);
|
|
10657
11246
|
await ensureDir(githubAgentsDir);
|
|
10658
|
-
await
|
|
10659
|
-
await
|
|
11247
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
11248
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
10660
11249
|
}
|
|
10661
11250
|
created.push(subagentDir);
|
|
10662
11251
|
subagentsProvisioned += 1;
|
|
@@ -10981,7 +11570,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
10981
11570
|
|
|
10982
11571
|
// src/evaluation/providers/targets-file.ts
|
|
10983
11572
|
import { constants as constants3 } from "node:fs";
|
|
10984
|
-
import { access as access3, readFile as
|
|
11573
|
+
import { access as access3, readFile as readFile8 } from "node:fs/promises";
|
|
10985
11574
|
import path28 from "node:path";
|
|
10986
11575
|
function isRecord(value) {
|
|
10987
11576
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -11025,7 +11614,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
11025
11614
|
if (!await fileExists2(absolutePath)) {
|
|
11026
11615
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
11027
11616
|
}
|
|
11028
|
-
const raw = await
|
|
11617
|
+
const raw = await readFile8(absolutePath, "utf8");
|
|
11029
11618
|
const parsed = parseYamlValue(raw);
|
|
11030
11619
|
if (!isRecord(parsed)) {
|
|
11031
11620
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -11216,6 +11805,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
|
11216
11805
|
output: context.output ?? null,
|
|
11217
11806
|
inputFiles: context.evalCase.file_paths,
|
|
11218
11807
|
input: context.evalCase.input,
|
|
11808
|
+
metadata: context.evalCase.metadata ?? null,
|
|
11219
11809
|
trace: context.trace ?? null,
|
|
11220
11810
|
fileChanges: context.fileChanges ?? null,
|
|
11221
11811
|
workspacePath: context.workspacePath ?? null,
|
|
@@ -11733,7 +12323,7 @@ function getTCritical(df) {
|
|
|
11733
12323
|
}
|
|
11734
12324
|
|
|
11735
12325
|
// src/evaluation/workspace/manager.ts
|
|
11736
|
-
import { cp, mkdir as
|
|
12326
|
+
import { cp, mkdir as mkdir13, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
|
|
11737
12327
|
import path33 from "node:path";
|
|
11738
12328
|
var TemplateNotFoundError = class extends Error {
|
|
11739
12329
|
constructor(templatePath) {
|
|
@@ -11767,7 +12357,7 @@ function getWorkspacePath(evalRunId, caseId, workspaceRoot) {
|
|
|
11767
12357
|
return path33.join(root, evalRunId, caseId);
|
|
11768
12358
|
}
|
|
11769
12359
|
async function copyDirectoryRecursive(src, dest) {
|
|
11770
|
-
await
|
|
12360
|
+
await mkdir13(dest, { recursive: true });
|
|
11771
12361
|
const entries = await readdir5(src, { withFileTypes: true });
|
|
11772
12362
|
for (const entry of entries) {
|
|
11773
12363
|
const srcPath = path33.join(src, entry.name);
|
|
@@ -11842,7 +12432,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
11842
12432
|
import { execFile } from "node:child_process";
|
|
11843
12433
|
import { createHash } from "node:crypto";
|
|
11844
12434
|
import { existsSync as existsSync3 } from "node:fs";
|
|
11845
|
-
import { cp as cp2, mkdir as
|
|
12435
|
+
import { cp as cp2, mkdir as mkdir14, readFile as readFile9, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile8 } from "node:fs/promises";
|
|
11846
12436
|
import path34 from "node:path";
|
|
11847
12437
|
import { promisify as promisify5 } from "node:util";
|
|
11848
12438
|
var execFileAsync = promisify5(execFile);
|
|
@@ -11896,7 +12486,7 @@ function computeWorkspaceFingerprint(repos) {
|
|
|
11896
12486
|
return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
11897
12487
|
}
|
|
11898
12488
|
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
11899
|
-
await
|
|
12489
|
+
await mkdir14(dest, { recursive: true });
|
|
11900
12490
|
const entries = await readdir6(src, { withFileTypes: true });
|
|
11901
12491
|
for (const entry of entries) {
|
|
11902
12492
|
const srcPath = path34.join(src, entry.name);
|
|
@@ -11934,7 +12524,7 @@ var WorkspacePoolManager = class {
|
|
|
11934
12524
|
const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
|
|
11935
12525
|
const fingerprint = computeWorkspaceFingerprint(repos);
|
|
11936
12526
|
const poolDir = path34.join(this.poolRoot, fingerprint);
|
|
11937
|
-
await
|
|
12527
|
+
await mkdir14(poolDir, { recursive: true });
|
|
11938
12528
|
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
11939
12529
|
if (drifted) {
|
|
11940
12530
|
console.warn(
|
|
@@ -11961,7 +12551,7 @@ var WorkspacePoolManager = class {
|
|
|
11961
12551
|
poolDir
|
|
11962
12552
|
};
|
|
11963
12553
|
}
|
|
11964
|
-
await
|
|
12554
|
+
await mkdir14(slotPath, { recursive: true });
|
|
11965
12555
|
if (templatePath) {
|
|
11966
12556
|
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
11967
12557
|
}
|
|
@@ -11998,14 +12588,14 @@ var WorkspacePoolManager = class {
|
|
|
11998
12588
|
async tryLock(lockPath) {
|
|
11999
12589
|
for (let attempt = 0; attempt < 3; attempt++) {
|
|
12000
12590
|
try {
|
|
12001
|
-
await
|
|
12591
|
+
await writeFile8(lockPath, String(process.pid), { flag: "wx" });
|
|
12002
12592
|
return true;
|
|
12003
12593
|
} catch (err) {
|
|
12004
12594
|
if (err.code !== "EEXIST") {
|
|
12005
12595
|
throw err;
|
|
12006
12596
|
}
|
|
12007
12597
|
try {
|
|
12008
|
-
const pidStr = await
|
|
12598
|
+
const pidStr = await readFile9(lockPath, "utf-8");
|
|
12009
12599
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
12010
12600
|
if (!Number.isNaN(pid)) {
|
|
12011
12601
|
try {
|
|
@@ -12032,7 +12622,7 @@ var WorkspacePoolManager = class {
|
|
|
12032
12622
|
async checkDrift(poolDir, fingerprint) {
|
|
12033
12623
|
const metadataPath = path34.join(poolDir, "metadata.json");
|
|
12034
12624
|
try {
|
|
12035
|
-
const raw = await
|
|
12625
|
+
const raw = await readFile9(metadataPath, "utf-8");
|
|
12036
12626
|
const metadata = JSON.parse(raw);
|
|
12037
12627
|
return metadata.fingerprint !== fingerprint;
|
|
12038
12628
|
} catch {
|
|
@@ -12047,7 +12637,7 @@ var WorkspacePoolManager = class {
|
|
|
12047
12637
|
repos,
|
|
12048
12638
|
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
12049
12639
|
};
|
|
12050
|
-
await
|
|
12640
|
+
await writeFile8(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
12051
12641
|
}
|
|
12052
12642
|
/** Remove all slot directories and their lock files from a pool directory. */
|
|
12053
12643
|
async removeAllSlots(poolDir) {
|
|
@@ -12057,7 +12647,7 @@ var WorkspacePoolManager = class {
|
|
|
12057
12647
|
const lockPath = path34.join(poolDir, `${entry}.lock`);
|
|
12058
12648
|
if (existsSync3(lockPath)) {
|
|
12059
12649
|
try {
|
|
12060
|
-
const pidStr = await
|
|
12650
|
+
const pidStr = await readFile9(lockPath, "utf-8");
|
|
12061
12651
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
12062
12652
|
if (!Number.isNaN(pid)) {
|
|
12063
12653
|
try {
|
|
@@ -12416,9 +13006,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12416
13006
|
}
|
|
12417
13007
|
|
|
12418
13008
|
// src/evaluation/yaml-parser.ts
|
|
12419
|
-
import { readFile as
|
|
13009
|
+
import { readFile as readFile16, stat as stat8 } from "node:fs/promises";
|
|
12420
13010
|
import path43 from "node:path";
|
|
12421
13011
|
import micromatch2 from "micromatch";
|
|
13012
|
+
import { stringify as stringifyYaml } from "yaml";
|
|
12422
13013
|
|
|
12423
13014
|
// src/evaluation/input-message-utils.ts
|
|
12424
13015
|
function flattenInputMessages(messages) {
|
|
@@ -12485,7 +13076,7 @@ function cloneJsonValue(value) {
|
|
|
12485
13076
|
}
|
|
12486
13077
|
|
|
12487
13078
|
// src/evaluation/loaders/agent-skills-parser.ts
|
|
12488
|
-
import { readFile as
|
|
13079
|
+
import { readFile as readFile10 } from "node:fs/promises";
|
|
12489
13080
|
import path37 from "node:path";
|
|
12490
13081
|
var ANSI_RED = "\x1B[31m";
|
|
12491
13082
|
var ANSI_RESET2 = "\x1B[0m";
|
|
@@ -12498,7 +13089,7 @@ function isAgentSkillsFormat(parsed) {
|
|
|
12498
13089
|
return Array.isArray(obj.evals);
|
|
12499
13090
|
}
|
|
12500
13091
|
async function loadTestsFromAgentSkills(filePath) {
|
|
12501
|
-
const raw = await
|
|
13092
|
+
const raw = await readFile10(filePath, "utf8");
|
|
12502
13093
|
let parsed;
|
|
12503
13094
|
try {
|
|
12504
13095
|
parsed = JSON.parse(raw);
|
|
@@ -12565,7 +13156,7 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
|
12565
13156
|
}
|
|
12566
13157
|
|
|
12567
13158
|
// src/evaluation/loaders/config-loader.ts
|
|
12568
|
-
import { readFile as
|
|
13159
|
+
import { readFile as readFile11 } from "node:fs/promises";
|
|
12569
13160
|
import path39 from "node:path";
|
|
12570
13161
|
|
|
12571
13162
|
// src/evaluation/loaders/file-resolver.ts
|
|
@@ -12679,20 +13270,22 @@ var DEFAULT_EVAL_PATTERNS = [
|
|
|
12679
13270
|
];
|
|
12680
13271
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
12681
13272
|
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
13273
|
+
const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
|
|
12682
13274
|
for (const directory of directories) {
|
|
12683
13275
|
const configPath = path39.join(directory, ".agentv", "config.yaml");
|
|
12684
13276
|
if (!await fileExists3(configPath)) {
|
|
12685
13277
|
continue;
|
|
12686
13278
|
}
|
|
12687
13279
|
const config = await readConfigFile(configPath);
|
|
12688
|
-
if (config)
|
|
13280
|
+
if (config) {
|
|
13281
|
+
return config;
|
|
13282
|
+
}
|
|
12689
13283
|
}
|
|
12690
|
-
const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
|
|
12691
13284
|
return await fileExists3(globalConfigPath) ? readConfigFile(globalConfigPath) : null;
|
|
12692
13285
|
}
|
|
12693
13286
|
async function readConfigFile(configPath) {
|
|
12694
13287
|
try {
|
|
12695
|
-
const rawConfig = await
|
|
13288
|
+
const rawConfig = await readFile11(configPath, "utf8");
|
|
12696
13289
|
const parsed = interpolateEnv(parseYamlValue(rawConfig), process.env);
|
|
12697
13290
|
if (!isJsonObject(parsed)) {
|
|
12698
13291
|
logWarning(`Invalid config.yaml format at ${configPath}`);
|
|
@@ -12905,7 +13498,10 @@ function extractCacheConfig(suite) {
|
|
|
12905
13498
|
logWarning(`Invalid execution.cache: ${cache}. Must be a boolean. Ignoring.`);
|
|
12906
13499
|
return void 0;
|
|
12907
13500
|
}
|
|
12908
|
-
|
|
13501
|
+
if (executionObj.cachePath !== void 0) {
|
|
13502
|
+
logWarning("Invalid execution.cachePath: use snake_case execution.cache_path in YAML.");
|
|
13503
|
+
}
|
|
13504
|
+
const cachePath = executionObj.cache_path;
|
|
12909
13505
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
12910
13506
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
12911
13507
|
}
|
|
@@ -13074,6 +13670,12 @@ function parseResultsConfig(raw, configPath) {
|
|
|
13074
13670
|
...branchPrefix && { branch_prefix: branchPrefix }
|
|
13075
13671
|
};
|
|
13076
13672
|
}
|
|
13673
|
+
function resolveResultsConfigForProject(config, _projectId) {
|
|
13674
|
+
if (!config) {
|
|
13675
|
+
return void 0;
|
|
13676
|
+
}
|
|
13677
|
+
return config.results;
|
|
13678
|
+
}
|
|
13077
13679
|
function parseHooksConfig(raw, configPath) {
|
|
13078
13680
|
if (raw === void 0 || raw === null) {
|
|
13079
13681
|
return void 0;
|
|
@@ -13098,15 +13700,15 @@ function logWarning(message) {
|
|
|
13098
13700
|
}
|
|
13099
13701
|
|
|
13100
13702
|
// src/evaluation/loaders/grader-parser.ts
|
|
13101
|
-
import { readFile as
|
|
13703
|
+
import { readFile as readFile13 } from "node:fs/promises";
|
|
13102
13704
|
import path40 from "node:path";
|
|
13103
13705
|
|
|
13104
13706
|
// src/evaluation/validation/prompt-validator.ts
|
|
13105
|
-
import { readFile as
|
|
13707
|
+
import { readFile as readFile12 } from "node:fs/promises";
|
|
13106
13708
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
13107
13709
|
var ANSI_RESET4 = "\x1B[0m";
|
|
13108
13710
|
async function validateCustomPromptContent(promptPath) {
|
|
13109
|
-
const content = await
|
|
13711
|
+
const content = await readFile12(promptPath, "utf8");
|
|
13110
13712
|
validateTemplateVariables(content, promptPath);
|
|
13111
13713
|
}
|
|
13112
13714
|
function validateTemplateVariables(content, source) {
|
|
@@ -13238,7 +13840,7 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
13238
13840
|
const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
|
|
13239
13841
|
throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
|
|
13240
13842
|
}
|
|
13241
|
-
const content = await
|
|
13843
|
+
const content = await readFile13(resolved.resolvedPath, "utf8");
|
|
13242
13844
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
13243
13845
|
if (!isJsonObject2(parsed)) {
|
|
13244
13846
|
throw new Error(
|
|
@@ -13285,6 +13887,103 @@ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, inc
|
|
|
13285
13887
|
}
|
|
13286
13888
|
return expanded;
|
|
13287
13889
|
}
|
|
13890
|
+
async function collectAssertionTemplateSourceReferences(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
13891
|
+
const execution = rawEvalCase.execution;
|
|
13892
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
13893
|
+
const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators;
|
|
13894
|
+
const skipDefaults = executionObject?.skip_defaults === true;
|
|
13895
|
+
const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
|
|
13896
|
+
return [
|
|
13897
|
+
...await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId),
|
|
13898
|
+
...await collectAssertionTemplateReferencesFromValue(rootEvaluators, searchRoots, evalId)
|
|
13899
|
+
];
|
|
13900
|
+
}
|
|
13901
|
+
async function collectAssertionTemplateReferencesFromValue(value, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
|
|
13902
|
+
if (value === void 0) {
|
|
13903
|
+
return [];
|
|
13904
|
+
}
|
|
13905
|
+
const references = [];
|
|
13906
|
+
if (Array.isArray(value)) {
|
|
13907
|
+
for (const item of value) {
|
|
13908
|
+
if (isIncludeEntry(item)) {
|
|
13909
|
+
const nextDepth = includeContext.depth + 1;
|
|
13910
|
+
if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
|
|
13911
|
+
const chain = [...includeContext.chain, item.include].join(" -> ");
|
|
13912
|
+
throw new Error(
|
|
13913
|
+
`Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${chain}`
|
|
13914
|
+
);
|
|
13915
|
+
}
|
|
13916
|
+
const resolved = await resolveAssertionTemplateReference(item.include, searchRoots);
|
|
13917
|
+
references.push({
|
|
13918
|
+
kind: "assertion_template",
|
|
13919
|
+
displayPath: resolved.displayPath,
|
|
13920
|
+
...resolved.resolvedPath ? { resolvedPath: path40.resolve(resolved.resolvedPath) } : {}
|
|
13921
|
+
});
|
|
13922
|
+
if (resolved.resolvedPath) {
|
|
13923
|
+
if (includeContext.chain.includes(resolved.resolvedPath)) {
|
|
13924
|
+
const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
|
|
13925
|
+
throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
|
|
13926
|
+
}
|
|
13927
|
+
const content = await readFile13(resolved.resolvedPath, "utf8");
|
|
13928
|
+
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
13929
|
+
if (isJsonObject2(parsed) && Array.isArray(parsed.assertions)) {
|
|
13930
|
+
const templateDir = path40.dirname(resolved.resolvedPath);
|
|
13931
|
+
const nestedSearchRoots = [
|
|
13932
|
+
templateDir,
|
|
13933
|
+
...searchRoots.filter((root) => path40.resolve(root) !== templateDir)
|
|
13934
|
+
];
|
|
13935
|
+
references.push(
|
|
13936
|
+
...await collectAssertionTemplateReferencesFromValue(
|
|
13937
|
+
parsed.assertions,
|
|
13938
|
+
nestedSearchRoots,
|
|
13939
|
+
evalId,
|
|
13940
|
+
{
|
|
13941
|
+
depth: nextDepth,
|
|
13942
|
+
chain: [...includeContext.chain, resolved.resolvedPath]
|
|
13943
|
+
}
|
|
13944
|
+
)
|
|
13945
|
+
);
|
|
13946
|
+
}
|
|
13947
|
+
}
|
|
13948
|
+
continue;
|
|
13949
|
+
}
|
|
13950
|
+
if (isJsonObject2(item)) {
|
|
13951
|
+
references.push(
|
|
13952
|
+
...await collectAssertionTemplateReferencesFromObject(
|
|
13953
|
+
item,
|
|
13954
|
+
searchRoots,
|
|
13955
|
+
evalId,
|
|
13956
|
+
includeContext
|
|
13957
|
+
)
|
|
13958
|
+
);
|
|
13959
|
+
}
|
|
13960
|
+
}
|
|
13961
|
+
} else if (isJsonObject2(value)) {
|
|
13962
|
+
references.push(
|
|
13963
|
+
...await collectAssertionTemplateReferencesFromObject(
|
|
13964
|
+
value,
|
|
13965
|
+
searchRoots,
|
|
13966
|
+
evalId,
|
|
13967
|
+
includeContext
|
|
13968
|
+
)
|
|
13969
|
+
);
|
|
13970
|
+
}
|
|
13971
|
+
return references;
|
|
13972
|
+
}
|
|
13973
|
+
async function collectAssertionTemplateReferencesFromObject(value, searchRoots, evalId, includeContext) {
|
|
13974
|
+
const references = [];
|
|
13975
|
+
for (const key of ["assertions", "assert", "evaluators"]) {
|
|
13976
|
+
references.push(
|
|
13977
|
+
...await collectAssertionTemplateReferencesFromValue(
|
|
13978
|
+
value[key],
|
|
13979
|
+
searchRoots,
|
|
13980
|
+
evalId,
|
|
13981
|
+
includeContext
|
|
13982
|
+
)
|
|
13983
|
+
);
|
|
13984
|
+
}
|
|
13985
|
+
return references;
|
|
13986
|
+
}
|
|
13288
13987
|
async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
|
|
13289
13988
|
const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
|
|
13290
13989
|
if (!expandedEvaluators) {
|
|
@@ -13411,6 +14110,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
13411
14110
|
continue;
|
|
13412
14111
|
}
|
|
13413
14112
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
14113
|
+
const resolvedScriptPath = await resolveOptionalCommandSource(command, searchRoots);
|
|
13414
14114
|
const cwd = asString(rawEvaluator.cwd);
|
|
13415
14115
|
let resolvedCwd;
|
|
13416
14116
|
if (cwd) {
|
|
@@ -13476,6 +14176,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
13476
14176
|
name,
|
|
13477
14177
|
type: "code-grader",
|
|
13478
14178
|
command,
|
|
14179
|
+
...resolvedScriptPath ? { resolvedScriptPath } : {},
|
|
13479
14180
|
cwd,
|
|
13480
14181
|
resolvedCwd,
|
|
13481
14182
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -14543,6 +15244,17 @@ function asStringArray(value, description) {
|
|
|
14543
15244
|
}
|
|
14544
15245
|
return result;
|
|
14545
15246
|
}
|
|
15247
|
+
/**
 * Best-effort lookup of the file that a grader command points at.
 *
 * Only the final argv entry is considered. It must be non-empty and pass
 * looksLikeFilePath before resolveFileReference is consulted.
 *
 * @param {string[]} command - Command argv.
 * @param {*} searchRoots - Passed through to resolveFileReference.
 * @returns {Promise<string|undefined>} Absolute path of the resolved file, or undefined
 *   when the last argument is not path-like or no resolved path was reported.
 */
async function resolveOptionalCommandSource(command, searchRoots) {
  const lastArg = command.at(-1);
  const isPathLike = Boolean(lastArg) && looksLikeFilePath(lastArg);
  if (!isPathLike) {
    return void 0;
  }
  const lookup = await resolveFileReference(lastArg, searchRoots);
  if (!lookup.resolvedPath) {
    return void 0;
  }
  return path40.resolve(lookup.resolvedPath);
}
|
|
15255
|
+
/**
 * Heuristic: does this command argument look like a path to a file?
 *
 * True when the value is absolute, starts with a dot, contains a forward or back
 * slash, or ends (case-insensitively) in one of the recognised script extensions.
 *
 * @param {string} value - Command argument to classify.
 * @returns {boolean} Whether the argument is treated as a file path.
 */
function looksLikeFilePath(value) {
  if (path40.isAbsolute(value) || value.startsWith(".")) {
    return true;
  }
  if (value.includes("/") || value.includes("\\")) {
    return true;
  }
  const scriptExtension = /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i;
  return scriptExtension.test(value);
}
|
|
14546
15258
|
function parseCommandToArgv(command) {
|
|
14547
15259
|
if (process.platform === "win32") {
|
|
14548
15260
|
return ["cmd.exe", "/c", command];
|
|
@@ -14611,6 +15323,19 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
14611
15323
|
function isValidFieldAggregationType(value) {
|
|
14612
15324
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
14613
15325
|
}
|
|
15326
|
+
var VALID_RUBRIC_OPERATORS = new Set(RUBRIC_OPERATOR_VALUES);
|
|
15327
|
+
/**
 * Validates the optional `operator` field of a rubric.
 *
 * @param {*} value - Raw operator value from the rubric definition.
 * @param {string} rubricId - Rubric identifier, used in the warning text.
 * @param {string} evaluatorName - Evaluator name, used in the warning text.
 * @param {string} evalId - Eval identifier, used in the warning text.
 * @returns {string|undefined} The operator when it is a member of VALID_RUBRIC_OPERATORS.
 *   Undefined when absent, or when invalid (a warning is logged in that case).
 */
function parseRubricOperator(value, rubricId, evaluatorName, evalId) {
  // An absent operator is valid and stays silent.
  if (value === void 0) {
    return void 0;
  }
  const isRecognized = typeof value === "string" && VALID_RUBRIC_OPERATORS.has(value);
  if (!isRecognized) {
    logWarning2(
      `Ignoring invalid operator for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be one of ${RUBRIC_OPERATOR_VALUES.join(", ")}`
    );
    return void 0;
  }
  return value;
}
|
|
14614
15339
|
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
14615
15340
|
const items = [];
|
|
14616
15341
|
for (const [index, rawRubric] of rawRubrics.entries()) {
|
|
@@ -14621,7 +15346,8 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
14621
15346
|
continue;
|
|
14622
15347
|
}
|
|
14623
15348
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
14624
|
-
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
15349
|
+
const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? "";
|
|
15350
|
+
const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
|
|
14625
15351
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
14626
15352
|
let minScore;
|
|
14627
15353
|
let requiredMinScore;
|
|
@@ -14665,6 +15391,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
14665
15391
|
id,
|
|
14666
15392
|
weight,
|
|
14667
15393
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
15394
|
+
...operator !== void 0 ? { operator } : {},
|
|
14668
15395
|
...required !== void 0 ? { required } : {},
|
|
14669
15396
|
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
14670
15397
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
@@ -14680,6 +15407,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
14680
15407
|
items.push({
|
|
14681
15408
|
id,
|
|
14682
15409
|
outcome: expectedOutcome,
|
|
15410
|
+
...operator !== void 0 ? { operator } : {},
|
|
14683
15411
|
weight,
|
|
14684
15412
|
// Default to required: true if not specified (backward compatibility)
|
|
14685
15413
|
required: required ?? true,
|
|
@@ -14802,6 +15530,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
14802
15530
|
};
|
|
14803
15531
|
}
|
|
14804
15532
|
const expectedOutcome = asString(rubric.outcome) ?? "";
|
|
15533
|
+
const id = asString(rubric.id) ?? `rubric-${index + 1}`;
|
|
15534
|
+
const operator = parseRubricOperator(rubric.operator, id, "rubrics", "<inline>");
|
|
14805
15535
|
const rawScoreRanges = rubric.score_ranges;
|
|
14806
15536
|
const normalizedScoreRanges = rawScoreRanges !== void 0 ? normalizeScoreRangesShorthand(rawScoreRanges) : void 0;
|
|
14807
15537
|
const scoreRanges = Array.isArray(normalizedScoreRanges) && normalizedScoreRanges.length > 0 ? normalizedScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
|
|
@@ -14809,7 +15539,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
14809
15539
|
outcome: asString(range.outcome) ?? ""
|
|
14810
15540
|
})).filter((r) => r.outcome.length > 0) : void 0;
|
|
14811
15541
|
const baseRubric = {
|
|
14812
|
-
id
|
|
15542
|
+
id,
|
|
15543
|
+
...operator !== void 0 ? { operator } : {},
|
|
14813
15544
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
14814
15545
|
};
|
|
14815
15546
|
let inlineMinScore;
|
|
@@ -14850,12 +15581,12 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
14850
15581
|
}
|
|
14851
15582
|
|
|
14852
15583
|
// src/evaluation/loaders/jsonl-parser.ts
|
|
14853
|
-
import { readFile as
|
|
15584
|
+
import { readFile as readFile15 } from "node:fs/promises";
|
|
14854
15585
|
import path42 from "node:path";
|
|
14855
15586
|
import micromatch from "micromatch";
|
|
14856
15587
|
|
|
14857
15588
|
// src/evaluation/loaders/message-processor.ts
|
|
14858
|
-
import { readFile as
|
|
15589
|
+
import { readFile as readFile14 } from "node:fs/promises";
|
|
14859
15590
|
import path41 from "node:path";
|
|
14860
15591
|
|
|
14861
15592
|
// src/evaluation/formatting/segment-formatter.ts
|
|
@@ -14982,7 +15713,7 @@ async function processMessages(options) {
|
|
|
14982
15713
|
continue;
|
|
14983
15714
|
}
|
|
14984
15715
|
try {
|
|
14985
|
-
const fileContent = (await
|
|
15716
|
+
const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
14986
15717
|
processedContent.push({
|
|
14987
15718
|
...cloneJsonObject(rawSegment),
|
|
14988
15719
|
path: displayPath,
|
|
@@ -15023,7 +15754,7 @@ async function processMessages(options) {
|
|
|
15023
15754
|
continue;
|
|
15024
15755
|
}
|
|
15025
15756
|
try {
|
|
15026
|
-
const imageBuffer = await
|
|
15757
|
+
const imageBuffer = await readFile14(resolvedPath);
|
|
15027
15758
|
const base64 = imageBuffer.toString("base64");
|
|
15028
15759
|
processedContent.push({
|
|
15029
15760
|
type: "image",
|
|
@@ -15106,7 +15837,7 @@ async function processExpectedMessages(options) {
|
|
|
15106
15837
|
continue;
|
|
15107
15838
|
}
|
|
15108
15839
|
try {
|
|
15109
|
-
const fileContent = (await
|
|
15840
|
+
const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
15110
15841
|
processedContent.push({
|
|
15111
15842
|
type: "file",
|
|
15112
15843
|
path: displayPath,
|
|
@@ -15146,7 +15877,7 @@ async function processExpectedMessages(options) {
|
|
|
15146
15877
|
continue;
|
|
15147
15878
|
}
|
|
15148
15879
|
try {
|
|
15149
|
-
const imageBuffer = await
|
|
15880
|
+
const imageBuffer = await readFile14(resolvedPath);
|
|
15150
15881
|
const base64 = imageBuffer.toString("base64");
|
|
15151
15882
|
processedContent.push({
|
|
15152
15883
|
type: "image",
|
|
@@ -15188,6 +15919,12 @@ function expandInputShorthand(value) {
|
|
|
15188
15919
|
if (typeof value === "string") {
|
|
15189
15920
|
return [{ role: "user", content: value }];
|
|
15190
15921
|
}
|
|
15922
|
+
if (isJsonObject(value)) {
|
|
15923
|
+
if ("role" in value) {
|
|
15924
|
+
return isTestMessage(value) ? [value] : void 0;
|
|
15925
|
+
}
|
|
15926
|
+
return [{ role: "user", content: value }];
|
|
15927
|
+
}
|
|
15191
15928
|
if (Array.isArray(value)) {
|
|
15192
15929
|
const messages = value.filter((msg) => isTestMessage(msg));
|
|
15193
15930
|
return messages.length > 0 ? messages : void 0;
|
|
@@ -15275,7 +16012,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
|
15275
16012
|
return {};
|
|
15276
16013
|
}
|
|
15277
16014
|
try {
|
|
15278
|
-
const content = await
|
|
16015
|
+
const content = await readFile15(sidecarPath, "utf8");
|
|
15279
16016
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
15280
16017
|
if (!isJsonObject(parsed)) {
|
|
15281
16018
|
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
@@ -15320,7 +16057,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
15320
16057
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
15321
16058
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
15322
16059
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
15323
|
-
const rawFile = await
|
|
16060
|
+
const rawFile = await readFile15(absoluteTestPath, "utf8");
|
|
15324
16061
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
15325
16062
|
const fallbackSuiteName = path42.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
15326
16063
|
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
@@ -15457,16 +16194,16 @@ ${detailBlock}${ANSI_RESET7}`);
|
|
|
15457
16194
|
}
|
|
15458
16195
|
|
|
15459
16196
|
// src/evaluation/metadata.ts
|
|
15460
|
-
import { z as
|
|
15461
|
-
var MetadataSchema =
|
|
15462
|
-
name:
|
|
15463
|
-
description:
|
|
15464
|
-
version:
|
|
15465
|
-
author:
|
|
15466
|
-
tags:
|
|
15467
|
-
license:
|
|
15468
|
-
requires:
|
|
15469
|
-
agentv:
|
|
16197
|
+
import { z as z4 } from "zod";
|
|
16198
|
+
var MetadataSchema = z4.object({
|
|
16199
|
+
name: z4.string().min(1).max(64).regex(/^[a-z0-9-]+$/).optional(),
|
|
16200
|
+
description: z4.string().min(1).max(1024).optional(),
|
|
16201
|
+
version: z4.string().optional(),
|
|
16202
|
+
author: z4.string().optional(),
|
|
16203
|
+
tags: z4.array(z4.string()).optional(),
|
|
16204
|
+
license: z4.string().optional(),
|
|
16205
|
+
requires: z4.object({
|
|
16206
|
+
agentv: z4.string().optional()
|
|
15470
16207
|
}).optional()
|
|
15471
16208
|
});
|
|
15472
16209
|
function parseMetadata(suite) {
|
|
@@ -15738,7 +16475,7 @@ function interpolateRawEvalCase(raw, vars) {
|
|
|
15738
16475
|
async function readTestSuiteMetadata(testFilePath) {
|
|
15739
16476
|
try {
|
|
15740
16477
|
const absolutePath = path43.resolve(testFilePath);
|
|
15741
|
-
const content = await
|
|
16478
|
+
const content = await readFile16(absolutePath, "utf8");
|
|
15742
16479
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
15743
16480
|
if (!isJsonObject(parsed)) {
|
|
15744
16481
|
return {};
|
|
@@ -15762,7 +16499,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
15762
16499
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
15763
16500
|
}
|
|
15764
16501
|
if (format === "typescript") {
|
|
15765
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
16502
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT.js");
|
|
15766
16503
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
15767
16504
|
}
|
|
15768
16505
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -15797,7 +16534,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
15797
16534
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
15798
16535
|
}
|
|
15799
16536
|
if (format === "typescript") {
|
|
15800
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
16537
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT.js");
|
|
15801
16538
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
15802
16539
|
return suite.tests;
|
|
15803
16540
|
}
|
|
@@ -15812,8 +16549,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15812
16549
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
15813
16550
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
15814
16551
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
15815
|
-
const rawFile = await
|
|
15816
|
-
const
|
|
16552
|
+
const rawFile = await readFile16(absoluteTestPath, "utf8");
|
|
16553
|
+
const rawParsed = parseYamlValue(rawFile);
|
|
16554
|
+
const rawCaseSnapshots = buildRawInlineTestSnapshots(rawParsed);
|
|
16555
|
+
const interpolated = interpolateEnv(rawParsed, process.env);
|
|
15817
16556
|
if (!isJsonObject(interpolated)) {
|
|
15818
16557
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
15819
16558
|
}
|
|
@@ -15850,7 +16589,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15850
16589
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
15851
16590
|
}
|
|
15852
16591
|
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
15853
|
-
const
|
|
16592
|
+
const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
|
|
15854
16593
|
const rawSuiteInput = suite.input;
|
|
15855
16594
|
const rawSuiteInputFiles = suite.input_files;
|
|
15856
16595
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
@@ -15952,6 +16691,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15952
16691
|
logError3(`Skipping test '${id}': ${message}`);
|
|
15953
16692
|
continue;
|
|
15954
16693
|
}
|
|
16694
|
+
const assertionTemplateReferences = await collectAssertionTemplateSourceReferences(
|
|
16695
|
+
renderedCase,
|
|
16696
|
+
globalExecution,
|
|
16697
|
+
searchRoots,
|
|
16698
|
+
id ?? "unknown"
|
|
16699
|
+
);
|
|
15955
16700
|
const inlineRubrics = renderedCase.rubrics;
|
|
15956
16701
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
15957
16702
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
@@ -15964,8 +16709,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
15964
16709
|
const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir);
|
|
15965
16710
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
15966
16711
|
const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? renderedCase.metadata : void 0;
|
|
15967
|
-
const
|
|
15968
|
-
const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload);
|
|
16712
|
+
const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
|
|
15969
16713
|
const caseTargets = extractTargetsFromTestCase(renderedCase);
|
|
15970
16714
|
const dependsOn = Array.isArray(renderedCase.depends_on) ? renderedCase.depends_on.filter(
|
|
15971
16715
|
(v) => typeof v === "string"
|
|
@@ -16004,12 +16748,245 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16004
16748
|
...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
|
|
16005
16749
|
...windowSize !== void 0 ? { window_size: windowSize } : {},
|
|
16006
16750
|
...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
|
|
16007
|
-
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
|
|
16751
|
+
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {},
|
|
16752
|
+
source: buildEvalTestSource({
|
|
16753
|
+
evalFilePath,
|
|
16754
|
+
absoluteTestPath,
|
|
16755
|
+
repoRootPath,
|
|
16756
|
+
id,
|
|
16757
|
+
renderedCase,
|
|
16758
|
+
rawCaseSnapshots,
|
|
16759
|
+
inputMessages,
|
|
16760
|
+
evaluators,
|
|
16761
|
+
assertionTemplateReferences
|
|
16762
|
+
})
|
|
16008
16763
|
};
|
|
16009
16764
|
results.push(testCase);
|
|
16010
16765
|
}
|
|
16011
16766
|
return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
|
|
16012
16767
|
}
|
|
16768
|
+
var SOURCE_SECRET_KEY_PATTERN = /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i;
|
|
16769
|
+
var REDACTED_SOURCE_VALUE = "[redacted]";
|
|
16770
|
+
/**
 * Indexes the inline test definitions of a parsed eval document by test id.
 *
 * The test list is read from `tests`, falling back to `eval_cases` and then
 * `evalcases`. Entries that are not objects or have no string `id` are ignored.
 * When two entries share an id, the later one wins.
 *
 * @param {*} rawParsed - Parsed eval document.
 * @returns {Map<string, string>} Map from test id to stringifySourceYaml output
 *   for that entry. Empty when the document has no usable test list.
 */
function buildRawInlineTestSnapshots(rawParsed) {
  const yamlById = /* @__PURE__ */ new Map();
  const testList = isJsonObject(rawParsed) ? rawParsed.tests ?? rawParsed.eval_cases ?? rawParsed.evalcases : void 0;
  if (Array.isArray(testList)) {
    for (const entry of testList) {
      if (isJsonObject(entry) && typeof entry.id === "string") {
        yamlById.set(entry.id, stringifySourceYaml(entry));
      }
    }
  }
  return yamlById;
}
|
|
16787
|
+
/**
 * Assembles the `source` record attached to a loaded test case.
 *
 * @param {object} params
 * @param {string} params.evalFilePath - Eval file path, copied as given.
 * @param {string} params.absoluteTestPath - Absolute path of the eval file.
 * @param {string} params.repoRootPath - Root used to derive the repo-relative path.
 * @param {string} params.id - Test identifier.
 * @param {object} params.renderedCase - Test case used when no raw snapshot exists.
 * @param {Map<string, string>} params.rawCaseSnapshots - Snapshot YAML keyed by test id.
 * @param {Array} params.inputMessages - Messages scanned for file references.
 * @param {Array|undefined} params.evaluators - Graders scanned for references and definitions.
 * @param {Array} params.assertionTemplateReferences - References collected beforehand.
 * @returns {object} Source record. `evalFileRepoPath` is present only when
 *   toPortableRelativePath returns a non-empty value.
 */
function buildEvalTestSource(params) {
  const {
    evalFilePath,
    absoluteTestPath,
    repoRootPath,
    id,
    renderedCase,
    rawCaseSnapshots,
    inputMessages,
    evaluators,
    assertionTemplateReferences
  } = params;
  const repoRelativePath = toPortableRelativePath(repoRootPath, absoluteTestPath);
  // Prefer the snapshot recorded for this id; otherwise serialise the rendered case.
  const snapshot = rawCaseSnapshots.get(id) ?? stringifySourceYaml(renderedCase);
  const graderRefs = collectGraderSourceReferences(evaluators);
  const inputRefs = collectInputSourceReferences(inputMessages);
  const uniqueRefs = dedupeSourceReferences([...inputRefs, ...graderRefs, ...assertionTemplateReferences]);

  const source = {
    evalFilePath,
    evalFileAbsolutePath: absoluteTestPath
  };
  if (repoRelativePath) {
    source.evalFileRepoPath = repoRelativePath;
  }
  source.testId = id;
  source.testSnapshotYaml = snapshot;
  source.graderDefinitions = buildGraderSourceDefinitions(evaluators);
  source.references = uniqueRefs;
  return source;
}
|
|
16807
|
+
/**
 * Serialises a value to YAML for inclusion in a test's source record.
 *
 * The value is passed through sanitizeSourceValue first, then stringified with
 * `lineWidth: 0`, and trailing whitespace is trimmed from the result.
 *
 * @param {*} value - Value to serialise.
 * @returns {string} YAML text without trailing whitespace.
 */
function stringifySourceYaml(value) {
  const sanitized = sanitizeSourceValue(value);
  const yamlText = stringifyYaml(sanitized, { lineWidth: 0 });
  return yamlText.trimEnd();
}
|
|
16810
|
+
/**
 * Produces a copy of a value that is safe to embed in a test's source record.
 *
 * Redaction is driven by property names: any value stored under a key matching
 * SOURCE_SECRET_KEY_PATTERN is replaced wholesale by REDACTED_SOURCE_VALUE,
 * whatever its type. Arrays and objects are copied recursively. Array items are
 * sanitised without a key hint, so only their own nested keys can trigger redaction.
 *
 * `undefined` is treated as "absent": it is returned as-is and object properties
 * holding it are dropped. Previously it fell through to `String(value)` and was
 * recorded as the literal text "undefined".
 *
 * @param {*} value - Value to sanitise.
 * @param {string} [keyHint] - Property name the value was stored under, if any.
 * @returns {*} Sanitised copy. Remaining non-JSON values (bigint, symbol, function)
 *   are converted with String().
 */
function sanitizeSourceValue(value, keyHint) {
  if (keyHint && SOURCE_SECRET_KEY_PATTERN.test(keyHint)) {
    return REDACTED_SOURCE_VALUE;
  }
  if (value === void 0 || value === null) {
    return value;
  }
  const valueType = typeof value;
  if (valueType === "string" || valueType === "number" || valueType === "boolean") {
    return value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeSourceValue(item));
  }
  if (valueType === "object") {
    const entries = [];
    for (const [key, entryValue] of Object.entries(value)) {
      const sanitized = sanitizeSourceValue(entryValue, key);
      // Omit absent properties instead of serialising them as "undefined".
      if (sanitized !== void 0) {
        entries.push([key, sanitized]);
      }
    }
    return Object.fromEntries(entries);
  }
  return String(value);
}
|
|
16832
|
+
/**
 * Builds the per-grader summary entries stored in a test's source record.
 *
 * Each entry carries `name` and `type`, then `weight`, `required` and `minScore`
 * (taken from `min_score`) only when those are defined on the grader, and finally
 * `definition` as returned by sanitizeGraderDefinition.
 *
 * @param {Array|undefined|null} evaluators - Grader configurations; nullish means none.
 * @returns {Array<object>} One summary entry per grader, in input order.
 */
function buildGraderSourceDefinitions(evaluators) {
  const graders = evaluators ?? [];
  return graders.map((evaluator) => {
    const entry = { name: evaluator.name, type: evaluator.type };
    if (evaluator.weight !== void 0) {
      entry.weight = evaluator.weight;
    }
    if (evaluator.required !== void 0) {
      entry.required = evaluator.required;
    }
    if ("min_score" in evaluator && evaluator.min_score !== void 0) {
      entry.minScore = evaluator.min_score;
    }
    entry.definition = sanitizeGraderDefinition(evaluator);
    return entry;
  });
}
|
|
16842
|
+
/**
 * Prepares a grader configuration for the source record: the configuration is
 * first passed through sanitizeSourceValue, then stripRuntimeResolutionFields.
 *
 * @param {object} evaluator - Grader configuration.
 * @returns {object} Result of applying both steps in that order.
 */
function sanitizeGraderDefinition(evaluator) {
  return stripRuntimeResolutionFields(sanitizeSourceValue(evaluator));
}
|
|
16846
|
+
// Property names removed by stripRuntimeResolutionFields.
const RUNTIME_RESOLUTION_FIELD_NAMES = /* @__PURE__ */ new Set([
  "resolvedPromptPath",
  "promptPath",
  "resolvedPromptScript",
  "resolvedScriptPath",
  "resolvedCwd",
  "resolvedCommand"
]);

/**
 * Returns a copy of an object without its runtime-resolution properties.
 *
 * Nested objects are cleaned recursively. For array values, object items are
 * cleaned and every other item (including nested arrays) is kept unchanged.
 *
 * @param {object} value - Object to copy.
 * @returns {object} New object without the listed properties at any object level.
 */
function stripRuntimeResolutionFields(value) {
  const cleanNested = (candidate) => isJsonObject(candidate) ? stripRuntimeResolutionFields(candidate) : candidate;
  const result = {};
  for (const [fieldName, fieldValue] of Object.entries(value)) {
    if (RUNTIME_RESOLUTION_FIELD_NAMES.has(fieldName)) {
      continue;
    }
    result[fieldName] = Array.isArray(fieldValue) ? fieldValue.map((item) => cleanNested(item)) : cleanNested(fieldValue);
  }
  return result;
}
|
|
16864
|
+
/**
 * Lists the file segments referenced by a test's input messages.
 *
 * Only messages whose `content` is an array are inspected, and within them only
 * object segments with `type === "file"`. The display path is the segment's
 * `path`, else its `value`, else the fixed label "input file". An absolute
 * `resolvedPath` is added when the segment carries one as a string.
 *
 * @param {Iterable<object>} inputMessages - Input messages of the test.
 * @returns {Array<object>} `input_file` references in message/segment order.
 */
function collectInputSourceReferences(inputMessages) {
  const describeSegment = (segment) => {
    if (typeof segment.path === "string") {
      return segment.path;
    }
    if (typeof segment.value === "string") {
      return segment.value;
    }
    return "input file";
  };
  const found = [];
  for (const { content } of inputMessages) {
    const segments = Array.isArray(content) ? content : [];
    for (const segment of segments) {
      const isFileSegment = isJsonObject(segment) && segment.type === "file";
      if (!isFileSegment) {
        continue;
      }
      const reference = { kind: "input_file", displayPath: describeSegment(segment) };
      if (typeof segment.resolvedPath === "string") {
        reference.resolvedPath = path43.resolve(segment.resolvedPath);
      }
      found.push(reference);
    }
  }
  return found;
}
|
|
16884
|
+
/**
 * Collects the source references of every grader in a list.
 *
 * @param {Array|undefined|null} evaluators - Grader configurations; nullish means none.
 * @returns {Array<object>} References from collectSingleGraderSourceReferences for
 *   each grader, concatenated in grader order.
 */
function collectGraderSourceReferences(evaluators) {
  const graders = evaluators ?? [];
  return graders.flatMap((evaluator) => collectSingleGraderSourceReferences(evaluator));
}
|
|
16891
|
+
/**
 * References contributed by a `code-grader`: the command itself and, when a
 * resolved working directory is present, that directory.
 */
function sourceRefsForCodeGrader(evaluator) {
  const command = evaluator.command ?? evaluator.script ?? [];
  const refs = [
    {
      kind: "code_grader_command",
      // Show the resolved script when known, otherwise the command line as written.
      displayPath: evaluator.resolvedScriptPath ?? command.join(" "),
      ...evaluator.resolvedScriptPath ? { resolvedPath: evaluator.resolvedScriptPath } : {},
      graderName: evaluator.name,
      command
    }
  ];
  if (evaluator.resolvedCwd) {
    refs.push({
      kind: "code_grader_cwd",
      displayPath: evaluator.cwd ?? evaluator.resolvedCwd,
      resolvedPath: evaluator.resolvedCwd,
      graderName: evaluator.name
    });
  }
  return refs;
}

/**
 * References contributed by an `llm-grader`: its prompt file and/or prompt script.
 */
function sourceRefsForLlmGrader(evaluator) {
  const refs = [];
  const promptPath = evaluator.resolvedPromptPath ?? evaluator.promptPath;
  if (promptPath) {
    refs.push({
      kind: "llm_grader_prompt",
      displayPath: typeof evaluator.prompt === "string" ? evaluator.prompt : promptPath,
      resolvedPath: promptPath,
      graderName: evaluator.name
    });
  }
  const promptScript = evaluator.resolvedPromptScript;
  if (promptScript && promptScript.length > 0) {
    refs.push({
      kind: "prompt_script",
      // The last argv entry is reported as the script path.
      displayPath: promptScript.at(-1) ?? evaluator.name,
      resolvedPath: promptScript.at(-1),
      graderName: evaluator.name,
      command: promptScript
    });
  }
  return refs;
}

/**
 * References contributed by a grader's preprocessors that carry a non-empty
 * resolved command. Applies to any grader type that has `preprocessors`.
 */
function sourceRefsForPreprocessors(evaluator) {
  const preprocessors = "preprocessors" in evaluator ? evaluator.preprocessors : void 0;
  const refs = [];
  for (const preprocessor of preprocessors ?? []) {
    const resolvedCommand = preprocessor.resolvedCommand;
    if (resolvedCommand && resolvedCommand.length > 0) {
      refs.push({
        kind: "preprocessor_command",
        displayPath: resolvedCommand.at(-1) ?? preprocessor.type,
        resolvedPath: resolvedCommand.at(-1),
        graderName: evaluator.name,
        command: resolvedCommand
      });
    }
  }
  return refs;
}

/**
 * References contributed by a `composite` grader: those of each member grader
 * (recursively), followed by the aggregator's script or prompt file.
 * A composite without `assertions` or `aggregator` contributes nothing for the
 * missing part instead of throwing.
 */
function sourceRefsForComposite(evaluator) {
  const refs = [];
  for (const member of evaluator.assertions ?? []) {
    refs.push(...collectSingleGraderSourceReferences(member));
  }
  const aggregator = evaluator.aggregator;
  if (aggregator?.type === "code-grader") {
    refs.push({
      kind: "code_grader_command",
      displayPath: aggregator.path,
      resolvedPath: path43.resolve(aggregator.cwd ?? "", aggregator.path),
      graderName: evaluator.name
    });
  } else if (aggregator?.type === "llm-grader" && aggregator.promptPath) {
    refs.push({
      kind: "llm_grader_prompt",
      displayPath: aggregator.prompt ?? aggregator.promptPath,
      resolvedPath: aggregator.promptPath,
      graderName: evaluator.name
    });
  }
  return refs;
}

/**
 * Lists the external files and commands one grader depends on.
 *
 * Output order is: type-specific references (`code-grader` or `llm-grader`),
 * then preprocessor commands, then composite members and aggregator.
 *
 * @param {object} evaluator - Grader configuration; `type` selects the handling.
 * @returns {Array<object>} Source references, each with a `kind`, a `displayPath`
 *   and the owning `graderName`.
 */
function collectSingleGraderSourceReferences(evaluator) {
  const references = [];
  if (evaluator.type === "code-grader") {
    references.push(...sourceRefsForCodeGrader(evaluator));
  }
  if (evaluator.type === "llm-grader") {
    references.push(...sourceRefsForLlmGrader(evaluator));
  }
  references.push(...sourceRefsForPreprocessors(evaluator));
  if (evaluator.type === "composite") {
    references.push(...sourceRefsForComposite(evaluator));
  }
  return references;
}
|
|
16965
|
+
/**
 * Removes repeated source references, keeping the first occurrence of each.
 *
 * Two references are duplicates when they agree on `kind`, on `resolvedPath`
 * (or `displayPath` when no resolved path is set), on `graderName`, and on
 * their `command` argv.
 *
 * @param {Iterable<object>} references - References in discovery order.
 * @returns {Array<object>} The same reference objects, duplicates dropped, order kept.
 */
function dedupeSourceReferences(references) {
  const fingerprints = /* @__PURE__ */ new Set();
  return [...references].filter((reference) => {
    const location = reference.resolvedPath ?? reference.displayPath;
    const owner = reference.graderName ?? "";
    const argv = reference.command?.join("\0") ?? "";
    const fingerprint = JSON.stringify([reference.kind, location, owner, argv]);
    if (fingerprints.has(fingerprint)) {
      return false;
    }
    fingerprints.add(fingerprint);
    return true;
  });
}
|
|
16983
|
+
/**
 * Expresses `candidate` relative to `root` using forward slashes.
 *
 * @param {string} root - Directory the result is relative to.
 * @param {string} candidate - Path expected to live inside `root`.
 * @returns {string|undefined} Slash-separated relative path, or undefined when
 *   `candidate` equals `root`, lies outside it, or cannot be expressed
 *   relatively (path.relative returned an absolute path).
 */
function toPortableRelativePath(root, candidate) {
  const relative = path43.relative(root, candidate);
  if (!relative || path43.isAbsolute(relative)) {
    return void 0;
  }
  // Compare whole path segments: a plain startsWith("..") would also reject
  // in-root entries such as "..cache/eval.yaml" whose name merely begins with dots.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path43.sep}`);
  if (escapesRoot) {
    return void 0;
  }
  return relative.split(path43.sep).join("/");
}
|
|
16013
16990
|
async function loadTestById(evalFilePath, repoRoot, evalId) {
|
|
16014
16991
|
const tests = await loadTests(evalFilePath, repoRoot);
|
|
16015
16992
|
const match = tests.find((c) => c.id === evalId);
|
|
@@ -16102,7 +17079,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
16102
17079
|
const workspaceFilePath = path43.resolve(evalFileDir, raw);
|
|
16103
17080
|
let content;
|
|
16104
17081
|
try {
|
|
16105
|
-
content = await
|
|
17082
|
+
content = await readFile16(workspaceFilePath, "utf8");
|
|
16106
17083
|
} catch {
|
|
16107
17084
|
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
16108
17085
|
}
|
|
@@ -16226,19 +17203,18 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
16226
17203
|
function asString5(value) {
|
|
16227
17204
|
return typeof value === "string" ? value : void 0;
|
|
16228
17205
|
}
|
|
16229
|
-
function
|
|
17206
|
+
function extractSuiteMetadataPayload(suite) {
|
|
17207
|
+
const payload = isJsonObject(suite.metadata) ? { ...suite.metadata } : {};
|
|
16230
17208
|
const top = suite.governance;
|
|
16231
17209
|
if (isJsonObject(top)) {
|
|
16232
|
-
|
|
16233
|
-
}
|
|
16234
|
-
|
|
16235
|
-
if (isJsonObject(wrapper)) {
|
|
16236
|
-
const nested = wrapper.governance;
|
|
17210
|
+
payload.governance = top;
|
|
17211
|
+
} else {
|
|
17212
|
+
const nested = payload.governance;
|
|
16237
17213
|
if (isJsonObject(nested)) {
|
|
16238
|
-
|
|
17214
|
+
payload.governance = nested;
|
|
16239
17215
|
}
|
|
16240
17216
|
}
|
|
16241
|
-
return void 0;
|
|
17217
|
+
return Object.keys(payload).length > 0 ? payload : void 0;
|
|
16242
17218
|
}
|
|
16243
17219
|
function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
|
|
16244
17220
|
if (!suitePayload) return caseMetadata;
|
|
@@ -16729,7 +17705,7 @@ async function runEvaluation(options) {
|
|
|
16729
17705
|
const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
|
|
16730
17706
|
if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
|
|
16731
17707
|
if (!dirExists) {
|
|
16732
|
-
await
|
|
17708
|
+
await mkdir15(configuredStaticPath, { recursive: true });
|
|
16733
17709
|
}
|
|
16734
17710
|
if (workspaceTemplate) {
|
|
16735
17711
|
await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
|
|
@@ -16774,7 +17750,7 @@ async function runEvaluation(options) {
|
|
|
16774
17750
|
}
|
|
16775
17751
|
} else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
|
|
16776
17752
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
16777
|
-
await
|
|
17753
|
+
await mkdir15(sharedWorkspacePath, { recursive: true });
|
|
16778
17754
|
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
16779
17755
|
}
|
|
16780
17756
|
try {
|
|
@@ -17624,7 +18600,7 @@ async function runEvalCase(options) {
|
|
|
17624
18600
|
}
|
|
17625
18601
|
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
17626
18602
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
17627
|
-
await
|
|
18603
|
+
await mkdir15(workspacePath, { recursive: true });
|
|
17628
18604
|
}
|
|
17629
18605
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
17630
18606
|
const localPathErrors = RepoManager.validateLocalPaths(evalCase.workspace.repos);
|
|
@@ -17679,7 +18655,7 @@ async function runEvalCase(options) {
|
|
|
17679
18655
|
const srcPath = path44.resolve(baseDir, relPath);
|
|
17680
18656
|
const destPath = path44.resolve(workspacePath, relPath);
|
|
17681
18657
|
try {
|
|
17682
|
-
await
|
|
18658
|
+
await mkdir15(path44.dirname(destPath), { recursive: true });
|
|
17683
18659
|
await copyFile2(srcPath, destPath);
|
|
17684
18660
|
} catch (error) {
|
|
17685
18661
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -19247,6 +20223,12 @@ async function evaluate(config) {
|
|
|
19247
20223
|
resolvedTarget = resolveTargetDefinition(targetDef);
|
|
19248
20224
|
}
|
|
19249
20225
|
const collectedResults = [];
|
|
20226
|
+
const cacheEnabled = shouldEnableCache({
|
|
20227
|
+
cliCache: config.cache === true,
|
|
20228
|
+
cliNoCache: false,
|
|
20229
|
+
yamlCache: config.cache === void 0 ? materialized.cache : void 0
|
|
20230
|
+
});
|
|
20231
|
+
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path45.resolve(materialized.cachePath) : void 0) : void 0;
|
|
19250
20232
|
const results = await runEvaluation({
|
|
19251
20233
|
testFilePath,
|
|
19252
20234
|
repoRoot,
|
|
@@ -19259,6 +20241,8 @@ async function evaluate(config) {
|
|
|
19259
20241
|
filter: config.filter,
|
|
19260
20242
|
threshold: config.threshold,
|
|
19261
20243
|
evalCases: materialized.tests,
|
|
20244
|
+
cache,
|
|
20245
|
+
useCache: !!cache && !shouldSkipCacheForTemperature(resolvedTarget.config),
|
|
19262
20246
|
...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
|
|
19263
20247
|
onResult: async (result) => {
|
|
19264
20248
|
collectedResults.push(result);
|
|
@@ -19289,6 +20273,7 @@ async function materializeEvalConfig(config, options) {
|
|
|
19289
20273
|
tests: tests2,
|
|
19290
20274
|
workers: config.workers ?? suite.workers,
|
|
19291
20275
|
cache: config.cache ?? suite.cacheConfig?.enabled,
|
|
20276
|
+
cachePath: config.cachePath ?? suite.cacheConfig?.cachePath,
|
|
19292
20277
|
budgetUsd: config.budgetUsd ?? suite.budgetUsd,
|
|
19293
20278
|
threshold: config.threshold ?? suite.threshold,
|
|
19294
20279
|
metadata: config.metadata ?? suite.metadata,
|
|
@@ -19307,6 +20292,7 @@ async function materializeEvalConfig(config, options) {
|
|
|
19307
20292
|
tests,
|
|
19308
20293
|
workers: config.workers,
|
|
19309
20294
|
cache: config.cache,
|
|
20295
|
+
cachePath: config.cachePath,
|
|
19310
20296
|
budgetUsd: config.budgetUsd,
|
|
19311
20297
|
threshold: config.threshold,
|
|
19312
20298
|
metadata: config.metadata,
|
|
@@ -19424,9 +20410,11 @@ function mapAssertionType(type) {
|
|
|
19424
20410
|
}
|
|
19425
20411
|
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
19426
20412
|
const total = results.length;
|
|
20413
|
+
const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
|
|
20414
|
+
const executionErrors = total - qualityResults.length;
|
|
19427
20415
|
let passed = 0;
|
|
19428
20416
|
let scoreSum = 0;
|
|
19429
|
-
for (const r of
|
|
20417
|
+
for (const r of qualityResults) {
|
|
19430
20418
|
scoreSum += r.score;
|
|
19431
20419
|
if (r.score >= threshold) {
|
|
19432
20420
|
passed++;
|
|
@@ -19435,9 +20423,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
|
19435
20423
|
return {
|
|
19436
20424
|
total,
|
|
19437
20425
|
passed,
|
|
19438
|
-
failed:
|
|
20426
|
+
failed: qualityResults.length - passed,
|
|
20427
|
+
executionErrors,
|
|
19439
20428
|
durationMs,
|
|
19440
|
-
meanScore:
|
|
20429
|
+
meanScore: qualityResults.length > 0 ? scoreSum / qualityResults.length : 0
|
|
19441
20430
|
};
|
|
19442
20431
|
}
|
|
19443
20432
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
@@ -19520,7 +20509,12 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
|
|
|
19520
20509
|
return {
|
|
19521
20510
|
tests: materialized.tests,
|
|
19522
20511
|
...materialized.workers !== void 0 && { workers: materialized.workers },
|
|
19523
|
-
...materialized.cache !== void 0 && {
|
|
20512
|
+
...materialized.cache !== void 0 && {
|
|
20513
|
+
cacheConfig: {
|
|
20514
|
+
enabled: materialized.cache,
|
|
20515
|
+
...materialized.cachePath !== void 0 && { cachePath: materialized.cachePath }
|
|
20516
|
+
}
|
|
20517
|
+
},
|
|
19524
20518
|
...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
|
|
19525
20519
|
...materialized.threshold !== void 0 && { threshold: materialized.threshold },
|
|
19526
20520
|
...materialized.metadata !== void 0 && { metadata: materialized.metadata },
|
|
@@ -19543,7 +20537,28 @@ function isEvalConfigLike(value) {
|
|
|
19543
20537
|
}
|
|
19544
20538
|
|
|
19545
20539
|
export {
|
|
20540
|
+
NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
|
|
20541
|
+
NORMALIZED_TRACE_SOURCE_KINDS,
|
|
20542
|
+
NORMALIZED_TRACE_EVENT_TYPES,
|
|
20543
|
+
NORMALIZED_TOOL_STATUSES,
|
|
20544
|
+
NORMALIZED_REDACTION_LEVELS,
|
|
20545
|
+
NormalizedRedactionStateWireSchema,
|
|
20546
|
+
NormalizedTraceErrorWireSchema,
|
|
20547
|
+
NormalizedTraceSourceWireSchema,
|
|
20548
|
+
NormalizedTraceSessionWireSchema,
|
|
20549
|
+
NormalizedTraceBranchWireSchema,
|
|
20550
|
+
NormalizedTraceSourceRefWireSchema,
|
|
20551
|
+
NormalizedRawEvidenceWireSchema,
|
|
20552
|
+
NormalizedTraceMessageWireSchema,
|
|
20553
|
+
NormalizedTraceModelWireSchema,
|
|
20554
|
+
NormalizedTraceToolWireSchema,
|
|
20555
|
+
NormalizedTraceEventWireSchema,
|
|
20556
|
+
NormalizedTrajectoryWireSchema,
|
|
20557
|
+
toNormalizedTrajectoryWire,
|
|
20558
|
+
fromNormalizedTrajectoryWire,
|
|
19546
20559
|
computeTraceSummary,
|
|
20560
|
+
getSelectedTrajectoryEvents,
|
|
20561
|
+
computeTraceSummaryFromTrajectory,
|
|
19547
20562
|
DEFAULT_EXPLORATION_TOOLS,
|
|
19548
20563
|
explorationRatio,
|
|
19549
20564
|
tokensPerTool,
|
|
@@ -19551,13 +20566,6 @@ export {
|
|
|
19551
20566
|
mergeExecutionMetrics,
|
|
19552
20567
|
isAgentSkillsFormat,
|
|
19553
20568
|
parseAgentSkillsEvals,
|
|
19554
|
-
getAgentvConfigDir,
|
|
19555
|
-
getAgentvHome,
|
|
19556
|
-
getAgentvDataDir,
|
|
19557
|
-
getWorkspacesRoot,
|
|
19558
|
-
getSubagentsRoot,
|
|
19559
|
-
getTraceStateRoot,
|
|
19560
|
-
getWorkspacePoolRoot,
|
|
19561
20569
|
DEFAULT_EVAL_PATTERNS,
|
|
19562
20570
|
loadConfig,
|
|
19563
20571
|
extractTargetFromSuite,
|
|
@@ -19569,11 +20577,15 @@ export {
|
|
|
19569
20577
|
extractCacheConfig,
|
|
19570
20578
|
extractFailOnError,
|
|
19571
20579
|
extractThreshold,
|
|
20580
|
+
resolveResultsConfigForProject,
|
|
19572
20581
|
detectFormat,
|
|
19573
20582
|
parseRepoSource,
|
|
19574
20583
|
parseRepoCheckout,
|
|
19575
20584
|
parseRepoClone,
|
|
19576
20585
|
buildPromptInputs,
|
|
20586
|
+
ResponseCache,
|
|
20587
|
+
shouldEnableCache,
|
|
20588
|
+
shouldSkipCacheForTemperature,
|
|
19577
20589
|
DEFAULT_THRESHOLD,
|
|
19578
20590
|
PASS_THRESHOLD,
|
|
19579
20591
|
scoreToVerdict,
|
|
@@ -19676,4 +20688,4 @@ export {
|
|
|
19676
20688
|
loadTestById,
|
|
19677
20689
|
loadEvalCaseById
|
|
19678
20690
|
};
|
|
19679
|
-
//# sourceMappingURL=chunk-
|
|
20691
|
+
//# sourceMappingURL=chunk-7QB53OPK.js.map
|