agentv 4.32.0-next.1 → 4.34.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -5
- package/dist/{artifact-writer-VDF7KRWL.js → artifact-writer-UWZX5JKX.js} +4 -4
- package/dist/{chunk-TAZBCVEZ.js → chunk-6QEIZ33V.js} +1340 -279
- package/dist/chunk-6QEIZ33V.js.map +1 -0
- package/dist/{chunk-IGTRNQAM.js → chunk-FK5FLLME.js} +2383 -674
- package/dist/chunk-FK5FLLME.js.map +1 -0
- package/dist/chunk-GPRZ7XSC.js +1234 -0
- package/dist/chunk-GPRZ7XSC.js.map +1 -0
- package/dist/{chunk-5JMFFG36.js → chunk-KMO527KH.js} +784 -1081
- package/dist/chunk-KMO527KH.js.map +1 -0
- package/dist/{chunk-LX5AK3P7.js → chunk-KP4SPQ2M.js} +585 -191
- package/dist/chunk-KP4SPQ2M.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/dashboard/assets/{index-BdoQWnyM.js → index-79OddHgT.js} +1 -1
- package/dist/dashboard/assets/index-BPMAZqjE.css +1 -0
- package/dist/dashboard/assets/index-BycNIWwy.js +118 -0
- package/dist/dashboard/index.html +3 -3
- package/dist/{dist-GICSKMNP.js → dist-Z5VWSDOO.js} +58 -6
- package/dist/index.js +5 -5
- package/dist/{interactive-GIDBBDYZ.js → interactive-NTT2QLPR.js} +5 -5
- package/dist/skills/agentv-eval-writer/SKILL.md +2 -1
- package/dist/skills/agentv-eval-writer/references/eval-schema.json +104 -0
- package/dist/skills/agentv-eval-writer/references/rubric-evaluator.md +20 -0
- package/dist/{ts-eval-loader-Z6IUSDNA-YBOE4JIQ.js → ts-eval-loader-EQJX3OLT-THE7D3GR.js} +2 -2
- package/package.json +2 -2
- package/dist/chunk-2ZEY3WBH.js +0 -729
- package/dist/chunk-2ZEY3WBH.js.map +0 -1
- package/dist/chunk-5JMFFG36.js.map +0 -1
- package/dist/chunk-IGTRNQAM.js.map +0 -1
- package/dist/chunk-LX5AK3P7.js.map +0 -1
- package/dist/chunk-TAZBCVEZ.js.map +0 -1
- package/dist/dashboard/assets/index-DcPH8PyS.css +0 -1
- package/dist/dashboard/assets/index-EXkiwqam.js +0 -116
- /package/dist/{artifact-writer-VDF7KRWL.js.map → artifact-writer-UWZX5JKX.js.map} +0 -0
- /package/dist/{dist-GICSKMNP.js.map → dist-Z5VWSDOO.js.map} +0 -0
- /package/dist/{interactive-GIDBBDYZ.js.map → interactive-NTT2QLPR.js.map} +0 -0
- /package/dist/{ts-eval-loader-Z6IUSDNA-YBOE4JIQ.js.map → ts-eval-loader-EQJX3OLT-THE7D3GR.js.map} +0 -0
|
@@ -4056,16 +4056,18 @@ var coerce = {
|
|
|
4056
4056
|
};
|
|
4057
4057
|
var NEVER = INVALID;
|
|
4058
4058
|
|
|
4059
|
-
// ../../packages/core/dist/chunk-
|
|
4059
|
+
// ../../packages/core/dist/chunk-EW5X2RGJ.js
|
|
4060
4060
|
import { parse } from "yaml";
|
|
4061
|
+
import os from "node:os";
|
|
4062
|
+
import path from "node:path";
|
|
4061
4063
|
import { constants } from "node:fs";
|
|
4062
4064
|
import { access, readFile } from "node:fs/promises";
|
|
4063
|
-
import
|
|
4065
|
+
import path2 from "node:path";
|
|
4064
4066
|
import { existsSync, readFileSync } from "node:fs";
|
|
4065
4067
|
import { homedir } from "node:os";
|
|
4066
|
-
import path2 from "node:path";
|
|
4067
|
-
import { readFile as readFile2, readdir, stat } from "node:fs/promises";
|
|
4068
4068
|
import path3 from "node:path";
|
|
4069
|
+
import { readFile as readFile2, readdir, stat } from "node:fs/promises";
|
|
4070
|
+
import path4 from "node:path";
|
|
4069
4071
|
import fg from "fast-glob";
|
|
4070
4072
|
var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);
|
|
4071
4073
|
function isContent(value) {
|
|
@@ -4164,10 +4166,37 @@ var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
|
|
|
4164
4166
|
function isGraderKind(value) {
|
|
4165
4167
|
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
4166
4168
|
}
|
|
4169
|
+
var RUBRIC_OPERATOR_VALUES = ["correctness", "contradiction"];
|
|
4167
4170
|
var PARSE_OPTIONS = { merge: true };
|
|
4168
4171
|
function parseYamlValue(content) {
|
|
4169
4172
|
return parse(content, PARSE_OPTIONS);
|
|
4170
4173
|
}
|
|
4174
|
+
function readEnvPath(name) {
|
|
4175
|
+
const value = process.env[name];
|
|
4176
|
+
if (!value || value === "undefined") return void 0;
|
|
4177
|
+
return value;
|
|
4178
|
+
}
|
|
4179
|
+
function getAgentvConfigDir() {
|
|
4180
|
+
return readEnvPath("AGENTV_HOME") ?? path.join(os.homedir(), ".agentv");
|
|
4181
|
+
}
|
|
4182
|
+
function getAgentvHome() {
|
|
4183
|
+
return getAgentvConfigDir();
|
|
4184
|
+
}
|
|
4185
|
+
function getAgentvDataDir() {
|
|
4186
|
+
return readEnvPath("AGENTV_DATA_DIR") ?? getAgentvConfigDir();
|
|
4187
|
+
}
|
|
4188
|
+
function getWorkspacesRoot() {
|
|
4189
|
+
return path.join(getAgentvDataDir(), "workspaces");
|
|
4190
|
+
}
|
|
4191
|
+
function getSubagentsRoot() {
|
|
4192
|
+
return path.join(getAgentvDataDir(), "subagents");
|
|
4193
|
+
}
|
|
4194
|
+
function getTraceStateRoot() {
|
|
4195
|
+
return path.join(getAgentvDataDir(), "trace-state");
|
|
4196
|
+
}
|
|
4197
|
+
function getWorkspacePoolRoot() {
|
|
4198
|
+
return path.join(getAgentvDataDir(), "workspace-pool");
|
|
4199
|
+
}
|
|
4171
4200
|
async function fileExists(filePath) {
|
|
4172
4201
|
try {
|
|
4173
4202
|
await access(filePath, constants.F_OK);
|
|
@@ -4188,14 +4217,14 @@ async function readJsonFile(filePath) {
|
|
|
4188
4217
|
return JSON.parse(content);
|
|
4189
4218
|
}
|
|
4190
4219
|
async function findGitRoot(startPath) {
|
|
4191
|
-
let currentDir =
|
|
4192
|
-
const root =
|
|
4220
|
+
let currentDir = path2.dirname(path2.resolve(startPath));
|
|
4221
|
+
const root = path2.parse(currentDir).root;
|
|
4193
4222
|
while (currentDir !== root) {
|
|
4194
|
-
const gitPath =
|
|
4223
|
+
const gitPath = path2.join(currentDir, ".git");
|
|
4195
4224
|
if (await fileExists(gitPath)) {
|
|
4196
4225
|
return currentDir;
|
|
4197
4226
|
}
|
|
4198
|
-
const parentDir =
|
|
4227
|
+
const parentDir = path2.dirname(currentDir);
|
|
4199
4228
|
if (parentDir === currentDir) {
|
|
4200
4229
|
break;
|
|
4201
4230
|
}
|
|
@@ -4206,8 +4235,8 @@ async function findGitRoot(startPath) {
|
|
|
4206
4235
|
function buildDirectoryChain(filePath, repoRoot) {
|
|
4207
4236
|
const directories = [];
|
|
4208
4237
|
const seen = /* @__PURE__ */ new Set();
|
|
4209
|
-
const boundary =
|
|
4210
|
-
let current =
|
|
4238
|
+
const boundary = path2.resolve(repoRoot);
|
|
4239
|
+
let current = path2.resolve(path2.dirname(filePath));
|
|
4211
4240
|
while (current !== void 0) {
|
|
4212
4241
|
if (!seen.has(current)) {
|
|
4213
4242
|
directories.push(current);
|
|
@@ -4216,7 +4245,7 @@ function buildDirectoryChain(filePath, repoRoot) {
|
|
|
4216
4245
|
if (current === boundary) {
|
|
4217
4246
|
break;
|
|
4218
4247
|
}
|
|
4219
|
-
const parent =
|
|
4248
|
+
const parent = path2.dirname(current);
|
|
4220
4249
|
if (parent === current) {
|
|
4221
4250
|
break;
|
|
4222
4251
|
}
|
|
@@ -4230,16 +4259,16 @@ function buildDirectoryChain(filePath, repoRoot) {
|
|
|
4230
4259
|
function buildSearchRoots(evalPath, repoRoot) {
|
|
4231
4260
|
const uniqueRoots = [];
|
|
4232
4261
|
const addRoot = (root) => {
|
|
4233
|
-
const normalized =
|
|
4262
|
+
const normalized = path2.resolve(root);
|
|
4234
4263
|
if (!uniqueRoots.includes(normalized)) {
|
|
4235
4264
|
uniqueRoots.push(normalized);
|
|
4236
4265
|
}
|
|
4237
4266
|
};
|
|
4238
|
-
let currentDir =
|
|
4267
|
+
let currentDir = path2.dirname(evalPath);
|
|
4239
4268
|
let reachedBoundary = false;
|
|
4240
4269
|
while (!reachedBoundary) {
|
|
4241
4270
|
addRoot(currentDir);
|
|
4242
|
-
const parentDir =
|
|
4271
|
+
const parentDir = path2.dirname(currentDir);
|
|
4243
4272
|
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
4244
4273
|
reachedBoundary = true;
|
|
4245
4274
|
} else {
|
|
@@ -4257,16 +4286,16 @@ function trimLeadingSeparators(value) {
|
|
|
4257
4286
|
async function resolveFileReference(rawValue, searchRoots) {
|
|
4258
4287
|
const displayPath = trimLeadingSeparators(rawValue);
|
|
4259
4288
|
const potentialPaths = [];
|
|
4260
|
-
if (
|
|
4261
|
-
potentialPaths.push(
|
|
4289
|
+
if (path2.isAbsolute(rawValue)) {
|
|
4290
|
+
potentialPaths.push(path2.normalize(rawValue));
|
|
4262
4291
|
}
|
|
4263
4292
|
for (const base of searchRoots) {
|
|
4264
|
-
potentialPaths.push(
|
|
4293
|
+
potentialPaths.push(path2.resolve(base, displayPath));
|
|
4265
4294
|
}
|
|
4266
4295
|
const attempted = [];
|
|
4267
4296
|
const seen = /* @__PURE__ */ new Set();
|
|
4268
4297
|
for (const candidate of potentialPaths) {
|
|
4269
|
-
const absoluteCandidate =
|
|
4298
|
+
const absoluteCandidate = path2.resolve(candidate);
|
|
4270
4299
|
if (seen.has(absoluteCandidate)) {
|
|
4271
4300
|
continue;
|
|
4272
4301
|
}
|
|
@@ -4448,11 +4477,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
4448
4477
|
allowLiteral: true,
|
|
4449
4478
|
optionalEnv: true
|
|
4450
4479
|
});
|
|
4451
|
-
if (cwd && evalFilePath && !
|
|
4452
|
-
cwd =
|
|
4480
|
+
if (cwd && evalFilePath && !path3.isAbsolute(cwd)) {
|
|
4481
|
+
cwd = path3.resolve(path3.dirname(path3.resolve(evalFilePath)), cwd);
|
|
4453
4482
|
}
|
|
4454
4483
|
if (!cwd && evalFilePath) {
|
|
4455
|
-
cwd =
|
|
4484
|
+
cwd = path3.dirname(path3.resolve(evalFilePath));
|
|
4456
4485
|
}
|
|
4457
4486
|
return {
|
|
4458
4487
|
command,
|
|
@@ -4469,11 +4498,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
4469
4498
|
allowLiteral: true,
|
|
4470
4499
|
optionalEnv: true
|
|
4471
4500
|
});
|
|
4472
|
-
if (cwd && evalFilePath && !
|
|
4473
|
-
cwd =
|
|
4501
|
+
if (cwd && evalFilePath && !path3.isAbsolute(cwd)) {
|
|
4502
|
+
cwd = path3.resolve(path3.dirname(path3.resolve(evalFilePath)), cwd);
|
|
4474
4503
|
}
|
|
4475
4504
|
if (!cwd && evalFilePath) {
|
|
4476
|
-
cwd =
|
|
4505
|
+
cwd = path3.dirname(path3.resolve(evalFilePath));
|
|
4477
4506
|
}
|
|
4478
4507
|
const timeoutSeconds = input.timeout_seconds;
|
|
4479
4508
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
@@ -4531,7 +4560,15 @@ var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
|
4531
4560
|
["retryInitialDelayMs", "retry_initial_delay_ms"],
|
|
4532
4561
|
["retryMaxDelayMs", "retry_max_delay_ms"],
|
|
4533
4562
|
["retryBackoffFactor", "retry_backoff_factor"],
|
|
4534
|
-
["retryStatusCodes", "retry_status_codes"]
|
|
4563
|
+
["retryStatusCodes", "retry_status_codes"],
|
|
4564
|
+
["modelReasoningEffort", "model_reasoning_effort"]
|
|
4565
|
+
]);
|
|
4566
|
+
var CODEX_MODEL_REASONING_EFFORT_VALUES = /* @__PURE__ */ new Set([
|
|
4567
|
+
"minimal",
|
|
4568
|
+
"low",
|
|
4569
|
+
"medium",
|
|
4570
|
+
"high",
|
|
4571
|
+
"xhigh"
|
|
4535
4572
|
]);
|
|
4536
4573
|
var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
4537
4574
|
["timeoutSeconds", "timeout_seconds"]
|
|
@@ -4869,6 +4906,9 @@ function normalizeOpenAIBaseUrl(value) {
|
|
|
4869
4906
|
if (trimmed.length === 0) {
|
|
4870
4907
|
return DEFAULT_OPENAI_BASE_URL;
|
|
4871
4908
|
}
|
|
4909
|
+
if (/\.openai\.azure\.com\/openai\/deployments\/[^/]+$/i.test(trimmed)) {
|
|
4910
|
+
return trimmed;
|
|
4911
|
+
}
|
|
4872
4912
|
return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
|
|
4873
4913
|
}
|
|
4874
4914
|
function resolveAzureConfig(target, env) {
|
|
@@ -4997,22 +5037,34 @@ function resolveGeminiConfig(target, env) {
|
|
|
4997
5037
|
}
|
|
4998
5038
|
function resolveCodexConfig(target, env, _evalFilePath) {
|
|
4999
5039
|
const modelSource = target.model;
|
|
5040
|
+
const modelReasoningEffortSource = target.model_reasoning_effort;
|
|
5000
5041
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
5001
5042
|
const argsSource = target.args ?? target.arguments;
|
|
5002
5043
|
const cwdSource = target.cwd;
|
|
5003
5044
|
const timeoutSource = target.timeout_seconds;
|
|
5004
5045
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
5005
|
-
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
5006
5046
|
const systemPromptSource = target.system_prompt;
|
|
5007
|
-
|
|
5008
|
-
|
|
5009
|
-
|
|
5010
|
-
|
|
5047
|
+
if (target.log_format !== void 0 || target.log_output_format !== void 0) {
|
|
5048
|
+
throw new Error(
|
|
5049
|
+
`${target.name}: log_format is no longer supported for codex targets. Use stream_log instead.`
|
|
5050
|
+
);
|
|
5011
5051
|
}
|
|
5052
|
+
const streamLogResult = resolveStreamLog({ name: target.name, stream_log: target.stream_log });
|
|
5012
5053
|
const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
|
|
5013
5054
|
allowLiteral: true,
|
|
5014
5055
|
optionalEnv: true
|
|
5015
5056
|
});
|
|
5057
|
+
const modelReasoningEffort = normalizeCodexModelReasoningEffort(
|
|
5058
|
+
resolveOptionalString(
|
|
5059
|
+
modelReasoningEffortSource,
|
|
5060
|
+
env,
|
|
5061
|
+
`${target.name} codex model reasoning effort`,
|
|
5062
|
+
{
|
|
5063
|
+
allowLiteral: true,
|
|
5064
|
+
optionalEnv: true
|
|
5065
|
+
}
|
|
5066
|
+
)
|
|
5067
|
+
);
|
|
5016
5068
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
5017
5069
|
allowLiteral: true,
|
|
5018
5070
|
optionalEnv: true
|
|
@@ -5027,32 +5079,30 @@ function resolveCodexConfig(target, env, _evalFilePath) {
|
|
|
5027
5079
|
allowLiteral: true,
|
|
5028
5080
|
optionalEnv: true
|
|
5029
5081
|
});
|
|
5030
|
-
const logFormat = normalizeCodexLogFormat(logFormatSource);
|
|
5031
5082
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5032
5083
|
return {
|
|
5033
5084
|
model,
|
|
5085
|
+
modelReasoningEffort,
|
|
5034
5086
|
executable,
|
|
5035
5087
|
args,
|
|
5036
5088
|
cwd,
|
|
5037
5089
|
timeoutMs,
|
|
5038
5090
|
logDir,
|
|
5039
|
-
logFormat,
|
|
5040
5091
|
streamLog: streamLogResult.streamLog,
|
|
5041
5092
|
systemPrompt
|
|
5042
5093
|
};
|
|
5043
5094
|
}
|
|
5044
|
-
function
|
|
5045
|
-
if (value === void 0
|
|
5095
|
+
function normalizeCodexModelReasoningEffort(value) {
|
|
5096
|
+
if (value === void 0) {
|
|
5046
5097
|
return void 0;
|
|
5047
5098
|
}
|
|
5048
|
-
if (typeof value !== "string") {
|
|
5049
|
-
throw new Error("codex log format must be 'summary' or 'json'");
|
|
5050
|
-
}
|
|
5051
5099
|
const normalized = value.trim().toLowerCase();
|
|
5052
|
-
if (normalized
|
|
5100
|
+
if (CODEX_MODEL_REASONING_EFFORT_VALUES.has(normalized)) {
|
|
5053
5101
|
return normalized;
|
|
5054
5102
|
}
|
|
5055
|
-
throw new Error(
|
|
5103
|
+
throw new Error(
|
|
5104
|
+
`codex model_reasoning_effort must be one of: ${[...CODEX_MODEL_REASONING_EFFORT_VALUES].join(", ")}`
|
|
5105
|
+
);
|
|
5056
5106
|
}
|
|
5057
5107
|
function resolveStreamLog(target, envFallback) {
|
|
5058
5108
|
if (target.stream_log !== void 0 && target.stream_log !== null) {
|
|
@@ -5461,7 +5511,7 @@ function resolveClaudeConfig(target, env, _evalFilePath) {
|
|
|
5461
5511
|
};
|
|
5462
5512
|
}
|
|
5463
5513
|
function resolveCcMirrorBinaryPath(variant) {
|
|
5464
|
-
const variantJsonPath =
|
|
5514
|
+
const variantJsonPath = path3.join(homedir(), ".cc-mirror", variant, "variant.json");
|
|
5465
5515
|
if (!existsSync(variantJsonPath)) {
|
|
5466
5516
|
throw new Error(
|
|
5467
5517
|
`cc-mirror variant "${variant}": ${variantJsonPath} not found. Install the variant or set "executable" explicitly.`
|
|
@@ -5538,8 +5588,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
5538
5588
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
5539
5589
|
if (!parseResult.success) {
|
|
5540
5590
|
const firstError = parseResult.error.errors[0];
|
|
5541
|
-
const
|
|
5542
|
-
const prefix =
|
|
5591
|
+
const path53 = firstError?.path.join(".") || "";
|
|
5592
|
+
const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
|
|
5543
5593
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
5544
5594
|
}
|
|
5545
5595
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -5560,11 +5610,11 @@ function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath
|
|
|
5560
5610
|
allowLiteral: true,
|
|
5561
5611
|
optionalEnv: true
|
|
5562
5612
|
});
|
|
5563
|
-
if (cwd && evalFilePath && !
|
|
5564
|
-
cwd =
|
|
5613
|
+
if (cwd && evalFilePath && !path3.isAbsolute(cwd)) {
|
|
5614
|
+
cwd = path3.resolve(path3.dirname(path3.resolve(evalFilePath)), cwd);
|
|
5565
5615
|
}
|
|
5566
5616
|
if (!cwd && evalFilePath) {
|
|
5567
|
-
cwd =
|
|
5617
|
+
cwd = path3.dirname(path3.resolve(evalFilePath));
|
|
5568
5618
|
}
|
|
5569
5619
|
return {
|
|
5570
5620
|
command,
|
|
@@ -5918,7 +5968,7 @@ function parseJsonlCases(content, filePath) {
|
|
|
5918
5968
|
return results;
|
|
5919
5969
|
}
|
|
5920
5970
|
async function loadCasesFromFile(filePath) {
|
|
5921
|
-
const ext =
|
|
5971
|
+
const ext = path4.extname(filePath).toLowerCase();
|
|
5922
5972
|
let content;
|
|
5923
5973
|
try {
|
|
5924
5974
|
content = await readFile2(filePath, "utf8");
|
|
@@ -5945,7 +5995,7 @@ async function loadCasesFromFile(filePath) {
|
|
|
5945
5995
|
}
|
|
5946
5996
|
async function resolveFileReference2(ref, evalFileDir) {
|
|
5947
5997
|
const rawPath = extractFilePath(ref);
|
|
5948
|
-
const absolutePattern =
|
|
5998
|
+
const absolutePattern = path4.resolve(evalFileDir, rawPath);
|
|
5949
5999
|
if (isGlobPattern(rawPath)) {
|
|
5950
6000
|
const matches = await fg(absolutePattern.replaceAll("\\", "/"), {
|
|
5951
6001
|
onlyFiles: true,
|
|
@@ -5972,10 +6022,10 @@ async function loadCasesFromDirectory(dirPath) {
|
|
|
5972
6022
|
const subdirs = entries.filter((e) => e.isDirectory()).sort((a, b) => a.name < b.name ? -1 : a.name > b.name ? 1 : 0);
|
|
5973
6023
|
const results = [];
|
|
5974
6024
|
for (const subdir of subdirs) {
|
|
5975
|
-
const subdirPath =
|
|
6025
|
+
const subdirPath = path4.join(dirPath, subdir.name);
|
|
5976
6026
|
let caseFilePath;
|
|
5977
6027
|
for (const filename of ["case.yaml", "case.yml"]) {
|
|
5978
|
-
const candidate =
|
|
6028
|
+
const candidate = path4.join(subdirPath, filename);
|
|
5979
6029
|
try {
|
|
5980
6030
|
const s = await stat(candidate);
|
|
5981
6031
|
if (s.isFile()) {
|
|
@@ -6011,7 +6061,7 @@ async function loadCasesFromDirectory(dirPath) {
|
|
|
6011
6061
|
caseObj.id = subdir.name;
|
|
6012
6062
|
}
|
|
6013
6063
|
if (!caseObj.workspace) {
|
|
6014
|
-
const workspaceDirPath =
|
|
6064
|
+
const workspaceDirPath = path4.join(subdirPath, "workspace");
|
|
6015
6065
|
try {
|
|
6016
6066
|
const s = await stat(workspaceDirPath);
|
|
6017
6067
|
if (s.isDirectory()) {
|
|
@@ -6037,40 +6087,40 @@ async function expandFileReferences(tests, evalFileDir) {
|
|
|
6037
6087
|
return expanded;
|
|
6038
6088
|
}
|
|
6039
6089
|
|
|
6040
|
-
// ../../packages/core/dist/chunk-
|
|
6090
|
+
// ../../packages/core/dist/chunk-7QB53OPK.js
|
|
6041
6091
|
import path46 from "node:path";
|
|
6042
6092
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
6043
6093
|
import { existsSync as existsSync6 } from "node:fs";
|
|
6044
6094
|
import path45 from "node:path";
|
|
6045
6095
|
import micromatch4 from "micromatch";
|
|
6096
|
+
import { mkdir, readFile as readFile3, writeFile } from "node:fs/promises";
|
|
6097
|
+
import path5 from "node:path";
|
|
6046
6098
|
import { execFile as execFile3 } from "node:child_process";
|
|
6047
6099
|
import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
|
|
6048
6100
|
import { existsSync as existsSync5 } from "node:fs";
|
|
6049
|
-
import { copyFile as copyFile2, mkdir as
|
|
6101
|
+
import { copyFile as copyFile2, mkdir as mkdir15, readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
6050
6102
|
import path44 from "node:path";
|
|
6051
6103
|
import { promisify as promisify7 } from "node:util";
|
|
6052
6104
|
import micromatch3 from "micromatch";
|
|
6053
|
-
import
|
|
6054
|
-
import path4 from "node:path";
|
|
6055
|
-
import { mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
6105
|
+
import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
|
|
6056
6106
|
import { tmpdir } from "node:os";
|
|
6057
6107
|
import { dirname, join } from "node:path";
|
|
6058
6108
|
import { randomBytes } from "node:crypto";
|
|
6059
6109
|
import { createServer } from "node:http";
|
|
6060
6110
|
import fs from "node:fs/promises";
|
|
6061
6111
|
import path32 from "node:path";
|
|
6062
|
-
import { readFile as
|
|
6112
|
+
import { readFile as readFile22 } from "node:fs/promises";
|
|
6063
6113
|
import path22 from "node:path";
|
|
6064
6114
|
import { fileURLToPath } from "node:url";
|
|
6065
6115
|
import { spawn } from "node:child_process";
|
|
6066
6116
|
import { randomUUID } from "node:crypto";
|
|
6067
6117
|
import { createWriteStream } from "node:fs";
|
|
6068
|
-
import { mkdir } from "node:fs/promises";
|
|
6069
|
-
import
|
|
6118
|
+
import { mkdir as mkdir2 } from "node:fs/promises";
|
|
6119
|
+
import path52 from "node:path";
|
|
6070
6120
|
import path42 from "node:path";
|
|
6071
6121
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
6072
6122
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
6073
|
-
import { mkdir as
|
|
6123
|
+
import { mkdir as mkdir3 } from "node:fs/promises";
|
|
6074
6124
|
import path6 from "node:path";
|
|
6075
6125
|
import { exec as execWithCallback } from "node:child_process";
|
|
6076
6126
|
import fs2 from "node:fs/promises";
|
|
@@ -6079,10 +6129,10 @@ import path7 from "node:path";
|
|
|
6079
6129
|
import { promisify } from "node:util";
|
|
6080
6130
|
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
6081
6131
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
6082
|
-
import { mkdir as
|
|
6132
|
+
import { mkdir as mkdir4 } from "node:fs/promises";
|
|
6083
6133
|
import path8 from "node:path";
|
|
6084
6134
|
import { randomUUID as randomUUID5 } from "node:crypto";
|
|
6085
|
-
import { mkdir as
|
|
6135
|
+
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
6086
6136
|
import { homedir as homedir2 } from "node:os";
|
|
6087
6137
|
import path11 from "node:path";
|
|
6088
6138
|
import { Readable, Writable } from "node:stream";
|
|
@@ -18704,10 +18754,10 @@ var RequestError = class _RequestError extends Error {
|
|
|
18704
18754
|
}
|
|
18705
18755
|
};
|
|
18706
18756
|
|
|
18707
|
-
// ../../packages/core/dist/chunk-
|
|
18757
|
+
// ../../packages/core/dist/chunk-7QB53OPK.js
|
|
18708
18758
|
import { exec as execCallback } from "node:child_process";
|
|
18709
18759
|
import { readdirSync, statSync } from "node:fs";
|
|
18710
|
-
import { readFile as
|
|
18760
|
+
import { readFile as readFile32, readdir as readdir2, stat as stat2 } from "node:fs/promises";
|
|
18711
18761
|
import path9 from "node:path";
|
|
18712
18762
|
import { promisify as promisify2 } from "node:util";
|
|
18713
18763
|
import { randomUUID as randomUUID4 } from "node:crypto";
|
|
@@ -18715,26 +18765,26 @@ import { createWriteStream as createWriteStream4, existsSync as existsSync2, rea
|
|
|
18715
18765
|
import { arch, homedir as homedir3, platform } from "node:os";
|
|
18716
18766
|
import path10 from "node:path";
|
|
18717
18767
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
18718
|
-
import { readFile as
|
|
18768
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
18719
18769
|
import { homedir as homedir4 } from "node:os";
|
|
18720
18770
|
import path13 from "node:path";
|
|
18721
|
-
import { readFile as
|
|
18771
|
+
import { readFile as readFile4, readdir as readdir22, stat as stat22 } from "node:fs/promises";
|
|
18722
18772
|
import { homedir as homedir32 } from "node:os";
|
|
18723
18773
|
import path12 from "node:path";
|
|
18724
18774
|
import { randomUUID as randomUUID6 } from "node:crypto";
|
|
18725
18775
|
import { existsSync as existsSync22 } from "node:fs";
|
|
18726
|
-
import { mkdir as
|
|
18776
|
+
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
18727
18777
|
import path14 from "node:path";
|
|
18728
18778
|
import { execSync, spawn as spawn3 } from "node:child_process";
|
|
18729
18779
|
import { randomUUID as randomUUID7 } from "node:crypto";
|
|
18730
18780
|
import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
|
|
18731
|
-
import { mkdir as
|
|
18781
|
+
import { mkdir as mkdir7, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile3 } from "node:fs/promises";
|
|
18732
18782
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
18733
18783
|
import path15 from "node:path";
|
|
18734
18784
|
import { execSync as execSync2 } from "node:child_process";
|
|
18735
18785
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
18736
18786
|
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
18737
|
-
import { mkdir as
|
|
18787
|
+
import { mkdir as mkdir8 } from "node:fs/promises";
|
|
18738
18788
|
import path16 from "node:path";
|
|
18739
18789
|
import { createInterface } from "node:readline";
|
|
18740
18790
|
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
@@ -18742,28 +18792,28 @@ import { exec as exec2 } from "node:child_process";
|
|
|
18742
18792
|
import { constants as constants2, access as access2 } from "node:fs/promises";
|
|
18743
18793
|
import path27 from "node:path";
|
|
18744
18794
|
import { promisify as promisify4 } from "node:util";
|
|
18745
|
-
import { stat as stat5, writeFile as
|
|
18795
|
+
import { stat as stat5, writeFile as writeFile6 } from "node:fs/promises";
|
|
18746
18796
|
import path25 from "node:path";
|
|
18747
18797
|
import { constants as constants3 } from "node:fs";
|
|
18748
|
-
import { access as access3, mkdir as
|
|
18798
|
+
import { access as access3, mkdir as mkdir9, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
|
|
18749
18799
|
import path17 from "node:path";
|
|
18750
18800
|
import path18 from "node:path";
|
|
18751
18801
|
import path19 from "node:path";
|
|
18752
|
-
import { readFile as
|
|
18802
|
+
import { readFile as readFile6 } from "node:fs/promises";
|
|
18753
18803
|
import path20 from "node:path";
|
|
18754
18804
|
import { exec, spawn as spawn4 } from "node:child_process";
|
|
18755
|
-
import { mkdir as
|
|
18805
|
+
import { mkdir as mkdir10, writeFile as writeFile4 } from "node:fs/promises";
|
|
18756
18806
|
import path222 from "node:path";
|
|
18757
18807
|
import { promisify as promisify3 } from "node:util";
|
|
18758
18808
|
import path21 from "node:path";
|
|
18759
|
-
import { copyFile, mkdir as
|
|
18809
|
+
import { copyFile, mkdir as mkdir11, readFile as readFile7, readdir as readdir4, stat as stat4, writeFile as writeFile5 } from "node:fs/promises";
|
|
18760
18810
|
import path24 from "node:path";
|
|
18761
18811
|
import path23 from "node:path";
|
|
18762
18812
|
import JSON5 from "json5";
|
|
18763
|
-
import { writeFile as
|
|
18813
|
+
import { writeFile as writeFile7 } from "node:fs/promises";
|
|
18764
18814
|
import path26 from "node:path";
|
|
18765
18815
|
import { constants as constants32 } from "node:fs";
|
|
18766
|
-
import { access as access32, readFile as
|
|
18816
|
+
import { access as access32, readFile as readFile8 } from "node:fs/promises";
|
|
18767
18817
|
import path28 from "node:path";
|
|
18768
18818
|
import path29 from "node:path";
|
|
18769
18819
|
import fg2 from "fast-glob";
|
|
@@ -18772,12 +18822,12 @@ import path31 from "node:path";
|
|
|
18772
18822
|
import fg22 from "fast-glob";
|
|
18773
18823
|
import path322 from "node:path";
|
|
18774
18824
|
import fg3 from "fast-glob";
|
|
18775
|
-
import { cp, mkdir as
|
|
18825
|
+
import { cp, mkdir as mkdir13, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
|
|
18776
18826
|
import path33 from "node:path";
|
|
18777
18827
|
import { execFile } from "node:child_process";
|
|
18778
18828
|
import { createHash } from "node:crypto";
|
|
18779
18829
|
import { existsSync as existsSync3 } from "node:fs";
|
|
18780
|
-
import { cp as cp2, mkdir as
|
|
18830
|
+
import { cp as cp2, mkdir as mkdir14, readFile as readFile9, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile8 } from "node:fs/promises";
|
|
18781
18831
|
import path34 from "node:path";
|
|
18782
18832
|
import { promisify as promisify5 } from "node:util";
|
|
18783
18833
|
import { execFile as execFile2 } from "node:child_process";
|
|
@@ -18786,25 +18836,65 @@ import path35 from "node:path";
|
|
|
18786
18836
|
import { promisify as promisify6 } from "node:util";
|
|
18787
18837
|
import { readdir as readdir7, stat as stat7 } from "node:fs/promises";
|
|
18788
18838
|
import path36 from "node:path";
|
|
18789
|
-
import { readFile as
|
|
18839
|
+
import { readFile as readFile16, stat as stat8 } from "node:fs/promises";
|
|
18790
18840
|
import path43 from "node:path";
|
|
18791
18841
|
import micromatch2 from "micromatch";
|
|
18792
|
-
import {
|
|
18793
|
-
import path37 from "node:path";
|
|
18842
|
+
import { stringify as stringifyYaml } from "yaml";
|
|
18794
18843
|
import { readFile as readFile10 } from "node:fs/promises";
|
|
18844
|
+
import path37 from "node:path";
|
|
18845
|
+
import { readFile as readFile11 } from "node:fs/promises";
|
|
18795
18846
|
import path39 from "node:path";
|
|
18796
18847
|
import { constants as constants4 } from "node:fs";
|
|
18797
18848
|
import { access as access4 } from "node:fs/promises";
|
|
18798
18849
|
import path38 from "node:path";
|
|
18799
18850
|
import { fileURLToPath as fileURLToPath4 } from "node:url";
|
|
18800
|
-
import { readFile as
|
|
18851
|
+
import { readFile as readFile13 } from "node:fs/promises";
|
|
18801
18852
|
import path40 from "node:path";
|
|
18802
|
-
import { readFile as
|
|
18803
|
-
import { readFile as
|
|
18853
|
+
import { readFile as readFile12 } from "node:fs/promises";
|
|
18854
|
+
import { readFile as readFile15 } from "node:fs/promises";
|
|
18804
18855
|
import path422 from "node:path";
|
|
18805
18856
|
import micromatch from "micromatch";
|
|
18806
|
-
import { readFile as
|
|
18857
|
+
import { readFile as readFile14 } from "node:fs/promises";
|
|
18807
18858
|
import path41 from "node:path";
|
|
18859
|
+
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
18860
|
+
var ResponseCache = class {
|
|
18861
|
+
cachePath;
|
|
18862
|
+
constructor(cachePath) {
|
|
18863
|
+
this.cachePath = cachePath ?? DEFAULT_CACHE_PATH;
|
|
18864
|
+
}
|
|
18865
|
+
async get(key) {
|
|
18866
|
+
const filePath = this.keyToPath(key);
|
|
18867
|
+
try {
|
|
18868
|
+
const data = await readFile3(filePath, "utf8");
|
|
18869
|
+
return JSON.parse(data);
|
|
18870
|
+
} catch {
|
|
18871
|
+
return void 0;
|
|
18872
|
+
}
|
|
18873
|
+
}
|
|
18874
|
+
async set(key, value) {
|
|
18875
|
+
const filePath = this.keyToPath(key);
|
|
18876
|
+
const dir = path5.dirname(filePath);
|
|
18877
|
+
await mkdir(dir, { recursive: true });
|
|
18878
|
+
await writeFile(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
18879
|
+
}
|
|
18880
|
+
keyToPath(key) {
|
|
18881
|
+
const prefix = key.slice(0, 2);
|
|
18882
|
+
return path5.join(this.cachePath, prefix, `${key}.json`);
|
|
18883
|
+
}
|
|
18884
|
+
};
|
|
18885
|
+
function shouldEnableCache(params) {
|
|
18886
|
+
if (params.cliNoCache) return false;
|
|
18887
|
+
if (params.cliCache) return true;
|
|
18888
|
+
if (params.yamlCache !== void 0) return params.yamlCache;
|
|
18889
|
+
return params.tsConfigCache === true;
|
|
18890
|
+
}
|
|
18891
|
+
function shouldSkipCacheForTemperature(targetConfig) {
|
|
18892
|
+
const temp = targetConfig.temperature;
|
|
18893
|
+
if (typeof temp === "number" && temp > 0) {
|
|
18894
|
+
return true;
|
|
18895
|
+
}
|
|
18896
|
+
return false;
|
|
18897
|
+
}
|
|
18808
18898
|
var DEFAULT_THRESHOLD = 0.8;
|
|
18809
18899
|
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
18810
18900
|
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
@@ -19026,32 +19116,6 @@ function validateConcurrency(concurrency) {
|
|
|
19026
19116
|
throw new TypeError("Expected `concurrency` to be a number from 1 and up");
|
|
19027
19117
|
}
|
|
19028
19118
|
}
|
|
19029
|
-
function readEnvPath(name) {
|
|
19030
|
-
const value = process.env[name];
|
|
19031
|
-
if (!value || value === "undefined") return void 0;
|
|
19032
|
-
return value;
|
|
19033
|
-
}
|
|
19034
|
-
function getAgentvConfigDir() {
|
|
19035
|
-
return readEnvPath("AGENTV_HOME") ?? path4.join(os.homedir(), ".agentv");
|
|
19036
|
-
}
|
|
19037
|
-
function getAgentvHome() {
|
|
19038
|
-
return getAgentvConfigDir();
|
|
19039
|
-
}
|
|
19040
|
-
function getAgentvDataDir() {
|
|
19041
|
-
return readEnvPath("AGENTV_DATA_DIR") ?? getAgentvConfigDir();
|
|
19042
|
-
}
|
|
19043
|
-
function getWorkspacesRoot() {
|
|
19044
|
-
return path4.join(getAgentvDataDir(), "workspaces");
|
|
19045
|
-
}
|
|
19046
|
-
function getSubagentsRoot() {
|
|
19047
|
-
return path4.join(getAgentvDataDir(), "subagents");
|
|
19048
|
-
}
|
|
19049
|
-
function getTraceStateRoot() {
|
|
19050
|
-
return path4.join(getAgentvDataDir(), "trace-state");
|
|
19051
|
-
}
|
|
19052
|
-
function getWorkspacePoolRoot() {
|
|
19053
|
-
return path4.join(getAgentvDataDir(), "workspace-pool");
|
|
19054
|
-
}
|
|
19055
19119
|
var DEFAULT_MAX_CALLS = 50;
|
|
19056
19120
|
async function createTargetProxy(options) {
|
|
19057
19121
|
const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
|
|
@@ -19373,7 +19437,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
|
|
|
19373
19437
|
const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
|
|
19374
19438
|
const dir = await getWorkDir();
|
|
19375
19439
|
const filePath = join(dir, `img-${counter++}.${ext}`);
|
|
19376
|
-
await
|
|
19440
|
+
await writeFile2(filePath, Buffer.from(base64Data, "base64"));
|
|
19377
19441
|
blocks.push({ type: "image", media_type: img.media_type, path: filePath });
|
|
19378
19442
|
} else {
|
|
19379
19443
|
blocks.push({ type: "image", media_type: img.media_type, path: img.source });
|
|
@@ -19416,7 +19480,7 @@ var CodeGrader = class {
|
|
|
19416
19480
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
19417
19481
|
const tmpDir = await mkdtemp(join(tmpdir(), "agentv-grader-"));
|
|
19418
19482
|
outputPath = join(tmpDir, "output.json");
|
|
19419
|
-
await
|
|
19483
|
+
await writeFile2(outputPath, serialized);
|
|
19420
19484
|
outputForPayload = null;
|
|
19421
19485
|
}
|
|
19422
19486
|
}
|
|
@@ -19433,6 +19497,7 @@ var CodeGrader = class {
|
|
|
19433
19497
|
context.evalCase.input,
|
|
19434
19498
|
getImageDir
|
|
19435
19499
|
),
|
|
19500
|
+
metadata: context.evalCase.metadata ?? null,
|
|
19436
19501
|
trace: context.trace ?? null,
|
|
19437
19502
|
tokenUsage: context.tokenUsage ?? null,
|
|
19438
19503
|
costUsd: context.costUsd ?? null,
|
|
@@ -19664,7 +19729,7 @@ async function preprocessContentFile(block, preprocessors, basePath) {
|
|
|
19664
19729
|
return runContentPreprocessor(block, resolvedPath, preprocessor);
|
|
19665
19730
|
}
|
|
19666
19731
|
try {
|
|
19667
|
-
const buffer = await
|
|
19732
|
+
const buffer = await readFile22(resolvedPath);
|
|
19668
19733
|
const text = buffer.toString("utf8").replace(/\r\n/g, "\n");
|
|
19669
19734
|
if (buffer.includes(0) || text.includes(REPLACEMENT_CHAR)) {
|
|
19670
19735
|
return {
|
|
@@ -19758,6 +19823,10 @@ ${text}`;
|
|
|
19758
19823
|
var TEMPLATE_VARIABLES = {
|
|
19759
19824
|
EXPECTED_OUTPUT: "expected_output",
|
|
19760
19825
|
CRITERIA: "criteria",
|
|
19826
|
+
METADATA: "metadata",
|
|
19827
|
+
METADATA_JSON: "metadata_json",
|
|
19828
|
+
RUBRICS: "rubrics",
|
|
19829
|
+
RUBRICS_JSON: "rubrics_json",
|
|
19761
19830
|
INPUT: "input",
|
|
19762
19831
|
OUTPUT: "output",
|
|
19763
19832
|
FILE_CHANGES: "file_changes",
|
|
@@ -19779,6 +19848,25 @@ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
|
|
|
19779
19848
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
|
|
19780
19849
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
|
|
19781
19850
|
]);
|
|
19851
|
+
var OPERATOR_GUIDANCE = {
|
|
19852
|
+
correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
|
|
19853
|
+
contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
|
|
19854
|
+
};
|
|
19855
|
+
function formatRubricOperatorLabel(operator) {
|
|
19856
|
+
return operator ? ` (operator: ${operator})` : "";
|
|
19857
|
+
}
|
|
19858
|
+
function formatRubricOperatorGuidance(rubrics) {
|
|
19859
|
+
const operators = /* @__PURE__ */ new Set();
|
|
19860
|
+
for (const rubric of rubrics) {
|
|
19861
|
+
if (rubric.operator) {
|
|
19862
|
+
operators.add(rubric.operator);
|
|
19863
|
+
}
|
|
19864
|
+
}
|
|
19865
|
+
if (operators.size === 0) {
|
|
19866
|
+
return [];
|
|
19867
|
+
}
|
|
19868
|
+
return [...operators].map((operator) => OPERATOR_GUIDANCE[operator]);
|
|
19869
|
+
}
|
|
19782
19870
|
var DEFAULT_MAX_STEPS = 10;
|
|
19783
19871
|
var MAX_STEPS_LIMIT = 50;
|
|
19784
19872
|
var MAX_FILE_SIZE = 50 * 1024;
|
|
@@ -19860,6 +19948,32 @@ var scoreRangeEvaluationSchema = external_exports.object({
|
|
|
19860
19948
|
checks: external_exports.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
19861
19949
|
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
19862
19950
|
});
|
|
19951
|
+
function stringifyPretty(value) {
|
|
19952
|
+
return value === void 0 ? "" : JSON.stringify(value, null, 2);
|
|
19953
|
+
}
|
|
19954
|
+
function stringifyCompact(value) {
|
|
19955
|
+
return value === void 0 ? "" : JSON.stringify(value);
|
|
19956
|
+
}
|
|
19957
|
+
function buildTemplateVariables(context) {
|
|
19958
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
19959
|
+
const rubrics = context.evaluator?.type === "llm-grader" ? context.evaluator.rubrics : void 0;
|
|
19960
|
+
return {
|
|
19961
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
19962
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
19963
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
19964
|
+
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
19965
|
+
[TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata),
|
|
19966
|
+
[TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata),
|
|
19967
|
+
[TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
|
|
19968
|
+
[TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
|
|
19969
|
+
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
19970
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
19971
|
+
// Deprecated aliases — same values as the primary variables above
|
|
19972
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
19973
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
19974
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
19975
|
+
};
|
|
19976
|
+
}
|
|
19863
19977
|
function resolveContentBasePath(context) {
|
|
19864
19978
|
if (context.workspacePath) {
|
|
19865
19979
|
return context.workspacePath;
|
|
@@ -19931,19 +20045,7 @@ var LlmGrader = class {
|
|
|
19931
20045
|
// LLM mode (existing)
|
|
19932
20046
|
// ---------------------------------------------------------------------------
|
|
19933
20047
|
async evaluateFreeform(context, graderProvider) {
|
|
19934
|
-
const
|
|
19935
|
-
const variables = {
|
|
19936
|
-
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
19937
|
-
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
19938
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
19939
|
-
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
19940
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
19941
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
19942
|
-
// Deprecated aliases — same values as the primary variables above
|
|
19943
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
19944
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
19945
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
19946
|
-
};
|
|
20048
|
+
const variables = buildTemplateVariables(context);
|
|
19947
20049
|
const systemPrompt = buildOutputSchema();
|
|
19948
20050
|
const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
19949
20051
|
warnDeprecatedTemplateVars(graderTemplate);
|
|
@@ -20010,7 +20112,7 @@ ${context.toolCalls}`;
|
|
|
20010
20112
|
if (hasScoreRanges) {
|
|
20011
20113
|
return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
|
|
20012
20114
|
}
|
|
20013
|
-
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
20115
|
+
const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildRubricPrompt(context, rubrics);
|
|
20014
20116
|
const systemPrompt = buildRubricOutputSchema();
|
|
20015
20117
|
const graderRawRequest = {
|
|
20016
20118
|
userPrompt: prompt,
|
|
@@ -20055,7 +20157,7 @@ ${context.toolCalls}`;
|
|
|
20055
20157
|
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
20056
20158
|
*/
|
|
20057
20159
|
async evaluateWithScoreRanges(context, graderProvider, rubrics) {
|
|
20058
|
-
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
20160
|
+
const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildScoreRangePrompt(context, rubrics);
|
|
20059
20161
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
20060
20162
|
const graderRawRequest = {
|
|
20061
20163
|
userPrompt: prompt,
|
|
@@ -20274,21 +20376,11 @@ ${context.toolCalls}`;
|
|
|
20274
20376
|
*/
|
|
20275
20377
|
buildAgentUserPrompt(context) {
|
|
20276
20378
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
20277
|
-
const variables =
|
|
20278
|
-
|
|
20279
|
-
|
|
20280
|
-
|
|
20281
|
-
|
|
20282
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
20283
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
20284
|
-
// Deprecated aliases
|
|
20285
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
20286
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
20287
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
20288
|
-
};
|
|
20289
|
-
if (this.graderTemplate) {
|
|
20290
|
-
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
20291
|
-
return substituteVariables(this.graderTemplate, variables);
|
|
20379
|
+
const variables = buildTemplateVariables(context);
|
|
20380
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
20381
|
+
if (template) {
|
|
20382
|
+
warnDeprecatedTemplateVars(template);
|
|
20383
|
+
return substituteVariables(template, variables);
|
|
20292
20384
|
}
|
|
20293
20385
|
const config2 = context.evaluator;
|
|
20294
20386
|
const rubrics = config2?.type === "llm-grader" ? config2.rubrics : void 0;
|
|
@@ -20338,21 +20430,11 @@ ${context.toolCalls}`;
|
|
|
20338
20430
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
20339
20431
|
const config2 = context.evaluator;
|
|
20340
20432
|
const rubrics = config2?.type === "llm-grader" ? config2.rubrics : void 0;
|
|
20341
|
-
|
|
20342
|
-
|
|
20343
|
-
|
|
20344
|
-
|
|
20345
|
-
|
|
20346
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
20347
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
20348
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
20349
|
-
// Deprecated aliases
|
|
20350
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
20351
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
20352
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
20353
|
-
};
|
|
20354
|
-
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
20355
|
-
const customPrompt = substituteVariables(this.graderTemplate, variables);
|
|
20433
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
20434
|
+
if (template) {
|
|
20435
|
+
const variables = buildTemplateVariables(context);
|
|
20436
|
+
warnDeprecatedTemplateVars(template);
|
|
20437
|
+
const customPrompt = substituteVariables(template, variables);
|
|
20356
20438
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
20357
20439
|
return `${customPrompt}
|
|
20358
20440
|
|
|
@@ -20478,6 +20560,9 @@ ${outputSchema}`;
|
|
|
20478
20560
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
20479
20561
|
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
20480
20562
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
20563
|
+
if (rubric.operator) {
|
|
20564
|
+
parts.push(`Operator: ${rubric.operator}`);
|
|
20565
|
+
}
|
|
20481
20566
|
if (rubric.outcome) {
|
|
20482
20567
|
parts.push(`Description: ${rubric.outcome}`);
|
|
20483
20568
|
}
|
|
@@ -20490,12 +20575,21 @@ ${outputSchema}`;
|
|
|
20490
20575
|
}
|
|
20491
20576
|
}
|
|
20492
20577
|
}
|
|
20578
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
20579
|
+
if (operatorGuidance.length > 0) {
|
|
20580
|
+
parts.push("", ...operatorGuidance);
|
|
20581
|
+
}
|
|
20493
20582
|
parts.push(
|
|
20494
20583
|
"",
|
|
20495
20584
|
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
20496
20585
|
);
|
|
20497
20586
|
return parts.join("\n");
|
|
20498
20587
|
}
|
|
20588
|
+
buildCustomPrompt(context) {
|
|
20589
|
+
const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
|
|
20590
|
+
warnDeprecatedTemplateVars(template);
|
|
20591
|
+
return substituteVariables(template, buildTemplateVariables(context));
|
|
20592
|
+
}
|
|
20499
20593
|
buildRubricPrompt(context, rubrics) {
|
|
20500
20594
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
20501
20595
|
const parts = [
|
|
@@ -20519,10 +20613,21 @@ ${outputSchema}`;
|
|
|
20519
20613
|
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
20520
20614
|
}
|
|
20521
20615
|
parts.push("[[ ## rubrics ## ]]");
|
|
20616
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
20617
|
+
if (operatorGuidance.length > 0) {
|
|
20618
|
+
parts.push("", "Operator guidance:");
|
|
20619
|
+
for (const guidance of operatorGuidance) {
|
|
20620
|
+
parts.push(`- ${guidance}`);
|
|
20621
|
+
}
|
|
20622
|
+
parts.push("");
|
|
20623
|
+
}
|
|
20522
20624
|
for (const rubric of rubrics) {
|
|
20523
20625
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
20524
20626
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
20525
|
-
|
|
20627
|
+
const operatorLabel = formatRubricOperatorLabel(rubric.operator);
|
|
20628
|
+
parts.push(
|
|
20629
|
+
`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`
|
|
20630
|
+
);
|
|
20526
20631
|
}
|
|
20527
20632
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
20528
20633
|
return parts.join("\n");
|
|
@@ -21248,6 +21353,384 @@ var CostGrader = class {
|
|
|
21248
21353
|
};
|
|
21249
21354
|
}
|
|
21250
21355
|
};
|
|
21356
|
+
var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trace.v1";
|
|
21357
|
+
var NORMALIZED_TRACE_SOURCE_KINDS = [
|
|
21358
|
+
"agentv_run",
|
|
21359
|
+
"otlp",
|
|
21360
|
+
"phoenix",
|
|
21361
|
+
"langfuse",
|
|
21362
|
+
"pi_session",
|
|
21363
|
+
"imported_transcript",
|
|
21364
|
+
"compact_transcript"
|
|
21365
|
+
];
|
|
21366
|
+
var NORMALIZED_TRACE_EVENT_TYPES = [
|
|
21367
|
+
"message",
|
|
21368
|
+
"model_turn",
|
|
21369
|
+
"tool_call",
|
|
21370
|
+
"tool_result"
|
|
21371
|
+
];
|
|
21372
|
+
var NORMALIZED_TOOL_STATUSES = ["ok", "error", "timeout", "cancelled", "unknown"];
|
|
21373
|
+
var NORMALIZED_REDACTION_LEVELS = ["none", "partial", "full"];
|
|
21374
|
+
function omitUndefinedProperties(value) {
|
|
21375
|
+
return Object.fromEntries(
|
|
21376
|
+
Object.entries(value).filter(([, property]) => property !== void 0)
|
|
21377
|
+
);
|
|
21378
|
+
}
|
|
21379
|
+
var MetadataWireSchema = external_exports.record(external_exports.string(), external_exports.unknown());
|
|
21380
|
+
var TokenUsageWireSchema = external_exports.object({
|
|
21381
|
+
input: external_exports.number(),
|
|
21382
|
+
output: external_exports.number(),
|
|
21383
|
+
cached: external_exports.number().optional(),
|
|
21384
|
+
reasoning: external_exports.number().optional()
|
|
21385
|
+
});
|
|
21386
|
+
var NormalizedRedactionStateWireSchema = external_exports.object({
|
|
21387
|
+
level: external_exports.enum(NORMALIZED_REDACTION_LEVELS),
|
|
21388
|
+
fields: external_exports.array(external_exports.string()).optional(),
|
|
21389
|
+
reason: external_exports.string().optional()
|
|
21390
|
+
});
|
|
21391
|
+
var NormalizedTraceErrorWireSchema = external_exports.object({
|
|
21392
|
+
message: external_exports.string(),
|
|
21393
|
+
name: external_exports.string().optional(),
|
|
21394
|
+
code: external_exports.string().optional(),
|
|
21395
|
+
stack: external_exports.string().optional(),
|
|
21396
|
+
metadata: MetadataWireSchema.optional()
|
|
21397
|
+
});
|
|
21398
|
+
var NormalizedTraceSourceWireSchema = external_exports.object({
|
|
21399
|
+
kind: external_exports.enum(NORMALIZED_TRACE_SOURCE_KINDS),
|
|
21400
|
+
path: external_exports.string().optional(),
|
|
21401
|
+
url: external_exports.string().optional(),
|
|
21402
|
+
provider: external_exports.string().optional(),
|
|
21403
|
+
format: external_exports.string().optional(),
|
|
21404
|
+
version: external_exports.string().optional(),
|
|
21405
|
+
metadata: MetadataWireSchema.optional()
|
|
21406
|
+
});
|
|
21407
|
+
var NormalizedTraceSessionWireSchema = external_exports.object({
|
|
21408
|
+
session_id: external_exports.string().optional(),
|
|
21409
|
+
conversation_id: external_exports.string().optional(),
|
|
21410
|
+
cwd: external_exports.string().optional(),
|
|
21411
|
+
started_at: external_exports.string().optional(),
|
|
21412
|
+
ended_at: external_exports.string().optional(),
|
|
21413
|
+
metadata: MetadataWireSchema.optional()
|
|
21414
|
+
});
|
|
21415
|
+
var NormalizedTraceBranchWireSchema = external_exports.object({
|
|
21416
|
+
selected_leaf_id: external_exports.string().optional(),
|
|
21417
|
+
selected_path_ids: external_exports.array(external_exports.string()).optional(),
|
|
21418
|
+
included_event_ids: external_exports.array(external_exports.string()).optional(),
|
|
21419
|
+
omitted_event_ids: external_exports.array(external_exports.string()).optional(),
|
|
21420
|
+
selection_reason: external_exports.string().optional()
|
|
21421
|
+
});
|
|
21422
|
+
var NormalizedTraceSourceRefWireSchema = external_exports.object({
|
|
21423
|
+
event_id: external_exports.string().optional(),
|
|
21424
|
+
message_id: external_exports.string().optional(),
|
|
21425
|
+
span_id: external_exports.string().optional(),
|
|
21426
|
+
trace_id: external_exports.string().optional(),
|
|
21427
|
+
raw_kind: external_exports.string().optional(),
|
|
21428
|
+
path: external_exports.string().optional(),
|
|
21429
|
+
line: external_exports.number().int().nonnegative().optional(),
|
|
21430
|
+
metadata: MetadataWireSchema.optional()
|
|
21431
|
+
});
|
|
21432
|
+
var NormalizedRawEvidenceWireSchema = external_exports.object({
|
|
21433
|
+
kind: external_exports.string(),
|
|
21434
|
+
ref: external_exports.string().optional(),
|
|
21435
|
+
media_type: external_exports.string().optional(),
|
|
21436
|
+
content: external_exports.unknown().optional(),
|
|
21437
|
+
redacted: external_exports.boolean().optional(),
|
|
21438
|
+
metadata: MetadataWireSchema.optional()
|
|
21439
|
+
});
|
|
21440
|
+
var NormalizedTraceMessageWireSchema = external_exports.object({
|
|
21441
|
+
role: external_exports.string(),
|
|
21442
|
+
name: external_exports.string().optional(),
|
|
21443
|
+
content: external_exports.unknown().optional(),
|
|
21444
|
+
redaction: NormalizedRedactionStateWireSchema.optional(),
|
|
21445
|
+
token_usage: TokenUsageWireSchema.optional(),
|
|
21446
|
+
metadata: MetadataWireSchema.optional()
|
|
21447
|
+
});
|
|
21448
|
+
var NormalizedTraceModelWireSchema = external_exports.object({
|
|
21449
|
+
provider: external_exports.string().optional(),
|
|
21450
|
+
name: external_exports.string().optional(),
|
|
21451
|
+
invocation_id: external_exports.string().optional(),
|
|
21452
|
+
token_usage: TokenUsageWireSchema.optional(),
|
|
21453
|
+
metadata: MetadataWireSchema.optional()
|
|
21454
|
+
});
|
|
21455
|
+
var NormalizedTraceToolWireSchema = external_exports.object({
|
|
21456
|
+
name: external_exports.string(),
|
|
21457
|
+
call_id: external_exports.string().optional(),
|
|
21458
|
+
input: external_exports.unknown().optional(),
|
|
21459
|
+
output: external_exports.unknown().optional(),
|
|
21460
|
+
status: external_exports.enum(NORMALIZED_TOOL_STATUSES).optional(),
|
|
21461
|
+
error: NormalizedTraceErrorWireSchema.optional(),
|
|
21462
|
+
redaction: NormalizedRedactionStateWireSchema.optional(),
|
|
21463
|
+
metadata: MetadataWireSchema.optional()
|
|
21464
|
+
});
|
|
21465
|
+
var NormalizedTraceEventWireSchema = external_exports.object({
|
|
21466
|
+
event_id: external_exports.string(),
|
|
21467
|
+
parent_event_id: external_exports.string().optional(),
|
|
21468
|
+
ordinal: external_exports.number().int().nonnegative(),
|
|
21469
|
+
type: external_exports.enum(NORMALIZED_TRACE_EVENT_TYPES),
|
|
21470
|
+
timestamp: external_exports.string().optional(),
|
|
21471
|
+
duration_ms: external_exports.number().nonnegative().optional(),
|
|
21472
|
+
duration_inferred: external_exports.boolean().optional(),
|
|
21473
|
+
turn_index: external_exports.number().int().nonnegative().optional(),
|
|
21474
|
+
message: NormalizedTraceMessageWireSchema.optional(),
|
|
21475
|
+
model: NormalizedTraceModelWireSchema.optional(),
|
|
21476
|
+
tool: NormalizedTraceToolWireSchema.optional(),
|
|
21477
|
+
source_ref: NormalizedTraceSourceRefWireSchema.optional(),
|
|
21478
|
+
raw_evidence: external_exports.array(NormalizedRawEvidenceWireSchema).optional(),
|
|
21479
|
+
redaction: NormalizedRedactionStateWireSchema.optional(),
|
|
21480
|
+
metadata: MetadataWireSchema.optional()
|
|
21481
|
+
});
|
|
21482
|
+
var NormalizedTrajectoryWireSchema = external_exports.object({
|
|
21483
|
+
schema_version: external_exports.literal(NORMALIZED_TRAJECTORY_SCHEMA_VERSION),
|
|
21484
|
+
source: NormalizedTraceSourceWireSchema,
|
|
21485
|
+
session: NormalizedTraceSessionWireSchema,
|
|
21486
|
+
branch: NormalizedTraceBranchWireSchema.optional(),
|
|
21487
|
+
events: external_exports.array(NormalizedTraceEventWireSchema),
|
|
21488
|
+
token_usage: TokenUsageWireSchema.optional(),
|
|
21489
|
+
cost_usd: external_exports.number().optional(),
|
|
21490
|
+
duration_ms: external_exports.number().optional(),
|
|
21491
|
+
started_at: external_exports.string().optional(),
|
|
21492
|
+
ended_at: external_exports.string().optional(),
|
|
21493
|
+
metadata: MetadataWireSchema.optional()
|
|
21494
|
+
});
|
|
21495
|
+
function toNormalizedTrajectoryWire(trajectory) {
|
|
21496
|
+
return NormalizedTrajectoryWireSchema.parse(
|
|
21497
|
+
omitUndefinedProperties({
|
|
21498
|
+
schema_version: trajectory.schemaVersion,
|
|
21499
|
+
source: toNormalizedTraceSourceWire(trajectory.source),
|
|
21500
|
+
session: toNormalizedTraceSessionWire(trajectory.session),
|
|
21501
|
+
branch: trajectory.branch ? toNormalizedTraceBranchWire(trajectory.branch) : void 0,
|
|
21502
|
+
events: trajectory.events.map(toNormalizedTraceEventWire),
|
|
21503
|
+
token_usage: trajectory.tokenUsage,
|
|
21504
|
+
cost_usd: trajectory.costUsd,
|
|
21505
|
+
duration_ms: trajectory.durationMs,
|
|
21506
|
+
started_at: trajectory.startedAt,
|
|
21507
|
+
ended_at: trajectory.endedAt,
|
|
21508
|
+
metadata: trajectory.metadata
|
|
21509
|
+
})
|
|
21510
|
+
);
|
|
21511
|
+
}
|
|
21512
|
+
function fromNormalizedTrajectoryWire(input) {
|
|
21513
|
+
const wire = NormalizedTrajectoryWireSchema.parse(input);
|
|
21514
|
+
return {
|
|
21515
|
+
schemaVersion: wire.schema_version,
|
|
21516
|
+
source: fromNormalizedTraceSourceWire(wire.source),
|
|
21517
|
+
session: fromNormalizedTraceSessionWire(wire.session),
|
|
21518
|
+
branch: wire.branch ? fromNormalizedTraceBranchWire(wire.branch) : void 0,
|
|
21519
|
+
events: wire.events.map(fromNormalizedTraceEventWire),
|
|
21520
|
+
tokenUsage: wire.token_usage,
|
|
21521
|
+
costUsd: wire.cost_usd,
|
|
21522
|
+
durationMs: wire.duration_ms,
|
|
21523
|
+
startedAt: wire.started_at,
|
|
21524
|
+
endedAt: wire.ended_at,
|
|
21525
|
+
metadata: wire.metadata
|
|
21526
|
+
};
|
|
21527
|
+
}
|
|
21528
|
+
function toNormalizedTraceSourceWire(source) {
|
|
21529
|
+
return omitUndefinedProperties({
|
|
21530
|
+
kind: source.kind,
|
|
21531
|
+
path: source.path,
|
|
21532
|
+
url: source.url,
|
|
21533
|
+
provider: source.provider,
|
|
21534
|
+
format: source.format,
|
|
21535
|
+
version: source.version,
|
|
21536
|
+
metadata: source.metadata
|
|
21537
|
+
});
|
|
21538
|
+
}
|
|
21539
|
+
function fromNormalizedTraceSourceWire(source) {
|
|
21540
|
+
return {
|
|
21541
|
+
kind: source.kind,
|
|
21542
|
+
path: source.path,
|
|
21543
|
+
url: source.url,
|
|
21544
|
+
provider: source.provider,
|
|
21545
|
+
format: source.format,
|
|
21546
|
+
version: source.version,
|
|
21547
|
+
metadata: source.metadata
|
|
21548
|
+
};
|
|
21549
|
+
}
|
|
21550
|
+
function toNormalizedTraceSessionWire(session) {
|
|
21551
|
+
return omitUndefinedProperties({
|
|
21552
|
+
session_id: session.sessionId,
|
|
21553
|
+
conversation_id: session.conversationId,
|
|
21554
|
+
cwd: session.cwd,
|
|
21555
|
+
started_at: session.startedAt,
|
|
21556
|
+
ended_at: session.endedAt,
|
|
21557
|
+
metadata: session.metadata
|
|
21558
|
+
});
|
|
21559
|
+
}
|
|
21560
|
+
function fromNormalizedTraceSessionWire(session) {
|
|
21561
|
+
return {
|
|
21562
|
+
sessionId: session.session_id,
|
|
21563
|
+
conversationId: session.conversation_id,
|
|
21564
|
+
cwd: session.cwd,
|
|
21565
|
+
startedAt: session.started_at,
|
|
21566
|
+
endedAt: session.ended_at,
|
|
21567
|
+
metadata: session.metadata
|
|
21568
|
+
};
|
|
21569
|
+
}
|
|
21570
|
+
function toNormalizedTraceBranchWire(branch) {
|
|
21571
|
+
return omitUndefinedProperties({
|
|
21572
|
+
selected_leaf_id: branch.selectedLeafId,
|
|
21573
|
+
selected_path_ids: branch.selectedPathIds,
|
|
21574
|
+
included_event_ids: branch.includedEventIds,
|
|
21575
|
+
omitted_event_ids: branch.omittedEventIds,
|
|
21576
|
+
selection_reason: branch.selectionReason
|
|
21577
|
+
});
|
|
21578
|
+
}
|
|
21579
|
+
function fromNormalizedTraceBranchWire(branch) {
|
|
21580
|
+
return {
|
|
21581
|
+
selectedLeafId: branch.selected_leaf_id,
|
|
21582
|
+
selectedPathIds: branch.selected_path_ids,
|
|
21583
|
+
includedEventIds: branch.included_event_ids,
|
|
21584
|
+
omittedEventIds: branch.omitted_event_ids,
|
|
21585
|
+
selectionReason: branch.selection_reason
|
|
21586
|
+
};
|
|
21587
|
+
}
|
|
21588
|
+
function toNormalizedTraceEventWire(event) {
|
|
21589
|
+
return NormalizedTraceEventWireSchema.parse(
|
|
21590
|
+
omitUndefinedProperties({
|
|
21591
|
+
event_id: event.eventId,
|
|
21592
|
+
parent_event_id: event.parentEventId,
|
|
21593
|
+
ordinal: event.ordinal,
|
|
21594
|
+
type: event.type,
|
|
21595
|
+
timestamp: event.timestamp,
|
|
21596
|
+
duration_ms: event.durationMs,
|
|
21597
|
+
duration_inferred: event.durationInferred,
|
|
21598
|
+
turn_index: event.turnIndex,
|
|
21599
|
+
message: event.message ? toNormalizedTraceMessageWire(event.message) : void 0,
|
|
21600
|
+
model: event.model ? toNormalizedTraceModelWire(event.model) : void 0,
|
|
21601
|
+
tool: event.tool ? toNormalizedTraceToolWire(event.tool) : void 0,
|
|
21602
|
+
source_ref: event.sourceRef ? toNormalizedTraceSourceRefWire(event.sourceRef) : void 0,
|
|
21603
|
+
raw_evidence: event.rawEvidence?.map(toNormalizedRawEvidenceWire),
|
|
21604
|
+
redaction: event.redaction,
|
|
21605
|
+
metadata: event.metadata
|
|
21606
|
+
})
|
|
21607
|
+
);
|
|
21608
|
+
}
|
|
21609
|
+
function fromNormalizedTraceEventWire(event) {
|
|
21610
|
+
return {
|
|
21611
|
+
eventId: event.event_id,
|
|
21612
|
+
parentEventId: event.parent_event_id,
|
|
21613
|
+
ordinal: event.ordinal,
|
|
21614
|
+
type: event.type,
|
|
21615
|
+
timestamp: event.timestamp,
|
|
21616
|
+
durationMs: event.duration_ms,
|
|
21617
|
+
durationInferred: event.duration_inferred,
|
|
21618
|
+
turnIndex: event.turn_index,
|
|
21619
|
+
message: event.message ? fromNormalizedTraceMessageWire(event.message) : void 0,
|
|
21620
|
+
model: event.model ? fromNormalizedTraceModelWire(event.model) : void 0,
|
|
21621
|
+
tool: event.tool ? fromNormalizedTraceToolWire(event.tool) : void 0,
|
|
21622
|
+
sourceRef: event.source_ref ? fromNormalizedTraceSourceRefWire(event.source_ref) : void 0,
|
|
21623
|
+
rawEvidence: event.raw_evidence?.map(fromNormalizedRawEvidenceWire),
|
|
21624
|
+
redaction: event.redaction,
|
|
21625
|
+
metadata: event.metadata
|
|
21626
|
+
};
|
|
21627
|
+
}
|
|
21628
|
+
function toNormalizedTraceMessageWire(message) {
|
|
21629
|
+
return omitUndefinedProperties({
|
|
21630
|
+
role: message.role,
|
|
21631
|
+
name: message.name,
|
|
21632
|
+
content: message.content,
|
|
21633
|
+
redaction: message.redaction,
|
|
21634
|
+
token_usage: message.tokenUsage,
|
|
21635
|
+
metadata: message.metadata
|
|
21636
|
+
});
|
|
21637
|
+
}
|
|
21638
|
+
function fromNormalizedTraceMessageWire(message) {
|
|
21639
|
+
return {
|
|
21640
|
+
role: message.role,
|
|
21641
|
+
name: message.name,
|
|
21642
|
+
content: message.content,
|
|
21643
|
+
redaction: message.redaction,
|
|
21644
|
+
tokenUsage: message.token_usage,
|
|
21645
|
+
metadata: message.metadata
|
|
21646
|
+
};
|
|
21647
|
+
}
|
|
21648
|
+
/**
 * Builds the wire-format (snake_case) representation of a trace model record.
 * The result is passed through `omitUndefinedProperties` (defined elsewhere;
 * presumably strips keys whose value is undefined — confirm against helper).
 *
 * @param {object} model - In-memory model record with camelCase keys.
 * @returns {object} Wire-format model record.
 */
function toNormalizedTraceModelWire(model) {
  const { provider, name, invocationId, tokenUsage, metadata } = model;
  const wire = {
    provider,
    name,
    invocation_id: invocationId,
    token_usage: tokenUsage,
    metadata
  };
  return omitUndefinedProperties(wire);
}
|
|
21657
|
+
/**
 * Maps a wire-format trace model record back to the in-memory shape,
 * renaming `invocation_id` and `token_usage` to camelCase.
 *
 * @param {object} model - Wire-format model record.
 * @returns {object} Model record with camelCase keys.
 */
function fromNormalizedTraceModelWire(model) {
  const {
    provider,
    name,
    invocation_id: invocationId,
    token_usage: tokenUsage,
    metadata
  } = model;
  return { provider, name, invocationId, tokenUsage, metadata };
}
|
|
21666
|
+
/**
 * Builds the wire-format (snake_case) representation of a trace tool record.
 * The result is passed through `omitUndefinedProperties` (defined elsewhere;
 * presumably strips keys whose value is undefined — confirm against helper).
 *
 * @param {object} tool - In-memory tool record with camelCase keys.
 * @returns {object} Wire-format tool record.
 */
function toNormalizedTraceToolWire(tool) {
  const { name, callId, input, output, status, error, redaction, metadata } = tool;
  const wire = {
    name,
    call_id: callId,
    input,
    output,
    status,
    error,
    redaction,
    metadata
  };
  return omitUndefinedProperties(wire);
}
|
|
21678
|
+
/**
 * Maps a wire-format trace tool record back to the in-memory shape; the only
 * renamed key is `call_id` -> `callId`.
 *
 * @param {object} tool - Wire-format tool record.
 * @returns {object} Tool record with camelCase keys.
 */
function fromNormalizedTraceToolWire(tool) {
  const { name, call_id: callId, input, output, status, error, redaction, metadata } = tool;
  return { name, callId, input, output, status, error, redaction, metadata };
}
|
|
21690
|
+
/**
 * Builds the wire-format (snake_case) representation of a trace source
 * reference. The result is passed through `omitUndefinedProperties` (defined
 * elsewhere; presumably strips undefined-valued keys — confirm against helper).
 *
 * @param {object} sourceRef - In-memory source reference with camelCase keys.
 * @returns {object} Wire-format source reference.
 */
function toNormalizedTraceSourceRefWire(sourceRef) {
  const { eventId, messageId, spanId, traceId, rawKind, path, line, metadata } = sourceRef;
  const wire = {
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  };
  return omitUndefinedProperties(wire);
}
|
|
21702
|
+
/**
 * Maps a wire-format trace source reference back to the in-memory shape,
 * renaming the snake_case id/kind fields to camelCase.
 *
 * @param {object} sourceRef - Wire-format source reference.
 * @returns {object} Source reference with camelCase keys.
 */
function fromNormalizedTraceSourceRefWire(sourceRef) {
  const {
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  } = sourceRef;
  return { eventId, messageId, spanId, traceId, rawKind, path, line, metadata };
}
|
|
21714
|
+
/**
 * Builds the wire-format (snake_case) representation of a raw-evidence entry.
 * The result is passed through `omitUndefinedProperties` (defined elsewhere;
 * presumably strips keys whose value is undefined — confirm against helper).
 *
 * @param {object} evidence - In-memory evidence entry with camelCase keys.
 * @returns {object} Wire-format evidence entry.
 */
function toNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, mediaType, content, redacted, metadata } = evidence;
  const wire = {
    kind,
    ref,
    media_type: mediaType,
    content,
    redacted,
    metadata
  };
  return omitUndefinedProperties(wire);
}
|
|
21724
|
+
/**
 * Maps a wire-format raw-evidence entry back to the in-memory shape; the only
 * renamed key is `media_type` -> `mediaType`.
 *
 * @param {object} evidence - Wire-format evidence entry.
 * @returns {object} Evidence entry with camelCase keys.
 */
function fromNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, media_type: mediaType, content, redacted, metadata } = evidence;
  return { kind, ref, mediaType, content, redacted, metadata };
}
|
|
21251
21734
|
function computeTraceSummary(messages) {
|
|
21252
21735
|
const toolCallCounts = {};
|
|
21253
21736
|
const toolDurations = {};
|
|
@@ -21315,6 +21798,82 @@ function computeTraceSummary(messages) {
|
|
|
21315
21798
|
endTime: latestEnd?.toISOString()
|
|
21316
21799
|
};
|
|
21317
21800
|
}
|
|
21801
|
+
/**
 * Returns the events of a trajectory that belong to its selected branch.
 *
 * When `trajectory.branch.includedEventIds` is missing or empty, the original
 * `trajectory.events` array is returned as-is. Otherwise a new array is
 * returned containing only events whose `eventId` is listed, in their
 * original order.
 *
 * @param {object} trajectory - Trajectory with `events` and optional `branch`.
 * @returns {Array<object>} Selected events.
 */
function getSelectedTrajectoryEvents(trajectory) {
  const branchEventIds = trajectory.branch?.includedEventIds;
  if (branchEventIds && branchEventIds.length !== 0) {
    const allowed = new Set(branchEventIds);
    return trajectory.events.filter(({ eventId }) => allowed.has(eventId));
  }
  return trajectory.events;
}
|
|
21808
|
+
/**
 * Summarises a normalized trajectory: tool-call counts and durations, error
 * count, LLM call count, and overall start/end times.
 *
 * Only the events returned by `getSelectedTrajectoryEvents` are considered.
 * An LLM call is counted for every `model_turn` event; when the selection has
 * no `model_turn` events at all, assistant `message` events are counted
 * instead. `eventCount` mirrors the number of tool-call events.
 *
 * Per-tool tallies are accumulated in Maps rather than plain objects so that
 * tool names which collide with inherited `Object.prototype` members (for
 * example "constructor" or "toString") are counted correctly instead of
 * producing garbage counts or throwing on `.push`.
 *
 * @param {object} trajectory - Normalized trajectory (events, optional branch,
 *   tokenUsage, costUsd, durationMs, startedAt, endedAt).
 * @returns {object} Summary with `trace`, `tokenUsage`, `costUsd`,
 *   `durationMs`, `startTime` and `endTime`. Trajectory-level `startedAt` /
 *   `endedAt` win over values derived from event timestamps.
 */
function computeTraceSummaryFromTrajectory(trajectory) {
  const selectedEvents = getSelectedTrajectoryEvents(trajectory);
  const hasModelTurnEvents = selectedEvents.some((event) => event.type === "model_turn");
  const toolCallCounts = new Map();
  const toolDurations = new Map();
  let totalToolCalls = 0;
  let errorCount = 0;
  let llmCallCount = 0;
  let earliestStart;
  let latestEnd;
  for (const event of selectedEvents) {
    const isModelTurn = event.type === "model_turn";
    const isAssistantMessage = event.type === "message" && event.message?.role === "assistant";
    if (isModelTurn || (!hasModelTurnEvents && isAssistantMessage)) {
      llmCallCount++;
    }
    const eventStart = parseTimestamp(event.timestamp);
    if (eventStart && (!earliestStart || eventStart < earliestStart)) {
      earliestStart = eventStart;
    }
    const eventEnd = deriveEventEnd(eventStart, event.durationMs);
    if (eventEnd && (!latestEnd || eventEnd > latestEnd)) {
      latestEnd = eventEnd;
    }
    if (event.type !== "tool_call" || !event.tool) {
      continue;
    }
    // Object keys are always strings; normalise so Map grouping matches the
    // grouping a plain object would have produced.
    const toolName = String(event.tool.name);
    toolCallCounts.set(toolName, (toolCallCounts.get(toolName) ?? 0) + 1);
    totalToolCalls++;
    if (isErrorToolEvent(event)) {
      errorCount++;
    }
    if (event.durationMs !== void 0) {
      const durations = toolDurations.get(toolName) ?? [];
      durations.push(event.durationMs);
      toolDurations.set(toolName, durations);
    }
  }
  // toolDurations is only reported when at least one tool call carried a duration.
  const hasAnyDuration = toolDurations.size > 0;
  return {
    trace: {
      eventCount: totalToolCalls,
      toolCalls: Object.fromEntries(toolCallCounts),
      errorCount,
      llmCallCount,
      ...(hasAnyDuration ? { toolDurations: Object.fromEntries(toolDurations) } : {})
    },
    tokenUsage: trajectory.tokenUsage,
    costUsd: trajectory.costUsd,
    durationMs: trajectory.durationMs,
    startTime: trajectory.startedAt ?? earliestStart?.toISOString(),
    endTime: trajectory.endedAt ?? latestEnd?.toISOString()
  };
}
|
|
21862
|
+
/**
 * Parses a timestamp into a Date.
 *
 * @param {string | number | Date | undefined} timestamp - Value accepted by
 *   the `Date` constructor; falsy values are treated as absent.
 * @returns {Date | undefined} The parsed Date, or `undefined` when the input
 *   is falsy or does not parse to a valid date.
 */
function parseTimestamp(timestamp) {
  if (timestamp) {
    const parsed = new Date(timestamp);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed;
    }
  }
  return void 0;
}
|
|
21867
|
+
/**
 * Derives the end instant of an event from its start and duration.
 *
 * @param {Date | undefined} start - Parsed event start; without it no end can
 *   be derived.
 * @param {number | undefined} durationMs - Event duration in milliseconds.
 * @returns {Date | undefined} `undefined` when `start` is missing; `start`
 *   itself when there is no usable duration; otherwise a new Date offset from
 *   `start` by `durationMs`.
 */
function deriveEventEnd(start, durationMs) {
  if (!start) return void 0;
  // A missing or non-finite duration (undefined, null, NaN, Infinity,
  // non-number) is treated as zero-length. Adding NaN/Infinity would create an
  // Invalid Date, and calling toISOString() on that throws a RangeError.
  if (!Number.isFinite(durationMs)) return start;
  return new Date(start.getTime() + durationMs);
}
|
|
21872
|
+
/**
 * Tells whether a tool event represents a failed tool call.
 *
 * A call counts as failed when its tool record carries a truthy `error`, or
 * when its `status` is "error", "timeout" or "cancelled".
 *
 * @param {object} event - Trace event; `event.tool` may be absent.
 * @returns {boolean} `true` for a failed tool call, otherwise `false`.
 */
function isErrorToolEvent(event) {
  const tool = event.tool;
  if (tool?.error) {
    return true;
  }
  return ["error", "timeout", "cancelled"].includes(tool?.status);
}
|
|
21318
21877
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
21319
21878
|
"read",
|
|
21320
21879
|
"grep",
|
|
@@ -22099,6 +22658,30 @@ var SkillTriggerGrader = class {
|
|
|
22099
22658
|
};
|
|
22100
22659
|
}
|
|
22101
22660
|
};
|
|
22661
|
+
/**
 * Serialises a value as 2-space-indented JSON; `undefined` becomes "".
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} Pretty-printed JSON, or an empty string for `undefined`.
 */
function stringifyPretty2(value) {
  if (value === void 0) {
    return "";
  }
  return JSON.stringify(value, null, 2);
}
|
|
22664
|
+
/**
 * Serialises a value as single-line JSON; `undefined` becomes "".
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} Compact JSON, or an empty string for `undefined`.
 */
function stringifyCompact2(value) {
  if (value === void 0) {
    return "";
  }
  return JSON.stringify(value);
}
|
|
22667
|
+
/**
 * Builds the substitution map used to fill a grader prompt template.
 *
 * The question comes from `promptInputs.question` when it is non-blank,
 * otherwise from `evalCase.question`. Metadata and rubrics are exposed both
 * pretty-printed and as compact JSON; missing file changes / tool calls become
 * empty strings.
 *
 * @param {object} input - `{ evalCase, candidate, promptInputs, rubrics?,
 *   fileChanges?, toolCalls? }`.
 * @returns {object} Map keyed by `TEMPLATE_VARIABLES` entries.
 */
function buildTemplateVariables2(input) {
  const { evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls } = input;
  const promptQuestion = promptInputs.question;
  const hasPromptQuestion = promptQuestion && promptQuestion.trim().length > 0;
  const questionText = (hasPromptQuestion ? promptQuestion : evalCase.question).trim();
  const outputText = candidate.trim();
  const expectedText = (evalCase.reference_answer ?? "").trim();
  return {
    [TEMPLATE_VARIABLES.INPUT]: questionText,
    [TEMPLATE_VARIABLES.OUTPUT]: outputText,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedText,
    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty2(evalCase.metadata),
    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact2(evalCase.metadata),
    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(rubrics),
    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(rubrics),
    [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
    [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
    // Deprecated aliases
    [TEMPLATE_VARIABLES.INPUT_TEXT]: questionText,
    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: outputText,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedText
  };
}
|
|
22102
22685
|
function assembleLlmGraderPrompt(input) {
|
|
22103
22686
|
const {
|
|
22104
22687
|
evalCase,
|
|
@@ -22111,6 +22694,17 @@ function assembleLlmGraderPrompt(input) {
|
|
|
22111
22694
|
} = input;
|
|
22112
22695
|
const rubrics = evaluatorConfig?.rubrics;
|
|
22113
22696
|
if (rubrics && rubrics.length > 0) {
|
|
22697
|
+
if (graderTemplateOverride) {
|
|
22698
|
+
return assembleCustom(
|
|
22699
|
+
evalCase,
|
|
22700
|
+
candidate,
|
|
22701
|
+
promptInputs,
|
|
22702
|
+
rubrics,
|
|
22703
|
+
fileChanges,
|
|
22704
|
+
toolCalls,
|
|
22705
|
+
graderTemplateOverride
|
|
22706
|
+
);
|
|
22707
|
+
}
|
|
22114
22708
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
22115
22709
|
if (hasScoreRanges) {
|
|
22116
22710
|
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
@@ -22127,19 +22721,13 @@ function assembleLlmGraderPrompt(input) {
|
|
|
22127
22721
|
);
|
|
22128
22722
|
}
|
|
22129
22723
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
|
|
22130
|
-
const
|
|
22131
|
-
|
|
22132
|
-
|
|
22133
|
-
|
|
22134
|
-
|
|
22135
|
-
|
|
22136
|
-
|
|
22137
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
|
|
22138
|
-
// Deprecated aliases
|
|
22139
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
22140
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
22141
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
22142
|
-
};
|
|
22724
|
+
const variables = buildTemplateVariables2({
|
|
22725
|
+
evalCase,
|
|
22726
|
+
candidate,
|
|
22727
|
+
promptInputs,
|
|
22728
|
+
fileChanges,
|
|
22729
|
+
toolCalls
|
|
22730
|
+
});
|
|
22143
22731
|
const systemPrompt = buildOutputSchema();
|
|
22144
22732
|
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
|
|
22145
22733
|
let userPrompt = substituteVariables(template, variables);
|
|
@@ -22162,6 +22750,27 @@ ${toolCalls}`;
|
|
|
22162
22750
|
mode: "freeform"
|
|
22163
22751
|
};
|
|
22164
22752
|
}
|
|
22753
|
+
/**
 * Assembles a grader prompt from a caller-supplied template when rubrics are
 * configured.
 *
 * The system prompt / response schema is the score-range output schema when
 * any rubric defines a non-empty `score_ranges`, otherwise the rubric
 * (checklist) output schema. The user prompt is the override template with
 * template variables substituted.
 *
 * @returns {object} `{ systemPrompt, userPrompt, responseSchema, mode }` where
 *   `mode` is "score_range" or "checklist".
 */
function assembleCustom(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls, graderTemplateOverride) {
  const usesScoreRanges = rubrics.some(
    (rubric) => rubric.score_ranges && rubric.score_ranges.length > 0
  );
  let systemPrompt;
  let mode;
  if (usesScoreRanges) {
    systemPrompt = buildScoreRangeOutputSchema();
    mode = "score_range";
  } else {
    systemPrompt = buildRubricOutputSchema();
    mode = "checklist";
  }
  const variables = buildTemplateVariables2({
    evalCase,
    candidate,
    promptInputs,
    rubrics,
    fileChanges,
    toolCalls
  });
  const userPrompt = substituteVariables(graderTemplateOverride, variables);
  return {
    systemPrompt,
    userPrompt,
    responseSchema: systemPrompt,
    mode
  };
}
|
|
22165
22774
|
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
22166
22775
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
22167
22776
|
const parts = [
|
|
@@ -22185,10 +22794,19 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
22185
22794
|
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
22186
22795
|
}
|
|
22187
22796
|
parts.push("[[ ## rubrics ## ]]");
|
|
22797
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
22798
|
+
if (operatorGuidance.length > 0) {
|
|
22799
|
+
parts.push("", "Operator guidance:");
|
|
22800
|
+
for (const guidance of operatorGuidance) {
|
|
22801
|
+
parts.push(`- ${guidance}`);
|
|
22802
|
+
}
|
|
22803
|
+
parts.push("");
|
|
22804
|
+
}
|
|
22188
22805
|
for (const rubric of rubrics) {
|
|
22189
22806
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
22190
22807
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
22191
|
-
|
|
22808
|
+
const operatorLabel = formatRubricOperatorLabel(rubric.operator);
|
|
22809
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`);
|
|
22192
22810
|
}
|
|
22193
22811
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
22194
22812
|
const systemPrompt = buildRubricOutputSchema();
|
|
@@ -22228,6 +22846,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
22228
22846
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
22229
22847
|
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
22230
22848
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
22849
|
+
if (rubric.operator) {
|
|
22850
|
+
parts.push(`Operator: ${rubric.operator}`);
|
|
22851
|
+
}
|
|
22231
22852
|
if (rubric.outcome) {
|
|
22232
22853
|
parts.push(`Description: ${rubric.outcome}`);
|
|
22233
22854
|
}
|
|
@@ -22240,6 +22861,10 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
22240
22861
|
}
|
|
22241
22862
|
}
|
|
22242
22863
|
}
|
|
22864
|
+
const operatorGuidance = formatRubricOperatorGuidance(rubrics);
|
|
22865
|
+
if (operatorGuidance.length > 0) {
|
|
22866
|
+
parts.push("", ...operatorGuidance);
|
|
22867
|
+
}
|
|
22243
22868
|
parts.push(
|
|
22244
22869
|
"",
|
|
22245
22870
|
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
@@ -23409,10 +24034,10 @@ var ClaudeCliProvider = class {
|
|
|
23409
24034
|
}
|
|
23410
24035
|
resolveCwd(cwdOverride) {
|
|
23411
24036
|
if (cwdOverride) {
|
|
23412
|
-
return
|
|
24037
|
+
return path52.resolve(cwdOverride);
|
|
23413
24038
|
}
|
|
23414
24039
|
if (this.config.cwd) {
|
|
23415
|
-
return
|
|
24040
|
+
return path52.resolve(this.config.cwd);
|
|
23416
24041
|
}
|
|
23417
24042
|
return void 0;
|
|
23418
24043
|
}
|
|
@@ -23422,9 +24047,9 @@ var ClaudeCliProvider = class {
|
|
|
23422
24047
|
return void 0;
|
|
23423
24048
|
}
|
|
23424
24049
|
if (this.config.logDir) {
|
|
23425
|
-
return
|
|
24050
|
+
return path52.resolve(this.config.logDir);
|
|
23426
24051
|
}
|
|
23427
|
-
return
|
|
24052
|
+
return path52.join(process.cwd(), ".agentv", "logs", "claude-cli");
|
|
23428
24053
|
}
|
|
23429
24054
|
async createStreamLogger(request) {
|
|
23430
24055
|
const logDir = this.resolveLogDirectory();
|
|
@@ -23432,13 +24057,13 @@ var ClaudeCliProvider = class {
|
|
|
23432
24057
|
return void 0;
|
|
23433
24058
|
}
|
|
23434
24059
|
try {
|
|
23435
|
-
await
|
|
24060
|
+
await mkdir2(logDir, { recursive: true });
|
|
23436
24061
|
} catch (error40) {
|
|
23437
24062
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
23438
24063
|
console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`);
|
|
23439
24064
|
return void 0;
|
|
23440
24065
|
}
|
|
23441
|
-
const filePath =
|
|
24066
|
+
const filePath = path52.join(logDir, buildLogFilename(request, this.targetName));
|
|
23442
24067
|
try {
|
|
23443
24068
|
const logger = await ClaudeCliStreamLogger.create({
|
|
23444
24069
|
filePath,
|
|
@@ -23921,7 +24546,7 @@ var ClaudeSdkProvider = class {
|
|
|
23921
24546
|
return void 0;
|
|
23922
24547
|
}
|
|
23923
24548
|
try {
|
|
23924
|
-
await
|
|
24549
|
+
await mkdir3(logDir, { recursive: true });
|
|
23925
24550
|
} catch (error40) {
|
|
23926
24551
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
23927
24552
|
console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -24746,6 +25371,9 @@ var CodexProvider = class {
|
|
|
24746
25371
|
const startMs = Date.now();
|
|
24747
25372
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
24748
25373
|
const codexOptions = {};
|
|
25374
|
+
if (this.config.executable) {
|
|
25375
|
+
codexOptions.codexPathOverride = this.config.executable;
|
|
25376
|
+
}
|
|
24749
25377
|
if (this.config.model) {
|
|
24750
25378
|
codexOptions.config = { model: this.config.model };
|
|
24751
25379
|
}
|
|
@@ -24757,6 +25385,9 @@ var CodexProvider = class {
|
|
|
24757
25385
|
if (cwd) {
|
|
24758
25386
|
threadOptions.workingDirectory = cwd;
|
|
24759
25387
|
}
|
|
25388
|
+
if (this.config.modelReasoningEffort) {
|
|
25389
|
+
threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
|
|
25390
|
+
}
|
|
24760
25391
|
const thread = codex.startThread(threadOptions);
|
|
24761
25392
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
24762
25393
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
@@ -24904,7 +25535,7 @@ ${basePrompt}` : basePrompt;
|
|
|
24904
25535
|
}
|
|
24905
25536
|
resolveLogDirectory() {
|
|
24906
25537
|
const disabled = isCodexLogStreamingDisabled();
|
|
24907
|
-
if (disabled) {
|
|
25538
|
+
if (disabled || this.config.streamLog === false) {
|
|
24908
25539
|
return void 0;
|
|
24909
25540
|
}
|
|
24910
25541
|
if (this.config.logDir) {
|
|
@@ -24918,7 +25549,7 @@ ${basePrompt}` : basePrompt;
|
|
|
24918
25549
|
return void 0;
|
|
24919
25550
|
}
|
|
24920
25551
|
try {
|
|
24921
|
-
await
|
|
25552
|
+
await mkdir4(logDir, { recursive: true });
|
|
24922
25553
|
} catch (error40) {
|
|
24923
25554
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
24924
25555
|
console.warn(`Skipping Codex SDK stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -24931,7 +25562,7 @@ ${basePrompt}` : basePrompt;
|
|
|
24931
25562
|
targetName: this.targetName,
|
|
24932
25563
|
evalCaseId: request.evalCaseId,
|
|
24933
25564
|
attempt: request.attempt,
|
|
24934
|
-
format: this.config.
|
|
25565
|
+
format: this.config.streamLog === "raw" ? "json" : "summary"
|
|
24935
25566
|
});
|
|
24936
25567
|
recordCodexLogEntry({
|
|
24937
25568
|
filePath,
|
|
@@ -25136,7 +25767,7 @@ async function walkDir(rootDir, currentDir, snapshot) {
|
|
|
25136
25767
|
if (fileStat.size > SNAPSHOT_MAX_FILE_BYTES) continue;
|
|
25137
25768
|
let content;
|
|
25138
25769
|
try {
|
|
25139
|
-
content = await
|
|
25770
|
+
content = await readFile32(fullPath, "utf8");
|
|
25140
25771
|
if (content.includes("\0")) continue;
|
|
25141
25772
|
} catch {
|
|
25142
25773
|
continue;
|
|
@@ -25220,7 +25851,7 @@ function subscribeToCopilotCliLogEntries(listener) {
|
|
|
25220
25851
|
};
|
|
25221
25852
|
}
|
|
25222
25853
|
function resolvePlatformCliPath() {
|
|
25223
|
-
const
|
|
25854
|
+
const os22 = platform();
|
|
25224
25855
|
const cpu = arch();
|
|
25225
25856
|
const platformMap = {
|
|
25226
25857
|
linux: "linux",
|
|
@@ -25231,13 +25862,13 @@ function resolvePlatformCliPath() {
|
|
|
25231
25862
|
x64: "x64",
|
|
25232
25863
|
arm64: "arm64"
|
|
25233
25864
|
};
|
|
25234
|
-
const osPart = platformMap[
|
|
25865
|
+
const osPart = platformMap[os22];
|
|
25235
25866
|
const archPart = archMap[cpu];
|
|
25236
25867
|
if (!osPart || !archPart) {
|
|
25237
25868
|
return void 0;
|
|
25238
25869
|
}
|
|
25239
25870
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
25240
|
-
const binaryName =
|
|
25871
|
+
const binaryName = os22 === "win32" ? "copilot.exe" : "copilot";
|
|
25241
25872
|
try {
|
|
25242
25873
|
const resolved = import.meta.resolve(`${packageName}/package.json`);
|
|
25243
25874
|
const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
|
|
@@ -25305,9 +25936,9 @@ function resolvePlatformCliPath() {
|
|
|
25305
25936
|
}
|
|
25306
25937
|
function globalNpmRoots() {
|
|
25307
25938
|
const roots = [];
|
|
25308
|
-
const
|
|
25939
|
+
const os22 = platform();
|
|
25309
25940
|
const home = homedir3();
|
|
25310
|
-
if (
|
|
25941
|
+
if (os22 === "win32") {
|
|
25311
25942
|
if (process.env.APPDATA) {
|
|
25312
25943
|
roots.push(path10.join(process.env.APPDATA, "npm", "node_modules"));
|
|
25313
25944
|
}
|
|
@@ -25322,7 +25953,7 @@ function globalNpmRoots() {
|
|
|
25322
25953
|
if (process.env.npm_config_prefix) {
|
|
25323
25954
|
const prefix = process.env.npm_config_prefix;
|
|
25324
25955
|
roots.push(
|
|
25325
|
-
|
|
25956
|
+
os22 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
|
|
25326
25957
|
);
|
|
25327
25958
|
}
|
|
25328
25959
|
return Array.from(new Set(roots));
|
|
@@ -25741,7 +26372,7 @@ var CopilotCliProvider = class {
|
|
|
25741
26372
|
return void 0;
|
|
25742
26373
|
}
|
|
25743
26374
|
try {
|
|
25744
|
-
await
|
|
26375
|
+
await mkdir5(logDir, { recursive: true });
|
|
25745
26376
|
} catch (error40) {
|
|
25746
26377
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
25747
26378
|
console.warn(`Skipping Copilot CLI stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -25992,7 +26623,7 @@ async function discoverCopilotSessions(opts) {
|
|
|
25992
26623
|
const workspacePath = path12.join(sessionDir, "workspace.yaml");
|
|
25993
26624
|
const eventsPath = path12.join(sessionDir, "events.jsonl");
|
|
25994
26625
|
try {
|
|
25995
|
-
const workspaceContent = await
|
|
26626
|
+
const workspaceContent = await readFile4(workspacePath, "utf8");
|
|
25996
26627
|
const workspace = parseYamlValue(workspaceContent) ?? {};
|
|
25997
26628
|
const cwd = String(workspace.cwd ?? "");
|
|
25998
26629
|
let updatedAt;
|
|
@@ -26052,7 +26683,7 @@ var CopilotLogProvider = class {
|
|
|
26052
26683
|
const eventsPath = path13.join(sessionDir, "events.jsonl");
|
|
26053
26684
|
let eventsContent;
|
|
26054
26685
|
try {
|
|
26055
|
-
eventsContent = await
|
|
26686
|
+
eventsContent = await readFile5(eventsPath, "utf8");
|
|
26056
26687
|
} catch (err) {
|
|
26057
26688
|
throw new Error(
|
|
26058
26689
|
`Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
|
|
@@ -26429,7 +27060,7 @@ var CopilotSdkProvider = class {
|
|
|
26429
27060
|
return void 0;
|
|
26430
27061
|
}
|
|
26431
27062
|
try {
|
|
26432
|
-
await
|
|
27063
|
+
await mkdir6(logDir, { recursive: true });
|
|
26433
27064
|
} catch (error40) {
|
|
26434
27065
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
26435
27066
|
console.warn(`Skipping Copilot SDK stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -26746,7 +27377,7 @@ var PiCliProvider = class {
|
|
|
26746
27377
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
26747
27378
|
try {
|
|
26748
27379
|
const promptFile = path15.join(cwd, PROMPT_FILENAME);
|
|
26749
|
-
await
|
|
27380
|
+
await writeFile3(promptFile, request.question, "utf8");
|
|
26750
27381
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
26751
27382
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
26752
27383
|
if (result.timedOut) {
|
|
@@ -26937,7 +27568,7 @@ ${prompt}` : prompt;
|
|
|
26937
27568
|
return void 0;
|
|
26938
27569
|
}
|
|
26939
27570
|
try {
|
|
26940
|
-
await
|
|
27571
|
+
await mkdir7(logDir, { recursive: true });
|
|
26941
27572
|
} catch (error40) {
|
|
26942
27573
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
26943
27574
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -27928,7 +28559,7 @@ ${fileList}`;
|
|
|
27928
28559
|
return void 0;
|
|
27929
28560
|
}
|
|
27930
28561
|
try {
|
|
27931
|
-
await
|
|
28562
|
+
await mkdir8(logDir, { recursive: true });
|
|
27932
28563
|
} catch (error40) {
|
|
27933
28564
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
27934
28565
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -28152,7 +28783,7 @@ async function pathExists(target) {
|
|
|
28152
28783
|
}
|
|
28153
28784
|
}
|
|
28154
28785
|
async function ensureDir(target) {
|
|
28155
|
-
await
|
|
28786
|
+
await mkdir9(target, { recursive: true });
|
|
28156
28787
|
}
|
|
28157
28788
|
async function readDirEntries(target) {
|
|
28158
28789
|
const entries = await readdir3(target, { withFileTypes: true });
|
|
@@ -28304,7 +28935,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
28304
28935
|
const maxAttempts = 10;
|
|
28305
28936
|
while (attempts < maxAttempts) {
|
|
28306
28937
|
try {
|
|
28307
|
-
const content = await
|
|
28938
|
+
const content = await readFile6(responseFileFinal, { encoding: "utf8" });
|
|
28308
28939
|
if (!silent) {
|
|
28309
28940
|
process.stdout.write(`${content}
|
|
28310
28941
|
`);
|
|
@@ -28361,7 +28992,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
28361
28992
|
const maxAttempts = 10;
|
|
28362
28993
|
while (attempts < maxAttempts) {
|
|
28363
28994
|
try {
|
|
28364
|
-
const content = await
|
|
28995
|
+
const content = await readFile6(file2, { encoding: "utf8" });
|
|
28365
28996
|
if (!silent) {
|
|
28366
28997
|
process.stdout.write(`${content}
|
|
28367
28998
|
`);
|
|
@@ -28454,9 +29085,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
28454
29085
|
const aliveFile = path222.join(subagentDir, DEFAULT_ALIVE_FILENAME);
|
|
28455
29086
|
await removeIfExists(aliveFile);
|
|
28456
29087
|
const githubAgentsDir = path222.join(subagentDir, ".github", "agents");
|
|
28457
|
-
await
|
|
29088
|
+
await mkdir10(githubAgentsDir, { recursive: true });
|
|
28458
29089
|
const wakeupDst = path222.join(githubAgentsDir, "wakeup.md");
|
|
28459
|
-
await
|
|
29090
|
+
await writeFile4(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
|
|
28460
29091
|
const workspaceChild = spawnVsCode(vscodeCmd, [workspacePath], {
|
|
28461
29092
|
label: "open-workspace"
|
|
28462
29093
|
});
|
|
@@ -28485,9 +29116,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
28485
29116
|
async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
|
|
28486
29117
|
const workspacePath = path222.join(subagentDir, `${path222.basename(subagentDir)}.code-workspace`);
|
|
28487
29118
|
const messagesDir = path222.join(subagentDir, "messages");
|
|
28488
|
-
await
|
|
29119
|
+
await mkdir10(messagesDir, { recursive: true });
|
|
28489
29120
|
const reqFile = path222.join(messagesDir, `${timestamp}_req.md`);
|
|
28490
|
-
await
|
|
29121
|
+
await writeFile4(reqFile, requestInstructions, { encoding: "utf8" });
|
|
28491
29122
|
const reqUri = pathToFileUri2(reqFile);
|
|
28492
29123
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
28493
29124
|
for (const attachment of attachmentPaths) {
|
|
@@ -28513,7 +29144,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
28513
29144
|
async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
|
|
28514
29145
|
const workspacePath = path222.join(subagentDir, `${path222.basename(subagentDir)}.code-workspace`);
|
|
28515
29146
|
const messagesDir = path222.join(subagentDir, "messages");
|
|
28516
|
-
await
|
|
29147
|
+
await mkdir10(messagesDir, { recursive: true });
|
|
28517
29148
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
28518
29149
|
for (const attachment of attachmentPaths) {
|
|
28519
29150
|
chatArgs.push("-a", attachment);
|
|
@@ -28643,7 +29274,7 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
|
|
|
28643
29274
|
if (!stats.isFile()) {
|
|
28644
29275
|
throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
|
|
28645
29276
|
}
|
|
28646
|
-
const templateText = await
|
|
29277
|
+
const templateText = await readFile7(workspaceSrc, "utf8");
|
|
28647
29278
|
workspaceContent = JSON.parse(templateText);
|
|
28648
29279
|
} else {
|
|
28649
29280
|
workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
|
|
@@ -28662,9 +29293,9 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
|
|
|
28662
29293
|
transformedContent = JSON.stringify(parsed, null, 2);
|
|
28663
29294
|
}
|
|
28664
29295
|
}
|
|
28665
|
-
await
|
|
29296
|
+
await writeFile5(workspaceDst, transformedContent, "utf8");
|
|
28666
29297
|
const messagesDir = path24.join(subagentDir, "messages");
|
|
28667
|
-
await
|
|
29298
|
+
await mkdir11(messagesDir, { recursive: true });
|
|
28668
29299
|
return { workspace: workspaceDst, messagesDir };
|
|
28669
29300
|
}
|
|
28670
29301
|
async function createSubagentLock(subagentDir) {
|
|
@@ -28687,7 +29318,7 @@ async function createSubagentLock(subagentDir) {
|
|
|
28687
29318
|
);
|
|
28688
29319
|
}
|
|
28689
29320
|
const lockFile = path24.join(subagentDir, DEFAULT_LOCK_NAME);
|
|
28690
|
-
await
|
|
29321
|
+
await writeFile5(lockFile, "", { encoding: "utf8" });
|
|
28691
29322
|
return lockFile;
|
|
28692
29323
|
}
|
|
28693
29324
|
async function removeSubagentLock(subagentDir) {
|
|
@@ -28712,7 +29343,7 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
|
|
|
28712
29343
|
}
|
|
28713
29344
|
if (promptFile) {
|
|
28714
29345
|
const githubAgentsDir = path24.join(subagentDir, ".github", "agents");
|
|
28715
|
-
await
|
|
29346
|
+
await mkdir11(githubAgentsDir, { recursive: true });
|
|
28716
29347
|
const agentFile = path24.join(githubAgentsDir, `${chatId}.md`);
|
|
28717
29348
|
try {
|
|
28718
29349
|
await copyFile(promptFile, agentFile);
|
|
@@ -28971,7 +29602,7 @@ async function dispatchBatchAgent(options) {
|
|
|
28971
29602
|
const reqFile = requestFiles[index];
|
|
28972
29603
|
const tmpFile = responseTmpFiles[index];
|
|
28973
29604
|
const finalFile = responseFilesFinal[index];
|
|
28974
|
-
return
|
|
29605
|
+
return writeFile6(
|
|
28975
29606
|
reqFile,
|
|
28976
29607
|
createBatchRequestPrompt(query, tmpFile, finalFile, batchRequestTemplateContent),
|
|
28977
29608
|
{ encoding: "utf8" }
|
|
@@ -28983,7 +29614,7 @@ async function dispatchBatchAgent(options) {
|
|
|
28983
29614
|
responseFilesFinal,
|
|
28984
29615
|
orchestratorTemplateContent
|
|
28985
29616
|
);
|
|
28986
|
-
await
|
|
29617
|
+
await writeFile6(orchestratorFile, orchestratorContent, { encoding: "utf8" });
|
|
28987
29618
|
}
|
|
28988
29619
|
const chatAttachments = [orchestratorFile, ...attachments];
|
|
28989
29620
|
const orchestratorUri = pathToFileUri2(orchestratorFile);
|
|
@@ -29126,8 +29757,8 @@ async function provisionSubagents(options) {
|
|
|
29126
29757
|
if (!dryRun) {
|
|
29127
29758
|
await removeIfExists(lockFile);
|
|
29128
29759
|
await ensureDir(githubAgentsDir);
|
|
29129
|
-
await
|
|
29130
|
-
await
|
|
29760
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
29761
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
29131
29762
|
}
|
|
29132
29763
|
created.push(subagentDir);
|
|
29133
29764
|
lockedSubagents.delete(subagentDir);
|
|
@@ -29137,8 +29768,8 @@ async function provisionSubagents(options) {
|
|
|
29137
29768
|
if (!isLocked && force) {
|
|
29138
29769
|
if (!dryRun) {
|
|
29139
29770
|
await ensureDir(githubAgentsDir);
|
|
29140
|
-
await
|
|
29141
|
-
await
|
|
29771
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
29772
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
29142
29773
|
}
|
|
29143
29774
|
created.push(subagentDir);
|
|
29144
29775
|
subagentsProvisioned += 1;
|
|
@@ -29146,8 +29777,8 @@ async function provisionSubagents(options) {
|
|
|
29146
29777
|
}
|
|
29147
29778
|
if (!dryRun && !await pathExists(workspaceDst)) {
|
|
29148
29779
|
await ensureDir(githubAgentsDir);
|
|
29149
|
-
await
|
|
29150
|
-
await
|
|
29780
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
29781
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
29151
29782
|
}
|
|
29152
29783
|
skippedExisting.push(subagentDir);
|
|
29153
29784
|
subagentsProvisioned += 1;
|
|
@@ -29162,8 +29793,8 @@ async function provisionSubagents(options) {
|
|
|
29162
29793
|
if (!dryRun) {
|
|
29163
29794
|
await ensureDir(subagentDir);
|
|
29164
29795
|
await ensureDir(githubAgentsDir);
|
|
29165
|
-
await
|
|
29166
|
-
await
|
|
29796
|
+
await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
|
|
29797
|
+
await writeFile7(wakeupDst, wakeupContent, "utf8");
|
|
29167
29798
|
}
|
|
29168
29799
|
created.push(subagentDir);
|
|
29169
29800
|
subagentsProvisioned += 1;
|
|
@@ -29523,7 +30154,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
29523
30154
|
if (!await fileExists2(absolutePath)) {
|
|
29524
30155
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
29525
30156
|
}
|
|
29526
|
-
const raw = await
|
|
30157
|
+
const raw = await readFile8(absolutePath, "utf8");
|
|
29527
30158
|
const parsed = parseYamlValue(raw);
|
|
29528
30159
|
if (!isRecord(parsed)) {
|
|
29529
30160
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -29701,6 +30332,7 @@ async function executePromptTemplate(script, context, config2, timeoutMs) {
|
|
|
29701
30332
|
output: context.output ?? null,
|
|
29702
30333
|
inputFiles: context.evalCase.file_paths,
|
|
29703
30334
|
input: context.evalCase.input,
|
|
30335
|
+
metadata: context.evalCase.metadata ?? null,
|
|
29704
30336
|
trace: context.trace ?? null,
|
|
29705
30337
|
fileChanges: context.fileChanges ?? null,
|
|
29706
30338
|
workspacePath: context.workspacePath ?? null,
|
|
@@ -30236,7 +30868,7 @@ function getWorkspacePath(evalRunId, caseId, workspaceRoot) {
|
|
|
30236
30868
|
return path33.join(root, evalRunId, caseId);
|
|
30237
30869
|
}
|
|
30238
30870
|
async function copyDirectoryRecursive(src, dest) {
|
|
30239
|
-
await
|
|
30871
|
+
await mkdir13(dest, { recursive: true });
|
|
30240
30872
|
const entries = await readdir5(src, { withFileTypes: true });
|
|
30241
30873
|
for (const entry of entries) {
|
|
30242
30874
|
const srcPath = path33.join(src, entry.name);
|
|
@@ -30357,7 +30989,7 @@ function computeWorkspaceFingerprint(repos) {
|
|
|
30357
30989
|
return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
30358
30990
|
}
|
|
30359
30991
|
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
30360
|
-
await
|
|
30992
|
+
await mkdir14(dest, { recursive: true });
|
|
30361
30993
|
const entries = await readdir6(src, { withFileTypes: true });
|
|
30362
30994
|
for (const entry of entries) {
|
|
30363
30995
|
const srcPath = path34.join(src, entry.name);
|
|
@@ -30395,7 +31027,7 @@ var WorkspacePoolManager = class {
|
|
|
30395
31027
|
const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
|
|
30396
31028
|
const fingerprint = computeWorkspaceFingerprint(repos);
|
|
30397
31029
|
const poolDir = path34.join(this.poolRoot, fingerprint);
|
|
30398
|
-
await
|
|
31030
|
+
await mkdir14(poolDir, { recursive: true });
|
|
30399
31031
|
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
30400
31032
|
if (drifted) {
|
|
30401
31033
|
console.warn(
|
|
@@ -30422,7 +31054,7 @@ var WorkspacePoolManager = class {
|
|
|
30422
31054
|
poolDir
|
|
30423
31055
|
};
|
|
30424
31056
|
}
|
|
30425
|
-
await
|
|
31057
|
+
await mkdir14(slotPath, { recursive: true });
|
|
30426
31058
|
if (templatePath) {
|
|
30427
31059
|
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
30428
31060
|
}
|
|
@@ -30459,14 +31091,14 @@ var WorkspacePoolManager = class {
|
|
|
30459
31091
|
async tryLock(lockPath) {
|
|
30460
31092
|
for (let attempt = 0; attempt < 3; attempt++) {
|
|
30461
31093
|
try {
|
|
30462
|
-
await
|
|
31094
|
+
await writeFile8(lockPath, String(process.pid), { flag: "wx" });
|
|
30463
31095
|
return true;
|
|
30464
31096
|
} catch (err) {
|
|
30465
31097
|
if (err.code !== "EEXIST") {
|
|
30466
31098
|
throw err;
|
|
30467
31099
|
}
|
|
30468
31100
|
try {
|
|
30469
|
-
const pidStr = await
|
|
31101
|
+
const pidStr = await readFile9(lockPath, "utf-8");
|
|
30470
31102
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
30471
31103
|
if (!Number.isNaN(pid)) {
|
|
30472
31104
|
try {
|
|
@@ -30493,7 +31125,7 @@ var WorkspacePoolManager = class {
|
|
|
30493
31125
|
async checkDrift(poolDir, fingerprint) {
|
|
30494
31126
|
const metadataPath = path34.join(poolDir, "metadata.json");
|
|
30495
31127
|
try {
|
|
30496
|
-
const raw = await
|
|
31128
|
+
const raw = await readFile9(metadataPath, "utf-8");
|
|
30497
31129
|
const metadata = JSON.parse(raw);
|
|
30498
31130
|
return metadata.fingerprint !== fingerprint;
|
|
30499
31131
|
} catch {
|
|
@@ -30508,7 +31140,7 @@ var WorkspacePoolManager = class {
|
|
|
30508
31140
|
repos,
|
|
30509
31141
|
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
30510
31142
|
};
|
|
30511
|
-
await
|
|
31143
|
+
await writeFile8(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
30512
31144
|
}
|
|
30513
31145
|
/** Remove all slot directories and their lock files from a pool directory. */
|
|
30514
31146
|
async removeAllSlots(poolDir) {
|
|
@@ -30518,7 +31150,7 @@ var WorkspacePoolManager = class {
|
|
|
30518
31150
|
const lockPath = path34.join(poolDir, `${entry}.lock`);
|
|
30519
31151
|
if (existsSync3(lockPath)) {
|
|
30520
31152
|
try {
|
|
30521
|
-
const pidStr = await
|
|
31153
|
+
const pidStr = await readFile9(lockPath, "utf-8");
|
|
30522
31154
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
30523
31155
|
if (!Number.isNaN(pid)) {
|
|
30524
31156
|
try {
|
|
@@ -30936,7 +31568,7 @@ function isAgentSkillsFormat(parsed) {
|
|
|
30936
31568
|
return Array.isArray(obj.evals);
|
|
30937
31569
|
}
|
|
30938
31570
|
async function loadTestsFromAgentSkills(filePath) {
|
|
30939
|
-
const raw = await
|
|
31571
|
+
const raw = await readFile10(filePath, "utf8");
|
|
30940
31572
|
let parsed;
|
|
30941
31573
|
try {
|
|
30942
31574
|
parsed = JSON.parse(raw);
|
|
@@ -31105,20 +31737,22 @@ var DEFAULT_EVAL_PATTERNS = [
|
|
|
31105
31737
|
];
|
|
31106
31738
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
31107
31739
|
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
31740
|
+
const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
|
|
31108
31741
|
for (const directory of directories) {
|
|
31109
31742
|
const configPath = path39.join(directory, ".agentv", "config.yaml");
|
|
31110
31743
|
if (!await fileExists3(configPath)) {
|
|
31111
31744
|
continue;
|
|
31112
31745
|
}
|
|
31113
31746
|
const config2 = await readConfigFile(configPath);
|
|
31114
|
-
if (config2)
|
|
31747
|
+
if (config2) {
|
|
31748
|
+
return config2;
|
|
31749
|
+
}
|
|
31115
31750
|
}
|
|
31116
|
-
const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
|
|
31117
31751
|
return await fileExists3(globalConfigPath) ? readConfigFile(globalConfigPath) : null;
|
|
31118
31752
|
}
|
|
31119
31753
|
async function readConfigFile(configPath) {
|
|
31120
31754
|
try {
|
|
31121
|
-
const rawConfig = await
|
|
31755
|
+
const rawConfig = await readFile11(configPath, "utf8");
|
|
31122
31756
|
const parsed = interpolateEnv(parseYamlValue(rawConfig), process.env);
|
|
31123
31757
|
if (!isJsonObject(parsed)) {
|
|
31124
31758
|
logWarning(`Invalid config.yaml format at ${configPath}`);
|
|
@@ -31331,7 +31965,10 @@ function extractCacheConfig(suite) {
|
|
|
31331
31965
|
logWarning(`Invalid execution.cache: ${cache}. Must be a boolean. Ignoring.`);
|
|
31332
31966
|
return void 0;
|
|
31333
31967
|
}
|
|
31334
|
-
|
|
31968
|
+
if (executionObj.cachePath !== void 0) {
|
|
31969
|
+
logWarning("Invalid execution.cachePath: use snake_case execution.cache_path in YAML.");
|
|
31970
|
+
}
|
|
31971
|
+
const cachePath = executionObj.cache_path;
|
|
31335
31972
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
31336
31973
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
31337
31974
|
}
|
|
@@ -31500,6 +32137,12 @@ function parseResultsConfig(raw, configPath) {
|
|
|
31500
32137
|
...branchPrefix && { branch_prefix: branchPrefix }
|
|
31501
32138
|
};
|
|
31502
32139
|
}
|
|
32140
|
+
function resolveResultsConfigForProject(config2, _projectId) {
|
|
32141
|
+
if (!config2) {
|
|
32142
|
+
return void 0;
|
|
32143
|
+
}
|
|
32144
|
+
return config2.results;
|
|
32145
|
+
}
|
|
31503
32146
|
function parseHooksConfig(raw, configPath) {
|
|
31504
32147
|
if (raw === void 0 || raw === null) {
|
|
31505
32148
|
return void 0;
|
|
@@ -31525,7 +32168,7 @@ function logWarning(message) {
|
|
|
31525
32168
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
31526
32169
|
var ANSI_RESET4 = "\x1B[0m";
|
|
31527
32170
|
async function validateCustomPromptContent(promptPath) {
|
|
31528
|
-
const content = await
|
|
32171
|
+
const content = await readFile12(promptPath, "utf8");
|
|
31529
32172
|
validateTemplateVariables(content, promptPath);
|
|
31530
32173
|
}
|
|
31531
32174
|
function validateTemplateVariables(content, source) {
|
|
@@ -31655,7 +32298,7 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
31655
32298
|
const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
|
|
31656
32299
|
throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
|
|
31657
32300
|
}
|
|
31658
|
-
const content = await
|
|
32301
|
+
const content = await readFile13(resolved.resolvedPath, "utf8");
|
|
31659
32302
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
31660
32303
|
if (!isJsonObject2(parsed)) {
|
|
31661
32304
|
throw new Error(
|
|
@@ -31702,6 +32345,103 @@ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, inc
|
|
|
31702
32345
|
}
|
|
31703
32346
|
return expanded;
|
|
31704
32347
|
}
|
|
32348
|
+
async function collectAssertionTemplateSourceReferences(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
32349
|
+
const execution = rawEvalCase.execution;
|
|
32350
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
32351
|
+
const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators;
|
|
32352
|
+
const skipDefaults = executionObject?.skip_defaults === true;
|
|
32353
|
+
const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
|
|
32354
|
+
return [
|
|
32355
|
+
...await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId),
|
|
32356
|
+
...await collectAssertionTemplateReferencesFromValue(rootEvaluators, searchRoots, evalId)
|
|
32357
|
+
];
|
|
32358
|
+
}
|
|
32359
|
+
async function collectAssertionTemplateReferencesFromValue(value, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
|
|
32360
|
+
if (value === void 0) {
|
|
32361
|
+
return [];
|
|
32362
|
+
}
|
|
32363
|
+
const references = [];
|
|
32364
|
+
if (Array.isArray(value)) {
|
|
32365
|
+
for (const item of value) {
|
|
32366
|
+
if (isIncludeEntry(item)) {
|
|
32367
|
+
const nextDepth = includeContext.depth + 1;
|
|
32368
|
+
if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
|
|
32369
|
+
const chain = [...includeContext.chain, item.include].join(" -> ");
|
|
32370
|
+
throw new Error(
|
|
32371
|
+
`Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${chain}`
|
|
32372
|
+
);
|
|
32373
|
+
}
|
|
32374
|
+
const resolved = await resolveAssertionTemplateReference(item.include, searchRoots);
|
|
32375
|
+
references.push({
|
|
32376
|
+
kind: "assertion_template",
|
|
32377
|
+
displayPath: resolved.displayPath,
|
|
32378
|
+
...resolved.resolvedPath ? { resolvedPath: path40.resolve(resolved.resolvedPath) } : {}
|
|
32379
|
+
});
|
|
32380
|
+
if (resolved.resolvedPath) {
|
|
32381
|
+
if (includeContext.chain.includes(resolved.resolvedPath)) {
|
|
32382
|
+
const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
|
|
32383
|
+
throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
|
|
32384
|
+
}
|
|
32385
|
+
const content = await readFile13(resolved.resolvedPath, "utf8");
|
|
32386
|
+
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
32387
|
+
if (isJsonObject2(parsed) && Array.isArray(parsed.assertions)) {
|
|
32388
|
+
const templateDir = path40.dirname(resolved.resolvedPath);
|
|
32389
|
+
const nestedSearchRoots = [
|
|
32390
|
+
templateDir,
|
|
32391
|
+
...searchRoots.filter((root) => path40.resolve(root) !== templateDir)
|
|
32392
|
+
];
|
|
32393
|
+
references.push(
|
|
32394
|
+
...await collectAssertionTemplateReferencesFromValue(
|
|
32395
|
+
parsed.assertions,
|
|
32396
|
+
nestedSearchRoots,
|
|
32397
|
+
evalId,
|
|
32398
|
+
{
|
|
32399
|
+
depth: nextDepth,
|
|
32400
|
+
chain: [...includeContext.chain, resolved.resolvedPath]
|
|
32401
|
+
}
|
|
32402
|
+
)
|
|
32403
|
+
);
|
|
32404
|
+
}
|
|
32405
|
+
}
|
|
32406
|
+
continue;
|
|
32407
|
+
}
|
|
32408
|
+
if (isJsonObject2(item)) {
|
|
32409
|
+
references.push(
|
|
32410
|
+
...await collectAssertionTemplateReferencesFromObject(
|
|
32411
|
+
item,
|
|
32412
|
+
searchRoots,
|
|
32413
|
+
evalId,
|
|
32414
|
+
includeContext
|
|
32415
|
+
)
|
|
32416
|
+
);
|
|
32417
|
+
}
|
|
32418
|
+
}
|
|
32419
|
+
} else if (isJsonObject2(value)) {
|
|
32420
|
+
references.push(
|
|
32421
|
+
...await collectAssertionTemplateReferencesFromObject(
|
|
32422
|
+
value,
|
|
32423
|
+
searchRoots,
|
|
32424
|
+
evalId,
|
|
32425
|
+
includeContext
|
|
32426
|
+
)
|
|
32427
|
+
);
|
|
32428
|
+
}
|
|
32429
|
+
return references;
|
|
32430
|
+
}
|
|
32431
|
+
async function collectAssertionTemplateReferencesFromObject(value, searchRoots, evalId, includeContext) {
|
|
32432
|
+
const references = [];
|
|
32433
|
+
for (const key of ["assertions", "assert", "evaluators"]) {
|
|
32434
|
+
references.push(
|
|
32435
|
+
...await collectAssertionTemplateReferencesFromValue(
|
|
32436
|
+
value[key],
|
|
32437
|
+
searchRoots,
|
|
32438
|
+
evalId,
|
|
32439
|
+
includeContext
|
|
32440
|
+
)
|
|
32441
|
+
);
|
|
32442
|
+
}
|
|
32443
|
+
return references;
|
|
32444
|
+
}
|
|
31705
32445
|
async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
|
|
31706
32446
|
const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
|
|
31707
32447
|
if (!expandedEvaluators) {
|
|
@@ -31828,6 +32568,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
31828
32568
|
continue;
|
|
31829
32569
|
}
|
|
31830
32570
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
32571
|
+
const resolvedScriptPath = await resolveOptionalCommandSource(command, searchRoots);
|
|
31831
32572
|
const cwd = asString(rawEvaluator.cwd);
|
|
31832
32573
|
let resolvedCwd;
|
|
31833
32574
|
if (cwd) {
|
|
@@ -31893,6 +32634,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
31893
32634
|
name,
|
|
31894
32635
|
type: "code-grader",
|
|
31895
32636
|
command,
|
|
32637
|
+
...resolvedScriptPath ? { resolvedScriptPath } : {},
|
|
31896
32638
|
cwd,
|
|
31897
32639
|
resolvedCwd,
|
|
31898
32640
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -32960,6 +33702,17 @@ function asStringArray(value, description) {
|
|
|
32960
33702
|
}
|
|
32961
33703
|
return result;
|
|
32962
33704
|
}
|
|
33705
|
+
async function resolveOptionalCommandSource(command, searchRoots) {
|
|
33706
|
+
const candidate = command.at(-1);
|
|
33707
|
+
if (!candidate || !looksLikeFilePath(candidate)) {
|
|
33708
|
+
return void 0;
|
|
33709
|
+
}
|
|
33710
|
+
const resolved = await resolveFileReference3(candidate, searchRoots);
|
|
33711
|
+
return resolved.resolvedPath ? path40.resolve(resolved.resolvedPath) : void 0;
|
|
33712
|
+
}
|
|
33713
|
+
function looksLikeFilePath(value) {
|
|
33714
|
+
return path40.isAbsolute(value) || value.startsWith(".") || value.includes("/") || value.includes("\\") || /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
|
|
33715
|
+
}
|
|
32963
33716
|
function parseCommandToArgv(command) {
|
|
32964
33717
|
if (process.platform === "win32") {
|
|
32965
33718
|
return ["cmd.exe", "/c", command];
|
|
@@ -33028,6 +33781,19 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
33028
33781
|
function isValidFieldAggregationType(value) {
|
|
33029
33782
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
33030
33783
|
}
|
|
33784
|
+
var VALID_RUBRIC_OPERATORS = new Set(RUBRIC_OPERATOR_VALUES);
|
|
33785
|
+
function parseRubricOperator(value, rubricId, evaluatorName, evalId) {
|
|
33786
|
+
if (value === void 0) {
|
|
33787
|
+
return void 0;
|
|
33788
|
+
}
|
|
33789
|
+
if (typeof value === "string" && VALID_RUBRIC_OPERATORS.has(value)) {
|
|
33790
|
+
return value;
|
|
33791
|
+
}
|
|
33792
|
+
logWarning2(
|
|
33793
|
+
`Ignoring invalid operator for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be one of ${RUBRIC_OPERATOR_VALUES.join(", ")}`
|
|
33794
|
+
);
|
|
33795
|
+
return void 0;
|
|
33796
|
+
}
|
|
33031
33797
|
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
33032
33798
|
const items = [];
|
|
33033
33799
|
for (const [index, rawRubric] of rawRubrics.entries()) {
|
|
@@ -33038,7 +33804,8 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
33038
33804
|
continue;
|
|
33039
33805
|
}
|
|
33040
33806
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
33041
|
-
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
33807
|
+
const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? "";
|
|
33808
|
+
const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
|
|
33042
33809
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
33043
33810
|
let minScore;
|
|
33044
33811
|
let requiredMinScore;
|
|
@@ -33082,6 +33849,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
33082
33849
|
id,
|
|
33083
33850
|
weight,
|
|
33084
33851
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
33852
|
+
...operator !== void 0 ? { operator } : {},
|
|
33085
33853
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
33086
33854
|
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
33087
33855
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
@@ -33097,6 +33865,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
33097
33865
|
items.push({
|
|
33098
33866
|
id,
|
|
33099
33867
|
outcome: expectedOutcome,
|
|
33868
|
+
...operator !== void 0 ? { operator } : {},
|
|
33100
33869
|
weight,
|
|
33101
33870
|
// Default to required: true if not specified (backward compatibility)
|
|
33102
33871
|
required: required2 ?? true,
|
|
@@ -33219,6 +33988,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
33219
33988
|
};
|
|
33220
33989
|
}
|
|
33221
33990
|
const expectedOutcome = asString(rubric.outcome) ?? "";
|
|
33991
|
+
const id = asString(rubric.id) ?? `rubric-${index + 1}`;
|
|
33992
|
+
const operator = parseRubricOperator(rubric.operator, id, "rubrics", "<inline>");
|
|
33222
33993
|
const rawScoreRanges = rubric.score_ranges;
|
|
33223
33994
|
const normalizedScoreRanges = rawScoreRanges !== void 0 ? normalizeScoreRangesShorthand(rawScoreRanges) : void 0;
|
|
33224
33995
|
const scoreRanges = Array.isArray(normalizedScoreRanges) && normalizedScoreRanges.length > 0 ? normalizedScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
|
|
@@ -33226,7 +33997,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
33226
33997
|
outcome: asString(range.outcome) ?? ""
|
|
33227
33998
|
})).filter((r) => r.outcome.length > 0) : void 0;
|
|
33228
33999
|
const baseRubric = {
|
|
33229
|
-
id
|
|
34000
|
+
id,
|
|
34001
|
+
...operator !== void 0 ? { operator } : {},
|
|
33230
34002
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
33231
34003
|
};
|
|
33232
34004
|
let inlineMinScore;
|
|
@@ -33386,7 +34158,7 @@ async function processMessages(options) {
|
|
|
33386
34158
|
continue;
|
|
33387
34159
|
}
|
|
33388
34160
|
try {
|
|
33389
|
-
const fileContent = (await
|
|
34161
|
+
const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
33390
34162
|
processedContent.push({
|
|
33391
34163
|
...cloneJsonObject(rawSegment),
|
|
33392
34164
|
path: displayPath,
|
|
@@ -33427,7 +34199,7 @@ async function processMessages(options) {
|
|
|
33427
34199
|
continue;
|
|
33428
34200
|
}
|
|
33429
34201
|
try {
|
|
33430
|
-
const imageBuffer = await
|
|
34202
|
+
const imageBuffer = await readFile14(resolvedPath);
|
|
33431
34203
|
const base643 = imageBuffer.toString("base64");
|
|
33432
34204
|
processedContent.push({
|
|
33433
34205
|
type: "image",
|
|
@@ -33510,7 +34282,7 @@ async function processExpectedMessages(options) {
|
|
|
33510
34282
|
continue;
|
|
33511
34283
|
}
|
|
33512
34284
|
try {
|
|
33513
|
-
const fileContent = (await
|
|
34285
|
+
const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
33514
34286
|
processedContent.push({
|
|
33515
34287
|
type: "file",
|
|
33516
34288
|
path: displayPath,
|
|
@@ -33550,7 +34322,7 @@ async function processExpectedMessages(options) {
|
|
|
33550
34322
|
continue;
|
|
33551
34323
|
}
|
|
33552
34324
|
try {
|
|
33553
|
-
const imageBuffer = await
|
|
34325
|
+
const imageBuffer = await readFile14(resolvedPath);
|
|
33554
34326
|
const base643 = imageBuffer.toString("base64");
|
|
33555
34327
|
processedContent.push({
|
|
33556
34328
|
type: "image",
|
|
@@ -33590,6 +34362,12 @@ function expandInputShorthand(value) {
|
|
|
33590
34362
|
if (typeof value === "string") {
|
|
33591
34363
|
return [{ role: "user", content: value }];
|
|
33592
34364
|
}
|
|
34365
|
+
if (isJsonObject(value)) {
|
|
34366
|
+
if ("role" in value) {
|
|
34367
|
+
return isTestMessage(value) ? [value] : void 0;
|
|
34368
|
+
}
|
|
34369
|
+
return [{ role: "user", content: value }];
|
|
34370
|
+
}
|
|
33593
34371
|
if (Array.isArray(value)) {
|
|
33594
34372
|
const messages = value.filter((msg) => isTestMessage(msg));
|
|
33595
34373
|
return messages.length > 0 ? messages : void 0;
|
|
@@ -33675,7 +34453,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
|
33675
34453
|
return {};
|
|
33676
34454
|
}
|
|
33677
34455
|
try {
|
|
33678
|
-
const content = await
|
|
34456
|
+
const content = await readFile15(sidecarPath, "utf8");
|
|
33679
34457
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
33680
34458
|
if (!isJsonObject(parsed)) {
|
|
33681
34459
|
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
@@ -33720,7 +34498,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
33720
34498
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
33721
34499
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
33722
34500
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
33723
|
-
const rawFile = await
|
|
34501
|
+
const rawFile = await readFile15(absoluteTestPath, "utf8");
|
|
33724
34502
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
33725
34503
|
const fallbackSuiteName = path422.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
33726
34504
|
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
@@ -34129,7 +34907,7 @@ function interpolateRawEvalCase(raw, vars) {
|
|
|
34129
34907
|
async function readTestSuiteMetadata(testFilePath) {
|
|
34130
34908
|
try {
|
|
34131
34909
|
const absolutePath = path43.resolve(testFilePath);
|
|
34132
|
-
const content = await
|
|
34910
|
+
const content = await readFile16(absolutePath, "utf8");
|
|
34133
34911
|
const parsed = interpolateEnv(parseYamlValue(content), process.env);
|
|
34134
34912
|
if (!isJsonObject(parsed)) {
|
|
34135
34913
|
return {};
|
|
@@ -34153,7 +34931,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
34153
34931
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
34154
34932
|
}
|
|
34155
34933
|
if (format === "typescript") {
|
|
34156
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
34934
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT-THE7D3GR.js");
|
|
34157
34935
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
34158
34936
|
}
|
|
34159
34937
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -34188,7 +34966,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
34188
34966
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
34189
34967
|
}
|
|
34190
34968
|
if (format === "typescript") {
|
|
34191
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
34969
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT-THE7D3GR.js");
|
|
34192
34970
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
34193
34971
|
return suite.tests;
|
|
34194
34972
|
}
|
|
@@ -34203,8 +34981,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
34203
34981
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
34204
34982
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
34205
34983
|
const config2 = await loadConfig(absoluteTestPath, repoRootPath);
|
|
34206
|
-
const rawFile = await
|
|
34207
|
-
const
|
|
34984
|
+
const rawFile = await readFile16(absoluteTestPath, "utf8");
|
|
34985
|
+
const rawParsed = parseYamlValue(rawFile);
|
|
34986
|
+
const rawCaseSnapshots = buildRawInlineTestSnapshots(rawParsed);
|
|
34987
|
+
const interpolated = interpolateEnv(rawParsed, process.env);
|
|
34208
34988
|
if (!isJsonObject(interpolated)) {
|
|
34209
34989
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
34210
34990
|
}
|
|
@@ -34241,7 +35021,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
34241
35021
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
34242
35022
|
}
|
|
34243
35023
|
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
34244
|
-
const
|
|
35024
|
+
const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
|
|
34245
35025
|
const rawSuiteInput = suite.input;
|
|
34246
35026
|
const rawSuiteInputFiles = suite.input_files;
|
|
34247
35027
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
@@ -34343,6 +35123,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
34343
35123
|
logError3(`Skipping test '${id}': ${message}`);
|
|
34344
35124
|
continue;
|
|
34345
35125
|
}
|
|
35126
|
+
const assertionTemplateReferences = await collectAssertionTemplateSourceReferences(
|
|
35127
|
+
renderedCase,
|
|
35128
|
+
globalExecution,
|
|
35129
|
+
searchRoots,
|
|
35130
|
+
id ?? "unknown"
|
|
35131
|
+
);
|
|
34346
35132
|
const inlineRubrics = renderedCase.rubrics;
|
|
34347
35133
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
34348
35134
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
@@ -34355,8 +35141,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
34355
35141
|
const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir);
|
|
34356
35142
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
34357
35143
|
const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? renderedCase.metadata : void 0;
|
|
34358
|
-
const
|
|
34359
|
-
const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload);
|
|
35144
|
+
const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
|
|
34360
35145
|
const caseTargets = extractTargetsFromTestCase(renderedCase);
|
|
34361
35146
|
const dependsOn = Array.isArray(renderedCase.depends_on) ? renderedCase.depends_on.filter(
|
|
34362
35147
|
(v) => typeof v === "string"
|
|
@@ -34395,12 +35180,245 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
34395
35180
|
...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
|
|
34396
35181
|
...windowSize !== void 0 ? { window_size: windowSize } : {},
|
|
34397
35182
|
...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
|
|
34398
|
-
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
|
|
35183
|
+
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {},
|
|
35184
|
+
source: buildEvalTestSource({
|
|
35185
|
+
evalFilePath,
|
|
35186
|
+
absoluteTestPath,
|
|
35187
|
+
repoRootPath,
|
|
35188
|
+
id,
|
|
35189
|
+
renderedCase,
|
|
35190
|
+
rawCaseSnapshots,
|
|
35191
|
+
inputMessages,
|
|
35192
|
+
evaluators,
|
|
35193
|
+
assertionTemplateReferences
|
|
35194
|
+
})
|
|
34399
35195
|
};
|
|
34400
35196
|
results.push(testCase);
|
|
34401
35197
|
}
|
|
34402
35198
|
return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
|
|
34403
35199
|
}
|
|
35200
|
+
var SOURCE_SECRET_KEY_PATTERN = /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i;
|
|
35201
|
+
var REDACTED_SOURCE_VALUE = "[redacted]";
|
|
35202
|
+
function buildRawInlineTestSnapshots(rawParsed) {
|
|
35203
|
+
const snapshots = /* @__PURE__ */ new Map();
|
|
35204
|
+
if (!isJsonObject(rawParsed)) {
|
|
35205
|
+
return snapshots;
|
|
35206
|
+
}
|
|
35207
|
+
const rawTests = rawParsed.tests ?? rawParsed.eval_cases ?? rawParsed.evalcases;
|
|
35208
|
+
if (!Array.isArray(rawTests)) {
|
|
35209
|
+
return snapshots;
|
|
35210
|
+
}
|
|
35211
|
+
for (const rawTest of rawTests) {
|
|
35212
|
+
if (!isJsonObject(rawTest) || typeof rawTest.id !== "string") {
|
|
35213
|
+
continue;
|
|
35214
|
+
}
|
|
35215
|
+
snapshots.set(rawTest.id, stringifySourceYaml(rawTest));
|
|
35216
|
+
}
|
|
35217
|
+
return snapshots;
|
|
35218
|
+
}
|
|
35219
|
+
function buildEvalTestSource(params) {
|
|
35220
|
+
const evalFileRepoPath = toPortableRelativePath(params.repoRootPath, params.absoluteTestPath);
|
|
35221
|
+
const testSnapshotYaml = params.rawCaseSnapshots.get(params.id) ?? stringifySourceYaml(params.renderedCase);
|
|
35222
|
+
const evaluatorReferences = collectGraderSourceReferences(params.evaluators);
|
|
35223
|
+
const inputReferences = collectInputSourceReferences(params.inputMessages);
|
|
35224
|
+
const references = dedupeSourceReferences([
|
|
35225
|
+
...inputReferences,
|
|
35226
|
+
...evaluatorReferences,
|
|
35227
|
+
...params.assertionTemplateReferences
|
|
35228
|
+
]);
|
|
35229
|
+
return {
|
|
35230
|
+
evalFilePath: params.evalFilePath,
|
|
35231
|
+
evalFileAbsolutePath: params.absoluteTestPath,
|
|
35232
|
+
...evalFileRepoPath ? { evalFileRepoPath } : {},
|
|
35233
|
+
testId: params.id,
|
|
35234
|
+
testSnapshotYaml,
|
|
35235
|
+
graderDefinitions: buildGraderSourceDefinitions(params.evaluators),
|
|
35236
|
+
references
|
|
35237
|
+
};
|
|
35238
|
+
}
|
|
35239
|
+
function stringifySourceYaml(value) {
|
|
35240
|
+
return stringifyYaml(sanitizeSourceValue(value), { lineWidth: 0 }).trimEnd();
|
|
35241
|
+
}
|
|
35242
|
+
function sanitizeSourceValue(value, keyHint) {
|
|
35243
|
+
if (keyHint && SOURCE_SECRET_KEY_PATTERN.test(keyHint)) {
|
|
35244
|
+
return REDACTED_SOURCE_VALUE;
|
|
35245
|
+
}
|
|
35246
|
+
if (value === null || typeof value === "string" || typeof value === "number") {
|
|
35247
|
+
return value;
|
|
35248
|
+
}
|
|
35249
|
+
if (typeof value === "boolean") {
|
|
35250
|
+
return value;
|
|
35251
|
+
}
|
|
35252
|
+
if (Array.isArray(value)) {
|
|
35253
|
+
return value.map((item) => sanitizeSourceValue(item));
|
|
35254
|
+
}
|
|
35255
|
+
if (typeof value === "object" && value !== null) {
|
|
35256
|
+
const entries = Object.entries(value).map(([key, entryValue]) => [
|
|
35257
|
+
key,
|
|
35258
|
+
sanitizeSourceValue(entryValue, key)
|
|
35259
|
+
]);
|
|
35260
|
+
return Object.fromEntries(entries);
|
|
35261
|
+
}
|
|
35262
|
+
return String(value);
|
|
35263
|
+
}
|
|
35264
|
+
function buildGraderSourceDefinitions(evaluators) {
|
|
35265
|
+
return (evaluators ?? []).map((evaluator) => ({
|
|
35266
|
+
name: evaluator.name,
|
|
35267
|
+
type: evaluator.type,
|
|
35268
|
+
...evaluator.weight !== void 0 ? { weight: evaluator.weight } : {},
|
|
35269
|
+
...evaluator.required !== void 0 ? { required: evaluator.required } : {},
|
|
35270
|
+
..."min_score" in evaluator && evaluator.min_score !== void 0 ? { minScore: evaluator.min_score } : {},
|
|
35271
|
+
definition: sanitizeGraderDefinition(evaluator)
|
|
35272
|
+
}));
|
|
35273
|
+
}
|
|
35274
|
+
function sanitizeGraderDefinition(evaluator) {
|
|
35275
|
+
const copy = sanitizeSourceValue(evaluator);
|
|
35276
|
+
return stripRuntimeResolutionFields(copy);
|
|
35277
|
+
}
|
|
35278
|
+
function stripRuntimeResolutionFields(value) {
|
|
35279
|
+
const stripped = {};
|
|
35280
|
+
for (const [key, entryValue] of Object.entries(value)) {
|
|
35281
|
+
if (key === "resolvedPromptPath" || key === "promptPath" || key === "resolvedPromptScript" || key === "resolvedScriptPath" || key === "resolvedCwd" || key === "resolvedCommand") {
|
|
35282
|
+
continue;
|
|
35283
|
+
}
|
|
35284
|
+
if (Array.isArray(entryValue)) {
|
|
35285
|
+
stripped[key] = entryValue.map(
|
|
35286
|
+
(item) => isJsonObject(item) ? stripRuntimeResolutionFields(item) : item
|
|
35287
|
+
);
|
|
35288
|
+
} else if (isJsonObject(entryValue)) {
|
|
35289
|
+
stripped[key] = stripRuntimeResolutionFields(entryValue);
|
|
35290
|
+
} else {
|
|
35291
|
+
stripped[key] = entryValue;
|
|
35292
|
+
}
|
|
35293
|
+
}
|
|
35294
|
+
return stripped;
|
|
35295
|
+
}
|
|
35296
|
+
function collectInputSourceReferences(inputMessages) {
|
|
35297
|
+
const references = [];
|
|
35298
|
+
for (const message of inputMessages) {
|
|
35299
|
+
if (!Array.isArray(message.content)) {
|
|
35300
|
+
continue;
|
|
35301
|
+
}
|
|
35302
|
+
for (const segment of message.content) {
|
|
35303
|
+
if (!isJsonObject(segment) || segment.type !== "file") {
|
|
35304
|
+
continue;
|
|
35305
|
+
}
|
|
35306
|
+
const displayPath = typeof segment.path === "string" ? segment.path : typeof segment.value === "string" ? segment.value : "input file";
|
|
35307
|
+
references.push({
|
|
35308
|
+
kind: "input_file",
|
|
35309
|
+
displayPath,
|
|
35310
|
+
...typeof segment.resolvedPath === "string" ? { resolvedPath: path43.resolve(segment.resolvedPath) } : {}
|
|
35311
|
+
});
|
|
35312
|
+
}
|
|
35313
|
+
}
|
|
35314
|
+
return references;
|
|
35315
|
+
}
|
|
35316
|
+
function collectGraderSourceReferences(evaluators) {
|
|
35317
|
+
const references = [];
|
|
35318
|
+
for (const evaluator of evaluators ?? []) {
|
|
35319
|
+
references.push(...collectSingleGraderSourceReferences(evaluator));
|
|
35320
|
+
}
|
|
35321
|
+
return references;
|
|
35322
|
+
}
|
|
35323
|
+
function collectSingleGraderSourceReferences(evaluator) {
|
|
35324
|
+
const references = [];
|
|
35325
|
+
if (evaluator.type === "code-grader") {
|
|
35326
|
+
const command = evaluator.command ?? evaluator.script ?? [];
|
|
35327
|
+
references.push({
|
|
35328
|
+
kind: "code_grader_command",
|
|
35329
|
+
displayPath: evaluator.resolvedScriptPath ?? command.join(" "),
|
|
35330
|
+
...evaluator.resolvedScriptPath ? { resolvedPath: evaluator.resolvedScriptPath } : {},
|
|
35331
|
+
graderName: evaluator.name,
|
|
35332
|
+
command
|
|
35333
|
+
});
|
|
35334
|
+
if (evaluator.resolvedCwd) {
|
|
35335
|
+
references.push({
|
|
35336
|
+
kind: "code_grader_cwd",
|
|
35337
|
+
displayPath: evaluator.cwd ?? evaluator.resolvedCwd,
|
|
35338
|
+
resolvedPath: evaluator.resolvedCwd,
|
|
35339
|
+
graderName: evaluator.name
|
|
35340
|
+
});
|
|
35341
|
+
}
|
|
35342
|
+
}
|
|
35343
|
+
if (evaluator.type === "llm-grader") {
|
|
35344
|
+
const promptPath = evaluator.resolvedPromptPath ?? evaluator.promptPath;
|
|
35345
|
+
if (promptPath) {
|
|
35346
|
+
references.push({
|
|
35347
|
+
kind: "llm_grader_prompt",
|
|
35348
|
+
displayPath: typeof evaluator.prompt === "string" ? evaluator.prompt : promptPath,
|
|
35349
|
+
resolvedPath: promptPath,
|
|
35350
|
+
graderName: evaluator.name
|
|
35351
|
+
});
|
|
35352
|
+
}
|
|
35353
|
+
if (evaluator.resolvedPromptScript && evaluator.resolvedPromptScript.length > 0) {
|
|
35354
|
+
references.push({
|
|
35355
|
+
kind: "prompt_script",
|
|
35356
|
+
displayPath: evaluator.resolvedPromptScript.at(-1) ?? evaluator.name,
|
|
35357
|
+
resolvedPath: evaluator.resolvedPromptScript.at(-1),
|
|
35358
|
+
graderName: evaluator.name,
|
|
35359
|
+
command: evaluator.resolvedPromptScript
|
|
35360
|
+
});
|
|
35361
|
+
}
|
|
35362
|
+
}
|
|
35363
|
+
const preprocessors = "preprocessors" in evaluator ? evaluator.preprocessors : void 0;
|
|
35364
|
+
for (const preprocessor of preprocessors ?? []) {
|
|
35365
|
+
if (preprocessor.resolvedCommand && preprocessor.resolvedCommand.length > 0) {
|
|
35366
|
+
references.push({
|
|
35367
|
+
kind: "preprocessor_command",
|
|
35368
|
+
displayPath: preprocessor.resolvedCommand.at(-1) ?? preprocessor.type,
|
|
35369
|
+
resolvedPath: preprocessor.resolvedCommand.at(-1),
|
|
35370
|
+
graderName: evaluator.name,
|
|
35371
|
+
command: preprocessor.resolvedCommand
|
|
35372
|
+
});
|
|
35373
|
+
}
|
|
35374
|
+
}
|
|
35375
|
+
if (evaluator.type === "composite") {
|
|
35376
|
+
for (const member of evaluator.assertions) {
|
|
35377
|
+
references.push(...collectSingleGraderSourceReferences(member));
|
|
35378
|
+
}
|
|
35379
|
+
if (evaluator.aggregator.type === "code-grader") {
|
|
35380
|
+
references.push({
|
|
35381
|
+
kind: "code_grader_command",
|
|
35382
|
+
displayPath: evaluator.aggregator.path,
|
|
35383
|
+
resolvedPath: path43.resolve(evaluator.aggregator.cwd ?? "", evaluator.aggregator.path),
|
|
35384
|
+
graderName: evaluator.name
|
|
35385
|
+
});
|
|
35386
|
+
} else if (evaluator.aggregator.type === "llm-grader" && evaluator.aggregator.promptPath) {
|
|
35387
|
+
references.push({
|
|
35388
|
+
kind: "llm_grader_prompt",
|
|
35389
|
+
displayPath: evaluator.aggregator.prompt ?? evaluator.aggregator.promptPath,
|
|
35390
|
+
resolvedPath: evaluator.aggregator.promptPath,
|
|
35391
|
+
graderName: evaluator.name
|
|
35392
|
+
});
|
|
35393
|
+
}
|
|
35394
|
+
}
|
|
35395
|
+
return references;
|
|
35396
|
+
}
|
|
35397
|
+
function dedupeSourceReferences(references) {
|
|
35398
|
+
const seen = /* @__PURE__ */ new Set();
|
|
35399
|
+
const deduped = [];
|
|
35400
|
+
for (const reference of references) {
|
|
35401
|
+
const key = JSON.stringify([
|
|
35402
|
+
reference.kind,
|
|
35403
|
+
reference.resolvedPath ?? reference.displayPath,
|
|
35404
|
+
reference.graderName ?? "",
|
|
35405
|
+
reference.command?.join("\0") ?? ""
|
|
35406
|
+
]);
|
|
35407
|
+
if (seen.has(key)) {
|
|
35408
|
+
continue;
|
|
35409
|
+
}
|
|
35410
|
+
seen.add(key);
|
|
35411
|
+
deduped.push(reference);
|
|
35412
|
+
}
|
|
35413
|
+
return deduped;
|
|
35414
|
+
}
|
|
35415
|
+
function toPortableRelativePath(root, candidate) {
|
|
35416
|
+
const relative = path43.relative(root, candidate);
|
|
35417
|
+
if (relative && !relative.startsWith("..") && !path43.isAbsolute(relative)) {
|
|
35418
|
+
return relative.split(path43.sep).join("/");
|
|
35419
|
+
}
|
|
35420
|
+
return void 0;
|
|
35421
|
+
}
|
|
34404
35422
|
async function loadTestById(evalFilePath, repoRoot, evalId) {
|
|
34405
35423
|
const tests = await loadTests(evalFilePath, repoRoot);
|
|
34406
35424
|
const match = tests.find((c) => c.id === evalId);
|
|
@@ -34493,7 +35511,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
34493
35511
|
const workspaceFilePath = path43.resolve(evalFileDir, raw);
|
|
34494
35512
|
let content;
|
|
34495
35513
|
try {
|
|
34496
|
-
content = await
|
|
35514
|
+
content = await readFile16(workspaceFilePath, "utf8");
|
|
34497
35515
|
} catch {
|
|
34498
35516
|
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
34499
35517
|
}
|
|
@@ -34617,19 +35635,18 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
34617
35635
|
function asString5(value) {
|
|
34618
35636
|
return typeof value === "string" ? value : void 0;
|
|
34619
35637
|
}
|
|
34620
|
-
function
|
|
35638
|
+
function extractSuiteMetadataPayload(suite) {
|
|
35639
|
+
const payload = isJsonObject(suite.metadata) ? { ...suite.metadata } : {};
|
|
34621
35640
|
const top = suite.governance;
|
|
34622
35641
|
if (isJsonObject(top)) {
|
|
34623
|
-
|
|
34624
|
-
}
|
|
34625
|
-
|
|
34626
|
-
if (isJsonObject(wrapper)) {
|
|
34627
|
-
const nested = wrapper.governance;
|
|
35642
|
+
payload.governance = top;
|
|
35643
|
+
} else {
|
|
35644
|
+
const nested = payload.governance;
|
|
34628
35645
|
if (isJsonObject(nested)) {
|
|
34629
|
-
|
|
35646
|
+
payload.governance = nested;
|
|
34630
35647
|
}
|
|
34631
35648
|
}
|
|
34632
|
-
return void 0;
|
|
35649
|
+
return Object.keys(payload).length > 0 ? payload : void 0;
|
|
34633
35650
|
}
|
|
34634
35651
|
function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
|
|
34635
35652
|
if (!suitePayload) return caseMetadata;
|
|
@@ -35118,7 +36135,7 @@ async function runEvaluation(options) {
|
|
|
35118
36135
|
const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
|
|
35119
36136
|
if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
|
|
35120
36137
|
if (!dirExists) {
|
|
35121
|
-
await
|
|
36138
|
+
await mkdir15(configuredStaticPath, { recursive: true });
|
|
35122
36139
|
}
|
|
35123
36140
|
if (workspaceTemplate) {
|
|
35124
36141
|
await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
|
|
@@ -35163,7 +36180,7 @@ async function runEvaluation(options) {
|
|
|
35163
36180
|
}
|
|
35164
36181
|
} else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
|
|
35165
36182
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
35166
|
-
await
|
|
36183
|
+
await mkdir15(sharedWorkspacePath, { recursive: true });
|
|
35167
36184
|
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
35168
36185
|
}
|
|
35169
36186
|
try {
|
|
@@ -36013,7 +37030,7 @@ async function runEvalCase(options) {
|
|
|
36013
37030
|
}
|
|
36014
37031
|
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
36015
37032
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
36016
|
-
await
|
|
37033
|
+
await mkdir15(workspacePath, { recursive: true });
|
|
36017
37034
|
}
|
|
36018
37035
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
36019
37036
|
const localPathErrors = RepoManager.validateLocalPaths(evalCase.workspace.repos);
|
|
@@ -36068,7 +37085,7 @@ async function runEvalCase(options) {
|
|
|
36068
37085
|
const srcPath = path44.resolve(baseDir, relPath);
|
|
36069
37086
|
const destPath = path44.resolve(workspacePath, relPath);
|
|
36070
37087
|
try {
|
|
36071
|
-
await
|
|
37088
|
+
await mkdir15(path44.dirname(destPath), { recursive: true });
|
|
36072
37089
|
await copyFile2(srcPath, destPath);
|
|
36073
37090
|
} catch (error40) {
|
|
36074
37091
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
@@ -37632,6 +38649,12 @@ async function evaluate(config2) {
|
|
|
37632
38649
|
resolvedTarget = resolveTargetDefinition(targetDef);
|
|
37633
38650
|
}
|
|
37634
38651
|
const collectedResults = [];
|
|
38652
|
+
const cacheEnabled = shouldEnableCache({
|
|
38653
|
+
cliCache: config2.cache === true,
|
|
38654
|
+
cliNoCache: false,
|
|
38655
|
+
yamlCache: config2.cache === void 0 ? materialized.cache : void 0
|
|
38656
|
+
});
|
|
38657
|
+
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path45.resolve(materialized.cachePath) : void 0) : void 0;
|
|
37635
38658
|
const results = await runEvaluation({
|
|
37636
38659
|
testFilePath,
|
|
37637
38660
|
repoRoot,
|
|
@@ -37644,6 +38667,8 @@ async function evaluate(config2) {
|
|
|
37644
38667
|
filter: config2.filter,
|
|
37645
38668
|
threshold: config2.threshold,
|
|
37646
38669
|
evalCases: materialized.tests,
|
|
38670
|
+
cache,
|
|
38671
|
+
useCache: !!cache && !shouldSkipCacheForTemperature(resolvedTarget.config),
|
|
37647
38672
|
...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
|
|
37648
38673
|
onResult: async (result) => {
|
|
37649
38674
|
collectedResults.push(result);
|
|
@@ -37674,6 +38699,7 @@ async function materializeEvalConfig(config2, options) {
|
|
|
37674
38699
|
tests: tests2,
|
|
37675
38700
|
workers: config2.workers ?? suite.workers,
|
|
37676
38701
|
cache: config2.cache ?? suite.cacheConfig?.enabled,
|
|
38702
|
+
cachePath: config2.cachePath ?? suite.cacheConfig?.cachePath,
|
|
37677
38703
|
budgetUsd: config2.budgetUsd ?? suite.budgetUsd,
|
|
37678
38704
|
threshold: config2.threshold ?? suite.threshold,
|
|
37679
38705
|
metadata: config2.metadata ?? suite.metadata,
|
|
@@ -37692,6 +38718,7 @@ async function materializeEvalConfig(config2, options) {
|
|
|
37692
38718
|
tests,
|
|
37693
38719
|
workers: config2.workers,
|
|
37694
38720
|
cache: config2.cache,
|
|
38721
|
+
cachePath: config2.cachePath,
|
|
37695
38722
|
budgetUsd: config2.budgetUsd,
|
|
37696
38723
|
threshold: config2.threshold,
|
|
37697
38724
|
metadata: config2.metadata,
|
|
@@ -37809,9 +38836,11 @@ function mapAssertionType(type) {
|
|
|
37809
38836
|
}
|
|
37810
38837
|
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
37811
38838
|
const total = results.length;
|
|
38839
|
+
const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
|
|
38840
|
+
const executionErrors = total - qualityResults.length;
|
|
37812
38841
|
let passed = 0;
|
|
37813
38842
|
let scoreSum = 0;
|
|
37814
|
-
for (const r of
|
|
38843
|
+
for (const r of qualityResults) {
|
|
37815
38844
|
scoreSum += r.score;
|
|
37816
38845
|
if (r.score >= threshold) {
|
|
37817
38846
|
passed++;
|
|
@@ -37820,9 +38849,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
|
37820
38849
|
return {
|
|
37821
38850
|
total,
|
|
37822
38851
|
passed,
|
|
37823
|
-
failed:
|
|
38852
|
+
failed: qualityResults.length - passed,
|
|
38853
|
+
executionErrors,
|
|
37824
38854
|
durationMs,
|
|
37825
|
-
meanScore:
|
|
38855
|
+
meanScore: qualityResults.length > 0 ? scoreSum / qualityResults.length : 0
|
|
37826
38856
|
};
|
|
37827
38857
|
}
|
|
37828
38858
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
@@ -37903,7 +38933,12 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
|
|
|
37903
38933
|
return {
|
|
37904
38934
|
tests: materialized.tests,
|
|
37905
38935
|
...materialized.workers !== void 0 && { workers: materialized.workers },
|
|
37906
|
-
...materialized.cache !== void 0 && {
|
|
38936
|
+
...materialized.cache !== void 0 && {
|
|
38937
|
+
cacheConfig: {
|
|
38938
|
+
enabled: materialized.cache,
|
|
38939
|
+
...materialized.cachePath !== void 0 && { cachePath: materialized.cachePath }
|
|
38940
|
+
}
|
|
38941
|
+
},
|
|
37907
38942
|
...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
|
|
37908
38943
|
...materialized.threshold !== void 0 && { threshold: materialized.threshold },
|
|
37909
38944
|
...materialized.metadata !== void 0 && { metadata: materialized.metadata },
|
|
@@ -37936,7 +38971,15 @@ export {
|
|
|
37936
38971
|
isJsonValue,
|
|
37937
38972
|
isTestMessage,
|
|
37938
38973
|
isGraderKind,
|
|
38974
|
+
RUBRIC_OPERATOR_VALUES,
|
|
37939
38975
|
parseYamlValue,
|
|
38976
|
+
getAgentvConfigDir,
|
|
38977
|
+
getAgentvHome,
|
|
38978
|
+
getAgentvDataDir,
|
|
38979
|
+
getWorkspacesRoot,
|
|
38980
|
+
getSubagentsRoot,
|
|
38981
|
+
getTraceStateRoot,
|
|
38982
|
+
getWorkspacePoolRoot,
|
|
37940
38983
|
fileExists,
|
|
37941
38984
|
normalizeLineEndings,
|
|
37942
38985
|
readTextFile,
|
|
@@ -37956,6 +38999,9 @@ export {
|
|
|
37956
38999
|
interpolateEnv,
|
|
37957
39000
|
loadCasesFromFile,
|
|
37958
39001
|
loadCasesFromDirectory,
|
|
39002
|
+
ResponseCache,
|
|
39003
|
+
shouldEnableCache,
|
|
39004
|
+
shouldSkipCacheForTemperature,
|
|
37959
39005
|
DEFAULT_THRESHOLD,
|
|
37960
39006
|
PASS_THRESHOLD,
|
|
37961
39007
|
scoreToVerdict,
|
|
@@ -37966,13 +39012,6 @@ export {
|
|
|
37966
39012
|
parseJsonSafe,
|
|
37967
39013
|
deepEqual,
|
|
37968
39014
|
negateScore,
|
|
37969
|
-
getAgentvConfigDir,
|
|
37970
|
-
getAgentvHome,
|
|
37971
|
-
getAgentvDataDir,
|
|
37972
|
-
getWorkspacesRoot,
|
|
37973
|
-
getSubagentsRoot,
|
|
37974
|
-
getTraceStateRoot,
|
|
37975
|
-
getWorkspacePoolRoot,
|
|
37976
39015
|
toSnakeCaseDeep,
|
|
37977
39016
|
toCamelCaseDeep,
|
|
37978
39017
|
CodeGrader,
|
|
@@ -37990,7 +39029,28 @@ export {
|
|
|
37990
39029
|
extractImageBlocks,
|
|
37991
39030
|
CompositeGrader,
|
|
37992
39031
|
CostGrader,
|
|
39032
|
+
NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
|
|
39033
|
+
NORMALIZED_TRACE_SOURCE_KINDS,
|
|
39034
|
+
NORMALIZED_TRACE_EVENT_TYPES,
|
|
39035
|
+
NORMALIZED_TOOL_STATUSES,
|
|
39036
|
+
NORMALIZED_REDACTION_LEVELS,
|
|
39037
|
+
NormalizedRedactionStateWireSchema,
|
|
39038
|
+
NormalizedTraceErrorWireSchema,
|
|
39039
|
+
NormalizedTraceSourceWireSchema,
|
|
39040
|
+
NormalizedTraceSessionWireSchema,
|
|
39041
|
+
NormalizedTraceBranchWireSchema,
|
|
39042
|
+
NormalizedTraceSourceRefWireSchema,
|
|
39043
|
+
NormalizedRawEvidenceWireSchema,
|
|
39044
|
+
NormalizedTraceMessageWireSchema,
|
|
39045
|
+
NormalizedTraceModelWireSchema,
|
|
39046
|
+
NormalizedTraceToolWireSchema,
|
|
39047
|
+
NormalizedTraceEventWireSchema,
|
|
39048
|
+
NormalizedTrajectoryWireSchema,
|
|
39049
|
+
toNormalizedTrajectoryWire,
|
|
39050
|
+
fromNormalizedTrajectoryWire,
|
|
37993
39051
|
computeTraceSummary,
|
|
39052
|
+
getSelectedTrajectoryEvents,
|
|
39053
|
+
computeTraceSummaryFromTrajectory,
|
|
37994
39054
|
DEFAULT_EXPLORATION_TOOLS,
|
|
37995
39055
|
explorationRatio,
|
|
37996
39056
|
tokensPerTool,
|
|
@@ -38071,6 +39131,7 @@ export {
|
|
|
38071
39131
|
extractCacheConfig,
|
|
38072
39132
|
extractFailOnError,
|
|
38073
39133
|
extractThreshold,
|
|
39134
|
+
resolveResultsConfigForProject,
|
|
38074
39135
|
detectFormat,
|
|
38075
39136
|
parseRepoSource,
|
|
38076
39137
|
parseRepoCheckout,
|
|
@@ -38089,4 +39150,4 @@ export {
|
|
|
38089
39150
|
loadTsEvalFile,
|
|
38090
39151
|
loadTsEvalSuite
|
|
38091
39152
|
};
|
|
38092
|
-
//# sourceMappingURL=chunk-
|
|
39153
|
+
//# sourceMappingURL=chunk-6QEIZ33V.js.map
|