agentv 0.23.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -11
- package/dist/{chunk-4T62HFF4.js → chunk-6ZM7WVSC.js} +900 -250
- package/dist/chunk-6ZM7WVSC.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +10 -10
- package/dist/templates/.agentv/targets.yaml +8 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +75 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +139 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +237 -0
- package/package.json +1 -1
- package/dist/chunk-4T62HFF4.js.map +0 -1
|
@@ -146,7 +146,7 @@ import { binary, run, subcommands as subcommands2 } from "cmd-ts";
|
|
|
146
146
|
|
|
147
147
|
// src/commands/eval/index.ts
|
|
148
148
|
import { stat as stat4 } from "node:fs/promises";
|
|
149
|
-
import
|
|
149
|
+
import path20 from "node:path";
|
|
150
150
|
import {
|
|
151
151
|
command,
|
|
152
152
|
flag,
|
|
@@ -161,13 +161,14 @@ import fg from "fast-glob";
|
|
|
161
161
|
// src/commands/eval/run-eval.ts
|
|
162
162
|
import { constants as constants6 } from "node:fs";
|
|
163
163
|
import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
164
|
-
import
|
|
164
|
+
import path19 from "node:path";
|
|
165
165
|
import { pathToFileURL } from "node:url";
|
|
166
166
|
|
|
167
|
-
// ../../packages/core/dist/chunk-
|
|
167
|
+
// ../../packages/core/dist/chunk-NDEN3H2B.js
|
|
168
168
|
import { constants } from "node:fs";
|
|
169
169
|
import { access, readFile } from "node:fs/promises";
|
|
170
170
|
import path from "node:path";
|
|
171
|
+
import path2 from "node:path";
|
|
171
172
|
|
|
172
173
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/external.js
|
|
173
174
|
var external_exports = {};
|
|
@@ -647,8 +648,8 @@ function getErrorMap() {
|
|
|
647
648
|
|
|
648
649
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
649
650
|
var makeIssue = (params) => {
|
|
650
|
-
const { data, path:
|
|
651
|
-
const fullPath = [...
|
|
651
|
+
const { data, path: path27, errorMaps, issueData } = params;
|
|
652
|
+
const fullPath = [...path27, ...issueData.path || []];
|
|
652
653
|
const fullIssue = {
|
|
653
654
|
...issueData,
|
|
654
655
|
path: fullPath
|
|
@@ -764,11 +765,11 @@ var errorUtil;
|
|
|
764
765
|
|
|
765
766
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
766
767
|
var ParseInputLazyPath = class {
|
|
767
|
-
constructor(parent, value,
|
|
768
|
+
constructor(parent, value, path27, key2) {
|
|
768
769
|
this._cachedPath = [];
|
|
769
770
|
this.parent = parent;
|
|
770
771
|
this.data = value;
|
|
771
|
-
this._path =
|
|
772
|
+
this._path = path27;
|
|
772
773
|
this._key = key2;
|
|
773
774
|
}
|
|
774
775
|
get path() {
|
|
@@ -4210,7 +4211,7 @@ var coerce = {
|
|
|
4210
4211
|
};
|
|
4211
4212
|
var NEVER = INVALID;
|
|
4212
4213
|
|
|
4213
|
-
// ../../packages/core/dist/chunk-
|
|
4214
|
+
// ../../packages/core/dist/chunk-NDEN3H2B.js
|
|
4214
4215
|
async function fileExists(filePath) {
|
|
4215
4216
|
try {
|
|
4216
4217
|
await access(filePath, constants.F_OK);
|
|
@@ -4226,6 +4227,10 @@ async function readTextFile(filePath) {
|
|
|
4226
4227
|
const content = await readFile(filePath, "utf8");
|
|
4227
4228
|
return normalizeLineEndings(content);
|
|
4228
4229
|
}
|
|
4230
|
+
async function readJsonFile(filePath) {
|
|
4231
|
+
const content = await readFile(filePath, "utf8");
|
|
4232
|
+
return JSON.parse(content);
|
|
4233
|
+
}
|
|
4229
4234
|
async function findGitRoot(startPath) {
|
|
4230
4235
|
let currentDir = path.dirname(path.resolve(startPath));
|
|
4231
4236
|
const root2 = path.parse(currentDir).root;
|
|
@@ -4331,7 +4336,7 @@ var BASE_TARGET_SCHEMA = external_exports.object({
|
|
|
4331
4336
|
judge_target: external_exports.string().optional(),
|
|
4332
4337
|
workers: external_exports.number().int().min(1).optional()
|
|
4333
4338
|
}).passthrough();
|
|
4334
|
-
var DEFAULT_AZURE_API_VERSION = "2024-
|
|
4339
|
+
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
4335
4340
|
function normalizeAzureApiVersion(value) {
|
|
4336
4341
|
if (!value) {
|
|
4337
4342
|
return DEFAULT_AZURE_API_VERSION;
|
|
@@ -4375,7 +4380,7 @@ function resolveRetryConfig(target) {
|
|
|
4375
4380
|
retryableStatusCodes
|
|
4376
4381
|
};
|
|
4377
4382
|
}
|
|
4378
|
-
function resolveTargetDefinition(definition, env = process.env) {
|
|
4383
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
4379
4384
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
4380
4385
|
const provider = parsed.provider.toLowerCase();
|
|
4381
4386
|
const providerBatching = resolveOptionalBoolean(
|
|
@@ -4448,7 +4453,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
4448
4453
|
judgeTarget: parsed.judge_target,
|
|
4449
4454
|
workers: parsed.workers,
|
|
4450
4455
|
providerBatching,
|
|
4451
|
-
config: resolveCliConfig(parsed, env)
|
|
4456
|
+
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
4452
4457
|
};
|
|
4453
4458
|
default:
|
|
4454
4459
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
@@ -4465,7 +4470,10 @@ function resolveAzureConfig(target, env) {
|
|
|
4465
4470
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
4466
4471
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
4467
4472
|
const version2 = normalizeAzureApiVersion(
|
|
4468
|
-
resolveOptionalString(versionSource, env, `${target.name} api version
|
|
4473
|
+
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
4474
|
+
allowLiteral: true,
|
|
4475
|
+
optionalEnv: true
|
|
4476
|
+
})
|
|
4469
4477
|
);
|
|
4470
4478
|
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
4471
4479
|
const maxOutputTokens = resolveOptionalNumber(
|
|
@@ -4566,7 +4574,8 @@ function normalizeCodexLogFormat(value) {
|
|
|
4566
4574
|
}
|
|
4567
4575
|
function resolveMockConfig(target) {
|
|
4568
4576
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
4569
|
-
|
|
4577
|
+
const trace2 = Array.isArray(target.trace) ? target.trace : void 0;
|
|
4578
|
+
return { response, trace: trace2 };
|
|
4570
4579
|
}
|
|
4571
4580
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
4572
4581
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -4598,15 +4607,18 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4598
4607
|
workspaceTemplate
|
|
4599
4608
|
};
|
|
4600
4609
|
}
|
|
4601
|
-
function resolveCliConfig(target, env) {
|
|
4610
|
+
function resolveCliConfig(target, env, evalFilePath) {
|
|
4602
4611
|
const commandTemplateSource = target.command_template ?? target.commandTemplate;
|
|
4603
4612
|
const filesFormat = resolveOptionalLiteralString(
|
|
4604
4613
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
4605
4614
|
);
|
|
4606
|
-
|
|
4615
|
+
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
4607
4616
|
allowLiteral: true,
|
|
4608
4617
|
optionalEnv: true
|
|
4609
4618
|
});
|
|
4619
|
+
if (!cwd && evalFilePath) {
|
|
4620
|
+
cwd = path2.dirname(path2.resolve(evalFilePath));
|
|
4621
|
+
}
|
|
4610
4622
|
const timeoutMs = resolveTimeoutMs(
|
|
4611
4623
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
4612
4624
|
`${target.name} timeout`
|
|
@@ -4724,17 +4736,15 @@ function resolveOptionalString(source2, env, description, options) {
|
|
|
4724
4736
|
if (envVarMatch) {
|
|
4725
4737
|
const varName = envVarMatch[1];
|
|
4726
4738
|
const envValue = env[varName];
|
|
4727
|
-
if (envValue !== void 0) {
|
|
4728
|
-
if (envValue.trim().length === 0) {
|
|
4729
|
-
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
4730
|
-
}
|
|
4731
|
-
return envValue;
|
|
4732
|
-
}
|
|
4733
4739
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
4734
|
-
if (
|
|
4735
|
-
|
|
4740
|
+
if (envValue === void 0 || envValue.trim().length === 0) {
|
|
4741
|
+
if (optionalEnv) {
|
|
4742
|
+
return void 0;
|
|
4743
|
+
}
|
|
4744
|
+
const status = envValue === void 0 ? "is not set" : "is empty";
|
|
4745
|
+
throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
|
|
4736
4746
|
}
|
|
4737
|
-
|
|
4747
|
+
return envValue;
|
|
4738
4748
|
}
|
|
4739
4749
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
4740
4750
|
if (!allowLiteral) {
|
|
@@ -4889,7 +4899,7 @@ import micromatch from "micromatch";
|
|
|
4889
4899
|
import { parse as parse5 } from "yaml";
|
|
4890
4900
|
import { constants as constants3 } from "node:fs";
|
|
4891
4901
|
import { access as access3 } from "node:fs/promises";
|
|
4892
|
-
import
|
|
4902
|
+
import path13 from "node:path";
|
|
4893
4903
|
import path32 from "node:path";
|
|
4894
4904
|
import { readFile as readFile22 } from "node:fs/promises";
|
|
4895
4905
|
import { readFile as readFile32 } from "node:fs/promises";
|
|
@@ -5985,10 +5995,10 @@ function assignProp(target, prop, value) {
|
|
|
5985
5995
|
configurable: true
|
|
5986
5996
|
});
|
|
5987
5997
|
}
|
|
5988
|
-
function getElementAtPath(obj,
|
|
5989
|
-
if (!
|
|
5998
|
+
function getElementAtPath(obj, path27) {
|
|
5999
|
+
if (!path27)
|
|
5990
6000
|
return obj;
|
|
5991
|
-
return
|
|
6001
|
+
return path27.reduce((acc, key2) => acc?.[key2], obj);
|
|
5992
6002
|
}
|
|
5993
6003
|
function promiseAllObject(promisesObj) {
|
|
5994
6004
|
const keys = Object.keys(promisesObj);
|
|
@@ -6308,11 +6318,11 @@ function aborted(x, startIndex = 0) {
|
|
|
6308
6318
|
}
|
|
6309
6319
|
return false;
|
|
6310
6320
|
}
|
|
6311
|
-
function prefixIssues(
|
|
6321
|
+
function prefixIssues(path27, issues) {
|
|
6312
6322
|
return issues.map((iss) => {
|
|
6313
6323
|
var _a17;
|
|
6314
6324
|
(_a17 = iss).path ?? (_a17.path = []);
|
|
6315
|
-
iss.path.unshift(
|
|
6325
|
+
iss.path.unshift(path27);
|
|
6316
6326
|
return iss;
|
|
6317
6327
|
});
|
|
6318
6328
|
}
|
|
@@ -6449,7 +6459,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6449
6459
|
return issue2.message;
|
|
6450
6460
|
};
|
|
6451
6461
|
const result = { errors: [] };
|
|
6452
|
-
const processError = (error41,
|
|
6462
|
+
const processError = (error41, path27 = []) => {
|
|
6453
6463
|
var _a17, _b8;
|
|
6454
6464
|
for (const issue2 of error41.issues) {
|
|
6455
6465
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -6459,7 +6469,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6459
6469
|
} else if (issue2.code === "invalid_element") {
|
|
6460
6470
|
processError({ issues: issue2.issues }, issue2.path);
|
|
6461
6471
|
} else {
|
|
6462
|
-
const fullpath = [...
|
|
6472
|
+
const fullpath = [...path27, ...issue2.path];
|
|
6463
6473
|
if (fullpath.length === 0) {
|
|
6464
6474
|
result.errors.push(mapper(issue2));
|
|
6465
6475
|
continue;
|
|
@@ -6489,9 +6499,9 @@ function treeifyError(error40, _mapper) {
|
|
|
6489
6499
|
processError(error40);
|
|
6490
6500
|
return result;
|
|
6491
6501
|
}
|
|
6492
|
-
function toDotPath(
|
|
6502
|
+
function toDotPath(path27) {
|
|
6493
6503
|
const segs = [];
|
|
6494
|
-
for (const seg of
|
|
6504
|
+
for (const seg of path27) {
|
|
6495
6505
|
if (typeof seg === "number")
|
|
6496
6506
|
segs.push(`[${seg}]`);
|
|
6497
6507
|
else if (typeof seg === "symbol")
|
|
@@ -26044,14 +26054,14 @@ function createAzure(options = {}) {
|
|
|
26044
26054
|
description: "Azure OpenAI resource name"
|
|
26045
26055
|
});
|
|
26046
26056
|
const apiVersion = (_a17 = options.apiVersion) != null ? _a17 : "v1";
|
|
26047
|
-
const url2 = ({ path:
|
|
26057
|
+
const url2 = ({ path: path27, modelId }) => {
|
|
26048
26058
|
var _a24;
|
|
26049
26059
|
const baseUrlPrefix = (_a24 = options.baseURL) != null ? _a24 : `https://${getResourceName()}.openai.azure.com/openai`;
|
|
26050
26060
|
let fullUrl;
|
|
26051
26061
|
if (options.useDeploymentBasedUrls) {
|
|
26052
|
-
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${
|
|
26062
|
+
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${path27}`);
|
|
26053
26063
|
} else {
|
|
26054
|
-
fullUrl = new URL(`${baseUrlPrefix}/v1${
|
|
26064
|
+
fullUrl = new URL(`${baseUrlPrefix}/v1${path27}`);
|
|
26055
26065
|
}
|
|
26056
26066
|
fullUrl.searchParams.set("api-version", apiVersion);
|
|
26057
26067
|
return fullUrl.toString();
|
|
@@ -32499,12 +32509,12 @@ import path102 from "node:path";
|
|
|
32499
32509
|
|
|
32500
32510
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/agentDispatch.js
|
|
32501
32511
|
import { stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
|
|
32502
|
-
import
|
|
32512
|
+
import path11 from "node:path";
|
|
32503
32513
|
|
|
32504
32514
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/utils/fs.js
|
|
32505
32515
|
import { constants as constants2 } from "node:fs";
|
|
32506
32516
|
import { access as access2, mkdir, readdir, rm, stat } from "node:fs/promises";
|
|
32507
|
-
import
|
|
32517
|
+
import path3 from "node:path";
|
|
32508
32518
|
async function pathExists(target) {
|
|
32509
32519
|
try {
|
|
32510
32520
|
await access2(target, constants2.F_OK);
|
|
@@ -32520,7 +32530,7 @@ async function readDirEntries(target) {
|
|
|
32520
32530
|
const entries = await readdir(target, { withFileTypes: true });
|
|
32521
32531
|
return entries.map((entry) => ({
|
|
32522
32532
|
name: entry.name,
|
|
32523
|
-
absolutePath:
|
|
32533
|
+
absolutePath: path3.join(target, entry.name),
|
|
32524
32534
|
isDirectory: entry.isDirectory()
|
|
32525
32535
|
}));
|
|
32526
32536
|
}
|
|
@@ -32535,9 +32545,9 @@ async function removeIfExists(target) {
|
|
|
32535
32545
|
}
|
|
32536
32546
|
|
|
32537
32547
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/utils/path.js
|
|
32538
|
-
import
|
|
32548
|
+
import path4 from "node:path";
|
|
32539
32549
|
function pathToFileUri(filePath) {
|
|
32540
|
-
const absolutePath =
|
|
32550
|
+
const absolutePath = path4.isAbsolute(filePath) ? filePath : path4.resolve(filePath);
|
|
32541
32551
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
32542
32552
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
32543
32553
|
return `file:///${normalizedPath}`;
|
|
@@ -32546,7 +32556,7 @@ function pathToFileUri(filePath) {
|
|
|
32546
32556
|
}
|
|
32547
32557
|
|
|
32548
32558
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/promptBuilder.js
|
|
32549
|
-
import
|
|
32559
|
+
import path5 from "node:path";
|
|
32550
32560
|
|
|
32551
32561
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/utils/template.js
|
|
32552
32562
|
function renderTemplate(content, variables) {
|
|
@@ -32636,8 +32646,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
32636
32646
|
});
|
|
32637
32647
|
}
|
|
32638
32648
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
32639
|
-
const requestLines = requestFiles.map((file2, index) => `${index + 1}. messages/${
|
|
32640
|
-
const responseList = responseFiles.map((file2) => `"${
|
|
32649
|
+
const requestLines = requestFiles.map((file2, index) => `${index + 1}. messages/${path5.basename(file2)}`).join("\n");
|
|
32650
|
+
const responseList = responseFiles.map((file2) => `"${path5.basename(file2)}"`).join(", ");
|
|
32641
32651
|
return renderTemplate(templateContent, {
|
|
32642
32652
|
requestFiles: requestLines,
|
|
32643
32653
|
responseList
|
|
@@ -32646,7 +32656,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
32646
32656
|
|
|
32647
32657
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/responseWaiter.js
|
|
32648
32658
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
32649
|
-
import
|
|
32659
|
+
import path6 from "node:path";
|
|
32650
32660
|
|
|
32651
32661
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/utils/time.js
|
|
32652
32662
|
function sleep(ms) {
|
|
@@ -32695,7 +32705,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
32695
32705
|
}
|
|
32696
32706
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false) {
|
|
32697
32707
|
if (!silent) {
|
|
32698
|
-
const fileList = responseFilesFinal.map((file2) =>
|
|
32708
|
+
const fileList = responseFilesFinal.map((file2) => path6.basename(file2)).join(", ");
|
|
32699
32709
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
32700
32710
|
}
|
|
32701
32711
|
try {
|
|
@@ -32745,17 +32755,17 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
32745
32755
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/vscodeProcess.js
|
|
32746
32756
|
import { exec, spawn } from "node:child_process";
|
|
32747
32757
|
import { mkdir as mkdir2, writeFile } from "node:fs/promises";
|
|
32748
|
-
import
|
|
32758
|
+
import path8 from "node:path";
|
|
32749
32759
|
import { promisify } from "node:util";
|
|
32750
32760
|
|
|
32751
32761
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/constants.js
|
|
32752
32762
|
import os from "node:os";
|
|
32753
|
-
import
|
|
32763
|
+
import path7 from "node:path";
|
|
32754
32764
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
32755
32765
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
32756
32766
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
32757
32767
|
const folder = vscodeCmd === "code-insiders" ? "vscode-insiders-agents" : "vscode-agents";
|
|
32758
|
-
return
|
|
32768
|
+
return path7.join(os.homedir(), ".subagent", folder);
|
|
32759
32769
|
}
|
|
32760
32770
|
var DEFAULT_SUBAGENT_ROOT = getDefaultSubagentRoot();
|
|
32761
32771
|
|
|
@@ -32782,11 +32792,11 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
32782
32792
|
spawn(vscodeCmd, [workspacePath], { windowsHide: true, shell: true, detached: false });
|
|
32783
32793
|
return true;
|
|
32784
32794
|
}
|
|
32785
|
-
const aliveFile =
|
|
32795
|
+
const aliveFile = path8.join(subagentDir, DEFAULT_ALIVE_FILENAME);
|
|
32786
32796
|
await removeIfExists(aliveFile);
|
|
32787
|
-
const githubAgentsDir =
|
|
32797
|
+
const githubAgentsDir = path8.join(subagentDir, ".github", "agents");
|
|
32788
32798
|
await mkdir2(githubAgentsDir, { recursive: true });
|
|
32789
|
-
const wakeupDst =
|
|
32799
|
+
const wakeupDst = path8.join(githubAgentsDir, "wakeup.md");
|
|
32790
32800
|
await writeFile(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
|
|
32791
32801
|
spawn(vscodeCmd, [workspacePath], { windowsHide: true, shell: true, detached: false });
|
|
32792
32802
|
await sleep(100);
|
|
@@ -32796,7 +32806,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
32796
32806
|
"chat",
|
|
32797
32807
|
"-m",
|
|
32798
32808
|
wakeupChatId,
|
|
32799
|
-
`create a file named .alive in the ${
|
|
32809
|
+
`create a file named .alive in the ${path8.basename(subagentDir)} folder`
|
|
32800
32810
|
];
|
|
32801
32811
|
spawn(vscodeCmd, chatArgs, { windowsHide: true, shell: true, detached: false });
|
|
32802
32812
|
const start = Date.now();
|
|
@@ -32811,10 +32821,10 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
32811
32821
|
}
|
|
32812
32822
|
async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
|
|
32813
32823
|
try {
|
|
32814
|
-
const workspacePath =
|
|
32815
|
-
const messagesDir =
|
|
32824
|
+
const workspacePath = path8.join(subagentDir, `${path8.basename(subagentDir)}.code-workspace`);
|
|
32825
|
+
const messagesDir = path8.join(subagentDir, "messages");
|
|
32816
32826
|
await mkdir2(messagesDir, { recursive: true });
|
|
32817
|
-
const reqFile =
|
|
32827
|
+
const reqFile = path8.join(messagesDir, `${timestamp}_req.md`);
|
|
32818
32828
|
await writeFile(reqFile, requestInstructions, { encoding: "utf8" });
|
|
32819
32829
|
const reqUri = pathToFileUri(reqFile);
|
|
32820
32830
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
@@ -32822,8 +32832,8 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
32822
32832
|
chatArgs.push("-a", attachment);
|
|
32823
32833
|
}
|
|
32824
32834
|
chatArgs.push("-a", reqFile);
|
|
32825
|
-
chatArgs.push(`Follow instructions in [${
|
|
32826
|
-
const workspaceReady = await ensureWorkspaceFocused(workspacePath,
|
|
32835
|
+
chatArgs.push(`Follow instructions in [${path8.basename(reqFile)}](${reqUri})`);
|
|
32836
|
+
const workspaceReady = await ensureWorkspaceFocused(workspacePath, path8.basename(subagentDir), subagentDir, vscodeCmd);
|
|
32827
32837
|
if (!workspaceReady) {
|
|
32828
32838
|
console.error("warning: Workspace may not be fully ready");
|
|
32829
32839
|
}
|
|
@@ -32837,15 +32847,15 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
32837
32847
|
}
|
|
32838
32848
|
async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
|
|
32839
32849
|
try {
|
|
32840
|
-
const workspacePath =
|
|
32841
|
-
const messagesDir =
|
|
32850
|
+
const workspacePath = path8.join(subagentDir, `${path8.basename(subagentDir)}.code-workspace`);
|
|
32851
|
+
const messagesDir = path8.join(subagentDir, "messages");
|
|
32842
32852
|
await mkdir2(messagesDir, { recursive: true });
|
|
32843
32853
|
const chatArgs = ["-r", "chat", "-m", chatId];
|
|
32844
32854
|
for (const attachment of attachmentPaths) {
|
|
32845
32855
|
chatArgs.push("-a", attachment);
|
|
32846
32856
|
}
|
|
32847
32857
|
chatArgs.push(chatInstruction);
|
|
32848
|
-
const workspaceReady = await ensureWorkspaceFocused(workspacePath,
|
|
32858
|
+
const workspaceReady = await ensureWorkspaceFocused(workspacePath, path8.basename(subagentDir), subagentDir, vscodeCmd);
|
|
32849
32859
|
if (!workspaceReady) {
|
|
32850
32860
|
console.error("warning: Workspace may not be fully ready");
|
|
32851
32861
|
}
|
|
@@ -32860,10 +32870,10 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
|
|
|
32860
32870
|
|
|
32861
32871
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/workspaceManager.js
|
|
32862
32872
|
import { copyFile, mkdir as mkdir3, readFile as readFile3, readdir as readdir2, stat as stat2, writeFile as writeFile2 } from "node:fs/promises";
|
|
32863
|
-
import
|
|
32873
|
+
import path10 from "node:path";
|
|
32864
32874
|
|
|
32865
32875
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/utils/workspace.js
|
|
32866
|
-
import
|
|
32876
|
+
import path9 from "node:path";
|
|
32867
32877
|
|
|
32868
32878
|
// ../../node_modules/.bun/json5@2.2.3/node_modules/json5/dist/index.mjs
|
|
32869
32879
|
var Space_Separator = /[\u1680\u2000-\u200A\u202F\u205F\u3000]/;
|
|
@@ -33966,10 +33976,10 @@ function transformWorkspacePaths(workspaceContent, templateDir) {
|
|
|
33966
33976
|
}
|
|
33967
33977
|
const transformedFolders = workspace.folders.map((folder) => {
|
|
33968
33978
|
const folderPath = folder.path;
|
|
33969
|
-
if (
|
|
33979
|
+
if (path9.isAbsolute(folderPath)) {
|
|
33970
33980
|
return folder;
|
|
33971
33981
|
}
|
|
33972
|
-
const absolutePath =
|
|
33982
|
+
const absolutePath = path9.resolve(templateDir, folderPath);
|
|
33973
33983
|
return {
|
|
33974
33984
|
...folder,
|
|
33975
33985
|
path: absolutePath
|
|
@@ -33991,19 +34001,19 @@ function transformWorkspacePaths(workspaceContent, templateDir) {
|
|
|
33991
34001
|
if (locationMap && typeof locationMap === "object") {
|
|
33992
34002
|
const transformedMap = {};
|
|
33993
34003
|
for (const [locationPath, value] of Object.entries(locationMap)) {
|
|
33994
|
-
const isAbsolute =
|
|
34004
|
+
const isAbsolute = path9.isAbsolute(locationPath);
|
|
33995
34005
|
if (isAbsolute) {
|
|
33996
34006
|
transformedMap[locationPath] = value;
|
|
33997
34007
|
} else {
|
|
33998
34008
|
const firstGlobIndex = locationPath.search(/[*]/);
|
|
33999
34009
|
if (firstGlobIndex === -1) {
|
|
34000
|
-
const resolvedPath =
|
|
34010
|
+
const resolvedPath = path9.resolve(templateDir, locationPath).replace(/\\/g, "/");
|
|
34001
34011
|
transformedMap[resolvedPath] = value;
|
|
34002
34012
|
} else {
|
|
34003
34013
|
const basePathEnd = locationPath.lastIndexOf("/", firstGlobIndex);
|
|
34004
34014
|
const basePath = basePathEnd !== -1 ? locationPath.substring(0, basePathEnd) : ".";
|
|
34005
34015
|
const patternPath = locationPath.substring(basePathEnd !== -1 ? basePathEnd : 0);
|
|
34006
|
-
const resolvedPath = (
|
|
34016
|
+
const resolvedPath = (path9.resolve(templateDir, basePath) + patternPath).replace(/\\/g, "/");
|
|
34007
34017
|
transformedMap[resolvedPath] = value;
|
|
34008
34018
|
}
|
|
34009
34019
|
}
|
|
@@ -34041,7 +34051,7 @@ async function findUnlockedSubagent(subagentRoot) {
|
|
|
34041
34051
|
number: Number.parseInt(entry.name.split("-")[1] ?? "", 10)
|
|
34042
34052
|
})).filter((entry) => Number.isInteger(entry.number)).sort((a, b) => a.number - b.number);
|
|
34043
34053
|
for (const subagent of subagents) {
|
|
34044
|
-
const lockFile =
|
|
34054
|
+
const lockFile = path10.join(subagent.absolutePath, DEFAULT_LOCK_NAME);
|
|
34045
34055
|
if (!await pathExists(lockFile)) {
|
|
34046
34056
|
return subagent.absolutePath;
|
|
34047
34057
|
}
|
|
@@ -34051,7 +34061,7 @@ async function findUnlockedSubagent(subagentRoot) {
|
|
|
34051
34061
|
async function copyAgentConfig(subagentDir, workspaceTemplate) {
|
|
34052
34062
|
let workspaceContent;
|
|
34053
34063
|
if (workspaceTemplate) {
|
|
34054
|
-
const workspaceSrc =
|
|
34064
|
+
const workspaceSrc = path10.resolve(workspaceTemplate);
|
|
34055
34065
|
if (!await pathExists(workspaceSrc)) {
|
|
34056
34066
|
throw new Error(`workspace template not found: ${workspaceSrc}`);
|
|
34057
34067
|
}
|
|
@@ -34064,37 +34074,37 @@ async function copyAgentConfig(subagentDir, workspaceTemplate) {
|
|
|
34064
34074
|
} else {
|
|
34065
34075
|
workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
|
|
34066
34076
|
}
|
|
34067
|
-
const workspaceName = `${
|
|
34068
|
-
const workspaceDst =
|
|
34069
|
-
const templateDir = workspaceTemplate ?
|
|
34077
|
+
const workspaceName = `${path10.basename(subagentDir)}.code-workspace`;
|
|
34078
|
+
const workspaceDst = path10.join(subagentDir, workspaceName);
|
|
34079
|
+
const templateDir = workspaceTemplate ? path10.dirname(path10.resolve(workspaceTemplate)) : subagentDir;
|
|
34070
34080
|
const workspaceJson = JSON.stringify(workspaceContent, null, 2);
|
|
34071
34081
|
const transformedContent = transformWorkspacePaths(workspaceJson, templateDir);
|
|
34072
34082
|
await writeFile2(workspaceDst, transformedContent, "utf8");
|
|
34073
|
-
const messagesDir =
|
|
34083
|
+
const messagesDir = path10.join(subagentDir, "messages");
|
|
34074
34084
|
await mkdir3(messagesDir, { recursive: true });
|
|
34075
34085
|
return { workspace: workspaceDst, messagesDir };
|
|
34076
34086
|
}
|
|
34077
34087
|
async function createSubagentLock(subagentDir) {
|
|
34078
|
-
const messagesDir =
|
|
34088
|
+
const messagesDir = path10.join(subagentDir, "messages");
|
|
34079
34089
|
if (await pathExists(messagesDir)) {
|
|
34080
34090
|
const files = await readdir2(messagesDir);
|
|
34081
34091
|
await Promise.all(files.map(async (file2) => {
|
|
34082
|
-
const target =
|
|
34092
|
+
const target = path10.join(messagesDir, file2);
|
|
34083
34093
|
await removeIfExists(target);
|
|
34084
34094
|
}));
|
|
34085
34095
|
}
|
|
34086
|
-
const githubAgentsDir =
|
|
34096
|
+
const githubAgentsDir = path10.join(subagentDir, ".github", "agents");
|
|
34087
34097
|
if (await pathExists(githubAgentsDir)) {
|
|
34088
34098
|
const agentFiles = await readdir2(githubAgentsDir);
|
|
34089
34099
|
const preservedFiles = /* @__PURE__ */ new Set(["wakeup.md", "subagent.md"]);
|
|
34090
|
-
await Promise.all(agentFiles.filter((file2) => file2.endsWith(".md") && !preservedFiles.has(file2)).map((file2) => removeIfExists(
|
|
34100
|
+
await Promise.all(agentFiles.filter((file2) => file2.endsWith(".md") && !preservedFiles.has(file2)).map((file2) => removeIfExists(path10.join(githubAgentsDir, file2))));
|
|
34091
34101
|
}
|
|
34092
|
-
const lockFile =
|
|
34102
|
+
const lockFile = path10.join(subagentDir, DEFAULT_LOCK_NAME);
|
|
34093
34103
|
await writeFile2(lockFile, "", { encoding: "utf8" });
|
|
34094
34104
|
return lockFile;
|
|
34095
34105
|
}
|
|
34096
34106
|
async function removeSubagentLock(subagentDir) {
|
|
34097
|
-
const lockFile =
|
|
34107
|
+
const lockFile = path10.join(subagentDir, DEFAULT_LOCK_NAME);
|
|
34098
34108
|
await removeIfExists(lockFile);
|
|
34099
34109
|
}
|
|
34100
34110
|
async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspaceTemplate, dryRun) {
|
|
@@ -34114,9 +34124,9 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
|
|
|
34114
34124
|
return 1;
|
|
34115
34125
|
}
|
|
34116
34126
|
if (promptFile) {
|
|
34117
|
-
const githubAgentsDir =
|
|
34127
|
+
const githubAgentsDir = path10.join(subagentDir, ".github", "agents");
|
|
34118
34128
|
await mkdir3(githubAgentsDir, { recursive: true });
|
|
34119
|
-
const agentFile =
|
|
34129
|
+
const agentFile = path10.join(githubAgentsDir, `${chatId}.md`);
|
|
34120
34130
|
try {
|
|
34121
34131
|
await copyFile(promptFile, agentFile);
|
|
34122
34132
|
} catch (error40) {
|
|
@@ -34135,7 +34145,7 @@ async function resolvePromptFile(promptFile) {
|
|
|
34135
34145
|
if (!promptFile) {
|
|
34136
34146
|
return void 0;
|
|
34137
34147
|
}
|
|
34138
|
-
const resolvedPrompt =
|
|
34148
|
+
const resolvedPrompt = path11.resolve(promptFile);
|
|
34139
34149
|
if (!await pathExists(resolvedPrompt)) {
|
|
34140
34150
|
throw new Error(`Prompt file not found: ${resolvedPrompt}`);
|
|
34141
34151
|
}
|
|
@@ -34151,7 +34161,7 @@ async function resolveAttachments(extraAttachments) {
|
|
|
34151
34161
|
}
|
|
34152
34162
|
const resolved = [];
|
|
34153
34163
|
for (const attachment of extraAttachments) {
|
|
34154
|
-
const resolvedPath =
|
|
34164
|
+
const resolvedPath = path11.resolve(attachment);
|
|
34155
34165
|
if (!await pathExists(resolvedPath)) {
|
|
34156
34166
|
throw new Error(`Attachment not found: ${resolvedPath}`);
|
|
34157
34167
|
}
|
|
@@ -34180,7 +34190,7 @@ async function dispatchAgentSession(options) {
|
|
|
34180
34190
|
error: "No unlocked subagents available. Provision additional subagents with: subagent code provision --subagents <desired_total>"
|
|
34181
34191
|
};
|
|
34182
34192
|
}
|
|
34183
|
-
const subagentName =
|
|
34193
|
+
const subagentName = path11.basename(subagentDir);
|
|
34184
34194
|
const chatId = Math.random().toString(16).slice(2, 10);
|
|
34185
34195
|
const preparationResult = await prepareSubagentDirectory(subagentDir, resolvedPrompt, chatId, workspaceTemplate, dryRun);
|
|
34186
34196
|
if (preparationResult !== 0) {
|
|
@@ -34201,9 +34211,9 @@ async function dispatchAgentSession(options) {
|
|
|
34201
34211
|
};
|
|
34202
34212
|
}
|
|
34203
34213
|
const timestamp = generateTimestamp();
|
|
34204
|
-
const messagesDir =
|
|
34205
|
-
const responseFileTmp =
|
|
34206
|
-
const responseFileFinal =
|
|
34214
|
+
const messagesDir = path11.join(subagentDir, "messages");
|
|
34215
|
+
const responseFileTmp = path11.join(messagesDir, `${timestamp}_res.tmp.md`);
|
|
34216
|
+
const responseFileFinal = path11.join(messagesDir, `${timestamp}_res.md`);
|
|
34207
34217
|
const requestInstructions = createRequestPrompt(userQuery, responseFileTmp, responseFileFinal, templateContent);
|
|
34208
34218
|
if (dryRun) {
|
|
34209
34219
|
return {
|
|
@@ -34293,7 +34303,7 @@ async function dispatchBatchAgent(options) {
|
|
|
34293
34303
|
error: "No unlocked subagents available. Provision additional subagents with: subagent code provision --subagents <desired_total>"
|
|
34294
34304
|
};
|
|
34295
34305
|
}
|
|
34296
|
-
subagentName =
|
|
34306
|
+
subagentName = path11.basename(subagentDir);
|
|
34297
34307
|
const chatId = Math.random().toString(16).slice(2, 10);
|
|
34298
34308
|
const preparationResult = await prepareSubagentDirectory(subagentDir, resolvedPrompt, chatId, workspaceTemplate, dryRun);
|
|
34299
34309
|
if (preparationResult !== 0) {
|
|
@@ -34318,11 +34328,11 @@ async function dispatchBatchAgent(options) {
|
|
|
34318
34328
|
};
|
|
34319
34329
|
}
|
|
34320
34330
|
const timestamp = generateTimestamp();
|
|
34321
|
-
const messagesDir =
|
|
34322
|
-
requestFiles = userQueries.map((_, index) =>
|
|
34323
|
-
const responseTmpFiles = userQueries.map((_, index) =>
|
|
34324
|
-
responseFilesFinal = userQueries.map((_, index) =>
|
|
34325
|
-
const orchestratorFile =
|
|
34331
|
+
const messagesDir = path11.join(subagentDir, "messages");
|
|
34332
|
+
requestFiles = userQueries.map((_, index) => path11.join(messagesDir, `${timestamp}_${index}_req.md`));
|
|
34333
|
+
const responseTmpFiles = userQueries.map((_, index) => path11.join(messagesDir, `${timestamp}_${index}_res.tmp.md`));
|
|
34334
|
+
responseFilesFinal = userQueries.map((_, index) => path11.join(messagesDir, `${timestamp}_${index}_res.md`));
|
|
34335
|
+
const orchestratorFile = path11.join(messagesDir, `${timestamp}_orchestrator.md`);
|
|
34326
34336
|
if (!dryRun) {
|
|
34327
34337
|
await Promise.all(userQueries.map((query, index) => writeFile3(requestFiles[index], createBatchRequestPrompt(query, responseTmpFiles[index], responseFilesFinal[index], batchRequestTemplateContent), { encoding: "utf8" })));
|
|
34328
34338
|
const orchestratorContent = createBatchOrchestratorPrompt(requestFiles, responseFilesFinal, orchestratorTemplateContent);
|
|
@@ -34391,7 +34401,7 @@ async function dispatchBatchAgent(options) {
|
|
|
34391
34401
|
|
|
34392
34402
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/provision.js
|
|
34393
34403
|
import { writeFile as writeFile4 } from "node:fs/promises";
|
|
34394
|
-
import
|
|
34404
|
+
import path12 from "node:path";
|
|
34395
34405
|
var DEFAULT_WORKSPACE_TEMPLATE2 = {
|
|
34396
34406
|
folders: [
|
|
34397
34407
|
{
|
|
@@ -34414,7 +34424,7 @@ async function provisionSubagents(options) {
|
|
|
34414
34424
|
if (!Number.isInteger(subagents) || subagents < 1) {
|
|
34415
34425
|
throw new Error("subagents must be a positive integer");
|
|
34416
34426
|
}
|
|
34417
|
-
const targetPath =
|
|
34427
|
+
const targetPath = path12.resolve(targetRoot);
|
|
34418
34428
|
if (!dryRun) {
|
|
34419
34429
|
await ensureDir(targetPath);
|
|
34420
34430
|
}
|
|
@@ -34435,7 +34445,7 @@ async function provisionSubagents(options) {
|
|
|
34435
34445
|
continue;
|
|
34436
34446
|
}
|
|
34437
34447
|
highestNumber = Math.max(highestNumber, parsed);
|
|
34438
|
-
const lockFile =
|
|
34448
|
+
const lockFile = path12.join(entry.absolutePath, lockName);
|
|
34439
34449
|
const locked = await pathExists(lockFile);
|
|
34440
34450
|
if (locked) {
|
|
34441
34451
|
lockedSubagents.add(entry.absolutePath);
|
|
@@ -34452,11 +34462,11 @@ async function provisionSubagents(options) {
|
|
|
34452
34462
|
break;
|
|
34453
34463
|
}
|
|
34454
34464
|
const subagentDir = subagent.absolutePath;
|
|
34455
|
-
const githubAgentsDir =
|
|
34456
|
-
const lockFile =
|
|
34457
|
-
const workspaceDst =
|
|
34458
|
-
const wakeupDst =
|
|
34459
|
-
const subagentDst =
|
|
34465
|
+
const githubAgentsDir = path12.join(subagentDir, ".github", "agents");
|
|
34466
|
+
const lockFile = path12.join(subagentDir, lockName);
|
|
34467
|
+
const workspaceDst = path12.join(subagentDir, `${path12.basename(subagentDir)}.code-workspace`);
|
|
34468
|
+
const wakeupDst = path12.join(githubAgentsDir, "wakeup.md");
|
|
34469
|
+
const subagentDst = path12.join(githubAgentsDir, "subagent.md");
|
|
34460
34470
|
const isLocked = await pathExists(lockFile);
|
|
34461
34471
|
if (isLocked && !force) {
|
|
34462
34472
|
continue;
|
|
@@ -34494,11 +34504,11 @@ async function provisionSubagents(options) {
|
|
|
34494
34504
|
let nextIndex = highestNumber;
|
|
34495
34505
|
while (subagentsProvisioned < subagents) {
|
|
34496
34506
|
nextIndex += 1;
|
|
34497
|
-
const subagentDir =
|
|
34498
|
-
const githubAgentsDir =
|
|
34499
|
-
const workspaceDst =
|
|
34500
|
-
const wakeupDst =
|
|
34501
|
-
const subagentDst =
|
|
34507
|
+
const subagentDir = path12.join(targetPath, `subagent-${nextIndex}`);
|
|
34508
|
+
const githubAgentsDir = path12.join(subagentDir, ".github", "agents");
|
|
34509
|
+
const workspaceDst = path12.join(subagentDir, `${path12.basename(subagentDir)}.code-workspace`);
|
|
34510
|
+
const wakeupDst = path12.join(githubAgentsDir, "wakeup.md");
|
|
34511
|
+
const subagentDst = path12.join(githubAgentsDir, "subagent.md");
|
|
34502
34512
|
if (!dryRun) {
|
|
34503
34513
|
await ensureDir(subagentDir);
|
|
34504
34514
|
await ensureDir(githubAgentsDir);
|
|
@@ -34562,11 +34572,47 @@ function isTestMessage(value) {
|
|
|
34562
34572
|
}
|
|
34563
34573
|
return candidate.content.every(isJsonObject);
|
|
34564
34574
|
}
|
|
34565
|
-
var EVALUATOR_KIND_VALUES = [
|
|
34575
|
+
var EVALUATOR_KIND_VALUES = [
|
|
34576
|
+
"code_judge",
|
|
34577
|
+
"llm_judge",
|
|
34578
|
+
"rubric",
|
|
34579
|
+
"composite",
|
|
34580
|
+
"tool_trajectory",
|
|
34581
|
+
"expected_messages"
|
|
34582
|
+
];
|
|
34566
34583
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34567
34584
|
function isEvaluatorKind(value) {
|
|
34568
34585
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
34569
34586
|
}
|
|
34587
|
+
function isTraceEventType(value) {
|
|
34588
|
+
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
34589
|
+
}
|
|
34590
|
+
function isTraceEvent(value) {
|
|
34591
|
+
if (typeof value !== "object" || value === null) {
|
|
34592
|
+
return false;
|
|
34593
|
+
}
|
|
34594
|
+
const candidate = value;
|
|
34595
|
+
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
34596
|
+
}
|
|
34597
|
+
function computeTraceSummary(trace2) {
|
|
34598
|
+
const toolCallCounts = {};
|
|
34599
|
+
let errorCount = 0;
|
|
34600
|
+
for (const event of trace2) {
|
|
34601
|
+
if (event.type === "tool_call" && event.name) {
|
|
34602
|
+
toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
|
|
34603
|
+
}
|
|
34604
|
+
if (event.type === "error") {
|
|
34605
|
+
errorCount++;
|
|
34606
|
+
}
|
|
34607
|
+
}
|
|
34608
|
+
const toolNames = Object.keys(toolCallCounts).sort();
|
|
34609
|
+
return {
|
|
34610
|
+
eventCount: trace2.length,
|
|
34611
|
+
toolNames,
|
|
34612
|
+
toolCallsByName: toolCallCounts,
|
|
34613
|
+
errorCount
|
|
34614
|
+
};
|
|
34615
|
+
}
|
|
34570
34616
|
function extractCodeBlocks(segments) {
|
|
34571
34617
|
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
34572
34618
|
const codeBlocks = [];
|
|
@@ -34660,15 +34706,15 @@ function resolveToAbsolutePath(candidate) {
|
|
|
34660
34706
|
if (candidate.startsWith("file://")) {
|
|
34661
34707
|
return new URL(candidate).pathname;
|
|
34662
34708
|
}
|
|
34663
|
-
return
|
|
34709
|
+
return path13.resolve(candidate);
|
|
34664
34710
|
}
|
|
34665
34711
|
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
34666
34712
|
}
|
|
34667
34713
|
function buildDirectoryChain2(filePath, repoRoot) {
|
|
34668
34714
|
const directories = [];
|
|
34669
34715
|
const seen = /* @__PURE__ */ new Set();
|
|
34670
|
-
const boundary =
|
|
34671
|
-
let current =
|
|
34716
|
+
const boundary = path13.resolve(repoRoot);
|
|
34717
|
+
let current = path13.resolve(path13.dirname(filePath));
|
|
34672
34718
|
while (current !== void 0) {
|
|
34673
34719
|
if (!seen.has(current)) {
|
|
34674
34720
|
directories.push(current);
|
|
@@ -34677,7 +34723,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
34677
34723
|
if (current === boundary) {
|
|
34678
34724
|
break;
|
|
34679
34725
|
}
|
|
34680
|
-
const parent =
|
|
34726
|
+
const parent = path13.dirname(current);
|
|
34681
34727
|
if (parent === current) {
|
|
34682
34728
|
break;
|
|
34683
34729
|
}
|
|
@@ -34691,16 +34737,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
34691
34737
|
function buildSearchRoots2(evalPath, repoRoot) {
|
|
34692
34738
|
const uniqueRoots = [];
|
|
34693
34739
|
const addRoot = (root2) => {
|
|
34694
|
-
const normalized =
|
|
34740
|
+
const normalized = path13.resolve(root2);
|
|
34695
34741
|
if (!uniqueRoots.includes(normalized)) {
|
|
34696
34742
|
uniqueRoots.push(normalized);
|
|
34697
34743
|
}
|
|
34698
34744
|
};
|
|
34699
|
-
let currentDir =
|
|
34745
|
+
let currentDir = path13.dirname(evalPath);
|
|
34700
34746
|
let reachedBoundary = false;
|
|
34701
34747
|
while (!reachedBoundary) {
|
|
34702
34748
|
addRoot(currentDir);
|
|
34703
|
-
const parentDir =
|
|
34749
|
+
const parentDir = path13.dirname(currentDir);
|
|
34704
34750
|
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
34705
34751
|
reachedBoundary = true;
|
|
34706
34752
|
} else {
|
|
@@ -34718,16 +34764,16 @@ function trimLeadingSeparators2(value) {
|
|
|
34718
34764
|
async function resolveFileReference2(rawValue, searchRoots) {
|
|
34719
34765
|
const displayPath = trimLeadingSeparators2(rawValue);
|
|
34720
34766
|
const potentialPaths = [];
|
|
34721
|
-
if (
|
|
34722
|
-
potentialPaths.push(
|
|
34767
|
+
if (path13.isAbsolute(rawValue)) {
|
|
34768
|
+
potentialPaths.push(path13.normalize(rawValue));
|
|
34723
34769
|
}
|
|
34724
34770
|
for (const base of searchRoots) {
|
|
34725
|
-
potentialPaths.push(
|
|
34771
|
+
potentialPaths.push(path13.resolve(base, displayPath));
|
|
34726
34772
|
}
|
|
34727
34773
|
const attempted = [];
|
|
34728
34774
|
const seen = /* @__PURE__ */ new Set();
|
|
34729
34775
|
for (const candidate of potentialPaths) {
|
|
34730
|
-
const absoluteCandidate =
|
|
34776
|
+
const absoluteCandidate = path13.resolve(candidate);
|
|
34731
34777
|
if (seen.has(absoluteCandidate)) {
|
|
34732
34778
|
continue;
|
|
34733
34779
|
}
|
|
@@ -34885,6 +34931,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34885
34931
|
logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing script`);
|
|
34886
34932
|
continue;
|
|
34887
34933
|
}
|
|
34934
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
34888
34935
|
const cwd = asString2(rawEvaluator.cwd);
|
|
34889
34936
|
let resolvedCwd;
|
|
34890
34937
|
if (cwd) {
|
|
@@ -34905,7 +34952,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34905
34952
|
type: "code",
|
|
34906
34953
|
script,
|
|
34907
34954
|
cwd,
|
|
34908
|
-
resolvedCwd
|
|
34955
|
+
resolvedCwd,
|
|
34956
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
34909
34957
|
});
|
|
34910
34958
|
continue;
|
|
34911
34959
|
}
|
|
@@ -35000,14 +35048,89 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35000
35048
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
35001
35049
|
};
|
|
35002
35050
|
}
|
|
35051
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35003
35052
|
evaluators.push({
|
|
35004
35053
|
name: name16,
|
|
35005
35054
|
type: "composite",
|
|
35006
35055
|
evaluators: memberEvaluators,
|
|
35007
|
-
aggregator
|
|
35056
|
+
aggregator,
|
|
35057
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35058
|
+
});
|
|
35059
|
+
continue;
|
|
35060
|
+
}
|
|
35061
|
+
if (typeValue === "expected_messages") {
|
|
35062
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35063
|
+
evaluators.push({
|
|
35064
|
+
name: name16,
|
|
35065
|
+
type: "expected_messages",
|
|
35066
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35008
35067
|
});
|
|
35009
35068
|
continue;
|
|
35010
35069
|
}
|
|
35070
|
+
if (typeValue === "tool_trajectory") {
|
|
35071
|
+
const mode = asString2(rawEvaluator.mode);
|
|
35072
|
+
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
35073
|
+
logWarning2(
|
|
35074
|
+
`Skipping tool_trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
|
|
35075
|
+
);
|
|
35076
|
+
continue;
|
|
35077
|
+
}
|
|
35078
|
+
const rawMinimums = rawEvaluator.minimums;
|
|
35079
|
+
let minimums;
|
|
35080
|
+
if (rawMinimums !== void 0) {
|
|
35081
|
+
if (!isJsonObject2(rawMinimums)) {
|
|
35082
|
+
logWarning2(
|
|
35083
|
+
`Skipping tool_trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
|
|
35084
|
+
);
|
|
35085
|
+
continue;
|
|
35086
|
+
}
|
|
35087
|
+
minimums = {};
|
|
35088
|
+
for (const [toolName, count] of Object.entries(rawMinimums)) {
|
|
35089
|
+
if (typeof count === "number" && count >= 0) {
|
|
35090
|
+
minimums[toolName] = count;
|
|
35091
|
+
}
|
|
35092
|
+
}
|
|
35093
|
+
}
|
|
35094
|
+
const rawExpected = rawEvaluator.expected;
|
|
35095
|
+
let expected;
|
|
35096
|
+
if (rawExpected !== void 0) {
|
|
35097
|
+
if (!Array.isArray(rawExpected)) {
|
|
35098
|
+
logWarning2(
|
|
35099
|
+
`Skipping tool_trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
|
|
35100
|
+
);
|
|
35101
|
+
continue;
|
|
35102
|
+
}
|
|
35103
|
+
expected = [];
|
|
35104
|
+
for (const item of rawExpected) {
|
|
35105
|
+
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
35106
|
+
expected.push({ tool: item.tool });
|
|
35107
|
+
}
|
|
35108
|
+
}
|
|
35109
|
+
}
|
|
35110
|
+
if (mode === "any_order" && !minimums) {
|
|
35111
|
+
logWarning2(
|
|
35112
|
+
`Skipping tool_trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
|
|
35113
|
+
);
|
|
35114
|
+
continue;
|
|
35115
|
+
}
|
|
35116
|
+
if ((mode === "in_order" || mode === "exact") && !expected) {
|
|
35117
|
+
logWarning2(
|
|
35118
|
+
`Skipping tool_trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
|
|
35119
|
+
);
|
|
35120
|
+
continue;
|
|
35121
|
+
}
|
|
35122
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35123
|
+
const config2 = {
|
|
35124
|
+
name: name16,
|
|
35125
|
+
type: "tool_trajectory",
|
|
35126
|
+
mode,
|
|
35127
|
+
...minimums ? { minimums } : {},
|
|
35128
|
+
...expected ? { expected } : {},
|
|
35129
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35130
|
+
};
|
|
35131
|
+
evaluators.push(config2);
|
|
35132
|
+
continue;
|
|
35133
|
+
}
|
|
35011
35134
|
const prompt = asString2(rawEvaluator.prompt);
|
|
35012
35135
|
let promptPath;
|
|
35013
35136
|
if (prompt) {
|
|
@@ -35044,19 +35167,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35044
35167
|
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
|
|
35045
35168
|
continue;
|
|
35046
35169
|
}
|
|
35170
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35047
35171
|
evaluators.push({
|
|
35048
35172
|
name: name16,
|
|
35049
35173
|
type: "llm_judge",
|
|
35050
|
-
rubrics: parsedRubrics
|
|
35174
|
+
rubrics: parsedRubrics,
|
|
35175
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35051
35176
|
});
|
|
35052
35177
|
continue;
|
|
35053
35178
|
}
|
|
35179
|
+
const weight = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35054
35180
|
evaluators.push({
|
|
35055
35181
|
name: name16,
|
|
35056
35182
|
type: "llm_judge",
|
|
35057
35183
|
prompt,
|
|
35058
35184
|
promptPath,
|
|
35059
|
-
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
35185
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
35186
|
+
...weight !== void 0 ? { weight } : {}
|
|
35060
35187
|
});
|
|
35061
35188
|
}
|
|
35062
35189
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -35086,6 +35213,27 @@ ${detailBlock}${ANSI_RESET3}`);
|
|
|
35086
35213
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
35087
35214
|
}
|
|
35088
35215
|
}
|
|
35216
|
+
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
35217
|
+
if (rawWeight === void 0) {
|
|
35218
|
+
return void 0;
|
|
35219
|
+
}
|
|
35220
|
+
if (typeof rawWeight !== "number") {
|
|
35221
|
+
throw new Error(
|
|
35222
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
|
|
35223
|
+
);
|
|
35224
|
+
}
|
|
35225
|
+
if (!Number.isFinite(rawWeight)) {
|
|
35226
|
+
throw new Error(
|
|
35227
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
|
|
35228
|
+
);
|
|
35229
|
+
}
|
|
35230
|
+
if (rawWeight < 0) {
|
|
35231
|
+
throw new Error(
|
|
35232
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
|
|
35233
|
+
);
|
|
35234
|
+
}
|
|
35235
|
+
return rawWeight;
|
|
35236
|
+
}
|
|
35089
35237
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
35090
35238
|
var ANSI_RESET4 = "\x1B[0m";
|
|
35091
35239
|
async function processMessages(options) {
|
|
@@ -35257,6 +35405,67 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
35257
35405
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
35258
35406
|
}
|
|
35259
35407
|
}
|
|
35408
|
+
async function processExpectedMessages(options) {
|
|
35409
|
+
const { messages, searchRoots, repoRootPath, verbose } = options;
|
|
35410
|
+
const segments = [];
|
|
35411
|
+
for (const message of messages) {
|
|
35412
|
+
const segment = {
|
|
35413
|
+
role: message.role
|
|
35414
|
+
};
|
|
35415
|
+
if (message.role === "assistant" && message.tool_calls !== void 0) {
|
|
35416
|
+
segment.tool_calls = message.tool_calls;
|
|
35417
|
+
}
|
|
35418
|
+
const content = message.content;
|
|
35419
|
+
if (typeof content === "string") {
|
|
35420
|
+
segment.content = content;
|
|
35421
|
+
} else if (Array.isArray(content)) {
|
|
35422
|
+
const processedContent = [];
|
|
35423
|
+
for (const rawSegment of content) {
|
|
35424
|
+
if (!isJsonObject(rawSegment)) {
|
|
35425
|
+
continue;
|
|
35426
|
+
}
|
|
35427
|
+
const segmentType = asString3(rawSegment.type);
|
|
35428
|
+
if (segmentType === "file") {
|
|
35429
|
+
const rawValue = asString3(rawSegment.value);
|
|
35430
|
+
if (!rawValue) {
|
|
35431
|
+
continue;
|
|
35432
|
+
}
|
|
35433
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
35434
|
+
rawValue,
|
|
35435
|
+
searchRoots
|
|
35436
|
+
);
|
|
35437
|
+
if (!resolvedPath) {
|
|
35438
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
35439
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
35440
|
+
continue;
|
|
35441
|
+
}
|
|
35442
|
+
try {
|
|
35443
|
+
const fileContent = (await readFile32(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
35444
|
+
processedContent.push({
|
|
35445
|
+
type: "file",
|
|
35446
|
+
path: displayPath,
|
|
35447
|
+
text: fileContent,
|
|
35448
|
+
resolvedPath: path42.resolve(resolvedPath)
|
|
35449
|
+
});
|
|
35450
|
+
if (verbose) {
|
|
35451
|
+
console.log(` [Expected Output File] Found: ${displayPath}`);
|
|
35452
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
35453
|
+
}
|
|
35454
|
+
} catch (error40) {
|
|
35455
|
+
logWarning3(
|
|
35456
|
+
`Could not read expected output file ${resolvedPath}: ${error40.message}`
|
|
35457
|
+
);
|
|
35458
|
+
}
|
|
35459
|
+
continue;
|
|
35460
|
+
}
|
|
35461
|
+
processedContent.push(cloneJsonObject(rawSegment));
|
|
35462
|
+
}
|
|
35463
|
+
segment.content = processedContent;
|
|
35464
|
+
}
|
|
35465
|
+
segments.push(segment);
|
|
35466
|
+
}
|
|
35467
|
+
return segments;
|
|
35468
|
+
}
|
|
35260
35469
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
35261
35470
|
var ANSI_RESET5 = "\x1B[0m";
|
|
35262
35471
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
@@ -35555,12 +35764,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35555
35764
|
messageType: "input",
|
|
35556
35765
|
verbose
|
|
35557
35766
|
});
|
|
35558
|
-
const outputSegments = hasExpectedMessages ? await
|
|
35767
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
35559
35768
|
messages: expectedMessages,
|
|
35560
35769
|
searchRoots,
|
|
35561
35770
|
repoRootPath,
|
|
35562
|
-
guidelinePatterns,
|
|
35563
|
-
messageType: "output",
|
|
35564
35771
|
verbose
|
|
35565
35772
|
}) : [];
|
|
35566
35773
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
@@ -36071,9 +36278,11 @@ var CliProvider = class {
|
|
|
36071
36278
|
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
36072
36279
|
throw new Error(message);
|
|
36073
36280
|
}
|
|
36074
|
-
const
|
|
36281
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
36282
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
36075
36283
|
return {
|
|
36076
|
-
text:
|
|
36284
|
+
text: parsed.text,
|
|
36285
|
+
trace: parsed.trace,
|
|
36077
36286
|
raw: {
|
|
36078
36287
|
command: renderedCommand,
|
|
36079
36288
|
stderr: result.stderr,
|
|
@@ -36083,6 +36292,31 @@ var CliProvider = class {
|
|
|
36083
36292
|
}
|
|
36084
36293
|
};
|
|
36085
36294
|
}
|
|
36295
|
+
/**
|
|
36296
|
+
* Parse output content from CLI.
|
|
36297
|
+
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
36298
|
+
* Otherwise, treat the entire content as plain text.
|
|
36299
|
+
*/
|
|
36300
|
+
parseOutputContent(content) {
|
|
36301
|
+
try {
|
|
36302
|
+
const parsed = JSON.parse(content);
|
|
36303
|
+
if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
|
|
36304
|
+
const obj = parsed;
|
|
36305
|
+
const text2 = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
36306
|
+
const trace2 = this.parseTrace(obj.trace);
|
|
36307
|
+
return { text: text2, trace: trace2 };
|
|
36308
|
+
}
|
|
36309
|
+
} catch {
|
|
36310
|
+
}
|
|
36311
|
+
return { text: content };
|
|
36312
|
+
}
|
|
36313
|
+
parseTrace(trace2) {
|
|
36314
|
+
if (!Array.isArray(trace2)) {
|
|
36315
|
+
return void 0;
|
|
36316
|
+
}
|
|
36317
|
+
const validEvents = trace2.filter(isTraceEvent);
|
|
36318
|
+
return validEvents.length > 0 ? validEvents : void 0;
|
|
36319
|
+
}
|
|
36086
36320
|
async readAndCleanupOutputFile(filePath) {
|
|
36087
36321
|
try {
|
|
36088
36322
|
const content = await readTextFile(filePath);
|
|
@@ -37044,6 +37278,7 @@ var MockProvider = class {
|
|
|
37044
37278
|
delayMs;
|
|
37045
37279
|
delayMinMs;
|
|
37046
37280
|
delayMaxMs;
|
|
37281
|
+
trace;
|
|
37047
37282
|
constructor(targetName, config2) {
|
|
37048
37283
|
this.id = `mock:${targetName}`;
|
|
37049
37284
|
this.targetName = targetName;
|
|
@@ -37051,6 +37286,7 @@ var MockProvider = class {
|
|
|
37051
37286
|
this.delayMs = config2.delayMs ?? 0;
|
|
37052
37287
|
this.delayMinMs = config2.delayMinMs ?? 0;
|
|
37053
37288
|
this.delayMaxMs = config2.delayMaxMs ?? 0;
|
|
37289
|
+
this.trace = config2.trace;
|
|
37054
37290
|
}
|
|
37055
37291
|
async invoke(request) {
|
|
37056
37292
|
const delay2 = this.calculateDelay();
|
|
@@ -37062,7 +37298,8 @@ var MockProvider = class {
|
|
|
37062
37298
|
raw: {
|
|
37063
37299
|
question: request.question,
|
|
37064
37300
|
guidelines: request.guidelines
|
|
37065
|
-
}
|
|
37301
|
+
},
|
|
37302
|
+
trace: this.trace
|
|
37066
37303
|
};
|
|
37067
37304
|
}
|
|
37068
37305
|
calculateDelay() {
|
|
@@ -37705,9 +37942,11 @@ var CodeEvaluator = class {
|
|
|
37705
37942
|
expected_outcome: context.evalCase.expected_outcome,
|
|
37706
37943
|
reference_answer: context.evalCase.reference_answer,
|
|
37707
37944
|
candidate_answer: context.candidate,
|
|
37708
|
-
|
|
37709
|
-
input_files: context.evalCase.file_paths
|
|
37710
|
-
|
|
37945
|
+
guideline_files: context.evalCase.guideline_paths,
|
|
37946
|
+
input_files: context.evalCase.file_paths.filter(
|
|
37947
|
+
(path132) => !context.evalCase.guideline_paths.includes(path132)
|
|
37948
|
+
),
|
|
37949
|
+
input_messages: context.evalCase.input_messages
|
|
37711
37950
|
},
|
|
37712
37951
|
null,
|
|
37713
37952
|
2
|
|
@@ -37827,6 +38066,251 @@ function substituteVariables(template, variables) {
|
|
|
37827
38066
|
return variables[varName] ?? match;
|
|
37828
38067
|
});
|
|
37829
38068
|
}
|
|
38069
|
+
var ToolTrajectoryEvaluator = class {
|
|
38070
|
+
kind = "tool_trajectory";
|
|
38071
|
+
config;
|
|
38072
|
+
constructor(options) {
|
|
38073
|
+
this.config = options.config;
|
|
38074
|
+
}
|
|
38075
|
+
evaluate(context) {
|
|
38076
|
+
const { candidateTrace, candidateTraceSummary } = context;
|
|
38077
|
+
if (!candidateTrace || !candidateTraceSummary) {
|
|
38078
|
+
return {
|
|
38079
|
+
score: 0,
|
|
38080
|
+
verdict: "fail",
|
|
38081
|
+
hits: [],
|
|
38082
|
+
misses: ["No trace available for evaluation"],
|
|
38083
|
+
expectedAspectCount: 1
|
|
38084
|
+
};
|
|
38085
|
+
}
|
|
38086
|
+
switch (this.config.mode) {
|
|
38087
|
+
case "any_order":
|
|
38088
|
+
return this.evaluateAnyOrder(candidateTraceSummary);
|
|
38089
|
+
case "in_order":
|
|
38090
|
+
return this.evaluateInOrder(candidateTrace);
|
|
38091
|
+
case "exact":
|
|
38092
|
+
return this.evaluateExact(candidateTrace);
|
|
38093
|
+
default:
|
|
38094
|
+
return {
|
|
38095
|
+
score: 0,
|
|
38096
|
+
verdict: "fail",
|
|
38097
|
+
hits: [],
|
|
38098
|
+
misses: [`Unknown mode: ${this.config.mode}`],
|
|
38099
|
+
expectedAspectCount: 1
|
|
38100
|
+
};
|
|
38101
|
+
}
|
|
38102
|
+
}
|
|
38103
|
+
evaluateAnyOrder(summary) {
|
|
38104
|
+
const minimums = this.config.minimums ?? {};
|
|
38105
|
+
const toolNames = Object.keys(minimums);
|
|
38106
|
+
if (toolNames.length === 0) {
|
|
38107
|
+
return {
|
|
38108
|
+
score: 1,
|
|
38109
|
+
verdict: "pass",
|
|
38110
|
+
hits: ["No tool requirements specified"],
|
|
38111
|
+
misses: [],
|
|
38112
|
+
expectedAspectCount: 0
|
|
38113
|
+
};
|
|
38114
|
+
}
|
|
38115
|
+
const hits = [];
|
|
38116
|
+
const misses = [];
|
|
38117
|
+
for (const toolName of toolNames) {
|
|
38118
|
+
const required2 = minimums[toolName];
|
|
38119
|
+
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
38120
|
+
if (actual >= required2) {
|
|
38121
|
+
hits.push(`${toolName}: called ${actual} times (required \u2265${required2})`);
|
|
38122
|
+
} else {
|
|
38123
|
+
misses.push(`${toolName}: called ${actual} times (required \u2265${required2})`);
|
|
38124
|
+
}
|
|
38125
|
+
}
|
|
38126
|
+
const score = hits.length / toolNames.length;
|
|
38127
|
+
return {
|
|
38128
|
+
score,
|
|
38129
|
+
verdict: scoreToVerdict(score),
|
|
38130
|
+
hits,
|
|
38131
|
+
misses,
|
|
38132
|
+
expectedAspectCount: toolNames.length
|
|
38133
|
+
};
|
|
38134
|
+
}
|
|
38135
|
+
evaluateInOrder(trace2) {
|
|
38136
|
+
const expected = this.config.expected ?? [];
|
|
38137
|
+
if (expected.length === 0) {
|
|
38138
|
+
return {
|
|
38139
|
+
score: 1,
|
|
38140
|
+
verdict: "pass",
|
|
38141
|
+
hits: ["No tool sequence specified"],
|
|
38142
|
+
misses: [],
|
|
38143
|
+
expectedAspectCount: 0
|
|
38144
|
+
};
|
|
38145
|
+
}
|
|
38146
|
+
const actualToolCalls = trace2.filter((e) => e.type === "tool_call" && e.name);
|
|
38147
|
+
const hits = [];
|
|
38148
|
+
const misses = [];
|
|
38149
|
+
let actualIndex = 0;
|
|
38150
|
+
for (let i = 0; i < expected.length; i++) {
|
|
38151
|
+
const expectedTool = expected[i].tool;
|
|
38152
|
+
let found = false;
|
|
38153
|
+
while (actualIndex < actualToolCalls.length) {
|
|
38154
|
+
if (actualToolCalls[actualIndex].name === expectedTool) {
|
|
38155
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
38156
|
+
actualIndex++;
|
|
38157
|
+
found = true;
|
|
38158
|
+
break;
|
|
38159
|
+
}
|
|
38160
|
+
actualIndex++;
|
|
38161
|
+
}
|
|
38162
|
+
if (!found) {
|
|
38163
|
+
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
38164
|
+
}
|
|
38165
|
+
}
|
|
38166
|
+
const score = hits.length / expected.length;
|
|
38167
|
+
return {
|
|
38168
|
+
score,
|
|
38169
|
+
verdict: scoreToVerdict(score),
|
|
38170
|
+
hits,
|
|
38171
|
+
misses,
|
|
38172
|
+
expectedAspectCount: expected.length
|
|
38173
|
+
};
|
|
38174
|
+
}
|
|
38175
|
+
evaluateExact(trace2) {
|
|
38176
|
+
const expected = this.config.expected ?? [];
|
|
38177
|
+
if (expected.length === 0) {
|
|
38178
|
+
return {
|
|
38179
|
+
score: 1,
|
|
38180
|
+
verdict: "pass",
|
|
38181
|
+
hits: ["No tool sequence specified"],
|
|
38182
|
+
misses: [],
|
|
38183
|
+
expectedAspectCount: 0
|
|
38184
|
+
};
|
|
38185
|
+
}
|
|
38186
|
+
const actualToolCalls = trace2.filter((e) => e.type === "tool_call" && e.name);
|
|
38187
|
+
const hits = [];
|
|
38188
|
+
const misses = [];
|
|
38189
|
+
if (actualToolCalls.length !== expected.length) {
|
|
38190
|
+
misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
|
|
38191
|
+
}
|
|
38192
|
+
const checkLength = Math.min(expected.length, actualToolCalls.length);
|
|
38193
|
+
for (let i = 0; i < checkLength; i++) {
|
|
38194
|
+
const expectedTool = expected[i].tool;
|
|
38195
|
+
const actualTool = actualToolCalls[i].name;
|
|
38196
|
+
if (actualTool === expectedTool) {
|
|
38197
|
+
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
38198
|
+
} else {
|
|
38199
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
38200
|
+
}
|
|
38201
|
+
}
|
|
38202
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
38203
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
38204
|
+
}
|
|
38205
|
+
const score = hits.length / expected.length;
|
|
38206
|
+
return {
|
|
38207
|
+
score,
|
|
38208
|
+
verdict: scoreToVerdict(score),
|
|
38209
|
+
hits,
|
|
38210
|
+
misses,
|
|
38211
|
+
expectedAspectCount: expected.length
|
|
38212
|
+
};
|
|
38213
|
+
}
|
|
38214
|
+
};
|
|
38215
|
+
/**
 * Evaluator that checks the tool calls recorded in a candidate trace against
 * the `tool_calls` declared on assistant messages of the eval case's
 * `expected_segments`.
 *
 * Matching is positional: the i-th expected call is compared with the i-th
 * `tool_call` event of the trace. Additional calls in the trace beyond the
 * expected ones are not penalized.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";

  /**
   * Score a candidate trace against the expected tool calls.
   *
   * @param {object} context - Evaluation context; reads `candidateTrace` and `evalCase.expected_segments`.
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    // Nothing to verify: treat as a trivial pass with a single aspect.
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    // Expectations exist but there is no trace to compare against.
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }

  /**
   * Collect `{ tool, input }` pairs from every assistant segment that carries
   * a `tool_calls` array. Entries without a string `tool` field are ignored.
   *
   * @param {Array<object>|undefined|null} segments - Expected message segments.
   * @returns {Array<{tool: string, input: unknown}>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            toolCalls.push({ tool: tc.tool, input: tc.input });
          }
        }
      }
    }
    return toolCalls;
  }

  /**
   * Compare expected and actual tool calls position by position.
   * A position is a hit when the tool name matches and, if the expectation
   * declares an `input`, that input is deeply equal to the actual input.
   *
   * @param {Array<{tool: string, input: unknown}>} expected
   * @param {Array<{name?: string, input?: unknown}>} actual
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only checked when the expectation specifies one.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero if called with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }

  /**
   * Structural equality for JSON-like values (primitives, arrays, plain objects).
   * Object key order is irrelevant; both objects must own exactly the same keys.
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean}
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aKeys = Object.keys(a);
    const bKeys = Object.keys(b);
    if (aKeys.length !== bKeys.length) return false;
    // Require the key to exist on b as well: without this check
    // { x: undefined } and { y: undefined } would compare as equal.
    return aKeys.every(
      (key2) => Object.prototype.hasOwnProperty.call(b, key2) && this.deepEquals(a[key2], b[key2])
    );
  }
};
|
|
37830
38314
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
37831
38315
|
{{EVALUATOR_RESULTS_JSON}}
|
|
37832
38316
|
|
|
@@ -38239,7 +38723,7 @@ async function runEvaluation(options) {
|
|
|
38239
38723
|
if (!definition) {
|
|
38240
38724
|
return void 0;
|
|
38241
38725
|
}
|
|
38242
|
-
const resolved = resolveTargetDefinition(definition, envLookup);
|
|
38726
|
+
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
38243
38727
|
resolvedTargetsByName.set(name16, resolved);
|
|
38244
38728
|
return resolved;
|
|
38245
38729
|
};
|
|
@@ -38553,6 +39037,17 @@ async function runEvalCase(options) {
|
|
|
38553
39037
|
if (cacheKey && cache && !cachedResponse) {
|
|
38554
39038
|
await cache.set(cacheKey, providerResponse);
|
|
38555
39039
|
}
|
|
39040
|
+
let candidateTrace = providerResponse.trace;
|
|
39041
|
+
if (!candidateTrace && providerResponse.traceRef) {
|
|
39042
|
+
try {
|
|
39043
|
+
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
39044
|
+
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
39045
|
+
candidateTrace = rawTrace;
|
|
39046
|
+
}
|
|
39047
|
+
} catch {
|
|
39048
|
+
}
|
|
39049
|
+
}
|
|
39050
|
+
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
38556
39051
|
try {
|
|
38557
39052
|
return await evaluateCandidate({
|
|
38558
39053
|
evalCase,
|
|
@@ -38564,7 +39059,9 @@ async function runEvalCase(options) {
|
|
|
38564
39059
|
nowFn,
|
|
38565
39060
|
attempt,
|
|
38566
39061
|
judgeProvider,
|
|
38567
|
-
agentTimeoutMs
|
|
39062
|
+
agentTimeoutMs,
|
|
39063
|
+
candidateTrace,
|
|
39064
|
+
candidateTraceSummary
|
|
38568
39065
|
});
|
|
38569
39066
|
} catch (error40) {
|
|
38570
39067
|
return buildErrorResult(evalCase, target.name, nowFn(), error40, promptInputs, provider);
|
|
@@ -38581,7 +39078,9 @@ async function evaluateCandidate(options) {
|
|
|
38581
39078
|
nowFn,
|
|
38582
39079
|
attempt,
|
|
38583
39080
|
judgeProvider,
|
|
38584
|
-
agentTimeoutMs
|
|
39081
|
+
agentTimeoutMs,
|
|
39082
|
+
candidateTrace,
|
|
39083
|
+
candidateTraceSummary
|
|
38585
39084
|
} = options;
|
|
38586
39085
|
const gradeTimestamp = nowFn();
|
|
38587
39086
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -38594,7 +39093,9 @@ async function evaluateCandidate(options) {
|
|
|
38594
39093
|
promptInputs,
|
|
38595
39094
|
now: gradeTimestamp,
|
|
38596
39095
|
judgeProvider,
|
|
38597
|
-
agentTimeoutMs
|
|
39096
|
+
agentTimeoutMs,
|
|
39097
|
+
candidateTrace,
|
|
39098
|
+
candidateTraceSummary
|
|
38598
39099
|
});
|
|
38599
39100
|
const completedAt = nowFn();
|
|
38600
39101
|
let agentProviderRequest;
|
|
@@ -38607,14 +39108,12 @@ async function evaluateCandidate(options) {
|
|
|
38607
39108
|
} else {
|
|
38608
39109
|
if (promptInputs.chatPrompt) {
|
|
38609
39110
|
lmProviderRequest = {
|
|
38610
|
-
chat_prompt: promptInputs.chatPrompt
|
|
38611
|
-
guideline_paths: evalCase.guideline_paths
|
|
39111
|
+
chat_prompt: promptInputs.chatPrompt
|
|
38612
39112
|
};
|
|
38613
39113
|
} else {
|
|
38614
39114
|
lmProviderRequest = {
|
|
38615
39115
|
question: promptInputs.question,
|
|
38616
|
-
guidelines: promptInputs.guidelines
|
|
38617
|
-
guideline_paths: evalCase.guideline_paths
|
|
39116
|
+
guidelines: promptInputs.guidelines
|
|
38618
39117
|
};
|
|
38619
39118
|
}
|
|
38620
39119
|
}
|
|
@@ -38633,7 +39132,8 @@ async function evaluateCandidate(options) {
|
|
|
38633
39132
|
agent_provider_request: agentProviderRequest,
|
|
38634
39133
|
lm_provider_request: lmProviderRequest,
|
|
38635
39134
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
38636
|
-
evaluator_results: evaluatorResults
|
|
39135
|
+
evaluator_results: evaluatorResults,
|
|
39136
|
+
trace_summary: candidateTraceSummary
|
|
38637
39137
|
};
|
|
38638
39138
|
}
|
|
38639
39139
|
async function runEvaluatorsForCase(options) {
|
|
@@ -38647,7 +39147,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
38647
39147
|
promptInputs,
|
|
38648
39148
|
now,
|
|
38649
39149
|
judgeProvider,
|
|
38650
|
-
agentTimeoutMs
|
|
39150
|
+
agentTimeoutMs,
|
|
39151
|
+
candidateTrace,
|
|
39152
|
+
candidateTraceSummary
|
|
38651
39153
|
} = options;
|
|
38652
39154
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
38653
39155
|
return runEvaluatorList({
|
|
@@ -38661,7 +39163,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
38661
39163
|
promptInputs,
|
|
38662
39164
|
now,
|
|
38663
39165
|
judgeProvider,
|
|
38664
|
-
agentTimeoutMs
|
|
39166
|
+
agentTimeoutMs,
|
|
39167
|
+
candidateTrace,
|
|
39168
|
+
candidateTraceSummary
|
|
38665
39169
|
});
|
|
38666
39170
|
}
|
|
38667
39171
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -38677,7 +39181,9 @@ async function runEvaluatorsForCase(options) {
|
|
|
38677
39181
|
attempt,
|
|
38678
39182
|
promptInputs,
|
|
38679
39183
|
now,
|
|
38680
|
-
judgeProvider
|
|
39184
|
+
judgeProvider,
|
|
39185
|
+
candidateTrace,
|
|
39186
|
+
candidateTraceSummary
|
|
38681
39187
|
});
|
|
38682
39188
|
return { score };
|
|
38683
39189
|
}
|
|
@@ -38693,7 +39199,9 @@ async function runEvaluatorList(options) {
|
|
|
38693
39199
|
promptInputs,
|
|
38694
39200
|
now,
|
|
38695
39201
|
judgeProvider,
|
|
38696
|
-
agentTimeoutMs
|
|
39202
|
+
agentTimeoutMs,
|
|
39203
|
+
candidateTrace,
|
|
39204
|
+
candidateTraceSummary
|
|
38697
39205
|
} = options;
|
|
38698
39206
|
const scored = [];
|
|
38699
39207
|
const evaluatorResults = [];
|
|
@@ -38712,11 +39220,13 @@ async function runEvaluatorList(options) {
|
|
|
38712
39220
|
now,
|
|
38713
39221
|
judgeProvider
|
|
38714
39222
|
});
|
|
38715
|
-
|
|
39223
|
+
const weight = evaluator.weight ?? 1;
|
|
39224
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
38716
39225
|
evaluatorResults.push({
|
|
38717
39226
|
name: evaluator.name,
|
|
38718
39227
|
type: evaluator.type,
|
|
38719
39228
|
score: score2.score,
|
|
39229
|
+
weight,
|
|
38720
39230
|
verdict: score2.verdict,
|
|
38721
39231
|
hits: score2.hits,
|
|
38722
39232
|
misses: score2.misses,
|
|
@@ -38739,11 +39249,13 @@ async function runEvaluatorList(options) {
|
|
|
38739
39249
|
promptInputs,
|
|
38740
39250
|
now
|
|
38741
39251
|
});
|
|
38742
|
-
|
|
39252
|
+
const weight = evaluator.weight ?? 1;
|
|
39253
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
38743
39254
|
evaluatorResults.push({
|
|
38744
39255
|
name: evaluator.name,
|
|
38745
39256
|
type: "code_judge",
|
|
38746
39257
|
score: score2.score,
|
|
39258
|
+
weight,
|
|
38747
39259
|
verdict: score2.verdict,
|
|
38748
39260
|
hits: score2.hits,
|
|
38749
39261
|
misses: score2.misses,
|
|
@@ -38769,6 +39281,12 @@ async function runEvaluatorList(options) {
|
|
|
38769
39281
|
cwd: evalFileDir,
|
|
38770
39282
|
evaluatorFactory: { create: createEvaluator }
|
|
38771
39283
|
});
|
|
39284
|
+
case "tool_trajectory":
|
|
39285
|
+
return new ToolTrajectoryEvaluator({
|
|
39286
|
+
config: memberConfig
|
|
39287
|
+
});
|
|
39288
|
+
case "expected_messages":
|
|
39289
|
+
return new ExpectedMessagesEvaluator();
|
|
38772
39290
|
default: {
|
|
38773
39291
|
const unknownConfig = memberConfig;
|
|
38774
39292
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -38790,11 +39308,13 @@ async function runEvaluatorList(options) {
|
|
|
38790
39308
|
now,
|
|
38791
39309
|
judgeProvider
|
|
38792
39310
|
});
|
|
38793
|
-
|
|
39311
|
+
const weight = evaluator.weight ?? 1;
|
|
39312
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
38794
39313
|
evaluatorResults.push({
|
|
38795
39314
|
name: evaluator.name,
|
|
38796
39315
|
type: evaluator.type,
|
|
38797
39316
|
score: score2.score,
|
|
39317
|
+
weight,
|
|
38798
39318
|
verdict: score2.verdict,
|
|
38799
39319
|
hits: score2.hits,
|
|
38800
39320
|
misses: score2.misses,
|
|
@@ -38803,6 +39323,60 @@ async function runEvaluatorList(options) {
|
|
|
38803
39323
|
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
38804
39324
|
});
|
|
38805
39325
|
}
|
|
39326
|
+
if (evaluator.type === "tool_trajectory") {
|
|
39327
|
+
const trajectoryEvaluator = new ToolTrajectoryEvaluator({
|
|
39328
|
+
config: evaluator
|
|
39329
|
+
});
|
|
39330
|
+
const score2 = trajectoryEvaluator.evaluate({
|
|
39331
|
+
evalCase,
|
|
39332
|
+
candidate,
|
|
39333
|
+
target,
|
|
39334
|
+
provider,
|
|
39335
|
+
attempt,
|
|
39336
|
+
promptInputs,
|
|
39337
|
+
now,
|
|
39338
|
+
candidateTrace,
|
|
39339
|
+
candidateTraceSummary
|
|
39340
|
+
});
|
|
39341
|
+
const weight = evaluator.weight ?? 1;
|
|
39342
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39343
|
+
evaluatorResults.push({
|
|
39344
|
+
name: evaluator.name,
|
|
39345
|
+
type: evaluator.type,
|
|
39346
|
+
score: score2.score,
|
|
39347
|
+
weight,
|
|
39348
|
+
verdict: score2.verdict,
|
|
39349
|
+
hits: score2.hits,
|
|
39350
|
+
misses: score2.misses,
|
|
39351
|
+
reasoning: score2.reasoning
|
|
39352
|
+
});
|
|
39353
|
+
}
|
|
39354
|
+
if (evaluator.type === "expected_messages") {
|
|
39355
|
+
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
39356
|
+
const score2 = expectedMessagesEvaluator.evaluate({
|
|
39357
|
+
evalCase,
|
|
39358
|
+
candidate,
|
|
39359
|
+
target,
|
|
39360
|
+
provider,
|
|
39361
|
+
attempt,
|
|
39362
|
+
promptInputs,
|
|
39363
|
+
now,
|
|
39364
|
+
candidateTrace,
|
|
39365
|
+
candidateTraceSummary
|
|
39366
|
+
});
|
|
39367
|
+
const weight = evaluator.weight ?? 1;
|
|
39368
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39369
|
+
evaluatorResults.push({
|
|
39370
|
+
name: evaluator.name,
|
|
39371
|
+
type: evaluator.type,
|
|
39372
|
+
score: score2.score,
|
|
39373
|
+
weight,
|
|
39374
|
+
verdict: score2.verdict,
|
|
39375
|
+
hits: score2.hits,
|
|
39376
|
+
misses: score2.misses,
|
|
39377
|
+
reasoning: score2.reasoning
|
|
39378
|
+
});
|
|
39379
|
+
}
|
|
38806
39380
|
} catch (error40) {
|
|
38807
39381
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
38808
39382
|
const fallbackScore = {
|
|
@@ -38814,15 +39388,18 @@ async function runEvaluatorList(options) {
|
|
|
38814
39388
|
reasoning: message
|
|
38815
39389
|
};
|
|
38816
39390
|
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
39391
|
+
const weight = evaluator.weight ?? 1;
|
|
38817
39392
|
scored.push({
|
|
38818
39393
|
score: fallbackScore,
|
|
38819
39394
|
name: evaluator.name ?? "unknown",
|
|
38820
|
-
type: resultType ?? "llm_judge"
|
|
39395
|
+
type: resultType ?? "llm_judge",
|
|
39396
|
+
weight
|
|
38821
39397
|
});
|
|
38822
39398
|
evaluatorResults.push({
|
|
38823
39399
|
name: evaluator.name ?? "unknown",
|
|
38824
39400
|
type: resultType ?? "llm_judge",
|
|
38825
39401
|
score: 0,
|
|
39402
|
+
weight,
|
|
38826
39403
|
verdict: "fail",
|
|
38827
39404
|
hits: [],
|
|
38828
39405
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
@@ -38830,7 +39407,9 @@ async function runEvaluatorList(options) {
|
|
|
38830
39407
|
});
|
|
38831
39408
|
}
|
|
38832
39409
|
}
|
|
38833
|
-
const aggregateScore = scored.length > 0 ?
|
|
39410
|
+
const aggregateScore = scored.length > 0 ? computeWeightedMean(
|
|
39411
|
+
scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
39412
|
+
) : 0;
|
|
38834
39413
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
38835
39414
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
38836
39415
|
const expectedAspectCount = scored.reduce(
|
|
@@ -39056,6 +39635,16 @@ function mapChildResults(children) {
|
|
|
39056
39635
|
evaluator_results: mapChildResults(child.evaluatorResults)
|
|
39057
39636
|
}));
|
|
39058
39637
|
}
|
|
39638
|
+
/**
 * Compute the weighted arithmetic mean of evaluator scores.
 *
 * A missing (`null`/`undefined`) weight defaults to 1. Entries whose weight is
 * not a finite, non-negative number, or whose score is not a finite number,
 * are skipped so a single malformed entry cannot turn the aggregate into NaN
 * (or corrupt the running total through string concatenation).
 *
 * @param {Array<{score: number, weight?: number}>} entries
 * @returns {number} The weighted mean, or 0 when no usable weight remains.
 */
function computeWeightedMean(entries) {
  let totalWeight = 0;
  let weightedSum = 0;
  for (const entry of entries) {
    const weight = entry.weight ?? 1;
    const weightIsUsable = typeof weight === "number" && Number.isFinite(weight) && weight >= 0;
    const scoreIsUsable = typeof entry.score === "number" && Number.isFinite(entry.score);
    if (!weightIsUsable || !scoreIsUsable) {
      continue;
    }
    totalWeight += weight;
    weightedSum += entry.score * weight;
  }
  // All-zero (or empty) weights would divide by zero; report 0 instead.
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
}
|
|
39059
39648
|
var rubricItemSchema = external_exports.object({
|
|
39060
39649
|
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
39061
39650
|
description: external_exports.string().describe("What this rubric checks for"),
|
|
@@ -39136,13 +39725,13 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
39136
39725
|
// src/commands/eval/env.ts
|
|
39137
39726
|
import { constants as constants4 } from "node:fs";
|
|
39138
39727
|
import { access as access4 } from "node:fs/promises";
|
|
39139
|
-
import
|
|
39728
|
+
import path14 from "node:path";
|
|
39140
39729
|
import { config as loadDotenv } from "dotenv";
|
|
39141
39730
|
function uniqueDirs(directories) {
|
|
39142
39731
|
const seen = /* @__PURE__ */ new Set();
|
|
39143
39732
|
const result = [];
|
|
39144
39733
|
for (const dir of directories) {
|
|
39145
|
-
const absolute =
|
|
39734
|
+
const absolute = path14.resolve(dir);
|
|
39146
39735
|
if (seen.has(absolute)) {
|
|
39147
39736
|
continue;
|
|
39148
39737
|
}
|
|
@@ -39161,14 +39750,14 @@ async function fileExists4(filePath) {
|
|
|
39161
39750
|
}
|
|
39162
39751
|
function collectAncestorDirectories(start, boundary) {
|
|
39163
39752
|
const directories = [];
|
|
39164
|
-
const boundaryDir =
|
|
39165
|
-
let current =
|
|
39753
|
+
const boundaryDir = path14.resolve(boundary);
|
|
39754
|
+
let current = path14.resolve(start);
|
|
39166
39755
|
while (current !== void 0) {
|
|
39167
39756
|
directories.push(current);
|
|
39168
39757
|
if (current === boundaryDir) {
|
|
39169
39758
|
break;
|
|
39170
39759
|
}
|
|
39171
|
-
const parent =
|
|
39760
|
+
const parent = path14.dirname(current);
|
|
39172
39761
|
if (parent === current) {
|
|
39173
39762
|
break;
|
|
39174
39763
|
}
|
|
@@ -39178,29 +39767,36 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
39178
39767
|
}
|
|
39179
39768
|
/**
 * Load every `.env` file found between the eval file's directory and the
 * repository root (plus the repo root and the current working directory).
 *
 * Files are loaded closest-first with `override: false`. With that option the
 * first value loaded for a variable wins, so a `.env` next to the eval file
 * takes precedence over one higher up, while variables already present in the
 * real environment are never replaced.
 *
 * @param {{testFilePath: string, repoRoot: string, verbose?: boolean}} options
 * @returns {Promise<string|undefined>} Path of the closest `.env` file, or
 *   `undefined` when none was found.
 */
async function loadEnvFromHierarchy(options) {
  const { testFilePath, repoRoot, verbose } = options;
  const testDir = path14.dirname(path14.resolve(testFilePath));
  const cwd = process.cwd();
  // Ordered from the eval file's directory outwards; duplicates removed.
  const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
  const envFiles = [];
  for (const dir of searchDirs) {
    const candidate = path14.join(dir, ".env");
    if (await fileExists4(candidate)) {
      envFiles.push(candidate);
    }
  }
  if (envFiles.length === 0) {
    if (verbose) {
      console.log("No .env file found in hierarchy");
    }
    return void 0;
  }
  // Closest file first: with override disabled, earlier loads keep precedence.
  for (const envFile of envFiles) {
    loadDotenv({ path: envFile, override: false });
    if (verbose) {
      console.log(`Loaded environment from: ${envFile}`);
    }
  }
  return envFiles[0];
}
|
|
39199
39795
|
|
|
39200
39796
|
// src/commands/eval/jsonl-writer.ts
|
|
39201
39797
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
39202
39798
|
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
39203
|
-
import
|
|
39799
|
+
import path15 from "node:path";
|
|
39204
39800
|
import { finished } from "node:stream/promises";
|
|
39205
39801
|
|
|
39206
39802
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
@@ -39418,7 +40014,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
39418
40014
|
this.stream = stream;
|
|
39419
40015
|
}
|
|
39420
40016
|
static async open(filePath) {
|
|
39421
|
-
await mkdir5(
|
|
40017
|
+
await mkdir5(path15.dirname(filePath), { recursive: true });
|
|
39422
40018
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
39423
40019
|
return new _JsonlWriter(stream);
|
|
39424
40020
|
}
|
|
@@ -39450,7 +40046,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
39450
40046
|
// src/commands/eval/yaml-writer.ts
|
|
39451
40047
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
39452
40048
|
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
39453
|
-
import
|
|
40049
|
+
import path16 from "node:path";
|
|
39454
40050
|
import { finished as finished2 } from "node:stream/promises";
|
|
39455
40051
|
import { stringify as stringifyYaml } from "yaml";
|
|
39456
40052
|
var YamlWriter = class _YamlWriter {
|
|
@@ -39462,7 +40058,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
39462
40058
|
this.stream = stream;
|
|
39463
40059
|
}
|
|
39464
40060
|
static async open(filePath) {
|
|
39465
|
-
await mkdir6(
|
|
40061
|
+
await mkdir6(path16.dirname(filePath), { recursive: true });
|
|
39466
40062
|
const stream = createWriteStream3(filePath, { flags: "w", encoding: "utf8" });
|
|
39467
40063
|
return new _YamlWriter(stream);
|
|
39468
40064
|
}
|
|
@@ -39586,12 +40182,12 @@ var ProgressDisplay = class {
|
|
|
39586
40182
|
}
|
|
39587
40183
|
addLogPaths(paths) {
|
|
39588
40184
|
const newPaths = [];
|
|
39589
|
-
for (const
|
|
39590
|
-
if (this.logPathSet.has(
|
|
40185
|
+
for (const path27 of paths) {
|
|
40186
|
+
if (this.logPathSet.has(path27)) {
|
|
39591
40187
|
continue;
|
|
39592
40188
|
}
|
|
39593
|
-
this.logPathSet.add(
|
|
39594
|
-
newPaths.push(
|
|
40189
|
+
this.logPathSet.add(path27);
|
|
40190
|
+
newPaths.push(path27);
|
|
39595
40191
|
}
|
|
39596
40192
|
if (newPaths.length === 0) {
|
|
39597
40193
|
return;
|
|
@@ -39607,8 +40203,8 @@ var ProgressDisplay = class {
|
|
|
39607
40203
|
this.hasPrintedLogHeader = true;
|
|
39608
40204
|
}
|
|
39609
40205
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
39610
|
-
newPaths.forEach((
|
|
39611
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
40206
|
+
newPaths.forEach((path27, offset) => {
|
|
40207
|
+
console.log(`${startIndex + offset + 1}. ${path27}`);
|
|
39612
40208
|
});
|
|
39613
40209
|
}
|
|
39614
40210
|
scheduleRender() {
|
|
@@ -39656,8 +40252,8 @@ var ProgressDisplay = class {
|
|
|
39656
40252
|
if (this.logPaths.length > 0) {
|
|
39657
40253
|
lines.push("");
|
|
39658
40254
|
lines.push("Codex CLI logs:");
|
|
39659
|
-
this.logPaths.forEach((
|
|
39660
|
-
lines.push(`${index + 1}. ${
|
|
40255
|
+
this.logPaths.forEach((path27, index) => {
|
|
40256
|
+
lines.push(`${index + 1}. ${path27}`);
|
|
39661
40257
|
});
|
|
39662
40258
|
}
|
|
39663
40259
|
const rowCount = this.getRenderedRowCount(lines);
|
|
@@ -39864,7 +40460,7 @@ function formatEvaluationSummary(summary) {
|
|
|
39864
40460
|
|
|
39865
40461
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
39866
40462
|
import { readFile as readFile7 } from "node:fs/promises";
|
|
39867
|
-
import
|
|
40463
|
+
import path17 from "node:path";
|
|
39868
40464
|
import { parse as parse6 } from "yaml";
|
|
39869
40465
|
import { readFile as readFile23 } from "node:fs/promises";
|
|
39870
40466
|
import path23 from "node:path";
|
|
@@ -39907,8 +40503,8 @@ async function detectFileType(filePath) {
|
|
|
39907
40503
|
}
|
|
39908
40504
|
}
|
|
39909
40505
|
function inferFileTypeFromPath(filePath) {
|
|
39910
|
-
const normalized =
|
|
39911
|
-
const basename =
|
|
40506
|
+
const normalized = path17.normalize(filePath).replace(/\\/g, "/");
|
|
40507
|
+
const basename = path17.basename(filePath);
|
|
39912
40508
|
if (normalized.includes("/.agentv/")) {
|
|
39913
40509
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
39914
40510
|
return "config";
|
|
@@ -40053,6 +40649,26 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
40053
40649
|
message: `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
|
|
40054
40650
|
});
|
|
40055
40651
|
}
|
|
40652
|
+
const toolCalls = message.tool_calls;
|
|
40653
|
+
if (toolCalls !== void 0) {
|
|
40654
|
+
if (role !== "assistant") {
|
|
40655
|
+
errors.push({
|
|
40656
|
+
severity: "error",
|
|
40657
|
+
filePath,
|
|
40658
|
+
location: `${msgLocation}.tool_calls`,
|
|
40659
|
+
message: "tool_calls can only be specified on assistant messages"
|
|
40660
|
+
});
|
|
40661
|
+
} else if (!Array.isArray(toolCalls)) {
|
|
40662
|
+
errors.push({
|
|
40663
|
+
severity: "error",
|
|
40664
|
+
filePath,
|
|
40665
|
+
location: `${msgLocation}.tool_calls`,
|
|
40666
|
+
message: "tool_calls must be an array"
|
|
40667
|
+
});
|
|
40668
|
+
} else {
|
|
40669
|
+
validateToolCalls(toolCalls, `${msgLocation}.tool_calls`, filePath, errors);
|
|
40670
|
+
}
|
|
40671
|
+
}
|
|
40056
40672
|
const content = message.content;
|
|
40057
40673
|
if (typeof content === "string") {
|
|
40058
40674
|
validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
|
|
@@ -40117,6 +40733,30 @@ function validateContentForRoleMarkers(content, location, filePath, errors) {
|
|
|
40117
40733
|
}
|
|
40118
40734
|
}
|
|
40119
40735
|
}
|
|
40736
|
+
/**
 * Validate each entry of a message's `tool_calls` array and append one error
 * per problem to `errors`. An entry must be an object with a non-empty string
 * `tool` field.
 *
 * @param {Array<unknown>} toolCalls - Entries to check.
 * @param {string} location - Location prefix used in reported errors.
 * @param {string} filePath - File the entries came from.
 * @param {Array<object>} errors - Collector that receives validation errors.
 */
function validateToolCalls(toolCalls, location, filePath, errors) {
  const report = (where, message) => {
    errors.push({
      severity: "error",
      filePath,
      location: where,
      message
    });
  };
  for (const [index, entry] of toolCalls.entries()) {
    const callLocation = `${location}[${index}]`;
    if (!isObject2(entry)) {
      report(callLocation, "Tool call must be an object");
      continue;
    }
    const toolName = entry.tool;
    const hasToolName = typeof toolName === "string" && toolName.trim().length > 0;
    if (!hasToolName) {
      report(`${callLocation}.tool`, "Missing or invalid 'tool' field (must be a non-empty string)");
    }
  }
}
|
|
40120
40760
|
/**
 * Report whether `value` is a non-null object that is not an array.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObject22(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
@@ -40212,7 +40852,9 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
40212
40852
|
"response",
|
|
40213
40853
|
"delayMs",
|
|
40214
40854
|
"delayMinMs",
|
|
40215
|
-
"delayMaxMs"
|
|
40855
|
+
"delayMaxMs",
|
|
40856
|
+
"trace"
|
|
40857
|
+
// For testing tool_trajectory evaluator
|
|
40216
40858
|
]);
|
|
40217
40859
|
var CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
40218
40860
|
...COMMON_SETTINGS,
|
|
@@ -40735,12 +41377,12 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
40735
41377
|
// src/utils/targets.ts
|
|
40736
41378
|
import { constants as constants5 } from "node:fs";
|
|
40737
41379
|
import { access as access5 } from "node:fs/promises";
|
|
40738
|
-
import
|
|
41380
|
+
import path18 from "node:path";
|
|
40739
41381
|
var TARGET_FILE_CANDIDATES = [
|
|
40740
41382
|
"targets.yaml",
|
|
40741
41383
|
"targets.yml",
|
|
40742
|
-
|
|
40743
|
-
|
|
41384
|
+
path18.join(".agentv", "targets.yaml"),
|
|
41385
|
+
path18.join(".agentv", "targets.yml")
|
|
40744
41386
|
];
|
|
40745
41387
|
async function fileExists5(filePath) {
|
|
40746
41388
|
try {
|
|
@@ -40753,12 +41395,12 @@ async function fileExists5(filePath) {
|
|
|
40753
41395
|
async function discoverTargetsFile(options) {
|
|
40754
41396
|
const { explicitPath, testFilePath, repoRoot, cwd } = options;
|
|
40755
41397
|
if (explicitPath) {
|
|
40756
|
-
const resolvedExplicit =
|
|
41398
|
+
const resolvedExplicit = path18.resolve(explicitPath);
|
|
40757
41399
|
if (await fileExists5(resolvedExplicit)) {
|
|
40758
41400
|
return resolvedExplicit;
|
|
40759
41401
|
}
|
|
40760
41402
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
40761
|
-
const nested =
|
|
41403
|
+
const nested = path18.join(resolvedExplicit, candidate);
|
|
40762
41404
|
if (await fileExists5(nested)) {
|
|
40763
41405
|
return nested;
|
|
40764
41406
|
}
|
|
@@ -40766,13 +41408,13 @@ async function discoverTargetsFile(options) {
|
|
|
40766
41408
|
throw new Error(`targets.yaml not found at provided path: ${resolvedExplicit}`);
|
|
40767
41409
|
}
|
|
40768
41410
|
const directories = [...buildDirectoryChain(testFilePath, repoRoot)];
|
|
40769
|
-
const resolvedCwd =
|
|
41411
|
+
const resolvedCwd = path18.resolve(cwd);
|
|
40770
41412
|
if (!directories.includes(resolvedCwd)) {
|
|
40771
41413
|
directories.push(resolvedCwd);
|
|
40772
41414
|
}
|
|
40773
41415
|
for (const directory of directories) {
|
|
40774
41416
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
40775
|
-
const fullPath =
|
|
41417
|
+
const fullPath = path18.join(directory, candidate);
|
|
40776
41418
|
if (await fileExists5(fullPath)) {
|
|
40777
41419
|
return fullPath;
|
|
40778
41420
|
}
|
|
@@ -40881,7 +41523,7 @@ Errors in ${targetsFilePath}:`);
|
|
|
40881
41523
|
};
|
|
40882
41524
|
}
|
|
40883
41525
|
try {
|
|
40884
|
-
const resolvedTarget = resolveTargetDefinition(targetDefinition, env);
|
|
41526
|
+
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
|
|
40885
41527
|
return {
|
|
40886
41528
|
definitions,
|
|
40887
41529
|
resolvedTarget,
|
|
@@ -40938,7 +41580,9 @@ function normalizeOptions(rawOptions) {
|
|
|
40938
41580
|
maxRetries: normalizeNumber(rawOptions.maxRetries, 2),
|
|
40939
41581
|
cache: normalizeBoolean(rawOptions.cache),
|
|
40940
41582
|
verbose: normalizeBoolean(rawOptions.verbose),
|
|
40941
|
-
dumpPrompts: rawOptions.dumpPrompts
|
|
41583
|
+
dumpPrompts: rawOptions.dumpPrompts,
|
|
41584
|
+
dumpTraces: normalizeBoolean(rawOptions.dumpTraces),
|
|
41585
|
+
includeTrace: normalizeBoolean(rawOptions.includeTrace)
|
|
40942
41586
|
};
|
|
40943
41587
|
}
|
|
40944
41588
|
async function ensureFileExists(filePath, description) {
|
|
@@ -40949,15 +41593,15 @@ async function ensureFileExists(filePath, description) {
|
|
|
40949
41593
|
}
|
|
40950
41594
|
}
|
|
40951
41595
|
async function findRepoRoot(start) {
|
|
40952
|
-
const fallback =
|
|
41596
|
+
const fallback = path19.resolve(start);
|
|
40953
41597
|
let current = fallback;
|
|
40954
41598
|
while (current !== void 0) {
|
|
40955
|
-
const candidate =
|
|
41599
|
+
const candidate = path19.join(current, ".git");
|
|
40956
41600
|
try {
|
|
40957
41601
|
await access6(candidate, constants6.F_OK);
|
|
40958
41602
|
return current;
|
|
40959
41603
|
} catch {
|
|
40960
|
-
const parent =
|
|
41604
|
+
const parent = path19.dirname(current);
|
|
40961
41605
|
if (parent === current) {
|
|
40962
41606
|
break;
|
|
40963
41607
|
}
|
|
@@ -40970,16 +41614,16 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
40970
41614
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
40971
41615
|
const baseName = "eval";
|
|
40972
41616
|
const extension = getDefaultExtension(format);
|
|
40973
|
-
return
|
|
41617
|
+
return path19.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
40974
41618
|
}
|
|
40975
41619
|
/**
 * Work out where prompts should be dumped.
 *
 * @param {unknown} option4 - Raw option value: `undefined` disables dumping, a
 *   non-blank string is a directory resolved against `cwd`, anything else
 *   selects the default `.agentv/prompts` directory under `cwd`.
 * @param {string} cwd - Base directory.
 * @returns {string|undefined}
 */
function resolvePromptDirectory(option4, cwd) {
  if (option4 === void 0) {
    return void 0;
  }
  const hasExplicitPath = typeof option4 === "string" && option4.trim().length > 0;
  return hasExplicitPath ? path19.resolve(cwd, option4) : path19.join(cwd, ".agentv", "prompts");
}
|
|
40984
41628
|
function createEvaluationCache() {
|
|
40985
41629
|
const store = /* @__PURE__ */ new Map();
|
|
@@ -41004,7 +41648,7 @@ function createProgressReporter(maxWorkers) {
|
|
|
41004
41648
|
};
|
|
41005
41649
|
}
|
|
41006
41650
|
/**
 * Build a key for an eval case from the absolute path of its eval file and
 * its id, joined by `::`.
 *
 * @param {string} testFilePath - Path to the eval file (resolved to absolute).
 * @param {string} evalId - Eval case id.
 * @returns {string}
 */
function makeEvalKey(testFilePath, evalId) {
  const absoluteTestFile = path19.resolve(testFilePath);
  return `${absoluteTestFile}::${evalId}`;
}
|
|
41009
41653
|
function createDisplayIdTracker() {
|
|
41010
41654
|
const map2 = /* @__PURE__ */ new Map();
|
|
@@ -41108,10 +41752,6 @@ async function runSingleEvalFile(params) {
|
|
|
41108
41752
|
);
|
|
41109
41753
|
resolvedWorkers = 1;
|
|
41110
41754
|
}
|
|
41111
|
-
if (options.verbose) {
|
|
41112
|
-
const workersSource = workerPreference ? "CLI flag (balanced across files)" : resolvedTargetSelection.resolvedTarget.workers ? "target setting" : "default";
|
|
41113
|
-
console.log(`Using ${resolvedWorkers} worker(s) (source: ${workersSource})`);
|
|
41114
|
-
}
|
|
41115
41755
|
if (isVSCodeProvider && !options.dryRun) {
|
|
41116
41756
|
await ensureVSCodeSubagents({
|
|
41117
41757
|
kind: resolvedTargetSelection.resolvedTarget.kind,
|
|
@@ -41164,7 +41804,7 @@ async function runEvalCommand(input) {
|
|
|
41164
41804
|
if (options.verbose) {
|
|
41165
41805
|
console.log(`Repository root: ${repoRoot}`);
|
|
41166
41806
|
}
|
|
41167
|
-
const outputPath = options.outPath ?
|
|
41807
|
+
const outputPath = options.outPath ? path19.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
|
|
41168
41808
|
console.log(`Output path: ${outputPath}`);
|
|
41169
41809
|
const outputWriter = await createOutputWriter(outputPath, options.format);
|
|
41170
41810
|
const cache = options.cache ? createEvaluationCache() : void 0;
|
|
@@ -41172,7 +41812,7 @@ async function runEvalCommand(input) {
|
|
|
41172
41812
|
const allResults = [];
|
|
41173
41813
|
let lastPromptDumpDir;
|
|
41174
41814
|
const seenEvalCases = /* @__PURE__ */ new Set();
|
|
41175
|
-
const resolvedTestFiles = input.testFiles.map((file2) =>
|
|
41815
|
+
const resolvedTestFiles = input.testFiles.map((file2) => path19.resolve(file2));
|
|
41176
41816
|
const displayIdTracker = createDisplayIdTracker();
|
|
41177
41817
|
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
|
|
41178
41818
|
const fileConcurrency = Math.min(
|
|
@@ -41268,7 +41908,7 @@ async function resolveEvaluationRunner() {
|
|
|
41268
41908
|
if (!overridePath) {
|
|
41269
41909
|
return runEvaluation;
|
|
41270
41910
|
}
|
|
41271
|
-
const resolved =
|
|
41911
|
+
const resolved = path19.isAbsolute(overridePath) ? overridePath : path19.resolve(process.cwd(), overridePath);
|
|
41272
41912
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
41273
41913
|
const mod = await import(moduleUrl);
|
|
41274
41914
|
const candidate = mod.runEvaluation;
|
|
@@ -41369,6 +42009,14 @@ var evalCommand = command({
|
|
|
41369
42009
|
type: optional2(string4),
|
|
41370
42010
|
long: "dump-prompts",
|
|
41371
42011
|
description: "Directory path for persisting prompt payloads for debugging"
|
|
42012
|
+
}),
|
|
42013
|
+
dumpTraces: flag({
|
|
42014
|
+
long: "dump-traces",
|
|
42015
|
+
description: "Write trace files to .agentv/traces/"
|
|
42016
|
+
}),
|
|
42017
|
+
includeTrace: flag({
|
|
42018
|
+
long: "include-trace",
|
|
42019
|
+
description: "Include full trace in result output (verbose)"
|
|
41372
42020
|
})
|
|
41373
42021
|
},
|
|
41374
42022
|
handler: async (args) => {
|
|
@@ -41389,7 +42037,9 @@ var evalCommand = command({
|
|
|
41389
42037
|
maxRetries: args.maxRetries,
|
|
41390
42038
|
cache: args.cache,
|
|
41391
42039
|
verbose: args.verbose,
|
|
41392
|
-
dumpPrompts
|
|
42040
|
+
dumpPrompts,
|
|
42041
|
+
dumpTraces: args.dumpTraces,
|
|
42042
|
+
includeTrace: args.includeTrace
|
|
41393
42043
|
};
|
|
41394
42044
|
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
|
|
41395
42045
|
}
|
|
@@ -41402,7 +42052,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
41402
42052
|
const unmatched = [];
|
|
41403
42053
|
const results = /* @__PURE__ */ new Set();
|
|
41404
42054
|
for (const pattern of normalizedInputs) {
|
|
41405
|
-
const candidatePath =
|
|
42055
|
+
const candidatePath = path20.isAbsolute(pattern) ? path20.normalize(pattern) : path20.resolve(cwd, pattern);
|
|
41406
42056
|
try {
|
|
41407
42057
|
const stats = await stat4(candidatePath);
|
|
41408
42058
|
if (stats.isFile() && /\.ya?ml$/i.test(candidatePath)) {
|
|
@@ -41426,7 +42076,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
41426
42076
|
continue;
|
|
41427
42077
|
}
|
|
41428
42078
|
for (const filePath of yamlMatches) {
|
|
41429
|
-
results.add(
|
|
42079
|
+
results.add(path20.normalize(filePath));
|
|
41430
42080
|
}
|
|
41431
42081
|
}
|
|
41432
42082
|
if (unmatched.length > 0) {
|
|
@@ -41446,7 +42096,7 @@ import { command as command2, flag as flag2, option as option2, optional as opti
|
|
|
41446
42096
|
|
|
41447
42097
|
// src/commands/generate/rubrics.ts
|
|
41448
42098
|
import { readFile as readFile8, writeFile as writeFile6 } from "node:fs/promises";
|
|
41449
|
-
import
|
|
42099
|
+
import path21 from "node:path";
|
|
41450
42100
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
41451
42101
|
import { isMap, isSeq, parseDocument } from "yaml";
|
|
41452
42102
|
function isJsonObject3(value) {
|
|
@@ -41458,7 +42108,7 @@ function asString6(value) {
|
|
|
41458
42108
|
async function loadRubricGenerator() {
|
|
41459
42109
|
const customGenerator = process.env.AGENTEVO_CLI_RUBRIC_GENERATOR;
|
|
41460
42110
|
if (customGenerator) {
|
|
41461
|
-
const generatorPath =
|
|
42111
|
+
const generatorPath = path21.resolve(customGenerator);
|
|
41462
42112
|
const generatorUrl = pathToFileURL2(generatorPath).href;
|
|
41463
42113
|
const module = await import(generatorUrl);
|
|
41464
42114
|
return module.generateRubrics;
|
|
@@ -41468,7 +42118,7 @@ async function loadRubricGenerator() {
|
|
|
41468
42118
|
async function generateRubricsCommand(options) {
|
|
41469
42119
|
const { file: file2, target: targetOverride, verbose } = options;
|
|
41470
42120
|
console.log(`Generating rubrics for: ${file2}`);
|
|
41471
|
-
const absolutePath =
|
|
42121
|
+
const absolutePath = path21.resolve(file2);
|
|
41472
42122
|
const content = await readFile8(absolutePath, "utf8");
|
|
41473
42123
|
const doc = parseDocument(content);
|
|
41474
42124
|
const parsed = doc.toJSON();
|
|
@@ -41629,13 +42279,13 @@ var generateCommand = subcommands({
|
|
|
41629
42279
|
|
|
41630
42280
|
// src/commands/init/index.ts
|
|
41631
42281
|
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
|
41632
|
-
import
|
|
42282
|
+
import path25 from "node:path";
|
|
41633
42283
|
import * as readline from "node:readline/promises";
|
|
41634
42284
|
import { command as command3, option as option3, optional as optional4, string as string6 } from "cmd-ts";
|
|
41635
42285
|
|
|
41636
42286
|
// src/templates/index.ts
|
|
41637
42287
|
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
41638
|
-
import
|
|
42288
|
+
import path24 from "node:path";
|
|
41639
42289
|
import { fileURLToPath } from "node:url";
|
|
41640
42290
|
function getGithubTemplates() {
|
|
41641
42291
|
return getTemplatesFromDir(".github");
|
|
@@ -41647,12 +42297,12 @@ function getClaudeTemplates() {
|
|
|
41647
42297
|
return getTemplatesFromDir(".claude");
|
|
41648
42298
|
}
|
|
41649
42299
|
function getTemplatesFromDir(subdir) {
|
|
41650
|
-
const currentDir =
|
|
42300
|
+
const currentDir = path24.dirname(fileURLToPath(import.meta.url));
|
|
41651
42301
|
let templatesDir;
|
|
41652
|
-
if (currentDir.includes(`${
|
|
41653
|
-
templatesDir =
|
|
42302
|
+
if (currentDir.includes(`${path24.sep}dist`)) {
|
|
42303
|
+
templatesDir = path24.join(currentDir, "templates", subdir);
|
|
41654
42304
|
} else {
|
|
41655
|
-
templatesDir =
|
|
42305
|
+
templatesDir = path24.join(currentDir, subdir);
|
|
41656
42306
|
}
|
|
41657
42307
|
return readTemplatesRecursively(templatesDir, "");
|
|
41658
42308
|
}
|
|
@@ -41660,15 +42310,15 @@ function readTemplatesRecursively(dir, relativePath) {
|
|
|
41660
42310
|
const templates = [];
|
|
41661
42311
|
const entries = readdirSync(dir);
|
|
41662
42312
|
for (const entry of entries) {
|
|
41663
|
-
const fullPath =
|
|
42313
|
+
const fullPath = path24.join(dir, entry);
|
|
41664
42314
|
const stat6 = statSync(fullPath);
|
|
41665
|
-
const entryRelativePath = relativePath ?
|
|
42315
|
+
const entryRelativePath = relativePath ? path24.join(relativePath, entry) : entry;
|
|
41666
42316
|
if (stat6.isDirectory()) {
|
|
41667
42317
|
templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
|
|
41668
42318
|
} else {
|
|
41669
42319
|
const content = readFileSync(fullPath, "utf-8");
|
|
41670
42320
|
templates.push({
|
|
41671
|
-
path: entryRelativePath.split(
|
|
42321
|
+
path: entryRelativePath.split(path24.sep).join("/"),
|
|
41672
42322
|
// Normalize to forward slashes
|
|
41673
42323
|
content
|
|
41674
42324
|
});
|
|
@@ -41691,10 +42341,10 @@ async function promptYesNo(message) {
|
|
|
41691
42341
|
}
|
|
41692
42342
|
}
|
|
41693
42343
|
async function initCommand(options = {}) {
|
|
41694
|
-
const targetPath =
|
|
41695
|
-
const githubDir =
|
|
41696
|
-
const agentvDir =
|
|
41697
|
-
const claudeDir =
|
|
42344
|
+
const targetPath = path25.resolve(options.targetPath ?? ".");
|
|
42345
|
+
const githubDir = path25.join(targetPath, ".github");
|
|
42346
|
+
const agentvDir = path25.join(targetPath, ".agentv");
|
|
42347
|
+
const claudeDir = path25.join(targetPath, ".claude");
|
|
41698
42348
|
const githubTemplates = getGithubTemplates();
|
|
41699
42349
|
const agentvTemplates = getAgentvTemplates();
|
|
41700
42350
|
const claudeTemplates = getClaudeTemplates();
|
|
@@ -41702,32 +42352,32 @@ async function initCommand(options = {}) {
|
|
|
41702
42352
|
const otherAgentvTemplates = agentvTemplates.filter((t) => t.path !== ".env.template");
|
|
41703
42353
|
const existingFiles = [];
|
|
41704
42354
|
if (envTemplate) {
|
|
41705
|
-
const envFilePath =
|
|
42355
|
+
const envFilePath = path25.join(targetPath, ".env.template");
|
|
41706
42356
|
if (existsSync(envFilePath)) {
|
|
41707
42357
|
existingFiles.push(".env.template");
|
|
41708
42358
|
}
|
|
41709
42359
|
}
|
|
41710
42360
|
if (existsSync(githubDir)) {
|
|
41711
42361
|
for (const template of githubTemplates) {
|
|
41712
|
-
const targetFilePath =
|
|
42362
|
+
const targetFilePath = path25.join(githubDir, template.path);
|
|
41713
42363
|
if (existsSync(targetFilePath)) {
|
|
41714
|
-
existingFiles.push(
|
|
42364
|
+
existingFiles.push(path25.relative(targetPath, targetFilePath));
|
|
41715
42365
|
}
|
|
41716
42366
|
}
|
|
41717
42367
|
}
|
|
41718
42368
|
if (existsSync(agentvDir)) {
|
|
41719
42369
|
for (const template of otherAgentvTemplates) {
|
|
41720
|
-
const targetFilePath =
|
|
42370
|
+
const targetFilePath = path25.join(agentvDir, template.path);
|
|
41721
42371
|
if (existsSync(targetFilePath)) {
|
|
41722
|
-
existingFiles.push(
|
|
42372
|
+
existingFiles.push(path25.relative(targetPath, targetFilePath));
|
|
41723
42373
|
}
|
|
41724
42374
|
}
|
|
41725
42375
|
}
|
|
41726
42376
|
if (existsSync(claudeDir)) {
|
|
41727
42377
|
for (const template of claudeTemplates) {
|
|
41728
|
-
const targetFilePath =
|
|
42378
|
+
const targetFilePath = path25.join(claudeDir, template.path);
|
|
41729
42379
|
if (existsSync(targetFilePath)) {
|
|
41730
|
-
existingFiles.push(
|
|
42380
|
+
existingFiles.push(path25.relative(targetPath, targetFilePath));
|
|
41731
42381
|
}
|
|
41732
42382
|
}
|
|
41733
42383
|
}
|
|
@@ -41754,36 +42404,36 @@ async function initCommand(options = {}) {
|
|
|
41754
42404
|
mkdirSync(claudeDir, { recursive: true });
|
|
41755
42405
|
}
|
|
41756
42406
|
if (envTemplate) {
|
|
41757
|
-
const envFilePath =
|
|
42407
|
+
const envFilePath = path25.join(targetPath, ".env.template");
|
|
41758
42408
|
writeFileSync(envFilePath, envTemplate.content, "utf-8");
|
|
41759
42409
|
console.log("Created .env.template");
|
|
41760
42410
|
}
|
|
41761
42411
|
for (const template of githubTemplates) {
|
|
41762
|
-
const targetFilePath =
|
|
41763
|
-
const targetDirPath =
|
|
42412
|
+
const targetFilePath = path25.join(githubDir, template.path);
|
|
42413
|
+
const targetDirPath = path25.dirname(targetFilePath);
|
|
41764
42414
|
if (!existsSync(targetDirPath)) {
|
|
41765
42415
|
mkdirSync(targetDirPath, { recursive: true });
|
|
41766
42416
|
}
|
|
41767
42417
|
writeFileSync(targetFilePath, template.content, "utf-8");
|
|
41768
|
-
console.log(`Created ${
|
|
42418
|
+
console.log(`Created ${path25.relative(targetPath, targetFilePath)}`);
|
|
41769
42419
|
}
|
|
41770
42420
|
for (const template of otherAgentvTemplates) {
|
|
41771
|
-
const targetFilePath =
|
|
41772
|
-
const targetDirPath =
|
|
42421
|
+
const targetFilePath = path25.join(agentvDir, template.path);
|
|
42422
|
+
const targetDirPath = path25.dirname(targetFilePath);
|
|
41773
42423
|
if (!existsSync(targetDirPath)) {
|
|
41774
42424
|
mkdirSync(targetDirPath, { recursive: true });
|
|
41775
42425
|
}
|
|
41776
42426
|
writeFileSync(targetFilePath, template.content, "utf-8");
|
|
41777
|
-
console.log(`Created ${
|
|
42427
|
+
console.log(`Created ${path25.relative(targetPath, targetFilePath)}`);
|
|
41778
42428
|
}
|
|
41779
42429
|
for (const template of claudeTemplates) {
|
|
41780
|
-
const targetFilePath =
|
|
41781
|
-
const targetDirPath =
|
|
42430
|
+
const targetFilePath = path25.join(claudeDir, template.path);
|
|
42431
|
+
const targetDirPath = path25.dirname(targetFilePath);
|
|
41782
42432
|
if (!existsSync(targetDirPath)) {
|
|
41783
42433
|
mkdirSync(targetDirPath, { recursive: true });
|
|
41784
42434
|
}
|
|
41785
42435
|
writeFileSync(targetFilePath, template.content, "utf-8");
|
|
41786
|
-
console.log(`Created ${
|
|
42436
|
+
console.log(`Created ${path25.relative(targetPath, targetFilePath)}`);
|
|
41787
42437
|
}
|
|
41788
42438
|
console.log("\nAgentV initialized successfully!");
|
|
41789
42439
|
console.log("\nFiles installed to root:");
|
|
@@ -41791,17 +42441,17 @@ async function initCommand(options = {}) {
|
|
|
41791
42441
|
console.log(" - .env.template");
|
|
41792
42442
|
}
|
|
41793
42443
|
console.log(`
|
|
41794
|
-
Files installed to ${
|
|
42444
|
+
Files installed to ${path25.relative(targetPath, githubDir)}:`);
|
|
41795
42445
|
for (const t of githubTemplates) {
|
|
41796
42446
|
console.log(` - ${t.path}`);
|
|
41797
42447
|
}
|
|
41798
42448
|
console.log(`
|
|
41799
|
-
Files installed to ${
|
|
42449
|
+
Files installed to ${path25.relative(targetPath, agentvDir)}:`);
|
|
41800
42450
|
for (const t of otherAgentvTemplates) {
|
|
41801
42451
|
console.log(` - ${t.path}`);
|
|
41802
42452
|
}
|
|
41803
42453
|
console.log(`
|
|
41804
|
-
Files installed to ${
|
|
42454
|
+
Files installed to ${path25.relative(targetPath, claudeDir)}:`);
|
|
41805
42455
|
for (const t of claudeTemplates) {
|
|
41806
42456
|
console.log(` - ${t.path}`);
|
|
41807
42457
|
}
|
|
@@ -41916,7 +42566,7 @@ function isTTY2() {
|
|
|
41916
42566
|
// src/commands/validate/validate-files.ts
|
|
41917
42567
|
import { constants as constants7 } from "node:fs";
|
|
41918
42568
|
import { access as access7, readdir as readdir3, stat as stat5 } from "node:fs/promises";
|
|
41919
|
-
import
|
|
42569
|
+
import path26 from "node:path";
|
|
41920
42570
|
async function validateFiles(paths) {
|
|
41921
42571
|
const filePaths = await expandPaths(paths);
|
|
41922
42572
|
const results = [];
|
|
@@ -41934,7 +42584,7 @@ async function validateFiles(paths) {
|
|
|
41934
42584
|
};
|
|
41935
42585
|
}
|
|
41936
42586
|
async function validateSingleFile(filePath) {
|
|
41937
|
-
const absolutePath =
|
|
42587
|
+
const absolutePath = path26.resolve(filePath);
|
|
41938
42588
|
const fileType = await detectFileType(absolutePath);
|
|
41939
42589
|
let result;
|
|
41940
42590
|
if (fileType === "eval") {
|
|
@@ -41959,7 +42609,7 @@ async function validateSingleFile(filePath) {
|
|
|
41959
42609
|
async function expandPaths(paths) {
|
|
41960
42610
|
const expanded = [];
|
|
41961
42611
|
for (const inputPath of paths) {
|
|
41962
|
-
const absolutePath =
|
|
42612
|
+
const absolutePath = path26.resolve(inputPath);
|
|
41963
42613
|
try {
|
|
41964
42614
|
await access7(absolutePath, constants7.F_OK);
|
|
41965
42615
|
} catch {
|
|
@@ -41983,7 +42633,7 @@ async function findYamlFiles(dirPath) {
|
|
|
41983
42633
|
try {
|
|
41984
42634
|
const entries = await readdir3(dirPath, { withFileTypes: true });
|
|
41985
42635
|
for (const entry of entries) {
|
|
41986
|
-
const fullPath =
|
|
42636
|
+
const fullPath = path26.join(dirPath, entry.name);
|
|
41987
42637
|
if (entry.isDirectory()) {
|
|
41988
42638
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
41989
42639
|
continue;
|
|
@@ -42000,7 +42650,7 @@ async function findYamlFiles(dirPath) {
|
|
|
42000
42650
|
return results;
|
|
42001
42651
|
}
|
|
42002
42652
|
function isYamlFile(filePath) {
|
|
42003
|
-
const ext =
|
|
42653
|
+
const ext = path26.extname(filePath).toLowerCase();
|
|
42004
42654
|
return ext === ".yaml" || ext === ".yml";
|
|
42005
42655
|
}
|
|
42006
42656
|
|
|
@@ -42058,4 +42708,4 @@ export {
|
|
|
42058
42708
|
app,
|
|
42059
42709
|
runCli
|
|
42060
42710
|
};
|
|
42061
|
-
//# sourceMappingURL=chunk-
|
|
42711
|
+
//# sourceMappingURL=chunk-6ZM7WVSC.js.map
|