agentv 2.1.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -1
- package/dist/{chunk-HTTN5OWL.js → chunk-XREH4WAJ.js} +1197 -385
- package/dist/chunk-XREH4WAJ.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.example +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +65 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +57 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +59 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +85 -18
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +78 -4
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +78 -77
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/package.json +1 -1
- package/dist/chunk-HTTN5OWL.js.map +0 -1
|
@@ -373,9 +373,9 @@ var compareCommand = command({
|
|
|
373
373
|
|
|
374
374
|
// src/commands/convert/index.ts
|
|
375
375
|
import { readFileSync as readFileSync2, writeFileSync } from "node:fs";
|
|
376
|
-
import
|
|
376
|
+
import path16 from "node:path";
|
|
377
377
|
|
|
378
|
-
// ../../packages/core/dist/chunk-
|
|
378
|
+
// ../../packages/core/dist/chunk-RP3M7COZ.js
|
|
379
379
|
import { constants } from "node:fs";
|
|
380
380
|
import { access, readFile } from "node:fs/promises";
|
|
381
381
|
import path from "node:path";
|
|
@@ -859,8 +859,8 @@ function getErrorMap() {
|
|
|
859
859
|
|
|
860
860
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
861
861
|
var makeIssue = (params) => {
|
|
862
|
-
const { data, path:
|
|
863
|
-
const fullPath = [...
|
|
862
|
+
const { data, path: path30, errorMaps, issueData } = params;
|
|
863
|
+
const fullPath = [...path30, ...issueData.path || []];
|
|
864
864
|
const fullIssue = {
|
|
865
865
|
...issueData,
|
|
866
866
|
path: fullPath
|
|
@@ -976,11 +976,11 @@ var errorUtil;
|
|
|
976
976
|
|
|
977
977
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
978
978
|
var ParseInputLazyPath = class {
|
|
979
|
-
constructor(parent, value,
|
|
979
|
+
constructor(parent, value, path30, key2) {
|
|
980
980
|
this._cachedPath = [];
|
|
981
981
|
this.parent = parent;
|
|
982
982
|
this.data = value;
|
|
983
|
-
this._path =
|
|
983
|
+
this._path = path30;
|
|
984
984
|
this._key = key2;
|
|
985
985
|
}
|
|
986
986
|
get path() {
|
|
@@ -4422,7 +4422,7 @@ var coerce = {
|
|
|
4422
4422
|
};
|
|
4423
4423
|
var NEVER = INVALID;
|
|
4424
4424
|
|
|
4425
|
-
// ../../packages/core/dist/chunk-
|
|
4425
|
+
// ../../packages/core/dist/chunk-RP3M7COZ.js
|
|
4426
4426
|
async function fileExists(filePath) {
|
|
4427
4427
|
try {
|
|
4428
4428
|
await access(filePath, constants.F_OK);
|
|
@@ -5418,8 +5418,9 @@ function isAgentProvider(provider) {
|
|
|
5418
5418
|
}
|
|
5419
5419
|
|
|
5420
5420
|
// ../../packages/core/dist/index.js
|
|
5421
|
-
import { readFile as
|
|
5422
|
-
import
|
|
5421
|
+
import { readFile as readFile6 } from "node:fs/promises";
|
|
5422
|
+
import path72 from "node:path";
|
|
5423
|
+
import micromatch3 from "micromatch";
|
|
5423
5424
|
import { parse as parse22 } from "yaml";
|
|
5424
5425
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
5425
5426
|
import path22 from "node:path";
|
|
@@ -5430,10 +5431,14 @@ import { access as access3 } from "node:fs/promises";
|
|
|
5430
5431
|
import path13 from "node:path";
|
|
5431
5432
|
import path32 from "node:path";
|
|
5432
5433
|
import { readFile as readFile22 } from "node:fs/promises";
|
|
5433
|
-
import { readFile as readFile32 } from "node:fs/promises";
|
|
5434
|
-
import path42 from "node:path";
|
|
5435
5434
|
import { readFile as readFile42 } from "node:fs/promises";
|
|
5436
5435
|
import path52 from "node:path";
|
|
5436
|
+
import micromatch2 from "micromatch";
|
|
5437
|
+
import { parse as parseYaml } from "yaml";
|
|
5438
|
+
import { readFile as readFile32 } from "node:fs/promises";
|
|
5439
|
+
import path42 from "node:path";
|
|
5440
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
5441
|
+
import path62 from "node:path";
|
|
5437
5442
|
|
|
5438
5443
|
// ../../node_modules/.bun/@ai-sdk+provider@2.0.0/node_modules/@ai-sdk/provider/dist/index.mjs
|
|
5439
5444
|
var marker = "vercel.ai.error";
|
|
@@ -6523,10 +6528,10 @@ function assignProp(target, prop, value) {
|
|
|
6523
6528
|
configurable: true
|
|
6524
6529
|
});
|
|
6525
6530
|
}
|
|
6526
|
-
function getElementAtPath(obj,
|
|
6527
|
-
if (!
|
|
6531
|
+
function getElementAtPath(obj, path30) {
|
|
6532
|
+
if (!path30)
|
|
6528
6533
|
return obj;
|
|
6529
|
-
return
|
|
6534
|
+
return path30.reduce((acc, key2) => acc?.[key2], obj);
|
|
6530
6535
|
}
|
|
6531
6536
|
function promiseAllObject(promisesObj) {
|
|
6532
6537
|
const keys = Object.keys(promisesObj);
|
|
@@ -6846,11 +6851,11 @@ function aborted(x, startIndex = 0) {
|
|
|
6846
6851
|
}
|
|
6847
6852
|
return false;
|
|
6848
6853
|
}
|
|
6849
|
-
function prefixIssues(
|
|
6854
|
+
function prefixIssues(path30, issues) {
|
|
6850
6855
|
return issues.map((iss) => {
|
|
6851
6856
|
var _a17;
|
|
6852
6857
|
(_a17 = iss).path ?? (_a17.path = []);
|
|
6853
|
-
iss.path.unshift(
|
|
6858
|
+
iss.path.unshift(path30);
|
|
6854
6859
|
return iss;
|
|
6855
6860
|
});
|
|
6856
6861
|
}
|
|
@@ -6987,7 +6992,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6987
6992
|
return issue2.message;
|
|
6988
6993
|
};
|
|
6989
6994
|
const result = { errors: [] };
|
|
6990
|
-
const processError = (error41,
|
|
6995
|
+
const processError = (error41, path30 = []) => {
|
|
6991
6996
|
var _a17, _b8;
|
|
6992
6997
|
for (const issue2 of error41.issues) {
|
|
6993
6998
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -6997,7 +7002,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6997
7002
|
} else if (issue2.code === "invalid_element") {
|
|
6998
7003
|
processError({ issues: issue2.issues }, issue2.path);
|
|
6999
7004
|
} else {
|
|
7000
|
-
const fullpath = [...
|
|
7005
|
+
const fullpath = [...path30, ...issue2.path];
|
|
7001
7006
|
if (fullpath.length === 0) {
|
|
7002
7007
|
result.errors.push(mapper(issue2));
|
|
7003
7008
|
continue;
|
|
@@ -7027,9 +7032,9 @@ function treeifyError(error40, _mapper) {
|
|
|
7027
7032
|
processError(error40);
|
|
7028
7033
|
return result;
|
|
7029
7034
|
}
|
|
7030
|
-
function toDotPath(
|
|
7035
|
+
function toDotPath(path30) {
|
|
7031
7036
|
const segs = [];
|
|
7032
|
-
for (const seg of
|
|
7037
|
+
for (const seg of path30) {
|
|
7033
7038
|
if (typeof seg === "number")
|
|
7034
7039
|
segs.push(`[${seg}]`);
|
|
7035
7040
|
else if (typeof seg === "symbol")
|
|
@@ -26582,14 +26587,14 @@ function createAzure(options = {}) {
|
|
|
26582
26587
|
description: "Azure OpenAI resource name"
|
|
26583
26588
|
});
|
|
26584
26589
|
const apiVersion = (_a17 = options.apiVersion) != null ? _a17 : "v1";
|
|
26585
|
-
const url2 = ({ path:
|
|
26590
|
+
const url2 = ({ path: path30, modelId }) => {
|
|
26586
26591
|
var _a24;
|
|
26587
26592
|
const baseUrlPrefix = (_a24 = options.baseURL) != null ? _a24 : `https://${getResourceName()}.openai.azure.com/openai`;
|
|
26588
26593
|
let fullUrl;
|
|
26589
26594
|
if (options.useDeploymentBasedUrls) {
|
|
26590
|
-
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${
|
|
26595
|
+
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${path30}`);
|
|
26591
26596
|
} else {
|
|
26592
|
-
fullUrl = new URL(`${baseUrlPrefix}/v1${
|
|
26597
|
+
fullUrl = new URL(`${baseUrlPrefix}/v1${path30}`);
|
|
26593
26598
|
}
|
|
26594
26599
|
fullUrl.searchParams.set("api-version", apiVersion);
|
|
26595
26600
|
return fullUrl.toString();
|
|
@@ -33025,27 +33030,27 @@ import { randomUUID } from "node:crypto";
|
|
|
33025
33030
|
import { createWriteStream } from "node:fs";
|
|
33026
33031
|
import { mkdir as mkdir4, mkdtemp, rm as rm2, writeFile as writeFile5 } from "node:fs/promises";
|
|
33027
33032
|
import { tmpdir } from "node:os";
|
|
33033
|
+
import path92 from "node:path";
|
|
33028
33034
|
import path82 from "node:path";
|
|
33029
|
-
import path72 from "node:path";
|
|
33030
33035
|
import { exec as execWithCallback } from "node:child_process";
|
|
33031
33036
|
import fs from "node:fs/promises";
|
|
33032
33037
|
import os2 from "node:os";
|
|
33033
|
-
import
|
|
33038
|
+
import path102 from "node:path";
|
|
33034
33039
|
import { promisify as promisify2 } from "node:util";
|
|
33035
33040
|
import { exec as execCallback, spawn as spawn22 } from "node:child_process";
|
|
33036
33041
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
33037
33042
|
import { constants as constants22, createWriteStream as createWriteStream2 } from "node:fs";
|
|
33038
33043
|
import { access as access22, mkdir as mkdir22, mkdtemp as mkdtemp2, rm as rm22, writeFile as writeFile22 } from "node:fs/promises";
|
|
33039
33044
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
33040
|
-
import
|
|
33045
|
+
import path112 from "node:path";
|
|
33041
33046
|
import { promisify as promisify22 } from "node:util";
|
|
33042
33047
|
import { spawn as spawn3 } from "node:child_process";
|
|
33043
33048
|
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
33044
33049
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
33045
33050
|
import { mkdir as mkdir32, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile32 } from "node:fs/promises";
|
|
33046
33051
|
import { tmpdir as tmpdir3 } from "node:os";
|
|
33047
|
-
import path112 from "node:path";
|
|
33048
33052
|
import path122 from "node:path";
|
|
33053
|
+
import path132 from "node:path";
|
|
33049
33054
|
|
|
33050
33055
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/agentDispatch.js
|
|
33051
33056
|
import { stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
|
|
@@ -35067,13 +35072,14 @@ async function provisionSubagents(options) {
|
|
|
35067
35072
|
|
|
35068
35073
|
// ../../packages/core/dist/index.js
|
|
35069
35074
|
import { constants as constants32 } from "node:fs";
|
|
35070
|
-
import { access as access32, readFile as
|
|
35071
|
-
import
|
|
35075
|
+
import { access as access32, readFile as readFile7 } from "node:fs/promises";
|
|
35076
|
+
import path14 from "node:path";
|
|
35072
35077
|
import { parse as parse32 } from "yaml";
|
|
35073
35078
|
import { randomBytes } from "node:crypto";
|
|
35074
35079
|
import { createServer } from "node:http";
|
|
35075
35080
|
import { createHash } from "node:crypto";
|
|
35076
|
-
import
|
|
35081
|
+
import path15 from "node:path";
|
|
35082
|
+
import micromatch4 from "micromatch";
|
|
35077
35083
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
35078
35084
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
35079
35085
|
function isTestMessageRole(value) {
|
|
@@ -35449,11 +35455,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35449
35455
|
);
|
|
35450
35456
|
}
|
|
35451
35457
|
}
|
|
35452
|
-
const
|
|
35453
|
-
const
|
|
35458
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
35459
|
+
const config22 = {};
|
|
35454
35460
|
for (const [key2, value] of Object.entries(rawEvaluator)) {
|
|
35455
|
-
if (!
|
|
35456
|
-
|
|
35461
|
+
if (!knownProps2.has(key2) && value !== void 0) {
|
|
35462
|
+
config22[key2] = value;
|
|
35457
35463
|
}
|
|
35458
35464
|
}
|
|
35459
35465
|
evaluators.push({
|
|
@@ -35463,7 +35469,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35463
35469
|
cwd,
|
|
35464
35470
|
resolvedCwd,
|
|
35465
35471
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35466
|
-
...Object.keys(
|
|
35472
|
+
...Object.keys(config22).length > 0 ? { config: config22 } : {},
|
|
35467
35473
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
35468
35474
|
});
|
|
35469
35475
|
continue;
|
|
@@ -35628,7 +35634,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35628
35634
|
continue;
|
|
35629
35635
|
}
|
|
35630
35636
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35631
|
-
const
|
|
35637
|
+
const config22 = {
|
|
35632
35638
|
name: name16,
|
|
35633
35639
|
type: "tool_trajectory",
|
|
35634
35640
|
mode,
|
|
@@ -35636,7 +35642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35636
35642
|
...expected ? { expected } : {},
|
|
35637
35643
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35638
35644
|
};
|
|
35639
|
-
evaluators.push(
|
|
35645
|
+
evaluators.push(config22);
|
|
35640
35646
|
continue;
|
|
35641
35647
|
}
|
|
35642
35648
|
if (typeValue === "field_accuracy") {
|
|
@@ -35773,9 +35779,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35773
35779
|
});
|
|
35774
35780
|
continue;
|
|
35775
35781
|
}
|
|
35776
|
-
const
|
|
35782
|
+
const rawPrompt = rawEvaluator.prompt;
|
|
35783
|
+
let prompt;
|
|
35777
35784
|
let promptPath;
|
|
35778
|
-
|
|
35785
|
+
let resolvedPromptScript;
|
|
35786
|
+
let promptScriptConfig;
|
|
35787
|
+
if (isJsonObject2(rawPrompt)) {
|
|
35788
|
+
const scriptArray = asStringArray(
|
|
35789
|
+
rawPrompt.script,
|
|
35790
|
+
`prompt.script for evaluator '${name16}' in '${evalId}'`
|
|
35791
|
+
);
|
|
35792
|
+
if (!scriptArray) {
|
|
35793
|
+
throw new Error(`Evaluator '${name16}' in '${evalId}': prompt object requires script array`);
|
|
35794
|
+
}
|
|
35795
|
+
const scriptPath = scriptArray[scriptArray.length - 1];
|
|
35796
|
+
const resolved = await resolveFileReference2(scriptPath, searchRoots);
|
|
35797
|
+
if (resolved.resolvedPath) {
|
|
35798
|
+
resolvedPromptScript = [...scriptArray.slice(0, -1), path32.resolve(resolved.resolvedPath)];
|
|
35799
|
+
} else {
|
|
35800
|
+
throw new Error(
|
|
35801
|
+
`Evaluator '${name16}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
|
|
35802
|
+
);
|
|
35803
|
+
}
|
|
35804
|
+
if (isJsonObject2(rawPrompt.config)) {
|
|
35805
|
+
promptScriptConfig = rawPrompt.config;
|
|
35806
|
+
}
|
|
35807
|
+
} else if (typeof rawPrompt === "string") {
|
|
35808
|
+
prompt = rawPrompt;
|
|
35779
35809
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
35780
35810
|
if (resolved.resolvedPath) {
|
|
35781
35811
|
promptPath = path32.resolve(resolved.resolvedPath);
|
|
@@ -35794,12 +35824,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35794
35824
|
}
|
|
35795
35825
|
const _model = asString(rawEvaluator.model);
|
|
35796
35826
|
const rawRubrics = rawEvaluator.rubrics;
|
|
35797
|
-
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics
|
|
35798
|
-
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
35799
|
-
description: asString(rubric.description) ?? "",
|
|
35800
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
35801
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
35802
|
-
})).filter((r) => r.description.length > 0) : void 0;
|
|
35827
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name16, evalId) : void 0;
|
|
35803
35828
|
if (typeValue === "rubric") {
|
|
35804
35829
|
if (!parsedRubrics) {
|
|
35805
35830
|
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
|
|
@@ -35819,13 +35844,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35819
35844
|
continue;
|
|
35820
35845
|
}
|
|
35821
35846
|
const weight = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35847
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
|
|
35848
|
+
const config2 = {};
|
|
35849
|
+
for (const [key2, value] of Object.entries(rawEvaluator)) {
|
|
35850
|
+
if (!knownProps.has(key2) && value !== void 0) {
|
|
35851
|
+
config2[key2] = value;
|
|
35852
|
+
}
|
|
35853
|
+
}
|
|
35854
|
+
const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
|
|
35855
|
+
const mergedConfig = { ...config2, ...topLevelConfig };
|
|
35856
|
+
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
35822
35857
|
evaluators.push({
|
|
35823
35858
|
name: name16,
|
|
35824
35859
|
type: "llm_judge",
|
|
35825
35860
|
prompt,
|
|
35826
35861
|
promptPath,
|
|
35862
|
+
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
35863
|
+
...resolvedPromptScript ? { resolvedPromptScript } : {},
|
|
35827
35864
|
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
35828
|
-
...weight !== void 0 ? { weight } : {}
|
|
35865
|
+
...weight !== void 0 ? { weight } : {},
|
|
35866
|
+
...finalConfig ? { config: finalConfig } : {}
|
|
35829
35867
|
});
|
|
35830
35868
|
}
|
|
35831
35869
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -35912,6 +35950,185 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
35912
35950
|
function isValidFieldAggregationType(value) {
|
|
35913
35951
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
35914
35952
|
}
|
|
35953
|
+
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
35954
|
+
const items = [];
|
|
35955
|
+
for (const [index, rawRubric] of rawRubrics.entries()) {
|
|
35956
|
+
if (!isJsonObject2(rawRubric)) {
|
|
35957
|
+
logWarning2(
|
|
35958
|
+
`Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
|
|
35959
|
+
);
|
|
35960
|
+
continue;
|
|
35961
|
+
}
|
|
35962
|
+
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
35963
|
+
const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
|
|
35964
|
+
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
35965
|
+
let requiredMinScore;
|
|
35966
|
+
let required2;
|
|
35967
|
+
if (typeof rawRubric.required_min_score === "number") {
|
|
35968
|
+
const minScore = rawRubric.required_min_score;
|
|
35969
|
+
if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
|
|
35970
|
+
throw new Error(
|
|
35971
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
|
|
35972
|
+
);
|
|
35973
|
+
}
|
|
35974
|
+
requiredMinScore = minScore;
|
|
35975
|
+
}
|
|
35976
|
+
if (typeof rawRubric.required === "boolean") {
|
|
35977
|
+
required2 = rawRubric.required;
|
|
35978
|
+
}
|
|
35979
|
+
let scoreRanges;
|
|
35980
|
+
const rawScoreRanges = rawRubric.score_ranges;
|
|
35981
|
+
if (rawScoreRanges !== void 0) {
|
|
35982
|
+
if (!Array.isArray(rawScoreRanges)) {
|
|
35983
|
+
throw new Error(
|
|
35984
|
+
`Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
|
|
35985
|
+
);
|
|
35986
|
+
}
|
|
35987
|
+
scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
|
|
35988
|
+
items.push({
|
|
35989
|
+
id,
|
|
35990
|
+
weight,
|
|
35991
|
+
...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
|
|
35992
|
+
...required2 !== void 0 ? { required: required2 } : {},
|
|
35993
|
+
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
35994
|
+
score_ranges: scoreRanges
|
|
35995
|
+
});
|
|
35996
|
+
} else {
|
|
35997
|
+
if (expectedOutcome.length === 0) {
|
|
35998
|
+
logWarning2(
|
|
35999
|
+
`Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
|
|
36000
|
+
);
|
|
36001
|
+
continue;
|
|
36002
|
+
}
|
|
36003
|
+
items.push({
|
|
36004
|
+
id,
|
|
36005
|
+
expected_outcome: expectedOutcome,
|
|
36006
|
+
weight,
|
|
36007
|
+
// Default to required: true if not specified (backward compatibility)
|
|
36008
|
+
required: required2 ?? true,
|
|
36009
|
+
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
36010
|
+
});
|
|
36011
|
+
}
|
|
36012
|
+
}
|
|
36013
|
+
return items.length > 0 ? items : void 0;
|
|
36014
|
+
}
|
|
36015
|
+
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
|
|
36016
|
+
const ranges = [];
|
|
36017
|
+
for (const [index, rawRange] of rawRanges.entries()) {
|
|
36018
|
+
if (!isJsonObject2(rawRange)) {
|
|
36019
|
+
throw new Error(
|
|
36020
|
+
`Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
|
|
36021
|
+
);
|
|
36022
|
+
}
|
|
36023
|
+
const scoreRangeValue = rawRange.score_range;
|
|
36024
|
+
if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
|
|
36025
|
+
throw new Error(
|
|
36026
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
|
|
36027
|
+
);
|
|
36028
|
+
}
|
|
36029
|
+
const [min, max] = scoreRangeValue;
|
|
36030
|
+
if (!Number.isInteger(min) || !Number.isInteger(max)) {
|
|
36031
|
+
throw new Error(
|
|
36032
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
|
|
36033
|
+
);
|
|
36034
|
+
}
|
|
36035
|
+
if (min < 0 || min > 10 || max < 0 || max > 10) {
|
|
36036
|
+
throw new Error(
|
|
36037
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
|
|
36038
|
+
);
|
|
36039
|
+
}
|
|
36040
|
+
if (min > max) {
|
|
36041
|
+
throw new Error(
|
|
36042
|
+
`Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
|
|
36043
|
+
);
|
|
36044
|
+
}
|
|
36045
|
+
const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
|
|
36046
|
+
if (expectedOutcome.length === 0) {
|
|
36047
|
+
throw new Error(
|
|
36048
|
+
`Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
|
|
36049
|
+
);
|
|
36050
|
+
}
|
|
36051
|
+
ranges.push({
|
|
36052
|
+
score_range: [min, max],
|
|
36053
|
+
expected_outcome: expectedOutcome
|
|
36054
|
+
});
|
|
36055
|
+
}
|
|
36056
|
+
const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
|
|
36057
|
+
for (let i = 1; i < sortedRanges.length; i++) {
|
|
36058
|
+
const prev = sortedRanges[i - 1];
|
|
36059
|
+
const curr = sortedRanges[i];
|
|
36060
|
+
if (curr.score_range[0] <= prev.score_range[1]) {
|
|
36061
|
+
throw new Error(
|
|
36062
|
+
`Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
|
|
36063
|
+
);
|
|
36064
|
+
}
|
|
36065
|
+
}
|
|
36066
|
+
const covered = /* @__PURE__ */ new Set();
|
|
36067
|
+
for (const range of ranges) {
|
|
36068
|
+
for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
|
|
36069
|
+
covered.add(i);
|
|
36070
|
+
}
|
|
36071
|
+
}
|
|
36072
|
+
const missing = [];
|
|
36073
|
+
for (let i = 0; i <= 10; i++) {
|
|
36074
|
+
if (!covered.has(i)) {
|
|
36075
|
+
missing.push(i);
|
|
36076
|
+
}
|
|
36077
|
+
}
|
|
36078
|
+
if (missing.length > 0) {
|
|
36079
|
+
throw new Error(
|
|
36080
|
+
`Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
|
|
36081
|
+
);
|
|
36082
|
+
}
|
|
36083
|
+
return ranges;
|
|
36084
|
+
}
|
|
36085
|
+
function parseInlineRubrics(rawRubrics) {
|
|
36086
|
+
const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
|
|
36087
|
+
if (typeof rubric === "string") {
|
|
36088
|
+
return {
|
|
36089
|
+
id: `rubric-${index + 1}`,
|
|
36090
|
+
expected_outcome: rubric,
|
|
36091
|
+
weight: 1,
|
|
36092
|
+
required: true
|
|
36093
|
+
};
|
|
36094
|
+
}
|
|
36095
|
+
const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
|
|
36096
|
+
const rawScoreRanges = rubric.score_ranges;
|
|
36097
|
+
const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
|
|
36098
|
+
score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
|
|
36099
|
+
expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
|
|
36100
|
+
})).filter((r) => r.expected_outcome.length > 0) : void 0;
|
|
36101
|
+
const baseRubric = {
|
|
36102
|
+
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
36103
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
36104
|
+
};
|
|
36105
|
+
if (scoreRanges && scoreRanges.length > 0) {
|
|
36106
|
+
return {
|
|
36107
|
+
...baseRubric,
|
|
36108
|
+
...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
|
|
36109
|
+
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
36110
|
+
...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
|
|
36111
|
+
score_ranges: scoreRanges
|
|
36112
|
+
};
|
|
36113
|
+
}
|
|
36114
|
+
return {
|
|
36115
|
+
...baseRubric,
|
|
36116
|
+
expected_outcome: expectedOutcome,
|
|
36117
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
36118
|
+
...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
|
|
36119
|
+
};
|
|
36120
|
+
}).filter(
|
|
36121
|
+
(r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
|
|
36122
|
+
);
|
|
36123
|
+
if (rubricItems.length === 0) {
|
|
36124
|
+
return void 0;
|
|
36125
|
+
}
|
|
36126
|
+
return {
|
|
36127
|
+
name: "rubric",
|
|
36128
|
+
type: "llm_judge",
|
|
36129
|
+
rubrics: rubricItems
|
|
36130
|
+
};
|
|
36131
|
+
}
|
|
35915
36132
|
function formatFileContents(parts) {
|
|
35916
36133
|
const fileCount = parts.filter((p) => p.isFile).length;
|
|
35917
36134
|
if (fileCount > 0) {
|
|
@@ -36164,25 +36381,295 @@ async function processExpectedMessages(options) {
|
|
|
36164
36381
|
}
|
|
36165
36382
|
return segments;
|
|
36166
36383
|
}
|
|
36384
|
+
function expandInputShorthand(value) {
|
|
36385
|
+
if (value === void 0 || value === null) {
|
|
36386
|
+
return void 0;
|
|
36387
|
+
}
|
|
36388
|
+
if (typeof value === "string") {
|
|
36389
|
+
return [{ role: "user", content: value }];
|
|
36390
|
+
}
|
|
36391
|
+
if (Array.isArray(value)) {
|
|
36392
|
+
const messages = value.filter((msg) => isTestMessage(msg));
|
|
36393
|
+
return messages.length > 0 ? messages : void 0;
|
|
36394
|
+
}
|
|
36395
|
+
return void 0;
|
|
36396
|
+
}
|
|
36397
|
+
function expandExpectedOutputShorthand(value) {
|
|
36398
|
+
if (value === void 0 || value === null) {
|
|
36399
|
+
return void 0;
|
|
36400
|
+
}
|
|
36401
|
+
if (typeof value === "string") {
|
|
36402
|
+
return [{ role: "assistant", content: value }];
|
|
36403
|
+
}
|
|
36404
|
+
if (Array.isArray(value)) {
|
|
36405
|
+
if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
|
|
36406
|
+
const messages = value.filter((msg) => isTestMessage(msg));
|
|
36407
|
+
return messages.length > 0 ? messages : void 0;
|
|
36408
|
+
}
|
|
36409
|
+
return [{ role: "assistant", content: value }];
|
|
36410
|
+
}
|
|
36411
|
+
if (isJsonObject(value)) {
|
|
36412
|
+
if ("role" in value) {
|
|
36413
|
+
return isTestMessage(value) ? [value] : void 0;
|
|
36414
|
+
}
|
|
36415
|
+
return [{ role: "assistant", content: value }];
|
|
36416
|
+
}
|
|
36417
|
+
return void 0;
|
|
36418
|
+
}
|
|
36419
|
+
function resolveInputMessages(raw) {
|
|
36420
|
+
if (raw.input_messages !== void 0) {
|
|
36421
|
+
if (Array.isArray(raw.input_messages)) {
|
|
36422
|
+
const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
|
|
36423
|
+
return messages.length > 0 ? messages : void 0;
|
|
36424
|
+
}
|
|
36425
|
+
return void 0;
|
|
36426
|
+
}
|
|
36427
|
+
return expandInputShorthand(raw.input);
|
|
36428
|
+
}
|
|
36429
|
+
function resolveExpectedMessages(raw) {
|
|
36430
|
+
if (raw.expected_messages !== void 0) {
|
|
36431
|
+
if (Array.isArray(raw.expected_messages)) {
|
|
36432
|
+
const messages = raw.expected_messages.filter(
|
|
36433
|
+
(msg) => isTestMessage(msg)
|
|
36434
|
+
);
|
|
36435
|
+
return messages.length > 0 ? messages : void 0;
|
|
36436
|
+
}
|
|
36437
|
+
return void 0;
|
|
36438
|
+
}
|
|
36439
|
+
return expandExpectedOutputShorthand(raw.expected_output);
|
|
36440
|
+
}
|
|
36167
36441
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
36442
|
+
var ANSI_RED = "\x1B[31m";
|
|
36168
36443
|
var ANSI_RESET5 = "\x1B[0m";
|
|
36444
|
+
function detectFormat(filePath) {
|
|
36445
|
+
const ext = path52.extname(filePath).toLowerCase();
|
|
36446
|
+
if (ext === ".jsonl") return "jsonl";
|
|
36447
|
+
if (ext === ".yaml" || ext === ".yml") return "yaml";
|
|
36448
|
+
throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
|
|
36449
|
+
}
|
|
36450
|
+
async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
36451
|
+
const dir = path52.dirname(jsonlPath);
|
|
36452
|
+
const base = path52.basename(jsonlPath, ".jsonl");
|
|
36453
|
+
const sidecarPath = path52.join(dir, `${base}.yaml`);
|
|
36454
|
+
if (!await fileExists2(sidecarPath)) {
|
|
36455
|
+
if (verbose) {
|
|
36456
|
+
logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
|
|
36457
|
+
}
|
|
36458
|
+
return {};
|
|
36459
|
+
}
|
|
36460
|
+
try {
|
|
36461
|
+
const content = await readFile42(sidecarPath, "utf8");
|
|
36462
|
+
const parsed = parseYaml(content);
|
|
36463
|
+
if (!isJsonObject(parsed)) {
|
|
36464
|
+
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
36465
|
+
return {};
|
|
36466
|
+
}
|
|
36467
|
+
return {
|
|
36468
|
+
description: asString4(parsed.description),
|
|
36469
|
+
dataset: asString4(parsed.dataset),
|
|
36470
|
+
execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
|
|
36471
|
+
evaluator: parsed.evaluator
|
|
36472
|
+
};
|
|
36473
|
+
} catch (error40) {
|
|
36474
|
+
logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error40.message}`);
|
|
36475
|
+
return {};
|
|
36476
|
+
}
|
|
36477
|
+
}
|
|
36478
|
+
/**
 * Parse the text of a JSONL file into an array of JSON objects.
 *
 * Lines are split on "\n" and trimmed; lines that are empty after trimming
 * are skipped. Every remaining line must be valid JSON and must satisfy
 * `isJsonObject`.
 *
 * @param {string} content - Full text of the JSONL file.
 * @param {string} filePath - Path used only for the error message.
 * @returns {object[]} Parsed objects in file order.
 * @throws {Error} On the first offending line, reporting its 1-based physical
 *   line number and the file path.
 */
function parseJsonlContent(content, filePath) {
  const records = [];
  for (const [index, rawLine] of content.split("\n").entries()) {
    const text = rawLine.trim();
    if (text.length === 0) {
      continue;
    }
    try {
      const value = JSON.parse(text);
      if (!isJsonObject(value)) {
        throw new Error("Expected JSON object");
      }
      records.push(value);
    } catch (error40) {
      const reason = error40 instanceof Error ? error40.message : String(error40);
      throw new Error(`Line ${index + 1}: Invalid JSON - ${reason}\nFile: ${filePath}`);
    }
  }
  return records;
}
|
|
36498
|
+
/**
 * Load eval cases from a JSONL file (one JSON object per non-blank line).
 *
 * Dataset-level defaults (description, dataset name, execution, evaluator)
 * come from the optional YAML sidecar read by `loadSidecarMetadata`; per-case
 * `execution` / `evaluator` values take precedence over them.
 *
 * Cases are skipped (with an error logged through `logError`) when they lack
 * `id`, `expected_outcome`/`outcome` or input messages, or when their
 * evaluators cannot be parsed. When `options.filter` is set, only cases whose
 * `id` matches the pattern are returned.
 *
 * @param {string} evalFilePath - Path to the `.jsonl` eval file.
 * @param {string} repoRoot - Repository root used for file/config resolution.
 * @param {{verbose?: boolean, filter?: string}} [options]
 * @returns {Promise<object[]>} The eval cases that passed validation, in file order.
 * @throws {Error} Propagates read errors and `parseJsonlContent` failures.
 */
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
  const filterPattern = options?.filter;
  const absoluteTestPath = path52.resolve(evalFilePath);
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
  const config2 = await loadConfig(absoluteTestPath, repoRootPath);
  const guidelinePatterns = config2?.guideline_patterns;
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
  const rawFile = await readFile42(absoluteTestPath, "utf8");
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
  // parseJsonlContent drops blank lines, so the index into rawCases is not the
  // physical line number. Record the 1-based line of every non-blank line so
  // diagnostics point at the right place in the file.
  const sourceLineNumbers = [];
  rawFile.split("\n").forEach((line, index) => {
    if (line.trim() !== "") {
      sourceLineNumbers.push(index + 1);
    }
  });
  const fallbackDataset = path52.basename(absoluteTestPath, ".jsonl") || "eval";
  // Use the trimmed sidecar name, matching how the YAML suite loader treats `dataset`.
  const sidecarDataset = sidecar.dataset?.trim();
  const datasetName = sidecarDataset && sidecarDataset.length > 0 ? sidecarDataset : fallbackDataset;
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
  const globalExecution = sidecar.execution;
  if (verbose) {
    console.log(`\n[JSONL Dataset: ${evalFilePath}]`);
    console.log(` Cases: ${rawCases.length}`);
    console.log(` Dataset name: ${datasetName}`);
    if (sidecar.description) {
      console.log(` Description: ${sidecar.description}`);
    }
  }
  const results = [];
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
    const evalcase = rawCases[lineIndex];
    const lineNumber = sourceLineNumbers[lineIndex] ?? lineIndex + 1;
    const id = asString4(evalcase.id);
    if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
      continue;
    }
    const conversationId = asString4(evalcase.conversation_id);
    const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
    const inputMessages = resolveInputMessages(evalcase);
    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
      logError(
        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
      );
      continue;
    }
    const hasExpectedMessages = expectedMessages.length > 0;
    // processMessages fills these two arrays as a side effect.
    const guidelinePaths = [];
    const inputTextParts = [];
    const inputSegments = await processMessages({
      messages: inputMessages,
      searchRoots,
      repoRootPath,
      guidelinePatterns,
      guidelinePaths,
      textParts: inputTextParts,
      messageType: "input",
      verbose
    });
    const outputSegments = hasExpectedMessages ? await processExpectedMessages({
      messages: expectedMessages,
      searchRoots,
      repoRootPath,
      verbose
    }) : [];
    // The reference answer is derived from the last expected message: its
    // string content, else its JSON-serialized content, else its tool calls.
    let referenceAnswer = "";
    if (outputSegments.length > 0) {
      const lastMessage = outputSegments[outputSegments.length - 1];
      const content = lastMessage.content;
      const toolCalls = lastMessage.tool_calls;
      if (typeof content === "string") {
        referenceAnswer = content;
      } else if (content !== void 0 && content !== null) {
        referenceAnswer = JSON.stringify(content, null, 2);
      } else if (toolCalls !== void 0 && toolCalls !== null) {
        referenceAnswer = JSON.stringify(toolCalls, null, 2);
      }
    }
    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
    // A per-case execution block replaces (does not merge with) the sidecar one.
    const mergedExecution = caseExecution ?? globalExecution;
    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
    let evaluators;
    try {
      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id);
    } catch (error40) {
      const message = error40 instanceof Error ? error40.message : String(error40);
      logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
      continue;
    }
    const inlineRubrics = evalcase.rubrics;
    if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
      if (rubricEvaluator) {
        // The inline rubric evaluator goes first, ahead of any configured evaluators.
        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
      }
    }
    const userFilePaths = [];
    for (const segment of inputSegments) {
      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
        userFilePaths.push(segment.resolvedPath);
      }
    }
    const resolvedGuidelinePaths = guidelinePaths.map((guidelinePath) => path52.resolve(guidelinePath));
    const allFilePaths = [
      ...resolvedGuidelinePaths,
      ...userFilePaths
    ];
    const testCase = {
      id,
      dataset: datasetName,
      conversation_id: conversationId,
      question,
      input_messages: inputMessages,
      input_segments: inputSegments,
      expected_messages: outputSegments,
      reference_answer: referenceAnswer,
      guideline_paths: resolvedGuidelinePaths,
      guideline_patterns: guidelinePatterns,
      file_paths: allFilePaths,
      expected_outcome: outcome,
      evaluator: evalCaseEvaluatorKind,
      evaluators
    };
    if (verbose) {
      console.log(`\n[Eval Case: ${id}]`);
      if (testCase.guideline_paths.length > 0) {
        console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
        for (const guidelinePath of testCase.guideline_paths) {
          console.log(` - ${guidelinePath}`);
        }
      } else {
        console.log(" No guidelines found");
      }
    }
    results.push(testCase);
  }
  return results;
}
|
|
36633
|
+
/**
 * Narrow an arbitrary value to a string.
 *
 * @param {unknown} value - Any value.
 * @returns {string | undefined} `value` itself when it is a string primitive
 *   (including the empty string); otherwise `undefined`.
 */
function asString4(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
36636
|
+
/**
 * Print a warning via `console.warn`, wrapped in the yellow/reset ANSI
 * sequences held in ANSI_YELLOW5 and ANSI_RESET5.
 *
 * @param {string} message - Text placed after the "Warning: " prefix.
 * @param {string[]} [details] - Optional extra lines; when non-empty they are
 *   joined with newlines and printed on the lines following the message.
 * @returns {void}
 */
function logWarning4(message, details) {
  const hasDetails = Boolean(details) && details.length > 0;
  const body = hasDetails ? `Warning: ${message}\n${details.join("\n")}` : `Warning: ${message}`;
  console.warn(`${ANSI_YELLOW5}${body}${ANSI_RESET5}`);
}
|
|
36645
|
+
/**
 * Print an error via `console.error`, wrapped in the red/reset ANSI
 * sequences held in ANSI_RED and ANSI_RESET5.
 *
 * @param {string} message - Text placed after the "Error: " prefix.
 * @param {string[]} [details] - Optional extra lines; when non-empty they are
 *   joined with newlines and printed on the lines following the message.
 * @returns {void}
 */
function logError(message, details) {
  const hasDetails = Boolean(details) && details.length > 0;
  const body = hasDetails ? `Error: ${message}\n${details.join("\n")}` : `Error: ${message}`;
  console.error(`${ANSI_RED}${body}${ANSI_RESET5}`);
}
|
|
36654
|
+
// ANSI SGR escape sequence selecting a yellow foreground; prefixes warnings printed by logWarning5.
var ANSI_YELLOW6 = "\x1B[33m";
|
|
36655
|
+
// ANSI SGR escape sequence resetting all text attributes; appended after logWarning5 output.
var ANSI_RESET6 = "\x1B[0m";
|
|
36169
36656
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
36170
36657
|
const guidelineParts = [];
|
|
36171
36658
|
for (const rawPath of testCase.guideline_paths) {
|
|
36172
|
-
const absolutePath =
|
|
36659
|
+
const absolutePath = path62.resolve(rawPath);
|
|
36173
36660
|
if (!await fileExists2(absolutePath)) {
|
|
36174
|
-
|
|
36661
|
+
logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
36175
36662
|
continue;
|
|
36176
36663
|
}
|
|
36177
36664
|
try {
|
|
36178
|
-
const content = (await
|
|
36665
|
+
const content = (await readFile5(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
36179
36666
|
guidelineParts.push({
|
|
36180
36667
|
content,
|
|
36181
36668
|
isFile: true,
|
|
36182
|
-
displayPath:
|
|
36669
|
+
displayPath: path62.basename(absolutePath)
|
|
36183
36670
|
});
|
|
36184
36671
|
} catch (error40) {
|
|
36185
|
-
|
|
36672
|
+
logWarning5(`Could not read guideline file ${absolutePath}: ${error40.message}`);
|
|
36186
36673
|
}
|
|
36187
36674
|
}
|
|
36188
36675
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -36206,9 +36693,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
36206
36693
|
messageSegments.push({ type: "text", value: segment });
|
|
36207
36694
|
}
|
|
36208
36695
|
} else if (isJsonObject(segment)) {
|
|
36209
|
-
const type =
|
|
36696
|
+
const type = asString5(segment.type);
|
|
36210
36697
|
if (type === "file") {
|
|
36211
|
-
const value =
|
|
36698
|
+
const value = asString5(segment.value);
|
|
36212
36699
|
if (!value) continue;
|
|
36213
36700
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
36214
36701
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -36219,7 +36706,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
36219
36706
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
36220
36707
|
}
|
|
36221
36708
|
} else if (type === "text") {
|
|
36222
|
-
const textValue =
|
|
36709
|
+
const textValue = asString5(segment.value);
|
|
36223
36710
|
if (textValue && textValue.trim().length > 0) {
|
|
36224
36711
|
messageSegments.push({ type: "text", value: textValue });
|
|
36225
36712
|
}
|
|
@@ -36373,19 +36860,19 @@ ${guidelineContent.trim()}`);
|
|
|
36373
36860
|
}
|
|
36374
36861
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
36375
36862
|
}
|
|
36376
|
-
function
|
|
36863
|
+
function asString5(value) {
|
|
36377
36864
|
return typeof value === "string" ? value : void 0;
|
|
36378
36865
|
}
|
|
36379
|
-
function
|
|
36380
|
-
console.warn(`${
|
|
36866
|
+
function logWarning5(message) {
|
|
36867
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
36381
36868
|
}
|
|
36382
|
-
var
|
|
36383
|
-
var
|
|
36384
|
-
var
|
|
36869
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
36870
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
36871
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
36385
36872
|
async function readTestSuiteMetadata(testFilePath) {
|
|
36386
36873
|
try {
|
|
36387
|
-
const absolutePath =
|
|
36388
|
-
const content = await
|
|
36874
|
+
const absolutePath = path72.resolve(testFilePath);
|
|
36875
|
+
const content = await readFile6(absolutePath, "utf8");
|
|
36389
36876
|
const parsed = parse22(content);
|
|
36390
36877
|
if (!isJsonObject(parsed)) {
|
|
36391
36878
|
return {};
|
|
@@ -36396,21 +36883,25 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
36396
36883
|
}
|
|
36397
36884
|
}
|
|
36398
36885
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
36886
|
+
const format = detectFormat(evalFilePath);
|
|
36887
|
+
if (format === "jsonl") {
|
|
36888
|
+
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
36889
|
+
}
|
|
36399
36890
|
const verbose = options?.verbose ?? false;
|
|
36400
|
-
const
|
|
36401
|
-
const absoluteTestPath =
|
|
36891
|
+
const filterPattern = options?.filter;
|
|
36892
|
+
const absoluteTestPath = path72.resolve(evalFilePath);
|
|
36402
36893
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
36403
36894
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
36404
36895
|
const config2 = await loadConfig(absoluteTestPath, repoRootPath);
|
|
36405
36896
|
const guidelinePatterns = config2?.guideline_patterns;
|
|
36406
|
-
const rawFile = await
|
|
36897
|
+
const rawFile = await readFile6(absoluteTestPath, "utf8");
|
|
36407
36898
|
const parsed = parse22(rawFile);
|
|
36408
36899
|
if (!isJsonObject(parsed)) {
|
|
36409
36900
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
36410
36901
|
}
|
|
36411
36902
|
const suite = parsed;
|
|
36412
|
-
const datasetNameFromSuite =
|
|
36413
|
-
const fallbackDataset =
|
|
36903
|
+
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
36904
|
+
const fallbackDataset = path72.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
36414
36905
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
36415
36906
|
const rawTestcases = suite.evalcases;
|
|
36416
36907
|
if (!Array.isArray(rawTestcases)) {
|
|
@@ -36418,37 +36909,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36418
36909
|
}
|
|
36419
36910
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
36420
36911
|
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
36421
|
-
const _globalTarget =
|
|
36912
|
+
const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
|
|
36422
36913
|
const results = [];
|
|
36423
36914
|
for (const rawEvalcase of rawTestcases) {
|
|
36424
36915
|
if (!isJsonObject(rawEvalcase)) {
|
|
36425
|
-
|
|
36916
|
+
logWarning6("Skipping invalid eval case entry (expected object)");
|
|
36426
36917
|
continue;
|
|
36427
36918
|
}
|
|
36428
36919
|
const evalcase = rawEvalcase;
|
|
36429
|
-
const id =
|
|
36430
|
-
if (
|
|
36920
|
+
const id = asString6(evalcase.id);
|
|
36921
|
+
if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
|
|
36431
36922
|
continue;
|
|
36432
36923
|
}
|
|
36433
|
-
const conversationId =
|
|
36434
|
-
const outcome =
|
|
36435
|
-
const
|
|
36436
|
-
const
|
|
36437
|
-
if (!id || !outcome || !
|
|
36438
|
-
|
|
36439
|
-
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
36924
|
+
const conversationId = asString6(evalcase.conversation_id);
|
|
36925
|
+
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
36926
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
36927
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
36928
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
36929
|
+
logError2(
|
|
36930
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
|
|
36440
36931
|
);
|
|
36441
36932
|
continue;
|
|
36442
36933
|
}
|
|
36443
|
-
const hasExpectedMessages =
|
|
36444
|
-
const inputMessages = inputMessagesValue.filter(
|
|
36445
|
-
(msg) => isTestMessage(msg)
|
|
36446
|
-
);
|
|
36447
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
36448
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
36449
|
-
logError(`No valid expected message found for eval case: ${id}`);
|
|
36450
|
-
continue;
|
|
36451
|
-
}
|
|
36934
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
36452
36935
|
const guidelinePaths = [];
|
|
36453
36936
|
const inputTextParts = [];
|
|
36454
36937
|
const inputSegments = await processMessages({
|
|
@@ -36487,33 +36970,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36487
36970
|
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
36488
36971
|
} catch (error40) {
|
|
36489
36972
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
36490
|
-
|
|
36973
|
+
logError2(`Skipping eval case '${id}': ${message}`);
|
|
36491
36974
|
continue;
|
|
36492
36975
|
}
|
|
36493
36976
|
const inlineRubrics = evalcase.rubrics;
|
|
36494
36977
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
36495
|
-
const
|
|
36496
|
-
|
|
36497
|
-
return {
|
|
36498
|
-
id: `rubric-${index + 1}`,
|
|
36499
|
-
description: rubric,
|
|
36500
|
-
weight: 1,
|
|
36501
|
-
required: true
|
|
36502
|
-
};
|
|
36503
|
-
}
|
|
36504
|
-
return {
|
|
36505
|
-
id: asString5(rubric.id) ?? `rubric-${index + 1}`,
|
|
36506
|
-
description: asString5(rubric.description) ?? "",
|
|
36507
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
36508
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
36509
|
-
};
|
|
36510
|
-
}).filter((r) => r.description.length > 0);
|
|
36511
|
-
if (rubricItems.length > 0) {
|
|
36512
|
-
const rubricEvaluator = {
|
|
36513
|
-
name: "rubric",
|
|
36514
|
-
type: "llm_judge",
|
|
36515
|
-
rubrics: rubricItems
|
|
36516
|
-
};
|
|
36978
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
36979
|
+
if (rubricEvaluator) {
|
|
36517
36980
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36518
36981
|
}
|
|
36519
36982
|
}
|
|
@@ -36524,7 +36987,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36524
36987
|
}
|
|
36525
36988
|
}
|
|
36526
36989
|
const allFilePaths = [
|
|
36527
|
-
...guidelinePaths.map((guidelinePath) =>
|
|
36990
|
+
...guidelinePaths.map((guidelinePath) => path72.resolve(guidelinePath)),
|
|
36528
36991
|
...userFilePaths
|
|
36529
36992
|
];
|
|
36530
36993
|
const testCase = {
|
|
@@ -36536,7 +36999,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36536
36999
|
input_segments: inputSegments,
|
|
36537
37000
|
expected_messages: outputSegments,
|
|
36538
37001
|
reference_answer: referenceAnswer,
|
|
36539
|
-
guideline_paths: guidelinePaths.map((guidelinePath) =>
|
|
37002
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path72.resolve(guidelinePath)),
|
|
36540
37003
|
guideline_patterns: guidelinePatterns,
|
|
36541
37004
|
file_paths: allFilePaths,
|
|
36542
37005
|
expected_outcome: outcome,
|
|
@@ -36559,25 +37022,25 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36559
37022
|
}
|
|
36560
37023
|
return results;
|
|
36561
37024
|
}
|
|
36562
|
-
function
|
|
37025
|
+
function asString6(value) {
|
|
36563
37026
|
return typeof value === "string" ? value : void 0;
|
|
36564
37027
|
}
|
|
36565
|
-
function
|
|
37028
|
+
function logWarning6(message, details) {
|
|
36566
37029
|
if (details && details.length > 0) {
|
|
36567
37030
|
const detailBlock = details.join("\n");
|
|
36568
|
-
console.warn(`${
|
|
36569
|
-
${detailBlock}${
|
|
37031
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}
|
|
37032
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
36570
37033
|
} else {
|
|
36571
|
-
console.warn(`${
|
|
37034
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
|
|
36572
37035
|
}
|
|
36573
37036
|
}
|
|
36574
|
-
function
|
|
37037
|
+
function logError2(message, details) {
|
|
36575
37038
|
if (details && details.length > 0) {
|
|
36576
37039
|
const detailBlock = details.join("\n");
|
|
36577
|
-
console.error(`${
|
|
36578
|
-
${detailBlock}${
|
|
37040
|
+
console.error(`${ANSI_RED2}Error: ${message}
|
|
37041
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
36579
37042
|
} else {
|
|
36580
|
-
console.error(`${
|
|
37043
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
|
|
36581
37044
|
}
|
|
36582
37045
|
}
|
|
36583
37046
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
@@ -36966,7 +37429,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
36966
37429
|
}
|
|
36967
37430
|
const deduped = /* @__PURE__ */ new Map();
|
|
36968
37431
|
for (const inputFile of inputFiles) {
|
|
36969
|
-
const absolutePath =
|
|
37432
|
+
const absolutePath = path82.resolve(inputFile);
|
|
36970
37433
|
if (!deduped.has(absolutePath)) {
|
|
36971
37434
|
deduped.set(absolutePath, absolutePath);
|
|
36972
37435
|
}
|
|
@@ -36979,14 +37442,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
36979
37442
|
}
|
|
36980
37443
|
const unique = /* @__PURE__ */ new Map();
|
|
36981
37444
|
for (const inputFile of inputFiles) {
|
|
36982
|
-
const absolutePath =
|
|
37445
|
+
const absolutePath = path82.resolve(inputFile);
|
|
36983
37446
|
if (overrides?.has(absolutePath)) {
|
|
36984
37447
|
if (!unique.has(absolutePath)) {
|
|
36985
37448
|
unique.set(absolutePath, absolutePath);
|
|
36986
37449
|
}
|
|
36987
37450
|
continue;
|
|
36988
37451
|
}
|
|
36989
|
-
const normalized = absolutePath.split(
|
|
37452
|
+
const normalized = absolutePath.split(path82.sep).join("/");
|
|
36990
37453
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
36991
37454
|
if (!unique.has(absolutePath)) {
|
|
36992
37455
|
unique.set(absolutePath, absolutePath);
|
|
@@ -37001,7 +37464,7 @@ function collectInputFiles(inputFiles) {
|
|
|
37001
37464
|
}
|
|
37002
37465
|
const unique = /* @__PURE__ */ new Map();
|
|
37003
37466
|
for (const inputFile of inputFiles) {
|
|
37004
|
-
const absolutePath =
|
|
37467
|
+
const absolutePath = path82.resolve(inputFile);
|
|
37005
37468
|
if (!unique.has(absolutePath)) {
|
|
37006
37469
|
unique.set(absolutePath, absolutePath);
|
|
37007
37470
|
}
|
|
@@ -37013,7 +37476,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
37013
37476
|
return "";
|
|
37014
37477
|
}
|
|
37015
37478
|
const buildList = (files) => files.map((absolutePath) => {
|
|
37016
|
-
const fileName =
|
|
37479
|
+
const fileName = path82.basename(absolutePath);
|
|
37017
37480
|
const fileUri = pathToFileUri2(absolutePath);
|
|
37018
37481
|
return `* [${fileName}](${fileUri})`;
|
|
37019
37482
|
});
|
|
@@ -37033,7 +37496,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
37033
37496
|
return sections.join("\n");
|
|
37034
37497
|
}
|
|
37035
37498
|
function pathToFileUri2(filePath) {
|
|
37036
|
-
const absolutePath =
|
|
37499
|
+
const absolutePath = path82.isAbsolute(filePath) ? filePath : path82.resolve(filePath);
|
|
37037
37500
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
37038
37501
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
37039
37502
|
return `file:///${normalizedPath}`;
|
|
@@ -37068,7 +37531,7 @@ var ClaudeCodeProvider = class {
|
|
|
37068
37531
|
const workspaceRoot = await this.createWorkspace();
|
|
37069
37532
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
37070
37533
|
try {
|
|
37071
|
-
const promptFile =
|
|
37534
|
+
const promptFile = path92.join(workspaceRoot, PROMPT_FILENAME);
|
|
37072
37535
|
await writeFile5(promptFile, request.question, "utf8");
|
|
37073
37536
|
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
37074
37537
|
const cwd = this.resolveCwd();
|
|
@@ -37116,7 +37579,7 @@ var ClaudeCodeProvider = class {
|
|
|
37116
37579
|
if (!this.config.cwd) {
|
|
37117
37580
|
return process.cwd();
|
|
37118
37581
|
}
|
|
37119
|
-
return
|
|
37582
|
+
return path92.resolve(this.config.cwd);
|
|
37120
37583
|
}
|
|
37121
37584
|
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
37122
37585
|
const args = [];
|
|
@@ -37173,7 +37636,7 @@ ${filesContext}`;
|
|
|
37173
37636
|
}
|
|
37174
37637
|
}
|
|
37175
37638
|
async createWorkspace() {
|
|
37176
|
-
return await mkdtemp(
|
|
37639
|
+
return await mkdtemp(path92.join(tmpdir(), WORKSPACE_PREFIX));
|
|
37177
37640
|
}
|
|
37178
37641
|
async cleanupWorkspace(workspaceRoot) {
|
|
37179
37642
|
try {
|
|
@@ -37187,9 +37650,9 @@ ${filesContext}`;
|
|
|
37187
37650
|
return void 0;
|
|
37188
37651
|
}
|
|
37189
37652
|
if (this.config.logDir) {
|
|
37190
|
-
return
|
|
37653
|
+
return path92.resolve(this.config.logDir);
|
|
37191
37654
|
}
|
|
37192
|
-
return
|
|
37655
|
+
return path92.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
37193
37656
|
}
|
|
37194
37657
|
async createStreamLogger(request) {
|
|
37195
37658
|
const logDir = this.resolveLogDirectory();
|
|
@@ -37203,7 +37666,7 @@ ${filesContext}`;
|
|
|
37203
37666
|
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
37204
37667
|
return void 0;
|
|
37205
37668
|
}
|
|
37206
|
-
const filePath =
|
|
37669
|
+
const filePath = path92.join(logDir, buildLogFilename(request, this.targetName));
|
|
37207
37670
|
try {
|
|
37208
37671
|
const logger = await ClaudeCodeStreamLogger.create({
|
|
37209
37672
|
filePath,
|
|
@@ -37608,10 +38071,10 @@ function escapeShellArg(arg) {
|
|
|
37608
38071
|
}
|
|
37609
38072
|
async function defaultClaudeCodeRunner(options) {
|
|
37610
38073
|
const tempId = randomUUID();
|
|
37611
|
-
const stdoutFile =
|
|
37612
|
-
const stderrFile =
|
|
37613
|
-
const exitFile =
|
|
37614
|
-
const pidFile =
|
|
38074
|
+
const stdoutFile = path92.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
|
|
38075
|
+
const stderrFile = path92.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
|
|
38076
|
+
const exitFile = path92.join(tmpdir(), `agentv-cc-${tempId}-exit`);
|
|
38077
|
+
const pidFile = path92.join(tmpdir(), `agentv-cc-${tempId}-pid`);
|
|
37615
38078
|
try {
|
|
37616
38079
|
return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
|
|
37617
38080
|
} finally {
|
|
@@ -37651,8 +38114,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
37651
38114
|
let lastStdoutSize = 0;
|
|
37652
38115
|
const readFileIfExists = async (filePath) => {
|
|
37653
38116
|
try {
|
|
37654
|
-
const { readFile:
|
|
37655
|
-
return await
|
|
38117
|
+
const { readFile: readFile82 } = await import("node:fs/promises");
|
|
38118
|
+
return await readFile82(filePath, "utf8");
|
|
37656
38119
|
} catch {
|
|
37657
38120
|
return "";
|
|
37658
38121
|
}
|
|
@@ -37727,7 +38190,8 @@ var ToolCallSchema = external_exports.object({
|
|
|
37727
38190
|
input: external_exports.unknown().optional(),
|
|
37728
38191
|
output: external_exports.unknown().optional(),
|
|
37729
38192
|
id: external_exports.string().optional(),
|
|
37730
|
-
timestamp: external_exports.string().optional()
|
|
38193
|
+
timestamp: external_exports.string().optional(),
|
|
38194
|
+
duration_ms: external_exports.number().optional()
|
|
37731
38195
|
});
|
|
37732
38196
|
var OutputMessageInputSchema = external_exports.object({
|
|
37733
38197
|
role: external_exports.string(),
|
|
@@ -37735,6 +38199,7 @@ var OutputMessageInputSchema = external_exports.object({
|
|
|
37735
38199
|
content: external_exports.unknown().optional(),
|
|
37736
38200
|
tool_calls: external_exports.array(ToolCallSchema).optional(),
|
|
37737
38201
|
timestamp: external_exports.string().optional(),
|
|
38202
|
+
duration_ms: external_exports.number().optional(),
|
|
37738
38203
|
metadata: external_exports.record(external_exports.unknown()).optional()
|
|
37739
38204
|
});
|
|
37740
38205
|
var TokenUsageSchema = external_exports.object({
|
|
@@ -37773,8 +38238,16 @@ function convertOutputMessages(messages) {
|
|
|
37773
38238
|
role: msg.role,
|
|
37774
38239
|
name: msg.name,
|
|
37775
38240
|
content: msg.content,
|
|
37776
|
-
toolCalls: msg.tool_calls
|
|
38241
|
+
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
38242
|
+
tool: tc.tool,
|
|
38243
|
+
input: tc.input,
|
|
38244
|
+
output: tc.output,
|
|
38245
|
+
id: tc.id,
|
|
38246
|
+
timestamp: tc.timestamp,
|
|
38247
|
+
durationMs: tc.duration_ms
|
|
38248
|
+
})),
|
|
37777
38249
|
timestamp: msg.timestamp,
|
|
38250
|
+
durationMs: msg.duration_ms,
|
|
37778
38251
|
metadata: msg.metadata
|
|
37779
38252
|
}));
|
|
37780
38253
|
}
|
|
@@ -38176,7 +38649,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
38176
38649
|
}
|
|
38177
38650
|
const unique = /* @__PURE__ */ new Map();
|
|
38178
38651
|
for (const inputFile of inputFiles) {
|
|
38179
|
-
const absolutePath =
|
|
38652
|
+
const absolutePath = path102.resolve(inputFile);
|
|
38180
38653
|
if (!unique.has(absolutePath)) {
|
|
38181
38654
|
unique.set(absolutePath, absolutePath);
|
|
38182
38655
|
}
|
|
@@ -38190,7 +38663,7 @@ function formatFileList(files, template) {
|
|
|
38190
38663
|
const formatter = template ?? "{path}";
|
|
38191
38664
|
return files.map((filePath) => {
|
|
38192
38665
|
const escapedPath = shellEscape(filePath);
|
|
38193
|
-
const escapedName = shellEscape(
|
|
38666
|
+
const escapedName = shellEscape(path102.basename(filePath));
|
|
38194
38667
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
38195
38668
|
}).join(" ");
|
|
38196
38669
|
}
|
|
@@ -38214,7 +38687,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
38214
38687
|
const safeEvalId = evalCaseId || "unknown";
|
|
38215
38688
|
const timestamp = Date.now();
|
|
38216
38689
|
const random = Math.random().toString(36).substring(2, 9);
|
|
38217
|
-
return
|
|
38690
|
+
return path102.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
38218
38691
|
}
|
|
38219
38692
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
38220
38693
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -38305,7 +38778,7 @@ var CodexProvider = class {
|
|
|
38305
38778
|
const promptContent = `${systemPrompt}
|
|
38306
38779
|
|
|
38307
38780
|
${basePrompt}`;
|
|
38308
|
-
const promptFile =
|
|
38781
|
+
const promptFile = path112.join(workspaceRoot, PROMPT_FILENAME2);
|
|
38309
38782
|
await writeFile22(promptFile, promptContent, "utf8");
|
|
38310
38783
|
const args = this.buildCodexArgs();
|
|
38311
38784
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -38355,7 +38828,7 @@ ${basePrompt}`;
|
|
|
38355
38828
|
if (!this.config.cwd) {
|
|
38356
38829
|
return workspaceRoot;
|
|
38357
38830
|
}
|
|
38358
|
-
return
|
|
38831
|
+
return path112.resolve(this.config.cwd);
|
|
38359
38832
|
}
|
|
38360
38833
|
buildCodexArgs() {
|
|
38361
38834
|
const args = [
|
|
@@ -38397,7 +38870,7 @@ ${basePrompt}`;
|
|
|
38397
38870
|
}
|
|
38398
38871
|
}
|
|
38399
38872
|
async createWorkspace() {
|
|
38400
|
-
return await mkdtemp2(
|
|
38873
|
+
return await mkdtemp2(path112.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
38401
38874
|
}
|
|
38402
38875
|
async cleanupWorkspace(workspaceRoot) {
|
|
38403
38876
|
try {
|
|
@@ -38411,9 +38884,9 @@ ${basePrompt}`;
|
|
|
38411
38884
|
return void 0;
|
|
38412
38885
|
}
|
|
38413
38886
|
if (this.config.logDir) {
|
|
38414
|
-
return
|
|
38887
|
+
return path112.resolve(this.config.logDir);
|
|
38415
38888
|
}
|
|
38416
|
-
return
|
|
38889
|
+
return path112.join(process.cwd(), ".agentv", "logs", "codex");
|
|
38417
38890
|
}
|
|
38418
38891
|
async createStreamLogger(request) {
|
|
38419
38892
|
const logDir = this.resolveLogDirectory();
|
|
@@ -38427,7 +38900,7 @@ ${basePrompt}`;
|
|
|
38427
38900
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
38428
38901
|
return void 0;
|
|
38429
38902
|
}
|
|
38430
|
-
const filePath =
|
|
38903
|
+
const filePath = path112.join(logDir, buildLogFilename2(request, this.targetName));
|
|
38431
38904
|
try {
|
|
38432
38905
|
const logger = await CodexStreamLogger.create({
|
|
38433
38906
|
filePath,
|
|
@@ -38642,7 +39115,7 @@ function tryParseJsonValue2(rawLine) {
|
|
|
38642
39115
|
async function locateExecutable(candidate) {
|
|
38643
39116
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
38644
39117
|
if (includesPathSeparator) {
|
|
38645
|
-
const resolved =
|
|
39118
|
+
const resolved = path112.isAbsolute(candidate) ? candidate : path112.resolve(candidate);
|
|
38646
39119
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
38647
39120
|
await access22(executablePath, constants22.F_OK);
|
|
38648
39121
|
return executablePath;
|
|
@@ -39216,7 +39689,7 @@ var PiCodingAgentProvider = class {
|
|
|
39216
39689
|
const workspaceRoot = await this.createWorkspace();
|
|
39217
39690
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
39218
39691
|
try {
|
|
39219
|
-
const promptFile =
|
|
39692
|
+
const promptFile = path122.join(workspaceRoot, PROMPT_FILENAME3);
|
|
39220
39693
|
await writeFile32(promptFile, request.question, "utf8");
|
|
39221
39694
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
39222
39695
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -39258,7 +39731,7 @@ var PiCodingAgentProvider = class {
|
|
|
39258
39731
|
if (!this.config.cwd) {
|
|
39259
39732
|
return workspaceRoot;
|
|
39260
39733
|
}
|
|
39261
|
-
return
|
|
39734
|
+
return path122.resolve(this.config.cwd);
|
|
39262
39735
|
}
|
|
39263
39736
|
buildPiArgs(prompt, inputFiles) {
|
|
39264
39737
|
const args = [];
|
|
@@ -39347,7 +39820,7 @@ ${prompt}`;
|
|
|
39347
39820
|
return env;
|
|
39348
39821
|
}
|
|
39349
39822
|
async createWorkspace() {
|
|
39350
|
-
return await mkdtemp3(
|
|
39823
|
+
return await mkdtemp3(path122.join(tmpdir3(), WORKSPACE_PREFIX3));
|
|
39351
39824
|
}
|
|
39352
39825
|
async cleanupWorkspace(workspaceRoot) {
|
|
39353
39826
|
try {
|
|
@@ -39357,9 +39830,9 @@ ${prompt}`;
|
|
|
39357
39830
|
}
|
|
39358
39831
|
resolveLogDirectory() {
|
|
39359
39832
|
if (this.config.logDir) {
|
|
39360
|
-
return
|
|
39833
|
+
return path122.resolve(this.config.logDir);
|
|
39361
39834
|
}
|
|
39362
|
-
return
|
|
39835
|
+
return path122.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
39363
39836
|
}
|
|
39364
39837
|
async createStreamLogger(request) {
|
|
39365
39838
|
const logDir = this.resolveLogDirectory();
|
|
@@ -39373,7 +39846,7 @@ ${prompt}`;
|
|
|
39373
39846
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
39374
39847
|
return void 0;
|
|
39375
39848
|
}
|
|
39376
|
-
const filePath =
|
|
39849
|
+
const filePath = path122.join(logDir, buildLogFilename3(request, this.targetName));
|
|
39377
39850
|
try {
|
|
39378
39851
|
const logger = await PiStreamLogger.create({
|
|
39379
39852
|
filePath,
|
|
@@ -39968,7 +40441,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
39968
40441
|
return "";
|
|
39969
40442
|
}
|
|
39970
40443
|
const buildList = (files) => files.map((absolutePath) => {
|
|
39971
|
-
const fileName =
|
|
40444
|
+
const fileName = path132.basename(absolutePath);
|
|
39972
40445
|
const fileUri = pathToFileUri22(absolutePath);
|
|
39973
40446
|
return `* [${fileName}](${fileUri})`;
|
|
39974
40447
|
});
|
|
@@ -39993,8 +40466,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
39993
40466
|
}
|
|
39994
40467
|
const unique = /* @__PURE__ */ new Map();
|
|
39995
40468
|
for (const attachment of attachments) {
|
|
39996
|
-
const absolutePath =
|
|
39997
|
-
const normalized = absolutePath.split(
|
|
40469
|
+
const absolutePath = path132.resolve(attachment);
|
|
40470
|
+
const normalized = absolutePath.split(path132.sep).join("/");
|
|
39998
40471
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
39999
40472
|
if (!unique.has(absolutePath)) {
|
|
40000
40473
|
unique.set(absolutePath, absolutePath);
|
|
@@ -40009,7 +40482,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
40009
40482
|
}
|
|
40010
40483
|
const unique = /* @__PURE__ */ new Map();
|
|
40011
40484
|
for (const attachment of attachments) {
|
|
40012
|
-
const absolutePath =
|
|
40485
|
+
const absolutePath = path132.resolve(attachment);
|
|
40013
40486
|
if (!unique.has(absolutePath)) {
|
|
40014
40487
|
unique.set(absolutePath, absolutePath);
|
|
40015
40488
|
}
|
|
@@ -40017,7 +40490,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
40017
40490
|
return Array.from(unique.values());
|
|
40018
40491
|
}
|
|
40019
40492
|
function pathToFileUri22(filePath) {
|
|
40020
|
-
const absolutePath =
|
|
40493
|
+
const absolutePath = path132.isAbsolute(filePath) ? filePath : path132.resolve(filePath);
|
|
40021
40494
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
40022
40495
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
40023
40496
|
return `file:///${normalizedPath}`;
|
|
@@ -40030,7 +40503,7 @@ function normalizeAttachments(attachments) {
|
|
|
40030
40503
|
}
|
|
40031
40504
|
const deduped = /* @__PURE__ */ new Set();
|
|
40032
40505
|
for (const attachment of attachments) {
|
|
40033
|
-
deduped.add(
|
|
40506
|
+
deduped.add(path132.resolve(attachment));
|
|
40034
40507
|
}
|
|
40035
40508
|
return Array.from(deduped);
|
|
40036
40509
|
}
|
|
@@ -40039,7 +40512,7 @@ function mergeAttachments(all) {
|
|
|
40039
40512
|
for (const list of all) {
|
|
40040
40513
|
if (!list) continue;
|
|
40041
40514
|
for (const inputFile of list) {
|
|
40042
|
-
deduped.add(
|
|
40515
|
+
deduped.add(path132.resolve(inputFile));
|
|
40043
40516
|
}
|
|
40044
40517
|
}
|
|
40045
40518
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -40119,11 +40592,11 @@ async function fileExists3(filePath) {
|
|
|
40119
40592
|
}
|
|
40120
40593
|
}
|
|
40121
40594
|
async function readTargetDefinitions(filePath) {
|
|
40122
|
-
const absolutePath =
|
|
40595
|
+
const absolutePath = path14.resolve(filePath);
|
|
40123
40596
|
if (!await fileExists3(absolutePath)) {
|
|
40124
40597
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
40125
40598
|
}
|
|
40126
|
-
const raw = await
|
|
40599
|
+
const raw = await readFile7(absolutePath, "utf8");
|
|
40127
40600
|
const parsed = parse32(raw);
|
|
40128
40601
|
if (!isRecord(parsed)) {
|
|
40129
40602
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -40320,15 +40793,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
40320
40793
|
});
|
|
40321
40794
|
}
|
|
40322
40795
|
async function execShellWithStdin(command7, stdinPayload, options = {}) {
|
|
40323
|
-
const { mkdir: mkdir42, readFile:
|
|
40796
|
+
const { mkdir: mkdir42, readFile: readFile82, rm: rm4, writeFile: writeFile42 } = await import("node:fs/promises");
|
|
40324
40797
|
const { tmpdir: tmpdir4 } = await import("node:os");
|
|
40325
|
-
const
|
|
40798
|
+
const path162 = await import("node:path");
|
|
40326
40799
|
const { randomUUID: randomUUID4 } = await import("node:crypto");
|
|
40327
|
-
const dir =
|
|
40800
|
+
const dir = path162.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
40328
40801
|
await mkdir42(dir, { recursive: true });
|
|
40329
|
-
const stdinPath =
|
|
40330
|
-
const stdoutPath =
|
|
40331
|
-
const stderrPath =
|
|
40802
|
+
const stdinPath = path162.join(dir, "stdin.txt");
|
|
40803
|
+
const stdoutPath = path162.join(dir, "stdout.txt");
|
|
40804
|
+
const stderrPath = path162.join(dir, "stderr.txt");
|
|
40332
40805
|
await writeFile42(stdinPath, stdinPayload, "utf8");
|
|
40333
40806
|
const wrappedCommand = process.platform === "win32" ? `(${command7}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command7}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
40334
40807
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
@@ -40358,8 +40831,8 @@ async function execShellWithStdin(command7, stdinPayload, options = {}) {
|
|
|
40358
40831
|
resolve2(code ?? 0);
|
|
40359
40832
|
});
|
|
40360
40833
|
});
|
|
40361
|
-
const stdout = (await
|
|
40362
|
-
const stderr = (await
|
|
40834
|
+
const stdout = (await readFile82(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
40835
|
+
const stderr = (await readFile82(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
40363
40836
|
return { stdout, stderr, exitCode };
|
|
40364
40837
|
} finally {
|
|
40365
40838
|
await rm4(dir, { recursive: true, force: true });
|
|
@@ -40623,7 +41096,7 @@ var CodeEvaluator = class {
|
|
|
40623
41096
|
outputMessages: context.outputMessages ?? null,
|
|
40624
41097
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
40625
41098
|
inputFiles: context.evalCase.file_paths.filter(
|
|
40626
|
-
(
|
|
41099
|
+
(path162) => !context.evalCase.guideline_paths.includes(path162)
|
|
40627
41100
|
),
|
|
40628
41101
|
inputMessages: context.evalCase.input_messages,
|
|
40629
41102
|
traceSummary: context.traceSummary ?? null,
|
|
@@ -40764,6 +41237,15 @@ var rubricEvaluationSchema = external_exports.object({
|
|
|
40764
41237
|
checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
40765
41238
|
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
|
|
40766
41239
|
});
|
|
41240
|
+
var scoreRangeCheckResultSchema = external_exports.object({
|
|
41241
|
+
id: external_exports.string().describe("The ID of the rubric criterion being scored"),
|
|
41242
|
+
score: external_exports.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
|
|
41243
|
+
reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this score").optional()
|
|
41244
|
+
});
|
|
41245
|
+
var scoreRangeEvaluationSchema = external_exports.object({
|
|
41246
|
+
checks: external_exports.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
41247
|
+
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
41248
|
+
});
|
|
40767
41249
|
var LlmJudgeEvaluator = class {
|
|
40768
41250
|
kind = "llm_judge";
|
|
40769
41251
|
resolveJudgeProvider;
|
|
@@ -40849,6 +41331,10 @@ var LlmJudgeEvaluator = class {
|
|
|
40849
41331
|
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
40850
41332
|
);
|
|
40851
41333
|
}
|
|
41334
|
+
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
41335
|
+
if (hasScoreRanges) {
|
|
41336
|
+
return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
|
|
41337
|
+
}
|
|
40852
41338
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
40853
41339
|
const systemPrompt = buildRubricOutputSchema();
|
|
40854
41340
|
const evaluatorRawRequest = {
|
|
@@ -40874,6 +41360,84 @@ var LlmJudgeEvaluator = class {
|
|
|
40874
41360
|
evaluatorRawRequest
|
|
40875
41361
|
};
|
|
40876
41362
|
}
|
|
41363
|
+
/**
|
|
41364
|
+
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
41365
|
+
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
41366
|
+
*/
|
|
41367
|
+
async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
|
|
41368
|
+
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
41369
|
+
const systemPrompt = buildScoreRangeOutputSchema();
|
|
41370
|
+
const evaluatorRawRequest = {
|
|
41371
|
+
userPrompt: prompt,
|
|
41372
|
+
systemPrompt,
|
|
41373
|
+
target: judgeProvider.targetName
|
|
41374
|
+
};
|
|
41375
|
+
const { data } = await this.runWithRetry({
|
|
41376
|
+
context,
|
|
41377
|
+
judgeProvider,
|
|
41378
|
+
systemPrompt,
|
|
41379
|
+
userPrompt: prompt,
|
|
41380
|
+
schema: scoreRangeEvaluationSchema
|
|
41381
|
+
});
|
|
41382
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
41383
|
+
return {
|
|
41384
|
+
score,
|
|
41385
|
+
verdict,
|
|
41386
|
+
hits,
|
|
41387
|
+
misses,
|
|
41388
|
+
expectedAspectCount: rubrics.length,
|
|
41389
|
+
reasoning: data.overall_reasoning,
|
|
41390
|
+
evaluatorRawRequest,
|
|
41391
|
+
details
|
|
41392
|
+
};
|
|
41393
|
+
}
|
|
41394
|
+
/**
|
|
41395
|
+
* Build prompt for score-range rubric evaluation.
|
|
41396
|
+
*/
|
|
41397
|
+
buildScoreRangePrompt(context, rubrics) {
|
|
41398
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
41399
|
+
const parts = [
|
|
41400
|
+
"You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
41401
|
+
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
41402
|
+
"",
|
|
41403
|
+
"[[ ## question ## ]]",
|
|
41404
|
+
formattedQuestion,
|
|
41405
|
+
"",
|
|
41406
|
+
"[[ ## expected_outcome ## ]]",
|
|
41407
|
+
context.evalCase.expected_outcome,
|
|
41408
|
+
""
|
|
41409
|
+
];
|
|
41410
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
41411
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
41412
|
+
}
|
|
41413
|
+
parts.push(
|
|
41414
|
+
"[[ ## candidate_answer ## ]]",
|
|
41415
|
+
context.candidate,
|
|
41416
|
+
"",
|
|
41417
|
+
"[[ ## scoring_criteria ## ]]"
|
|
41418
|
+
);
|
|
41419
|
+
for (const rubric of rubrics) {
|
|
41420
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
41421
|
+
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
41422
|
+
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
41423
|
+
if (rubric.expected_outcome) {
|
|
41424
|
+
parts.push(`Description: ${rubric.expected_outcome}`);
|
|
41425
|
+
}
|
|
41426
|
+
if (rubric.score_ranges && rubric.score_ranges.length > 0) {
|
|
41427
|
+
parts.push("Score ranges:");
|
|
41428
|
+
for (const range of rubric.score_ranges) {
|
|
41429
|
+
const [min, max] = range.score_range;
|
|
41430
|
+
const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
|
|
41431
|
+
parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
|
|
41432
|
+
}
|
|
41433
|
+
}
|
|
41434
|
+
}
|
|
41435
|
+
parts.push(
|
|
41436
|
+
"",
|
|
41437
|
+
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
41438
|
+
);
|
|
41439
|
+
return parts.join("\n");
|
|
41440
|
+
}
|
|
40877
41441
|
buildRubricPrompt(context, rubrics) {
|
|
40878
41442
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
40879
41443
|
const parts = [
|
|
@@ -40893,7 +41457,7 @@ var LlmJudgeEvaluator = class {
|
|
|
40893
41457
|
for (const rubric of rubrics) {
|
|
40894
41458
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
40895
41459
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
40896
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.
|
|
41460
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
|
|
40897
41461
|
}
|
|
40898
41462
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
40899
41463
|
return parts.join("\n");
|
|
@@ -40980,9 +41544,9 @@ function calculateRubricScore(result, rubrics) {
|
|
|
40980
41544
|
totalWeight += rubric.weight;
|
|
40981
41545
|
if (check2.satisfied) {
|
|
40982
41546
|
earnedWeight += rubric.weight;
|
|
40983
|
-
hits.push(`[${rubric.id}] ${rubric.
|
|
41547
|
+
hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check2.reasoning}`);
|
|
40984
41548
|
} else {
|
|
40985
|
-
misses.push(`[${rubric.id}] ${rubric.
|
|
41549
|
+
misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check2.reasoning}`);
|
|
40986
41550
|
if (rubric.required) {
|
|
40987
41551
|
failedRequired = true;
|
|
40988
41552
|
}
|
|
@@ -40992,6 +41556,76 @@ function calculateRubricScore(result, rubrics) {
|
|
|
40992
41556
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
40993
41557
|
return { score, verdict, hits, misses };
|
|
40994
41558
|
}
|
|
41559
|
+
function buildScoreRangeOutputSchema() {
|
|
41560
|
+
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
41561
|
+
You must return a valid JSON object matching this schema:
|
|
41562
|
+
{
|
|
41563
|
+
"checks": [
|
|
41564
|
+
{
|
|
41565
|
+
"id": "string (criterion id)",
|
|
41566
|
+
"score": integer (0-10),
|
|
41567
|
+
"reasoning": "string (brief explanation for score)"
|
|
41568
|
+
}
|
|
41569
|
+
],
|
|
41570
|
+
"overall_reasoning": "string (summary, optional)"
|
|
41571
|
+
}
|
|
41572
|
+
|
|
41573
|
+
Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
|
|
41574
|
+
}
|
|
41575
|
+
function calculateScoreRangeResult(result, rubrics) {
|
|
41576
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
41577
|
+
const hits = [];
|
|
41578
|
+
const misses = [];
|
|
41579
|
+
const rawScores = {};
|
|
41580
|
+
let totalWeight = 0;
|
|
41581
|
+
let weightedScoreSum = 0;
|
|
41582
|
+
let failedRequired = false;
|
|
41583
|
+
for (const check2 of result.checks) {
|
|
41584
|
+
const rubric = rubricMap.get(check2.id);
|
|
41585
|
+
if (!rubric) {
|
|
41586
|
+
continue;
|
|
41587
|
+
}
|
|
41588
|
+
const rawScore = Math.max(0, Math.min(10, check2.score));
|
|
41589
|
+
const normalizedScore = rawScore / 10;
|
|
41590
|
+
rawScores[rubric.id] = rawScore;
|
|
41591
|
+
totalWeight += rubric.weight;
|
|
41592
|
+
weightedScoreSum += normalizedScore * rubric.weight;
|
|
41593
|
+
let requiredMinScore;
|
|
41594
|
+
if (rubric.required_min_score !== void 0) {
|
|
41595
|
+
requiredMinScore = rubric.required_min_score;
|
|
41596
|
+
} else if (rubric.required === true) {
|
|
41597
|
+
requiredMinScore = 10;
|
|
41598
|
+
}
|
|
41599
|
+
const matchingRange = rubric.score_ranges?.find(
|
|
41600
|
+
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
41601
|
+
);
|
|
41602
|
+
const rangeDescription = matchingRange?.expected_outcome ?? "";
|
|
41603
|
+
const criterionLabel = rubric.expected_outcome ?? rubric.id;
|
|
41604
|
+
const reasoningText = check2.reasoning ? `: ${check2.reasoning}` : "";
|
|
41605
|
+
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
41606
|
+
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
41607
|
+
failedRequired = true;
|
|
41608
|
+
misses.push(scoreInfo);
|
|
41609
|
+
} else if (rawScore >= 7) {
|
|
41610
|
+
hits.push(scoreInfo);
|
|
41611
|
+
} else {
|
|
41612
|
+
misses.push(scoreInfo);
|
|
41613
|
+
}
|
|
41614
|
+
}
|
|
41615
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
41616
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
41617
|
+
return {
|
|
41618
|
+
score,
|
|
41619
|
+
verdict,
|
|
41620
|
+
hits,
|
|
41621
|
+
misses,
|
|
41622
|
+
details: {
|
|
41623
|
+
raw_scores: rawScores,
|
|
41624
|
+
normalization: "score / 10",
|
|
41625
|
+
aggregation: "weighted_average"
|
|
41626
|
+
}
|
|
41627
|
+
};
|
|
41628
|
+
}
|
|
40995
41629
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
40996
41630
|
{{EVALUATOR_RESULTS_JSON}}
|
|
40997
41631
|
|
|
@@ -41369,115 +42003,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
41369
42003
|
* Evaluate a single field against the expected value.
|
|
41370
42004
|
*/
|
|
41371
42005
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
41372
|
-
const { path:
|
|
41373
|
-
const candidateValue = resolvePath(candidateData,
|
|
41374
|
-
const expectedValue = resolvePath(expectedData,
|
|
42006
|
+
const { path: path162, match, required: required2 = true, weight = 1 } = fieldConfig;
|
|
42007
|
+
const candidateValue = resolvePath(candidateData, path162);
|
|
42008
|
+
const expectedValue = resolvePath(expectedData, path162);
|
|
41375
42009
|
if (expectedValue === void 0) {
|
|
41376
42010
|
return {
|
|
41377
|
-
path:
|
|
42011
|
+
path: path162,
|
|
41378
42012
|
score: 1,
|
|
41379
42013
|
// No expected value means no comparison needed
|
|
41380
42014
|
weight,
|
|
41381
42015
|
hit: true,
|
|
41382
|
-
message: `${
|
|
42016
|
+
message: `${path162}: no expected value`
|
|
41383
42017
|
};
|
|
41384
42018
|
}
|
|
41385
42019
|
if (candidateValue === void 0) {
|
|
41386
42020
|
if (required2) {
|
|
41387
42021
|
return {
|
|
41388
|
-
path:
|
|
42022
|
+
path: path162,
|
|
41389
42023
|
score: 0,
|
|
41390
42024
|
weight,
|
|
41391
42025
|
hit: false,
|
|
41392
|
-
message: `${
|
|
42026
|
+
message: `${path162} (required, missing)`
|
|
41393
42027
|
};
|
|
41394
42028
|
}
|
|
41395
42029
|
return {
|
|
41396
|
-
path:
|
|
42030
|
+
path: path162,
|
|
41397
42031
|
score: 1,
|
|
41398
42032
|
// Don't penalize missing optional fields
|
|
41399
42033
|
weight: 0,
|
|
41400
42034
|
// Zero weight means it won't affect the score
|
|
41401
42035
|
hit: true,
|
|
41402
|
-
message: `${
|
|
42036
|
+
message: `${path162}: optional field missing`
|
|
41403
42037
|
};
|
|
41404
42038
|
}
|
|
41405
42039
|
switch (match) {
|
|
41406
42040
|
case "exact":
|
|
41407
|
-
return this.compareExact(
|
|
42041
|
+
return this.compareExact(path162, candidateValue, expectedValue, weight);
|
|
41408
42042
|
case "numeric_tolerance":
|
|
41409
42043
|
return this.compareNumericTolerance(
|
|
41410
|
-
|
|
42044
|
+
path162,
|
|
41411
42045
|
candidateValue,
|
|
41412
42046
|
expectedValue,
|
|
41413
42047
|
fieldConfig,
|
|
41414
42048
|
weight
|
|
41415
42049
|
);
|
|
41416
42050
|
case "date":
|
|
41417
|
-
return this.compareDate(
|
|
42051
|
+
return this.compareDate(path162, candidateValue, expectedValue, fieldConfig, weight);
|
|
41418
42052
|
default:
|
|
41419
42053
|
return {
|
|
41420
|
-
path:
|
|
42054
|
+
path: path162,
|
|
41421
42055
|
score: 0,
|
|
41422
42056
|
weight,
|
|
41423
42057
|
hit: false,
|
|
41424
|
-
message: `${
|
|
42058
|
+
message: `${path162}: unknown match type "${match}"`
|
|
41425
42059
|
};
|
|
41426
42060
|
}
|
|
41427
42061
|
}
|
|
41428
42062
|
/**
|
|
41429
42063
|
* Exact equality comparison.
|
|
41430
42064
|
*/
|
|
41431
|
-
compareExact(
|
|
42065
|
+
compareExact(path162, candidateValue, expectedValue, weight) {
|
|
41432
42066
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
41433
42067
|
return {
|
|
41434
|
-
path:
|
|
42068
|
+
path: path162,
|
|
41435
42069
|
score: 1,
|
|
41436
42070
|
weight,
|
|
41437
42071
|
hit: true,
|
|
41438
|
-
message:
|
|
42072
|
+
message: path162
|
|
41439
42073
|
};
|
|
41440
42074
|
}
|
|
41441
42075
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
41442
42076
|
return {
|
|
41443
|
-
path:
|
|
42077
|
+
path: path162,
|
|
41444
42078
|
score: 0,
|
|
41445
42079
|
weight,
|
|
41446
42080
|
hit: false,
|
|
41447
|
-
message: `${
|
|
42081
|
+
message: `${path162} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
41448
42082
|
};
|
|
41449
42083
|
}
|
|
41450
42084
|
return {
|
|
41451
|
-
path:
|
|
42085
|
+
path: path162,
|
|
41452
42086
|
score: 0,
|
|
41453
42087
|
weight,
|
|
41454
42088
|
hit: false,
|
|
41455
|
-
message: `${
|
|
42089
|
+
message: `${path162} (value mismatch)`
|
|
41456
42090
|
};
|
|
41457
42091
|
}
|
|
41458
42092
|
/**
|
|
41459
42093
|
* Numeric comparison with absolute or relative tolerance.
|
|
41460
42094
|
*/
|
|
41461
|
-
compareNumericTolerance(
|
|
42095
|
+
compareNumericTolerance(path162, candidateValue, expectedValue, fieldConfig, weight) {
|
|
41462
42096
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
41463
42097
|
const candidateNum = toNumber(candidateValue);
|
|
41464
42098
|
const expectedNum = toNumber(expectedValue);
|
|
41465
42099
|
if (candidateNum === null || expectedNum === null) {
|
|
41466
42100
|
return {
|
|
41467
|
-
path:
|
|
42101
|
+
path: path162,
|
|
41468
42102
|
score: 0,
|
|
41469
42103
|
weight,
|
|
41470
42104
|
hit: false,
|
|
41471
|
-
message: `${
|
|
42105
|
+
message: `${path162} (non-numeric value)`
|
|
41472
42106
|
};
|
|
41473
42107
|
}
|
|
41474
42108
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
41475
42109
|
return {
|
|
41476
|
-
path:
|
|
42110
|
+
path: path162,
|
|
41477
42111
|
score: 0,
|
|
41478
42112
|
weight,
|
|
41479
42113
|
hit: false,
|
|
41480
|
-
message: `${
|
|
42114
|
+
message: `${path162} (invalid numeric value)`
|
|
41481
42115
|
};
|
|
41482
42116
|
}
|
|
41483
42117
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -41490,61 +42124,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
41490
42124
|
}
|
|
41491
42125
|
if (withinTolerance) {
|
|
41492
42126
|
return {
|
|
41493
|
-
path:
|
|
42127
|
+
path: path162,
|
|
41494
42128
|
score: 1,
|
|
41495
42129
|
weight,
|
|
41496
42130
|
hit: true,
|
|
41497
|
-
message: `${
|
|
42131
|
+
message: `${path162} (within tolerance: diff=${diff.toFixed(2)})`
|
|
41498
42132
|
};
|
|
41499
42133
|
}
|
|
41500
42134
|
return {
|
|
41501
|
-
path:
|
|
42135
|
+
path: path162,
|
|
41502
42136
|
score: 0,
|
|
41503
42137
|
weight,
|
|
41504
42138
|
hit: false,
|
|
41505
|
-
message: `${
|
|
42139
|
+
message: `${path162} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
41506
42140
|
};
|
|
41507
42141
|
}
|
|
41508
42142
|
/**
|
|
41509
42143
|
* Date comparison with format normalization.
|
|
41510
42144
|
*/
|
|
41511
|
-
compareDate(
|
|
42145
|
+
compareDate(path162, candidateValue, expectedValue, fieldConfig, weight) {
|
|
41512
42146
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
41513
42147
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
41514
42148
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
41515
42149
|
if (candidateDate === null) {
|
|
41516
42150
|
return {
|
|
41517
|
-
path:
|
|
42151
|
+
path: path162,
|
|
41518
42152
|
score: 0,
|
|
41519
42153
|
weight,
|
|
41520
42154
|
hit: false,
|
|
41521
|
-
message: `${
|
|
42155
|
+
message: `${path162} (unparseable candidate date)`
|
|
41522
42156
|
};
|
|
41523
42157
|
}
|
|
41524
42158
|
if (expectedDate === null) {
|
|
41525
42159
|
return {
|
|
41526
|
-
path:
|
|
42160
|
+
path: path162,
|
|
41527
42161
|
score: 0,
|
|
41528
42162
|
weight,
|
|
41529
42163
|
hit: false,
|
|
41530
|
-
message: `${
|
|
42164
|
+
message: `${path162} (unparseable expected date)`
|
|
41531
42165
|
};
|
|
41532
42166
|
}
|
|
41533
42167
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
41534
42168
|
return {
|
|
41535
|
-
path:
|
|
42169
|
+
path: path162,
|
|
41536
42170
|
score: 1,
|
|
41537
42171
|
weight,
|
|
41538
42172
|
hit: true,
|
|
41539
|
-
message:
|
|
42173
|
+
message: path162
|
|
41540
42174
|
};
|
|
41541
42175
|
}
|
|
41542
42176
|
return {
|
|
41543
|
-
path:
|
|
42177
|
+
path: path162,
|
|
41544
42178
|
score: 0,
|
|
41545
42179
|
weight,
|
|
41546
42180
|
hit: false,
|
|
41547
|
-
message: `${
|
|
42181
|
+
message: `${path162} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
41548
42182
|
};
|
|
41549
42183
|
}
|
|
41550
42184
|
/**
|
|
@@ -41584,11 +42218,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
41584
42218
|
};
|
|
41585
42219
|
}
|
|
41586
42220
|
};
|
|
41587
|
-
function resolvePath(obj,
|
|
41588
|
-
if (!
|
|
42221
|
+
function resolvePath(obj, path162) {
|
|
42222
|
+
if (!path162 || !obj) {
|
|
41589
42223
|
return void 0;
|
|
41590
42224
|
}
|
|
41591
|
-
const parts =
|
|
42225
|
+
const parts = path162.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
41592
42226
|
let current = obj;
|
|
41593
42227
|
for (const part of parts) {
|
|
41594
42228
|
if (current === null || current === void 0) {
|
|
@@ -41807,6 +42441,27 @@ function argsMatch(expected, actual) {
|
|
|
41807
42441
|
}
|
|
41808
42442
|
return true;
|
|
41809
42443
|
}
|
|
42444
|
+
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
42445
|
+
if (maxDurationMs === void 0) {
|
|
42446
|
+
return { status: "skip", message: "" };
|
|
42447
|
+
}
|
|
42448
|
+
if (actualDurationMs === void 0) {
|
|
42449
|
+
return {
|
|
42450
|
+
status: "skip",
|
|
42451
|
+
message: `No duration data for ${toolName}; latency assertion skipped`
|
|
42452
|
+
};
|
|
42453
|
+
}
|
|
42454
|
+
if (actualDurationMs <= maxDurationMs) {
|
|
42455
|
+
return {
|
|
42456
|
+
status: "pass",
|
|
42457
|
+
message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
42458
|
+
};
|
|
42459
|
+
}
|
|
42460
|
+
return {
|
|
42461
|
+
status: "fail",
|
|
42462
|
+
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
42463
|
+
};
|
|
42464
|
+
}
|
|
41810
42465
|
var ToolTrajectoryEvaluator = class {
|
|
41811
42466
|
kind = "tool_trajectory";
|
|
41812
42467
|
config;
|
|
@@ -41865,7 +42520,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
41865
42520
|
for (const call of message.toolCalls) {
|
|
41866
42521
|
toolCalls.push({
|
|
41867
42522
|
name: call.tool,
|
|
41868
|
-
args: call.input
|
|
42523
|
+
args: call.input,
|
|
42524
|
+
durationMs: call.durationMs
|
|
41869
42525
|
});
|
|
41870
42526
|
}
|
|
41871
42527
|
}
|
|
@@ -41933,17 +42589,27 @@ var ToolTrajectoryEvaluator = class {
|
|
|
41933
42589
|
}
|
|
41934
42590
|
const hits = [];
|
|
41935
42591
|
const misses = [];
|
|
42592
|
+
const warnings = [];
|
|
41936
42593
|
let actualIndex = 0;
|
|
42594
|
+
let sequenceHits = 0;
|
|
42595
|
+
let latencyHits = 0;
|
|
42596
|
+
let latencySkips = 0;
|
|
42597
|
+
const latencyAssertionCount = expected.filter(
|
|
42598
|
+
(item) => item.maxDurationMs !== void 0
|
|
42599
|
+
).length;
|
|
41937
42600
|
for (let i = 0; i < expected.length; i++) {
|
|
41938
42601
|
const expectedItem = expected[i];
|
|
41939
42602
|
const expectedTool = expectedItem.tool;
|
|
41940
42603
|
let found = false;
|
|
41941
42604
|
let argsMismatch = false;
|
|
42605
|
+
let matchedCall;
|
|
41942
42606
|
while (actualIndex < toolCalls.length) {
|
|
41943
42607
|
const actualCall = toolCalls[actualIndex];
|
|
41944
42608
|
if (actualCall.name === expectedTool) {
|
|
41945
42609
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
41946
42610
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
42611
|
+
sequenceHits++;
|
|
42612
|
+
matchedCall = actualCall;
|
|
41947
42613
|
actualIndex++;
|
|
41948
42614
|
found = true;
|
|
41949
42615
|
break;
|
|
@@ -41960,14 +42626,35 @@ var ToolTrajectoryEvaluator = class {
|
|
|
41960
42626
|
if (!found && !argsMismatch) {
|
|
41961
42627
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
41962
42628
|
}
|
|
42629
|
+
if (found && matchedCall) {
|
|
42630
|
+
const latencyResult = checkLatency(
|
|
42631
|
+
expectedTool,
|
|
42632
|
+
expectedItem.maxDurationMs,
|
|
42633
|
+
matchedCall.durationMs
|
|
42634
|
+
);
|
|
42635
|
+
if (latencyResult.status === "pass") {
|
|
42636
|
+
hits.push(latencyResult.message);
|
|
42637
|
+
latencyHits++;
|
|
42638
|
+
} else if (latencyResult.status === "fail") {
|
|
42639
|
+
misses.push(latencyResult.message);
|
|
42640
|
+
} else if (latencyResult.message) {
|
|
42641
|
+
warnings.push(latencyResult.message);
|
|
42642
|
+
latencySkips++;
|
|
42643
|
+
}
|
|
42644
|
+
}
|
|
41963
42645
|
}
|
|
41964
|
-
const
|
|
42646
|
+
for (const warning of warnings) {
|
|
42647
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
42648
|
+
}
|
|
42649
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
42650
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
42651
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
41965
42652
|
return {
|
|
41966
42653
|
score,
|
|
41967
42654
|
verdict: scoreToVerdict(score),
|
|
41968
42655
|
hits,
|
|
41969
42656
|
misses,
|
|
41970
|
-
expectedAspectCount:
|
|
42657
|
+
expectedAspectCount: totalAssertions
|
|
41971
42658
|
};
|
|
41972
42659
|
}
|
|
41973
42660
|
evaluateExact(toolCalls) {
|
|
@@ -41983,6 +42670,13 @@ var ToolTrajectoryEvaluator = class {
|
|
|
41983
42670
|
}
|
|
41984
42671
|
const hits = [];
|
|
41985
42672
|
const misses = [];
|
|
42673
|
+
const warnings = [];
|
|
42674
|
+
let sequenceHits = 0;
|
|
42675
|
+
let latencyHits = 0;
|
|
42676
|
+
let latencySkips = 0;
|
|
42677
|
+
const latencyAssertionCount = expected.filter(
|
|
42678
|
+
(item) => item.maxDurationMs !== void 0
|
|
42679
|
+
).length;
|
|
41986
42680
|
if (toolCalls.length !== expected.length) {
|
|
41987
42681
|
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
41988
42682
|
}
|
|
@@ -41992,26 +42686,50 @@ var ToolTrajectoryEvaluator = class {
|
|
|
41992
42686
|
const expectedTool = expectedItem.tool;
|
|
41993
42687
|
const actualCall = toolCalls[i];
|
|
41994
42688
|
const actualTool = actualCall.name;
|
|
42689
|
+
let sequenceMatched = false;
|
|
41995
42690
|
if (actualTool === expectedTool) {
|
|
41996
42691
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
41997
42692
|
hits.push(`Position ${i}: ${expectedTool}`);
|
|
42693
|
+
sequenceHits++;
|
|
42694
|
+
sequenceMatched = true;
|
|
41998
42695
|
} else {
|
|
41999
42696
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
42000
42697
|
}
|
|
42001
42698
|
} else {
|
|
42002
42699
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
42003
42700
|
}
|
|
42701
|
+
if (sequenceMatched) {
|
|
42702
|
+
const latencyResult = checkLatency(
|
|
42703
|
+
expectedTool,
|
|
42704
|
+
expectedItem.maxDurationMs,
|
|
42705
|
+
actualCall.durationMs
|
|
42706
|
+
);
|
|
42707
|
+
if (latencyResult.status === "pass") {
|
|
42708
|
+
hits.push(latencyResult.message);
|
|
42709
|
+
latencyHits++;
|
|
42710
|
+
} else if (latencyResult.status === "fail") {
|
|
42711
|
+
misses.push(latencyResult.message);
|
|
42712
|
+
} else if (latencyResult.message) {
|
|
42713
|
+
warnings.push(latencyResult.message);
|
|
42714
|
+
latencySkips++;
|
|
42715
|
+
}
|
|
42716
|
+
}
|
|
42004
42717
|
}
|
|
42005
42718
|
for (let i = checkLength; i < expected.length; i++) {
|
|
42006
42719
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
42007
42720
|
}
|
|
42008
|
-
const
|
|
42721
|
+
for (const warning of warnings) {
|
|
42722
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
42723
|
+
}
|
|
42724
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
42725
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
42726
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
42009
42727
|
return {
|
|
42010
42728
|
score,
|
|
42011
42729
|
verdict: scoreToVerdict(score),
|
|
42012
42730
|
hits,
|
|
42013
42731
|
misses,
|
|
42014
|
-
expectedAspectCount:
|
|
42732
|
+
expectedAspectCount: totalAssertions
|
|
42015
42733
|
};
|
|
42016
42734
|
}
|
|
42017
42735
|
};
|
|
@@ -42167,17 +42885,17 @@ async function runEvaluation(options) {
|
|
|
42167
42885
|
cache,
|
|
42168
42886
|
useCache,
|
|
42169
42887
|
now,
|
|
42170
|
-
|
|
42888
|
+
filter: filter2,
|
|
42171
42889
|
verbose,
|
|
42172
42890
|
evalCases: preloadedEvalCases,
|
|
42173
42891
|
onResult,
|
|
42174
42892
|
onProgress
|
|
42175
42893
|
} = options;
|
|
42176
|
-
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose,
|
|
42177
|
-
const filteredEvalCases = filterEvalCases(evalCases,
|
|
42894
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter: filter2 });
|
|
42895
|
+
const filteredEvalCases = filterEvalCases(evalCases, filter2);
|
|
42178
42896
|
if (filteredEvalCases.length === 0) {
|
|
42179
|
-
if (
|
|
42180
|
-
throw new Error(`
|
|
42897
|
+
if (filter2) {
|
|
42898
|
+
throw new Error(`No eval cases matched filter '${filter2}' in ${evalFilePath}`);
|
|
42181
42899
|
}
|
|
42182
42900
|
return [];
|
|
42183
42901
|
}
|
|
@@ -42753,7 +43471,10 @@ async function runEvaluatorList(options) {
|
|
|
42753
43471
|
attempt,
|
|
42754
43472
|
promptInputs,
|
|
42755
43473
|
now,
|
|
42756
|
-
judgeProvider
|
|
43474
|
+
judgeProvider,
|
|
43475
|
+
outputMessages,
|
|
43476
|
+
traceSummary,
|
|
43477
|
+
agentTimeoutMs
|
|
42757
43478
|
});
|
|
42758
43479
|
const weight = evaluator.weight ?? 1;
|
|
42759
43480
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -42807,7 +43528,7 @@ async function runEvaluatorList(options) {
|
|
|
42807
43528
|
});
|
|
42808
43529
|
}
|
|
42809
43530
|
if (evaluator.type === "composite") {
|
|
42810
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
43531
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path15.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
42811
43532
|
const createEvaluator = (memberConfig) => {
|
|
42812
43533
|
switch (memberConfig.type) {
|
|
42813
43534
|
case "llm_judge":
|
|
@@ -43088,9 +43809,22 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
43088
43809
|
attempt,
|
|
43089
43810
|
promptInputs,
|
|
43090
43811
|
now,
|
|
43091
|
-
judgeProvider
|
|
43812
|
+
judgeProvider,
|
|
43813
|
+
outputMessages,
|
|
43814
|
+
traceSummary,
|
|
43815
|
+
agentTimeoutMs
|
|
43092
43816
|
} = options;
|
|
43093
|
-
const customPrompt = await resolveCustomPrompt(
|
|
43817
|
+
const customPrompt = await resolveCustomPrompt(
|
|
43818
|
+
config2,
|
|
43819
|
+
{
|
|
43820
|
+
evalCase,
|
|
43821
|
+
candidate,
|
|
43822
|
+
outputMessages,
|
|
43823
|
+
traceSummary,
|
|
43824
|
+
config: config2.config
|
|
43825
|
+
},
|
|
43826
|
+
agentTimeoutMs
|
|
43827
|
+
);
|
|
43094
43828
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
43095
43829
|
evalCase,
|
|
43096
43830
|
candidate,
|
|
@@ -43104,23 +43838,70 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
43104
43838
|
evaluator: config2
|
|
43105
43839
|
});
|
|
43106
43840
|
}
|
|
43107
|
-
async function resolveCustomPrompt(
|
|
43108
|
-
if (
|
|
43841
|
+
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
43842
|
+
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
43843
|
+
if (!context) {
|
|
43844
|
+
throw new Error("Context required for executable prompt templates");
|
|
43845
|
+
}
|
|
43846
|
+
return executePromptTemplate(
|
|
43847
|
+
promptConfig.resolvedPromptScript,
|
|
43848
|
+
context,
|
|
43849
|
+
promptConfig.config,
|
|
43850
|
+
timeoutMs
|
|
43851
|
+
);
|
|
43852
|
+
}
|
|
43853
|
+
const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
|
|
43854
|
+
if (promptPath) {
|
|
43109
43855
|
try {
|
|
43110
|
-
const content = await readTextFile(
|
|
43856
|
+
const content = await readTextFile(promptPath);
|
|
43111
43857
|
return content;
|
|
43112
43858
|
} catch (error40) {
|
|
43113
43859
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
43114
|
-
console.warn(`Could not read custom prompt at ${
|
|
43860
|
+
console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
|
|
43115
43861
|
}
|
|
43116
43862
|
}
|
|
43117
|
-
|
|
43863
|
+
const promptValue = promptConfig.prompt;
|
|
43864
|
+
if (typeof promptValue === "string") {
|
|
43865
|
+
return promptValue;
|
|
43866
|
+
}
|
|
43867
|
+
return void 0;
|
|
43868
|
+
}
|
|
43869
|
+
async function executePromptTemplate(script, context, config2, timeoutMs) {
|
|
43870
|
+
const payload = {
|
|
43871
|
+
question: context.evalCase.question,
|
|
43872
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
43873
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
43874
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
43875
|
+
candidateAnswer: context.candidate,
|
|
43876
|
+
outputMessages: context.outputMessages ?? null,
|
|
43877
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
43878
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
43879
|
+
(p) => !context.evalCase.guideline_paths.includes(p)
|
|
43880
|
+
),
|
|
43881
|
+
inputMessages: context.evalCase.input_messages,
|
|
43882
|
+
traceSummary: context.traceSummary ?? null,
|
|
43883
|
+
config: config2 ?? context.config ?? null
|
|
43884
|
+
};
|
|
43885
|
+
const inputJson = JSON.stringify(toSnakeCaseDeep2(payload), null, 2);
|
|
43886
|
+
const scriptPath = script[script.length - 1];
|
|
43887
|
+
const cwd = path15.dirname(scriptPath);
|
|
43888
|
+
try {
|
|
43889
|
+
const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
|
|
43890
|
+
const prompt = stdout.trim();
|
|
43891
|
+
if (!prompt) {
|
|
43892
|
+
throw new Error("Prompt template produced empty output");
|
|
43893
|
+
}
|
|
43894
|
+
return prompt;
|
|
43895
|
+
} catch (error40) {
|
|
43896
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
43897
|
+
throw new Error(`Prompt template execution failed: ${message}`);
|
|
43898
|
+
}
|
|
43118
43899
|
}
|
|
43119
|
-
function filterEvalCases(evalCases,
|
|
43120
|
-
if (!
|
|
43900
|
+
function filterEvalCases(evalCases, filter2) {
|
|
43901
|
+
if (!filter2) {
|
|
43121
43902
|
return evalCases;
|
|
43122
43903
|
}
|
|
43123
|
-
return evalCases.filter((evalCase) => evalCase.id
|
|
43904
|
+
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
|
|
43124
43905
|
}
|
|
43125
43906
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
43126
43907
|
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
@@ -43274,7 +44055,7 @@ function computeWeightedMean(entries) {
|
|
|
43274
44055
|
}
|
|
43275
44056
|
var rubricItemSchema = external_exports.object({
|
|
43276
44057
|
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
43277
|
-
|
|
44058
|
+
expected_outcome: external_exports.string().describe("Concrete expected outcome for this rubric item"),
|
|
43278
44059
|
weight: external_exports.number().default(1).describe("Relative importance (default 1.0)"),
|
|
43279
44060
|
required: external_exports.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
43280
44061
|
});
|
|
@@ -43294,7 +44075,7 @@ You must return a valid JSON object matching this schema:
|
|
|
43294
44075
|
"rubrics": [
|
|
43295
44076
|
{
|
|
43296
44077
|
"id": "string (short identifier)",
|
|
43297
|
-
"
|
|
44078
|
+
"expected_outcome": "string (concrete expected outcome for this rubric item)",
|
|
43298
44079
|
"weight": number (default 1.0),
|
|
43299
44080
|
"required": boolean (default true)
|
|
43300
44081
|
}
|
|
@@ -43330,7 +44111,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
43330
44111
|
"Each rubric should:",
|
|
43331
44112
|
"- Be specific and testable",
|
|
43332
44113
|
"- Have a short, descriptive ID",
|
|
43333
|
-
"- Include a clear
|
|
44114
|
+
"- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
|
|
43334
44115
|
"- Indicate if it is required (mandatory) or optional",
|
|
43335
44116
|
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
43336
44117
|
"",
|
|
@@ -43395,7 +44176,7 @@ var convertCommand = command2({
|
|
|
43395
44176
|
const outputPath = out ?? input.replace(/\.jsonl$/, ".yaml");
|
|
43396
44177
|
try {
|
|
43397
44178
|
const count = convertJsonlToYaml(input, outputPath);
|
|
43398
|
-
console.log(`Converted ${count} records to ${
|
|
44179
|
+
console.log(`Converted ${count} records to ${path16.resolve(outputPath)}`);
|
|
43399
44180
|
} catch (error40) {
|
|
43400
44181
|
console.error(`Error: ${error40.message}`);
|
|
43401
44182
|
process.exit(1);
|
|
@@ -43405,7 +44186,7 @@ var convertCommand = command2({
|
|
|
43405
44186
|
|
|
43406
44187
|
// src/commands/eval/index.ts
|
|
43407
44188
|
import { stat as stat4 } from "node:fs/promises";
|
|
43408
|
-
import
|
|
44189
|
+
import path25 from "node:path";
|
|
43409
44190
|
import {
|
|
43410
44191
|
command as command3,
|
|
43411
44192
|
flag as flag2,
|
|
@@ -43420,19 +44201,19 @@ import fg from "fast-glob";
|
|
|
43420
44201
|
// src/commands/eval/run-eval.ts
|
|
43421
44202
|
import { constants as constants6 } from "node:fs";
|
|
43422
44203
|
import { access as access6 } from "node:fs/promises";
|
|
43423
|
-
import
|
|
44204
|
+
import path24 from "node:path";
|
|
43424
44205
|
import { pathToFileURL } from "node:url";
|
|
43425
44206
|
|
|
43426
44207
|
// src/commands/eval/env.ts
|
|
43427
44208
|
import { constants as constants4 } from "node:fs";
|
|
43428
44209
|
import { access as access4 } from "node:fs/promises";
|
|
43429
|
-
import
|
|
44210
|
+
import path17 from "node:path";
|
|
43430
44211
|
import { config as loadDotenv } from "dotenv";
|
|
43431
44212
|
function uniqueDirs(directories) {
|
|
43432
44213
|
const seen = /* @__PURE__ */ new Set();
|
|
43433
44214
|
const result = [];
|
|
43434
44215
|
for (const dir of directories) {
|
|
43435
|
-
const absolute =
|
|
44216
|
+
const absolute = path17.resolve(dir);
|
|
43436
44217
|
if (seen.has(absolute)) {
|
|
43437
44218
|
continue;
|
|
43438
44219
|
}
|
|
@@ -43451,14 +44232,14 @@ async function fileExists4(filePath) {
|
|
|
43451
44232
|
}
|
|
43452
44233
|
function collectAncestorDirectories(start, boundary) {
|
|
43453
44234
|
const directories = [];
|
|
43454
|
-
const boundaryDir =
|
|
43455
|
-
let current =
|
|
44235
|
+
const boundaryDir = path17.resolve(boundary);
|
|
44236
|
+
let current = path17.resolve(start);
|
|
43456
44237
|
while (current !== void 0) {
|
|
43457
44238
|
directories.push(current);
|
|
43458
44239
|
if (current === boundaryDir) {
|
|
43459
44240
|
break;
|
|
43460
44241
|
}
|
|
43461
|
-
const parent =
|
|
44242
|
+
const parent = path17.dirname(current);
|
|
43462
44243
|
if (parent === current) {
|
|
43463
44244
|
break;
|
|
43464
44245
|
}
|
|
@@ -43468,12 +44249,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
43468
44249
|
}
|
|
43469
44250
|
async function loadEnvFromHierarchy(options) {
|
|
43470
44251
|
const { testFilePath, repoRoot, verbose } = options;
|
|
43471
|
-
const testDir =
|
|
44252
|
+
const testDir = path17.dirname(path17.resolve(testFilePath));
|
|
43472
44253
|
const cwd = process.cwd();
|
|
43473
44254
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
43474
44255
|
const envFiles = [];
|
|
43475
44256
|
for (const dir of searchDirs) {
|
|
43476
|
-
const candidate =
|
|
44257
|
+
const candidate = path17.join(dir, ".env");
|
|
43477
44258
|
if (await fileExists4(candidate)) {
|
|
43478
44259
|
envFiles.push(candidate);
|
|
43479
44260
|
}
|
|
@@ -43497,7 +44278,7 @@ async function loadEnvFromHierarchy(options) {
|
|
|
43497
44278
|
// src/commands/eval/jsonl-writer.ts
|
|
43498
44279
|
import { createWriteStream as createWriteStream4 } from "node:fs";
|
|
43499
44280
|
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
43500
|
-
import
|
|
44281
|
+
import path18 from "node:path";
|
|
43501
44282
|
import { finished } from "node:stream/promises";
|
|
43502
44283
|
|
|
43503
44284
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
@@ -43715,7 +44496,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
43715
44496
|
this.stream = stream;
|
|
43716
44497
|
}
|
|
43717
44498
|
static async open(filePath) {
|
|
43718
|
-
await mkdir5(
|
|
44499
|
+
await mkdir5(path18.dirname(filePath), { recursive: true });
|
|
43719
44500
|
const stream = createWriteStream4(filePath, { flags: "w", encoding: "utf8" });
|
|
43720
44501
|
return new _JsonlWriter(stream);
|
|
43721
44502
|
}
|
|
@@ -43748,7 +44529,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
43748
44529
|
// src/commands/eval/yaml-writer.ts
|
|
43749
44530
|
import { createWriteStream as createWriteStream5 } from "node:fs";
|
|
43750
44531
|
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
43751
|
-
import
|
|
44532
|
+
import path19 from "node:path";
|
|
43752
44533
|
import { finished as finished2 } from "node:stream/promises";
|
|
43753
44534
|
import { stringify as stringifyYaml2 } from "yaml";
|
|
43754
44535
|
var YamlWriter = class _YamlWriter {
|
|
@@ -43760,7 +44541,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
43760
44541
|
this.stream = stream;
|
|
43761
44542
|
}
|
|
43762
44543
|
static async open(filePath) {
|
|
43763
|
-
await mkdir6(
|
|
44544
|
+
await mkdir6(path19.dirname(filePath), { recursive: true });
|
|
43764
44545
|
const stream = createWriteStream5(filePath, { flags: "w", encoding: "utf8" });
|
|
43765
44546
|
return new _YamlWriter(stream);
|
|
43766
44547
|
}
|
|
@@ -43880,12 +44661,12 @@ var ProgressDisplay = class {
|
|
|
43880
44661
|
}
|
|
43881
44662
|
addLogPaths(paths, provider) {
|
|
43882
44663
|
const newPaths = [];
|
|
43883
|
-
for (const
|
|
43884
|
-
if (this.logPathSet.has(
|
|
44664
|
+
for (const path30 of paths) {
|
|
44665
|
+
if (this.logPathSet.has(path30)) {
|
|
43885
44666
|
continue;
|
|
43886
44667
|
}
|
|
43887
|
-
this.logPathSet.add(
|
|
43888
|
-
newPaths.push(
|
|
44668
|
+
this.logPathSet.add(path30);
|
|
44669
|
+
newPaths.push(path30);
|
|
43889
44670
|
}
|
|
43890
44671
|
if (newPaths.length === 0) {
|
|
43891
44672
|
return;
|
|
@@ -43898,8 +44679,8 @@ var ProgressDisplay = class {
|
|
|
43898
44679
|
this.hasPrintedLogHeader = true;
|
|
43899
44680
|
}
|
|
43900
44681
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
43901
|
-
newPaths.forEach((
|
|
43902
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
44682
|
+
newPaths.forEach((path30, offset) => {
|
|
44683
|
+
console.log(`${startIndex + offset + 1}. ${path30}`);
|
|
43903
44684
|
});
|
|
43904
44685
|
}
|
|
43905
44686
|
finish() {
|
|
@@ -44053,8 +44834,8 @@ function formatEvaluationSummary(summary) {
|
|
|
44053
44834
|
}
|
|
44054
44835
|
|
|
44055
44836
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
44056
|
-
import { readFile as
|
|
44057
|
-
import
|
|
44837
|
+
import { readFile as readFile8 } from "node:fs/promises";
|
|
44838
|
+
import path20 from "node:path";
|
|
44058
44839
|
import { parse as parse6 } from "yaml";
|
|
44059
44840
|
import { readFile as readFile23 } from "node:fs/promises";
|
|
44060
44841
|
import path23 from "node:path";
|
|
@@ -44072,7 +44853,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
|
|
|
44072
44853
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
44073
44854
|
async function detectFileType(filePath) {
|
|
44074
44855
|
try {
|
|
44075
|
-
const content = await
|
|
44856
|
+
const content = await readFile8(filePath, "utf8");
|
|
44076
44857
|
const parsed = parse6(content);
|
|
44077
44858
|
if (typeof parsed !== "object" || parsed === null) {
|
|
44078
44859
|
return inferFileTypeFromPath(filePath);
|
|
@@ -44097,8 +44878,8 @@ async function detectFileType(filePath) {
|
|
|
44097
44878
|
}
|
|
44098
44879
|
}
|
|
44099
44880
|
function inferFileTypeFromPath(filePath) {
|
|
44100
|
-
const normalized =
|
|
44101
|
-
const basename =
|
|
44881
|
+
const normalized = path20.normalize(filePath).replace(/\\/g, "/");
|
|
44882
|
+
const basename = path20.basename(filePath);
|
|
44102
44883
|
if (normalized.includes("/.agentv/")) {
|
|
44103
44884
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
44104
44885
|
return "config";
|
|
@@ -44191,17 +44972,31 @@ async function validateEvalFile(filePath) {
|
|
|
44191
44972
|
});
|
|
44192
44973
|
}
|
|
44193
44974
|
const inputMessages = evalCase.input_messages;
|
|
44194
|
-
|
|
44975
|
+
const inputAlias = evalCase.input;
|
|
44976
|
+
if (Array.isArray(inputMessages)) {
|
|
44977
|
+
validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
|
|
44978
|
+
} else if (inputAlias !== void 0) {
|
|
44979
|
+
if (typeof inputAlias === "string") {
|
|
44980
|
+
} else if (Array.isArray(inputAlias)) {
|
|
44981
|
+
validateMessages(inputAlias, `${location}.input`, absolutePath, errors);
|
|
44982
|
+
} else {
|
|
44983
|
+
errors.push({
|
|
44984
|
+
severity: "error",
|
|
44985
|
+
filePath: absolutePath,
|
|
44986
|
+
location: `${location}.input`,
|
|
44987
|
+
message: "Invalid 'input' field (must be a string or array of messages)"
|
|
44988
|
+
});
|
|
44989
|
+
}
|
|
44990
|
+
} else {
|
|
44195
44991
|
errors.push({
|
|
44196
44992
|
severity: "error",
|
|
44197
44993
|
filePath: absolutePath,
|
|
44198
44994
|
location: `${location}.input_messages`,
|
|
44199
|
-
message: "Missing or
|
|
44995
|
+
message: "Missing 'input_messages' or 'input' field (must provide one)"
|
|
44200
44996
|
});
|
|
44201
|
-
} else {
|
|
44202
|
-
validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
|
|
44203
44997
|
}
|
|
44204
44998
|
const expectedMessages = evalCase.expected_messages;
|
|
44999
|
+
const expectedOutputAlias = evalCase.expected_output;
|
|
44205
45000
|
if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
|
|
44206
45001
|
errors.push({
|
|
44207
45002
|
severity: "error",
|
|
@@ -44211,6 +45006,26 @@ async function validateEvalFile(filePath) {
|
|
|
44211
45006
|
});
|
|
44212
45007
|
} else if (Array.isArray(expectedMessages)) {
|
|
44213
45008
|
validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
|
|
45009
|
+
} else if (expectedOutputAlias !== void 0) {
|
|
45010
|
+
if (typeof expectedOutputAlias === "string") {
|
|
45011
|
+
} else if (Array.isArray(expectedOutputAlias)) {
|
|
45012
|
+
if (expectedOutputAlias.length > 0 && isObject2(expectedOutputAlias[0]) && "role" in expectedOutputAlias[0]) {
|
|
45013
|
+
validateMessages(
|
|
45014
|
+
expectedOutputAlias,
|
|
45015
|
+
`${location}.expected_output`,
|
|
45016
|
+
absolutePath,
|
|
45017
|
+
errors
|
|
45018
|
+
);
|
|
45019
|
+
}
|
|
45020
|
+
} else if (isObject2(expectedOutputAlias)) {
|
|
45021
|
+
} else {
|
|
45022
|
+
errors.push({
|
|
45023
|
+
severity: "error",
|
|
45024
|
+
filePath: absolutePath,
|
|
45025
|
+
location: `${location}.expected_output`,
|
|
45026
|
+
message: "Invalid 'expected_output' field (must be a string, object, or array)"
|
|
45027
|
+
});
|
|
45028
|
+
}
|
|
44214
45029
|
}
|
|
44215
45030
|
}
|
|
44216
45031
|
return {
|
|
@@ -44863,12 +45678,12 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
44863
45678
|
// src/utils/targets.ts
|
|
44864
45679
|
import { constants as constants5 } from "node:fs";
|
|
44865
45680
|
import { access as access5 } from "node:fs/promises";
|
|
44866
|
-
import
|
|
45681
|
+
import path21 from "node:path";
|
|
44867
45682
|
var TARGET_FILE_CANDIDATES = [
|
|
44868
45683
|
"targets.yaml",
|
|
44869
45684
|
"targets.yml",
|
|
44870
|
-
|
|
44871
|
-
|
|
45685
|
+
path21.join(".agentv", "targets.yaml"),
|
|
45686
|
+
path21.join(".agentv", "targets.yml")
|
|
44872
45687
|
];
|
|
44873
45688
|
async function fileExists5(filePath) {
|
|
44874
45689
|
try {
|
|
@@ -44881,12 +45696,12 @@ async function fileExists5(filePath) {
|
|
|
44881
45696
|
async function discoverTargetsFile(options) {
|
|
44882
45697
|
const { explicitPath, testFilePath, repoRoot, cwd } = options;
|
|
44883
45698
|
if (explicitPath) {
|
|
44884
|
-
const resolvedExplicit =
|
|
45699
|
+
const resolvedExplicit = path21.resolve(explicitPath);
|
|
44885
45700
|
if (await fileExists5(resolvedExplicit)) {
|
|
44886
45701
|
return resolvedExplicit;
|
|
44887
45702
|
}
|
|
44888
45703
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
44889
|
-
const nested =
|
|
45704
|
+
const nested = path21.join(resolvedExplicit, candidate);
|
|
44890
45705
|
if (await fileExists5(nested)) {
|
|
44891
45706
|
return nested;
|
|
44892
45707
|
}
|
|
@@ -44894,13 +45709,13 @@ async function discoverTargetsFile(options) {
|
|
|
44894
45709
|
throw new Error(`targets.yaml not found at provided path: ${resolvedExplicit}`);
|
|
44895
45710
|
}
|
|
44896
45711
|
const directories = [...buildDirectoryChain(testFilePath, repoRoot)];
|
|
44897
|
-
const resolvedCwd =
|
|
45712
|
+
const resolvedCwd = path21.resolve(cwd);
|
|
44898
45713
|
if (!directories.includes(resolvedCwd)) {
|
|
44899
45714
|
directories.push(resolvedCwd);
|
|
44900
45715
|
}
|
|
44901
45716
|
for (const directory of directories) {
|
|
44902
45717
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
44903
|
-
const fullPath =
|
|
45718
|
+
const fullPath = path21.join(directory, candidate);
|
|
44904
45719
|
if (await fileExists5(fullPath)) {
|
|
44905
45720
|
return fullPath;
|
|
44906
45721
|
}
|
|
@@ -44910,9 +45725,9 @@ async function discoverTargetsFile(options) {
|
|
|
44910
45725
|
}
|
|
44911
45726
|
|
|
44912
45727
|
// src/commands/eval/targets.ts
|
|
44913
|
-
var
|
|
44914
|
-
var
|
|
44915
|
-
var
|
|
45728
|
+
var ANSI_YELLOW8 = "\x1B[33m";
|
|
45729
|
+
var ANSI_RED3 = "\x1B[31m";
|
|
45730
|
+
var ANSI_RESET8 = "\x1B[0m";
|
|
44916
45731
|
function isTTY() {
|
|
44917
45732
|
return process.stdout.isTTY ?? false;
|
|
44918
45733
|
}
|
|
@@ -44958,8 +45773,8 @@ async function selectTarget(options) {
|
|
|
44958
45773
|
Warnings in ${targetsFilePath}:`);
|
|
44959
45774
|
for (const warning of warnings) {
|
|
44960
45775
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
44961
|
-
const prefix = useColors ? `${
|
|
44962
|
-
const message = useColors ? `${
|
|
45776
|
+
const prefix = useColors ? `${ANSI_YELLOW8} \u26A0${ANSI_RESET8}` : " \u26A0";
|
|
45777
|
+
const message = useColors ? `${ANSI_YELLOW8}${warning.message}${ANSI_RESET8}` : warning.message;
|
|
44963
45778
|
console.warn(`${prefix}${location} ${message}`);
|
|
44964
45779
|
}
|
|
44965
45780
|
console.warn("");
|
|
@@ -44970,8 +45785,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
44970
45785
|
Errors in ${targetsFilePath}:`);
|
|
44971
45786
|
for (const error40 of errors) {
|
|
44972
45787
|
const location = error40.location ? ` [${error40.location}]` : "";
|
|
44973
|
-
const prefix = useColors ? `${
|
|
44974
|
-
const message = useColors ? `${
|
|
45788
|
+
const prefix = useColors ? `${ANSI_RED3} \u2717${ANSI_RESET8}` : " \u2717";
|
|
45789
|
+
const message = useColors ? `${ANSI_RED3}${error40.message}${ANSI_RESET8}` : error40.message;
|
|
44975
45790
|
console.error(`${prefix}${location} ${message}`);
|
|
44976
45791
|
}
|
|
44977
45792
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -45054,7 +45869,7 @@ function normalizeOptions(rawOptions) {
|
|
|
45054
45869
|
return {
|
|
45055
45870
|
target: normalizeString(rawOptions.target),
|
|
45056
45871
|
targetsPath: normalizeString(rawOptions.targets),
|
|
45057
|
-
|
|
45872
|
+
filter: normalizeString(rawOptions.filter),
|
|
45058
45873
|
workers: workers > 0 ? workers : void 0,
|
|
45059
45874
|
outPath: normalizeString(rawOptions.out),
|
|
45060
45875
|
format,
|
|
@@ -45076,15 +45891,15 @@ async function ensureFileExists(filePath, description) {
|
|
|
45076
45891
|
}
|
|
45077
45892
|
}
|
|
45078
45893
|
async function findRepoRoot(start) {
|
|
45079
|
-
const fallback =
|
|
45894
|
+
const fallback = path24.resolve(start);
|
|
45080
45895
|
let current = fallback;
|
|
45081
45896
|
while (current !== void 0) {
|
|
45082
|
-
const candidate =
|
|
45897
|
+
const candidate = path24.join(current, ".git");
|
|
45083
45898
|
try {
|
|
45084
45899
|
await access6(candidate, constants6.F_OK);
|
|
45085
45900
|
return current;
|
|
45086
45901
|
} catch {
|
|
45087
|
-
const parent =
|
|
45902
|
+
const parent = path24.dirname(current);
|
|
45088
45903
|
if (parent === current) {
|
|
45089
45904
|
break;
|
|
45090
45905
|
}
|
|
@@ -45097,7 +45912,7 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
45097
45912
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
45098
45913
|
const baseName = "eval";
|
|
45099
45914
|
const extension = getDefaultExtension(format);
|
|
45100
|
-
return
|
|
45915
|
+
return path24.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
45101
45916
|
}
|
|
45102
45917
|
function createEvaluationCache() {
|
|
45103
45918
|
const store = /* @__PURE__ */ new Map();
|
|
@@ -45122,7 +45937,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
45122
45937
|
};
|
|
45123
45938
|
}
|
|
45124
45939
|
function makeEvalKey(testFilePath, evalId) {
|
|
45125
|
-
return `${
|
|
45940
|
+
return `${path24.resolve(testFilePath)}::${evalId}`;
|
|
45126
45941
|
}
|
|
45127
45942
|
function createDisplayIdTracker() {
|
|
45128
45943
|
const map2 = /* @__PURE__ */ new Map();
|
|
@@ -45179,9 +45994,9 @@ async function prepareFileMetadata(params) {
|
|
|
45179
45994
|
const inlineTargetLabel = `${selection.targetName} [provider=${providerLabel}]`;
|
|
45180
45995
|
const evalCases = await loadEvalCases(testFilePath, repoRoot, {
|
|
45181
45996
|
verbose: options.verbose,
|
|
45182
|
-
|
|
45997
|
+
filter: options.filter
|
|
45183
45998
|
});
|
|
45184
|
-
const filteredIds =
|
|
45999
|
+
const filteredIds = evalCases.map((value) => value.id);
|
|
45185
46000
|
return { evalIds: filteredIds, evalCases, selection, inlineTargetLabel };
|
|
45186
46001
|
}
|
|
45187
46002
|
async function runWithLimit(items, limit, task) {
|
|
@@ -45252,7 +46067,6 @@ async function runSingleEvalFile(params) {
|
|
|
45252
46067
|
agentTimeoutMs,
|
|
45253
46068
|
cache,
|
|
45254
46069
|
useCache: options.cache,
|
|
45255
|
-
evalId: options.evalId,
|
|
45256
46070
|
evalCases,
|
|
45257
46071
|
verbose: options.verbose,
|
|
45258
46072
|
maxConcurrency: resolvedWorkers,
|
|
@@ -45286,14 +46100,14 @@ async function runEvalCommand(input) {
|
|
|
45286
46100
|
if (options.verbose) {
|
|
45287
46101
|
console.log(`Repository root: ${repoRoot}`);
|
|
45288
46102
|
}
|
|
45289
|
-
const outputPath = options.outPath ?
|
|
46103
|
+
const outputPath = options.outPath ? path24.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
|
|
45290
46104
|
console.log(`Output path: ${outputPath}`);
|
|
45291
46105
|
const outputWriter = await createOutputWriter(outputPath, options.format);
|
|
45292
46106
|
const cache = options.cache ? createEvaluationCache() : void 0;
|
|
45293
46107
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
45294
46108
|
const allResults = [];
|
|
45295
46109
|
const seenEvalCases = /* @__PURE__ */ new Set();
|
|
45296
|
-
const resolvedTestFiles = input.testFiles.map((file2) =>
|
|
46110
|
+
const resolvedTestFiles = input.testFiles.map((file2) => path24.resolve(file2));
|
|
45297
46111
|
const displayIdTracker = createDisplayIdTracker();
|
|
45298
46112
|
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
|
|
45299
46113
|
const fileConcurrency = Math.min(
|
|
@@ -45392,7 +46206,7 @@ async function resolveEvaluationRunner() {
|
|
|
45392
46206
|
if (!overridePath) {
|
|
45393
46207
|
return runEvaluation;
|
|
45394
46208
|
}
|
|
45395
|
-
const resolved =
|
|
46209
|
+
const resolved = path24.isAbsolute(overridePath) ? overridePath : path24.resolve(process.cwd(), overridePath);
|
|
45396
46210
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
45397
46211
|
const mod = await import(moduleUrl);
|
|
45398
46212
|
const candidate = mod.runEvaluation;
|
|
@@ -45428,7 +46242,7 @@ var evalCommand = command3({
|
|
|
45428
46242
|
evalId: option3({
|
|
45429
46243
|
type: optional4(string6),
|
|
45430
46244
|
long: "eval-id",
|
|
45431
|
-
description:
|
|
46245
|
+
description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")'
|
|
45432
46246
|
}),
|
|
45433
46247
|
workers: option3({
|
|
45434
46248
|
type: number5,
|
|
@@ -45495,7 +46309,7 @@ var evalCommand = command3({
|
|
|
45495
46309
|
const rawOptions = {
|
|
45496
46310
|
target: args.target,
|
|
45497
46311
|
targets: args.targets,
|
|
45498
|
-
|
|
46312
|
+
filter: args.evalId,
|
|
45499
46313
|
workers: args.workers,
|
|
45500
46314
|
out: args.out,
|
|
45501
46315
|
outputFormat: args.outputFormat,
|
|
@@ -45519,10 +46333,10 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
45519
46333
|
const unmatched = [];
|
|
45520
46334
|
const results = /* @__PURE__ */ new Set();
|
|
45521
46335
|
for (const pattern of normalizedInputs) {
|
|
45522
|
-
const candidatePath =
|
|
46336
|
+
const candidatePath = path25.isAbsolute(pattern) ? path25.normalize(pattern) : path25.resolve(cwd, pattern);
|
|
45523
46337
|
try {
|
|
45524
46338
|
const stats = await stat4(candidatePath);
|
|
45525
|
-
if (stats.isFile() && /\.ya?ml$/i.test(candidatePath)) {
|
|
46339
|
+
if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) {
|
|
45526
46340
|
results.add(candidatePath);
|
|
45527
46341
|
continue;
|
|
45528
46342
|
}
|
|
@@ -45537,20 +46351,20 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
45537
46351
|
dot: true,
|
|
45538
46352
|
followSymbolicLinks: true
|
|
45539
46353
|
});
|
|
45540
|
-
const yamlMatches = matches.filter((filePath) => /\.ya?ml$/i.test(filePath));
|
|
46354
|
+
const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath));
|
|
45541
46355
|
if (yamlMatches.length === 0) {
|
|
45542
46356
|
unmatched.push(pattern);
|
|
45543
46357
|
continue;
|
|
45544
46358
|
}
|
|
45545
46359
|
for (const filePath of yamlMatches) {
|
|
45546
|
-
results.add(
|
|
46360
|
+
results.add(path25.normalize(filePath));
|
|
45547
46361
|
}
|
|
45548
46362
|
}
|
|
45549
46363
|
if (unmatched.length > 0) {
|
|
45550
46364
|
throw new Error(
|
|
45551
46365
|
`No eval files matched: ${unmatched.join(
|
|
45552
46366
|
", "
|
|
45553
|
-
)}. Provide YAML paths or globs (e.g., "evals/**/*.yaml").`
|
|
46367
|
+
)}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`
|
|
45554
46368
|
);
|
|
45555
46369
|
}
|
|
45556
46370
|
const sorted = Array.from(results);
|
|
@@ -45562,20 +46376,20 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
45562
46376
|
import { command as command4, flag as flag3, option as option4, optional as optional5, positional as positional4, string as string7, subcommands } from "cmd-ts";
|
|
45563
46377
|
|
|
45564
46378
|
// src/commands/generate/rubrics.ts
|
|
45565
|
-
import { readFile as
|
|
45566
|
-
import
|
|
46379
|
+
import { readFile as readFile9, writeFile as writeFile6 } from "node:fs/promises";
|
|
46380
|
+
import path26 from "node:path";
|
|
45567
46381
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
45568
46382
|
import { isMap, isSeq, parseDocument } from "yaml";
|
|
45569
46383
|
function isJsonObject3(value) {
|
|
45570
46384
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
45571
46385
|
}
|
|
45572
|
-
function
|
|
46386
|
+
function asString7(value) {
|
|
45573
46387
|
return typeof value === "string" ? value : void 0;
|
|
45574
46388
|
}
|
|
45575
46389
|
async function loadRubricGenerator() {
|
|
45576
46390
|
const customGenerator = process.env.AGENTEVO_CLI_RUBRIC_GENERATOR;
|
|
45577
46391
|
if (customGenerator) {
|
|
45578
|
-
const generatorPath =
|
|
46392
|
+
const generatorPath = path26.resolve(customGenerator);
|
|
45579
46393
|
const generatorUrl = pathToFileURL2(generatorPath).href;
|
|
45580
46394
|
const module = await import(generatorUrl);
|
|
45581
46395
|
return module.generateRubrics;
|
|
@@ -45585,8 +46399,8 @@ async function loadRubricGenerator() {
|
|
|
45585
46399
|
async function generateRubricsCommand(options) {
|
|
45586
46400
|
const { file: file2, target: targetOverride, verbose } = options;
|
|
45587
46401
|
console.log(`Generating rubrics for: ${file2}`);
|
|
45588
|
-
const absolutePath =
|
|
45589
|
-
const content = await
|
|
46402
|
+
const absolutePath = path26.resolve(file2);
|
|
46403
|
+
const content = await readFile9(absolutePath, "utf8");
|
|
45590
46404
|
const doc = parseDocument(content);
|
|
45591
46405
|
const parsed = doc.toJSON();
|
|
45592
46406
|
if (!isJsonObject3(parsed)) {
|
|
@@ -45625,8 +46439,8 @@ async function generateRubricsCommand(options) {
|
|
|
45625
46439
|
continue;
|
|
45626
46440
|
}
|
|
45627
46441
|
const evalCase = rawCase;
|
|
45628
|
-
const id =
|
|
45629
|
-
const expectedOutcome =
|
|
46442
|
+
const id = asString7(evalCase.id) ?? "unknown";
|
|
46443
|
+
const expectedOutcome = asString7(evalCase.expected_outcome) ?? asString7(evalCase.outcome);
|
|
45630
46444
|
if (!expectedOutcome) {
|
|
45631
46445
|
if (verbose) {
|
|
45632
46446
|
console.log(` Skipping ${id}: no expected_outcome`);
|
|
@@ -45643,7 +46457,7 @@ async function generateRubricsCommand(options) {
|
|
|
45643
46457
|
}
|
|
45644
46458
|
console.log(` Generating rubrics for: ${id}`);
|
|
45645
46459
|
const question = extractQuestion(evalCase);
|
|
45646
|
-
const referenceAnswer =
|
|
46460
|
+
const referenceAnswer = asString7(evalCase.reference_answer);
|
|
45647
46461
|
const rubrics = await generateRubricsFunc({
|
|
45648
46462
|
expectedOutcome,
|
|
45649
46463
|
question,
|
|
@@ -45654,14 +46468,12 @@ async function generateRubricsCommand(options) {
|
|
|
45654
46468
|
if (caseNode && isMap(caseNode)) {
|
|
45655
46469
|
caseNode.set(
|
|
45656
46470
|
"rubrics",
|
|
45657
|
-
rubrics.map(
|
|
45658
|
-
|
|
45659
|
-
|
|
45660
|
-
|
|
45661
|
-
|
|
45662
|
-
|
|
45663
|
-
})
|
|
45664
|
-
)
|
|
46471
|
+
rubrics.filter((r) => r.expected_outcome !== void 0).map((r) => ({
|
|
46472
|
+
id: r.id,
|
|
46473
|
+
expected_outcome: r.expected_outcome,
|
|
46474
|
+
weight: r.weight,
|
|
46475
|
+
required: r.required ?? true
|
|
46476
|
+
}))
|
|
45665
46477
|
);
|
|
45666
46478
|
}
|
|
45667
46479
|
updatedCount++;
|
|
@@ -45682,7 +46494,7 @@ Updated ${updatedCount} eval case(s) with generated rubrics`);
|
|
|
45682
46494
|
}
|
|
45683
46495
|
}
|
|
45684
46496
|
function extractQuestion(evalCase) {
|
|
45685
|
-
const explicitQuestion =
|
|
46497
|
+
const explicitQuestion = asString7(evalCase.question);
|
|
45686
46498
|
if (explicitQuestion) {
|
|
45687
46499
|
return explicitQuestion;
|
|
45688
46500
|
}
|
|
@@ -45746,24 +46558,24 @@ var generateCommand = subcommands({
|
|
|
45746
46558
|
|
|
45747
46559
|
// src/commands/init/index.ts
|
|
45748
46560
|
import { existsSync, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
|
|
45749
|
-
import
|
|
46561
|
+
import path28 from "node:path";
|
|
45750
46562
|
import * as readline from "node:readline/promises";
|
|
45751
46563
|
import { command as command5, option as option5, optional as optional6, string as string8 } from "cmd-ts";
|
|
45752
46564
|
|
|
45753
46565
|
// src/templates/index.ts
|
|
45754
46566
|
import { readFileSync as readFileSync3, readdirSync, statSync } from "node:fs";
|
|
45755
|
-
import
|
|
46567
|
+
import path27 from "node:path";
|
|
45756
46568
|
import { fileURLToPath } from "node:url";
|
|
45757
46569
|
function getGithubTemplates() {
|
|
45758
46570
|
if (isDistRuntime()) {
|
|
45759
46571
|
return getTemplatesFromDir(".github");
|
|
45760
46572
|
}
|
|
45761
46573
|
const templatesDir = getRepoRootFromDev();
|
|
45762
|
-
const promptsDir =
|
|
46574
|
+
const promptsDir = path27.join(templatesDir, ".github", "prompts");
|
|
45763
46575
|
const promptFiles = readdirSync(promptsDir).filter((file2) => file2.startsWith("agentv-"));
|
|
45764
46576
|
return promptFiles.map((file2) => ({
|
|
45765
46577
|
path: `prompts/${file2}`,
|
|
45766
|
-
content: readFileSync3(
|
|
46578
|
+
content: readFileSync3(path27.join(promptsDir, file2), "utf-8")
|
|
45767
46579
|
}));
|
|
45768
46580
|
}
|
|
45769
46581
|
function getAgentvTemplates() {
|
|
@@ -45774,47 +46586,47 @@ function getClaudeTemplates() {
|
|
|
45774
46586
|
return getTemplatesFromDir(".claude");
|
|
45775
46587
|
}
|
|
45776
46588
|
const repoRoot = getRepoRootFromDev();
|
|
45777
|
-
const skillsRoot =
|
|
46589
|
+
const skillsRoot = path27.join(repoRoot, ".claude", "skills");
|
|
45778
46590
|
const skillsToInclude = ["agentv-eval-builder", "agentv-prompt-optimizer"];
|
|
45779
46591
|
const templates = [];
|
|
45780
46592
|
for (const skill of skillsToInclude) {
|
|
45781
|
-
const skillDir =
|
|
45782
|
-
const skillTemplates = readTemplatesRecursively(skillDir,
|
|
46593
|
+
const skillDir = path27.join(skillsRoot, skill);
|
|
46594
|
+
const skillTemplates = readTemplatesRecursively(skillDir, path27.join("skills", skill));
|
|
45783
46595
|
templates.push(...skillTemplates);
|
|
45784
46596
|
}
|
|
45785
46597
|
return templates;
|
|
45786
46598
|
}
|
|
45787
46599
|
function getTemplatesFromDir(subdir) {
|
|
45788
|
-
const currentDir =
|
|
46600
|
+
const currentDir = path27.dirname(fileURLToPath(import.meta.url));
|
|
45789
46601
|
let templatesDir;
|
|
45790
|
-
if (currentDir.includes(`${
|
|
45791
|
-
templatesDir =
|
|
46602
|
+
if (currentDir.includes(`${path27.sep}dist`)) {
|
|
46603
|
+
templatesDir = path27.join(currentDir, "templates", subdir);
|
|
45792
46604
|
} else {
|
|
45793
|
-
templatesDir =
|
|
46605
|
+
templatesDir = path27.join(currentDir, subdir);
|
|
45794
46606
|
}
|
|
45795
46607
|
return readTemplatesRecursively(templatesDir, "");
|
|
45796
46608
|
}
|
|
45797
46609
|
function isDistRuntime() {
|
|
45798
|
-
const currentDir =
|
|
45799
|
-
return currentDir.includes(`${
|
|
46610
|
+
const currentDir = path27.dirname(fileURLToPath(import.meta.url));
|
|
46611
|
+
return currentDir.includes(`${path27.sep}dist`);
|
|
45800
46612
|
}
|
|
45801
46613
|
function getRepoRootFromDev() {
|
|
45802
|
-
const currentDir =
|
|
45803
|
-
return
|
|
46614
|
+
const currentDir = path27.dirname(fileURLToPath(import.meta.url));
|
|
46615
|
+
return path27.resolve(currentDir, "..", "..", "..", "..");
|
|
45804
46616
|
}
|
|
45805
46617
|
function readTemplatesRecursively(dir, relativePath) {
|
|
45806
46618
|
const templates = [];
|
|
45807
46619
|
const entries = readdirSync(dir);
|
|
45808
46620
|
for (const entry of entries) {
|
|
45809
|
-
const fullPath =
|
|
46621
|
+
const fullPath = path27.join(dir, entry);
|
|
45810
46622
|
const stat6 = statSync(fullPath);
|
|
45811
|
-
const entryRelativePath = relativePath ?
|
|
46623
|
+
const entryRelativePath = relativePath ? path27.join(relativePath, entry) : entry;
|
|
45812
46624
|
if (stat6.isDirectory()) {
|
|
45813
46625
|
templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
|
|
45814
46626
|
} else {
|
|
45815
46627
|
const content = readFileSync3(fullPath, "utf-8");
|
|
45816
46628
|
templates.push({
|
|
45817
|
-
path: entryRelativePath.split(
|
|
46629
|
+
path: entryRelativePath.split(path27.sep).join("/"),
|
|
45818
46630
|
// Normalize to forward slashes
|
|
45819
46631
|
content
|
|
45820
46632
|
});
|
|
@@ -45837,10 +46649,10 @@ async function promptYesNo(message) {
|
|
|
45837
46649
|
}
|
|
45838
46650
|
}
|
|
45839
46651
|
async function initCommand(options = {}) {
|
|
45840
|
-
const targetPath =
|
|
45841
|
-
const githubDir =
|
|
45842
|
-
const agentvDir =
|
|
45843
|
-
const claudeDir =
|
|
46652
|
+
const targetPath = path28.resolve(options.targetPath ?? ".");
|
|
46653
|
+
const githubDir = path28.join(targetPath, ".github");
|
|
46654
|
+
const agentvDir = path28.join(targetPath, ".agentv");
|
|
46655
|
+
const claudeDir = path28.join(targetPath, ".claude");
|
|
45844
46656
|
const githubTemplates = getGithubTemplates();
|
|
45845
46657
|
const agentvTemplates = getAgentvTemplates();
|
|
45846
46658
|
const claudeTemplates = getClaudeTemplates();
|
|
@@ -45848,32 +46660,32 @@ async function initCommand(options = {}) {
|
|
|
45848
46660
|
const otherAgentvTemplates = agentvTemplates.filter((t) => t.path !== ".env.example");
|
|
45849
46661
|
const existingFiles = [];
|
|
45850
46662
|
if (envTemplate) {
|
|
45851
|
-
const envFilePath =
|
|
46663
|
+
const envFilePath = path28.join(targetPath, ".env.example");
|
|
45852
46664
|
if (existsSync(envFilePath)) {
|
|
45853
46665
|
existingFiles.push(".env.example");
|
|
45854
46666
|
}
|
|
45855
46667
|
}
|
|
45856
46668
|
if (existsSync(githubDir)) {
|
|
45857
46669
|
for (const template of githubTemplates) {
|
|
45858
|
-
const targetFilePath =
|
|
46670
|
+
const targetFilePath = path28.join(githubDir, template.path);
|
|
45859
46671
|
if (existsSync(targetFilePath)) {
|
|
45860
|
-
existingFiles.push(
|
|
46672
|
+
existingFiles.push(path28.relative(targetPath, targetFilePath));
|
|
45861
46673
|
}
|
|
45862
46674
|
}
|
|
45863
46675
|
}
|
|
45864
46676
|
if (existsSync(agentvDir)) {
|
|
45865
46677
|
for (const template of otherAgentvTemplates) {
|
|
45866
|
-
const targetFilePath =
|
|
46678
|
+
const targetFilePath = path28.join(agentvDir, template.path);
|
|
45867
46679
|
if (existsSync(targetFilePath)) {
|
|
45868
|
-
existingFiles.push(
|
|
46680
|
+
existingFiles.push(path28.relative(targetPath, targetFilePath));
|
|
45869
46681
|
}
|
|
45870
46682
|
}
|
|
45871
46683
|
}
|
|
45872
46684
|
if (existsSync(claudeDir)) {
|
|
45873
46685
|
for (const template of claudeTemplates) {
|
|
45874
|
-
const targetFilePath =
|
|
46686
|
+
const targetFilePath = path28.join(claudeDir, template.path);
|
|
45875
46687
|
if (existsSync(targetFilePath)) {
|
|
45876
|
-
existingFiles.push(
|
|
46688
|
+
existingFiles.push(path28.relative(targetPath, targetFilePath));
|
|
45877
46689
|
}
|
|
45878
46690
|
}
|
|
45879
46691
|
}
|
|
@@ -45900,36 +46712,36 @@ async function initCommand(options = {}) {
|
|
|
45900
46712
|
mkdirSync(claudeDir, { recursive: true });
|
|
45901
46713
|
}
|
|
45902
46714
|
if (envTemplate) {
|
|
45903
|
-
const envFilePath =
|
|
46715
|
+
const envFilePath = path28.join(targetPath, ".env.example");
|
|
45904
46716
|
writeFileSync2(envFilePath, envTemplate.content, "utf-8");
|
|
45905
46717
|
console.log("Created .env.example");
|
|
45906
46718
|
}
|
|
45907
46719
|
for (const template of githubTemplates) {
|
|
45908
|
-
const targetFilePath =
|
|
45909
|
-
const targetDirPath =
|
|
46720
|
+
const targetFilePath = path28.join(githubDir, template.path);
|
|
46721
|
+
const targetDirPath = path28.dirname(targetFilePath);
|
|
45910
46722
|
if (!existsSync(targetDirPath)) {
|
|
45911
46723
|
mkdirSync(targetDirPath, { recursive: true });
|
|
45912
46724
|
}
|
|
45913
46725
|
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
45914
|
-
console.log(`Created ${
|
|
46726
|
+
console.log(`Created ${path28.relative(targetPath, targetFilePath)}`);
|
|
45915
46727
|
}
|
|
45916
46728
|
for (const template of otherAgentvTemplates) {
|
|
45917
|
-
const targetFilePath =
|
|
45918
|
-
const targetDirPath =
|
|
46729
|
+
const targetFilePath = path28.join(agentvDir, template.path);
|
|
46730
|
+
const targetDirPath = path28.dirname(targetFilePath);
|
|
45919
46731
|
if (!existsSync(targetDirPath)) {
|
|
45920
46732
|
mkdirSync(targetDirPath, { recursive: true });
|
|
45921
46733
|
}
|
|
45922
46734
|
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
45923
|
-
console.log(`Created ${
|
|
46735
|
+
console.log(`Created ${path28.relative(targetPath, targetFilePath)}`);
|
|
45924
46736
|
}
|
|
45925
46737
|
for (const template of claudeTemplates) {
|
|
45926
|
-
const targetFilePath =
|
|
45927
|
-
const targetDirPath =
|
|
46738
|
+
const targetFilePath = path28.join(claudeDir, template.path);
|
|
46739
|
+
const targetDirPath = path28.dirname(targetFilePath);
|
|
45928
46740
|
if (!existsSync(targetDirPath)) {
|
|
45929
46741
|
mkdirSync(targetDirPath, { recursive: true });
|
|
45930
46742
|
}
|
|
45931
46743
|
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
45932
|
-
console.log(`Created ${
|
|
46744
|
+
console.log(`Created ${path28.relative(targetPath, targetFilePath)}`);
|
|
45933
46745
|
}
|
|
45934
46746
|
console.log("\nAgentV initialized successfully!");
|
|
45935
46747
|
console.log("\nFiles installed to root:");
|
|
@@ -45937,17 +46749,17 @@ async function initCommand(options = {}) {
|
|
|
45937
46749
|
console.log(" - .env.example");
|
|
45938
46750
|
}
|
|
45939
46751
|
console.log(`
|
|
45940
|
-
Files installed to ${
|
|
46752
|
+
Files installed to ${path28.relative(targetPath, githubDir)}:`);
|
|
45941
46753
|
for (const t of githubTemplates) {
|
|
45942
46754
|
console.log(` - ${t.path}`);
|
|
45943
46755
|
}
|
|
45944
46756
|
console.log(`
|
|
45945
|
-
Files installed to ${
|
|
46757
|
+
Files installed to ${path28.relative(targetPath, agentvDir)}:`);
|
|
45946
46758
|
for (const t of otherAgentvTemplates) {
|
|
45947
46759
|
console.log(` - ${t.path}`);
|
|
45948
46760
|
}
|
|
45949
46761
|
console.log(`
|
|
45950
|
-
Files installed to ${
|
|
46762
|
+
Files installed to ${path28.relative(targetPath, claudeDir)}:`);
|
|
45951
46763
|
for (const t of claudeTemplates) {
|
|
45952
46764
|
console.log(` - ${t.path}`);
|
|
45953
46765
|
}
|
|
@@ -45980,12 +46792,12 @@ var initCmdTsCommand = command5({
|
|
|
45980
46792
|
import { command as command6, restPositionals as restPositionals2, string as string9 } from "cmd-ts";
|
|
45981
46793
|
|
|
45982
46794
|
// src/commands/validate/format-output.ts
|
|
45983
|
-
var
|
|
45984
|
-
var
|
|
46795
|
+
var ANSI_RED4 = "\x1B[31m";
|
|
46796
|
+
var ANSI_YELLOW9 = "\x1B[33m";
|
|
45985
46797
|
var ANSI_GREEN = "\x1B[32m";
|
|
45986
46798
|
var ANSI_CYAN = "\x1B[36m";
|
|
45987
46799
|
var ANSI_BOLD = "\x1B[1m";
|
|
45988
|
-
var
|
|
46800
|
+
var ANSI_RESET9 = "\x1B[0m";
|
|
45989
46801
|
function formatSummary(summary, useColors) {
|
|
45990
46802
|
const lines = [];
|
|
45991
46803
|
lines.push("");
|
|
@@ -46001,15 +46813,15 @@ function formatSummary(summary, useColors) {
|
|
|
46001
46813
|
}
|
|
46002
46814
|
function formatHeader(text2, useColors) {
|
|
46003
46815
|
if (useColors) {
|
|
46004
|
-
return `${ANSI_BOLD}${ANSI_CYAN}${text2}${
|
|
46816
|
+
return `${ANSI_BOLD}${ANSI_CYAN}${text2}${ANSI_RESET9}`;
|
|
46005
46817
|
}
|
|
46006
46818
|
return text2;
|
|
46007
46819
|
}
|
|
46008
46820
|
function formatFileResult(result, useColors) {
|
|
46009
46821
|
const lines = [];
|
|
46010
46822
|
const status = result.valid ? "\u2713" : "\u2717";
|
|
46011
|
-
const statusColor = result.valid ? ANSI_GREEN :
|
|
46012
|
-
const statusText = useColors ? `${statusColor}${status}${
|
|
46823
|
+
const statusColor = result.valid ? ANSI_GREEN : ANSI_RED4;
|
|
46824
|
+
const statusText = useColors ? `${statusColor}${status}${ANSI_RESET9}` : status;
|
|
46013
46825
|
const fileName = result.filePath;
|
|
46014
46826
|
lines.push(`${statusText} ${fileName}`);
|
|
46015
46827
|
if (result.errors.length > 0) {
|
|
@@ -46021,8 +46833,8 @@ function formatFileResult(result, useColors) {
|
|
|
46021
46833
|
}
|
|
46022
46834
|
function formatError2(error40, useColors) {
|
|
46023
46835
|
const prefix = error40.severity === "error" ? " \u2717" : " \u26A0";
|
|
46024
|
-
const color = error40.severity === "error" ?
|
|
46025
|
-
const coloredPrefix = useColors ? `${color}${prefix}${
|
|
46836
|
+
const color = error40.severity === "error" ? ANSI_RED4 : ANSI_YELLOW9;
|
|
46837
|
+
const coloredPrefix = useColors ? `${color}${prefix}${ANSI_RESET9}` : prefix;
|
|
46026
46838
|
const location = error40.location ? ` [${error40.location}]` : "";
|
|
46027
46839
|
return `${coloredPrefix}${location} ${error40.message}`;
|
|
46028
46840
|
}
|
|
@@ -46035,15 +46847,15 @@ function formatStats(summary, useColors) {
|
|
|
46035
46847
|
(r) => r.errors.some((e) => e.severity === "warning")
|
|
46036
46848
|
).length;
|
|
46037
46849
|
if (useColors) {
|
|
46038
|
-
lines.push(`${ANSI_BOLD}${totalText}${
|
|
46039
|
-
lines.push(`${ANSI_GREEN}${validText}${
|
|
46850
|
+
lines.push(`${ANSI_BOLD}${totalText}${ANSI_RESET9}`);
|
|
46851
|
+
lines.push(`${ANSI_GREEN}${validText}${ANSI_RESET9}`);
|
|
46040
46852
|
if (summary.invalidFiles > 0) {
|
|
46041
|
-
lines.push(`${
|
|
46853
|
+
lines.push(`${ANSI_RED4}${invalidText}${ANSI_RESET9}`);
|
|
46042
46854
|
} else {
|
|
46043
46855
|
lines.push(invalidText);
|
|
46044
46856
|
}
|
|
46045
46857
|
if (filesWithWarnings > 0) {
|
|
46046
|
-
lines.push(`${
|
|
46858
|
+
lines.push(`${ANSI_YELLOW9}Files with warnings: ${filesWithWarnings}${ANSI_RESET9}`);
|
|
46047
46859
|
}
|
|
46048
46860
|
} else {
|
|
46049
46861
|
lines.push(totalText);
|
|
@@ -46062,7 +46874,7 @@ function isTTY2() {
|
|
|
46062
46874
|
// src/commands/validate/validate-files.ts
|
|
46063
46875
|
import { constants as constants7 } from "node:fs";
|
|
46064
46876
|
import { access as access7, readdir as readdir3, stat as stat5 } from "node:fs/promises";
|
|
46065
|
-
import
|
|
46877
|
+
import path29 from "node:path";
|
|
46066
46878
|
async function validateFiles(paths) {
|
|
46067
46879
|
const filePaths = await expandPaths(paths);
|
|
46068
46880
|
const results = [];
|
|
@@ -46080,7 +46892,7 @@ async function validateFiles(paths) {
|
|
|
46080
46892
|
};
|
|
46081
46893
|
}
|
|
46082
46894
|
async function validateSingleFile(filePath) {
|
|
46083
|
-
const absolutePath =
|
|
46895
|
+
const absolutePath = path29.resolve(filePath);
|
|
46084
46896
|
const fileType = await detectFileType(absolutePath);
|
|
46085
46897
|
let result;
|
|
46086
46898
|
if (fileType === "eval") {
|
|
@@ -46105,7 +46917,7 @@ async function validateSingleFile(filePath) {
|
|
|
46105
46917
|
async function expandPaths(paths) {
|
|
46106
46918
|
const expanded = [];
|
|
46107
46919
|
for (const inputPath of paths) {
|
|
46108
|
-
const absolutePath =
|
|
46920
|
+
const absolutePath = path29.resolve(inputPath);
|
|
46109
46921
|
try {
|
|
46110
46922
|
await access7(absolutePath, constants7.F_OK);
|
|
46111
46923
|
} catch {
|
|
@@ -46129,7 +46941,7 @@ async function findYamlFiles(dirPath) {
|
|
|
46129
46941
|
try {
|
|
46130
46942
|
const entries = await readdir3(dirPath, { withFileTypes: true });
|
|
46131
46943
|
for (const entry of entries) {
|
|
46132
|
-
const fullPath =
|
|
46944
|
+
const fullPath = path29.join(dirPath, entry.name);
|
|
46133
46945
|
if (entry.isDirectory()) {
|
|
46134
46946
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
46135
46947
|
continue;
|
|
@@ -46146,7 +46958,7 @@ async function findYamlFiles(dirPath) {
|
|
|
46146
46958
|
return results;
|
|
46147
46959
|
}
|
|
46148
46960
|
function isYamlFile(filePath) {
|
|
46149
|
-
const ext =
|
|
46961
|
+
const ext = path29.extname(filePath).toLowerCase();
|
|
46150
46962
|
return ext === ".yaml" || ext === ".yml";
|
|
46151
46963
|
}
|
|
46152
46964
|
|
|
@@ -46206,4 +47018,4 @@ export {
|
|
|
46206
47018
|
app,
|
|
46207
47019
|
runCli
|
|
46208
47020
|
};
|
|
46209
|
-
//# sourceMappingURL=chunk-
|
|
47021
|
+
//# sourceMappingURL=chunk-XREH4WAJ.js.map
|