@agentv/core 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IOCVST3R.js → chunk-YCIZ33BO.js} +28 -11
- package/dist/chunk-YCIZ33BO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +68 -64
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +64 -67
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +137 -85
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -11
- package/dist/index.d.ts +11 -11
- package/dist/index.js +108 -68
- package/dist/index.js.map +1 -1
- package/package.json +15 -16
- package/LICENSE +0 -21
- package/dist/chunk-IOCVST3R.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -125,11 +125,11 @@ function extractCodeBlocks(segments) {
|
|
|
125
125
|
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
126
126
|
const codeBlocks = [];
|
|
127
127
|
for (const segment of segments) {
|
|
128
|
-
const typeValue = segment
|
|
128
|
+
const typeValue = segment.type;
|
|
129
129
|
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
130
130
|
continue;
|
|
131
131
|
}
|
|
132
|
-
const textValue = segment
|
|
132
|
+
const textValue = segment.value;
|
|
133
133
|
if (typeof textValue !== "string") {
|
|
134
134
|
continue;
|
|
135
135
|
}
|
|
@@ -200,9 +200,9 @@ function asString(value) {
|
|
|
200
200
|
}
|
|
201
201
|
|
|
202
202
|
// src/evaluation/loaders/config-loader.ts
|
|
203
|
-
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
204
203
|
var import_promises2 = require("fs/promises");
|
|
205
204
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
205
|
+
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
206
206
|
var import_yaml = require("yaml");
|
|
207
207
|
|
|
208
208
|
// src/evaluation/loaders/file-resolver.ts
|
|
@@ -344,8 +344,9 @@ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
|
344
344
|
guideline_patterns: guidelinePatterns
|
|
345
345
|
};
|
|
346
346
|
} catch (error) {
|
|
347
|
-
logWarning(
|
|
348
|
-
|
|
347
|
+
logWarning(
|
|
348
|
+
`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
|
|
349
|
+
);
|
|
349
350
|
}
|
|
350
351
|
}
|
|
351
352
|
return null;
|
|
@@ -388,9 +389,7 @@ var TEMPLATE_VARIABLES = {
|
|
|
388
389
|
REFERENCE_ANSWER: "reference_answer",
|
|
389
390
|
INPUT_MESSAGES: "input_messages"
|
|
390
391
|
};
|
|
391
|
-
var VALID_TEMPLATE_VARIABLES = new Set(
|
|
392
|
-
Object.values(TEMPLATE_VARIABLES)
|
|
393
|
-
);
|
|
392
|
+
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
394
393
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
395
394
|
TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
|
|
396
395
|
TEMPLATE_VARIABLES.EXPECTED_MESSAGES
|
|
@@ -407,13 +406,14 @@ function validateTemplateVariables(content, source) {
|
|
|
407
406
|
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
|
|
408
407
|
const foundVariables = /* @__PURE__ */ new Set();
|
|
409
408
|
const invalidVariables = [];
|
|
410
|
-
let match;
|
|
411
|
-
while (
|
|
409
|
+
let match = variablePattern.exec(content);
|
|
410
|
+
while (match !== null) {
|
|
412
411
|
const varName = match[1];
|
|
413
412
|
foundVariables.add(varName);
|
|
414
413
|
if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
|
|
415
414
|
invalidVariables.push(varName);
|
|
416
415
|
}
|
|
416
|
+
match = variablePattern.exec(content);
|
|
417
417
|
}
|
|
418
418
|
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
|
|
419
419
|
const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
|
|
@@ -850,7 +850,14 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
|
850
850
|
return messagesWithContent > 1;
|
|
851
851
|
}
|
|
852
852
|
function buildChatPromptFromSegments(options) {
|
|
853
|
-
const {
|
|
853
|
+
const {
|
|
854
|
+
messages,
|
|
855
|
+
segmentsByMessage,
|
|
856
|
+
guidelinePatterns,
|
|
857
|
+
guidelineContent,
|
|
858
|
+
systemPrompt,
|
|
859
|
+
mode = "lm"
|
|
860
|
+
} = options;
|
|
854
861
|
if (messages.length === 0) {
|
|
855
862
|
return void 0;
|
|
856
863
|
}
|
|
@@ -932,7 +939,6 @@ function logWarning4(message) {
|
|
|
932
939
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
933
940
|
var ANSI_RED = "\x1B[31m";
|
|
934
941
|
var ANSI_RESET6 = "\x1B[0m";
|
|
935
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
936
942
|
async function readTestSuiteMetadata(testFilePath) {
|
|
937
943
|
try {
|
|
938
944
|
const absolutePath = import_node_path6.default.resolve(testFilePath);
|
|
@@ -963,12 +969,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
963
969
|
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
964
970
|
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
965
971
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
966
|
-
const schema = suite.$schema;
|
|
967
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
968
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
969
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
970
|
-
throw new Error(message);
|
|
971
|
-
}
|
|
972
972
|
const rawTestcases = suite.evalcases;
|
|
973
973
|
if (!Array.isArray(rawTestcases)) {
|
|
974
974
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
@@ -992,11 +992,15 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
992
992
|
const inputMessagesValue = evalcase.input_messages;
|
|
993
993
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
994
994
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
995
|
-
logError(
|
|
995
|
+
logError(
|
|
996
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
997
|
+
);
|
|
996
998
|
continue;
|
|
997
999
|
}
|
|
998
1000
|
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
999
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1001
|
+
const inputMessages = inputMessagesValue.filter(
|
|
1002
|
+
(msg) => isTestMessage(msg)
|
|
1003
|
+
);
|
|
1000
1004
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1001
1005
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1002
1006
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
@@ -1848,9 +1852,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1848
1852
|
options?.guidelineOverrides
|
|
1849
1853
|
);
|
|
1850
1854
|
const inputFilesList = collectInputFiles(inputFiles);
|
|
1851
|
-
const nonGuidelineInputFiles = inputFilesList.filter(
|
|
1852
|
-
(file) => !guidelineFiles.includes(file)
|
|
1853
|
-
);
|
|
1855
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
1854
1856
|
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
1855
1857
|
if (prereadBlock.length > 0) {
|
|
1856
1858
|
parts.push("\n", prereadBlock);
|
|
@@ -2022,7 +2024,15 @@ var CodexProvider = class {
|
|
|
2022
2024
|
return import_node_path10.default.resolve(this.config.cwd);
|
|
2023
2025
|
}
|
|
2024
2026
|
buildCodexArgs() {
|
|
2025
|
-
const args = [
|
|
2027
|
+
const args = [
|
|
2028
|
+
"--ask-for-approval",
|
|
2029
|
+
"never",
|
|
2030
|
+
"exec",
|
|
2031
|
+
"--json",
|
|
2032
|
+
"--color",
|
|
2033
|
+
"never",
|
|
2034
|
+
"--skip-git-repo-check"
|
|
2035
|
+
];
|
|
2026
2036
|
if (this.config.args && this.config.args.length > 0) {
|
|
2027
2037
|
args.push(...this.config.args);
|
|
2028
2038
|
}
|
|
@@ -2646,7 +2656,14 @@ var MockProvider = class {
|
|
|
2646
2656
|
|
|
2647
2657
|
// src/evaluation/providers/targets.ts
|
|
2648
2658
|
var import_zod = require("zod");
|
|
2649
|
-
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2659
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2660
|
+
"PROMPT",
|
|
2661
|
+
"GUIDELINES",
|
|
2662
|
+
"EVAL_ID",
|
|
2663
|
+
"ATTEMPT",
|
|
2664
|
+
"FILES",
|
|
2665
|
+
"OUTPUT_FILE"
|
|
2666
|
+
]);
|
|
2650
2667
|
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
2651
2668
|
name: import_zod.z.string().min(1, "target name is required"),
|
|
2652
2669
|
provider: import_zod.z.string().min(1, "provider is required"),
|
|
@@ -2891,11 +2908,18 @@ function resolveMockConfig(target) {
|
|
|
2891
2908
|
return { response };
|
|
2892
2909
|
}
|
|
2893
2910
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
2894
|
-
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
2895
|
-
|
|
2896
|
-
|
|
2897
|
-
|
|
2898
|
-
|
|
2911
|
+
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
2912
|
+
target.workspace_template ?? target.workspaceTemplate
|
|
2913
|
+
);
|
|
2914
|
+
const workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
|
|
2915
|
+
workspaceTemplateEnvVar,
|
|
2916
|
+
env,
|
|
2917
|
+
`${target.name} workspace template path`,
|
|
2918
|
+
{
|
|
2919
|
+
allowLiteral: false,
|
|
2920
|
+
optionalEnv: true
|
|
2921
|
+
}
|
|
2922
|
+
) : void 0;
|
|
2899
2923
|
const commandSource = target.vscode_cmd ?? target.command;
|
|
2900
2924
|
const waitSource = target.wait;
|
|
2901
2925
|
const dryRunSource = target.dry_run ?? target.dryRun;
|
|
@@ -2922,7 +2946,10 @@ function resolveCliConfig(target, env) {
|
|
|
2922
2946
|
allowLiteral: true,
|
|
2923
2947
|
optionalEnv: true
|
|
2924
2948
|
});
|
|
2925
|
-
const timeoutMs = resolveTimeoutMs(
|
|
2949
|
+
const timeoutMs = resolveTimeoutMs(
|
|
2950
|
+
target.timeout_seconds ?? target.timeoutSeconds,
|
|
2951
|
+
`${target.name} timeout`
|
|
2952
|
+
);
|
|
2926
2953
|
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
2927
2954
|
const commandTemplate = resolveString(
|
|
2928
2955
|
commandTemplateSource,
|
|
@@ -3050,7 +3077,9 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
3050
3077
|
}
|
|
3051
3078
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
3052
3079
|
if (!allowLiteral) {
|
|
3053
|
-
throw new Error(
|
|
3080
|
+
throw new Error(
|
|
3081
|
+
`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`
|
|
3082
|
+
);
|
|
3054
3083
|
}
|
|
3055
3084
|
return trimmed;
|
|
3056
3085
|
}
|
|
@@ -3274,9 +3303,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
3274
3303
|
}
|
|
3275
3304
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
3276
3305
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
3277
|
-
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
3278
|
-
(file) => !guidelineFiles.includes(file)
|
|
3279
|
-
);
|
|
3306
|
+
const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
|
|
3280
3307
|
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
|
|
3281
3308
|
if (prereadBlock.length > 0) {
|
|
3282
3309
|
parts.push("\n", prereadBlock);
|
|
@@ -3385,8 +3412,10 @@ async function ensureVSCodeSubagents(options) {
|
|
|
3385
3412
|
if (result.skippedExisting.length > 0) {
|
|
3386
3413
|
console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
|
|
3387
3414
|
}
|
|
3388
|
-
console.log(
|
|
3389
|
-
|
|
3415
|
+
console.log(
|
|
3416
|
+
`
|
|
3417
|
+
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`
|
|
3418
|
+
);
|
|
3390
3419
|
}
|
|
3391
3420
|
return {
|
|
3392
3421
|
provisioned: true,
|
|
@@ -3409,43 +3438,9 @@ var import_node_fs4 = require("fs");
|
|
|
3409
3438
|
var import_promises10 = require("fs/promises");
|
|
3410
3439
|
var import_node_path12 = __toESM(require("path"), 1);
|
|
3411
3440
|
var import_yaml3 = require("yaml");
|
|
3412
|
-
|
|
3413
|
-
// src/evaluation/providers/types.ts
|
|
3414
|
-
var AGENT_PROVIDER_KINDS = [
|
|
3415
|
-
"codex",
|
|
3416
|
-
"vscode",
|
|
3417
|
-
"vscode-insiders"
|
|
3418
|
-
];
|
|
3419
|
-
var TARGETS_SCHEMA_V2 = "agentv-targets-v2.2";
|
|
3420
|
-
function isAgentProvider(provider) {
|
|
3421
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
3422
|
-
}
|
|
3423
|
-
|
|
3424
|
-
// src/evaluation/providers/targets-file.ts
|
|
3425
3441
|
function isRecord(value) {
|
|
3426
3442
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3427
3443
|
}
|
|
3428
|
-
function checkSchema(parsed, absolutePath) {
|
|
3429
|
-
const schema = parsed.$schema;
|
|
3430
|
-
if (schema === void 0) {
|
|
3431
|
-
throw new Error(
|
|
3432
|
-
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
3433
|
-
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
3434
|
-
);
|
|
3435
|
-
}
|
|
3436
|
-
if (typeof schema !== "string") {
|
|
3437
|
-
throw new Error(
|
|
3438
|
-
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
3439
|
-
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
3440
|
-
);
|
|
3441
|
-
}
|
|
3442
|
-
if (schema !== TARGETS_SCHEMA_V2) {
|
|
3443
|
-
throw new Error(
|
|
3444
|
-
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
3445
|
-
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
3446
|
-
);
|
|
3447
|
-
}
|
|
3448
|
-
}
|
|
3449
3444
|
function extractTargetsArray(parsed, absolutePath) {
|
|
3450
3445
|
const targets = parsed.targets;
|
|
3451
3446
|
if (!Array.isArray(targets)) {
|
|
@@ -3460,7 +3455,9 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3460
3455
|
const name = value.name;
|
|
3461
3456
|
const provider = value.provider;
|
|
3462
3457
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
3463
|
-
throw new Error(
|
|
3458
|
+
throw new Error(
|
|
3459
|
+
`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
|
|
3460
|
+
);
|
|
3464
3461
|
}
|
|
3465
3462
|
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
3466
3463
|
throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
|
|
@@ -3483,11 +3480,12 @@ async function readTargetDefinitions(filePath) {
|
|
|
3483
3480
|
const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
|
|
3484
3481
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
3485
3482
|
if (!isRecord(parsed)) {
|
|
3486
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with
|
|
3483
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
3487
3484
|
}
|
|
3488
|
-
checkSchema(parsed, absolutePath);
|
|
3489
3485
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
3490
|
-
const definitions = targets.map(
|
|
3486
|
+
const definitions = targets.map(
|
|
3487
|
+
(entry, index) => assertTargetDefinition(entry, index, absolutePath)
|
|
3488
|
+
);
|
|
3491
3489
|
return definitions;
|
|
3492
3490
|
}
|
|
3493
3491
|
function listTargetNames(definitions) {
|
|
@@ -3564,7 +3562,11 @@ var LlmJudgeEvaluator = class {
|
|
|
3564
3562
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3565
3563
|
const variables = {
|
|
3566
3564
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3567
|
-
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
3565
|
+
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
3566
|
+
context.evalCase.expected_segments,
|
|
3567
|
+
null,
|
|
3568
|
+
2
|
|
3569
|
+
),
|
|
3568
3570
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3569
3571
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3570
3572
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -3810,7 +3812,7 @@ var import_node_crypto2 = require("crypto");
|
|
|
3810
3812
|
var import_promises11 = require("fs/promises");
|
|
3811
3813
|
var import_node_path13 = __toESM(require("path"), 1);
|
|
3812
3814
|
|
|
3813
|
-
// ../../node_modules/.
|
|
3815
|
+
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
3814
3816
|
var Node = class {
|
|
3815
3817
|
value;
|
|
3816
3818
|
next;
|
|
@@ -3843,6 +3845,9 @@ var Queue = class {
|
|
|
3843
3845
|
}
|
|
3844
3846
|
this.#head = this.#head.next;
|
|
3845
3847
|
this.#size--;
|
|
3848
|
+
if (!this.#head) {
|
|
3849
|
+
this.#tail = void 0;
|
|
3850
|
+
}
|
|
3846
3851
|
return current.value;
|
|
3847
3852
|
}
|
|
3848
3853
|
peek() {
|
|
@@ -3873,7 +3878,7 @@ var Queue = class {
|
|
|
3873
3878
|
}
|
|
3874
3879
|
};
|
|
3875
3880
|
|
|
3876
|
-
// ../../node_modules/.
|
|
3881
|
+
// ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
|
|
3877
3882
|
function pLimit(concurrency) {
|
|
3878
3883
|
validateConcurrency(concurrency);
|
|
3879
3884
|
const queue = new Queue();
|
|
@@ -3946,6 +3951,16 @@ function validateConcurrency(concurrency) {
|
|
|
3946
3951
|
}
|
|
3947
3952
|
}
|
|
3948
3953
|
|
|
3954
|
+
// src/evaluation/providers/types.ts
|
|
3955
|
+
var AGENT_PROVIDER_KINDS = [
|
|
3956
|
+
"codex",
|
|
3957
|
+
"vscode",
|
|
3958
|
+
"vscode-insiders"
|
|
3959
|
+
];
|
|
3960
|
+
function isAgentProvider(provider) {
|
|
3961
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
3962
|
+
}
|
|
3963
|
+
|
|
3949
3964
|
// src/evaluation/orchestrator.ts
|
|
3950
3965
|
async function runEvaluation(options) {
|
|
3951
3966
|
const {
|
|
@@ -4049,7 +4064,9 @@ async function runEvaluation(options) {
|
|
|
4049
4064
|
} catch (error) {
|
|
4050
4065
|
if (verbose) {
|
|
4051
4066
|
const message = error instanceof Error ? error.message : String(error);
|
|
4052
|
-
console.warn(
|
|
4067
|
+
console.warn(
|
|
4068
|
+
`Provider batch execution failed, falling back to per-case dispatch: ${message}`
|
|
4069
|
+
);
|
|
4053
4070
|
}
|
|
4054
4071
|
}
|
|
4055
4072
|
}
|
|
@@ -4213,7 +4230,14 @@ async function runBatchEvaluation(options) {
|
|
|
4213
4230
|
agentTimeoutMs
|
|
4214
4231
|
});
|
|
4215
4232
|
} catch (error) {
|
|
4216
|
-
const errorResult = buildErrorResult(
|
|
4233
|
+
const errorResult = buildErrorResult(
|
|
4234
|
+
evalCase,
|
|
4235
|
+
target.name,
|
|
4236
|
+
nowFn(),
|
|
4237
|
+
error,
|
|
4238
|
+
promptInputs,
|
|
4239
|
+
provider
|
|
4240
|
+
);
|
|
4217
4241
|
results.push(errorResult);
|
|
4218
4242
|
if (onResult) {
|
|
4219
4243
|
await onResult(errorResult);
|
|
@@ -4391,7 +4415,18 @@ async function evaluateCandidate(options) {
|
|
|
4391
4415
|
};
|
|
4392
4416
|
}
|
|
4393
4417
|
async function runEvaluatorsForCase(options) {
|
|
4394
|
-
const {
|
|
4418
|
+
const {
|
|
4419
|
+
evalCase,
|
|
4420
|
+
candidate,
|
|
4421
|
+
target,
|
|
4422
|
+
provider,
|
|
4423
|
+
evaluators,
|
|
4424
|
+
attempt,
|
|
4425
|
+
promptInputs,
|
|
4426
|
+
now,
|
|
4427
|
+
judgeProvider,
|
|
4428
|
+
agentTimeoutMs
|
|
4429
|
+
} = options;
|
|
4395
4430
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4396
4431
|
return runEvaluatorList({
|
|
4397
4432
|
evalCase,
|
|
@@ -4492,7 +4527,6 @@ async function runEvaluatorList(options) {
|
|
|
4492
4527
|
reasoning: score2.reasoning,
|
|
4493
4528
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4494
4529
|
});
|
|
4495
|
-
continue;
|
|
4496
4530
|
}
|
|
4497
4531
|
} catch (error) {
|
|
4498
4532
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -4503,7 +4537,11 @@ async function runEvaluatorList(options) {
|
|
|
4503
4537
|
expectedAspectCount: 1,
|
|
4504
4538
|
reasoning: message
|
|
4505
4539
|
};
|
|
4506
|
-
scored.push({
|
|
4540
|
+
scored.push({
|
|
4541
|
+
score: fallbackScore,
|
|
4542
|
+
name: evaluator.name ?? "unknown",
|
|
4543
|
+
type: evaluator.type ?? "unknown"
|
|
4544
|
+
});
|
|
4507
4545
|
evaluatorResults.push({
|
|
4508
4546
|
name: evaluator.name ?? "unknown",
|
|
4509
4547
|
type: evaluator.type ?? "unknown",
|
|
@@ -4517,7 +4555,10 @@ async function runEvaluatorList(options) {
|
|
|
4517
4555
|
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
4518
4556
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
4519
4557
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
4520
|
-
const expectedAspectCount = scored.reduce(
|
|
4558
|
+
const expectedAspectCount = scored.reduce(
|
|
4559
|
+
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
4560
|
+
0
|
|
4561
|
+
);
|
|
4521
4562
|
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
4522
4563
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
4523
4564
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
@@ -4532,7 +4573,18 @@ async function runEvaluatorList(options) {
|
|
|
4532
4573
|
return { score, evaluatorResults };
|
|
4533
4574
|
}
|
|
4534
4575
|
async function runLlmJudgeEvaluator(options) {
|
|
4535
|
-
const {
|
|
4576
|
+
const {
|
|
4577
|
+
config,
|
|
4578
|
+
evalCase,
|
|
4579
|
+
candidate,
|
|
4580
|
+
target,
|
|
4581
|
+
provider,
|
|
4582
|
+
evaluatorRegistry,
|
|
4583
|
+
attempt,
|
|
4584
|
+
promptInputs,
|
|
4585
|
+
now,
|
|
4586
|
+
judgeProvider
|
|
4587
|
+
} = options;
|
|
4536
4588
|
const customPrompt = await resolveCustomPrompt(config);
|
|
4537
4589
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
4538
4590
|
evalCase,
|