@agentv/core 4.5.1 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-M65PVDQ5.js → chunk-AIQ5FO4G.js} +27 -5
- package/dist/chunk-AIQ5FO4G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +15 -6
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +7 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +108 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +25 -3
- package/dist/index.d.ts +25 -3
- package/dist/index.js +83 -41
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-M65PVDQ5.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -2585,6 +2585,7 @@ function validateTemplateVariables(content, source) {
|
|
|
2585
2585
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
2586
2586
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
2587
2587
|
var ANSI_RESET5 = "\x1B[0m";
|
|
2588
|
+
var PROMPT_FILE_PREFIX = "file://";
|
|
2588
2589
|
/**
 * Produce the hyphenated spelling of an evaluator type name.
 *
 * Every underscore in `type` is replaced by a hyphen; all other characters
 * are left untouched (for example "a_b_c" becomes "a-b-c").
 *
 * @param {string} type - Evaluator type name as written by the user.
 * @returns {string} The same name with each "_" swapped for "-".
 */
function normalizeEvaluatorType(type) {
  // Splitting on "_" and re-joining with "-" swaps every occurrence.
  const segments = type.split("_");
  return segments.join("-");
}
|
|
@@ -2883,12 +2884,23 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2883
2884
|
threshold: thresholdValue
|
|
2884
2885
|
};
|
|
2885
2886
|
} else {
|
|
2886
|
-
const
|
|
2887
|
+
const rawAggPrompt = asString(rawAggregator.prompt);
|
|
2888
|
+
let aggregatorPrompt;
|
|
2887
2889
|
let promptPath2;
|
|
2888
|
-
if (
|
|
2889
|
-
|
|
2890
|
-
|
|
2891
|
-
|
|
2890
|
+
if (rawAggPrompt) {
|
|
2891
|
+
if (rawAggPrompt.startsWith(PROMPT_FILE_PREFIX)) {
|
|
2892
|
+
const fileRef = rawAggPrompt.slice(PROMPT_FILE_PREFIX.length);
|
|
2893
|
+
aggregatorPrompt = fileRef;
|
|
2894
|
+
const resolved = await resolveFileReference2(fileRef, searchRoots);
|
|
2895
|
+
if (resolved.resolvedPath) {
|
|
2896
|
+
promptPath2 = import_node_path5.default.resolve(resolved.resolvedPath);
|
|
2897
|
+
} else {
|
|
2898
|
+
throw new Error(
|
|
2899
|
+
`Composite aggregator in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
2900
|
+
);
|
|
2901
|
+
}
|
|
2902
|
+
} else {
|
|
2903
|
+
aggregatorPrompt = rawAggPrompt;
|
|
2892
2904
|
}
|
|
2893
2905
|
}
|
|
2894
2906
|
aggregator = {
|
|
@@ -3468,21 +3480,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3468
3480
|
promptScriptConfig = rawPrompt.config;
|
|
3469
3481
|
}
|
|
3470
3482
|
} else if (typeof rawPrompt === "string") {
|
|
3471
|
-
|
|
3472
|
-
|
|
3473
|
-
|
|
3474
|
-
|
|
3475
|
-
|
|
3476
|
-
|
|
3477
|
-
|
|
3478
|
-
|
|
3479
|
-
|
|
3483
|
+
if (rawPrompt.startsWith(PROMPT_FILE_PREFIX)) {
|
|
3484
|
+
const fileRef = rawPrompt.slice(PROMPT_FILE_PREFIX.length);
|
|
3485
|
+
prompt = fileRef;
|
|
3486
|
+
const resolved = await resolveFileReference2(fileRef, searchRoots);
|
|
3487
|
+
if (resolved.resolvedPath) {
|
|
3488
|
+
promptPath = import_node_path5.default.resolve(resolved.resolvedPath);
|
|
3489
|
+
try {
|
|
3490
|
+
await validateCustomPromptContent(promptPath);
|
|
3491
|
+
} catch (error) {
|
|
3492
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3493
|
+
throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
|
|
3494
|
+
}
|
|
3495
|
+
} else {
|
|
3496
|
+
throw new Error(
|
|
3497
|
+
`Evaluator '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
3498
|
+
);
|
|
3480
3499
|
}
|
|
3481
3500
|
} else {
|
|
3482
|
-
|
|
3483
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
3484
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
3485
|
-
);
|
|
3501
|
+
prompt = rawPrompt;
|
|
3486
3502
|
}
|
|
3487
3503
|
}
|
|
3488
3504
|
const _model = asString(rawEvaluator.model);
|
|
@@ -5519,7 +5535,7 @@ var OpenAIProvider = class {
|
|
|
5519
5535
|
apiKey: config.apiKey,
|
|
5520
5536
|
baseURL: config.baseURL
|
|
5521
5537
|
});
|
|
5522
|
-
this.model = openai(config.model);
|
|
5538
|
+
this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
|
|
5523
5539
|
}
|
|
5524
5540
|
id;
|
|
5525
5541
|
kind = "openai";
|
|
@@ -10752,21 +10768,27 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
10752
10768
|
"OUTPUT_FILE"
|
|
10753
10769
|
]);
|
|
10754
10770
|
var COMMON_TARGET_SETTINGS = [
|
|
10771
|
+
"use_target",
|
|
10755
10772
|
"provider_batching",
|
|
10756
10773
|
"providerBatching",
|
|
10757
10774
|
"subagent_mode_allowed",
|
|
10758
|
-
"subagentModeAllowed"
|
|
10775
|
+
"subagentModeAllowed",
|
|
10776
|
+
"fallback_targets",
|
|
10777
|
+
"fallbackTargets"
|
|
10759
10778
|
];
|
|
10760
10779
|
var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
10761
10780
|
name: import_zod3.z.string().min(1, "target name is required"),
|
|
10762
|
-
provider: import_zod3.z.string().
|
|
10781
|
+
provider: import_zod3.z.string().optional(),
|
|
10782
|
+
use_target: import_zod3.z.string().optional(),
|
|
10763
10783
|
grader_target: import_zod3.z.string().optional(),
|
|
10764
10784
|
judge_target: import_zod3.z.string().optional(),
|
|
10765
10785
|
// backward compat
|
|
10766
10786
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10767
10787
|
workspace_template: import_zod3.z.string().optional(),
|
|
10768
10788
|
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10769
|
-
subagent_mode_allowed: import_zod3.z.boolean().optional()
|
|
10789
|
+
subagent_mode_allowed: import_zod3.z.boolean().optional(),
|
|
10790
|
+
fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional(),
|
|
10791
|
+
fallbackTargets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
10770
10792
|
}).passthrough();
|
|
10771
10793
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
10772
10794
|
var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
|
@@ -10820,6 +10842,11 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10820
10842
|
`${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
10821
10843
|
);
|
|
10822
10844
|
}
|
|
10845
|
+
if (!parsed.provider) {
|
|
10846
|
+
throw new Error(
|
|
10847
|
+
`${parsed.name}: 'provider' is required (targets with use_target must be resolved before calling resolveTargetDefinition)`
|
|
10848
|
+
);
|
|
10849
|
+
}
|
|
10823
10850
|
const provider = resolveString(
|
|
10824
10851
|
parsed.provider,
|
|
10825
10852
|
env,
|
|
@@ -10832,12 +10859,14 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10832
10859
|
const subagentModeAllowed = resolveOptionalBoolean(
|
|
10833
10860
|
parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
|
|
10834
10861
|
);
|
|
10862
|
+
const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
|
|
10835
10863
|
const base = {
|
|
10836
10864
|
name: parsed.name,
|
|
10837
10865
|
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10838
10866
|
workers: parsed.workers,
|
|
10839
10867
|
providerBatching,
|
|
10840
|
-
subagentModeAllowed
|
|
10868
|
+
subagentModeAllowed,
|
|
10869
|
+
...fallbackTargets ? { fallbackTargets } : {}
|
|
10841
10870
|
};
|
|
10842
10871
|
switch (provider) {
|
|
10843
10872
|
case "openai":
|
|
@@ -11011,6 +11040,14 @@ function resolveAzureConfig(target, env) {
|
|
|
11011
11040
|
retry
|
|
11012
11041
|
};
|
|
11013
11042
|
}
|
|
11043
|
+
/**
 * Read and validate the optional API-format setting of a target definition.
 *
 * The setting may be spelled `api_format` or `apiFormat`; `api_format` wins
 * when both are present and non-null.
 *
 * @param {object} target - Target definition to read the setting from.
 * @param {string} targetName - Target name, used only in the error message.
 * @returns {"chat" | "responses" | undefined} The configured format, or
 *   `undefined` when neither spelling carries a value.
 * @throws {Error} If a value is present but is not 'chat' or 'responses'.
 */
function resolveApiFormat(target, targetName) {
  const raw = target.api_format ?? target.apiFormat;
  // `??` already skips a null `api_format`, so a null `apiFormat` must be
  // treated as "unset" too; otherwise the two spellings behave differently
  // (one silently ignored, the other rejected as "Invalid api_format 'null'").
  if (raw == null) return void 0;
  if (raw === "chat" || raw === "responses") return raw;
  throw new Error(
    `Invalid api_format '${raw}' for target '${targetName}'. Must be 'chat' or 'responses'.`
  );
}
|
|
11014
11051
|
function resolveOpenAIConfig(target, env) {
|
|
11015
11052
|
const endpointSource = target.endpoint ?? target.base_url ?? target.baseUrl;
|
|
11016
11053
|
const apiKeySource = target.api_key ?? target.apiKey;
|
|
@@ -11030,6 +11067,7 @@ function resolveOpenAIConfig(target, env) {
|
|
|
11030
11067
|
baseURL,
|
|
11031
11068
|
apiKey,
|
|
11032
11069
|
model,
|
|
11070
|
+
apiFormat: resolveApiFormat(target, target.name),
|
|
11033
11071
|
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
11034
11072
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
11035
11073
|
retry
|
|
@@ -13364,8 +13402,11 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
13364
13402
|
`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
|
|
13365
13403
|
);
|
|
13366
13404
|
}
|
|
13367
|
-
|
|
13368
|
-
|
|
13405
|
+
const hasUseTarget = typeof value.use_target === "string" && value.use_target.trim().length > 0;
|
|
13406
|
+
if (!hasUseTarget && (typeof provider !== "string" || provider.trim().length === 0)) {
|
|
13407
|
+
throw new Error(
|
|
13408
|
+
`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider' (or use use_target for delegation)`
|
|
13409
|
+
);
|
|
13369
13410
|
}
|
|
13370
13411
|
return value;
|
|
13371
13412
|
}
|
|
@@ -18818,10 +18859,20 @@ async function runEvaluation(options) {
|
|
|
18818
18859
|
if (resolvedTargetsByName.has(name)) {
|
|
18819
18860
|
return resolvedTargetsByName.get(name);
|
|
18820
18861
|
}
|
|
18821
|
-
|
|
18862
|
+
let definition = targetDefinitions.get(name);
|
|
18822
18863
|
if (!definition) {
|
|
18823
18864
|
return void 0;
|
|
18824
18865
|
}
|
|
18866
|
+
for (let depth = 0; depth < 5; depth++) {
|
|
18867
|
+
const useTarget = definition.use_target;
|
|
18868
|
+
if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
|
|
18869
|
+
const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
18870
|
+
const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
|
|
18871
|
+
if (resolvedName.length === 0) break;
|
|
18872
|
+
const next = targetDefinitions.get(resolvedName);
|
|
18873
|
+
if (!next) break;
|
|
18874
|
+
definition = next;
|
|
18875
|
+
}
|
|
18825
18876
|
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
18826
18877
|
resolvedTargetsByName.set(name, resolved);
|
|
18827
18878
|
return resolved;
|
|
@@ -19826,6 +19877,7 @@ async function runEvalCase(options) {
|
|
|
19826
19877
|
let attempt = 0;
|
|
19827
19878
|
let providerResponse = cachedResponse;
|
|
19828
19879
|
let lastError;
|
|
19880
|
+
let targetUsed;
|
|
19829
19881
|
while (!providerResponse && attempt < attemptBudget) {
|
|
19830
19882
|
try {
|
|
19831
19883
|
providerResponse = await invokeProvider(provider, {
|
|
@@ -19848,25 +19900,33 @@ async function runEvalCase(options) {
|
|
|
19848
19900
|
attempt += 1;
|
|
19849
19901
|
continue;
|
|
19850
19902
|
}
|
|
19851
|
-
|
|
19852
|
-
|
|
19853
|
-
|
|
19854
|
-
|
|
19855
|
-
|
|
19856
|
-
|
|
19857
|
-
|
|
19858
|
-
|
|
19859
|
-
|
|
19860
|
-
|
|
19861
|
-
|
|
19862
|
-
|
|
19863
|
-
|
|
19864
|
-
|
|
19865
|
-
|
|
19866
|
-
|
|
19867
|
-
|
|
19903
|
+
break;
|
|
19904
|
+
}
|
|
19905
|
+
}
|
|
19906
|
+
if (!providerResponse && target.fallbackTargets?.length && targetResolver) {
|
|
19907
|
+
for (const fallbackName of target.fallbackTargets) {
|
|
19908
|
+
const fallbackProvider = targetResolver(fallbackName);
|
|
19909
|
+
if (!fallbackProvider) {
|
|
19910
|
+
continue;
|
|
19911
|
+
}
|
|
19912
|
+
try {
|
|
19913
|
+
providerResponse = await invokeProvider(fallbackProvider, {
|
|
19914
|
+
evalCase,
|
|
19915
|
+
target,
|
|
19916
|
+
promptInputs,
|
|
19917
|
+
attempt: 0,
|
|
19918
|
+
agentTimeoutMs,
|
|
19919
|
+
signal,
|
|
19920
|
+
cwd: workspacePath,
|
|
19921
|
+
workspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
|
|
19922
|
+
captureFileChanges: !!baselineCommit,
|
|
19923
|
+
streamCallbacks: options.streamCallbacks
|
|
19924
|
+
});
|
|
19925
|
+
targetUsed = fallbackName;
|
|
19926
|
+
break;
|
|
19927
|
+
} catch (error) {
|
|
19928
|
+
lastError = error;
|
|
19868
19929
|
}
|
|
19869
|
-
return errorResult;
|
|
19870
19930
|
}
|
|
19871
19931
|
}
|
|
19872
19932
|
if (!providerResponse) {
|
|
@@ -19992,8 +20052,10 @@ async function runEvalCase(options) {
|
|
|
19992
20052
|
};
|
|
19993
20053
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
19994
20054
|
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
|
|
20055
|
+
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
19995
20056
|
const finalResult = providerError ? {
|
|
19996
20057
|
...result,
|
|
20058
|
+
...targetUsedField,
|
|
19997
20059
|
evalRun,
|
|
19998
20060
|
error: providerError,
|
|
19999
20061
|
executionStatus,
|
|
@@ -20005,6 +20067,7 @@ async function runEvalCase(options) {
|
|
|
20005
20067
|
afterEachOutput
|
|
20006
20068
|
} : skippedEvaluatorError ? {
|
|
20007
20069
|
...result,
|
|
20070
|
+
...targetUsedField,
|
|
20008
20071
|
score: 0,
|
|
20009
20072
|
evalRun,
|
|
20010
20073
|
error: skippedEvaluatorError,
|
|
@@ -20017,6 +20080,7 @@ async function runEvalCase(options) {
|
|
|
20017
20080
|
afterEachOutput
|
|
20018
20081
|
} : {
|
|
20019
20082
|
...result,
|
|
20083
|
+
...targetUsedField,
|
|
20020
20084
|
evalRun,
|
|
20021
20085
|
executionStatus,
|
|
20022
20086
|
beforeAllOutput,
|