agentv 4.5.2 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OIBYQMCK.js → chunk-5GZJIXTY.js} +155 -59
- package/dist/chunk-5GZJIXTY.js.map +1 -0
- package/dist/{chunk-7DRAXDVC.js → chunk-KQQTEWZF.js} +111 -47
- package/dist/chunk-KQQTEWZF.js.map +1 -0
- package/dist/{chunk-HF5UGZSZ.js → chunk-U2LSJ6Y4.js} +133 -159
- package/dist/chunk-U2LSJ6Y4.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-VWMHFUXR.js → dist-FBPCDLOY.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-BOJUYBJS.js → interactive-6D3ULOMN.js} +3 -3
- package/dist/studio/assets/{index-vn54AYtS.js → index-D-gfAa3s.js} +1 -1
- package/dist/studio/assets/{index-C7TnyYee.js → index-zWHsVvgi.js} +1 -1
- package/dist/studio/index.html +1 -1
- package/package.json +1 -1
- package/dist/chunk-7DRAXDVC.js.map +0 -1
- package/dist/chunk-HF5UGZSZ.js.map +0 -1
- package/dist/chunk-OIBYQMCK.js.map +0 -1
- /package/dist/{dist-VWMHFUXR.js.map → dist-FBPCDLOY.js.map} +0 -0
- /package/dist/{interactive-BOJUYBJS.js.map → interactive-6D3ULOMN.js.map} +0 -0
|
@@ -29,12 +29,12 @@ import {
|
|
|
29
29
|
subscribeToCopilotCliLogEntries,
|
|
30
30
|
subscribeToCopilotSdkLogEntries,
|
|
31
31
|
subscribeToPiLogEntries
|
|
32
|
-
} from "./chunk-7DRAXDVC.js";
|
|
32
|
+
} from "./chunk-KQQTEWZF.js";
|
|
33
33
|
|
|
34
34
|
// package.json
|
|
35
35
|
var package_default = {
|
|
36
36
|
name: "agentv",
|
|
37
|
-
version: "4.5.2",
|
|
37
|
+
version: "4.6.0",
|
|
38
38
|
description: "CLI entry point for AgentV",
|
|
39
39
|
type: "module",
|
|
40
40
|
repository: {
|
|
@@ -102,9 +102,21 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
102
102
|
if (normalizedInputs.length === 0) {
|
|
103
103
|
throw new Error("No eval paths provided.");
|
|
104
104
|
}
|
|
105
|
+
const includePatterns = [];
|
|
106
|
+
const ignorePatterns = [];
|
|
107
|
+
for (const input of normalizedInputs) {
|
|
108
|
+
if (input.startsWith("!")) {
|
|
109
|
+
ignorePatterns.push(input.slice(1));
|
|
110
|
+
} else {
|
|
111
|
+
includePatterns.push(input);
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
if (includePatterns.length === 0) {
|
|
115
|
+
throw new Error("No eval paths provided (only negation patterns found).");
|
|
116
|
+
}
|
|
105
117
|
const unmatched = [];
|
|
106
118
|
const results = /* @__PURE__ */ new Set();
|
|
107
|
-
for (const pattern of
|
|
119
|
+
for (const pattern of includePatterns) {
|
|
108
120
|
const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
|
|
109
121
|
try {
|
|
110
122
|
const stats = await stat(candidatePath);
|
|
@@ -119,7 +131,8 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
119
131
|
onlyFiles: true,
|
|
120
132
|
unique: true,
|
|
121
133
|
dot: true,
|
|
122
|
-
followSymbolicLinks: true
|
|
134
|
+
followSymbolicLinks: true,
|
|
135
|
+
ignore: ignorePatterns
|
|
123
136
|
});
|
|
124
137
|
if (dirMatches.length === 0) {
|
|
125
138
|
unmatched.push(pattern);
|
|
@@ -139,7 +152,8 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
139
152
|
onlyFiles: true,
|
|
140
153
|
unique: true,
|
|
141
154
|
dot: true,
|
|
142
|
-
followSymbolicLinks: true
|
|
155
|
+
followSymbolicLinks: true,
|
|
156
|
+
ignore: ignorePatterns
|
|
143
157
|
});
|
|
144
158
|
const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath));
|
|
145
159
|
if (yamlMatches.length === 0) {
|
|
@@ -3114,6 +3128,8 @@ var OPENAI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3114
3128
|
"model",
|
|
3115
3129
|
"deployment",
|
|
3116
3130
|
"variant",
|
|
3131
|
+
"api_format",
|
|
3132
|
+
"apiFormat",
|
|
3117
3133
|
"temperature",
|
|
3118
3134
|
"max_output_tokens",
|
|
3119
3135
|
"maxTokens"
|
|
@@ -3506,16 +3522,17 @@ async function validateTargetsFile(filePath) {
|
|
|
3506
3522
|
});
|
|
3507
3523
|
}
|
|
3508
3524
|
const provider = target.provider;
|
|
3525
|
+
const hasUseTarget = typeof target.use_target === "string" && target.use_target.trim().length > 0;
|
|
3509
3526
|
const providerValue = typeof provider === "string" ? provider.trim().toLowerCase() : void 0;
|
|
3510
3527
|
const isTemplated = typeof provider === "string" && /^\$\{\{.+\}\}$/.test(provider.trim());
|
|
3511
|
-
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
3528
|
+
if (!hasUseTarget && (typeof provider !== "string" || provider.trim().length === 0)) {
|
|
3512
3529
|
errors.push({
|
|
3513
3530
|
severity: "error",
|
|
3514
3531
|
filePath: absolutePath,
|
|
3515
3532
|
location: `${location}.provider`,
|
|
3516
|
-
message: "Missing or invalid 'provider' field (must be a non-empty string)"
|
|
3533
|
+
message: "Missing or invalid 'provider' field (must be a non-empty string, or use use_target for delegation)"
|
|
3517
3534
|
});
|
|
3518
|
-
} else if (!isTemplated && !knownProviders.includes(provider)) {
|
|
3535
|
+
} else if (typeof provider === "string" && !isTemplated && !knownProviders.includes(provider)) {
|
|
3519
3536
|
errors.push({
|
|
3520
3537
|
severity: "warning",
|
|
3521
3538
|
filePath: absolutePath,
|
|
@@ -3761,6 +3778,34 @@ var ANSI_RESET3 = "\x1B[0m";
|
|
|
3761
3778
|
// Reports whether stdout is attached to a terminal; yields false when
// `process.stdout.isTTY` is null or undefined.
function isTTY() {
|
|
3762
3779
|
return process.stdout.isTTY ?? false;
|
|
3763
3780
|
}
|
|
3781
|
+
/**
 * Resolves a target name to its definition, following `use_target`
 * delegation until a definition that does not delegate is reached.
 *
 * A `use_target` value of the form `${{ NAME }}` is looked up in `env`;
 * any other value is used as a literal target name. A definition whose
 * `use_target` is missing, blank, or resolves to an empty string ends the
 * chain and is returned as-is.
 *
 * @param {string} name - Name of the target to start from.
 * @param {Array<object>} definitions - Target definitions (each with a `name`).
 * @param {object} env - Environment map used for `${{ NAME }}` references.
 * @param {string} targetsFilePath - Targets file path, used in error messages.
 * @returns {object} The definition at the end of the delegation chain.
 * @throws {Error} If `name` or a delegate is not defined, if the chain is
 *   circular, or if it needs more than 5 delegation hops.
 */
function resolveUseTarget(name, definitions, env, targetsFilePath) {
  const maxDepth = 5;
  const findByName = (targetName) => definitions.find((d) => d.name === targetName);

  // Returns the delegate's name, or "" when `definition` does not delegate.
  const delegateNameOf = (definition) => {
    const useTarget = definition.use_target;
    if (useTarget === void 0 || useTarget === null) return "";
    const raw = String(useTarget).trim();
    const envMatch = raw.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
    const resolved = envMatch ? (env?.[envMatch[1]] ?? "") : raw;
    // String() guards against non-string env values before trimming.
    return String(resolved).trim();
  };

  let current = findByName(name);
  if (!current) {
    const available = listTargetNames(definitions).join(", ");
    throw new Error(
      `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`
    );
  }

  // Names visited so far, in order; length - 1 is the number of hops taken.
  const visited = [current.name];
  while (true) {
    const delegateName = delegateNameOf(current);
    if (delegateName.length === 0) {
      return current;
    }
    if (visited.includes(delegateName)) {
      // Fail loudly instead of returning an arbitrary member of the cycle.
      throw new Error(
        `Target '${name}' has a circular use_target chain in ${targetsFilePath}: ${[...visited, delegateName].join(" -> ")}`
      );
    }
    if (visited.length > maxDepth) {
      // Another hop would exceed the limit; do not return an unresolved delegate.
      throw new Error(
        `Target '${name}' use_target chain exceeds the maximum depth of ${maxDepth} in ${targetsFilePath}: ${visited.join(" -> ")}`
      );
    }
    const next = findByName(delegateName);
    if (!next) {
      const available = listTargetNames(definitions).join(", ");
      throw new Error(
        `Target '${name}' use_target '${delegateName}' not found in ${targetsFilePath}. Available targets: ${available}`
      );
    }
    visited.push(next.name);
    current = next;
  }
}
|
|
3764
3809
|
async function readTestSuiteTarget(testFilePath) {
|
|
3765
3810
|
const metadata = await readTestSuiteMetadata(testFilePath);
|
|
3766
3811
|
return metadata.target;
|
|
@@ -3824,15 +3869,7 @@ Errors in ${targetsFilePath}:`);
|
|
|
3824
3869
|
const definitions = await readTargetDefinitions(targetsFilePath);
|
|
3825
3870
|
const fileTargetName = await readTestSuiteTarget(testFilePath);
|
|
3826
3871
|
const targetChoice = pickTargetName({ cliTargetName, fileTargetName });
|
|
3827
|
-
const targetDefinition = definitions
|
|
3828
|
-
(definition) => definition.name === targetChoice.name
|
|
3829
|
-
);
|
|
3830
|
-
if (!targetDefinition) {
|
|
3831
|
-
const available = listTargetNames(definitions).join(", ");
|
|
3832
|
-
throw new Error(
|
|
3833
|
-
`Target '${targetChoice.name}' not found in ${targetsFilePath}. Available targets: ${available}`
|
|
3834
|
-
);
|
|
3835
|
-
}
|
|
3872
|
+
const targetDefinition = resolveUseTarget(targetChoice.name, definitions, env, targetsFilePath);
|
|
3836
3873
|
if (dryRun) {
|
|
3837
3874
|
const mockTarget = {
|
|
3838
3875
|
kind: "mock",
|
|
@@ -3915,15 +3952,7 @@ Errors in ${targetsFilePath}:`);
|
|
|
3915
3952
|
const definitions = await readTargetDefinitions(targetsFilePath);
|
|
3916
3953
|
const results = [];
|
|
3917
3954
|
for (const name of targetNames) {
|
|
3918
|
-
const targetDefinition = definitions
|
|
3919
|
-
(definition) => definition.name === name
|
|
3920
|
-
);
|
|
3921
|
-
if (!targetDefinition) {
|
|
3922
|
-
const available = listTargetNames(definitions).join(", ");
|
|
3923
|
-
throw new Error(
|
|
3924
|
-
`Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`
|
|
3925
|
-
);
|
|
3926
|
-
}
|
|
3955
|
+
const targetDefinition = resolveUseTarget(name, definitions, env, targetsFilePath);
|
|
3927
3956
|
if (dryRun) {
|
|
3928
3957
|
const mockTarget = {
|
|
3929
3958
|
kind: "mock",
|
|
@@ -4008,6 +4037,24 @@ function normalizeOptionalNumber(value) {
|
|
|
4008
4037
|
// Returns `value` unchanged when it is exactly "pooled", "temp" or "static";
// any other input yields undefined.
function normalizeWorkspaceMode(value) {
|
|
4009
4038
|
return value === "pooled" || value === "temp" || value === "static" ? value : void 0;
|
|
4010
4039
|
}
|
|
4040
|
+
/**
 * Normalizes an option value into a list of non-blank, trimmed strings.
 *
 * Accepts an array (non-string and blank entries are dropped) or a single
 * string (treated as a one-element list). Anything else yields an empty list.
 * Entries are trimmed so that surrounding whitespace cannot prevent an exact
 * match later on.
 *
 * @param {unknown} value - Raw option value.
 * @returns {string[]} Trimmed, non-empty strings in their original order.
 */
function normalizeStringArray(value) {
  let candidates = [];
  if (Array.isArray(value)) {
    candidates = value;
  } else if (typeof value === "string") {
    // A lone string is treated as a single entry rather than discarded.
    candidates = [value];
  }
  return candidates
    .filter((entry) => typeof entry === "string")
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
}
|
|
4046
|
+
/**
 * Decides whether a file's tags satisfy the include/exclude tag filters.
 *
 * The file matches when it carries every tag in `includeTags` and none of
 * the tags in `excludeTags`. Empty filter lists impose no constraint.
 *
 * @param {string[]|string|null|undefined} fileTags - Tags declared by the
 *   file. A single non-blank string counts as one tag; any other non-array
 *   value counts as "no tags".
 * @param {string[]} includeTags - Tags that must all be present.
 * @param {string[]} excludeTags - Tags that must all be absent.
 * @returns {boolean} True when the file passes both filters.
 */
function matchesTagFilters(fileTags, includeTags, excludeTags) {
  let declared = [];
  if (Array.isArray(fileTags)) {
    declared = fileTags;
  } else if (typeof fileTags === "string" && fileTags.trim().length > 0) {
    // Passing a string straight to `new Set` would split it into characters.
    declared = [fileTags.trim()];
  }
  const tags = new Set(declared);
  const hasAllRequired = includeTags.every((required) => tags.has(required));
  const hasAnyExcluded = excludeTags.some((excluded) => tags.has(excluded));
  return hasAllRequired && !hasAnyExcluded;
}
|
|
4011
4058
|
function normalizeOutputMessages(cliValue) {
|
|
4012
4059
|
if (cliValue === void 0) {
|
|
4013
4060
|
return 1;
|
|
@@ -4116,7 +4163,9 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4116
4163
|
graderTarget: normalizeString(rawOptions.graderTarget),
|
|
4117
4164
|
model: normalizeString(rawOptions.model),
|
|
4118
4165
|
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
|
|
4119
|
-
threshold: normalizeOptionalNumber(rawOptions.threshold)
|
|
4166
|
+
threshold: normalizeOptionalNumber(rawOptions.threshold),
|
|
4167
|
+
tags: normalizeStringArray(rawOptions.tag),
|
|
4168
|
+
excludeTags: normalizeStringArray(rawOptions.excludeTag)
|
|
4120
4169
|
};
|
|
4121
4170
|
}
|
|
4122
4171
|
async function ensureFileExists(filePath, description) {
|
|
@@ -4266,7 +4315,8 @@ async function prepareFileMetadata(params) {
|
|
|
4266
4315
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
4267
4316
|
totalBudgetUsd: suite.totalBudgetUsd,
|
|
4268
4317
|
failOnError: suite.failOnError,
|
|
4269
|
-
threshold: suite.threshold
|
|
4318
|
+
threshold: suite.threshold,
|
|
4319
|
+
tags: suite.metadata?.tags
|
|
4270
4320
|
};
|
|
4271
4321
|
}
|
|
4272
4322
|
async function runWithLimit(items, limit, task) {
|
|
@@ -4484,7 +4534,7 @@ async function runEvalCommand(input) {
|
|
|
4484
4534
|
const useFileExport = !!options.otelFile;
|
|
4485
4535
|
if (options.exportOtel || useFileExport) {
|
|
4486
4536
|
try {
|
|
4487
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-VWMHFUXR.js");
|
|
4537
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-FBPCDLOY.js");
|
|
4488
4538
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4489
4539
|
let headers = {};
|
|
4490
4540
|
if (options.otelBackend) {
|
|
@@ -4578,6 +4628,25 @@ async function runEvalCommand(input) {
|
|
|
4578
4628
|
});
|
|
4579
4629
|
fileMetadata.set(testFilePath, meta);
|
|
4580
4630
|
}
|
|
4631
|
+
const hasTagFilters = options.tags.length > 0 || options.excludeTags.length > 0;
|
|
4632
|
+
if (hasTagFilters) {
|
|
4633
|
+
const skippedFiles = [];
|
|
4634
|
+
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
4635
|
+
if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
|
|
4636
|
+
fileMetadata.delete(testFilePath);
|
|
4637
|
+
skippedFiles.push(path15.relative(cwd, testFilePath));
|
|
4638
|
+
}
|
|
4639
|
+
}
|
|
4640
|
+
if (skippedFiles.length > 0 && options.verbose) {
|
|
4641
|
+
console.log(
|
|
4642
|
+
`Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(", ")}`
|
|
4643
|
+
);
|
|
4644
|
+
}
|
|
4645
|
+
if (fileMetadata.size === 0) {
|
|
4646
|
+
console.log("No eval files matched the tag filters. Nothing to run.");
|
|
4647
|
+
return;
|
|
4648
|
+
}
|
|
4649
|
+
}
|
|
4581
4650
|
const firstMeta = fileMetadata.values().next().value;
|
|
4582
4651
|
const yamlCacheEnabled = firstMeta?.yamlCache;
|
|
4583
4652
|
const yamlCachePath = firstMeta?.yamlCachePath;
|
|
@@ -4667,8 +4736,9 @@ async function runEvalCommand(input) {
|
|
|
4667
4736
|
}
|
|
4668
4737
|
}
|
|
4669
4738
|
}
|
|
4739
|
+
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
4670
4740
|
try {
|
|
4671
|
-
await runWithLimit(
|
|
4741
|
+
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
|
|
4672
4742
|
const targetPrep = fileMetadata.get(testFilePath);
|
|
4673
4743
|
if (!targetPrep) {
|
|
4674
4744
|
throw new Error(`Missing metadata for ${testFilePath}`);
|
|
@@ -4685,30 +4755,56 @@ async function runEvalCommand(input) {
|
|
|
4685
4755
|
if (applicableEvalCases.length === 0) {
|
|
4686
4756
|
return [];
|
|
4687
4757
|
}
|
|
4688
|
-
|
|
4689
|
-
|
|
4690
|
-
|
|
4691
|
-
|
|
4692
|
-
|
|
4693
|
-
|
|
4694
|
-
|
|
4695
|
-
|
|
4696
|
-
|
|
4697
|
-
|
|
4698
|
-
|
|
4699
|
-
|
|
4700
|
-
|
|
4701
|
-
|
|
4702
|
-
|
|
4703
|
-
|
|
4704
|
-
|
|
4705
|
-
|
|
4706
|
-
|
|
4707
|
-
|
|
4708
|
-
|
|
4709
|
-
|
|
4710
|
-
|
|
4711
|
-
|
|
4758
|
+
try {
|
|
4759
|
+
const result = await runSingleEvalFile({
|
|
4760
|
+
testFilePath,
|
|
4761
|
+
cwd,
|
|
4762
|
+
repoRoot,
|
|
4763
|
+
options,
|
|
4764
|
+
outputWriter,
|
|
4765
|
+
otelExporter,
|
|
4766
|
+
cache,
|
|
4767
|
+
evaluationRunner,
|
|
4768
|
+
workersOverride: perFileWorkers,
|
|
4769
|
+
yamlWorkers: targetPrep.yamlWorkers,
|
|
4770
|
+
progressReporter,
|
|
4771
|
+
seenEvalCases,
|
|
4772
|
+
displayIdTracker,
|
|
4773
|
+
selection,
|
|
4774
|
+
inlineTargetLabel,
|
|
4775
|
+
evalCases: applicableEvalCases,
|
|
4776
|
+
trialsConfig: targetPrep.trialsConfig,
|
|
4777
|
+
matrixMode: targetPrep.selections.length > 1,
|
|
4778
|
+
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
4779
|
+
failOnError: targetPrep.failOnError,
|
|
4780
|
+
threshold: resolvedThreshold
|
|
4781
|
+
});
|
|
4782
|
+
return result.results;
|
|
4783
|
+
} catch (fileError) {
|
|
4784
|
+
const message = fileError instanceof Error ? fileError.message : String(fileError);
|
|
4785
|
+
console.error(`
|
|
4786
|
+
\u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
|
|
4787
|
+
`);
|
|
4788
|
+
const errorResults = applicableEvalCases.map((evalCase) => ({
|
|
4789
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4790
|
+
testId: evalCase.id,
|
|
4791
|
+
score: 0,
|
|
4792
|
+
assertions: [],
|
|
4793
|
+
output: [],
|
|
4794
|
+
scores: [],
|
|
4795
|
+
error: message,
|
|
4796
|
+
executionStatus: "execution_error",
|
|
4797
|
+
failureStage: "setup",
|
|
4798
|
+
failureReasonCode: "setup_error",
|
|
4799
|
+
durationMs: 0,
|
|
4800
|
+
tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
|
|
4801
|
+
target: selection.targetName
|
|
4802
|
+
}));
|
|
4803
|
+
for (const errResult of errorResults) {
|
|
4804
|
+
await outputWriter.append(errResult);
|
|
4805
|
+
}
|
|
4806
|
+
return errorResults;
|
|
4807
|
+
}
|
|
4712
4808
|
})
|
|
4713
4809
|
);
|
|
4714
4810
|
for (const results of targetResults) {
|
|
@@ -4738,7 +4834,7 @@ async function runEvalCommand(input) {
|
|
|
4738
4834
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
4739
4835
|
}
|
|
4740
4836
|
if (usesDefaultArtifactWorkspace) {
|
|
4741
|
-
const evalFile =
|
|
4837
|
+
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
|
|
4742
4838
|
const workspaceDir = path15.dirname(outputPath);
|
|
4743
4839
|
const {
|
|
4744
4840
|
testArtifactDir,
|
|
@@ -4758,7 +4854,7 @@ async function runEvalCommand(input) {
|
|
|
4758
4854
|
}
|
|
4759
4855
|
if (options.artifacts) {
|
|
4760
4856
|
const artifactsDir = path15.resolve(options.artifacts);
|
|
4761
|
-
const evalFile =
|
|
4857
|
+
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
|
|
4762
4858
|
const {
|
|
4763
4859
|
testArtifactDir,
|
|
4764
4860
|
indexPath,
|
|
@@ -4797,7 +4893,7 @@ Results written to: ${outputPath}`);
|
|
|
4797
4893
|
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
4798
4894
|
}
|
|
4799
4895
|
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
4800
|
-
const evalFileArgs =
|
|
4896
|
+
const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
|
|
4801
4897
|
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
4802
4898
|
const relativeOutputPath = path15.relative(cwd, outputPath);
|
|
4803
4899
|
console.log(
|
|
@@ -4809,7 +4905,7 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
4809
4905
|
return {
|
|
4810
4906
|
executionErrorCount: summary.executionErrorCount,
|
|
4811
4907
|
outputPath,
|
|
4812
|
-
testFiles:
|
|
4908
|
+
testFiles: activeTestFiles,
|
|
4813
4909
|
target: options.target,
|
|
4814
4910
|
thresholdFailed
|
|
4815
4911
|
};
|
|
@@ -4872,4 +4968,4 @@ export {
|
|
|
4872
4968
|
selectTarget,
|
|
4873
4969
|
runEvalCommand
|
|
4874
4970
|
};
|
|
4875
|
-
//# sourceMappingURL=chunk-OIBYQMCK.js.map
|
|
4971
|
+
//# sourceMappingURL=chunk-5GZJIXTY.js.map
|