agentv 2.12.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -14
- package/dist/{chunk-YBJX5CP6.js → chunk-K2APOWTE.js} +213 -29
- package/dist/chunk-K2APOWTE.js.map +1 -0
- package/dist/{chunk-LUHCYBMD.js → chunk-OQN2GDEU.js} +251 -164
- package/dist/chunk-OQN2GDEU.js.map +1 -0
- package/dist/{chunk-6KU2ZUFJ.js → chunk-ZSSGXZX6.js} +39 -77
- package/dist/chunk-ZSSGXZX6.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-OPPA4P5R.js → dist-QR5OZ4DH.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-TOUKPSHP.js → interactive-WF6UO63B.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-6KU2ZUFJ.js.map +0 -1
- package/dist/chunk-LUHCYBMD.js.map +0 -1
- package/dist/chunk-YBJX5CP6.js.map +0 -1
- /package/dist/{dist-OPPA4P5R.js.map → dist-QR5OZ4DH.js.map} +0 -0
- /package/dist/{interactive-TOUKPSHP.js.map → interactive-WF6UO63B.js.map} +0 -0
package/README.md
CHANGED
|
@@ -60,7 +60,7 @@ tests:
|
|
|
60
60
|
|
|
61
61
|
assert:
|
|
62
62
|
- name: math_check
|
|
63
|
-
type:
|
|
63
|
+
type: code-judge
|
|
64
64
|
script: ./validators/check_math.py
|
|
65
65
|
```
|
|
66
66
|
|
|
@@ -154,10 +154,10 @@ Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.json
|
|
|
154
154
|
description: Math evaluation dataset
|
|
155
155
|
dataset: math-tests
|
|
156
156
|
execution:
|
|
157
|
-
target:
|
|
157
|
+
target: azure-base
|
|
158
158
|
assert:
|
|
159
159
|
- name: correctness
|
|
160
|
-
type:
|
|
160
|
+
type: llm-judge
|
|
161
161
|
prompt: ./judges/correctness.md
|
|
162
162
|
```
|
|
163
163
|
|
|
@@ -175,7 +175,7 @@ agentv validate evals/my-eval.yaml
|
|
|
175
175
|
agentv eval evals/my-eval.yaml
|
|
176
176
|
|
|
177
177
|
# Override target
|
|
178
|
-
agentv eval --target
|
|
178
|
+
agentv eval --target azure-base evals/**/*.yaml
|
|
179
179
|
|
|
180
180
|
# Run specific test
|
|
181
181
|
agentv eval --test-id case-123 evals/my-eval.yaml
|
|
@@ -219,7 +219,7 @@ Reference evaluators in your eval file:
|
|
|
219
219
|
```yaml
|
|
220
220
|
assert:
|
|
221
221
|
- name: my_validator
|
|
222
|
-
type:
|
|
222
|
+
type: code-judge
|
|
223
223
|
script: ./validators/check_answer.py
|
|
224
224
|
```
|
|
225
225
|
|
|
@@ -339,7 +339,7 @@ Define execution targets in `.agentv/targets.yaml` to decouple evals from provid
|
|
|
339
339
|
|
|
340
340
|
```yaml
|
|
341
341
|
targets:
|
|
342
|
-
- name:
|
|
342
|
+
- name: azure-base
|
|
343
343
|
provider: azure
|
|
344
344
|
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
|
|
345
345
|
api_key: ${{ AZURE_OPENAI_API_KEY }}
|
|
@@ -348,12 +348,12 @@ targets:
|
|
|
348
348
|
- name: vscode_dev
|
|
349
349
|
provider: vscode
|
|
350
350
|
workspace_template: ${{ WORKSPACE_PATH }}
|
|
351
|
-
judge_target:
|
|
351
|
+
judge_target: azure-base
|
|
352
352
|
|
|
353
353
|
- name: local_agent
|
|
354
354
|
provider: cli
|
|
355
355
|
command: 'python agent.py --prompt-file {PROMPT_FILE} --output {OUTPUT_FILE}'
|
|
356
|
-
judge_target:
|
|
356
|
+
judge_target: azure-base
|
|
357
357
|
```
|
|
358
358
|
|
|
359
359
|
Supports: `azure`, `anthropic`, `gemini`, `codex`, `copilot`, `pi-coding-agent`, `claude`, `vscode`, `vscode-insiders`, `cli`, and `mock`.
|
|
@@ -398,12 +398,12 @@ All assertions support `weight`, `required`, and `negate` flags. Use `negate: tr
|
|
|
398
398
|
```yaml
|
|
399
399
|
assert:
|
|
400
400
|
# Case-insensitive matching for natural language variation
|
|
401
|
-
- type:
|
|
401
|
+
- type: icontains-any
|
|
402
402
|
value: ["missing rule code", "need rule code", "provide rule code"]
|
|
403
403
|
required: true
|
|
404
404
|
|
|
405
405
|
# Multiple required terms
|
|
406
|
-
- type:
|
|
406
|
+
- type: icontains-all
|
|
407
407
|
value: ["country code", "rule codes"]
|
|
408
408
|
|
|
409
409
|
# Case-insensitive regex
|
|
@@ -423,10 +423,10 @@ targets:
|
|
|
423
423
|
# Agent target — requires judge_target for LLM-based evaluation
|
|
424
424
|
- name: codex_local
|
|
425
425
|
provider: codex
|
|
426
|
-
judge_target:
|
|
426
|
+
judge_target: azure-base # Required: LLM provider for judging
|
|
427
427
|
|
|
428
428
|
# LLM target — no judge_target needed (judges itself)
|
|
429
|
-
- name:
|
|
429
|
+
- name: azure-base
|
|
430
430
|
provider: azure
|
|
431
431
|
```
|
|
432
432
|
|
|
@@ -445,7 +445,7 @@ Create markdown judge files with evaluation criteria and scoring guidelines:
|
|
|
445
445
|
```yaml
|
|
446
446
|
assert:
|
|
447
447
|
- name: semantic_check
|
|
448
|
-
type:
|
|
448
|
+
type: llm-judge
|
|
449
449
|
prompt: ./judges/correctness.md
|
|
450
450
|
```
|
|
451
451
|
|
|
@@ -487,7 +487,7 @@ Configure automatic retry with exponential backoff:
|
|
|
487
487
|
|
|
488
488
|
```yaml
|
|
489
489
|
targets:
|
|
490
|
-
- name:
|
|
490
|
+
- name: azure-base
|
|
491
491
|
provider: azure
|
|
492
492
|
max_retries: 5
|
|
493
493
|
retry_initial_delay_ms: 2000
|
|
@@ -25,7 +25,59 @@ import {
|
|
|
25
25
|
subscribeToCopilotCliLogEntries,
|
|
26
26
|
subscribeToCopilotSdkLogEntries,
|
|
27
27
|
subscribeToPiLogEntries
|
|
28
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-OQN2GDEU.js";
|
|
29
|
+
|
|
30
|
+
// package.json
|
|
31
|
+
// package.json contents inlined into the bundle; key order mirrors the manifest.
var package_default = {
  "name": "agentv",
  "version": "2.14.0-next.1",
  "description": "CLI entry point for AgentV",
  "type": "module",
  "repository": { "type": "git", "url": "https://github.com/EntityProcess/agentv.git" },
  "homepage": "https://github.com/EntityProcess/agentv#readme",
  "bugs": { "url": "https://github.com/EntityProcess/agentv/issues" },
  "bin": { "agentv": "./dist/cli.js" },
  "files": ["dist", "README.md"],
  // Build and maintenance commands.
  "scripts": {
    "dev": "bun src/cli.ts",
    "build": "tsup && bun run copy-readme",
    "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
    "prepublishOnly": "bun run copy-readme",
    "typecheck": "tsc --noEmit",
    "lint": "biome check .",
    "format": "biome format --write .",
    "fix": "biome check --write .",
    "test": "bun test",
    "test:watch": "bun test --watch"
  },
  // Runtime dependencies.
  "dependencies": {
    "@anthropic-ai/claude-agent-sdk": "^0.2.49",
    "@github/copilot-sdk": "^0.1.25",
    "@inquirer/prompts": "^8.2.1",
    "@mariozechner/pi-agent-core": "^0.54.2",
    "@mariozechner/pi-ai": "^0.54.2",
    "@openai/codex-sdk": "^0.104.0",
    "cmd-ts": "^0.14.3",
    "dotenv": "^16.4.5",
    "fast-glob": "^3.3.3",
    "json5": "^2.2.3",
    "micromatch": "^4.0.8",
    "semver": "^7.7.4",
    "yaml": "^2.6.1"
  },
  // Development-only dependencies.
  "devDependencies": {
    "@agentv/core": "workspace:*",
    "@types/semver": "^7.7.1",
    "execa": "^9.3.0"
  }
};
|
|
29
81
|
|
|
30
82
|
// src/commands/eval/shared.ts
|
|
31
83
|
import { constants } from "node:fs";
|
|
@@ -152,6 +204,60 @@ import { access as access4 } from "node:fs/promises";
|
|
|
152
204
|
import path10 from "node:path";
|
|
153
205
|
import { pathToFileURL } from "node:url";
|
|
154
206
|
|
|
207
|
+
// src/version-check.ts
|
|
208
|
+
import { satisfies, validRange } from "semver";
|
|
209
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
210
|
+
var ANSI_RED = "\x1B[31m";
|
|
211
|
+
var ANSI_RESET = "\x1B[0m";
|
|
212
|
+
/**
 * Compare the bundled agentv version against a required semver range.
 *
 * @param {string} requiredVersion - Semver range to test (e.g. ">=2.11.0", "^2.11.0").
 * @returns {{ satisfied: boolean, currentVersion: string, requiredRange: string }}
 *   Whether the bundled version satisfies the range, plus both values for reporting.
 * @throws {Error} When `requiredVersion` is not a non-empty string holding a valid semver range.
 */
function checkVersion(requiredVersion) {
  const currentVersion = package_default.version;
  // Check the type first: a non-string value (for instance a bare number in a config
  // file) should produce the "Invalid required_version" error below rather than a
  // TypeError from calling .trim() on it.
  const isUsableRange =
    typeof requiredVersion === "string" &&
    requiredVersion.trim() !== "" &&
    Boolean(validRange(requiredVersion));
  if (!isUsableRange) {
    throw new Error(
      `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
    );
  }
  return {
    // By default semver excludes prerelease versions from ranges that do not mention a
    // prerelease themselves, so a build like "2.14.0-next.1" would fail ">=2.11.0".
    // includePrerelease makes prerelease builds compare by ordinary precedence instead.
    satisfied: satisfies(currentVersion, requiredVersion, { includePrerelease: true }),
    currentVersion,
    requiredRange: requiredVersion
  };
}
|
|
225
|
+
/**
 * Check the installed version against `requiredVersion` and react to a mismatch.
 *
 * - Invalid range: prints the error in red and exits with code 1.
 * - Range satisfied: returns without output.
 * - Mismatch with `options.strict`: prints the warning and an abort notice, exits with code 1.
 * - Mismatch when stdin and stdout are both TTYs: prints the warning and asks whether to
 *   continue; exits with code 1 when the answer is no.
 * - Mismatch otherwise: writes the warning to stderr and carries on.
 *
 * @param {string} requiredVersion - Semver range the project requires.
 * @param {{ strict?: boolean }} [options] - `strict` turns a mismatch into a hard failure.
 * @returns {Promise<void>}
 */
async function enforceRequiredVersion(requiredVersion, options) {
  let outcome;
  try {
    outcome = checkVersion(requiredVersion);
  } catch (err) {
    console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
    process.exit(1);
  }
  if (outcome.satisfied) {
    return;
  }

  const { requiredRange, currentVersion } = outcome;
  const headline = `${ANSI_YELLOW}Warning: This project requires agentv ${requiredRange} but you have ${currentVersion}.${ANSI_RESET}`;
  const warning = `${headline}\nRun \`agentv self update\` to upgrade.`;

  if (options?.strict) {
    console.error(warning);
    console.error(
      `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
    );
    process.exit(1);
  }

  // Only prompt when a human can answer; otherwise just leave a note on stderr.
  const interactive = process.stdin.isTTY && process.stdout.isTTY;
  if (!interactive) {
    process.stderr.write(`${warning}\n`);
    return;
  }

  console.warn(warning);
  const proceed = await promptContinue();
  if (!proceed) {
    process.exit(1);
  }
}
|
|
256
|
+
/**
 * Ask the user whether to continue, defaulting to "no".
 *
 * The prompt library is imported on demand, so it is only loaded when a prompt is shown.
 *
 * @returns {Promise<boolean>} Resolves with the value produced by the confirm prompt.
 */
async function promptContinue() {
  const { confirm: askYesNo } = await import("@inquirer/prompts");
  const question = { message: "Continue anyway?", default: false };
  return askYesNo(question);
}
|
|
260
|
+
|
|
155
261
|
// src/commands/eval/env.ts
|
|
156
262
|
import { constants as constants3 } from "node:fs";
|
|
157
263
|
import { access as access3 } from "node:fs/promises";
|
|
@@ -822,6 +928,49 @@ var ProgressDisplay = class {
|
|
|
822
928
|
}
|
|
823
929
|
};
|
|
824
930
|
|
|
931
|
+
// src/commands/eval/retry-errors.ts
|
|
932
|
+
import { createReadStream } from "node:fs";
|
|
933
|
+
import { createInterface } from "node:readline";
|
|
934
|
+
/**
 * Collect the test IDs of records marked as execution errors in a JSONL file.
 *
 * Blank lines and lines that cannot be used (invalid JSON, JSON `null`) are skipped.
 *
 * @param {string} jsonlPath - Path of the JSONL file to scan.
 * @returns {Promise<Array>} Unique `testId` values, in order of first appearance.
 */
async function loadErrorTestIds(jsonlPath) {
  const lineReader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  // A Set keeps first-insertion order, so it deduplicates without reordering.
  const failedIds = new Set();
  for await (const rawLine of lineReader) {
    const text = rawLine.trim();
    if (text === "") {
      continue;
    }
    let record;
    try {
      record = JSON.parse(text);
    } catch {
      // Best effort: a malformed line is ignored rather than aborting the scan.
      continue;
    }
    if (record === null) {
      continue;
    }
    if (record.executionStatus === "execution_error" && record.testId) {
      failedIds.add(record.testId);
    }
  }
  return Array.from(failedIds);
}
|
|
953
|
+
/**
 * Read a JSONL file and return the parsed records that are not execution errors.
 *
 * A record is kept only when it has a truthy `testId`, a defined `score`, and an
 * `executionStatus` other than "execution_error". Blank lines and lines that cannot
 * be used (invalid JSON, JSON `null`) are skipped.
 *
 * @param {string} jsonlPath - Path of the JSONL file to read.
 * @returns {Promise<Array<object>>} Kept records, in file order.
 */
async function loadNonErrorResults(jsonlPath) {
  const lineReader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  const preserved = [];
  for await (const rawLine of lineReader) {
    const text = rawLine.trim();
    if (text === "") {
      continue;
    }
    let record;
    try {
      record = JSON.parse(text);
    } catch {
      // Best effort: a malformed line is ignored rather than aborting the read.
      continue;
    }
    if (record === null) {
      continue;
    }
    const hasIdentityAndScore = Boolean(record.testId) && record.score !== undefined;
    if (hasIdentityAndScore && record.executionStatus !== "execution_error") {
      preserved.push(record);
    }
  }
  return preserved;
}
|
|
973
|
+
|
|
825
974
|
// src/commands/eval/statistics.ts
|
|
826
975
|
var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
|
|
827
976
|
function computeMean(values) {
|
|
@@ -1117,16 +1266,16 @@ function inferFileTypeFromPath(filePath) {
|
|
|
1117
1266
|
var ASSERTION_TYPES_WITH_STRING_VALUE = /* @__PURE__ */ new Set([
|
|
1118
1267
|
"contains",
|
|
1119
1268
|
"icontains",
|
|
1120
|
-
"
|
|
1121
|
-
"
|
|
1269
|
+
"starts-with",
|
|
1270
|
+
"ends-with",
|
|
1122
1271
|
"equals",
|
|
1123
1272
|
"regex"
|
|
1124
1273
|
]);
|
|
1125
1274
|
var ASSERTION_TYPES_WITH_ARRAY_VALUE = /* @__PURE__ */ new Set([
|
|
1126
|
-
"
|
|
1127
|
-
"
|
|
1128
|
-
"
|
|
1129
|
-
"
|
|
1275
|
+
"contains-any",
|
|
1276
|
+
"contains-all",
|
|
1277
|
+
"icontains-any",
|
|
1278
|
+
"icontains-all"
|
|
1130
1279
|
]);
|
|
1131
1280
|
var VALID_TEST_FILE_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".jsonl"]);
|
|
1132
1281
|
var NAME_PATTERN = /^[a-z0-9-]+$/;
|
|
@@ -1492,8 +1641,8 @@ function validateAssertArray(assertField, parentLocation, filePath, errors) {
|
|
|
1492
1641
|
});
|
|
1493
1642
|
continue;
|
|
1494
1643
|
}
|
|
1495
|
-
const
|
|
1496
|
-
if (
|
|
1644
|
+
const rawTypeValue = item.type;
|
|
1645
|
+
if (rawTypeValue === void 0 || typeof rawTypeValue !== "string") {
|
|
1497
1646
|
errors.push({
|
|
1498
1647
|
severity: "warning",
|
|
1499
1648
|
filePath,
|
|
@@ -1502,12 +1651,13 @@ function validateAssertArray(assertField, parentLocation, filePath, errors) {
|
|
|
1502
1651
|
});
|
|
1503
1652
|
continue;
|
|
1504
1653
|
}
|
|
1654
|
+
const typeValue = rawTypeValue.replace(/_/g, "-");
|
|
1505
1655
|
if (!isEvaluatorKind(typeValue)) {
|
|
1506
1656
|
errors.push({
|
|
1507
1657
|
severity: "warning",
|
|
1508
1658
|
filePath,
|
|
1509
1659
|
location: `${location}.type`,
|
|
1510
|
-
message: `Unknown assertion type '${
|
|
1660
|
+
message: `Unknown assertion type '${rawTypeValue}'.`
|
|
1511
1661
|
});
|
|
1512
1662
|
continue;
|
|
1513
1663
|
}
|
|
@@ -1732,7 +1882,7 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
1732
1882
|
"delayMinMs",
|
|
1733
1883
|
"delayMaxMs",
|
|
1734
1884
|
"trace"
|
|
1735
|
-
// For testing
|
|
1885
|
+
// For testing tool-trajectory evaluator
|
|
1736
1886
|
]);
|
|
1737
1887
|
var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
|
|
1738
1888
|
...COMMON_SETTINGS,
|
|
@@ -2230,9 +2380,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
2230
2380
|
}
|
|
2231
2381
|
|
|
2232
2382
|
// src/commands/eval/targets.ts
|
|
2233
|
-
var
|
|
2234
|
-
var
|
|
2235
|
-
var
|
|
2383
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
2384
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
2385
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
2236
2386
|
function isTTY() {
|
|
2237
2387
|
return process.stdout.isTTY ?? false;
|
|
2238
2388
|
}
|
|
@@ -2278,8 +2428,8 @@ async function selectTarget(options) {
|
|
|
2278
2428
|
Warnings in ${targetsFilePath}:`);
|
|
2279
2429
|
for (const warning of warnings) {
|
|
2280
2430
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
2281
|
-
const prefix = useColors ? `${
|
|
2282
|
-
const message = useColors ? `${
|
|
2431
|
+
const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
|
|
2432
|
+
const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
|
|
2283
2433
|
console.warn(`${prefix}${location} ${message}`);
|
|
2284
2434
|
}
|
|
2285
2435
|
console.warn("");
|
|
@@ -2290,8 +2440,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
2290
2440
|
Errors in ${targetsFilePath}:`);
|
|
2291
2441
|
for (const error of errors) {
|
|
2292
2442
|
const location = error.location ? ` [${error.location}]` : "";
|
|
2293
|
-
const prefix = useColors ? `${
|
|
2294
|
-
const message = useColors ? `${
|
|
2443
|
+
const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
|
|
2444
|
+
const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
|
|
2295
2445
|
console.error(`${prefix}${location} ${message}`);
|
|
2296
2446
|
}
|
|
2297
2447
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -2369,8 +2519,8 @@ async function selectMultipleTargets(options) {
|
|
|
2369
2519
|
Warnings in ${targetsFilePath}:`);
|
|
2370
2520
|
for (const warning of warnings) {
|
|
2371
2521
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
2372
|
-
const prefix = useColors ? `${
|
|
2373
|
-
const message = useColors ? `${
|
|
2522
|
+
const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
|
|
2523
|
+
const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
|
|
2374
2524
|
console.warn(`${prefix}${location} ${message}`);
|
|
2375
2525
|
}
|
|
2376
2526
|
console.warn("");
|
|
@@ -2381,8 +2531,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
2381
2531
|
Errors in ${targetsFilePath}:`);
|
|
2382
2532
|
for (const error of errors) {
|
|
2383
2533
|
const location = error.location ? ` [${error.location}]` : "";
|
|
2384
|
-
const prefix = useColors ? `${
|
|
2385
|
-
const message = useColors ? `${
|
|
2534
|
+
const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
|
|
2535
|
+
const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
|
|
2386
2536
|
console.error(`${prefix}${location} ${message}`);
|
|
2387
2537
|
}
|
|
2388
2538
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -2543,7 +2693,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
2543
2693
|
exportOtel: normalizeBoolean(rawOptions.exportOtel),
|
|
2544
2694
|
otelBackend: normalizeString(rawOptions.otelBackend),
|
|
2545
2695
|
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
|
|
2546
|
-
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
|
|
2696
|
+
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
|
|
2697
|
+
retryErrors: normalizeString(rawOptions.retryErrors)
|
|
2547
2698
|
};
|
|
2548
2699
|
}
|
|
2549
2700
|
async function ensureFileExists(filePath, description) {
|
|
@@ -2677,7 +2828,8 @@ async function prepareFileMetadata(params) {
|
|
|
2677
2828
|
suiteTargets,
|
|
2678
2829
|
yamlCache: suite.cacheConfig?.enabled,
|
|
2679
2830
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
2680
|
-
totalBudgetUsd: suite.totalBudgetUsd
|
|
2831
|
+
totalBudgetUsd: suite.totalBudgetUsd,
|
|
2832
|
+
failOnError: suite.failOnError
|
|
2681
2833
|
};
|
|
2682
2834
|
}
|
|
2683
2835
|
async function runWithLimit(items, limit, task) {
|
|
@@ -2711,7 +2863,8 @@ async function runSingleEvalFile(params) {
|
|
|
2711
2863
|
evalCases,
|
|
2712
2864
|
trialsConfig,
|
|
2713
2865
|
matrixMode,
|
|
2714
|
-
totalBudgetUsd
|
|
2866
|
+
totalBudgetUsd,
|
|
2867
|
+
failOnError
|
|
2715
2868
|
} = params;
|
|
2716
2869
|
const targetName = selection.targetName;
|
|
2717
2870
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -2773,6 +2926,7 @@ async function runSingleEvalFile(params) {
|
|
|
2773
2926
|
cleanupWorkspaces: options.cleanupWorkspaces,
|
|
2774
2927
|
trials: trialsConfig,
|
|
2775
2928
|
totalBudgetUsd,
|
|
2929
|
+
failOnError,
|
|
2776
2930
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
2777
2931
|
onResult: async (result) => {
|
|
2778
2932
|
streamingObserver?.finalizeEvalCase(result.score, result.error);
|
|
@@ -2826,7 +2980,26 @@ async function runEvalCommand(input) {
|
|
|
2826
2980
|
}
|
|
2827
2981
|
const repoRoot = await findRepoRoot(cwd);
|
|
2828
2982
|
const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
|
|
2829
|
-
|
|
2983
|
+
if (yamlConfig?.required_version) {
|
|
2984
|
+
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
2985
|
+
strict: normalizeBoolean(input.rawOptions.strict)
|
|
2986
|
+
});
|
|
2987
|
+
}
|
|
2988
|
+
let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
|
|
2989
|
+
let retryNonErrorResults;
|
|
2990
|
+
if (options.retryErrors) {
|
|
2991
|
+
const retryPath = path10.resolve(options.retryErrors);
|
|
2992
|
+
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
2993
|
+
const errorIds = await loadErrorTestIds(retryPath);
|
|
2994
|
+
if (errorIds.length === 0) {
|
|
2995
|
+
console.log("No execution errors found in the previous output. Nothing to retry.");
|
|
2996
|
+
return;
|
|
2997
|
+
}
|
|
2998
|
+
console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
|
|
2999
|
+
const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
|
|
3000
|
+
options = { ...options, filter: filterPattern };
|
|
3001
|
+
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
3002
|
+
}
|
|
2830
3003
|
if (options.keepWorkspaces && options.cleanupWorkspaces) {
|
|
2831
3004
|
console.warn(
|
|
2832
3005
|
"Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
|
|
@@ -2839,7 +3012,7 @@ async function runEvalCommand(input) {
|
|
|
2839
3012
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
2840
3013
|
if (options.exportOtel || useFileExport) {
|
|
2841
3014
|
try {
|
|
2842
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
3015
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QR5OZ4DH.js");
|
|
2843
3016
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
2844
3017
|
let headers = {};
|
|
2845
3018
|
if (options.otelBackend) {
|
|
@@ -3034,12 +3207,22 @@ async function runEvalCommand(input) {
|
|
|
3034
3207
|
evalCases: applicableEvalCases,
|
|
3035
3208
|
trialsConfig: targetPrep.trialsConfig,
|
|
3036
3209
|
matrixMode: targetPrep.selections.length > 1,
|
|
3037
|
-
totalBudgetUsd: targetPrep.totalBudgetUsd
|
|
3210
|
+
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
3211
|
+
failOnError: targetPrep.failOnError
|
|
3038
3212
|
});
|
|
3039
3213
|
allResults.push(...result.results);
|
|
3040
3214
|
}
|
|
3041
3215
|
});
|
|
3042
3216
|
progressReporter.finish();
|
|
3217
|
+
if (retryNonErrorResults && retryNonErrorResults.length > 0) {
|
|
3218
|
+
for (const preserved of retryNonErrorResults) {
|
|
3219
|
+
await outputWriter.append(preserved);
|
|
3220
|
+
}
|
|
3221
|
+
allResults.push(...retryNonErrorResults);
|
|
3222
|
+
console.log(
|
|
3223
|
+
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
|
|
3224
|
+
);
|
|
3225
|
+
}
|
|
3043
3226
|
const summary = calculateEvaluationSummary(allResults);
|
|
3044
3227
|
console.log(formatEvaluationSummary(summary));
|
|
3045
3228
|
if (isMatrixMode && allResults.length > 0) {
|
|
@@ -3097,6 +3280,7 @@ async function resolveEvaluationRunner() {
|
|
|
3097
3280
|
}
|
|
3098
3281
|
|
|
3099
3282
|
export {
|
|
3283
|
+
package_default,
|
|
3100
3284
|
toSnakeCaseDeep,
|
|
3101
3285
|
resolveEvalPaths,
|
|
3102
3286
|
findRepoRoot,
|
|
@@ -3110,4 +3294,4 @@ export {
|
|
|
3110
3294
|
selectTarget,
|
|
3111
3295
|
runEvalCommand
|
|
3112
3296
|
};
|
|
3113
|
-
//# sourceMappingURL=chunk-
|
|
3297
|
+
//# sourceMappingURL=chunk-K2APOWTE.js.map
|