@agentv/core 2.12.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7HPKTRFZ.js → chunk-JHER2LQ5.js} +1 -1
- package/dist/chunk-JHER2LQ5.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +64 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -1
- package/dist/index.d.ts +18 -1
- package/dist/index.js +64 -3
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-7HPKTRFZ.js.map +0 -1
package/dist/index.cjs
CHANGED
@@ -1492,6 +1492,7 @@ __export(index_exports, {
 1492  1492   executeWorkspaceScript: () => executeWorkspaceScript,
 1493  1493   explorationRatio: () => explorationRatio,
 1494  1494   extractCacheConfig: () => extractCacheConfig,
       1495 + extractFailOnError: () => extractFailOnError,
 1495  1496   extractJsonBlob: () => extractJsonBlob,
 1496  1497   extractTargetFromSuite: () => extractTargetFromSuite,
 1497  1498   extractTargetsFromSuite: () => extractTargetsFromSuite,
@@ -2014,6 +2015,11 @@ async function loadConfig(evalFilePath, repoRoot) {
 2014  2015   continue;
 2015  2016   }
 2016  2017   const config = parsed;
       2018 + const requiredVersion = parsed.required_version;
       2019 + if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
       2020 + logWarning(`Invalid required_version in ${configPath}, expected string`);
       2021 + continue;
       2022 + }
 2017  2023   const guidelinePatterns = config.guideline_patterns;
 2018  2024   if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
 2019  2025   logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -2037,6 +2043,7 @@ async function loadConfig(evalFilePath, repoRoot) {
 2037  2043   configPath
 2038  2044   );
 2039  2045   return {
       2046 + required_version: requiredVersion,
 2040  2047   guideline_patterns: guidelinePatterns,
 2041  2048   eval_patterns: evalPatterns,
 2042  2049   execution: executionDefaults
@@ -2180,6 +2187,22 @@ function extractTotalBudgetUsd(suite) {
 2180  2187   );
 2181  2188   return void 0;
 2182  2189   }
       2190 + function extractFailOnError(suite) {
       2191 + const execution = suite.execution;
       2192 + if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
       2193 + return void 0;
       2194 + }
       2195 + const executionObj = execution;
       2196 + const raw = executionObj.fail_on_error ?? executionObj.failOnError;
       2197 + if (raw === void 0 || raw === null) {
       2198 + return void 0;
       2199 + }
       2200 + if (typeof raw === "boolean") {
       2201 + return raw;
       2202 + }
       2203 + logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
       2204 + return void 0;
       2205 + }
 2183  2206   function parseExecutionDefaults(raw, configPath) {
 2184  2207   if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
 2185  2208   return void 0;
@@ -4375,13 +4398,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
 4375  4398   }
 4376  4399   const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
 4377  4400   const metadata = parseMetadata(parsed);
       4401 + const failOnError = extractFailOnError(parsed);
 4378  4402   return {
 4379  4403   tests,
 4380  4404   trials: extractTrialsConfig(parsed),
 4381  4405   targets: extractTargetsFromSuite(parsed),
 4382  4406   cacheConfig: extractCacheConfig(parsed),
 4383  4407   totalBudgetUsd: extractTotalBudgetUsd(parsed),
 4384       - ...metadata !== void 0 && { metadata }
       4408 + ...metadata !== void 0 && { metadata },
       4409 + ...failOnError !== void 0 && { failOnError }
 4385  4410   };
 4386  4411   }
 4387  4412   var loadEvalSuite = loadTestSuite;
@@ -15780,7 +15805,8 @@ async function runEvaluation(options) {
15780 15805   cleanupWorkspaces,
15781 15806   trials,
15782 15807   streamCallbacks,
15783       - totalBudgetUsd
      15808 + totalBudgetUsd,
      15809 + failOnError
15784 15810   } = options;
15785 15811   let useCache = options.useCache;
15786 15812   if (trials && trials.count > 1 && useCache) {
@@ -15976,6 +16002,7 @@ async function runEvaluation(options) {
15976 16002   let beforeAllOutputAttached = false;
15977 16003   let cumulativeBudgetCost = 0;
15978 16004   let budgetExhausted = false;
      16005 + let failOnErrorTriggered = false;
15979 16006   const promises = filteredEvalCases.map(
15980 16007   (evalCase) => limit(async () => {
15981 16008   const workerId = nextWorkerId++;
@@ -16014,6 +16041,37 @@ async function runEvaluation(options) {
16014 16041   }
16015 16042   return budgetResult;
16016 16043   }
      16044 + if (failOnError === true && failOnErrorTriggered) {
      16045 + const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
      16046 + const haltResult = {
      16047 + timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
      16048 + testId: evalCase.id,
      16049 + dataset: evalCase.dataset,
      16050 + score: 0,
      16051 + hits: [],
      16052 + misses: [],
      16053 + answer: "",
      16054 + target: target.name,
      16055 + error: errorMsg,
      16056 + executionStatus: "execution_error",
      16057 + failureStage: "setup",
      16058 + failureReasonCode: "error_threshold_exceeded",
      16059 + executionError: { message: errorMsg, stage: "setup" }
      16060 + };
      16061 + if (onProgress) {
      16062 + await onProgress({
      16063 + workerId,
      16064 + testId: evalCase.id,
      16065 + status: "failed",
      16066 + completedAt: Date.now(),
      16067 + error: haltResult.error
      16068 + });
      16069 + }
      16070 + if (onResult) {
      16071 + await onResult(haltResult);
      16072 + }
      16073 + return haltResult;
      16074 + }
16017 16075   if (onProgress) {
16018 16076   await onProgress({
16019 16077   workerId,
@@ -16066,6 +16124,9 @@ async function runEvaluation(options) {
16066 16124   }
16067 16125   }
16068 16126   }
      16127 + if (failOnError === true && result.executionStatus === "execution_error") {
      16128 + failOnErrorTriggered = true;
      16129 + }
16069 16130   if (beforeAllOutput && !beforeAllOutputAttached) {
16070 16131   result = { ...result, beforeAllOutput };
16071 16132   beforeAllOutputAttached = true;
@@ -18132,6 +18193,7 @@ function createAgentKernel() {
18132 18193   executeWorkspaceScript,
18133 18194   explorationRatio,
18134 18195   extractCacheConfig,
      18196 + extractFailOnError,
18135 18197   extractJsonBlob,
18136 18198   extractTargetFromSuite,
18137 18199   extractTargetsFromSuite,