agentv 2.12.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-LUHCYBMD.js → chunk-FSBZM3HT.js} +66 -5
- package/dist/chunk-FSBZM3HT.js.map +1 -0
- package/dist/{chunk-6KU2ZUFJ.js → chunk-M6JYP6A6.js} +17 -55
- package/dist/chunk-M6JYP6A6.js.map +1 -0
- package/dist/{chunk-YBJX5CP6.js → chunk-UWDI4UVN.js} +202 -19
- package/dist/chunk-UWDI4UVN.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-OPPA4P5R.js → dist-CCUHG3SN.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-TOUKPSHP.js → interactive-P3D5O673.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-6KU2ZUFJ.js.map +0 -1
- package/dist/chunk-LUHCYBMD.js.map +0 -1
- package/dist/chunk-YBJX5CP6.js.map +0 -1
- package/dist/{dist-OPPA4P5R.js.map → dist-CCUHG3SN.js.map} +0 -0
- package/dist/{interactive-TOUKPSHP.js.map → interactive-P3D5O673.js.map} +0 -0
|
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
|
|
|
148
148
|
}
|
|
149
149
|
});
|
|
150
150
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-JHER2LQ5.js
|
|
152
152
|
import { constants } from "node:fs";
|
|
153
153
|
import { access, readFile } from "node:fs/promises";
|
|
154
154
|
import path from "node:path";
|
|
@@ -4195,7 +4195,7 @@ var coerce = {
|
|
|
4195
4195
|
};
|
|
4196
4196
|
var NEVER = INVALID;
|
|
4197
4197
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-JHER2LQ5.js
|
|
4199
4199
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
4200
4200
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
4201
4201
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -34331,6 +34331,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
34331
34331
|
continue;
|
|
34332
34332
|
}
|
|
34333
34333
|
const config2 = parsed;
|
|
34334
|
+
const requiredVersion = parsed.required_version;
|
|
34335
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
34336
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
34337
|
+
continue;
|
|
34338
|
+
}
|
|
34334
34339
|
const guidelinePatterns = config2.guideline_patterns;
|
|
34335
34340
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
34336
34341
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -34354,6 +34359,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
34354
34359
|
configPath
|
|
34355
34360
|
);
|
|
34356
34361
|
return {
|
|
34362
|
+
required_version: requiredVersion,
|
|
34357
34363
|
guideline_patterns: guidelinePatterns,
|
|
34358
34364
|
eval_patterns: evalPatterns,
|
|
34359
34365
|
execution: executionDefaults
|
|
@@ -34497,6 +34503,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
34497
34503
|
);
|
|
34498
34504
|
return void 0;
|
|
34499
34505
|
}
|
|
34506
|
+
function extractFailOnError(suite) {
|
|
34507
|
+
const execution = suite.execution;
|
|
34508
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
34509
|
+
return void 0;
|
|
34510
|
+
}
|
|
34511
|
+
const executionObj = execution;
|
|
34512
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
34513
|
+
if (raw === void 0 || raw === null) {
|
|
34514
|
+
return void 0;
|
|
34515
|
+
}
|
|
34516
|
+
if (typeof raw === "boolean") {
|
|
34517
|
+
return raw;
|
|
34518
|
+
}
|
|
34519
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
34520
|
+
return void 0;
|
|
34521
|
+
}
|
|
34500
34522
|
function parseExecutionDefaults(raw, configPath) {
|
|
34501
34523
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
34502
34524
|
return void 0;
|
|
@@ -36653,13 +36675,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
36653
36675
|
}
|
|
36654
36676
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
36655
36677
|
const metadata = parseMetadata(parsed);
|
|
36678
|
+
const failOnError = extractFailOnError(parsed);
|
|
36656
36679
|
return {
|
|
36657
36680
|
tests,
|
|
36658
36681
|
trials: extractTrialsConfig(parsed),
|
|
36659
36682
|
targets: extractTargetsFromSuite(parsed),
|
|
36660
36683
|
cacheConfig: extractCacheConfig(parsed),
|
|
36661
36684
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
36662
|
-
...metadata !== void 0 && { metadata }
|
|
36685
|
+
...metadata !== void 0 && { metadata },
|
|
36686
|
+
...failOnError !== void 0 && { failOnError }
|
|
36663
36687
|
};
|
|
36664
36688
|
}
|
|
36665
36689
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -46553,7 +46577,8 @@ async function runEvaluation(options) {
|
|
|
46553
46577
|
cleanupWorkspaces,
|
|
46554
46578
|
trials,
|
|
46555
46579
|
streamCallbacks,
|
|
46556
|
-
totalBudgetUsd
|
|
46580
|
+
totalBudgetUsd,
|
|
46581
|
+
failOnError
|
|
46557
46582
|
} = options;
|
|
46558
46583
|
let useCache = options.useCache;
|
|
46559
46584
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -46749,6 +46774,7 @@ async function runEvaluation(options) {
|
|
|
46749
46774
|
let beforeAllOutputAttached = false;
|
|
46750
46775
|
let cumulativeBudgetCost = 0;
|
|
46751
46776
|
let budgetExhausted = false;
|
|
46777
|
+
let failOnErrorTriggered = false;
|
|
46752
46778
|
const promises = filteredEvalCases.map(
|
|
46753
46779
|
(evalCase) => limit(async () => {
|
|
46754
46780
|
const workerId = nextWorkerId++;
|
|
@@ -46787,6 +46813,37 @@ async function runEvaluation(options) {
|
|
|
46787
46813
|
}
|
|
46788
46814
|
return budgetResult;
|
|
46789
46815
|
}
|
|
46816
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
46817
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
46818
|
+
const haltResult = {
|
|
46819
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
46820
|
+
testId: evalCase.id,
|
|
46821
|
+
dataset: evalCase.dataset,
|
|
46822
|
+
score: 0,
|
|
46823
|
+
hits: [],
|
|
46824
|
+
misses: [],
|
|
46825
|
+
answer: "",
|
|
46826
|
+
target: target.name,
|
|
46827
|
+
error: errorMsg,
|
|
46828
|
+
executionStatus: "execution_error",
|
|
46829
|
+
failureStage: "setup",
|
|
46830
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
46831
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
46832
|
+
};
|
|
46833
|
+
if (onProgress) {
|
|
46834
|
+
await onProgress({
|
|
46835
|
+
workerId,
|
|
46836
|
+
testId: evalCase.id,
|
|
46837
|
+
status: "failed",
|
|
46838
|
+
completedAt: Date.now(),
|
|
46839
|
+
error: haltResult.error
|
|
46840
|
+
});
|
|
46841
|
+
}
|
|
46842
|
+
if (onResult) {
|
|
46843
|
+
await onResult(haltResult);
|
|
46844
|
+
}
|
|
46845
|
+
return haltResult;
|
|
46846
|
+
}
|
|
46790
46847
|
if (onProgress) {
|
|
46791
46848
|
await onProgress({
|
|
46792
46849
|
workerId,
|
|
@@ -46839,6 +46896,9 @@ async function runEvaluation(options) {
|
|
|
46839
46896
|
}
|
|
46840
46897
|
}
|
|
46841
46898
|
}
|
|
46899
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
46900
|
+
failOnErrorTriggered = true;
|
|
46901
|
+
}
|
|
46842
46902
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
46843
46903
|
result = { ...result, beforeAllOutput };
|
|
46844
46904
|
beforeAllOutputAttached = true;
|
|
@@ -48851,6 +48911,7 @@ export {
|
|
|
48851
48911
|
extractTargetsFromTestCase,
|
|
48852
48912
|
extractTrialsConfig,
|
|
48853
48913
|
extractCacheConfig,
|
|
48914
|
+
extractFailOnError,
|
|
48854
48915
|
detectFormat,
|
|
48855
48916
|
buildPromptInputs,
|
|
48856
48917
|
readTestSuiteMetadata,
|
|
@@ -48950,4 +49011,4 @@ export {
|
|
|
48950
49011
|
OtelStreamingObserver,
|
|
48951
49012
|
createAgentKernel
|
|
48952
49013
|
};
|
|
48953
|
-
//# sourceMappingURL=chunk-LUHCYBMD.js.map
|
|
49014
|
+
//# sourceMappingURL=chunk-FSBZM3HT.js.map
|