agentv 2.11.4 → 2.13.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-KWUTY5XR.js → chunk-FSBZM3HT.js} +176 -31
- package/dist/chunk-FSBZM3HT.js.map +1 -0
- package/dist/{chunk-FBGAD3CQ.js → chunk-M6JYP6A6.js} +17 -55
- package/dist/chunk-M6JYP6A6.js.map +1 -0
- package/dist/{chunk-APGYGAVM.js → chunk-UWDI4UVN.js} +266 -34
- package/dist/chunk-UWDI4UVN.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-RVGCGRG4.js → dist-CCUHG3SN.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-O2KBWGEI.js → interactive-P3D5O673.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-APGYGAVM.js.map +0 -1
- package/dist/chunk-FBGAD3CQ.js.map +0 -1
- package/dist/chunk-KWUTY5XR.js.map +0 -1
- /package/dist/{dist-RVGCGRG4.js.map → dist-CCUHG3SN.js.map} +0 -0
- /package/dist/{interactive-O2KBWGEI.js.map → interactive-P3D5O673.js.map} +0 -0
|
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
|
|
|
148
148
|
}
|
|
149
149
|
});
|
|
150
150
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-JHER2LQ5.js
|
|
152
152
|
import { constants } from "node:fs";
|
|
153
153
|
import { access, readFile } from "node:fs/promises";
|
|
154
154
|
import path from "node:path";
|
|
@@ -4195,7 +4195,7 @@ var coerce = {
|
|
|
4195
4195
|
};
|
|
4196
4196
|
var NEVER = INVALID;
|
|
4197
4197
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-JHER2LQ5.js
|
|
4199
4199
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
4200
4200
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
4201
4201
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -34331,6 +34331,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
34331
34331
|
continue;
|
|
34332
34332
|
}
|
|
34333
34333
|
const config2 = parsed;
|
|
34334
|
+
const requiredVersion = parsed.required_version;
|
|
34335
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
34336
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
34337
|
+
continue;
|
|
34338
|
+
}
|
|
34334
34339
|
const guidelinePatterns = config2.guideline_patterns;
|
|
34335
34340
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
34336
34341
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -34354,6 +34359,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
34354
34359
|
configPath
|
|
34355
34360
|
);
|
|
34356
34361
|
return {
|
|
34362
|
+
required_version: requiredVersion,
|
|
34357
34363
|
guideline_patterns: guidelinePatterns,
|
|
34358
34364
|
eval_patterns: evalPatterns,
|
|
34359
34365
|
execution: executionDefaults
|
|
@@ -34497,6 +34503,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
34497
34503
|
);
|
|
34498
34504
|
return void 0;
|
|
34499
34505
|
}
|
|
34506
|
+
function extractFailOnError(suite) {
|
|
34507
|
+
const execution = suite.execution;
|
|
34508
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
34509
|
+
return void 0;
|
|
34510
|
+
}
|
|
34511
|
+
const executionObj = execution;
|
|
34512
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
34513
|
+
if (raw === void 0 || raw === null) {
|
|
34514
|
+
return void 0;
|
|
34515
|
+
}
|
|
34516
|
+
if (typeof raw === "boolean") {
|
|
34517
|
+
return raw;
|
|
34518
|
+
}
|
|
34519
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
34520
|
+
return void 0;
|
|
34521
|
+
}
|
|
34500
34522
|
function parseExecutionDefaults(raw, configPath) {
|
|
34501
34523
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
34502
34524
|
return void 0;
|
|
@@ -36653,13 +36675,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
36653
36675
|
}
|
|
36654
36676
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
36655
36677
|
const metadata = parseMetadata(parsed);
|
|
36678
|
+
const failOnError = extractFailOnError(parsed);
|
|
36656
36679
|
return {
|
|
36657
36680
|
tests,
|
|
36658
36681
|
trials: extractTrialsConfig(parsed),
|
|
36659
36682
|
targets: extractTargetsFromSuite(parsed),
|
|
36660
36683
|
cacheConfig: extractCacheConfig(parsed),
|
|
36661
36684
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
36662
|
-
...metadata !== void 0 && { metadata }
|
|
36685
|
+
...metadata !== void 0 && { metadata },
|
|
36686
|
+
...failOnError !== void 0 && { failOnError }
|
|
36663
36687
|
};
|
|
36664
36688
|
}
|
|
36665
36689
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -46481,6 +46505,16 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
46481
46505
|
}
|
|
46482
46506
|
return { dir: resolved };
|
|
46483
46507
|
}
|
|
46508
|
+
function interpolateArgs(args, context) {
|
|
46509
|
+
const vars = {
|
|
46510
|
+
workspace_path: context.workspacePath,
|
|
46511
|
+
test_id: context.testId,
|
|
46512
|
+
eval_run_id: context.evalRunId,
|
|
46513
|
+
case_input: context.caseInput ?? "",
|
|
46514
|
+
case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
|
|
46515
|
+
};
|
|
46516
|
+
return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name16) => vars[name16] ?? match));
|
|
46517
|
+
}
|
|
46484
46518
|
async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
46485
46519
|
const stdin = JSON.stringify({
|
|
46486
46520
|
workspace_path: context.workspacePath,
|
|
@@ -46490,8 +46524,9 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
|
46490
46524
|
case_metadata: context.caseMetadata ?? null
|
|
46491
46525
|
});
|
|
46492
46526
|
const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
46493
|
-
const cwd = config2.cwd;
|
|
46494
|
-
const commandArray = … (remainder of this removed line is truncated in the diff rendering)
|
|
46527
|
+
const cwd = config2.cwd ?? context.evalDir;
|
|
46528
|
+
const rawCommand = config2.command ?? config2.script ?? [];
|
|
46529
|
+
const commandArray = interpolateArgs(rawCommand, context);
|
|
46495
46530
|
const result = await execFileWithStdin(commandArray, stdin, {
|
|
46496
46531
|
timeoutMs,
|
|
46497
46532
|
cwd
|
|
@@ -46506,6 +46541,10 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
|
46506
46541
|
}
|
|
46507
46542
|
return result.stdout;
|
|
46508
46543
|
}
|
|
46544
|
+
var QUALITY_PASS_THRESHOLD = 0.8;
|
|
46545
|
+
function classifyQualityStatus(score) {
|
|
46546
|
+
return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
46547
|
+
}
|
|
46509
46548
|
function usesFileReferencePrompt(provider) {
|
|
46510
46549
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
46511
46550
|
}
|
|
@@ -46538,7 +46577,8 @@ async function runEvaluation(options) {
|
|
|
46538
46577
|
cleanupWorkspaces,
|
|
46539
46578
|
trials,
|
|
46540
46579
|
streamCallbacks,
|
|
46541
|
-
totalBudgetUsd
|
|
46580
|
+
totalBudgetUsd,
|
|
46581
|
+
failOnError
|
|
46542
46582
|
} = options;
|
|
46543
46583
|
let useCache = options.useCache;
|
|
46544
46584
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -46613,6 +46653,7 @@ async function runEvaluation(options) {
|
|
|
46613
46653
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
46614
46654
|
const typeRegistry = createBuiltinRegistry();
|
|
46615
46655
|
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
46656
|
+
const evalDir = discoveryBaseDir;
|
|
46616
46657
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
46617
46658
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
46618
46659
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -46708,7 +46749,8 @@ async function runEvaluation(options) {
|
|
|
46708
46749
|
const scriptContext = {
|
|
46709
46750
|
workspacePath: sharedWorkspacePath,
|
|
46710
46751
|
testId: "__before_all__",
|
|
46711
|
-
evalRunId
|
|
46752
|
+
evalRunId,
|
|
46753
|
+
evalDir
|
|
46712
46754
|
};
|
|
46713
46755
|
try {
|
|
46714
46756
|
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
@@ -46732,6 +46774,7 @@ async function runEvaluation(options) {
|
|
|
46732
46774
|
let beforeAllOutputAttached = false;
|
|
46733
46775
|
let cumulativeBudgetCost = 0;
|
|
46734
46776
|
let budgetExhausted = false;
|
|
46777
|
+
let failOnErrorTriggered = false;
|
|
46735
46778
|
const promises = filteredEvalCases.map(
|
|
46736
46779
|
(evalCase) => limit(async () => {
|
|
46737
46780
|
const workerId = nextWorkerId++;
|
|
@@ -46747,7 +46790,14 @@ async function runEvaluation(options) {
|
|
|
46747
46790
|
answer: "",
|
|
46748
46791
|
target: target.name,
|
|
46749
46792
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
46750
|
-
budgetExceeded: true
|
|
46793
|
+
budgetExceeded: true,
|
|
46794
|
+
executionStatus: "execution_error",
|
|
46795
|
+
failureStage: "setup",
|
|
46796
|
+
failureReasonCode: "budget_exceeded",
|
|
46797
|
+
executionError: {
|
|
46798
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
46799
|
+
stage: "setup"
|
|
46800
|
+
}
|
|
46751
46801
|
};
|
|
46752
46802
|
if (onProgress) {
|
|
46753
46803
|
await onProgress({
|
|
@@ -46763,6 +46813,37 @@ async function runEvaluation(options) {
|
|
|
46763
46813
|
}
|
|
46764
46814
|
return budgetResult;
|
|
46765
46815
|
}
|
|
46816
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
46817
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
46818
|
+
const haltResult = {
|
|
46819
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
46820
|
+
testId: evalCase.id,
|
|
46821
|
+
dataset: evalCase.dataset,
|
|
46822
|
+
score: 0,
|
|
46823
|
+
hits: [],
|
|
46824
|
+
misses: [],
|
|
46825
|
+
answer: "",
|
|
46826
|
+
target: target.name,
|
|
46827
|
+
error: errorMsg,
|
|
46828
|
+
executionStatus: "execution_error",
|
|
46829
|
+
failureStage: "setup",
|
|
46830
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
46831
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
46832
|
+
};
|
|
46833
|
+
if (onProgress) {
|
|
46834
|
+
await onProgress({
|
|
46835
|
+
workerId,
|
|
46836
|
+
testId: evalCase.id,
|
|
46837
|
+
status: "failed",
|
|
46838
|
+
completedAt: Date.now(),
|
|
46839
|
+
error: haltResult.error
|
|
46840
|
+
});
|
|
46841
|
+
}
|
|
46842
|
+
if (onResult) {
|
|
46843
|
+
await onResult(haltResult);
|
|
46844
|
+
}
|
|
46845
|
+
return haltResult;
|
|
46846
|
+
}
|
|
46766
46847
|
if (onProgress) {
|
|
46767
46848
|
await onProgress({
|
|
46768
46849
|
workerId,
|
|
@@ -46794,7 +46875,8 @@ async function runEvaluation(options) {
|
|
|
46794
46875
|
suiteWorkspaceFile,
|
|
46795
46876
|
streamCallbacks,
|
|
46796
46877
|
typeRegistry,
|
|
46797
|
-
repoManager
|
|
46878
|
+
repoManager,
|
|
46879
|
+
evalDir
|
|
46798
46880
|
};
|
|
46799
46881
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
46800
46882
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -46814,6 +46896,9 @@ async function runEvaluation(options) {
|
|
|
46814
46896
|
}
|
|
46815
46897
|
}
|
|
46816
46898
|
}
|
|
46899
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
46900
|
+
failOnErrorTriggered = true;
|
|
46901
|
+
}
|
|
46817
46902
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
46818
46903
|
result = { ...result, beforeAllOutput };
|
|
46819
46904
|
beforeAllOutputAttached = true;
|
|
@@ -46863,7 +46948,9 @@ async function runEvaluation(options) {
|
|
|
46863
46948
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
46864
46949
|
outcome.reason,
|
|
46865
46950
|
promptInputs,
|
|
46866
|
-
primaryProvider
|
|
46951
|
+
primaryProvider,
|
|
46952
|
+
"agent",
|
|
46953
|
+
"provider_error"
|
|
46867
46954
|
);
|
|
46868
46955
|
results.push(errorResult);
|
|
46869
46956
|
if (onResult) {
|
|
@@ -46875,7 +46962,8 @@ async function runEvaluation(options) {
|
|
|
46875
46962
|
const scriptContext = {
|
|
46876
46963
|
workspacePath: sharedWorkspacePath,
|
|
46877
46964
|
testId: "__after_all__",
|
|
46878
|
-
evalRunId
|
|
46965
|
+
evalRunId,
|
|
46966
|
+
evalDir
|
|
46879
46967
|
};
|
|
46880
46968
|
try {
|
|
46881
46969
|
const afterAllOutput = await executeWorkspaceScript(
|
|
@@ -47005,7 +47093,14 @@ async function runBatchEvaluation(options) {
|
|
|
47005
47093
|
availableTargets
|
|
47006
47094
|
});
|
|
47007
47095
|
if (providerError) {
|
|
47008
|
-
result = {
|
|
47096
|
+
result = {
|
|
47097
|
+
...result,
|
|
47098
|
+
error: providerError,
|
|
47099
|
+
executionStatus: "execution_error",
|
|
47100
|
+
failureStage: "agent",
|
|
47101
|
+
failureReasonCode: "provider_error",
|
|
47102
|
+
executionError: { message: providerError, stage: "agent" }
|
|
47103
|
+
};
|
|
47009
47104
|
}
|
|
47010
47105
|
} catch (error40) {
|
|
47011
47106
|
const errorResult = buildErrorResult(
|
|
@@ -47014,7 +47109,9 @@ async function runBatchEvaluation(options) {
|
|
|
47014
47109
|
nowFn(),
|
|
47015
47110
|
error40,
|
|
47016
47111
|
promptInputs,
|
|
47017
|
-
provider
|
|
47112
|
+
provider,
|
|
47113
|
+
"evaluator",
|
|
47114
|
+
"evaluator_error"
|
|
47018
47115
|
);
|
|
47019
47116
|
results.push(errorResult);
|
|
47020
47117
|
if (onResult) {
|
|
@@ -47070,7 +47167,8 @@ async function runEvalCase(options) {
|
|
|
47070
47167
|
sharedBaselineCommit,
|
|
47071
47168
|
suiteWorkspaceFile,
|
|
47072
47169
|
typeRegistry: providedTypeRegistry,
|
|
47073
|
-
repoManager
|
|
47170
|
+
repoManager,
|
|
47171
|
+
evalDir
|
|
47074
47172
|
} = options;
|
|
47075
47173
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
47076
47174
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -47103,7 +47201,9 @@ async function runEvalCase(options) {
|
|
|
47103
47201
|
nowFn(),
|
|
47104
47202
|
new Error(`Failed to create workspace: ${message}`),
|
|
47105
47203
|
promptInputs,
|
|
47106
|
-
provider
|
|
47204
|
+
provider,
|
|
47205
|
+
"setup",
|
|
47206
|
+
"template_error"
|
|
47107
47207
|
);
|
|
47108
47208
|
}
|
|
47109
47209
|
}
|
|
@@ -47123,7 +47223,9 @@ async function runEvalCase(options) {
|
|
|
47123
47223
|
nowFn(),
|
|
47124
47224
|
new Error(`Failed to materialize repos: ${message}`),
|
|
47125
47225
|
promptInputs,
|
|
47126
|
-
provider
|
|
47226
|
+
provider,
|
|
47227
|
+
"repo_setup",
|
|
47228
|
+
"clone_error"
|
|
47127
47229
|
);
|
|
47128
47230
|
}
|
|
47129
47231
|
}
|
|
@@ -47133,7 +47235,8 @@ async function runEvalCase(options) {
|
|
|
47133
47235
|
testId: evalCase.id,
|
|
47134
47236
|
evalRunId: evalRunId ?? "",
|
|
47135
47237
|
caseInput: evalCase.question,
|
|
47136
|
-
caseMetadata: evalCase.metadata
|
|
47238
|
+
caseMetadata: evalCase.metadata,
|
|
47239
|
+
evalDir
|
|
47137
47240
|
};
|
|
47138
47241
|
try {
|
|
47139
47242
|
beforeAllOutput = await executeWorkspaceScript(
|
|
@@ -47152,7 +47255,9 @@ async function runEvalCase(options) {
|
|
|
47152
47255
|
nowFn(),
|
|
47153
47256
|
new Error(`before_all script failed: ${message}`),
|
|
47154
47257
|
promptInputs,
|
|
47155
|
-
provider
|
|
47258
|
+
provider,
|
|
47259
|
+
"setup",
|
|
47260
|
+
"script_error"
|
|
47156
47261
|
);
|
|
47157
47262
|
}
|
|
47158
47263
|
}
|
|
@@ -47163,7 +47268,8 @@ async function runEvalCase(options) {
|
|
|
47163
47268
|
testId: evalCase.id,
|
|
47164
47269
|
evalRunId: evalRunId ?? "",
|
|
47165
47270
|
caseInput: evalCase.question,
|
|
47166
|
-
caseMetadata: evalCase.metadata
|
|
47271
|
+
caseMetadata: evalCase.metadata,
|
|
47272
|
+
evalDir
|
|
47167
47273
|
};
|
|
47168
47274
|
try {
|
|
47169
47275
|
beforeEachOutput = await executeWorkspaceScript(
|
|
@@ -47178,7 +47284,9 @@ async function runEvalCase(options) {
|
|
|
47178
47284
|
nowFn(),
|
|
47179
47285
|
new Error(`before_each script failed: ${message}`),
|
|
47180
47286
|
promptInputs,
|
|
47181
|
-
provider
|
|
47287
|
+
provider,
|
|
47288
|
+
"setup",
|
|
47289
|
+
"script_error"
|
|
47182
47290
|
);
|
|
47183
47291
|
}
|
|
47184
47292
|
}
|
|
@@ -47219,7 +47327,9 @@ async function runEvalCase(options) {
|
|
|
47219
47327
|
nowFn(),
|
|
47220
47328
|
error40,
|
|
47221
47329
|
promptInputs,
|
|
47222
|
-
provider
|
|
47330
|
+
provider,
|
|
47331
|
+
"agent",
|
|
47332
|
+
"provider_error"
|
|
47223
47333
|
);
|
|
47224
47334
|
if (workspacePath) {
|
|
47225
47335
|
if (forceCleanup) {
|
|
@@ -47238,7 +47348,9 @@ async function runEvalCase(options) {
|
|
|
47238
47348
|
nowFn(),
|
|
47239
47349
|
lastError ?? new Error("Provider did not return a response"),
|
|
47240
47350
|
promptInputs,
|
|
47241
|
-
provider
|
|
47351
|
+
provider,
|
|
47352
|
+
"agent",
|
|
47353
|
+
"provider_error"
|
|
47242
47354
|
);
|
|
47243
47355
|
if (workspacePath) {
|
|
47244
47356
|
if (forceCleanup) {
|
|
@@ -47294,7 +47406,8 @@ async function runEvalCase(options) {
|
|
|
47294
47406
|
testId: evalCase.id,
|
|
47295
47407
|
evalRunId: evalRunId ?? "",
|
|
47296
47408
|
caseInput: evalCase.question,
|
|
47297
|
-
caseMetadata: evalCase.metadata
|
|
47409
|
+
caseMetadata: evalCase.metadata,
|
|
47410
|
+
evalDir
|
|
47298
47411
|
};
|
|
47299
47412
|
try {
|
|
47300
47413
|
afterEachOutput = await executeWorkspaceScript(
|
|
@@ -47330,7 +47443,18 @@ async function runEvalCase(options) {
|
|
|
47330
47443
|
fileChanges,
|
|
47331
47444
|
workspacePath
|
|
47332
47445
|
});
|
|
47333
|
-
const finalResult = … (remainder of this removed line is truncated in the diff rendering)
|
|
47446
|
+
const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
|
|
47447
|
+
const finalResult = providerError ? {
|
|
47448
|
+
...result,
|
|
47449
|
+
error: providerError,
|
|
47450
|
+
executionStatus,
|
|
47451
|
+
failureStage: "agent",
|
|
47452
|
+
failureReasonCode: "provider_error",
|
|
47453
|
+
executionError: { message: providerError, stage: "agent" },
|
|
47454
|
+
beforeAllOutput,
|
|
47455
|
+
beforeEachOutput,
|
|
47456
|
+
afterEachOutput
|
|
47457
|
+
} : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
|
|
47334
47458
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
47335
47459
|
if (workspacePath && !isSharedWorkspace) {
|
|
47336
47460
|
if (forceCleanup) {
|
|
@@ -47351,7 +47475,9 @@ async function runEvalCase(options) {
|
|
|
47351
47475
|
nowFn(),
|
|
47352
47476
|
error40,
|
|
47353
47477
|
promptInputs,
|
|
47354
|
-
provider
|
|
47478
|
+
provider,
|
|
47479
|
+
"evaluator",
|
|
47480
|
+
"evaluator_error"
|
|
47355
47481
|
);
|
|
47356
47482
|
if (workspacePath && !isSharedWorkspace) {
|
|
47357
47483
|
if (forceCleanup) {
|
|
@@ -47389,7 +47515,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
47389
47515
|
verdict: trialVerdict,
|
|
47390
47516
|
scores: result.scores,
|
|
47391
47517
|
error: result.error,
|
|
47392
|
-
costUsd: trialCost
|
|
47518
|
+
costUsd: trialCost,
|
|
47519
|
+
executionStatus: result.executionStatus,
|
|
47520
|
+
failureStage: result.failureStage,
|
|
47521
|
+
failureReasonCode: result.failureReasonCode
|
|
47393
47522
|
};
|
|
47394
47523
|
trialResults.push(trial);
|
|
47395
47524
|
if (trialCost !== void 0) {
|
|
@@ -47414,12 +47543,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
47414
47543
|
0
|
|
47415
47544
|
);
|
|
47416
47545
|
const baseResult = allResults[bestTrialIndex];
|
|
47546
|
+
const hasOk = trialResults.some((t) => t.executionStatus === "ok");
|
|
47547
|
+
const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
|
|
47548
|
+
const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
|
|
47549
|
+
const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
|
|
47550
|
+
const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
|
|
47551
|
+
const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
|
|
47417
47552
|
return {
|
|
47418
47553
|
...baseResult,
|
|
47419
47554
|
score,
|
|
47420
47555
|
trials: trialResults,
|
|
47421
47556
|
aggregation,
|
|
47422
|
-
costLimited: costLimited || void 0
|
|
47557
|
+
costLimited: costLimited || void 0,
|
|
47558
|
+
executionStatus: aggregateExecutionStatus,
|
|
47559
|
+
failureStage: aggregateFailureStage,
|
|
47560
|
+
failureReasonCode: aggregateFailureReasonCode,
|
|
47561
|
+
executionError: aggregateExecutionError
|
|
47423
47562
|
};
|
|
47424
47563
|
}
|
|
47425
47564
|
async function evaluateCandidate(options) {
|
|
@@ -47520,7 +47659,8 @@ async function evaluateCandidate(options) {
|
|
|
47520
47659
|
scores,
|
|
47521
47660
|
trace: trace2,
|
|
47522
47661
|
output,
|
|
47523
|
-
fileChanges
|
|
47662
|
+
fileChanges,
|
|
47663
|
+
executionStatus: classifyQualityStatus(score.score)
|
|
47524
47664
|
};
|
|
47525
47665
|
}
|
|
47526
47666
|
async function runEvaluatorsForCase(options) {
|
|
@@ -47825,7 +47965,7 @@ async function invokeProvider(provider, options) {
|
|
|
47825
47965
|
}
|
|
47826
47966
|
}
|
|
47827
47967
|
}
|
|
47828
|
-
function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider) {
|
|
47968
|
+
function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider, failureStage, failureReasonCode) {
|
|
47829
47969
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
47830
47970
|
let agentRequest;
|
|
47831
47971
|
let lmRequest;
|
|
@@ -47868,7 +48008,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs
|
|
|
47868
48008
|
target: targetName,
|
|
47869
48009
|
requests,
|
|
47870
48010
|
input,
|
|
47871
|
-
error: message
|
|
48011
|
+
error: message,
|
|
48012
|
+
executionStatus: "execution_error",
|
|
48013
|
+
failureStage,
|
|
48014
|
+
failureReasonCode,
|
|
48015
|
+
executionError: { message, stage: failureStage }
|
|
47872
48016
|
};
|
|
47873
48017
|
}
|
|
47874
48018
|
function extractProviderError(response) {
|
|
@@ -48767,6 +48911,7 @@ export {
|
|
|
48767
48911
|
extractTargetsFromTestCase,
|
|
48768
48912
|
extractTrialsConfig,
|
|
48769
48913
|
extractCacheConfig,
|
|
48914
|
+
extractFailOnError,
|
|
48770
48915
|
detectFormat,
|
|
48771
48916
|
buildPromptInputs,
|
|
48772
48917
|
readTestSuiteMetadata,
|
|
@@ -48866,4 +49011,4 @@ export {
|
|
|
48866
49011
|
OtelStreamingObserver,
|
|
48867
49012
|
createAgentKernel
|
|
48868
49013
|
};
|
|
48869
|
-
//# sourceMappingURL=chunk-KWUTY5XR.js.map
|
|
49014
|
+
//# sourceMappingURL=chunk-FSBZM3HT.js.map
|