@agentv/core 2.11.4 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-REN5PS7B.js → chunk-JHER2LQ5.js} +1 -1
- package/dist/chunk-JHER2LQ5.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +174 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -1
- package/dist/index.d.ts +54 -1
- package/dist/index.js +174 -29
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-REN5PS7B.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1492,6 +1492,7 @@ __export(index_exports, {
|
|
|
1492
1492
|
executeWorkspaceScript: () => executeWorkspaceScript,
|
|
1493
1493
|
explorationRatio: () => explorationRatio,
|
|
1494
1494
|
extractCacheConfig: () => extractCacheConfig,
|
|
1495
|
+
extractFailOnError: () => extractFailOnError,
|
|
1495
1496
|
extractJsonBlob: () => extractJsonBlob,
|
|
1496
1497
|
extractTargetFromSuite: () => extractTargetFromSuite,
|
|
1497
1498
|
extractTargetsFromSuite: () => extractTargetsFromSuite,
|
|
@@ -2014,6 +2015,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
2014
2015
|
continue;
|
|
2015
2016
|
}
|
|
2016
2017
|
const config = parsed;
|
|
2018
|
+
const requiredVersion = parsed.required_version;
|
|
2019
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
2020
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
2021
|
+
continue;
|
|
2022
|
+
}
|
|
2017
2023
|
const guidelinePatterns = config.guideline_patterns;
|
|
2018
2024
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
2019
2025
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -2037,6 +2043,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
2037
2043
|
configPath
|
|
2038
2044
|
);
|
|
2039
2045
|
return {
|
|
2046
|
+
required_version: requiredVersion,
|
|
2040
2047
|
guideline_patterns: guidelinePatterns,
|
|
2041
2048
|
eval_patterns: evalPatterns,
|
|
2042
2049
|
execution: executionDefaults
|
|
@@ -2180,6 +2187,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
2180
2187
|
);
|
|
2181
2188
|
return void 0;
|
|
2182
2189
|
}
|
|
2190
|
+
/**
 * Reads the optional `execution.fail_on_error` flag from a parsed eval suite.
 *
 * Both the snake_case (`fail_on_error`) and camelCase (`failOnError`) keys are
 * accepted; the snake_case key wins when both are present and non-nullish.
 *
 * @param {object | null | undefined} suite - Parsed suite document.
 * @returns {boolean | undefined} The flag when it is a boolean; `undefined`
 *   when the suite, the `execution` section, or the flag is absent, or when
 *   the flag has a non-boolean value (a warning is logged in that case).
 */
function extractFailOnError(suite) {
  // Optional chaining: a null/undefined suite is treated as "no setting"
  // rather than raising a TypeError on property access.
  const execution = suite?.execution;
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
    return void 0;
  }
  const raw = execution.fail_on_error ?? execution.failOnError;
  if (raw === void 0 || raw === null) {
    return void 0;
  }
  if (typeof raw === "boolean") {
    return raw;
  }
  // Anything other than a boolean is rejected instead of being coerced.
  logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
  return void 0;
}
|
|
2183
2206
|
function parseExecutionDefaults(raw, configPath) {
|
|
2184
2207
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
2185
2208
|
return void 0;
|
|
@@ -4375,13 +4398,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
4375
4398
|
}
|
|
4376
4399
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
4377
4400
|
const metadata = parseMetadata(parsed);
|
|
4401
|
+
const failOnError = extractFailOnError(parsed);
|
|
4378
4402
|
return {
|
|
4379
4403
|
tests,
|
|
4380
4404
|
trials: extractTrialsConfig(parsed),
|
|
4381
4405
|
targets: extractTargetsFromSuite(parsed),
|
|
4382
4406
|
cacheConfig: extractCacheConfig(parsed),
|
|
4383
4407
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
4384
|
-
...metadata !== void 0 && { metadata }
|
|
4408
|
+
...metadata !== void 0 && { metadata },
|
|
4409
|
+
...failOnError !== void 0 && { failOnError }
|
|
4385
4410
|
};
|
|
4386
4411
|
}
|
|
4387
4412
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -15706,6 +15731,16 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
15706
15731
|
}
|
|
15707
15732
|
|
|
15708
15733
|
// src/evaluation/workspace/script-executor.ts
|
|
15734
|
+
/**
 * Expands `{{name}}` placeholders in each command argument with values taken
 * from the script context.
 *
 * Recognised placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` (empty string when absent) and `case_metadata` (JSON-encoded,
 * empty string when absent). Unrecognised placeholders, and recognised ones
 * whose context value is null/undefined, are left in the argument verbatim.
 *
 * @param {string[]} args - Command arguments, possibly containing placeholders.
 * @param {object} context2 - Script context providing `workspacePath`,
 *   `testId`, `evalRunId`, and optionally `caseInput` / `caseMetadata`.
 * @returns {string[]} A new array with the placeholders substituted.
 */
function interpolateArgs(args, context2) {
  const vars = {
    workspace_path: context2.workspacePath,
    test_id: context2.testId,
    eval_run_id: context2.evalRunId,
    case_input: context2.caseInput ?? "",
    case_metadata: context2.caseMetadata ? JSON.stringify(context2.caseMetadata) : ""
  };
  const substitute = (match, name) => {
    // Own-property check: a bare `vars[name]` lookup would resolve names such
    // as `constructor`, `toString` or `__proto__` through Object.prototype and
    // splice a stringified function/object into the command line.
    if (!Object.prototype.hasOwnProperty.call(vars, name)) {
      return match;
    }
    return vars[name] ?? match;
  };
  return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, substitute));
}
|
|
15709
15744
|
async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
15710
15745
|
const stdin = JSON.stringify({
|
|
15711
15746
|
workspace_path: context2.workspacePath,
|
|
@@ -15715,8 +15750,9 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
15715
15750
|
case_metadata: context2.caseMetadata ?? null
|
|
15716
15751
|
});
|
|
15717
15752
|
const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
15718
|
-
const cwd = config.cwd;
|
|
15719
|
-
const
|
|
15753
|
+
const cwd = config.cwd ?? context2.evalDir;
|
|
15754
|
+
const rawCommand = config.command ?? config.script ?? [];
|
|
15755
|
+
const commandArray = interpolateArgs(rawCommand, context2);
|
|
15720
15756
|
const result = await execFileWithStdin(commandArray, stdin, {
|
|
15721
15757
|
timeoutMs,
|
|
15722
15758
|
cwd
|
|
@@ -15733,6 +15769,10 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
15733
15769
|
}
|
|
15734
15770
|
|
|
15735
15771
|
// src/evaluation/orchestrator.ts
|
|
15772
|
+
// Default minimum score (inclusive) for a result to be classified as "ok".
// Declared with `var` to keep the bundle's hoisting behaviour unchanged.
var QUALITY_PASS_THRESHOLD = 0.8;

/**
 * Classifies a completed evaluation by its score.
 *
 * @param {number} score - Evaluation score to classify.
 * @param {number} [threshold=QUALITY_PASS_THRESHOLD] - Inclusive pass mark.
 * @returns {"ok" | "quality_failure"} "ok" when `score >= threshold`,
 *   otherwise "quality_failure" (a NaN score therefore classifies as a
 *   failure, because the comparison is false).
 */
function classifyQualityStatus(score, threshold = QUALITY_PASS_THRESHOLD) {
  return score >= threshold ? "ok" : "quality_failure";
}
|
|
15736
15776
|
function usesFileReferencePrompt(provider) {
|
|
15737
15777
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
15738
15778
|
}
|
|
@@ -15765,7 +15805,8 @@ async function runEvaluation(options) {
|
|
|
15765
15805
|
cleanupWorkspaces,
|
|
15766
15806
|
trials,
|
|
15767
15807
|
streamCallbacks,
|
|
15768
|
-
totalBudgetUsd
|
|
15808
|
+
totalBudgetUsd,
|
|
15809
|
+
failOnError
|
|
15769
15810
|
} = options;
|
|
15770
15811
|
let useCache = options.useCache;
|
|
15771
15812
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -15840,6 +15881,7 @@ async function runEvaluation(options) {
|
|
|
15840
15881
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
15841
15882
|
const typeRegistry = createBuiltinRegistry();
|
|
15842
15883
|
const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
|
|
15884
|
+
const evalDir = discoveryBaseDir;
|
|
15843
15885
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
15844
15886
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
15845
15887
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -15935,7 +15977,8 @@ async function runEvaluation(options) {
|
|
|
15935
15977
|
const scriptContext = {
|
|
15936
15978
|
workspacePath: sharedWorkspacePath,
|
|
15937
15979
|
testId: "__before_all__",
|
|
15938
|
-
evalRunId
|
|
15980
|
+
evalRunId,
|
|
15981
|
+
evalDir
|
|
15939
15982
|
};
|
|
15940
15983
|
try {
|
|
15941
15984
|
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
@@ -15959,6 +16002,7 @@ async function runEvaluation(options) {
|
|
|
15959
16002
|
let beforeAllOutputAttached = false;
|
|
15960
16003
|
let cumulativeBudgetCost = 0;
|
|
15961
16004
|
let budgetExhausted = false;
|
|
16005
|
+
let failOnErrorTriggered = false;
|
|
15962
16006
|
const promises = filteredEvalCases.map(
|
|
15963
16007
|
(evalCase) => limit(async () => {
|
|
15964
16008
|
const workerId = nextWorkerId++;
|
|
@@ -15974,7 +16018,14 @@ async function runEvaluation(options) {
|
|
|
15974
16018
|
answer: "",
|
|
15975
16019
|
target: target.name,
|
|
15976
16020
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
15977
|
-
budgetExceeded: true
|
|
16021
|
+
budgetExceeded: true,
|
|
16022
|
+
executionStatus: "execution_error",
|
|
16023
|
+
failureStage: "setup",
|
|
16024
|
+
failureReasonCode: "budget_exceeded",
|
|
16025
|
+
executionError: {
|
|
16026
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16027
|
+
stage: "setup"
|
|
16028
|
+
}
|
|
15978
16029
|
};
|
|
15979
16030
|
if (onProgress) {
|
|
15980
16031
|
await onProgress({
|
|
@@ -15990,6 +16041,37 @@ async function runEvaluation(options) {
|
|
|
15990
16041
|
}
|
|
15991
16042
|
return budgetResult;
|
|
15992
16043
|
}
|
|
16044
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
16045
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
16046
|
+
const haltResult = {
|
|
16047
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16048
|
+
testId: evalCase.id,
|
|
16049
|
+
dataset: evalCase.dataset,
|
|
16050
|
+
score: 0,
|
|
16051
|
+
hits: [],
|
|
16052
|
+
misses: [],
|
|
16053
|
+
answer: "",
|
|
16054
|
+
target: target.name,
|
|
16055
|
+
error: errorMsg,
|
|
16056
|
+
executionStatus: "execution_error",
|
|
16057
|
+
failureStage: "setup",
|
|
16058
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
16059
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
16060
|
+
};
|
|
16061
|
+
if (onProgress) {
|
|
16062
|
+
await onProgress({
|
|
16063
|
+
workerId,
|
|
16064
|
+
testId: evalCase.id,
|
|
16065
|
+
status: "failed",
|
|
16066
|
+
completedAt: Date.now(),
|
|
16067
|
+
error: haltResult.error
|
|
16068
|
+
});
|
|
16069
|
+
}
|
|
16070
|
+
if (onResult) {
|
|
16071
|
+
await onResult(haltResult);
|
|
16072
|
+
}
|
|
16073
|
+
return haltResult;
|
|
16074
|
+
}
|
|
15993
16075
|
if (onProgress) {
|
|
15994
16076
|
await onProgress({
|
|
15995
16077
|
workerId,
|
|
@@ -16021,7 +16103,8 @@ async function runEvaluation(options) {
|
|
|
16021
16103
|
suiteWorkspaceFile,
|
|
16022
16104
|
streamCallbacks,
|
|
16023
16105
|
typeRegistry,
|
|
16024
|
-
repoManager
|
|
16106
|
+
repoManager,
|
|
16107
|
+
evalDir
|
|
16025
16108
|
};
|
|
16026
16109
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
16027
16110
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -16041,6 +16124,9 @@ async function runEvaluation(options) {
|
|
|
16041
16124
|
}
|
|
16042
16125
|
}
|
|
16043
16126
|
}
|
|
16127
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
16128
|
+
failOnErrorTriggered = true;
|
|
16129
|
+
}
|
|
16044
16130
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
16045
16131
|
result = { ...result, beforeAllOutput };
|
|
16046
16132
|
beforeAllOutputAttached = true;
|
|
@@ -16090,7 +16176,9 @@ async function runEvaluation(options) {
|
|
|
16090
16176
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
16091
16177
|
outcome.reason,
|
|
16092
16178
|
promptInputs,
|
|
16093
|
-
primaryProvider
|
|
16179
|
+
primaryProvider,
|
|
16180
|
+
"agent",
|
|
16181
|
+
"provider_error"
|
|
16094
16182
|
);
|
|
16095
16183
|
results.push(errorResult);
|
|
16096
16184
|
if (onResult) {
|
|
@@ -16102,7 +16190,8 @@ async function runEvaluation(options) {
|
|
|
16102
16190
|
const scriptContext = {
|
|
16103
16191
|
workspacePath: sharedWorkspacePath,
|
|
16104
16192
|
testId: "__after_all__",
|
|
16105
|
-
evalRunId
|
|
16193
|
+
evalRunId,
|
|
16194
|
+
evalDir
|
|
16106
16195
|
};
|
|
16107
16196
|
try {
|
|
16108
16197
|
const afterAllOutput = await executeWorkspaceScript(
|
|
@@ -16232,7 +16321,14 @@ async function runBatchEvaluation(options) {
|
|
|
16232
16321
|
availableTargets
|
|
16233
16322
|
});
|
|
16234
16323
|
if (providerError) {
|
|
16235
|
-
result = {
|
|
16324
|
+
result = {
|
|
16325
|
+
...result,
|
|
16326
|
+
error: providerError,
|
|
16327
|
+
executionStatus: "execution_error",
|
|
16328
|
+
failureStage: "agent",
|
|
16329
|
+
failureReasonCode: "provider_error",
|
|
16330
|
+
executionError: { message: providerError, stage: "agent" }
|
|
16331
|
+
};
|
|
16236
16332
|
}
|
|
16237
16333
|
} catch (error) {
|
|
16238
16334
|
const errorResult = buildErrorResult(
|
|
@@ -16241,7 +16337,9 @@ async function runBatchEvaluation(options) {
|
|
|
16241
16337
|
nowFn(),
|
|
16242
16338
|
error,
|
|
16243
16339
|
promptInputs,
|
|
16244
|
-
provider
|
|
16340
|
+
provider,
|
|
16341
|
+
"evaluator",
|
|
16342
|
+
"evaluator_error"
|
|
16245
16343
|
);
|
|
16246
16344
|
results.push(errorResult);
|
|
16247
16345
|
if (onResult) {
|
|
@@ -16297,7 +16395,8 @@ async function runEvalCase(options) {
|
|
|
16297
16395
|
sharedBaselineCommit,
|
|
16298
16396
|
suiteWorkspaceFile,
|
|
16299
16397
|
typeRegistry: providedTypeRegistry,
|
|
16300
|
-
repoManager
|
|
16398
|
+
repoManager,
|
|
16399
|
+
evalDir
|
|
16301
16400
|
} = options;
|
|
16302
16401
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
16303
16402
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -16330,7 +16429,9 @@ async function runEvalCase(options) {
|
|
|
16330
16429
|
nowFn(),
|
|
16331
16430
|
new Error(`Failed to create workspace: ${message}`),
|
|
16332
16431
|
promptInputs,
|
|
16333
|
-
provider
|
|
16432
|
+
provider,
|
|
16433
|
+
"setup",
|
|
16434
|
+
"template_error"
|
|
16334
16435
|
);
|
|
16335
16436
|
}
|
|
16336
16437
|
}
|
|
@@ -16350,7 +16451,9 @@ async function runEvalCase(options) {
|
|
|
16350
16451
|
nowFn(),
|
|
16351
16452
|
new Error(`Failed to materialize repos: ${message}`),
|
|
16352
16453
|
promptInputs,
|
|
16353
|
-
provider
|
|
16454
|
+
provider,
|
|
16455
|
+
"repo_setup",
|
|
16456
|
+
"clone_error"
|
|
16354
16457
|
);
|
|
16355
16458
|
}
|
|
16356
16459
|
}
|
|
@@ -16360,7 +16463,8 @@ async function runEvalCase(options) {
|
|
|
16360
16463
|
testId: evalCase.id,
|
|
16361
16464
|
evalRunId: evalRunId ?? "",
|
|
16362
16465
|
caseInput: evalCase.question,
|
|
16363
|
-
caseMetadata: evalCase.metadata
|
|
16466
|
+
caseMetadata: evalCase.metadata,
|
|
16467
|
+
evalDir
|
|
16364
16468
|
};
|
|
16365
16469
|
try {
|
|
16366
16470
|
beforeAllOutput = await executeWorkspaceScript(
|
|
@@ -16379,7 +16483,9 @@ async function runEvalCase(options) {
|
|
|
16379
16483
|
nowFn(),
|
|
16380
16484
|
new Error(`before_all script failed: ${message}`),
|
|
16381
16485
|
promptInputs,
|
|
16382
|
-
provider
|
|
16486
|
+
provider,
|
|
16487
|
+
"setup",
|
|
16488
|
+
"script_error"
|
|
16383
16489
|
);
|
|
16384
16490
|
}
|
|
16385
16491
|
}
|
|
@@ -16390,7 +16496,8 @@ async function runEvalCase(options) {
|
|
|
16390
16496
|
testId: evalCase.id,
|
|
16391
16497
|
evalRunId: evalRunId ?? "",
|
|
16392
16498
|
caseInput: evalCase.question,
|
|
16393
|
-
caseMetadata: evalCase.metadata
|
|
16499
|
+
caseMetadata: evalCase.metadata,
|
|
16500
|
+
evalDir
|
|
16394
16501
|
};
|
|
16395
16502
|
try {
|
|
16396
16503
|
beforeEachOutput = await executeWorkspaceScript(
|
|
@@ -16405,7 +16512,9 @@ async function runEvalCase(options) {
|
|
|
16405
16512
|
nowFn(),
|
|
16406
16513
|
new Error(`before_each script failed: ${message}`),
|
|
16407
16514
|
promptInputs,
|
|
16408
|
-
provider
|
|
16515
|
+
provider,
|
|
16516
|
+
"setup",
|
|
16517
|
+
"script_error"
|
|
16409
16518
|
);
|
|
16410
16519
|
}
|
|
16411
16520
|
}
|
|
@@ -16446,7 +16555,9 @@ async function runEvalCase(options) {
|
|
|
16446
16555
|
nowFn(),
|
|
16447
16556
|
error,
|
|
16448
16557
|
promptInputs,
|
|
16449
|
-
provider
|
|
16558
|
+
provider,
|
|
16559
|
+
"agent",
|
|
16560
|
+
"provider_error"
|
|
16450
16561
|
);
|
|
16451
16562
|
if (workspacePath) {
|
|
16452
16563
|
if (forceCleanup) {
|
|
@@ -16465,7 +16576,9 @@ async function runEvalCase(options) {
|
|
|
16465
16576
|
nowFn(),
|
|
16466
16577
|
lastError ?? new Error("Provider did not return a response"),
|
|
16467
16578
|
promptInputs,
|
|
16468
|
-
provider
|
|
16579
|
+
provider,
|
|
16580
|
+
"agent",
|
|
16581
|
+
"provider_error"
|
|
16469
16582
|
);
|
|
16470
16583
|
if (workspacePath) {
|
|
16471
16584
|
if (forceCleanup) {
|
|
@@ -16521,7 +16634,8 @@ async function runEvalCase(options) {
|
|
|
16521
16634
|
testId: evalCase.id,
|
|
16522
16635
|
evalRunId: evalRunId ?? "",
|
|
16523
16636
|
caseInput: evalCase.question,
|
|
16524
|
-
caseMetadata: evalCase.metadata
|
|
16637
|
+
caseMetadata: evalCase.metadata,
|
|
16638
|
+
evalDir
|
|
16525
16639
|
};
|
|
16526
16640
|
try {
|
|
16527
16641
|
afterEachOutput = await executeWorkspaceScript(
|
|
@@ -16557,7 +16671,18 @@ async function runEvalCase(options) {
|
|
|
16557
16671
|
fileChanges,
|
|
16558
16672
|
workspacePath
|
|
16559
16673
|
});
|
|
16560
|
-
const
|
|
16674
|
+
const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
|
|
16675
|
+
const finalResult = providerError ? {
|
|
16676
|
+
...result,
|
|
16677
|
+
error: providerError,
|
|
16678
|
+
executionStatus,
|
|
16679
|
+
failureStage: "agent",
|
|
16680
|
+
failureReasonCode: "provider_error",
|
|
16681
|
+
executionError: { message: providerError, stage: "agent" },
|
|
16682
|
+
beforeAllOutput,
|
|
16683
|
+
beforeEachOutput,
|
|
16684
|
+
afterEachOutput
|
|
16685
|
+
} : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
|
|
16561
16686
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
16562
16687
|
if (workspacePath && !isSharedWorkspace) {
|
|
16563
16688
|
if (forceCleanup) {
|
|
@@ -16578,7 +16703,9 @@ async function runEvalCase(options) {
|
|
|
16578
16703
|
nowFn(),
|
|
16579
16704
|
error,
|
|
16580
16705
|
promptInputs,
|
|
16581
|
-
provider
|
|
16706
|
+
provider,
|
|
16707
|
+
"evaluator",
|
|
16708
|
+
"evaluator_error"
|
|
16582
16709
|
);
|
|
16583
16710
|
if (workspacePath && !isSharedWorkspace) {
|
|
16584
16711
|
if (forceCleanup) {
|
|
@@ -16616,7 +16743,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
16616
16743
|
verdict: trialVerdict,
|
|
16617
16744
|
scores: result.scores,
|
|
16618
16745
|
error: result.error,
|
|
16619
|
-
costUsd: trialCost
|
|
16746
|
+
costUsd: trialCost,
|
|
16747
|
+
executionStatus: result.executionStatus,
|
|
16748
|
+
failureStage: result.failureStage,
|
|
16749
|
+
failureReasonCode: result.failureReasonCode
|
|
16620
16750
|
};
|
|
16621
16751
|
trialResults.push(trial);
|
|
16622
16752
|
if (trialCost !== void 0) {
|
|
@@ -16641,12 +16771,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
16641
16771
|
0
|
|
16642
16772
|
);
|
|
16643
16773
|
const baseResult = allResults[bestTrialIndex];
|
|
16774
|
+
const hasOk = trialResults.some((t) => t.executionStatus === "ok");
|
|
16775
|
+
const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
|
|
16776
|
+
const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
|
|
16777
|
+
const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
|
|
16778
|
+
const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
|
|
16779
|
+
const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
|
|
16644
16780
|
return {
|
|
16645
16781
|
...baseResult,
|
|
16646
16782
|
score,
|
|
16647
16783
|
trials: trialResults,
|
|
16648
16784
|
aggregation,
|
|
16649
|
-
costLimited: costLimited || void 0
|
|
16785
|
+
costLimited: costLimited || void 0,
|
|
16786
|
+
executionStatus: aggregateExecutionStatus,
|
|
16787
|
+
failureStage: aggregateFailureStage,
|
|
16788
|
+
failureReasonCode: aggregateFailureReasonCode,
|
|
16789
|
+
executionError: aggregateExecutionError
|
|
16650
16790
|
};
|
|
16651
16791
|
}
|
|
16652
16792
|
async function evaluateCandidate(options) {
|
|
@@ -16747,7 +16887,8 @@ async function evaluateCandidate(options) {
|
|
|
16747
16887
|
scores,
|
|
16748
16888
|
trace: trace2,
|
|
16749
16889
|
output,
|
|
16750
|
-
fileChanges
|
|
16890
|
+
fileChanges,
|
|
16891
|
+
executionStatus: classifyQualityStatus(score.score)
|
|
16751
16892
|
};
|
|
16752
16893
|
}
|
|
16753
16894
|
async function runEvaluatorsForCase(options) {
|
|
@@ -17052,7 +17193,7 @@ async function invokeProvider(provider, options) {
|
|
|
17052
17193
|
}
|
|
17053
17194
|
}
|
|
17054
17195
|
}
|
|
17055
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
17196
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
17056
17197
|
const message = error instanceof Error ? error.message : String(error);
|
|
17057
17198
|
let agentRequest;
|
|
17058
17199
|
let lmRequest;
|
|
@@ -17095,7 +17236,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
17095
17236
|
target: targetName,
|
|
17096
17237
|
requests,
|
|
17097
17238
|
input,
|
|
17098
|
-
error: message
|
|
17239
|
+
error: message,
|
|
17240
|
+
executionStatus: "execution_error",
|
|
17241
|
+
failureStage,
|
|
17242
|
+
failureReasonCode,
|
|
17243
|
+
executionError: { message, stage: failureStage }
|
|
17099
17244
|
};
|
|
17100
17245
|
}
|
|
17101
17246
|
function extractProviderError(response) {
|
|
@@ -18048,6 +18193,7 @@ function createAgentKernel() {
|
|
|
18048
18193
|
executeWorkspaceScript,
|
|
18049
18194
|
explorationRatio,
|
|
18050
18195
|
extractCacheConfig,
|
|
18196
|
+
extractFailOnError,
|
|
18051
18197
|
extractJsonBlob,
|
|
18052
18198
|
extractTargetFromSuite,
|
|
18053
18199
|
extractTargetsFromSuite,
|