@agentv/core 2.11.4 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-REN5PS7B.js → chunk-JHER2LQ5.js} +1 -1
- package/dist/chunk-JHER2LQ5.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +174 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -1
- package/dist/index.d.ts +54 -1
- package/dist/index.js +174 -29
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-REN5PS7B.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-REN5PS7B.js";
|
|
20
|
+
} from "./chunk-JHER2LQ5.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -396,6 +396,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
396
396
|
continue;
|
|
397
397
|
}
|
|
398
398
|
const config = parsed;
|
|
399
|
+
const requiredVersion = parsed.required_version;
|
|
400
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
401
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
402
|
+
continue;
|
|
403
|
+
}
|
|
399
404
|
const guidelinePatterns = config.guideline_patterns;
|
|
400
405
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
401
406
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -419,6 +424,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
419
424
|
configPath
|
|
420
425
|
);
|
|
421
426
|
return {
|
|
427
|
+
required_version: requiredVersion,
|
|
422
428
|
guideline_patterns: guidelinePatterns,
|
|
423
429
|
eval_patterns: evalPatterns,
|
|
424
430
|
execution: executionDefaults
|
|
@@ -562,6 +568,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
562
568
|
);
|
|
563
569
|
return void 0;
|
|
564
570
|
}
|
|
571
|
+
function extractFailOnError(suite) {
|
|
572
|
+
const execution = suite.execution;
|
|
573
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
574
|
+
return void 0;
|
|
575
|
+
}
|
|
576
|
+
const executionObj = execution;
|
|
577
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
578
|
+
if (raw === void 0 || raw === null) {
|
|
579
|
+
return void 0;
|
|
580
|
+
}
|
|
581
|
+
if (typeof raw === "boolean") {
|
|
582
|
+
return raw;
|
|
583
|
+
}
|
|
584
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
585
|
+
return void 0;
|
|
586
|
+
}
|
|
565
587
|
function parseExecutionDefaults(raw, configPath) {
|
|
566
588
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
567
589
|
return void 0;
|
|
@@ -2757,13 +2779,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2757
2779
|
}
|
|
2758
2780
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
2759
2781
|
const metadata = parseMetadata(parsed);
|
|
2782
|
+
const failOnError = extractFailOnError(parsed);
|
|
2760
2783
|
return {
|
|
2761
2784
|
tests,
|
|
2762
2785
|
trials: extractTrialsConfig(parsed),
|
|
2763
2786
|
targets: extractTargetsFromSuite(parsed),
|
|
2764
2787
|
cacheConfig: extractCacheConfig(parsed),
|
|
2765
2788
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2766
|
-
...metadata !== void 0 && { metadata }
|
|
2789
|
+
...metadata !== void 0 && { metadata },
|
|
2790
|
+
...failOnError !== void 0 && { failOnError }
|
|
2767
2791
|
};
|
|
2768
2792
|
}
|
|
2769
2793
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -12847,6 +12871,16 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
12847
12871
|
}
|
|
12848
12872
|
|
|
12849
12873
|
// src/evaluation/workspace/script-executor.ts
|
|
12874
|
+
function interpolateArgs(args, context) {
|
|
12875
|
+
const vars = {
|
|
12876
|
+
workspace_path: context.workspacePath,
|
|
12877
|
+
test_id: context.testId,
|
|
12878
|
+
eval_run_id: context.evalRunId,
|
|
12879
|
+
case_input: context.caseInput ?? "",
|
|
12880
|
+
case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
|
|
12881
|
+
};
|
|
12882
|
+
return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
|
|
12883
|
+
}
|
|
12850
12884
|
async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
12851
12885
|
const stdin = JSON.stringify({
|
|
12852
12886
|
workspace_path: context.workspacePath,
|
|
@@ -12856,8 +12890,9 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12856
12890
|
case_metadata: context.caseMetadata ?? null
|
|
12857
12891
|
});
|
|
12858
12892
|
const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
12859
|
-
const cwd = config.cwd;
|
|
12860
|
-
const
|
|
12893
|
+
const cwd = config.cwd ?? context.evalDir;
|
|
12894
|
+
const rawCommand = config.command ?? config.script ?? [];
|
|
12895
|
+
const commandArray = interpolateArgs(rawCommand, context);
|
|
12861
12896
|
const result = await execFileWithStdin(commandArray, stdin, {
|
|
12862
12897
|
timeoutMs,
|
|
12863
12898
|
cwd
|
|
@@ -12874,6 +12909,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12874
12909
|
}
|
|
12875
12910
|
|
|
12876
12911
|
// src/evaluation/orchestrator.ts
|
|
12912
|
+
var QUALITY_PASS_THRESHOLD = 0.8;
|
|
12913
|
+
function classifyQualityStatus(score) {
|
|
12914
|
+
return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
12915
|
+
}
|
|
12877
12916
|
function usesFileReferencePrompt(provider) {
|
|
12878
12917
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
12879
12918
|
}
|
|
@@ -12906,7 +12945,8 @@ async function runEvaluation(options) {
|
|
|
12906
12945
|
cleanupWorkspaces,
|
|
12907
12946
|
trials,
|
|
12908
12947
|
streamCallbacks,
|
|
12909
|
-
totalBudgetUsd
|
|
12948
|
+
totalBudgetUsd,
|
|
12949
|
+
failOnError
|
|
12910
12950
|
} = options;
|
|
12911
12951
|
let useCache = options.useCache;
|
|
12912
12952
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -12981,6 +13021,7 @@ async function runEvaluation(options) {
|
|
|
12981
13021
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
12982
13022
|
const typeRegistry = createBuiltinRegistry();
|
|
12983
13023
|
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
13024
|
+
const evalDir = discoveryBaseDir;
|
|
12984
13025
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
12985
13026
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
12986
13027
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -13076,7 +13117,8 @@ async function runEvaluation(options) {
|
|
|
13076
13117
|
const scriptContext = {
|
|
13077
13118
|
workspacePath: sharedWorkspacePath,
|
|
13078
13119
|
testId: "__before_all__",
|
|
13079
|
-
evalRunId
|
|
13120
|
+
evalRunId,
|
|
13121
|
+
evalDir
|
|
13080
13122
|
};
|
|
13081
13123
|
try {
|
|
13082
13124
|
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
@@ -13100,6 +13142,7 @@ async function runEvaluation(options) {
|
|
|
13100
13142
|
let beforeAllOutputAttached = false;
|
|
13101
13143
|
let cumulativeBudgetCost = 0;
|
|
13102
13144
|
let budgetExhausted = false;
|
|
13145
|
+
let failOnErrorTriggered = false;
|
|
13103
13146
|
const promises = filteredEvalCases.map(
|
|
13104
13147
|
(evalCase) => limit(async () => {
|
|
13105
13148
|
const workerId = nextWorkerId++;
|
|
@@ -13115,7 +13158,14 @@ async function runEvaluation(options) {
|
|
|
13115
13158
|
answer: "",
|
|
13116
13159
|
target: target.name,
|
|
13117
13160
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13118
|
-
budgetExceeded: true
|
|
13161
|
+
budgetExceeded: true,
|
|
13162
|
+
executionStatus: "execution_error",
|
|
13163
|
+
failureStage: "setup",
|
|
13164
|
+
failureReasonCode: "budget_exceeded",
|
|
13165
|
+
executionError: {
|
|
13166
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13167
|
+
stage: "setup"
|
|
13168
|
+
}
|
|
13119
13169
|
};
|
|
13120
13170
|
if (onProgress) {
|
|
13121
13171
|
await onProgress({
|
|
@@ -13131,6 +13181,37 @@ async function runEvaluation(options) {
|
|
|
13131
13181
|
}
|
|
13132
13182
|
return budgetResult;
|
|
13133
13183
|
}
|
|
13184
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
13185
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
13186
|
+
const haltResult = {
|
|
13187
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13188
|
+
testId: evalCase.id,
|
|
13189
|
+
dataset: evalCase.dataset,
|
|
13190
|
+
score: 0,
|
|
13191
|
+
hits: [],
|
|
13192
|
+
misses: [],
|
|
13193
|
+
answer: "",
|
|
13194
|
+
target: target.name,
|
|
13195
|
+
error: errorMsg,
|
|
13196
|
+
executionStatus: "execution_error",
|
|
13197
|
+
failureStage: "setup",
|
|
13198
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
13199
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
13200
|
+
};
|
|
13201
|
+
if (onProgress) {
|
|
13202
|
+
await onProgress({
|
|
13203
|
+
workerId,
|
|
13204
|
+
testId: evalCase.id,
|
|
13205
|
+
status: "failed",
|
|
13206
|
+
completedAt: Date.now(),
|
|
13207
|
+
error: haltResult.error
|
|
13208
|
+
});
|
|
13209
|
+
}
|
|
13210
|
+
if (onResult) {
|
|
13211
|
+
await onResult(haltResult);
|
|
13212
|
+
}
|
|
13213
|
+
return haltResult;
|
|
13214
|
+
}
|
|
13134
13215
|
if (onProgress) {
|
|
13135
13216
|
await onProgress({
|
|
13136
13217
|
workerId,
|
|
@@ -13162,7 +13243,8 @@ async function runEvaluation(options) {
|
|
|
13162
13243
|
suiteWorkspaceFile,
|
|
13163
13244
|
streamCallbacks,
|
|
13164
13245
|
typeRegistry,
|
|
13165
|
-
repoManager
|
|
13246
|
+
repoManager,
|
|
13247
|
+
evalDir
|
|
13166
13248
|
};
|
|
13167
13249
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
13168
13250
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -13182,6 +13264,9 @@ async function runEvaluation(options) {
|
|
|
13182
13264
|
}
|
|
13183
13265
|
}
|
|
13184
13266
|
}
|
|
13267
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
13268
|
+
failOnErrorTriggered = true;
|
|
13269
|
+
}
|
|
13185
13270
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
13186
13271
|
result = { ...result, beforeAllOutput };
|
|
13187
13272
|
beforeAllOutputAttached = true;
|
|
@@ -13231,7 +13316,9 @@ async function runEvaluation(options) {
|
|
|
13231
13316
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
13232
13317
|
outcome.reason,
|
|
13233
13318
|
promptInputs,
|
|
13234
|
-
primaryProvider
|
|
13319
|
+
primaryProvider,
|
|
13320
|
+
"agent",
|
|
13321
|
+
"provider_error"
|
|
13235
13322
|
);
|
|
13236
13323
|
results.push(errorResult);
|
|
13237
13324
|
if (onResult) {
|
|
@@ -13243,7 +13330,8 @@ async function runEvaluation(options) {
|
|
|
13243
13330
|
const scriptContext = {
|
|
13244
13331
|
workspacePath: sharedWorkspacePath,
|
|
13245
13332
|
testId: "__after_all__",
|
|
13246
|
-
evalRunId
|
|
13333
|
+
evalRunId,
|
|
13334
|
+
evalDir
|
|
13247
13335
|
};
|
|
13248
13336
|
try {
|
|
13249
13337
|
const afterAllOutput = await executeWorkspaceScript(
|
|
@@ -13373,7 +13461,14 @@ async function runBatchEvaluation(options) {
|
|
|
13373
13461
|
availableTargets
|
|
13374
13462
|
});
|
|
13375
13463
|
if (providerError) {
|
|
13376
|
-
result = {
|
|
13464
|
+
result = {
|
|
13465
|
+
...result,
|
|
13466
|
+
error: providerError,
|
|
13467
|
+
executionStatus: "execution_error",
|
|
13468
|
+
failureStage: "agent",
|
|
13469
|
+
failureReasonCode: "provider_error",
|
|
13470
|
+
executionError: { message: providerError, stage: "agent" }
|
|
13471
|
+
};
|
|
13377
13472
|
}
|
|
13378
13473
|
} catch (error) {
|
|
13379
13474
|
const errorResult = buildErrorResult(
|
|
@@ -13382,7 +13477,9 @@ async function runBatchEvaluation(options) {
|
|
|
13382
13477
|
nowFn(),
|
|
13383
13478
|
error,
|
|
13384
13479
|
promptInputs,
|
|
13385
|
-
provider
|
|
13480
|
+
provider,
|
|
13481
|
+
"evaluator",
|
|
13482
|
+
"evaluator_error"
|
|
13386
13483
|
);
|
|
13387
13484
|
results.push(errorResult);
|
|
13388
13485
|
if (onResult) {
|
|
@@ -13438,7 +13535,8 @@ async function runEvalCase(options) {
|
|
|
13438
13535
|
sharedBaselineCommit,
|
|
13439
13536
|
suiteWorkspaceFile,
|
|
13440
13537
|
typeRegistry: providedTypeRegistry,
|
|
13441
|
-
repoManager
|
|
13538
|
+
repoManager,
|
|
13539
|
+
evalDir
|
|
13442
13540
|
} = options;
|
|
13443
13541
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
13444
13542
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -13471,7 +13569,9 @@ async function runEvalCase(options) {
|
|
|
13471
13569
|
nowFn(),
|
|
13472
13570
|
new Error(`Failed to create workspace: ${message}`),
|
|
13473
13571
|
promptInputs,
|
|
13474
|
-
provider
|
|
13572
|
+
provider,
|
|
13573
|
+
"setup",
|
|
13574
|
+
"template_error"
|
|
13475
13575
|
);
|
|
13476
13576
|
}
|
|
13477
13577
|
}
|
|
@@ -13491,7 +13591,9 @@ async function runEvalCase(options) {
|
|
|
13491
13591
|
nowFn(),
|
|
13492
13592
|
new Error(`Failed to materialize repos: ${message}`),
|
|
13493
13593
|
promptInputs,
|
|
13494
|
-
provider
|
|
13594
|
+
provider,
|
|
13595
|
+
"repo_setup",
|
|
13596
|
+
"clone_error"
|
|
13495
13597
|
);
|
|
13496
13598
|
}
|
|
13497
13599
|
}
|
|
@@ -13501,7 +13603,8 @@ async function runEvalCase(options) {
|
|
|
13501
13603
|
testId: evalCase.id,
|
|
13502
13604
|
evalRunId: evalRunId ?? "",
|
|
13503
13605
|
caseInput: evalCase.question,
|
|
13504
|
-
caseMetadata: evalCase.metadata
|
|
13606
|
+
caseMetadata: evalCase.metadata,
|
|
13607
|
+
evalDir
|
|
13505
13608
|
};
|
|
13506
13609
|
try {
|
|
13507
13610
|
beforeAllOutput = await executeWorkspaceScript(
|
|
@@ -13520,7 +13623,9 @@ async function runEvalCase(options) {
|
|
|
13520
13623
|
nowFn(),
|
|
13521
13624
|
new Error(`before_all script failed: ${message}`),
|
|
13522
13625
|
promptInputs,
|
|
13523
|
-
provider
|
|
13626
|
+
provider,
|
|
13627
|
+
"setup",
|
|
13628
|
+
"script_error"
|
|
13524
13629
|
);
|
|
13525
13630
|
}
|
|
13526
13631
|
}
|
|
@@ -13531,7 +13636,8 @@ async function runEvalCase(options) {
|
|
|
13531
13636
|
testId: evalCase.id,
|
|
13532
13637
|
evalRunId: evalRunId ?? "",
|
|
13533
13638
|
caseInput: evalCase.question,
|
|
13534
|
-
caseMetadata: evalCase.metadata
|
|
13639
|
+
caseMetadata: evalCase.metadata,
|
|
13640
|
+
evalDir
|
|
13535
13641
|
};
|
|
13536
13642
|
try {
|
|
13537
13643
|
beforeEachOutput = await executeWorkspaceScript(
|
|
@@ -13546,7 +13652,9 @@ async function runEvalCase(options) {
|
|
|
13546
13652
|
nowFn(),
|
|
13547
13653
|
new Error(`before_each script failed: ${message}`),
|
|
13548
13654
|
promptInputs,
|
|
13549
|
-
provider
|
|
13655
|
+
provider,
|
|
13656
|
+
"setup",
|
|
13657
|
+
"script_error"
|
|
13550
13658
|
);
|
|
13551
13659
|
}
|
|
13552
13660
|
}
|
|
@@ -13587,7 +13695,9 @@ async function runEvalCase(options) {
|
|
|
13587
13695
|
nowFn(),
|
|
13588
13696
|
error,
|
|
13589
13697
|
promptInputs,
|
|
13590
|
-
provider
|
|
13698
|
+
provider,
|
|
13699
|
+
"agent",
|
|
13700
|
+
"provider_error"
|
|
13591
13701
|
);
|
|
13592
13702
|
if (workspacePath) {
|
|
13593
13703
|
if (forceCleanup) {
|
|
@@ -13606,7 +13716,9 @@ async function runEvalCase(options) {
|
|
|
13606
13716
|
nowFn(),
|
|
13607
13717
|
lastError ?? new Error("Provider did not return a response"),
|
|
13608
13718
|
promptInputs,
|
|
13609
|
-
provider
|
|
13719
|
+
provider,
|
|
13720
|
+
"agent",
|
|
13721
|
+
"provider_error"
|
|
13610
13722
|
);
|
|
13611
13723
|
if (workspacePath) {
|
|
13612
13724
|
if (forceCleanup) {
|
|
@@ -13662,7 +13774,8 @@ async function runEvalCase(options) {
|
|
|
13662
13774
|
testId: evalCase.id,
|
|
13663
13775
|
evalRunId: evalRunId ?? "",
|
|
13664
13776
|
caseInput: evalCase.question,
|
|
13665
|
-
caseMetadata: evalCase.metadata
|
|
13777
|
+
caseMetadata: evalCase.metadata,
|
|
13778
|
+
evalDir
|
|
13666
13779
|
};
|
|
13667
13780
|
try {
|
|
13668
13781
|
afterEachOutput = await executeWorkspaceScript(
|
|
@@ -13698,7 +13811,18 @@ async function runEvalCase(options) {
|
|
|
13698
13811
|
fileChanges,
|
|
13699
13812
|
workspacePath
|
|
13700
13813
|
});
|
|
13701
|
-
const
|
|
13814
|
+
const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
|
|
13815
|
+
const finalResult = providerError ? {
|
|
13816
|
+
...result,
|
|
13817
|
+
error: providerError,
|
|
13818
|
+
executionStatus,
|
|
13819
|
+
failureStage: "agent",
|
|
13820
|
+
failureReasonCode: "provider_error",
|
|
13821
|
+
executionError: { message: providerError, stage: "agent" },
|
|
13822
|
+
beforeAllOutput,
|
|
13823
|
+
beforeEachOutput,
|
|
13824
|
+
afterEachOutput
|
|
13825
|
+
} : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
|
|
13702
13826
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
13703
13827
|
if (workspacePath && !isSharedWorkspace) {
|
|
13704
13828
|
if (forceCleanup) {
|
|
@@ -13719,7 +13843,9 @@ async function runEvalCase(options) {
|
|
|
13719
13843
|
nowFn(),
|
|
13720
13844
|
error,
|
|
13721
13845
|
promptInputs,
|
|
13722
|
-
provider
|
|
13846
|
+
provider,
|
|
13847
|
+
"evaluator",
|
|
13848
|
+
"evaluator_error"
|
|
13723
13849
|
);
|
|
13724
13850
|
if (workspacePath && !isSharedWorkspace) {
|
|
13725
13851
|
if (forceCleanup) {
|
|
@@ -13757,7 +13883,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
13757
13883
|
verdict: trialVerdict,
|
|
13758
13884
|
scores: result.scores,
|
|
13759
13885
|
error: result.error,
|
|
13760
|
-
costUsd: trialCost
|
|
13886
|
+
costUsd: trialCost,
|
|
13887
|
+
executionStatus: result.executionStatus,
|
|
13888
|
+
failureStage: result.failureStage,
|
|
13889
|
+
failureReasonCode: result.failureReasonCode
|
|
13761
13890
|
};
|
|
13762
13891
|
trialResults.push(trial);
|
|
13763
13892
|
if (trialCost !== void 0) {
|
|
@@ -13782,12 +13911,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
13782
13911
|
0
|
|
13783
13912
|
);
|
|
13784
13913
|
const baseResult = allResults[bestTrialIndex];
|
|
13914
|
+
const hasOk = trialResults.some((t) => t.executionStatus === "ok");
|
|
13915
|
+
const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
|
|
13916
|
+
const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
|
|
13917
|
+
const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
|
|
13918
|
+
const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
|
|
13919
|
+
const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
|
|
13785
13920
|
return {
|
|
13786
13921
|
...baseResult,
|
|
13787
13922
|
score,
|
|
13788
13923
|
trials: trialResults,
|
|
13789
13924
|
aggregation,
|
|
13790
|
-
costLimited: costLimited || void 0
|
|
13925
|
+
costLimited: costLimited || void 0,
|
|
13926
|
+
executionStatus: aggregateExecutionStatus,
|
|
13927
|
+
failureStage: aggregateFailureStage,
|
|
13928
|
+
failureReasonCode: aggregateFailureReasonCode,
|
|
13929
|
+
executionError: aggregateExecutionError
|
|
13791
13930
|
};
|
|
13792
13931
|
}
|
|
13793
13932
|
async function evaluateCandidate(options) {
|
|
@@ -13888,7 +14027,8 @@ async function evaluateCandidate(options) {
|
|
|
13888
14027
|
scores,
|
|
13889
14028
|
trace,
|
|
13890
14029
|
output,
|
|
13891
|
-
fileChanges
|
|
14030
|
+
fileChanges,
|
|
14031
|
+
executionStatus: classifyQualityStatus(score.score)
|
|
13892
14032
|
};
|
|
13893
14033
|
}
|
|
13894
14034
|
async function runEvaluatorsForCase(options) {
|
|
@@ -14193,7 +14333,7 @@ async function invokeProvider(provider, options) {
|
|
|
14193
14333
|
}
|
|
14194
14334
|
}
|
|
14195
14335
|
}
|
|
14196
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
14336
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
14197
14337
|
const message = error instanceof Error ? error.message : String(error);
|
|
14198
14338
|
let agentRequest;
|
|
14199
14339
|
let lmRequest;
|
|
@@ -14236,7 +14376,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
14236
14376
|
target: targetName,
|
|
14237
14377
|
requests,
|
|
14238
14378
|
input,
|
|
14239
|
-
error: message
|
|
14379
|
+
error: message,
|
|
14380
|
+
executionStatus: "execution_error",
|
|
14381
|
+
failureStage,
|
|
14382
|
+
failureReasonCode,
|
|
14383
|
+
executionError: { message, stage: failureStage }
|
|
14240
14384
|
};
|
|
14241
14385
|
}
|
|
14242
14386
|
function extractProviderError(response) {
|
|
@@ -15184,6 +15328,7 @@ export {
|
|
|
15184
15328
|
executeWorkspaceScript,
|
|
15185
15329
|
explorationRatio,
|
|
15186
15330
|
extractCacheConfig,
|
|
15331
|
+
extractFailOnError,
|
|
15187
15332
|
extractJsonBlob,
|
|
15188
15333
|
extractTargetFromSuite,
|
|
15189
15334
|
extractTargetsFromSuite,
|