@agentv/core 2.11.4 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-REN5PS7B.js → chunk-7HPKTRFZ.js} +1 -1
- package/dist/chunk-7HPKTRFZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +110 -26
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +37 -1
- package/dist/index.d.ts +37 -1
- package/dist/index.js +111 -27
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-REN5PS7B.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -15706,6 +15706,16 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
15706
15706
|
}
|
|
15707
15707
|
|
|
15708
15708
|
// src/evaluation/workspace/script-executor.ts
|
|
15709
|
+
/**
 * Expand `{{name}}` placeholders in each command argument using values taken
 * from the script context.
 *
 * Recognized placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` and `case_metadata` (the metadata object serialized as JSON).
 * A missing `caseInput` or `caseMetadata` expands to an empty string. Any other
 * placeholder whose value is null/undefined, or whose name is not recognized,
 * is left in the argument verbatim.
 *
 * @param {string[]} args - Command arguments that may contain placeholders.
 * @param {object} context2 - Script context; reads `workspacePath`, `testId`,
 *   `evalRunId`, `caseInput` and `caseMetadata`.
 * @returns {string[]} A new array with placeholders expanded; `args` is not mutated.
 */
function interpolateArgs(args, context2) {
  // A Map only exposes the keys listed here. A plain-object lookup would also
  // resolve inherited names such as `constructor`, `toString` or `__proto__`
  // and splice a stringified built-in into the command line.
  const vars = /* @__PURE__ */ new Map([
    ["workspace_path", context2.workspacePath],
    ["test_id", context2.testId],
    ["eval_run_id", context2.evalRunId],
    ["case_input", context2.caseInput ?? ""],
    ["case_metadata", context2.caseMetadata ? JSON.stringify(context2.caseMetadata) : ""]
  ]);
  const placeholder = /\{\{(\w+)\}\}/g;
  // With a replacer function the returned value is inserted literally, so `$`
  // sequences inside substituted values are not treated as replacement patterns.
  return args.map((arg) => arg.replace(placeholder, (match, name) => vars.get(name) ?? match));
}
|
|
15709
15719
|
async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
15710
15720
|
const stdin = JSON.stringify({
|
|
15711
15721
|
workspace_path: context2.workspacePath,
|
|
@@ -15715,8 +15725,9 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
15715
15725
|
case_metadata: context2.caseMetadata ?? null
|
|
15716
15726
|
});
|
|
15717
15727
|
const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
15718
|
-
const cwd = config.cwd;
|
|
15719
|
-
const
|
|
15728
|
+
const cwd = config.cwd ?? context2.evalDir;
|
|
15729
|
+
const rawCommand = config.command ?? config.script ?? [];
|
|
15730
|
+
const commandArray = interpolateArgs(rawCommand, context2);
|
|
15720
15731
|
const result = await execFileWithStdin(commandArray, stdin, {
|
|
15721
15732
|
timeoutMs,
|
|
15722
15733
|
cwd
|
|
@@ -15733,6 +15744,10 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
15733
15744
|
}
|
|
15734
15745
|
|
|
15735
15746
|
// src/evaluation/orchestrator.ts
|
|
15747
|
+
/**
 * Default minimum score for a result to be classified as "ok".
 * Declared with `var`, as in the original, so hoisting behaviour is unchanged.
 */
var QUALITY_PASS_THRESHOLD = 0.8;

/**
 * Map a numeric score to an execution status.
 *
 * NOTE(review): runEvalCase computes `isFailure` with `score < 0.5`, a different
 * cut-off from the default used here — confirm that the two are meant to differ.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=QUALITY_PASS_THRESHOLD] - Inclusive lower bound for "ok".
 * @returns {"ok" | "quality_failure"} "ok" when `score >= threshold`, otherwise
 *   "quality_failure" (this includes NaN, which never compares as >=).
 */
function classifyQualityStatus(score, threshold = QUALITY_PASS_THRESHOLD) {
  return score >= threshold ? "ok" : "quality_failure";
}
|
|
15736
15751
|
function usesFileReferencePrompt(provider) {
|
|
15737
15752
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
15738
15753
|
}
|
|
@@ -15840,6 +15855,7 @@ async function runEvaluation(options) {
|
|
|
15840
15855
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
15841
15856
|
const typeRegistry = createBuiltinRegistry();
|
|
15842
15857
|
const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
|
|
15858
|
+
const evalDir = discoveryBaseDir;
|
|
15843
15859
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
15844
15860
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
15845
15861
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -15935,7 +15951,8 @@ async function runEvaluation(options) {
|
|
|
15935
15951
|
const scriptContext = {
|
|
15936
15952
|
workspacePath: sharedWorkspacePath,
|
|
15937
15953
|
testId: "__before_all__",
|
|
15938
|
-
evalRunId
|
|
15954
|
+
evalRunId,
|
|
15955
|
+
evalDir
|
|
15939
15956
|
};
|
|
15940
15957
|
try {
|
|
15941
15958
|
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
@@ -15974,7 +15991,14 @@ async function runEvaluation(options) {
|
|
|
15974
15991
|
answer: "",
|
|
15975
15992
|
target: target.name,
|
|
15976
15993
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
15977
|
-
budgetExceeded: true
|
|
15994
|
+
budgetExceeded: true,
|
|
15995
|
+
executionStatus: "execution_error",
|
|
15996
|
+
failureStage: "setup",
|
|
15997
|
+
failureReasonCode: "budget_exceeded",
|
|
15998
|
+
executionError: {
|
|
15999
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16000
|
+
stage: "setup"
|
|
16001
|
+
}
|
|
15978
16002
|
};
|
|
15979
16003
|
if (onProgress) {
|
|
15980
16004
|
await onProgress({
|
|
@@ -16021,7 +16045,8 @@ async function runEvaluation(options) {
|
|
|
16021
16045
|
suiteWorkspaceFile,
|
|
16022
16046
|
streamCallbacks,
|
|
16023
16047
|
typeRegistry,
|
|
16024
|
-
repoManager
|
|
16048
|
+
repoManager,
|
|
16049
|
+
evalDir
|
|
16025
16050
|
};
|
|
16026
16051
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
16027
16052
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -16090,7 +16115,9 @@ async function runEvaluation(options) {
|
|
|
16090
16115
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
16091
16116
|
outcome.reason,
|
|
16092
16117
|
promptInputs,
|
|
16093
|
-
primaryProvider
|
|
16118
|
+
primaryProvider,
|
|
16119
|
+
"agent",
|
|
16120
|
+
"provider_error"
|
|
16094
16121
|
);
|
|
16095
16122
|
results.push(errorResult);
|
|
16096
16123
|
if (onResult) {
|
|
@@ -16102,7 +16129,8 @@ async function runEvaluation(options) {
|
|
|
16102
16129
|
const scriptContext = {
|
|
16103
16130
|
workspacePath: sharedWorkspacePath,
|
|
16104
16131
|
testId: "__after_all__",
|
|
16105
|
-
evalRunId
|
|
16132
|
+
evalRunId,
|
|
16133
|
+
evalDir
|
|
16106
16134
|
};
|
|
16107
16135
|
try {
|
|
16108
16136
|
const afterAllOutput = await executeWorkspaceScript(
|
|
@@ -16232,7 +16260,14 @@ async function runBatchEvaluation(options) {
|
|
|
16232
16260
|
availableTargets
|
|
16233
16261
|
});
|
|
16234
16262
|
if (providerError) {
|
|
16235
|
-
result = {
|
|
16263
|
+
result = {
|
|
16264
|
+
...result,
|
|
16265
|
+
error: providerError,
|
|
16266
|
+
executionStatus: "execution_error",
|
|
16267
|
+
failureStage: "agent",
|
|
16268
|
+
failureReasonCode: "provider_error",
|
|
16269
|
+
executionError: { message: providerError, stage: "agent" }
|
|
16270
|
+
};
|
|
16236
16271
|
}
|
|
16237
16272
|
} catch (error) {
|
|
16238
16273
|
const errorResult = buildErrorResult(
|
|
@@ -16241,7 +16276,9 @@ async function runBatchEvaluation(options) {
|
|
|
16241
16276
|
nowFn(),
|
|
16242
16277
|
error,
|
|
16243
16278
|
promptInputs,
|
|
16244
|
-
provider
|
|
16279
|
+
provider,
|
|
16280
|
+
"evaluator",
|
|
16281
|
+
"evaluator_error"
|
|
16245
16282
|
);
|
|
16246
16283
|
results.push(errorResult);
|
|
16247
16284
|
if (onResult) {
|
|
@@ -16297,7 +16334,8 @@ async function runEvalCase(options) {
|
|
|
16297
16334
|
sharedBaselineCommit,
|
|
16298
16335
|
suiteWorkspaceFile,
|
|
16299
16336
|
typeRegistry: providedTypeRegistry,
|
|
16300
|
-
repoManager
|
|
16337
|
+
repoManager,
|
|
16338
|
+
evalDir
|
|
16301
16339
|
} = options;
|
|
16302
16340
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
16303
16341
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -16330,7 +16368,9 @@ async function runEvalCase(options) {
|
|
|
16330
16368
|
nowFn(),
|
|
16331
16369
|
new Error(`Failed to create workspace: ${message}`),
|
|
16332
16370
|
promptInputs,
|
|
16333
|
-
provider
|
|
16371
|
+
provider,
|
|
16372
|
+
"setup",
|
|
16373
|
+
"template_error"
|
|
16334
16374
|
);
|
|
16335
16375
|
}
|
|
16336
16376
|
}
|
|
@@ -16350,7 +16390,9 @@ async function runEvalCase(options) {
|
|
|
16350
16390
|
nowFn(),
|
|
16351
16391
|
new Error(`Failed to materialize repos: ${message}`),
|
|
16352
16392
|
promptInputs,
|
|
16353
|
-
provider
|
|
16393
|
+
provider,
|
|
16394
|
+
"repo_setup",
|
|
16395
|
+
"clone_error"
|
|
16354
16396
|
);
|
|
16355
16397
|
}
|
|
16356
16398
|
}
|
|
@@ -16360,7 +16402,8 @@ async function runEvalCase(options) {
|
|
|
16360
16402
|
testId: evalCase.id,
|
|
16361
16403
|
evalRunId: evalRunId ?? "",
|
|
16362
16404
|
caseInput: evalCase.question,
|
|
16363
|
-
caseMetadata: evalCase.metadata
|
|
16405
|
+
caseMetadata: evalCase.metadata,
|
|
16406
|
+
evalDir
|
|
16364
16407
|
};
|
|
16365
16408
|
try {
|
|
16366
16409
|
beforeAllOutput = await executeWorkspaceScript(
|
|
@@ -16379,7 +16422,9 @@ async function runEvalCase(options) {
|
|
|
16379
16422
|
nowFn(),
|
|
16380
16423
|
new Error(`before_all script failed: ${message}`),
|
|
16381
16424
|
promptInputs,
|
|
16382
|
-
provider
|
|
16425
|
+
provider,
|
|
16426
|
+
"setup",
|
|
16427
|
+
"script_error"
|
|
16383
16428
|
);
|
|
16384
16429
|
}
|
|
16385
16430
|
}
|
|
@@ -16390,7 +16435,8 @@ async function runEvalCase(options) {
|
|
|
16390
16435
|
testId: evalCase.id,
|
|
16391
16436
|
evalRunId: evalRunId ?? "",
|
|
16392
16437
|
caseInput: evalCase.question,
|
|
16393
|
-
caseMetadata: evalCase.metadata
|
|
16438
|
+
caseMetadata: evalCase.metadata,
|
|
16439
|
+
evalDir
|
|
16394
16440
|
};
|
|
16395
16441
|
try {
|
|
16396
16442
|
beforeEachOutput = await executeWorkspaceScript(
|
|
@@ -16405,7 +16451,9 @@ async function runEvalCase(options) {
|
|
|
16405
16451
|
nowFn(),
|
|
16406
16452
|
new Error(`before_each script failed: ${message}`),
|
|
16407
16453
|
promptInputs,
|
|
16408
|
-
provider
|
|
16454
|
+
provider,
|
|
16455
|
+
"setup",
|
|
16456
|
+
"script_error"
|
|
16409
16457
|
);
|
|
16410
16458
|
}
|
|
16411
16459
|
}
|
|
@@ -16446,7 +16494,9 @@ async function runEvalCase(options) {
|
|
|
16446
16494
|
nowFn(),
|
|
16447
16495
|
error,
|
|
16448
16496
|
promptInputs,
|
|
16449
|
-
provider
|
|
16497
|
+
provider,
|
|
16498
|
+
"agent",
|
|
16499
|
+
"provider_error"
|
|
16450
16500
|
);
|
|
16451
16501
|
if (workspacePath) {
|
|
16452
16502
|
if (forceCleanup) {
|
|
@@ -16465,7 +16515,9 @@ async function runEvalCase(options) {
|
|
|
16465
16515
|
nowFn(),
|
|
16466
16516
|
lastError ?? new Error("Provider did not return a response"),
|
|
16467
16517
|
promptInputs,
|
|
16468
|
-
provider
|
|
16518
|
+
provider,
|
|
16519
|
+
"agent",
|
|
16520
|
+
"provider_error"
|
|
16469
16521
|
);
|
|
16470
16522
|
if (workspacePath) {
|
|
16471
16523
|
if (forceCleanup) {
|
|
@@ -16521,7 +16573,8 @@ async function runEvalCase(options) {
|
|
|
16521
16573
|
testId: evalCase.id,
|
|
16522
16574
|
evalRunId: evalRunId ?? "",
|
|
16523
16575
|
caseInput: evalCase.question,
|
|
16524
|
-
caseMetadata: evalCase.metadata
|
|
16576
|
+
caseMetadata: evalCase.metadata,
|
|
16577
|
+
evalDir
|
|
16525
16578
|
};
|
|
16526
16579
|
try {
|
|
16527
16580
|
afterEachOutput = await executeWorkspaceScript(
|
|
@@ -16557,7 +16610,18 @@ async function runEvalCase(options) {
|
|
|
16557
16610
|
fileChanges,
|
|
16558
16611
|
workspacePath
|
|
16559
16612
|
});
|
|
16560
|
-
const
|
|
16613
|
+
const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
|
|
16614
|
+
const finalResult = providerError ? {
|
|
16615
|
+
...result,
|
|
16616
|
+
error: providerError,
|
|
16617
|
+
executionStatus,
|
|
16618
|
+
failureStage: "agent",
|
|
16619
|
+
failureReasonCode: "provider_error",
|
|
16620
|
+
executionError: { message: providerError, stage: "agent" },
|
|
16621
|
+
beforeAllOutput,
|
|
16622
|
+
beforeEachOutput,
|
|
16623
|
+
afterEachOutput
|
|
16624
|
+
} : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
|
|
16561
16625
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
16562
16626
|
if (workspacePath && !isSharedWorkspace) {
|
|
16563
16627
|
if (forceCleanup) {
|
|
@@ -16578,7 +16642,9 @@ async function runEvalCase(options) {
|
|
|
16578
16642
|
nowFn(),
|
|
16579
16643
|
error,
|
|
16580
16644
|
promptInputs,
|
|
16581
|
-
provider
|
|
16645
|
+
provider,
|
|
16646
|
+
"evaluator",
|
|
16647
|
+
"evaluator_error"
|
|
16582
16648
|
);
|
|
16583
16649
|
if (workspacePath && !isSharedWorkspace) {
|
|
16584
16650
|
if (forceCleanup) {
|
|
@@ -16616,7 +16682,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
16616
16682
|
verdict: trialVerdict,
|
|
16617
16683
|
scores: result.scores,
|
|
16618
16684
|
error: result.error,
|
|
16619
|
-
costUsd: trialCost
|
|
16685
|
+
costUsd: trialCost,
|
|
16686
|
+
executionStatus: result.executionStatus,
|
|
16687
|
+
failureStage: result.failureStage,
|
|
16688
|
+
failureReasonCode: result.failureReasonCode
|
|
16620
16689
|
};
|
|
16621
16690
|
trialResults.push(trial);
|
|
16622
16691
|
if (trialCost !== void 0) {
|
|
@@ -16641,12 +16710,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
16641
16710
|
0
|
|
16642
16711
|
);
|
|
16643
16712
|
const baseResult = allResults[bestTrialIndex];
|
|
16713
|
+
const hasOk = trialResults.some((t) => t.executionStatus === "ok");
|
|
16714
|
+
const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
|
|
16715
|
+
const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
|
|
16716
|
+
const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
|
|
16717
|
+
const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
|
|
16718
|
+
const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
|
|
16644
16719
|
return {
|
|
16645
16720
|
...baseResult,
|
|
16646
16721
|
score,
|
|
16647
16722
|
trials: trialResults,
|
|
16648
16723
|
aggregation,
|
|
16649
|
-
costLimited: costLimited || void 0
|
|
16724
|
+
costLimited: costLimited || void 0,
|
|
16725
|
+
executionStatus: aggregateExecutionStatus,
|
|
16726
|
+
failureStage: aggregateFailureStage,
|
|
16727
|
+
failureReasonCode: aggregateFailureReasonCode,
|
|
16728
|
+
executionError: aggregateExecutionError
|
|
16650
16729
|
};
|
|
16651
16730
|
}
|
|
16652
16731
|
async function evaluateCandidate(options) {
|
|
@@ -16747,7 +16826,8 @@ async function evaluateCandidate(options) {
|
|
|
16747
16826
|
scores,
|
|
16748
16827
|
trace: trace2,
|
|
16749
16828
|
output,
|
|
16750
|
-
fileChanges
|
|
16829
|
+
fileChanges,
|
|
16830
|
+
executionStatus: classifyQualityStatus(score.score)
|
|
16751
16831
|
};
|
|
16752
16832
|
}
|
|
16753
16833
|
async function runEvaluatorsForCase(options) {
|
|
@@ -17052,7 +17132,7 @@ async function invokeProvider(provider, options) {
|
|
|
17052
17132
|
}
|
|
17053
17133
|
}
|
|
17054
17134
|
}
|
|
17055
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
17135
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
17056
17136
|
const message = error instanceof Error ? error.message : String(error);
|
|
17057
17137
|
let agentRequest;
|
|
17058
17138
|
let lmRequest;
|
|
@@ -17095,7 +17175,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
17095
17175
|
target: targetName,
|
|
17096
17176
|
requests,
|
|
17097
17177
|
input,
|
|
17098
|
-
error: message
|
|
17178
|
+
error: message,
|
|
17179
|
+
executionStatus: "execution_error",
|
|
17180
|
+
failureStage,
|
|
17181
|
+
failureReasonCode,
|
|
17182
|
+
executionError: { message, stage: failureStage }
|
|
17099
17183
|
};
|
|
17100
17184
|
}
|
|
17101
17185
|
function extractProviderError(response) {
|