agentv 2.11.4 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-FBGAD3CQ.js → chunk-6KU2ZUFJ.js} +5 -5
- package/dist/{chunk-FBGAD3CQ.js.map → chunk-6KU2ZUFJ.js.map} +1 -1
- package/dist/{chunk-KWUTY5XR.js → chunk-LUHCYBMD.js} +113 -29
- package/dist/chunk-LUHCYBMD.js.map +1 -0
- package/dist/{chunk-APGYGAVM.js → chunk-YBJX5CP6.js} +67 -18
- package/dist/chunk-YBJX5CP6.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-RVGCGRG4.js → dist-OPPA4P5R.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-O2KBWGEI.js → interactive-TOUKPSHP.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-APGYGAVM.js.map +0 -1
- package/dist/chunk-KWUTY5XR.js.map +0 -1
- package/dist/{dist-RVGCGRG4.js.map → dist-OPPA4P5R.js.map} +0 -0
- package/dist/{interactive-O2KBWGEI.js.map → interactive-TOUKPSHP.js.map} +0 -0
|
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
|
|
|
148
148
|
}
|
|
149
149
|
});
|
|
150
150
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-7HPKTRFZ.js
|
|
152
152
|
import { constants } from "node:fs";
|
|
153
153
|
import { access, readFile } from "node:fs/promises";
|
|
154
154
|
import path from "node:path";
|
|
@@ -4195,7 +4195,7 @@ var coerce = {
|
|
|
4195
4195
|
};
|
|
4196
4196
|
var NEVER = INVALID;
|
|
4197
4197
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-7HPKTRFZ.js
|
|
4199
4199
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
4200
4200
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
4201
4201
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -46481,6 +46481,16 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
46481
46481
|
}
|
|
46482
46482
|
return { dir: resolved };
|
|
46483
46483
|
}
|
|
46484
|
+
function interpolateArgs(args, context) {
|
|
46485
|
+
const vars = {
|
|
46486
|
+
workspace_path: context.workspacePath,
|
|
46487
|
+
test_id: context.testId,
|
|
46488
|
+
eval_run_id: context.evalRunId,
|
|
46489
|
+
case_input: context.caseInput ?? "",
|
|
46490
|
+
case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
|
|
46491
|
+
};
|
|
46492
|
+
return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name16) => vars[name16] ?? match));
|
|
46493
|
+
}
|
|
46484
46494
|
async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
46485
46495
|
const stdin = JSON.stringify({
|
|
46486
46496
|
workspace_path: context.workspacePath,
|
|
@@ -46490,8 +46500,9 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
|
46490
46500
|
case_metadata: context.caseMetadata ?? null
|
|
46491
46501
|
});
|
|
46492
46502
|
const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
46493
|
-
const cwd = config2.cwd;
|
|
46494
|
-
const
|
|
46503
|
+
const cwd = config2.cwd ?? context.evalDir;
|
|
46504
|
+
const rawCommand = config2.command ?? config2.script ?? [];
|
|
46505
|
+
const commandArray = interpolateArgs(rawCommand, context);
|
|
46495
46506
|
const result = await execFileWithStdin(commandArray, stdin, {
|
|
46496
46507
|
timeoutMs,
|
|
46497
46508
|
cwd
|
|
@@ -46506,6 +46517,10 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
|
|
|
46506
46517
|
}
|
|
46507
46518
|
return result.stdout;
|
|
46508
46519
|
}
|
|
46520
|
+
var QUALITY_PASS_THRESHOLD = 0.8;
|
|
46521
|
+
function classifyQualityStatus(score) {
|
|
46522
|
+
return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
46523
|
+
}
|
|
46509
46524
|
function usesFileReferencePrompt(provider) {
|
|
46510
46525
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
46511
46526
|
}
|
|
@@ -46613,6 +46628,7 @@ async function runEvaluation(options) {
|
|
|
46613
46628
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
46614
46629
|
const typeRegistry = createBuiltinRegistry();
|
|
46615
46630
|
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
46631
|
+
const evalDir = discoveryBaseDir;
|
|
46616
46632
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
46617
46633
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
46618
46634
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -46708,7 +46724,8 @@ async function runEvaluation(options) {
|
|
|
46708
46724
|
const scriptContext = {
|
|
46709
46725
|
workspacePath: sharedWorkspacePath,
|
|
46710
46726
|
testId: "__before_all__",
|
|
46711
|
-
evalRunId
|
|
46727
|
+
evalRunId,
|
|
46728
|
+
evalDir
|
|
46712
46729
|
};
|
|
46713
46730
|
try {
|
|
46714
46731
|
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
@@ -46747,7 +46764,14 @@ async function runEvaluation(options) {
|
|
|
46747
46764
|
answer: "",
|
|
46748
46765
|
target: target.name,
|
|
46749
46766
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
46750
|
-
budgetExceeded: true
|
|
46767
|
+
budgetExceeded: true,
|
|
46768
|
+
executionStatus: "execution_error",
|
|
46769
|
+
failureStage: "setup",
|
|
46770
|
+
failureReasonCode: "budget_exceeded",
|
|
46771
|
+
executionError: {
|
|
46772
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
46773
|
+
stage: "setup"
|
|
46774
|
+
}
|
|
46751
46775
|
};
|
|
46752
46776
|
if (onProgress) {
|
|
46753
46777
|
await onProgress({
|
|
@@ -46794,7 +46818,8 @@ async function runEvaluation(options) {
|
|
|
46794
46818
|
suiteWorkspaceFile,
|
|
46795
46819
|
streamCallbacks,
|
|
46796
46820
|
typeRegistry,
|
|
46797
|
-
repoManager
|
|
46821
|
+
repoManager,
|
|
46822
|
+
evalDir
|
|
46798
46823
|
};
|
|
46799
46824
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
46800
46825
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -46863,7 +46888,9 @@ async function runEvaluation(options) {
|
|
|
46863
46888
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
46864
46889
|
outcome.reason,
|
|
46865
46890
|
promptInputs,
|
|
46866
|
-
primaryProvider
|
|
46891
|
+
primaryProvider,
|
|
46892
|
+
"agent",
|
|
46893
|
+
"provider_error"
|
|
46867
46894
|
);
|
|
46868
46895
|
results.push(errorResult);
|
|
46869
46896
|
if (onResult) {
|
|
@@ -46875,7 +46902,8 @@ async function runEvaluation(options) {
|
|
|
46875
46902
|
const scriptContext = {
|
|
46876
46903
|
workspacePath: sharedWorkspacePath,
|
|
46877
46904
|
testId: "__after_all__",
|
|
46878
|
-
evalRunId
|
|
46905
|
+
evalRunId,
|
|
46906
|
+
evalDir
|
|
46879
46907
|
};
|
|
46880
46908
|
try {
|
|
46881
46909
|
const afterAllOutput = await executeWorkspaceScript(
|
|
@@ -47005,7 +47033,14 @@ async function runBatchEvaluation(options) {
|
|
|
47005
47033
|
availableTargets
|
|
47006
47034
|
});
|
|
47007
47035
|
if (providerError) {
|
|
47008
|
-
result = {
|
|
47036
|
+
result = {
|
|
47037
|
+
...result,
|
|
47038
|
+
error: providerError,
|
|
47039
|
+
executionStatus: "execution_error",
|
|
47040
|
+
failureStage: "agent",
|
|
47041
|
+
failureReasonCode: "provider_error",
|
|
47042
|
+
executionError: { message: providerError, stage: "agent" }
|
|
47043
|
+
};
|
|
47009
47044
|
}
|
|
47010
47045
|
} catch (error40) {
|
|
47011
47046
|
const errorResult = buildErrorResult(
|
|
@@ -47014,7 +47049,9 @@ async function runBatchEvaluation(options) {
|
|
|
47014
47049
|
nowFn(),
|
|
47015
47050
|
error40,
|
|
47016
47051
|
promptInputs,
|
|
47017
|
-
provider
|
|
47052
|
+
provider,
|
|
47053
|
+
"evaluator",
|
|
47054
|
+
"evaluator_error"
|
|
47018
47055
|
);
|
|
47019
47056
|
results.push(errorResult);
|
|
47020
47057
|
if (onResult) {
|
|
@@ -47070,7 +47107,8 @@ async function runEvalCase(options) {
|
|
|
47070
47107
|
sharedBaselineCommit,
|
|
47071
47108
|
suiteWorkspaceFile,
|
|
47072
47109
|
typeRegistry: providedTypeRegistry,
|
|
47073
|
-
repoManager
|
|
47110
|
+
repoManager,
|
|
47111
|
+
evalDir
|
|
47074
47112
|
} = options;
|
|
47075
47113
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
47076
47114
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -47103,7 +47141,9 @@ async function runEvalCase(options) {
|
|
|
47103
47141
|
nowFn(),
|
|
47104
47142
|
new Error(`Failed to create workspace: ${message}`),
|
|
47105
47143
|
promptInputs,
|
|
47106
|
-
provider
|
|
47144
|
+
provider,
|
|
47145
|
+
"setup",
|
|
47146
|
+
"template_error"
|
|
47107
47147
|
);
|
|
47108
47148
|
}
|
|
47109
47149
|
}
|
|
@@ -47123,7 +47163,9 @@ async function runEvalCase(options) {
|
|
|
47123
47163
|
nowFn(),
|
|
47124
47164
|
new Error(`Failed to materialize repos: ${message}`),
|
|
47125
47165
|
promptInputs,
|
|
47126
|
-
provider
|
|
47166
|
+
provider,
|
|
47167
|
+
"repo_setup",
|
|
47168
|
+
"clone_error"
|
|
47127
47169
|
);
|
|
47128
47170
|
}
|
|
47129
47171
|
}
|
|
@@ -47133,7 +47175,8 @@ async function runEvalCase(options) {
|
|
|
47133
47175
|
testId: evalCase.id,
|
|
47134
47176
|
evalRunId: evalRunId ?? "",
|
|
47135
47177
|
caseInput: evalCase.question,
|
|
47136
|
-
caseMetadata: evalCase.metadata
|
|
47178
|
+
caseMetadata: evalCase.metadata,
|
|
47179
|
+
evalDir
|
|
47137
47180
|
};
|
|
47138
47181
|
try {
|
|
47139
47182
|
beforeAllOutput = await executeWorkspaceScript(
|
|
@@ -47152,7 +47195,9 @@ async function runEvalCase(options) {
|
|
|
47152
47195
|
nowFn(),
|
|
47153
47196
|
new Error(`before_all script failed: ${message}`),
|
|
47154
47197
|
promptInputs,
|
|
47155
|
-
provider
|
|
47198
|
+
provider,
|
|
47199
|
+
"setup",
|
|
47200
|
+
"script_error"
|
|
47156
47201
|
);
|
|
47157
47202
|
}
|
|
47158
47203
|
}
|
|
@@ -47163,7 +47208,8 @@ async function runEvalCase(options) {
|
|
|
47163
47208
|
testId: evalCase.id,
|
|
47164
47209
|
evalRunId: evalRunId ?? "",
|
|
47165
47210
|
caseInput: evalCase.question,
|
|
47166
|
-
caseMetadata: evalCase.metadata
|
|
47211
|
+
caseMetadata: evalCase.metadata,
|
|
47212
|
+
evalDir
|
|
47167
47213
|
};
|
|
47168
47214
|
try {
|
|
47169
47215
|
beforeEachOutput = await executeWorkspaceScript(
|
|
@@ -47178,7 +47224,9 @@ async function runEvalCase(options) {
|
|
|
47178
47224
|
nowFn(),
|
|
47179
47225
|
new Error(`before_each script failed: ${message}`),
|
|
47180
47226
|
promptInputs,
|
|
47181
|
-
provider
|
|
47227
|
+
provider,
|
|
47228
|
+
"setup",
|
|
47229
|
+
"script_error"
|
|
47182
47230
|
);
|
|
47183
47231
|
}
|
|
47184
47232
|
}
|
|
@@ -47219,7 +47267,9 @@ async function runEvalCase(options) {
|
|
|
47219
47267
|
nowFn(),
|
|
47220
47268
|
error40,
|
|
47221
47269
|
promptInputs,
|
|
47222
|
-
provider
|
|
47270
|
+
provider,
|
|
47271
|
+
"agent",
|
|
47272
|
+
"provider_error"
|
|
47223
47273
|
);
|
|
47224
47274
|
if (workspacePath) {
|
|
47225
47275
|
if (forceCleanup) {
|
|
@@ -47238,7 +47288,9 @@ async function runEvalCase(options) {
|
|
|
47238
47288
|
nowFn(),
|
|
47239
47289
|
lastError ?? new Error("Provider did not return a response"),
|
|
47240
47290
|
promptInputs,
|
|
47241
|
-
provider
|
|
47291
|
+
provider,
|
|
47292
|
+
"agent",
|
|
47293
|
+
"provider_error"
|
|
47242
47294
|
);
|
|
47243
47295
|
if (workspacePath) {
|
|
47244
47296
|
if (forceCleanup) {
|
|
@@ -47294,7 +47346,8 @@ async function runEvalCase(options) {
|
|
|
47294
47346
|
testId: evalCase.id,
|
|
47295
47347
|
evalRunId: evalRunId ?? "",
|
|
47296
47348
|
caseInput: evalCase.question,
|
|
47297
|
-
caseMetadata: evalCase.metadata
|
|
47349
|
+
caseMetadata: evalCase.metadata,
|
|
47350
|
+
evalDir
|
|
47298
47351
|
};
|
|
47299
47352
|
try {
|
|
47300
47353
|
afterEachOutput = await executeWorkspaceScript(
|
|
@@ -47330,7 +47383,18 @@ async function runEvalCase(options) {
|
|
|
47330
47383
|
fileChanges,
|
|
47331
47384
|
workspacePath
|
|
47332
47385
|
});
|
|
47333
|
-
const
|
|
47386
|
+
const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
|
|
47387
|
+
const finalResult = providerError ? {
|
|
47388
|
+
...result,
|
|
47389
|
+
error: providerError,
|
|
47390
|
+
executionStatus,
|
|
47391
|
+
failureStage: "agent",
|
|
47392
|
+
failureReasonCode: "provider_error",
|
|
47393
|
+
executionError: { message: providerError, stage: "agent" },
|
|
47394
|
+
beforeAllOutput,
|
|
47395
|
+
beforeEachOutput,
|
|
47396
|
+
afterEachOutput
|
|
47397
|
+
} : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
|
|
47334
47398
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
47335
47399
|
if (workspacePath && !isSharedWorkspace) {
|
|
47336
47400
|
if (forceCleanup) {
|
|
@@ -47351,7 +47415,9 @@ async function runEvalCase(options) {
|
|
|
47351
47415
|
nowFn(),
|
|
47352
47416
|
error40,
|
|
47353
47417
|
promptInputs,
|
|
47354
|
-
provider
|
|
47418
|
+
provider,
|
|
47419
|
+
"evaluator",
|
|
47420
|
+
"evaluator_error"
|
|
47355
47421
|
);
|
|
47356
47422
|
if (workspacePath && !isSharedWorkspace) {
|
|
47357
47423
|
if (forceCleanup) {
|
|
@@ -47389,7 +47455,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
47389
47455
|
verdict: trialVerdict,
|
|
47390
47456
|
scores: result.scores,
|
|
47391
47457
|
error: result.error,
|
|
47392
|
-
costUsd: trialCost
|
|
47458
|
+
costUsd: trialCost,
|
|
47459
|
+
executionStatus: result.executionStatus,
|
|
47460
|
+
failureStage: result.failureStage,
|
|
47461
|
+
failureReasonCode: result.failureReasonCode
|
|
47393
47462
|
};
|
|
47394
47463
|
trialResults.push(trial);
|
|
47395
47464
|
if (trialCost !== void 0) {
|
|
@@ -47414,12 +47483,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
47414
47483
|
0
|
|
47415
47484
|
);
|
|
47416
47485
|
const baseResult = allResults[bestTrialIndex];
|
|
47486
|
+
const hasOk = trialResults.some((t) => t.executionStatus === "ok");
|
|
47487
|
+
const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
|
|
47488
|
+
const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
|
|
47489
|
+
const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
|
|
47490
|
+
const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
|
|
47491
|
+
const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
|
|
47417
47492
|
return {
|
|
47418
47493
|
...baseResult,
|
|
47419
47494
|
score,
|
|
47420
47495
|
trials: trialResults,
|
|
47421
47496
|
aggregation,
|
|
47422
|
-
costLimited: costLimited || void 0
|
|
47497
|
+
costLimited: costLimited || void 0,
|
|
47498
|
+
executionStatus: aggregateExecutionStatus,
|
|
47499
|
+
failureStage: aggregateFailureStage,
|
|
47500
|
+
failureReasonCode: aggregateFailureReasonCode,
|
|
47501
|
+
executionError: aggregateExecutionError
|
|
47423
47502
|
};
|
|
47424
47503
|
}
|
|
47425
47504
|
async function evaluateCandidate(options) {
|
|
@@ -47520,7 +47599,8 @@ async function evaluateCandidate(options) {
|
|
|
47520
47599
|
scores,
|
|
47521
47600
|
trace: trace2,
|
|
47522
47601
|
output,
|
|
47523
|
-
fileChanges
|
|
47602
|
+
fileChanges,
|
|
47603
|
+
executionStatus: classifyQualityStatus(score.score)
|
|
47524
47604
|
};
|
|
47525
47605
|
}
|
|
47526
47606
|
async function runEvaluatorsForCase(options) {
|
|
@@ -47825,7 +47905,7 @@ async function invokeProvider(provider, options) {
|
|
|
47825
47905
|
}
|
|
47826
47906
|
}
|
|
47827
47907
|
}
|
|
47828
|
-
function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider) {
|
|
47908
|
+
function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider, failureStage, failureReasonCode) {
|
|
47829
47909
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
47830
47910
|
let agentRequest;
|
|
47831
47911
|
let lmRequest;
|
|
@@ -47868,7 +47948,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs
|
|
|
47868
47948
|
target: targetName,
|
|
47869
47949
|
requests,
|
|
47870
47950
|
input,
|
|
47871
|
-
error: message
|
|
47951
|
+
error: message,
|
|
47952
|
+
executionStatus: "execution_error",
|
|
47953
|
+
failureStage,
|
|
47954
|
+
failureReasonCode,
|
|
47955
|
+
executionError: { message, stage: failureStage }
|
|
47872
47956
|
};
|
|
47873
47957
|
}
|
|
47874
47958
|
function extractProviderError(response) {
|
|
@@ -48866,4 +48950,4 @@ export {
|
|
|
48866
48950
|
OtelStreamingObserver,
|
|
48867
48951
|
createAgentKernel
|
|
48868
48952
|
};
|
|
48869
|
-
//# sourceMappingURL=chunk-KWUTY5XR.js.map
|
|
48953
|
+
//# sourceMappingURL=chunk-LUHCYBMD.js.map
|