agentv 2.11.4 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
148
148
  }
149
149
  });
150
150
 
151
- // ../../packages/core/dist/chunk-REN5PS7B.js
151
+ // ../../packages/core/dist/chunk-7HPKTRFZ.js
152
152
  import { constants } from "node:fs";
153
153
  import { access, readFile } from "node:fs/promises";
154
154
  import path from "node:path";
@@ -4195,7 +4195,7 @@ var coerce = {
4195
4195
  };
4196
4196
  var NEVER = INVALID;
4197
4197
 
4198
- // ../../packages/core/dist/chunk-REN5PS7B.js
4198
+ // ../../packages/core/dist/chunk-7HPKTRFZ.js
4199
4199
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
4200
4200
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
4201
4201
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -46481,6 +46481,16 @@ async function resolveWorkspaceTemplate(templatePath) {
46481
46481
  }
46482
46482
  return { dir: resolved };
46483
46483
  }
46484
/**
 * Substitutes `{{name}}` placeholders in each command argument with values
 * taken from the script context.
 *
 * Supported placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` (empty string when nullish) and `case_metadata`
 * (JSON-encoded, or empty string when falsy).
 *
 * A placeholder is left verbatim when its name is unknown or its value is
 * null/undefined.
 *
 * @param {string[]} args - Command arguments that may contain placeholders.
 * @param {object} context - Script context; reads `workspacePath`, `testId`,
 *   `evalRunId`, `caseInput` and `caseMetadata`.
 * @returns {string[]} A new array with placeholders substituted.
 */
function interpolateArgs(args, context) {
  // A Map is used instead of an object literal so that placeholder names such
  // as `constructor`, `toString` or `__proto__` are not resolved through
  // Object.prototype and substituted with stringified built-ins.
  const vars = new Map([
    ["workspace_path", context.workspacePath],
    ["test_id", context.testId],
    ["eval_run_id", context.evalRunId],
    ["case_input", context.caseInput ?? ""],
    ["case_metadata", context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""]
  ]);
  // A replacer function is used so `$` sequences inside substituted values
  // are never interpreted as replacement patterns.
  const substitute = (match, name) => vars.get(name) ?? match;
  return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, substitute));
}
46484
46494
  async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46485
46495
  const stdin = JSON.stringify({
46486
46496
  workspace_path: context.workspacePath,
@@ -46490,8 +46500,9 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46490
46500
  case_metadata: context.caseMetadata ?? null
46491
46501
  });
46492
46502
  const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
46493
- const cwd = config2.cwd;
46494
- const commandArray = config2.command ?? config2.script ?? [];
46503
+ const cwd = config2.cwd ?? context.evalDir;
46504
+ const rawCommand = config2.command ?? config2.script ?? [];
46505
+ const commandArray = interpolateArgs(rawCommand, context);
46495
46506
  const result = await execFileWithStdin(commandArray, stdin, {
46496
46507
  timeoutMs,
46497
46508
  cwd
@@ -46506,6 +46517,10 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46506
46517
  }
46507
46518
  return result.stdout;
46508
46519
  }
46520
// Minimum score (inclusive) at which a result is classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;

/**
 * Maps a numeric score to an execution status label.
 *
 * @param {number} score - Score to classify.
 * @returns {"ok" | "quality_failure"} "ok" when the score is at or above
 *   QUALITY_PASS_THRESHOLD, otherwise "quality_failure" (this includes NaN
 *   and undefined, for which the comparison is false).
 */
function classifyQualityStatus(score) {
  if (score >= QUALITY_PASS_THRESHOLD) {
    return "ok";
  }
  return "quality_failure";
}
46509
46524
  function usesFileReferencePrompt(provider) {
46510
46525
  return isAgentProvider(provider) || provider.kind === "cli";
46511
46526
  }
@@ -46613,6 +46628,7 @@ async function runEvaluation(options) {
46613
46628
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
46614
46629
  const typeRegistry = createBuiltinRegistry();
46615
46630
  const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
46631
+ const evalDir = discoveryBaseDir;
46616
46632
  await discoverAssertions(typeRegistry, discoveryBaseDir);
46617
46633
  const providerRegistry = createBuiltinProviderRegistry();
46618
46634
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -46708,7 +46724,8 @@ async function runEvaluation(options) {
46708
46724
  const scriptContext = {
46709
46725
  workspacePath: sharedWorkspacePath,
46710
46726
  testId: "__before_all__",
46711
- evalRunId
46727
+ evalRunId,
46728
+ evalDir
46712
46729
  };
46713
46730
  try {
46714
46731
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -46747,7 +46764,14 @@ async function runEvaluation(options) {
46747
46764
  answer: "",
46748
46765
  target: target.name,
46749
46766
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
46750
- budgetExceeded: true
46767
+ budgetExceeded: true,
46768
+ executionStatus: "execution_error",
46769
+ failureStage: "setup",
46770
+ failureReasonCode: "budget_exceeded",
46771
+ executionError: {
46772
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
46773
+ stage: "setup"
46774
+ }
46751
46775
  };
46752
46776
  if (onProgress) {
46753
46777
  await onProgress({
@@ -46794,7 +46818,8 @@ async function runEvaluation(options) {
46794
46818
  suiteWorkspaceFile,
46795
46819
  streamCallbacks,
46796
46820
  typeRegistry,
46797
- repoManager
46821
+ repoManager,
46822
+ evalDir
46798
46823
  };
46799
46824
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
46800
46825
  if (totalBudgetUsd !== void 0) {
@@ -46863,7 +46888,9 @@ async function runEvaluation(options) {
46863
46888
  (now ?? (() => /* @__PURE__ */ new Date()))(),
46864
46889
  outcome.reason,
46865
46890
  promptInputs,
46866
- primaryProvider
46891
+ primaryProvider,
46892
+ "agent",
46893
+ "provider_error"
46867
46894
  );
46868
46895
  results.push(errorResult);
46869
46896
  if (onResult) {
@@ -46875,7 +46902,8 @@ async function runEvaluation(options) {
46875
46902
  const scriptContext = {
46876
46903
  workspacePath: sharedWorkspacePath,
46877
46904
  testId: "__after_all__",
46878
- evalRunId
46905
+ evalRunId,
46906
+ evalDir
46879
46907
  };
46880
46908
  try {
46881
46909
  const afterAllOutput = await executeWorkspaceScript(
@@ -47005,7 +47033,14 @@ async function runBatchEvaluation(options) {
47005
47033
  availableTargets
47006
47034
  });
47007
47035
  if (providerError) {
47008
- result = { ...result, error: providerError };
47036
+ result = {
47037
+ ...result,
47038
+ error: providerError,
47039
+ executionStatus: "execution_error",
47040
+ failureStage: "agent",
47041
+ failureReasonCode: "provider_error",
47042
+ executionError: { message: providerError, stage: "agent" }
47043
+ };
47009
47044
  }
47010
47045
  } catch (error40) {
47011
47046
  const errorResult = buildErrorResult(
@@ -47014,7 +47049,9 @@ async function runBatchEvaluation(options) {
47014
47049
  nowFn(),
47015
47050
  error40,
47016
47051
  promptInputs,
47017
- provider
47052
+ provider,
47053
+ "evaluator",
47054
+ "evaluator_error"
47018
47055
  );
47019
47056
  results.push(errorResult);
47020
47057
  if (onResult) {
@@ -47070,7 +47107,8 @@ async function runEvalCase(options) {
47070
47107
  sharedBaselineCommit,
47071
47108
  suiteWorkspaceFile,
47072
47109
  typeRegistry: providedTypeRegistry,
47073
- repoManager
47110
+ repoManager,
47111
+ evalDir
47074
47112
  } = options;
47075
47113
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
47076
47114
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -47103,7 +47141,9 @@ async function runEvalCase(options) {
47103
47141
  nowFn(),
47104
47142
  new Error(`Failed to create workspace: ${message}`),
47105
47143
  promptInputs,
47106
- provider
47144
+ provider,
47145
+ "setup",
47146
+ "template_error"
47107
47147
  );
47108
47148
  }
47109
47149
  }
@@ -47123,7 +47163,9 @@ async function runEvalCase(options) {
47123
47163
  nowFn(),
47124
47164
  new Error(`Failed to materialize repos: ${message}`),
47125
47165
  promptInputs,
47126
- provider
47166
+ provider,
47167
+ "repo_setup",
47168
+ "clone_error"
47127
47169
  );
47128
47170
  }
47129
47171
  }
@@ -47133,7 +47175,8 @@ async function runEvalCase(options) {
47133
47175
  testId: evalCase.id,
47134
47176
  evalRunId: evalRunId ?? "",
47135
47177
  caseInput: evalCase.question,
47136
- caseMetadata: evalCase.metadata
47178
+ caseMetadata: evalCase.metadata,
47179
+ evalDir
47137
47180
  };
47138
47181
  try {
47139
47182
  beforeAllOutput = await executeWorkspaceScript(
@@ -47152,7 +47195,9 @@ async function runEvalCase(options) {
47152
47195
  nowFn(),
47153
47196
  new Error(`before_all script failed: ${message}`),
47154
47197
  promptInputs,
47155
- provider
47198
+ provider,
47199
+ "setup",
47200
+ "script_error"
47156
47201
  );
47157
47202
  }
47158
47203
  }
@@ -47163,7 +47208,8 @@ async function runEvalCase(options) {
47163
47208
  testId: evalCase.id,
47164
47209
  evalRunId: evalRunId ?? "",
47165
47210
  caseInput: evalCase.question,
47166
- caseMetadata: evalCase.metadata
47211
+ caseMetadata: evalCase.metadata,
47212
+ evalDir
47167
47213
  };
47168
47214
  try {
47169
47215
  beforeEachOutput = await executeWorkspaceScript(
@@ -47178,7 +47224,9 @@ async function runEvalCase(options) {
47178
47224
  nowFn(),
47179
47225
  new Error(`before_each script failed: ${message}`),
47180
47226
  promptInputs,
47181
- provider
47227
+ provider,
47228
+ "setup",
47229
+ "script_error"
47182
47230
  );
47183
47231
  }
47184
47232
  }
@@ -47219,7 +47267,9 @@ async function runEvalCase(options) {
47219
47267
  nowFn(),
47220
47268
  error40,
47221
47269
  promptInputs,
47222
- provider
47270
+ provider,
47271
+ "agent",
47272
+ "provider_error"
47223
47273
  );
47224
47274
  if (workspacePath) {
47225
47275
  if (forceCleanup) {
@@ -47238,7 +47288,9 @@ async function runEvalCase(options) {
47238
47288
  nowFn(),
47239
47289
  lastError ?? new Error("Provider did not return a response"),
47240
47290
  promptInputs,
47241
- provider
47291
+ provider,
47292
+ "agent",
47293
+ "provider_error"
47242
47294
  );
47243
47295
  if (workspacePath) {
47244
47296
  if (forceCleanup) {
@@ -47294,7 +47346,8 @@ async function runEvalCase(options) {
47294
47346
  testId: evalCase.id,
47295
47347
  evalRunId: evalRunId ?? "",
47296
47348
  caseInput: evalCase.question,
47297
- caseMetadata: evalCase.metadata
47349
+ caseMetadata: evalCase.metadata,
47350
+ evalDir
47298
47351
  };
47299
47352
  try {
47300
47353
  afterEachOutput = await executeWorkspaceScript(
@@ -47330,7 +47383,18 @@ async function runEvalCase(options) {
47330
47383
  fileChanges,
47331
47384
  workspacePath
47332
47385
  });
47333
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
47386
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
47387
+ const finalResult = providerError ? {
47388
+ ...result,
47389
+ error: providerError,
47390
+ executionStatus,
47391
+ failureStage: "agent",
47392
+ failureReasonCode: "provider_error",
47393
+ executionError: { message: providerError, stage: "agent" },
47394
+ beforeAllOutput,
47395
+ beforeEachOutput,
47396
+ afterEachOutput
47397
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
47334
47398
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
47335
47399
  if (workspacePath && !isSharedWorkspace) {
47336
47400
  if (forceCleanup) {
@@ -47351,7 +47415,9 @@ async function runEvalCase(options) {
47351
47415
  nowFn(),
47352
47416
  error40,
47353
47417
  promptInputs,
47354
- provider
47418
+ provider,
47419
+ "evaluator",
47420
+ "evaluator_error"
47355
47421
  );
47356
47422
  if (workspacePath && !isSharedWorkspace) {
47357
47423
  if (forceCleanup) {
@@ -47389,7 +47455,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
47389
47455
  verdict: trialVerdict,
47390
47456
  scores: result.scores,
47391
47457
  error: result.error,
47392
- costUsd: trialCost
47458
+ costUsd: trialCost,
47459
+ executionStatus: result.executionStatus,
47460
+ failureStage: result.failureStage,
47461
+ failureReasonCode: result.failureReasonCode
47393
47462
  };
47394
47463
  trialResults.push(trial);
47395
47464
  if (trialCost !== void 0) {
@@ -47414,12 +47483,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
47414
47483
  0
47415
47484
  );
47416
47485
  const baseResult = allResults[bestTrialIndex];
47486
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
47487
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
47488
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
47489
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
47490
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
47491
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
47417
47492
  return {
47418
47493
  ...baseResult,
47419
47494
  score,
47420
47495
  trials: trialResults,
47421
47496
  aggregation,
47422
- costLimited: costLimited || void 0
47497
+ costLimited: costLimited || void 0,
47498
+ executionStatus: aggregateExecutionStatus,
47499
+ failureStage: aggregateFailureStage,
47500
+ failureReasonCode: aggregateFailureReasonCode,
47501
+ executionError: aggregateExecutionError
47423
47502
  };
47424
47503
  }
47425
47504
  async function evaluateCandidate(options) {
@@ -47520,7 +47599,8 @@ async function evaluateCandidate(options) {
47520
47599
  scores,
47521
47600
  trace: trace2,
47522
47601
  output,
47523
- fileChanges
47602
+ fileChanges,
47603
+ executionStatus: classifyQualityStatus(score.score)
47524
47604
  };
47525
47605
  }
47526
47606
  async function runEvaluatorsForCase(options) {
@@ -47825,7 +47905,7 @@ async function invokeProvider(provider, options) {
47825
47905
  }
47826
47906
  }
47827
47907
  }
47828
- function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider) {
47908
+ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider, failureStage, failureReasonCode) {
47829
47909
  const message = error40 instanceof Error ? error40.message : String(error40);
47830
47910
  let agentRequest;
47831
47911
  let lmRequest;
@@ -47868,7 +47948,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs
47868
47948
  target: targetName,
47869
47949
  requests,
47870
47950
  input,
47871
- error: message
47951
+ error: message,
47952
+ executionStatus: "execution_error",
47953
+ failureStage,
47954
+ failureReasonCode,
47955
+ executionError: { message, stage: failureStage }
47872
47956
  };
47873
47957
  }
47874
47958
  function extractProviderError(response) {
@@ -48866,4 +48950,4 @@ export {
48866
48950
  OtelStreamingObserver,
48867
48951
  createAgentKernel
48868
48952
  };
48869
- //# sourceMappingURL=chunk-KWUTY5XR.js.map
48953
+ //# sourceMappingURL=chunk-LUHCYBMD.js.map