agentv 2.11.4 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
148
148
  }
149
149
  });
150
150
 
151
- // ../../packages/core/dist/chunk-REN5PS7B.js
151
+ // ../../packages/core/dist/chunk-JHER2LQ5.js
152
152
  import { constants } from "node:fs";
153
153
  import { access, readFile } from "node:fs/promises";
154
154
  import path from "node:path";
@@ -4195,7 +4195,7 @@ var coerce = {
4195
4195
  };
4196
4196
  var NEVER = INVALID;
4197
4197
 
4198
- // ../../packages/core/dist/chunk-REN5PS7B.js
4198
+ // ../../packages/core/dist/chunk-JHER2LQ5.js
4199
4199
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
4200
4200
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
4201
4201
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -34331,6 +34331,11 @@ async function loadConfig(evalFilePath, repoRoot) {
34331
34331
  continue;
34332
34332
  }
34333
34333
  const config2 = parsed;
34334
+ const requiredVersion = parsed.required_version;
34335
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
34336
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
34337
+ continue;
34338
+ }
34334
34339
  const guidelinePatterns = config2.guideline_patterns;
34335
34340
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
34336
34341
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -34354,6 +34359,7 @@ async function loadConfig(evalFilePath, repoRoot) {
34354
34359
  configPath
34355
34360
  );
34356
34361
  return {
34362
+ required_version: requiredVersion,
34357
34363
  guideline_patterns: guidelinePatterns,
34358
34364
  eval_patterns: evalPatterns,
34359
34365
  execution: executionDefaults
@@ -34497,6 +34503,22 @@ function extractTotalBudgetUsd(suite) {
34497
34503
  );
34498
34504
  return void 0;
34499
34505
  }
34506
/**
 * Reads the suite-level `execution.fail_on_error` flag (the camelCase
 * spelling `failOnError` is accepted as a fallback).
 *
 * @param {object} suite - Parsed eval suite document.
 * @returns {boolean | undefined} The flag when it is a boolean; `undefined`
 *   when there is no usable `execution` object, when the flag is absent or
 *   null, or when the flag has a non-boolean value (a warning is logged in
 *   that last case).
 */
function extractFailOnError(suite) {
  const { execution: execSection } = suite;
  const isPlainObject = Boolean(execSection) && typeof execSection === "object" && !Array.isArray(execSection);
  if (!isPlainObject) {
    return undefined;
  }
  // The snake_case key wins; the camelCase key is only consulted when the former is null/undefined.
  const flag = execSection.fail_on_error ?? execSection.failOnError;
  if (flag == null) {
    return undefined;
  }
  if (typeof flag !== "boolean") {
    logWarning(`Invalid execution.fail_on_error: ${flag}. Must be true or false. Ignoring.`);
    return undefined;
  }
  return flag;
}
34500
34522
  function parseExecutionDefaults(raw, configPath) {
34501
34523
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
34502
34524
  return void 0;
@@ -36653,13 +36675,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
36653
36675
  }
36654
36676
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
36655
36677
  const metadata = parseMetadata(parsed);
36678
+ const failOnError = extractFailOnError(parsed);
36656
36679
  return {
36657
36680
  tests,
36658
36681
  trials: extractTrialsConfig(parsed),
36659
36682
  targets: extractTargetsFromSuite(parsed),
36660
36683
  cacheConfig: extractCacheConfig(parsed),
36661
36684
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
36662
- ...metadata !== void 0 && { metadata }
36685
+ ...metadata !== void 0 && { metadata },
36686
+ ...failOnError !== void 0 && { failOnError }
36663
36687
  };
36664
36688
  }
36665
36689
  var loadEvalSuite = loadTestSuite;
@@ -46481,6 +46505,16 @@ async function resolveWorkspaceTemplate(templatePath) {
46481
46505
  }
46482
46506
  return { dir: resolved };
46483
46507
  }
46508
/**
 * Expands `{{name}}` placeholders in every command argument using values
 * taken from the script context.
 *
 * Supported placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` (empty string when absent) and `case_metadata` (JSON-encoded,
 * empty string when absent). A placeholder that is not supported, or whose
 * context value is null/undefined, is left in the argument verbatim.
 *
 * @param {string[]} args - Command/argument array to expand.
 * @param {object} context - Script context providing the substitution values.
 * @returns {string[]} A new array with placeholders substituted.
 */
function interpolateArgs(args, context) {
  // Null-prototype table: names such as {{constructor}}, {{toString}} or
  // {{__proto__}} must not resolve to members inherited from Object.prototype.
  const vars = Object.assign(Object.create(null), {
    workspace_path: context.workspacePath,
    test_id: context.testId,
    eval_run_id: context.evalRunId,
    case_input: context.caseInput ?? "",
    case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
  });
  return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
}
46484
46518
  async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46485
46519
  const stdin = JSON.stringify({
46486
46520
  workspace_path: context.workspacePath,
@@ -46490,8 +46524,9 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46490
46524
  case_metadata: context.caseMetadata ?? null
46491
46525
  });
46492
46526
  const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
46493
- const cwd = config2.cwd;
46494
- const commandArray = config2.command ?? config2.script ?? [];
46527
+ const cwd = config2.cwd ?? context.evalDir;
46528
+ const rawCommand = config2.command ?? config2.script ?? [];
46529
+ const commandArray = interpolateArgs(rawCommand, context);
46495
46530
  const result = await execFileWithStdin(commandArray, stdin, {
46496
46531
  timeoutMs,
46497
46532
  cwd
@@ -46506,6 +46541,10 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46506
46541
  }
46507
46542
  return result.stdout;
46508
46543
  }
46544
// Default minimum score (inclusive) for a result to be classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;
/**
 * Maps a numeric score onto an execution status.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=QUALITY_PASS_THRESHOLD] - Inclusive pass
 *   threshold; omit it to use the default.
 * @returns {"ok" | "quality_failure"} "ok" when `score >= threshold`,
 *   otherwise "quality_failure" (this includes a NaN score, for which the
 *   comparison is false).
 */
function classifyQualityStatus(score, threshold = QUALITY_PASS_THRESHOLD) {
  return score >= threshold ? "ok" : "quality_failure";
}
46509
46548
  function usesFileReferencePrompt(provider) {
46510
46549
  return isAgentProvider(provider) || provider.kind === "cli";
46511
46550
  }
@@ -46538,7 +46577,8 @@ async function runEvaluation(options) {
46538
46577
  cleanupWorkspaces,
46539
46578
  trials,
46540
46579
  streamCallbacks,
46541
- totalBudgetUsd
46580
+ totalBudgetUsd,
46581
+ failOnError
46542
46582
  } = options;
46543
46583
  let useCache = options.useCache;
46544
46584
  if (trials && trials.count > 1 && useCache) {
@@ -46613,6 +46653,7 @@ async function runEvaluation(options) {
46613
46653
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
46614
46654
  const typeRegistry = createBuiltinRegistry();
46615
46655
  const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
46656
+ const evalDir = discoveryBaseDir;
46616
46657
  await discoverAssertions(typeRegistry, discoveryBaseDir);
46617
46658
  const providerRegistry = createBuiltinProviderRegistry();
46618
46659
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -46708,7 +46749,8 @@ async function runEvaluation(options) {
46708
46749
  const scriptContext = {
46709
46750
  workspacePath: sharedWorkspacePath,
46710
46751
  testId: "__before_all__",
46711
- evalRunId
46752
+ evalRunId,
46753
+ evalDir
46712
46754
  };
46713
46755
  try {
46714
46756
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -46732,6 +46774,7 @@ async function runEvaluation(options) {
46732
46774
  let beforeAllOutputAttached = false;
46733
46775
  let cumulativeBudgetCost = 0;
46734
46776
  let budgetExhausted = false;
46777
+ let failOnErrorTriggered = false;
46735
46778
  const promises = filteredEvalCases.map(
46736
46779
  (evalCase) => limit(async () => {
46737
46780
  const workerId = nextWorkerId++;
@@ -46747,7 +46790,14 @@ async function runEvaluation(options) {
46747
46790
  answer: "",
46748
46791
  target: target.name,
46749
46792
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
46750
- budgetExceeded: true
46793
+ budgetExceeded: true,
46794
+ executionStatus: "execution_error",
46795
+ failureStage: "setup",
46796
+ failureReasonCode: "budget_exceeded",
46797
+ executionError: {
46798
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
46799
+ stage: "setup"
46800
+ }
46751
46801
  };
46752
46802
  if (onProgress) {
46753
46803
  await onProgress({
@@ -46763,6 +46813,37 @@ async function runEvaluation(options) {
46763
46813
  }
46764
46814
  return budgetResult;
46765
46815
  }
46816
+ if (failOnError === true && failOnErrorTriggered) {
46817
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
46818
+ const haltResult = {
46819
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
46820
+ testId: evalCase.id,
46821
+ dataset: evalCase.dataset,
46822
+ score: 0,
46823
+ hits: [],
46824
+ misses: [],
46825
+ answer: "",
46826
+ target: target.name,
46827
+ error: errorMsg,
46828
+ executionStatus: "execution_error",
46829
+ failureStage: "setup",
46830
+ failureReasonCode: "error_threshold_exceeded",
46831
+ executionError: { message: errorMsg, stage: "setup" }
46832
+ };
46833
+ if (onProgress) {
46834
+ await onProgress({
46835
+ workerId,
46836
+ testId: evalCase.id,
46837
+ status: "failed",
46838
+ completedAt: Date.now(),
46839
+ error: haltResult.error
46840
+ });
46841
+ }
46842
+ if (onResult) {
46843
+ await onResult(haltResult);
46844
+ }
46845
+ return haltResult;
46846
+ }
46766
46847
  if (onProgress) {
46767
46848
  await onProgress({
46768
46849
  workerId,
@@ -46794,7 +46875,8 @@ async function runEvaluation(options) {
46794
46875
  suiteWorkspaceFile,
46795
46876
  streamCallbacks,
46796
46877
  typeRegistry,
46797
- repoManager
46878
+ repoManager,
46879
+ evalDir
46798
46880
  };
46799
46881
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
46800
46882
  if (totalBudgetUsd !== void 0) {
@@ -46814,6 +46896,9 @@ async function runEvaluation(options) {
46814
46896
  }
46815
46897
  }
46816
46898
  }
46899
+ if (failOnError === true && result.executionStatus === "execution_error") {
46900
+ failOnErrorTriggered = true;
46901
+ }
46817
46902
  if (beforeAllOutput && !beforeAllOutputAttached) {
46818
46903
  result = { ...result, beforeAllOutput };
46819
46904
  beforeAllOutputAttached = true;
@@ -46863,7 +46948,9 @@ async function runEvaluation(options) {
46863
46948
  (now ?? (() => /* @__PURE__ */ new Date()))(),
46864
46949
  outcome.reason,
46865
46950
  promptInputs,
46866
- primaryProvider
46951
+ primaryProvider,
46952
+ "agent",
46953
+ "provider_error"
46867
46954
  );
46868
46955
  results.push(errorResult);
46869
46956
  if (onResult) {
@@ -46875,7 +46962,8 @@ async function runEvaluation(options) {
46875
46962
  const scriptContext = {
46876
46963
  workspacePath: sharedWorkspacePath,
46877
46964
  testId: "__after_all__",
46878
- evalRunId
46965
+ evalRunId,
46966
+ evalDir
46879
46967
  };
46880
46968
  try {
46881
46969
  const afterAllOutput = await executeWorkspaceScript(
@@ -47005,7 +47093,14 @@ async function runBatchEvaluation(options) {
47005
47093
  availableTargets
47006
47094
  });
47007
47095
  if (providerError) {
47008
- result = { ...result, error: providerError };
47096
+ result = {
47097
+ ...result,
47098
+ error: providerError,
47099
+ executionStatus: "execution_error",
47100
+ failureStage: "agent",
47101
+ failureReasonCode: "provider_error",
47102
+ executionError: { message: providerError, stage: "agent" }
47103
+ };
47009
47104
  }
47010
47105
  } catch (error40) {
47011
47106
  const errorResult = buildErrorResult(
@@ -47014,7 +47109,9 @@ async function runBatchEvaluation(options) {
47014
47109
  nowFn(),
47015
47110
  error40,
47016
47111
  promptInputs,
47017
- provider
47112
+ provider,
47113
+ "evaluator",
47114
+ "evaluator_error"
47018
47115
  );
47019
47116
  results.push(errorResult);
47020
47117
  if (onResult) {
@@ -47070,7 +47167,8 @@ async function runEvalCase(options) {
47070
47167
  sharedBaselineCommit,
47071
47168
  suiteWorkspaceFile,
47072
47169
  typeRegistry: providedTypeRegistry,
47073
- repoManager
47170
+ repoManager,
47171
+ evalDir
47074
47172
  } = options;
47075
47173
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
47076
47174
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -47103,7 +47201,9 @@ async function runEvalCase(options) {
47103
47201
  nowFn(),
47104
47202
  new Error(`Failed to create workspace: ${message}`),
47105
47203
  promptInputs,
47106
- provider
47204
+ provider,
47205
+ "setup",
47206
+ "template_error"
47107
47207
  );
47108
47208
  }
47109
47209
  }
@@ -47123,7 +47223,9 @@ async function runEvalCase(options) {
47123
47223
  nowFn(),
47124
47224
  new Error(`Failed to materialize repos: ${message}`),
47125
47225
  promptInputs,
47126
- provider
47226
+ provider,
47227
+ "repo_setup",
47228
+ "clone_error"
47127
47229
  );
47128
47230
  }
47129
47231
  }
@@ -47133,7 +47235,8 @@ async function runEvalCase(options) {
47133
47235
  testId: evalCase.id,
47134
47236
  evalRunId: evalRunId ?? "",
47135
47237
  caseInput: evalCase.question,
47136
- caseMetadata: evalCase.metadata
47238
+ caseMetadata: evalCase.metadata,
47239
+ evalDir
47137
47240
  };
47138
47241
  try {
47139
47242
  beforeAllOutput = await executeWorkspaceScript(
@@ -47152,7 +47255,9 @@ async function runEvalCase(options) {
47152
47255
  nowFn(),
47153
47256
  new Error(`before_all script failed: ${message}`),
47154
47257
  promptInputs,
47155
- provider
47258
+ provider,
47259
+ "setup",
47260
+ "script_error"
47156
47261
  );
47157
47262
  }
47158
47263
  }
@@ -47163,7 +47268,8 @@ async function runEvalCase(options) {
47163
47268
  testId: evalCase.id,
47164
47269
  evalRunId: evalRunId ?? "",
47165
47270
  caseInput: evalCase.question,
47166
- caseMetadata: evalCase.metadata
47271
+ caseMetadata: evalCase.metadata,
47272
+ evalDir
47167
47273
  };
47168
47274
  try {
47169
47275
  beforeEachOutput = await executeWorkspaceScript(
@@ -47178,7 +47284,9 @@ async function runEvalCase(options) {
47178
47284
  nowFn(),
47179
47285
  new Error(`before_each script failed: ${message}`),
47180
47286
  promptInputs,
47181
- provider
47287
+ provider,
47288
+ "setup",
47289
+ "script_error"
47182
47290
  );
47183
47291
  }
47184
47292
  }
@@ -47219,7 +47327,9 @@ async function runEvalCase(options) {
47219
47327
  nowFn(),
47220
47328
  error40,
47221
47329
  promptInputs,
47222
- provider
47330
+ provider,
47331
+ "agent",
47332
+ "provider_error"
47223
47333
  );
47224
47334
  if (workspacePath) {
47225
47335
  if (forceCleanup) {
@@ -47238,7 +47348,9 @@ async function runEvalCase(options) {
47238
47348
  nowFn(),
47239
47349
  lastError ?? new Error("Provider did not return a response"),
47240
47350
  promptInputs,
47241
- provider
47351
+ provider,
47352
+ "agent",
47353
+ "provider_error"
47242
47354
  );
47243
47355
  if (workspacePath) {
47244
47356
  if (forceCleanup) {
@@ -47294,7 +47406,8 @@ async function runEvalCase(options) {
47294
47406
  testId: evalCase.id,
47295
47407
  evalRunId: evalRunId ?? "",
47296
47408
  caseInput: evalCase.question,
47297
- caseMetadata: evalCase.metadata
47409
+ caseMetadata: evalCase.metadata,
47410
+ evalDir
47298
47411
  };
47299
47412
  try {
47300
47413
  afterEachOutput = await executeWorkspaceScript(
@@ -47330,7 +47443,18 @@ async function runEvalCase(options) {
47330
47443
  fileChanges,
47331
47444
  workspacePath
47332
47445
  });
47333
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
47446
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
47447
+ const finalResult = providerError ? {
47448
+ ...result,
47449
+ error: providerError,
47450
+ executionStatus,
47451
+ failureStage: "agent",
47452
+ failureReasonCode: "provider_error",
47453
+ executionError: { message: providerError, stage: "agent" },
47454
+ beforeAllOutput,
47455
+ beforeEachOutput,
47456
+ afterEachOutput
47457
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
47334
47458
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
47335
47459
  if (workspacePath && !isSharedWorkspace) {
47336
47460
  if (forceCleanup) {
@@ -47351,7 +47475,9 @@ async function runEvalCase(options) {
47351
47475
  nowFn(),
47352
47476
  error40,
47353
47477
  promptInputs,
47354
- provider
47478
+ provider,
47479
+ "evaluator",
47480
+ "evaluator_error"
47355
47481
  );
47356
47482
  if (workspacePath && !isSharedWorkspace) {
47357
47483
  if (forceCleanup) {
@@ -47389,7 +47515,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
47389
47515
  verdict: trialVerdict,
47390
47516
  scores: result.scores,
47391
47517
  error: result.error,
47392
- costUsd: trialCost
47518
+ costUsd: trialCost,
47519
+ executionStatus: result.executionStatus,
47520
+ failureStage: result.failureStage,
47521
+ failureReasonCode: result.failureReasonCode
47393
47522
  };
47394
47523
  trialResults.push(trial);
47395
47524
  if (trialCost !== void 0) {
@@ -47414,12 +47543,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
47414
47543
  0
47415
47544
  );
47416
47545
  const baseResult = allResults[bestTrialIndex];
47546
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
47547
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
47548
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
47549
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
47550
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
47551
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
47417
47552
  return {
47418
47553
  ...baseResult,
47419
47554
  score,
47420
47555
  trials: trialResults,
47421
47556
  aggregation,
47422
- costLimited: costLimited || void 0
47557
+ costLimited: costLimited || void 0,
47558
+ executionStatus: aggregateExecutionStatus,
47559
+ failureStage: aggregateFailureStage,
47560
+ failureReasonCode: aggregateFailureReasonCode,
47561
+ executionError: aggregateExecutionError
47423
47562
  };
47424
47563
  }
47425
47564
  async function evaluateCandidate(options) {
@@ -47520,7 +47659,8 @@ async function evaluateCandidate(options) {
47520
47659
  scores,
47521
47660
  trace: trace2,
47522
47661
  output,
47523
- fileChanges
47662
+ fileChanges,
47663
+ executionStatus: classifyQualityStatus(score.score)
47524
47664
  };
47525
47665
  }
47526
47666
  async function runEvaluatorsForCase(options) {
@@ -47825,7 +47965,7 @@ async function invokeProvider(provider, options) {
47825
47965
  }
47826
47966
  }
47827
47967
  }
47828
- function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider) {
47968
+ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider, failureStage, failureReasonCode) {
47829
47969
  const message = error40 instanceof Error ? error40.message : String(error40);
47830
47970
  let agentRequest;
47831
47971
  let lmRequest;
@@ -47868,7 +48008,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs
47868
48008
  target: targetName,
47869
48009
  requests,
47870
48010
  input,
47871
- error: message
48011
+ error: message,
48012
+ executionStatus: "execution_error",
48013
+ failureStage,
48014
+ failureReasonCode,
48015
+ executionError: { message, stage: failureStage }
47872
48016
  };
47873
48017
  }
47874
48018
  function extractProviderError(response) {
@@ -48767,6 +48911,7 @@ export {
48767
48911
  extractTargetsFromTestCase,
48768
48912
  extractTrialsConfig,
48769
48913
  extractCacheConfig,
48914
+ extractFailOnError,
48770
48915
  detectFormat,
48771
48916
  buildPromptInputs,
48772
48917
  readTestSuiteMetadata,
@@ -48866,4 +49011,4 @@ export {
48866
49011
  OtelStreamingObserver,
48867
49012
  createAgentKernel
48868
49013
  };
48869
- //# sourceMappingURL=chunk-KWUTY5XR.js.map
49014
+ //# sourceMappingURL=chunk-FSBZM3HT.js.map