@agentv/core 2.11.4 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-REN5PS7B.js";
9
+ } from "../../chunk-JHER2LQ5.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1492,6 +1492,7 @@ __export(index_exports, {
1492
1492
  executeWorkspaceScript: () => executeWorkspaceScript,
1493
1493
  explorationRatio: () => explorationRatio,
1494
1494
  extractCacheConfig: () => extractCacheConfig,
1495
+ extractFailOnError: () => extractFailOnError,
1495
1496
  extractJsonBlob: () => extractJsonBlob,
1496
1497
  extractTargetFromSuite: () => extractTargetFromSuite,
1497
1498
  extractTargetsFromSuite: () => extractTargetsFromSuite,
@@ -2014,6 +2015,11 @@ async function loadConfig(evalFilePath, repoRoot) {
2014
2015
  continue;
2015
2016
  }
2016
2017
  const config = parsed;
2018
+ const requiredVersion = parsed.required_version;
2019
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
2020
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
2021
+ continue;
2022
+ }
2017
2023
  const guidelinePatterns = config.guideline_patterns;
2018
2024
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
2019
2025
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -2037,6 +2043,7 @@ async function loadConfig(evalFilePath, repoRoot) {
2037
2043
  configPath
2038
2044
  );
2039
2045
  return {
2046
+ required_version: requiredVersion,
2040
2047
  guideline_patterns: guidelinePatterns,
2041
2048
  eval_patterns: evalPatterns,
2042
2049
  execution: executionDefaults
@@ -2180,6 +2187,22 @@ function extractTotalBudgetUsd(suite) {
2180
2187
  );
2181
2188
  return void 0;
2182
2189
  }
/**
 * Read the suite-level `execution.fail_on_error` flag from a parsed suite object.
 *
 * Both the snake_case key (`fail_on_error`) and the camelCase key
 * (`failOnError`) are accepted; the snake_case key wins when both are set
 * to a non-nullish value.
 *
 * @param {object} suite - Parsed suite; only its `execution` property is read.
 * @returns {boolean | undefined} The flag when it is a real boolean, otherwise
 *   `undefined` (missing section, missing key, or invalid value).
 */
function extractFailOnError(suite) {
  const execution = suite.execution;
  // Anything other than a plain mapping means there is no execution section to read.
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
    return void 0;
  }
  const raw = execution.fail_on_error ?? execution.failOnError;
  if (raw === void 0 || raw === null) {
    return void 0;
  }
  if (typeof raw === "boolean") {
    return raw;
  }
  // Render the offending value unambiguously: plain interpolation prints the
  // string "true" and the array [true] as `true`, which makes the warning
  // read as if a valid boolean had been rejected.
  let shown;
  try {
    shown = JSON.stringify(raw) ?? String(raw);
  } catch {
    // JSON.stringify throws on cycles and BigInt; fall back to plain coercion.
    shown = String(raw);
  }
  logWarning(`Invalid execution.fail_on_error: ${shown}. Must be true or false. Ignoring.`);
  return void 0;
}
2183
2206
  function parseExecutionDefaults(raw, configPath) {
2184
2207
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
2185
2208
  return void 0;
@@ -4375,13 +4398,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4375
4398
  }
4376
4399
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
4377
4400
  const metadata = parseMetadata(parsed);
4401
+ const failOnError = extractFailOnError(parsed);
4378
4402
  return {
4379
4403
  tests,
4380
4404
  trials: extractTrialsConfig(parsed),
4381
4405
  targets: extractTargetsFromSuite(parsed),
4382
4406
  cacheConfig: extractCacheConfig(parsed),
4383
4407
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
4384
- ...metadata !== void 0 && { metadata }
4408
+ ...metadata !== void 0 && { metadata },
4409
+ ...failOnError !== void 0 && { failOnError }
4385
4410
  };
4386
4411
  }
4387
4412
  var loadEvalSuite = loadTestSuite;
@@ -15706,6 +15731,16 @@ async function resolveWorkspaceTemplate(templatePath) {
15706
15731
  }
15707
15732
 
15708
15733
  // src/evaluation/workspace/script-executor.ts
/**
 * Substitute `{{name}}` placeholders in each command argument with values
 * taken from the script context.
 *
 * Recognized placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` and `case_metadata` (JSON-encoded). A placeholder whose name
 * is not recognized, or whose context value is null/undefined, is left in
 * the argument verbatim.
 *
 * @param {string[]} args - Command arguments, possibly containing placeholders.
 * @param {object} context2 - Script context supplying the substitution values.
 * @returns {string[]} A new array with placeholders expanded.
 */
function interpolateArgs(args, context2) {
  // A Map, unlike a plain object, has no inherited keys: `{{constructor}}`,
  // `{{toString}}` or `{{__proto__}}` must stay untouched instead of being
  // replaced with stringified Object.prototype members.
  const vars = new Map([
    ["workspace_path", context2.workspacePath],
    ["test_id", context2.testId],
    ["eval_run_id", context2.evalRunId],
    ["case_input", context2.caseInput ?? ""],
    ["case_metadata", context2.caseMetadata ? JSON.stringify(context2.caseMetadata) : ""]
  ]);
  return args.map(
    (arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars.get(name) ?? match)
  );
}
15709
15744
  async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15710
15745
  const stdin = JSON.stringify({
15711
15746
  workspace_path: context2.workspacePath,
@@ -15715,8 +15750,9 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15715
15750
  case_metadata: context2.caseMetadata ?? null
15716
15751
  });
15717
15752
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
15718
- const cwd = config.cwd;
15719
- const commandArray = config.command ?? config.script ?? [];
15753
+ const cwd = config.cwd ?? context2.evalDir;
15754
+ const rawCommand = config.command ?? config.script ?? [];
15755
+ const commandArray = interpolateArgs(rawCommand, context2);
15720
15756
  const result = await execFileWithStdin(commandArray, stdin, {
15721
15757
  timeoutMs,
15722
15758
  cwd
@@ -15733,6 +15769,10 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15733
15769
  }
15734
15770
 
15735
15771
  // src/evaluation/orchestrator.ts
// Minimum score (inclusive) at which a result is classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;
/**
 * Map a numeric score to an execution status label.
 *
 * @param {number} score - Score to classify.
 * @returns {"ok" | "quality_failure"} "ok" when the score reaches the
 *   threshold, "quality_failure" otherwise (including for NaN, which fails
 *   the comparison).
 */
function classifyQualityStatus(score) {
  if (score >= QUALITY_PASS_THRESHOLD) {
    return "ok";
  }
  return "quality_failure";
}
15736
15776
  function usesFileReferencePrompt(provider) {
15737
15777
  return isAgentProvider(provider) || provider.kind === "cli";
15738
15778
  }
@@ -15765,7 +15805,8 @@ async function runEvaluation(options) {
15765
15805
  cleanupWorkspaces,
15766
15806
  trials,
15767
15807
  streamCallbacks,
15768
- totalBudgetUsd
15808
+ totalBudgetUsd,
15809
+ failOnError
15769
15810
  } = options;
15770
15811
  let useCache = options.useCache;
15771
15812
  if (trials && trials.count > 1 && useCache) {
@@ -15840,6 +15881,7 @@ async function runEvaluation(options) {
15840
15881
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
15841
15882
  const typeRegistry = createBuiltinRegistry();
15842
15883
  const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
15884
+ const evalDir = discoveryBaseDir;
15843
15885
  await discoverAssertions(typeRegistry, discoveryBaseDir);
15844
15886
  const providerRegistry = createBuiltinProviderRegistry();
15845
15887
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -15935,7 +15977,8 @@ async function runEvaluation(options) {
15935
15977
  const scriptContext = {
15936
15978
  workspacePath: sharedWorkspacePath,
15937
15979
  testId: "__before_all__",
15938
- evalRunId
15980
+ evalRunId,
15981
+ evalDir
15939
15982
  };
15940
15983
  try {
15941
15984
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -15959,6 +16002,7 @@ async function runEvaluation(options) {
15959
16002
  let beforeAllOutputAttached = false;
15960
16003
  let cumulativeBudgetCost = 0;
15961
16004
  let budgetExhausted = false;
16005
+ let failOnErrorTriggered = false;
15962
16006
  const promises = filteredEvalCases.map(
15963
16007
  (evalCase) => limit(async () => {
15964
16008
  const workerId = nextWorkerId++;
@@ -15974,7 +16018,14 @@ async function runEvaluation(options) {
15974
16018
  answer: "",
15975
16019
  target: target.name,
15976
16020
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15977
- budgetExceeded: true
16021
+ budgetExceeded: true,
16022
+ executionStatus: "execution_error",
16023
+ failureStage: "setup",
16024
+ failureReasonCode: "budget_exceeded",
16025
+ executionError: {
16026
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16027
+ stage: "setup"
16028
+ }
15978
16029
  };
15979
16030
  if (onProgress) {
15980
16031
  await onProgress({
@@ -15990,6 +16041,37 @@ async function runEvaluation(options) {
15990
16041
  }
15991
16042
  return budgetResult;
15992
16043
  }
16044
+ if (failOnError === true && failOnErrorTriggered) {
16045
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
16046
+ const haltResult = {
16047
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16048
+ testId: evalCase.id,
16049
+ dataset: evalCase.dataset,
16050
+ score: 0,
16051
+ hits: [],
16052
+ misses: [],
16053
+ answer: "",
16054
+ target: target.name,
16055
+ error: errorMsg,
16056
+ executionStatus: "execution_error",
16057
+ failureStage: "setup",
16058
+ failureReasonCode: "error_threshold_exceeded",
16059
+ executionError: { message: errorMsg, stage: "setup" }
16060
+ };
16061
+ if (onProgress) {
16062
+ await onProgress({
16063
+ workerId,
16064
+ testId: evalCase.id,
16065
+ status: "failed",
16066
+ completedAt: Date.now(),
16067
+ error: haltResult.error
16068
+ });
16069
+ }
16070
+ if (onResult) {
16071
+ await onResult(haltResult);
16072
+ }
16073
+ return haltResult;
16074
+ }
15993
16075
  if (onProgress) {
15994
16076
  await onProgress({
15995
16077
  workerId,
@@ -16021,7 +16103,8 @@ async function runEvaluation(options) {
16021
16103
  suiteWorkspaceFile,
16022
16104
  streamCallbacks,
16023
16105
  typeRegistry,
16024
- repoManager
16106
+ repoManager,
16107
+ evalDir
16025
16108
  };
16026
16109
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
16027
16110
  if (totalBudgetUsd !== void 0) {
@@ -16041,6 +16124,9 @@ async function runEvaluation(options) {
16041
16124
  }
16042
16125
  }
16043
16126
  }
16127
+ if (failOnError === true && result.executionStatus === "execution_error") {
16128
+ failOnErrorTriggered = true;
16129
+ }
16044
16130
  if (beforeAllOutput && !beforeAllOutputAttached) {
16045
16131
  result = { ...result, beforeAllOutput };
16046
16132
  beforeAllOutputAttached = true;
@@ -16090,7 +16176,9 @@ async function runEvaluation(options) {
16090
16176
  (now ?? (() => /* @__PURE__ */ new Date()))(),
16091
16177
  outcome.reason,
16092
16178
  promptInputs,
16093
- primaryProvider
16179
+ primaryProvider,
16180
+ "agent",
16181
+ "provider_error"
16094
16182
  );
16095
16183
  results.push(errorResult);
16096
16184
  if (onResult) {
@@ -16102,7 +16190,8 @@ async function runEvaluation(options) {
16102
16190
  const scriptContext = {
16103
16191
  workspacePath: sharedWorkspacePath,
16104
16192
  testId: "__after_all__",
16105
- evalRunId
16193
+ evalRunId,
16194
+ evalDir
16106
16195
  };
16107
16196
  try {
16108
16197
  const afterAllOutput = await executeWorkspaceScript(
@@ -16232,7 +16321,14 @@ async function runBatchEvaluation(options) {
16232
16321
  availableTargets
16233
16322
  });
16234
16323
  if (providerError) {
16235
- result = { ...result, error: providerError };
16324
+ result = {
16325
+ ...result,
16326
+ error: providerError,
16327
+ executionStatus: "execution_error",
16328
+ failureStage: "agent",
16329
+ failureReasonCode: "provider_error",
16330
+ executionError: { message: providerError, stage: "agent" }
16331
+ };
16236
16332
  }
16237
16333
  } catch (error) {
16238
16334
  const errorResult = buildErrorResult(
@@ -16241,7 +16337,9 @@ async function runBatchEvaluation(options) {
16241
16337
  nowFn(),
16242
16338
  error,
16243
16339
  promptInputs,
16244
- provider
16340
+ provider,
16341
+ "evaluator",
16342
+ "evaluator_error"
16245
16343
  );
16246
16344
  results.push(errorResult);
16247
16345
  if (onResult) {
@@ -16297,7 +16395,8 @@ async function runEvalCase(options) {
16297
16395
  sharedBaselineCommit,
16298
16396
  suiteWorkspaceFile,
16299
16397
  typeRegistry: providedTypeRegistry,
16300
- repoManager
16398
+ repoManager,
16399
+ evalDir
16301
16400
  } = options;
16302
16401
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
16303
16402
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -16330,7 +16429,9 @@ async function runEvalCase(options) {
16330
16429
  nowFn(),
16331
16430
  new Error(`Failed to create workspace: ${message}`),
16332
16431
  promptInputs,
16333
- provider
16432
+ provider,
16433
+ "setup",
16434
+ "template_error"
16334
16435
  );
16335
16436
  }
16336
16437
  }
@@ -16350,7 +16451,9 @@ async function runEvalCase(options) {
16350
16451
  nowFn(),
16351
16452
  new Error(`Failed to materialize repos: ${message}`),
16352
16453
  promptInputs,
16353
- provider
16454
+ provider,
16455
+ "repo_setup",
16456
+ "clone_error"
16354
16457
  );
16355
16458
  }
16356
16459
  }
@@ -16360,7 +16463,8 @@ async function runEvalCase(options) {
16360
16463
  testId: evalCase.id,
16361
16464
  evalRunId: evalRunId ?? "",
16362
16465
  caseInput: evalCase.question,
16363
- caseMetadata: evalCase.metadata
16466
+ caseMetadata: evalCase.metadata,
16467
+ evalDir
16364
16468
  };
16365
16469
  try {
16366
16470
  beforeAllOutput = await executeWorkspaceScript(
@@ -16379,7 +16483,9 @@ async function runEvalCase(options) {
16379
16483
  nowFn(),
16380
16484
  new Error(`before_all script failed: ${message}`),
16381
16485
  promptInputs,
16382
- provider
16486
+ provider,
16487
+ "setup",
16488
+ "script_error"
16383
16489
  );
16384
16490
  }
16385
16491
  }
@@ -16390,7 +16496,8 @@ async function runEvalCase(options) {
16390
16496
  testId: evalCase.id,
16391
16497
  evalRunId: evalRunId ?? "",
16392
16498
  caseInput: evalCase.question,
16393
- caseMetadata: evalCase.metadata
16499
+ caseMetadata: evalCase.metadata,
16500
+ evalDir
16394
16501
  };
16395
16502
  try {
16396
16503
  beforeEachOutput = await executeWorkspaceScript(
@@ -16405,7 +16512,9 @@ async function runEvalCase(options) {
16405
16512
  nowFn(),
16406
16513
  new Error(`before_each script failed: ${message}`),
16407
16514
  promptInputs,
16408
- provider
16515
+ provider,
16516
+ "setup",
16517
+ "script_error"
16409
16518
  );
16410
16519
  }
16411
16520
  }
@@ -16446,7 +16555,9 @@ async function runEvalCase(options) {
16446
16555
  nowFn(),
16447
16556
  error,
16448
16557
  promptInputs,
16449
- provider
16558
+ provider,
16559
+ "agent",
16560
+ "provider_error"
16450
16561
  );
16451
16562
  if (workspacePath) {
16452
16563
  if (forceCleanup) {
@@ -16465,7 +16576,9 @@ async function runEvalCase(options) {
16465
16576
  nowFn(),
16466
16577
  lastError ?? new Error("Provider did not return a response"),
16467
16578
  promptInputs,
16468
- provider
16579
+ provider,
16580
+ "agent",
16581
+ "provider_error"
16469
16582
  );
16470
16583
  if (workspacePath) {
16471
16584
  if (forceCleanup) {
@@ -16521,7 +16634,8 @@ async function runEvalCase(options) {
16521
16634
  testId: evalCase.id,
16522
16635
  evalRunId: evalRunId ?? "",
16523
16636
  caseInput: evalCase.question,
16524
- caseMetadata: evalCase.metadata
16637
+ caseMetadata: evalCase.metadata,
16638
+ evalDir
16525
16639
  };
16526
16640
  try {
16527
16641
  afterEachOutput = await executeWorkspaceScript(
@@ -16557,7 +16671,18 @@ async function runEvalCase(options) {
16557
16671
  fileChanges,
16558
16672
  workspacePath
16559
16673
  });
16560
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
16674
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
16675
+ const finalResult = providerError ? {
16676
+ ...result,
16677
+ error: providerError,
16678
+ executionStatus,
16679
+ failureStage: "agent",
16680
+ failureReasonCode: "provider_error",
16681
+ executionError: { message: providerError, stage: "agent" },
16682
+ beforeAllOutput,
16683
+ beforeEachOutput,
16684
+ afterEachOutput
16685
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
16561
16686
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
16562
16687
  if (workspacePath && !isSharedWorkspace) {
16563
16688
  if (forceCleanup) {
@@ -16578,7 +16703,9 @@ async function runEvalCase(options) {
16578
16703
  nowFn(),
16579
16704
  error,
16580
16705
  promptInputs,
16581
- provider
16706
+ provider,
16707
+ "evaluator",
16708
+ "evaluator_error"
16582
16709
  );
16583
16710
  if (workspacePath && !isSharedWorkspace) {
16584
16711
  if (forceCleanup) {
@@ -16616,7 +16743,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
16616
16743
  verdict: trialVerdict,
16617
16744
  scores: result.scores,
16618
16745
  error: result.error,
16619
- costUsd: trialCost
16746
+ costUsd: trialCost,
16747
+ executionStatus: result.executionStatus,
16748
+ failureStage: result.failureStage,
16749
+ failureReasonCode: result.failureReasonCode
16620
16750
  };
16621
16751
  trialResults.push(trial);
16622
16752
  if (trialCost !== void 0) {
@@ -16641,12 +16771,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
16641
16771
  0
16642
16772
  );
16643
16773
  const baseResult = allResults[bestTrialIndex];
16774
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
16775
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
16776
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
16777
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
16778
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
16779
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
16644
16780
  return {
16645
16781
  ...baseResult,
16646
16782
  score,
16647
16783
  trials: trialResults,
16648
16784
  aggregation,
16649
- costLimited: costLimited || void 0
16785
+ costLimited: costLimited || void 0,
16786
+ executionStatus: aggregateExecutionStatus,
16787
+ failureStage: aggregateFailureStage,
16788
+ failureReasonCode: aggregateFailureReasonCode,
16789
+ executionError: aggregateExecutionError
16650
16790
  };
16651
16791
  }
16652
16792
  async function evaluateCandidate(options) {
@@ -16747,7 +16887,8 @@ async function evaluateCandidate(options) {
16747
16887
  scores,
16748
16888
  trace: trace2,
16749
16889
  output,
16750
- fileChanges
16890
+ fileChanges,
16891
+ executionStatus: classifyQualityStatus(score.score)
16751
16892
  };
16752
16893
  }
16753
16894
  async function runEvaluatorsForCase(options) {
@@ -17052,7 +17193,7 @@ async function invokeProvider(provider, options) {
17052
17193
  }
17053
17194
  }
17054
17195
  }
17055
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
17196
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
17056
17197
  const message = error instanceof Error ? error.message : String(error);
17057
17198
  let agentRequest;
17058
17199
  let lmRequest;
@@ -17095,7 +17236,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
17095
17236
  target: targetName,
17096
17237
  requests,
17097
17238
  input,
17098
- error: message
17239
+ error: message,
17240
+ executionStatus: "execution_error",
17241
+ failureStage,
17242
+ failureReasonCode,
17243
+ executionError: { message, stage: failureStage }
17099
17244
  };
17100
17245
  }
17101
17246
  function extractProviderError(response) {
@@ -18048,6 +18193,7 @@ function createAgentKernel() {
18048
18193
  executeWorkspaceScript,
18049
18194
  explorationRatio,
18050
18195
  extractCacheConfig,
18196
+ extractFailOnError,
18051
18197
  extractJsonBlob,
18052
18198
  extractTargetFromSuite,
18053
18199
  extractTargetsFromSuite,