@agentv/core 2.11.4 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-REN5PS7B.js";
20
+ } from "./chunk-JHER2LQ5.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -396,6 +396,11 @@ async function loadConfig(evalFilePath, repoRoot) {
396
396
  continue;
397
397
  }
398
398
  const config = parsed;
399
+ const requiredVersion = parsed.required_version;
400
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
401
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
402
+ continue;
403
+ }
399
404
  const guidelinePatterns = config.guideline_patterns;
400
405
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
401
406
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -419,6 +424,7 @@ async function loadConfig(evalFilePath, repoRoot) {
419
424
  configPath
420
425
  );
421
426
  return {
427
+ required_version: requiredVersion,
422
428
  guideline_patterns: guidelinePatterns,
423
429
  eval_patterns: evalPatterns,
424
430
  execution: executionDefaults
@@ -562,6 +568,22 @@ function extractTotalBudgetUsd(suite) {
562
568
  );
563
569
  return void 0;
564
570
  }
571
/**
 * Reads the optional `execution.fail_on_error` flag (camelCase alias:
 * `execution.failOnError`) from a parsed suite object.
 *
 * The snake_case key wins whenever it is neither null nor undefined.
 *
 * @param {object} suite - Parsed suite; only its `execution` property is read.
 * @returns {boolean | undefined} The flag when it is a real boolean. Returns
 *   undefined when `execution` is missing, not an object, or an array; when
 *   the flag is absent or null; or when it has a non-boolean value (in which
 *   case a warning is logged first).
 */
function extractFailOnError(suite) {
  const settings = suite.execution;
  const isObjectLike = Boolean(settings) && typeof settings === "object";
  if (!isObjectLike || Array.isArray(settings)) {
    return void 0;
  }

  const flag = settings.fail_on_error ?? settings.failOnError;
  if (typeof flag === "boolean") {
    return flag;
  }

  // Absent/null is silently treated as "not configured"; anything else is a
  // user mistake worth reporting before it is ignored.
  if (flag !== void 0 && flag !== null) {
    logWarning(`Invalid execution.fail_on_error: ${flag}. Must be true or false. Ignoring.`);
  }
  return void 0;
}
565
587
  function parseExecutionDefaults(raw, configPath) {
566
588
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
567
589
  return void 0;
@@ -2757,13 +2779,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2757
2779
  }
2758
2780
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
2759
2781
  const metadata = parseMetadata(parsed);
2782
+ const failOnError = extractFailOnError(parsed);
2760
2783
  return {
2761
2784
  tests,
2762
2785
  trials: extractTrialsConfig(parsed),
2763
2786
  targets: extractTargetsFromSuite(parsed),
2764
2787
  cacheConfig: extractCacheConfig(parsed),
2765
2788
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
2766
- ...metadata !== void 0 && { metadata }
2789
+ ...metadata !== void 0 && { metadata },
2790
+ ...failOnError !== void 0 && { failOnError }
2767
2791
  };
2768
2792
  }
2769
2793
  var loadEvalSuite = loadTestSuite;
@@ -12847,6 +12871,16 @@ async function resolveWorkspaceTemplate(templatePath) {
12847
12871
  }
12848
12872
 
12849
12873
  // src/evaluation/workspace/script-executor.ts
12874
/**
 * Expands `{{name}}` placeholders in every command argument using values
 * taken from the script context.
 *
 * Recognised placeholders:
 *   - `workspace_path` → `context.workspacePath`
 *   - `test_id`        → `context.testId`
 *   - `eval_run_id`    → `context.evalRunId`
 *   - `case_input`     → `context.caseInput`, or "" when null/undefined
 *   - `case_metadata`  → JSON text of `context.caseMetadata`, or "" when falsy
 *
 * Any other placeholder, and any recognised placeholder whose context value is
 * null/undefined, is left in the argument verbatim.
 *
 * @param {string[]} args - Raw command arguments.
 * @param {object} context - Script context supplying the placeholder values.
 * @returns {string[]} A new array with the placeholders expanded.
 */
function interpolateArgs(args, context) {
  // Null-prototype table: with a plain object literal, placeholders such as
  // `{{constructor}}`, `{{toString}}` or `{{__proto__}}` would resolve to
  // inherited Object.prototype members and be stringified into the command.
  const vars = Object.assign(Object.create(null), {
    workspace_path: context.workspacePath,
    test_id: context.testId,
    eval_run_id: context.evalRunId,
    case_input: context.caseInput ?? "",
    case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
  });
  const substitute = (match, name) => {
    const value = vars[name];
    return value === void 0 || value === null ? match : String(value);
  };
  return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, substitute));
}
12850
12884
  async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12851
12885
  const stdin = JSON.stringify({
12852
12886
  workspace_path: context.workspacePath,
@@ -12856,8 +12890,9 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12856
12890
  case_metadata: context.caseMetadata ?? null
12857
12891
  });
12858
12892
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
12859
- const cwd = config.cwd;
12860
- const commandArray = config.command ?? config.script ?? [];
12893
+ const cwd = config.cwd ?? context.evalDir;
12894
+ const rawCommand = config.command ?? config.script ?? [];
12895
+ const commandArray = interpolateArgs(rawCommand, context);
12861
12896
  const result = await execFileWithStdin(commandArray, stdin, {
12862
12897
  timeoutMs,
12863
12898
  cwd
@@ -12874,6 +12909,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12874
12909
  }
12875
12910
 
12876
12911
  // src/evaluation/orchestrator.ts
12912
// Minimum score (inclusive) for a result to be classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;

/**
 * Maps a numeric score to an execution status label.
 *
 * @param {number} score - Score to classify.
 * @returns {"ok" | "quality_failure"} "ok" when `score` is at or above
 *   QUALITY_PASS_THRESHOLD; "quality_failure" for every other value,
 *   including NaN and undefined (the comparison is false for those).
 */
function classifyQualityStatus(score) {
  if (score >= QUALITY_PASS_THRESHOLD) {
    return "ok";
  }
  return "quality_failure";
}
12877
12916
  function usesFileReferencePrompt(provider) {
12878
12917
  return isAgentProvider(provider) || provider.kind === "cli";
12879
12918
  }
@@ -12906,7 +12945,8 @@ async function runEvaluation(options) {
12906
12945
  cleanupWorkspaces,
12907
12946
  trials,
12908
12947
  streamCallbacks,
12909
- totalBudgetUsd
12948
+ totalBudgetUsd,
12949
+ failOnError
12910
12950
  } = options;
12911
12951
  let useCache = options.useCache;
12912
12952
  if (trials && trials.count > 1 && useCache) {
@@ -12981,6 +13021,7 @@ async function runEvaluation(options) {
12981
13021
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
12982
13022
  const typeRegistry = createBuiltinRegistry();
12983
13023
  const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
13024
+ const evalDir = discoveryBaseDir;
12984
13025
  await discoverAssertions(typeRegistry, discoveryBaseDir);
12985
13026
  const providerRegistry = createBuiltinProviderRegistry();
12986
13027
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -13076,7 +13117,8 @@ async function runEvaluation(options) {
13076
13117
  const scriptContext = {
13077
13118
  workspacePath: sharedWorkspacePath,
13078
13119
  testId: "__before_all__",
13079
- evalRunId
13120
+ evalRunId,
13121
+ evalDir
13080
13122
  };
13081
13123
  try {
13082
13124
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -13100,6 +13142,7 @@ async function runEvaluation(options) {
13100
13142
  let beforeAllOutputAttached = false;
13101
13143
  let cumulativeBudgetCost = 0;
13102
13144
  let budgetExhausted = false;
13145
+ let failOnErrorTriggered = false;
13103
13146
  const promises = filteredEvalCases.map(
13104
13147
  (evalCase) => limit(async () => {
13105
13148
  const workerId = nextWorkerId++;
@@ -13115,7 +13158,14 @@ async function runEvaluation(options) {
13115
13158
  answer: "",
13116
13159
  target: target.name,
13117
13160
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13118
- budgetExceeded: true
13161
+ budgetExceeded: true,
13162
+ executionStatus: "execution_error",
13163
+ failureStage: "setup",
13164
+ failureReasonCode: "budget_exceeded",
13165
+ executionError: {
13166
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13167
+ stage: "setup"
13168
+ }
13119
13169
  };
13120
13170
  if (onProgress) {
13121
13171
  await onProgress({
@@ -13131,6 +13181,37 @@ async function runEvaluation(options) {
13131
13181
  }
13132
13182
  return budgetResult;
13133
13183
  }
13184
+ if (failOnError === true && failOnErrorTriggered) {
13185
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
13186
+ const haltResult = {
13187
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
13188
+ testId: evalCase.id,
13189
+ dataset: evalCase.dataset,
13190
+ score: 0,
13191
+ hits: [],
13192
+ misses: [],
13193
+ answer: "",
13194
+ target: target.name,
13195
+ error: errorMsg,
13196
+ executionStatus: "execution_error",
13197
+ failureStage: "setup",
13198
+ failureReasonCode: "error_threshold_exceeded",
13199
+ executionError: { message: errorMsg, stage: "setup" }
13200
+ };
13201
+ if (onProgress) {
13202
+ await onProgress({
13203
+ workerId,
13204
+ testId: evalCase.id,
13205
+ status: "failed",
13206
+ completedAt: Date.now(),
13207
+ error: haltResult.error
13208
+ });
13209
+ }
13210
+ if (onResult) {
13211
+ await onResult(haltResult);
13212
+ }
13213
+ return haltResult;
13214
+ }
13134
13215
  if (onProgress) {
13135
13216
  await onProgress({
13136
13217
  workerId,
@@ -13162,7 +13243,8 @@ async function runEvaluation(options) {
13162
13243
  suiteWorkspaceFile,
13163
13244
  streamCallbacks,
13164
13245
  typeRegistry,
13165
- repoManager
13246
+ repoManager,
13247
+ evalDir
13166
13248
  };
13167
13249
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
13168
13250
  if (totalBudgetUsd !== void 0) {
@@ -13182,6 +13264,9 @@ async function runEvaluation(options) {
13182
13264
  }
13183
13265
  }
13184
13266
  }
13267
+ if (failOnError === true && result.executionStatus === "execution_error") {
13268
+ failOnErrorTriggered = true;
13269
+ }
13185
13270
  if (beforeAllOutput && !beforeAllOutputAttached) {
13186
13271
  result = { ...result, beforeAllOutput };
13187
13272
  beforeAllOutputAttached = true;
@@ -13231,7 +13316,9 @@ async function runEvaluation(options) {
13231
13316
  (now ?? (() => /* @__PURE__ */ new Date()))(),
13232
13317
  outcome.reason,
13233
13318
  promptInputs,
13234
- primaryProvider
13319
+ primaryProvider,
13320
+ "agent",
13321
+ "provider_error"
13235
13322
  );
13236
13323
  results.push(errorResult);
13237
13324
  if (onResult) {
@@ -13243,7 +13330,8 @@ async function runEvaluation(options) {
13243
13330
  const scriptContext = {
13244
13331
  workspacePath: sharedWorkspacePath,
13245
13332
  testId: "__after_all__",
13246
- evalRunId
13333
+ evalRunId,
13334
+ evalDir
13247
13335
  };
13248
13336
  try {
13249
13337
  const afterAllOutput = await executeWorkspaceScript(
@@ -13373,7 +13461,14 @@ async function runBatchEvaluation(options) {
13373
13461
  availableTargets
13374
13462
  });
13375
13463
  if (providerError) {
13376
- result = { ...result, error: providerError };
13464
+ result = {
13465
+ ...result,
13466
+ error: providerError,
13467
+ executionStatus: "execution_error",
13468
+ failureStage: "agent",
13469
+ failureReasonCode: "provider_error",
13470
+ executionError: { message: providerError, stage: "agent" }
13471
+ };
13377
13472
  }
13378
13473
  } catch (error) {
13379
13474
  const errorResult = buildErrorResult(
@@ -13382,7 +13477,9 @@ async function runBatchEvaluation(options) {
13382
13477
  nowFn(),
13383
13478
  error,
13384
13479
  promptInputs,
13385
- provider
13480
+ provider,
13481
+ "evaluator",
13482
+ "evaluator_error"
13386
13483
  );
13387
13484
  results.push(errorResult);
13388
13485
  if (onResult) {
@@ -13438,7 +13535,8 @@ async function runEvalCase(options) {
13438
13535
  sharedBaselineCommit,
13439
13536
  suiteWorkspaceFile,
13440
13537
  typeRegistry: providedTypeRegistry,
13441
- repoManager
13538
+ repoManager,
13539
+ evalDir
13442
13540
  } = options;
13443
13541
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
13444
13542
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -13471,7 +13569,9 @@ async function runEvalCase(options) {
13471
13569
  nowFn(),
13472
13570
  new Error(`Failed to create workspace: ${message}`),
13473
13571
  promptInputs,
13474
- provider
13572
+ provider,
13573
+ "setup",
13574
+ "template_error"
13475
13575
  );
13476
13576
  }
13477
13577
  }
@@ -13491,7 +13591,9 @@ async function runEvalCase(options) {
13491
13591
  nowFn(),
13492
13592
  new Error(`Failed to materialize repos: ${message}`),
13493
13593
  promptInputs,
13494
- provider
13594
+ provider,
13595
+ "repo_setup",
13596
+ "clone_error"
13495
13597
  );
13496
13598
  }
13497
13599
  }
@@ -13501,7 +13603,8 @@ async function runEvalCase(options) {
13501
13603
  testId: evalCase.id,
13502
13604
  evalRunId: evalRunId ?? "",
13503
13605
  caseInput: evalCase.question,
13504
- caseMetadata: evalCase.metadata
13606
+ caseMetadata: evalCase.metadata,
13607
+ evalDir
13505
13608
  };
13506
13609
  try {
13507
13610
  beforeAllOutput = await executeWorkspaceScript(
@@ -13520,7 +13623,9 @@ async function runEvalCase(options) {
13520
13623
  nowFn(),
13521
13624
  new Error(`before_all script failed: ${message}`),
13522
13625
  promptInputs,
13523
- provider
13626
+ provider,
13627
+ "setup",
13628
+ "script_error"
13524
13629
  );
13525
13630
  }
13526
13631
  }
@@ -13531,7 +13636,8 @@ async function runEvalCase(options) {
13531
13636
  testId: evalCase.id,
13532
13637
  evalRunId: evalRunId ?? "",
13533
13638
  caseInput: evalCase.question,
13534
- caseMetadata: evalCase.metadata
13639
+ caseMetadata: evalCase.metadata,
13640
+ evalDir
13535
13641
  };
13536
13642
  try {
13537
13643
  beforeEachOutput = await executeWorkspaceScript(
@@ -13546,7 +13652,9 @@ async function runEvalCase(options) {
13546
13652
  nowFn(),
13547
13653
  new Error(`before_each script failed: ${message}`),
13548
13654
  promptInputs,
13549
- provider
13655
+ provider,
13656
+ "setup",
13657
+ "script_error"
13550
13658
  );
13551
13659
  }
13552
13660
  }
@@ -13587,7 +13695,9 @@ async function runEvalCase(options) {
13587
13695
  nowFn(),
13588
13696
  error,
13589
13697
  promptInputs,
13590
- provider
13698
+ provider,
13699
+ "agent",
13700
+ "provider_error"
13591
13701
  );
13592
13702
  if (workspacePath) {
13593
13703
  if (forceCleanup) {
@@ -13606,7 +13716,9 @@ async function runEvalCase(options) {
13606
13716
  nowFn(),
13607
13717
  lastError ?? new Error("Provider did not return a response"),
13608
13718
  promptInputs,
13609
- provider
13719
+ provider,
13720
+ "agent",
13721
+ "provider_error"
13610
13722
  );
13611
13723
  if (workspacePath) {
13612
13724
  if (forceCleanup) {
@@ -13662,7 +13774,8 @@ async function runEvalCase(options) {
13662
13774
  testId: evalCase.id,
13663
13775
  evalRunId: evalRunId ?? "",
13664
13776
  caseInput: evalCase.question,
13665
- caseMetadata: evalCase.metadata
13777
+ caseMetadata: evalCase.metadata,
13778
+ evalDir
13666
13779
  };
13667
13780
  try {
13668
13781
  afterEachOutput = await executeWorkspaceScript(
@@ -13698,7 +13811,18 @@ async function runEvalCase(options) {
13698
13811
  fileChanges,
13699
13812
  workspacePath
13700
13813
  });
13701
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
13814
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
13815
+ const finalResult = providerError ? {
13816
+ ...result,
13817
+ error: providerError,
13818
+ executionStatus,
13819
+ failureStage: "agent",
13820
+ failureReasonCode: "provider_error",
13821
+ executionError: { message: providerError, stage: "agent" },
13822
+ beforeAllOutput,
13823
+ beforeEachOutput,
13824
+ afterEachOutput
13825
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
13702
13826
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
13703
13827
  if (workspacePath && !isSharedWorkspace) {
13704
13828
  if (forceCleanup) {
@@ -13719,7 +13843,9 @@ async function runEvalCase(options) {
13719
13843
  nowFn(),
13720
13844
  error,
13721
13845
  promptInputs,
13722
- provider
13846
+ provider,
13847
+ "evaluator",
13848
+ "evaluator_error"
13723
13849
  );
13724
13850
  if (workspacePath && !isSharedWorkspace) {
13725
13851
  if (forceCleanup) {
@@ -13757,7 +13883,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13757
13883
  verdict: trialVerdict,
13758
13884
  scores: result.scores,
13759
13885
  error: result.error,
13760
- costUsd: trialCost
13886
+ costUsd: trialCost,
13887
+ executionStatus: result.executionStatus,
13888
+ failureStage: result.failureStage,
13889
+ failureReasonCode: result.failureReasonCode
13761
13890
  };
13762
13891
  trialResults.push(trial);
13763
13892
  if (trialCost !== void 0) {
@@ -13782,12 +13911,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13782
13911
  0
13783
13912
  );
13784
13913
  const baseResult = allResults[bestTrialIndex];
13914
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
13915
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
13916
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
13917
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
13918
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
13919
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
13785
13920
  return {
13786
13921
  ...baseResult,
13787
13922
  score,
13788
13923
  trials: trialResults,
13789
13924
  aggregation,
13790
- costLimited: costLimited || void 0
13925
+ costLimited: costLimited || void 0,
13926
+ executionStatus: aggregateExecutionStatus,
13927
+ failureStage: aggregateFailureStage,
13928
+ failureReasonCode: aggregateFailureReasonCode,
13929
+ executionError: aggregateExecutionError
13791
13930
  };
13792
13931
  }
13793
13932
  async function evaluateCandidate(options) {
@@ -13888,7 +14027,8 @@ async function evaluateCandidate(options) {
13888
14027
  scores,
13889
14028
  trace,
13890
14029
  output,
13891
- fileChanges
14030
+ fileChanges,
14031
+ executionStatus: classifyQualityStatus(score.score)
13892
14032
  };
13893
14033
  }
13894
14034
  async function runEvaluatorsForCase(options) {
@@ -14193,7 +14333,7 @@ async function invokeProvider(provider, options) {
14193
14333
  }
14194
14334
  }
14195
14335
  }
14196
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
14336
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
14197
14337
  const message = error instanceof Error ? error.message : String(error);
14198
14338
  let agentRequest;
14199
14339
  let lmRequest;
@@ -14236,7 +14376,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
14236
14376
  target: targetName,
14237
14377
  requests,
14238
14378
  input,
14239
- error: message
14379
+ error: message,
14380
+ executionStatus: "execution_error",
14381
+ failureStage,
14382
+ failureReasonCode,
14383
+ executionError: { message, stage: failureStage }
14240
14384
  };
14241
14385
  }
14242
14386
  function extractProviderError(response) {
@@ -15184,6 +15328,7 @@ export {
15184
15328
  executeWorkspaceScript,
15185
15329
  explorationRatio,
15186
15330
  extractCacheConfig,
15331
+ extractFailOnError,
15187
15332
  extractJsonBlob,
15188
15333
  extractTargetFromSuite,
15189
15334
  extractTargetsFromSuite,