@agentv/core 2.12.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-7HPKTRFZ.js";
9
+ } from "../../chunk-JHER2LQ5.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1492,6 +1492,7 @@ __export(index_exports, {
1492
1492
  executeWorkspaceScript: () => executeWorkspaceScript,
1493
1493
  explorationRatio: () => explorationRatio,
1494
1494
  extractCacheConfig: () => extractCacheConfig,
1495
+ extractFailOnError: () => extractFailOnError,
1495
1496
  extractJsonBlob: () => extractJsonBlob,
1496
1497
  extractTargetFromSuite: () => extractTargetFromSuite,
1497
1498
  extractTargetsFromSuite: () => extractTargetsFromSuite,
@@ -2014,6 +2015,11 @@ async function loadConfig(evalFilePath, repoRoot) {
2014
2015
  continue;
2015
2016
  }
2016
2017
  const config = parsed;
2018
+ const requiredVersion = parsed.required_version;
2019
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
2020
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
2021
+ continue;
2022
+ }
2017
2023
  const guidelinePatterns = config.guideline_patterns;
2018
2024
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
2019
2025
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -2037,6 +2043,7 @@ async function loadConfig(evalFilePath, repoRoot) {
2037
2043
  configPath
2038
2044
  );
2039
2045
  return {
2046
+ required_version: requiredVersion,
2040
2047
  guideline_patterns: guidelinePatterns,
2041
2048
  eval_patterns: evalPatterns,
2042
2049
  execution: executionDefaults
@@ -2180,6 +2187,22 @@ function extractTotalBudgetUsd(suite) {
2180
2187
  );
2181
2188
  return void 0;
2182
2189
  }
2190
+ function extractFailOnError(suite) {
2191
+ const execution = suite.execution;
2192
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2193
+ return void 0;
2194
+ }
2195
+ const executionObj = execution;
2196
+ const raw = executionObj.fail_on_error ?? executionObj.failOnError;
2197
+ if (raw === void 0 || raw === null) {
2198
+ return void 0;
2199
+ }
2200
+ if (typeof raw === "boolean") {
2201
+ return raw;
2202
+ }
2203
+ logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
2204
+ return void 0;
2205
+ }
2183
2206
  function parseExecutionDefaults(raw, configPath) {
2184
2207
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
2185
2208
  return void 0;
@@ -4375,13 +4398,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4375
4398
  }
4376
4399
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
4377
4400
  const metadata = parseMetadata(parsed);
4401
+ const failOnError = extractFailOnError(parsed);
4378
4402
  return {
4379
4403
  tests,
4380
4404
  trials: extractTrialsConfig(parsed),
4381
4405
  targets: extractTargetsFromSuite(parsed),
4382
4406
  cacheConfig: extractCacheConfig(parsed),
4383
4407
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
4384
- ...metadata !== void 0 && { metadata }
4408
+ ...metadata !== void 0 && { metadata },
4409
+ ...failOnError !== void 0 && { failOnError }
4385
4410
  };
4386
4411
  }
4387
4412
  var loadEvalSuite = loadTestSuite;
@@ -15780,7 +15805,8 @@ async function runEvaluation(options) {
15780
15805
  cleanupWorkspaces,
15781
15806
  trials,
15782
15807
  streamCallbacks,
15783
- totalBudgetUsd
15808
+ totalBudgetUsd,
15809
+ failOnError
15784
15810
  } = options;
15785
15811
  let useCache = options.useCache;
15786
15812
  if (trials && trials.count > 1 && useCache) {
@@ -15976,6 +16002,7 @@ async function runEvaluation(options) {
15976
16002
  let beforeAllOutputAttached = false;
15977
16003
  let cumulativeBudgetCost = 0;
15978
16004
  let budgetExhausted = false;
16005
+ let failOnErrorTriggered = false;
15979
16006
  const promises = filteredEvalCases.map(
15980
16007
  (evalCase) => limit(async () => {
15981
16008
  const workerId = nextWorkerId++;
@@ -16014,6 +16041,37 @@ async function runEvaluation(options) {
16014
16041
  }
16015
16042
  return budgetResult;
16016
16043
  }
16044
+ if (failOnError === true && failOnErrorTriggered) {
16045
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
16046
+ const haltResult = {
16047
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16048
+ testId: evalCase.id,
16049
+ dataset: evalCase.dataset,
16050
+ score: 0,
16051
+ hits: [],
16052
+ misses: [],
16053
+ answer: "",
16054
+ target: target.name,
16055
+ error: errorMsg,
16056
+ executionStatus: "execution_error",
16057
+ failureStage: "setup",
16058
+ failureReasonCode: "error_threshold_exceeded",
16059
+ executionError: { message: errorMsg, stage: "setup" }
16060
+ };
16061
+ if (onProgress) {
16062
+ await onProgress({
16063
+ workerId,
16064
+ testId: evalCase.id,
16065
+ status: "failed",
16066
+ completedAt: Date.now(),
16067
+ error: haltResult.error
16068
+ });
16069
+ }
16070
+ if (onResult) {
16071
+ await onResult(haltResult);
16072
+ }
16073
+ return haltResult;
16074
+ }
16017
16075
  if (onProgress) {
16018
16076
  await onProgress({
16019
16077
  workerId,
@@ -16066,6 +16124,9 @@ async function runEvaluation(options) {
16066
16124
  }
16067
16125
  }
16068
16126
  }
16127
+ if (failOnError === true && result.executionStatus === "execution_error") {
16128
+ failOnErrorTriggered = true;
16129
+ }
16069
16130
  if (beforeAllOutput && !beforeAllOutputAttached) {
16070
16131
  result = { ...result, beforeAllOutput };
16071
16132
  beforeAllOutputAttached = true;
@@ -18132,6 +18193,7 @@ function createAgentKernel() {
18132
18193
  executeWorkspaceScript,
18133
18194
  explorationRatio,
18134
18195
  extractCacheConfig,
18196
+ extractFailOnError,
18135
18197
  extractJsonBlob,
18136
18198
  extractTargetFromSuite,
18137
18199
  extractTargetsFromSuite,