@agentv/core 2.11.4 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-REN5PS7B.js";
9
+ } from "../../chunk-7HPKTRFZ.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -15706,6 +15706,16 @@ async function resolveWorkspaceTemplate(templatePath) {
15706
15706
  }
15707
15707
 
15708
15708
  // src/evaluation/workspace/script-executor.ts
15709
/**
 * Expands `{{name}}` placeholders in every command argument using values
 * taken from the script context.
 *
 * Recognised placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` (empty string when absent) and `case_metadata` (JSON text,
 * or an empty string when absent/falsy).
 *
 * A placeholder is left verbatim when its name is not one of the keys above
 * or when the matching context value is null/undefined.
 *
 * @param {Array} args - Command arguments; non-string entries are returned unchanged.
 * @param {object} context2 - Script context supplying the substitution values.
 * @returns {Array} A new array with placeholders expanded.
 */
function interpolateArgs(args, context2) {
  const vars = {
    workspace_path: context2.workspacePath,
    test_id: context2.testId,
    eval_run_id: context2.evalRunId,
    case_input: context2.caseInput ?? "",
    case_metadata: context2.caseMetadata ? JSON.stringify(context2.caseMetadata) : ""
  };
  const resolve = (match, name) => {
    // Own-property check: `\w+` also matches names such as `constructor`,
    // `toString` or `__proto__`, which a plain lookup would resolve through
    // Object.prototype and splice into the command line.
    if (!Object.prototype.hasOwnProperty.call(vars, name)) {
      return match;
    }
    return vars[name] ?? match;
  };
  return args.map((arg) => {
    // Only strings have `.replace`; pass anything else through untouched
    // instead of throwing a TypeError.
    if (typeof arg !== "string") {
      return arg;
    }
    return arg.replace(/\{\{(\w+)\}\}/g, resolve);
  });
}
15709
15719
  async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15710
15720
  const stdin = JSON.stringify({
15711
15721
  workspace_path: context2.workspacePath,
@@ -15715,8 +15725,9 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15715
15725
  case_metadata: context2.caseMetadata ?? null
15716
15726
  });
15717
15727
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
15718
- const cwd = config.cwd;
15719
- const commandArray = config.command ?? config.script ?? [];
15728
+ const cwd = config.cwd ?? context2.evalDir;
15729
+ const rawCommand = config.command ?? config.script ?? [];
15730
+ const commandArray = interpolateArgs(rawCommand, context2);
15720
15731
  const result = await execFileWithStdin(commandArray, stdin, {
15721
15732
  timeoutMs,
15722
15733
  cwd
@@ -15733,6 +15744,10 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15733
15744
  }
15734
15745
 
15735
15746
  // src/evaluation/orchestrator.ts
15747
// Default minimum score at which a result is classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;
/**
 * Maps a numeric score to an execution status label.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=QUALITY_PASS_THRESHOLD] - Inclusive lower bound
 *   for "ok"; defaults to the module-level threshold so existing
 *   single-argument callers are unaffected.
 * @returns {"ok"|"quality_failure"} "ok" when `score >= threshold`, otherwise
 *   "quality_failure" (this includes NaN, which fails every comparison).
 */
function classifyQualityStatus(score, threshold = QUALITY_PASS_THRESHOLD) {
  if (score >= threshold) {
    return "ok";
  }
  return "quality_failure";
}
15736
15751
  function usesFileReferencePrompt(provider) {
15737
15752
  return isAgentProvider(provider) || provider.kind === "cli";
15738
15753
  }
@@ -15840,6 +15855,7 @@ async function runEvaluation(options) {
15840
15855
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
15841
15856
  const typeRegistry = createBuiltinRegistry();
15842
15857
  const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
15858
+ const evalDir = discoveryBaseDir;
15843
15859
  await discoverAssertions(typeRegistry, discoveryBaseDir);
15844
15860
  const providerRegistry = createBuiltinProviderRegistry();
15845
15861
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -15935,7 +15951,8 @@ async function runEvaluation(options) {
15935
15951
  const scriptContext = {
15936
15952
  workspacePath: sharedWorkspacePath,
15937
15953
  testId: "__before_all__",
15938
- evalRunId
15954
+ evalRunId,
15955
+ evalDir
15939
15956
  };
15940
15957
  try {
15941
15958
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -15974,7 +15991,14 @@ async function runEvaluation(options) {
15974
15991
  answer: "",
15975
15992
  target: target.name,
15976
15993
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15977
- budgetExceeded: true
15994
+ budgetExceeded: true,
15995
+ executionStatus: "execution_error",
15996
+ failureStage: "setup",
15997
+ failureReasonCode: "budget_exceeded",
15998
+ executionError: {
15999
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16000
+ stage: "setup"
16001
+ }
15978
16002
  };
15979
16003
  if (onProgress) {
15980
16004
  await onProgress({
@@ -16021,7 +16045,8 @@ async function runEvaluation(options) {
16021
16045
  suiteWorkspaceFile,
16022
16046
  streamCallbacks,
16023
16047
  typeRegistry,
16024
- repoManager
16048
+ repoManager,
16049
+ evalDir
16025
16050
  };
16026
16051
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
16027
16052
  if (totalBudgetUsd !== void 0) {
@@ -16090,7 +16115,9 @@ async function runEvaluation(options) {
16090
16115
  (now ?? (() => /* @__PURE__ */ new Date()))(),
16091
16116
  outcome.reason,
16092
16117
  promptInputs,
16093
- primaryProvider
16118
+ primaryProvider,
16119
+ "agent",
16120
+ "provider_error"
16094
16121
  );
16095
16122
  results.push(errorResult);
16096
16123
  if (onResult) {
@@ -16102,7 +16129,8 @@ async function runEvaluation(options) {
16102
16129
  const scriptContext = {
16103
16130
  workspacePath: sharedWorkspacePath,
16104
16131
  testId: "__after_all__",
16105
- evalRunId
16132
+ evalRunId,
16133
+ evalDir
16106
16134
  };
16107
16135
  try {
16108
16136
  const afterAllOutput = await executeWorkspaceScript(
@@ -16232,7 +16260,14 @@ async function runBatchEvaluation(options) {
16232
16260
  availableTargets
16233
16261
  });
16234
16262
  if (providerError) {
16235
- result = { ...result, error: providerError };
16263
+ result = {
16264
+ ...result,
16265
+ error: providerError,
16266
+ executionStatus: "execution_error",
16267
+ failureStage: "agent",
16268
+ failureReasonCode: "provider_error",
16269
+ executionError: { message: providerError, stage: "agent" }
16270
+ };
16236
16271
  }
16237
16272
  } catch (error) {
16238
16273
  const errorResult = buildErrorResult(
@@ -16241,7 +16276,9 @@ async function runBatchEvaluation(options) {
16241
16276
  nowFn(),
16242
16277
  error,
16243
16278
  promptInputs,
16244
- provider
16279
+ provider,
16280
+ "evaluator",
16281
+ "evaluator_error"
16245
16282
  );
16246
16283
  results.push(errorResult);
16247
16284
  if (onResult) {
@@ -16297,7 +16334,8 @@ async function runEvalCase(options) {
16297
16334
  sharedBaselineCommit,
16298
16335
  suiteWorkspaceFile,
16299
16336
  typeRegistry: providedTypeRegistry,
16300
- repoManager
16337
+ repoManager,
16338
+ evalDir
16301
16339
  } = options;
16302
16340
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
16303
16341
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -16330,7 +16368,9 @@ async function runEvalCase(options) {
16330
16368
  nowFn(),
16331
16369
  new Error(`Failed to create workspace: ${message}`),
16332
16370
  promptInputs,
16333
- provider
16371
+ provider,
16372
+ "setup",
16373
+ "template_error"
16334
16374
  );
16335
16375
  }
16336
16376
  }
@@ -16350,7 +16390,9 @@ async function runEvalCase(options) {
16350
16390
  nowFn(),
16351
16391
  new Error(`Failed to materialize repos: ${message}`),
16352
16392
  promptInputs,
16353
- provider
16393
+ provider,
16394
+ "repo_setup",
16395
+ "clone_error"
16354
16396
  );
16355
16397
  }
16356
16398
  }
@@ -16360,7 +16402,8 @@ async function runEvalCase(options) {
16360
16402
  testId: evalCase.id,
16361
16403
  evalRunId: evalRunId ?? "",
16362
16404
  caseInput: evalCase.question,
16363
- caseMetadata: evalCase.metadata
16405
+ caseMetadata: evalCase.metadata,
16406
+ evalDir
16364
16407
  };
16365
16408
  try {
16366
16409
  beforeAllOutput = await executeWorkspaceScript(
@@ -16379,7 +16422,9 @@ async function runEvalCase(options) {
16379
16422
  nowFn(),
16380
16423
  new Error(`before_all script failed: ${message}`),
16381
16424
  promptInputs,
16382
- provider
16425
+ provider,
16426
+ "setup",
16427
+ "script_error"
16383
16428
  );
16384
16429
  }
16385
16430
  }
@@ -16390,7 +16435,8 @@ async function runEvalCase(options) {
16390
16435
  testId: evalCase.id,
16391
16436
  evalRunId: evalRunId ?? "",
16392
16437
  caseInput: evalCase.question,
16393
- caseMetadata: evalCase.metadata
16438
+ caseMetadata: evalCase.metadata,
16439
+ evalDir
16394
16440
  };
16395
16441
  try {
16396
16442
  beforeEachOutput = await executeWorkspaceScript(
@@ -16405,7 +16451,9 @@ async function runEvalCase(options) {
16405
16451
  nowFn(),
16406
16452
  new Error(`before_each script failed: ${message}`),
16407
16453
  promptInputs,
16408
- provider
16454
+ provider,
16455
+ "setup",
16456
+ "script_error"
16409
16457
  );
16410
16458
  }
16411
16459
  }
@@ -16446,7 +16494,9 @@ async function runEvalCase(options) {
16446
16494
  nowFn(),
16447
16495
  error,
16448
16496
  promptInputs,
16449
- provider
16497
+ provider,
16498
+ "agent",
16499
+ "provider_error"
16450
16500
  );
16451
16501
  if (workspacePath) {
16452
16502
  if (forceCleanup) {
@@ -16465,7 +16515,9 @@ async function runEvalCase(options) {
16465
16515
  nowFn(),
16466
16516
  lastError ?? new Error("Provider did not return a response"),
16467
16517
  promptInputs,
16468
- provider
16518
+ provider,
16519
+ "agent",
16520
+ "provider_error"
16469
16521
  );
16470
16522
  if (workspacePath) {
16471
16523
  if (forceCleanup) {
@@ -16521,7 +16573,8 @@ async function runEvalCase(options) {
16521
16573
  testId: evalCase.id,
16522
16574
  evalRunId: evalRunId ?? "",
16523
16575
  caseInput: evalCase.question,
16524
- caseMetadata: evalCase.metadata
16576
+ caseMetadata: evalCase.metadata,
16577
+ evalDir
16525
16578
  };
16526
16579
  try {
16527
16580
  afterEachOutput = await executeWorkspaceScript(
@@ -16557,7 +16610,18 @@ async function runEvalCase(options) {
16557
16610
  fileChanges,
16558
16611
  workspacePath
16559
16612
  });
16560
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
16613
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
16614
+ const finalResult = providerError ? {
16615
+ ...result,
16616
+ error: providerError,
16617
+ executionStatus,
16618
+ failureStage: "agent",
16619
+ failureReasonCode: "provider_error",
16620
+ executionError: { message: providerError, stage: "agent" },
16621
+ beforeAllOutput,
16622
+ beforeEachOutput,
16623
+ afterEachOutput
16624
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
16561
16625
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
16562
16626
  if (workspacePath && !isSharedWorkspace) {
16563
16627
  if (forceCleanup) {
@@ -16578,7 +16642,9 @@ async function runEvalCase(options) {
16578
16642
  nowFn(),
16579
16643
  error,
16580
16644
  promptInputs,
16581
- provider
16645
+ provider,
16646
+ "evaluator",
16647
+ "evaluator_error"
16582
16648
  );
16583
16649
  if (workspacePath && !isSharedWorkspace) {
16584
16650
  if (forceCleanup) {
@@ -16616,7 +16682,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
16616
16682
  verdict: trialVerdict,
16617
16683
  scores: result.scores,
16618
16684
  error: result.error,
16619
- costUsd: trialCost
16685
+ costUsd: trialCost,
16686
+ executionStatus: result.executionStatus,
16687
+ failureStage: result.failureStage,
16688
+ failureReasonCode: result.failureReasonCode
16620
16689
  };
16621
16690
  trialResults.push(trial);
16622
16691
  if (trialCost !== void 0) {
@@ -16641,12 +16710,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
16641
16710
  0
16642
16711
  );
16643
16712
  const baseResult = allResults[bestTrialIndex];
16713
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
16714
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
16715
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
16716
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
16717
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
16718
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
16644
16719
  return {
16645
16720
  ...baseResult,
16646
16721
  score,
16647
16722
  trials: trialResults,
16648
16723
  aggregation,
16649
- costLimited: costLimited || void 0
16724
+ costLimited: costLimited || void 0,
16725
+ executionStatus: aggregateExecutionStatus,
16726
+ failureStage: aggregateFailureStage,
16727
+ failureReasonCode: aggregateFailureReasonCode,
16728
+ executionError: aggregateExecutionError
16650
16729
  };
16651
16730
  }
16652
16731
  async function evaluateCandidate(options) {
@@ -16747,7 +16826,8 @@ async function evaluateCandidate(options) {
16747
16826
  scores,
16748
16827
  trace: trace2,
16749
16828
  output,
16750
- fileChanges
16829
+ fileChanges,
16830
+ executionStatus: classifyQualityStatus(score.score)
16751
16831
  };
16752
16832
  }
16753
16833
  async function runEvaluatorsForCase(options) {
@@ -17052,7 +17132,7 @@ async function invokeProvider(provider, options) {
17052
17132
  }
17053
17133
  }
17054
17134
  }
17055
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
17135
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
17056
17136
  const message = error instanceof Error ? error.message : String(error);
17057
17137
  let agentRequest;
17058
17138
  let lmRequest;
@@ -17095,7 +17175,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
17095
17175
  target: targetName,
17096
17176
  requests,
17097
17177
  input,
17098
- error: message
17178
+ error: message,
17179
+ executionStatus: "execution_error",
17180
+ failureStage,
17181
+ failureReasonCode,
17182
+ executionError: { message, stage: failureStage }
17099
17183
  };
17100
17184
  }
17101
17185
  function extractProviderError(response) {