@agentv/core 2.11.2 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-REN5PS7B.js";
9
+ } from "../../chunk-7HPKTRFZ.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -15512,14 +15512,24 @@ var RepoManager = class {
15512
15512
  * Creates on first access, fetches updates on subsequent calls.
15513
15513
  * Returns the absolute path to the cache directory.
15514
15514
  */
15515
- async ensureCache(source, depth) {
15515
+ async ensureCache(source, depth, resolve) {
15516
15516
  const key = cacheKey(source);
15517
15517
  const cachePath = import_node_path38.default.join(this.cacheDir, key);
15518
15518
  const lockPath = `${cachePath}.lock`;
15519
+ const cacheExists = (0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"));
15520
+ if (resolve === "local") {
15521
+ if (cacheExists) {
15522
+ return cachePath;
15523
+ }
15524
+ const url = getSourceUrl(source);
15525
+ throw new Error(
15526
+ `No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
15527
+ );
15528
+ }
15519
15529
  await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
15520
15530
  await acquireLock(lockPath);
15521
15531
  try {
15522
- if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
15532
+ if (cacheExists) {
15523
15533
  const fetchArgs = ["fetch", "--prune"];
15524
15534
  if (depth) {
15525
15535
  fetchArgs.push("--depth", String(depth));
@@ -15546,7 +15556,11 @@ var RepoManager = class {
15546
15556
  */
15547
15557
  async materialize(repo, workspacePath) {
15548
15558
  const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15549
- const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
15559
+ const cachePath = await this.ensureCache(
15560
+ repo.source,
15561
+ repo.clone?.depth,
15562
+ repo.checkout?.resolve
15563
+ );
15550
15564
  const cloneArgs = ["clone"];
15551
15565
  if (repo.clone?.depth) {
15552
15566
  cloneArgs.push("--depth", String(repo.clone.depth));
@@ -15692,6 +15706,16 @@ async function resolveWorkspaceTemplate(templatePath) {
15692
15706
  }
15693
15707
 
15694
15708
  // src/evaluation/workspace/script-executor.ts
15709
/**
 * Expands `{{name}}` placeholders in each command argument using values taken
 * from the script context.
 *
 * Recognized placeholder names:
 *   - `workspace_path` -> `context2.workspacePath`
 *   - `test_id`        -> `context2.testId`
 *   - `eval_run_id`    -> `context2.evalRunId`
 *   - `case_input`     -> `context2.caseInput`, or `""` when null/undefined
 *   - `case_metadata`  -> `JSON.stringify(context2.caseMetadata)`, or `""` when falsy
 *
 * A placeholder whose name is not listed above, or whose value is
 * null/undefined, is left in the argument verbatim.
 *
 * @param {string[]} args - Command arguments that may contain placeholders.
 * @param {object} context2 - Script context supplying the values listed above.
 * @returns {string[]} A new array with the placeholders expanded; `args` is not mutated.
 */
function interpolateArgs(args, context2) {
  // A Map rather than a plain object: the placeholder pattern `\w+` also
  // matches names such as `constructor`, `toString` or `__proto__`, which a
  // plain-object lookup would resolve to members inherited from
  // Object.prototype and then stringify into the command line.
  const vars = new Map([
    ["workspace_path", context2.workspacePath],
    ["test_id", context2.testId],
    ["eval_run_id", context2.evalRunId],
    ["case_input", context2.caseInput ?? ""],
    ["case_metadata", context2.caseMetadata ? JSON.stringify(context2.caseMetadata) : ""]
  ]);
  // The replacement is produced by a callback, so `$`-sequences inside the
  // substituted values are inserted literally rather than being interpreted
  // as replacement patterns.
  return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars.get(name) ?? match));
}
15695
15719
  async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15696
15720
  const stdin = JSON.stringify({
15697
15721
  workspace_path: context2.workspacePath,
@@ -15701,8 +15725,9 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15701
15725
  case_metadata: context2.caseMetadata ?? null
15702
15726
  });
15703
15727
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
15704
- const cwd = config.cwd;
15705
- const commandArray = config.command ?? config.script ?? [];
15728
+ const cwd = config.cwd ?? context2.evalDir;
15729
+ const rawCommand = config.command ?? config.script ?? [];
15730
+ const commandArray = interpolateArgs(rawCommand, context2);
15706
15731
  const result = await execFileWithStdin(commandArray, stdin, {
15707
15732
  timeoutMs,
15708
15733
  cwd
@@ -15719,6 +15744,10 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15719
15744
  }
15720
15745
 
15721
15746
  // src/evaluation/orchestrator.ts
15747
/** Default minimum score (inclusive) for a result to be classified as "ok". */
var QUALITY_PASS_THRESHOLD = 0.8;
/**
 * Maps a numeric score to an execution-status label.
 *
 * Always call this with explicit arguments; passing it directly as an array
 * callback (e.g. `scores.map(classifyQualityStatus)`) would supply the element
 * index as `threshold`.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=QUALITY_PASS_THRESHOLD] - Inclusive pass mark.
 * @returns {"ok" | "quality_failure"} `"ok"` when `score >= threshold`,
 *   otherwise `"quality_failure"` (this includes `NaN` and `undefined`
 *   scores, because the comparison is then false).
 */
function classifyQualityStatus(score, threshold = QUALITY_PASS_THRESHOLD) {
  return score >= threshold ? "ok" : "quality_failure";
}
15722
15751
  function usesFileReferencePrompt(provider) {
15723
15752
  return isAgentProvider(provider) || provider.kind === "cli";
15724
15753
  }
@@ -15826,6 +15855,7 @@ async function runEvaluation(options) {
15826
15855
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
15827
15856
  const typeRegistry = createBuiltinRegistry();
15828
15857
  const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
15858
+ const evalDir = discoveryBaseDir;
15829
15859
  await discoverAssertions(typeRegistry, discoveryBaseDir);
15830
15860
  const providerRegistry = createBuiltinProviderRegistry();
15831
15861
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -15921,7 +15951,8 @@ async function runEvaluation(options) {
15921
15951
  const scriptContext = {
15922
15952
  workspacePath: sharedWorkspacePath,
15923
15953
  testId: "__before_all__",
15924
- evalRunId
15954
+ evalRunId,
15955
+ evalDir
15925
15956
  };
15926
15957
  try {
15927
15958
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -15960,7 +15991,14 @@ async function runEvaluation(options) {
15960
15991
  answer: "",
15961
15992
  target: target.name,
15962
15993
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15963
- budgetExceeded: true
15994
+ budgetExceeded: true,
15995
+ executionStatus: "execution_error",
15996
+ failureStage: "setup",
15997
+ failureReasonCode: "budget_exceeded",
15998
+ executionError: {
15999
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
16000
+ stage: "setup"
16001
+ }
15964
16002
  };
15965
16003
  if (onProgress) {
15966
16004
  await onProgress({
@@ -16007,7 +16045,8 @@ async function runEvaluation(options) {
16007
16045
  suiteWorkspaceFile,
16008
16046
  streamCallbacks,
16009
16047
  typeRegistry,
16010
- repoManager
16048
+ repoManager,
16049
+ evalDir
16011
16050
  };
16012
16051
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
16013
16052
  if (totalBudgetUsd !== void 0) {
@@ -16076,7 +16115,9 @@ async function runEvaluation(options) {
16076
16115
  (now ?? (() => /* @__PURE__ */ new Date()))(),
16077
16116
  outcome.reason,
16078
16117
  promptInputs,
16079
- primaryProvider
16118
+ primaryProvider,
16119
+ "agent",
16120
+ "provider_error"
16080
16121
  );
16081
16122
  results.push(errorResult);
16082
16123
  if (onResult) {
@@ -16088,7 +16129,8 @@ async function runEvaluation(options) {
16088
16129
  const scriptContext = {
16089
16130
  workspacePath: sharedWorkspacePath,
16090
16131
  testId: "__after_all__",
16091
- evalRunId
16132
+ evalRunId,
16133
+ evalDir
16092
16134
  };
16093
16135
  try {
16094
16136
  const afterAllOutput = await executeWorkspaceScript(
@@ -16218,7 +16260,14 @@ async function runBatchEvaluation(options) {
16218
16260
  availableTargets
16219
16261
  });
16220
16262
  if (providerError) {
16221
- result = { ...result, error: providerError };
16263
+ result = {
16264
+ ...result,
16265
+ error: providerError,
16266
+ executionStatus: "execution_error",
16267
+ failureStage: "agent",
16268
+ failureReasonCode: "provider_error",
16269
+ executionError: { message: providerError, stage: "agent" }
16270
+ };
16222
16271
  }
16223
16272
  } catch (error) {
16224
16273
  const errorResult = buildErrorResult(
@@ -16227,7 +16276,9 @@ async function runBatchEvaluation(options) {
16227
16276
  nowFn(),
16228
16277
  error,
16229
16278
  promptInputs,
16230
- provider
16279
+ provider,
16280
+ "evaluator",
16281
+ "evaluator_error"
16231
16282
  );
16232
16283
  results.push(errorResult);
16233
16284
  if (onResult) {
@@ -16283,7 +16334,8 @@ async function runEvalCase(options) {
16283
16334
  sharedBaselineCommit,
16284
16335
  suiteWorkspaceFile,
16285
16336
  typeRegistry: providedTypeRegistry,
16286
- repoManager
16337
+ repoManager,
16338
+ evalDir
16287
16339
  } = options;
16288
16340
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
16289
16341
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -16316,7 +16368,9 @@ async function runEvalCase(options) {
16316
16368
  nowFn(),
16317
16369
  new Error(`Failed to create workspace: ${message}`),
16318
16370
  promptInputs,
16319
- provider
16371
+ provider,
16372
+ "setup",
16373
+ "template_error"
16320
16374
  );
16321
16375
  }
16322
16376
  }
@@ -16336,7 +16390,9 @@ async function runEvalCase(options) {
16336
16390
  nowFn(),
16337
16391
  new Error(`Failed to materialize repos: ${message}`),
16338
16392
  promptInputs,
16339
- provider
16393
+ provider,
16394
+ "repo_setup",
16395
+ "clone_error"
16340
16396
  );
16341
16397
  }
16342
16398
  }
@@ -16346,7 +16402,8 @@ async function runEvalCase(options) {
16346
16402
  testId: evalCase.id,
16347
16403
  evalRunId: evalRunId ?? "",
16348
16404
  caseInput: evalCase.question,
16349
- caseMetadata: evalCase.metadata
16405
+ caseMetadata: evalCase.metadata,
16406
+ evalDir
16350
16407
  };
16351
16408
  try {
16352
16409
  beforeAllOutput = await executeWorkspaceScript(
@@ -16365,7 +16422,9 @@ async function runEvalCase(options) {
16365
16422
  nowFn(),
16366
16423
  new Error(`before_all script failed: ${message}`),
16367
16424
  promptInputs,
16368
- provider
16425
+ provider,
16426
+ "setup",
16427
+ "script_error"
16369
16428
  );
16370
16429
  }
16371
16430
  }
@@ -16376,7 +16435,8 @@ async function runEvalCase(options) {
16376
16435
  testId: evalCase.id,
16377
16436
  evalRunId: evalRunId ?? "",
16378
16437
  caseInput: evalCase.question,
16379
- caseMetadata: evalCase.metadata
16438
+ caseMetadata: evalCase.metadata,
16439
+ evalDir
16380
16440
  };
16381
16441
  try {
16382
16442
  beforeEachOutput = await executeWorkspaceScript(
@@ -16391,7 +16451,9 @@ async function runEvalCase(options) {
16391
16451
  nowFn(),
16392
16452
  new Error(`before_each script failed: ${message}`),
16393
16453
  promptInputs,
16394
- provider
16454
+ provider,
16455
+ "setup",
16456
+ "script_error"
16395
16457
  );
16396
16458
  }
16397
16459
  }
@@ -16432,7 +16494,9 @@ async function runEvalCase(options) {
16432
16494
  nowFn(),
16433
16495
  error,
16434
16496
  promptInputs,
16435
- provider
16497
+ provider,
16498
+ "agent",
16499
+ "provider_error"
16436
16500
  );
16437
16501
  if (workspacePath) {
16438
16502
  if (forceCleanup) {
@@ -16451,7 +16515,9 @@ async function runEvalCase(options) {
16451
16515
  nowFn(),
16452
16516
  lastError ?? new Error("Provider did not return a response"),
16453
16517
  promptInputs,
16454
- provider
16518
+ provider,
16519
+ "agent",
16520
+ "provider_error"
16455
16521
  );
16456
16522
  if (workspacePath) {
16457
16523
  if (forceCleanup) {
@@ -16507,7 +16573,8 @@ async function runEvalCase(options) {
16507
16573
  testId: evalCase.id,
16508
16574
  evalRunId: evalRunId ?? "",
16509
16575
  caseInput: evalCase.question,
16510
- caseMetadata: evalCase.metadata
16576
+ caseMetadata: evalCase.metadata,
16577
+ evalDir
16511
16578
  };
16512
16579
  try {
16513
16580
  afterEachOutput = await executeWorkspaceScript(
@@ -16543,7 +16610,18 @@ async function runEvalCase(options) {
16543
16610
  fileChanges,
16544
16611
  workspacePath
16545
16612
  });
16546
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
16613
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
16614
+ const finalResult = providerError ? {
16615
+ ...result,
16616
+ error: providerError,
16617
+ executionStatus,
16618
+ failureStage: "agent",
16619
+ failureReasonCode: "provider_error",
16620
+ executionError: { message: providerError, stage: "agent" },
16621
+ beforeAllOutput,
16622
+ beforeEachOutput,
16623
+ afterEachOutput
16624
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
16547
16625
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
16548
16626
  if (workspacePath && !isSharedWorkspace) {
16549
16627
  if (forceCleanup) {
@@ -16564,7 +16642,9 @@ async function runEvalCase(options) {
16564
16642
  nowFn(),
16565
16643
  error,
16566
16644
  promptInputs,
16567
- provider
16645
+ provider,
16646
+ "evaluator",
16647
+ "evaluator_error"
16568
16648
  );
16569
16649
  if (workspacePath && !isSharedWorkspace) {
16570
16650
  if (forceCleanup) {
@@ -16602,7 +16682,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
16602
16682
  verdict: trialVerdict,
16603
16683
  scores: result.scores,
16604
16684
  error: result.error,
16605
- costUsd: trialCost
16685
+ costUsd: trialCost,
16686
+ executionStatus: result.executionStatus,
16687
+ failureStage: result.failureStage,
16688
+ failureReasonCode: result.failureReasonCode
16606
16689
  };
16607
16690
  trialResults.push(trial);
16608
16691
  if (trialCost !== void 0) {
@@ -16627,12 +16710,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
16627
16710
  0
16628
16711
  );
16629
16712
  const baseResult = allResults[bestTrialIndex];
16713
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
16714
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
16715
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
16716
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
16717
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
16718
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
16630
16719
  return {
16631
16720
  ...baseResult,
16632
16721
  score,
16633
16722
  trials: trialResults,
16634
16723
  aggregation,
16635
- costLimited: costLimited || void 0
16724
+ costLimited: costLimited || void 0,
16725
+ executionStatus: aggregateExecutionStatus,
16726
+ failureStage: aggregateFailureStage,
16727
+ failureReasonCode: aggregateFailureReasonCode,
16728
+ executionError: aggregateExecutionError
16636
16729
  };
16637
16730
  }
16638
16731
  async function evaluateCandidate(options) {
@@ -16733,7 +16826,8 @@ async function evaluateCandidate(options) {
16733
16826
  scores,
16734
16827
  trace: trace2,
16735
16828
  output,
16736
- fileChanges
16829
+ fileChanges,
16830
+ executionStatus: classifyQualityStatus(score.score)
16737
16831
  };
16738
16832
  }
16739
16833
  async function runEvaluatorsForCase(options) {
@@ -17038,7 +17132,7 @@ async function invokeProvider(provider, options) {
17038
17132
  }
17039
17133
  }
17040
17134
  }
17041
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
17135
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
17042
17136
  const message = error instanceof Error ? error.message : String(error);
17043
17137
  let agentRequest;
17044
17138
  let lmRequest;
@@ -17081,7 +17175,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
17081
17175
  target: targetName,
17082
17176
  requests,
17083
17177
  input,
17084
- error: message
17178
+ error: message,
17179
+ executionStatus: "execution_error",
17180
+ failureStage,
17181
+ failureReasonCode,
17182
+ executionError: { message, stage: failureStage }
17085
17183
  };
17086
17184
  }
17087
17185
  function extractProviderError(response) {
@@ -17341,7 +17439,15 @@ var AgentVConfigSchema = import_zod6.z.object({
17341
17439
  /** Maximum retries on failure (default: 2) */
17342
17440
  maxRetries: import_zod6.z.number().int().min(0).optional(),
17343
17441
  /** Agent timeout in milliseconds (default: 120000) */
17344
- agentTimeoutMs: import_zod6.z.number().int().min(0).optional()
17442
+ agentTimeoutMs: import_zod6.z.number().int().min(0).optional(),
17443
+ /** Enable verbose logging */
17444
+ verbose: import_zod6.z.boolean().optional(),
17445
+ /** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
17446
+ traceFile: import_zod6.z.string().optional(),
17447
+ /** Always keep temp workspaces after eval */
17448
+ keepWorkspaces: import_zod6.z.boolean().optional(),
17449
+ /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
17450
+ otelFile: import_zod6.z.string().optional()
17345
17451
  }).optional(),
17346
17452
  /** Output settings */
17347
17453
  output: import_zod6.z.object({