@agentv/core 2.11.2 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-REN5PS7B.js";
20
+ } from "./chunk-7HPKTRFZ.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -12653,14 +12653,24 @@ var RepoManager = class {
12653
12653
  * Creates on first access, fetches updates on subsequent calls.
12654
12654
  * Returns the absolute path to the cache directory.
12655
12655
  */
12656
- async ensureCache(source, depth) {
12656
+ async ensureCache(source, depth, resolve) {
12657
12657
  const key = cacheKey(source);
12658
12658
  const cachePath = path35.join(this.cacheDir, key);
12659
12659
  const lockPath = `${cachePath}.lock`;
12660
+ const cacheExists = existsSync2(path35.join(cachePath, "HEAD"));
12661
+ if (resolve === "local") {
12662
+ if (cacheExists) {
12663
+ return cachePath;
12664
+ }
12665
+ const url = getSourceUrl(source);
12666
+ throw new Error(
12667
+ `No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
12668
+ );
12669
+ }
12660
12670
  await mkdir11(this.cacheDir, { recursive: true });
12661
12671
  await acquireLock(lockPath);
12662
12672
  try {
12663
- if (existsSync2(path35.join(cachePath, "HEAD"))) {
12673
+ if (cacheExists) {
12664
12674
  const fetchArgs = ["fetch", "--prune"];
12665
12675
  if (depth) {
12666
12676
  fetchArgs.push("--depth", String(depth));
@@ -12687,7 +12697,11 @@ var RepoManager = class {
12687
12697
  */
12688
12698
  async materialize(repo, workspacePath) {
12689
12699
  const targetDir = path35.join(workspacePath, repo.path);
12690
- const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
12700
+ const cachePath = await this.ensureCache(
12701
+ repo.source,
12702
+ repo.clone?.depth,
12703
+ repo.checkout?.resolve
12704
+ );
12691
12705
  const cloneArgs = ["clone"];
12692
12706
  if (repo.clone?.depth) {
12693
12707
  cloneArgs.push("--depth", String(repo.clone.depth));
@@ -12833,6 +12847,16 @@ async function resolveWorkspaceTemplate(templatePath) {
12833
12847
  }
12834
12848
 
12835
12849
  // src/evaluation/workspace/script-executor.ts
12850
/**
 * Expands `{{name}}` placeholders inside each command argument using values
 * taken from the script context.
 *
 * Recognised placeholder names:
 *   - `workspace_path` -> `context.workspacePath`
 *   - `test_id`        -> `context.testId`
 *   - `eval_run_id`    -> `context.evalRunId`
 *   - `case_input`     -> `context.caseInput`, or `""` when null/undefined
 *   - `case_metadata`  -> `JSON.stringify(context.caseMetadata)`, or `""` when falsy
 *
 * A placeholder whose name is not in the list above, or whose context value is
 * null/undefined, is left in the argument verbatim.
 *
 * @param {string[]} args - Command arguments that may contain placeholders.
 * @param {object} context - Script context supplying the substitution values.
 * @returns {string[]} A new array with the placeholders substituted.
 */
function interpolateArgs(args, context) {
  // A Map (not a plain object) restricts lookups to the declared names, so
  // `{{constructor}}`, `{{toString}}` or `{{__proto__}}` cannot resolve to
  // inherited Object.prototype members and leak into the command line.
  const vars = new Map([
    ["workspace_path", context.workspacePath],
    ["test_id", context.testId],
    ["eval_run_id", context.evalRunId],
    ["case_input", context.caseInput ?? ""],
    ["case_metadata", context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""]
  ]);
  return args.map(
    (arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars.get(name) ?? match)
  );
}
12836
12860
  async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12837
12861
  const stdin = JSON.stringify({
12838
12862
  workspace_path: context.workspacePath,
@@ -12842,8 +12866,9 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12842
12866
  case_metadata: context.caseMetadata ?? null
12843
12867
  });
12844
12868
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
12845
- const cwd = config.cwd;
12846
- const commandArray = config.command ?? config.script ?? [];
12869
+ const cwd = config.cwd ?? context.evalDir;
12870
+ const rawCommand = config.command ?? config.script ?? [];
12871
+ const commandArray = interpolateArgs(rawCommand, context);
12847
12872
  const result = await execFileWithStdin(commandArray, stdin, {
12848
12873
  timeoutMs,
12849
12874
  cwd
@@ -12860,6 +12885,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12860
12885
  }
12861
12886
 
12862
12887
  // src/evaluation/orchestrator.ts
12888
// Minimum score at which a result is classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;

/**
 * Maps an evaluation score onto an execution status label.
 *
 * @param {number} score - Score to classify.
 * @returns {"ok" | "quality_failure"} `"ok"` when the score is at or above
 *   QUALITY_PASS_THRESHOLD; `"quality_failure"` for anything else, including
 *   values that do not compare as >= the threshold (e.g. NaN or undefined).
 */
function classifyQualityStatus(score) {
  if (score >= QUALITY_PASS_THRESHOLD) {
    return "ok";
  }
  return "quality_failure";
}
12863
12892
  function usesFileReferencePrompt(provider) {
12864
12893
  return isAgentProvider(provider) || provider.kind === "cli";
12865
12894
  }
@@ -12967,6 +12996,7 @@ async function runEvaluation(options) {
12967
12996
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
12968
12997
  const typeRegistry = createBuiltinRegistry();
12969
12998
  const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
12999
+ const evalDir = discoveryBaseDir;
12970
13000
  await discoverAssertions(typeRegistry, discoveryBaseDir);
12971
13001
  const providerRegistry = createBuiltinProviderRegistry();
12972
13002
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -13062,7 +13092,8 @@ async function runEvaluation(options) {
13062
13092
  const scriptContext = {
13063
13093
  workspacePath: sharedWorkspacePath,
13064
13094
  testId: "__before_all__",
13065
- evalRunId
13095
+ evalRunId,
13096
+ evalDir
13066
13097
  };
13067
13098
  try {
13068
13099
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -13101,7 +13132,14 @@ async function runEvaluation(options) {
13101
13132
  answer: "",
13102
13133
  target: target.name,
13103
13134
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13104
- budgetExceeded: true
13135
+ budgetExceeded: true,
13136
+ executionStatus: "execution_error",
13137
+ failureStage: "setup",
13138
+ failureReasonCode: "budget_exceeded",
13139
+ executionError: {
13140
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13141
+ stage: "setup"
13142
+ }
13105
13143
  };
13106
13144
  if (onProgress) {
13107
13145
  await onProgress({
@@ -13148,7 +13186,8 @@ async function runEvaluation(options) {
13148
13186
  suiteWorkspaceFile,
13149
13187
  streamCallbacks,
13150
13188
  typeRegistry,
13151
- repoManager
13189
+ repoManager,
13190
+ evalDir
13152
13191
  };
13153
13192
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
13154
13193
  if (totalBudgetUsd !== void 0) {
@@ -13217,7 +13256,9 @@ async function runEvaluation(options) {
13217
13256
  (now ?? (() => /* @__PURE__ */ new Date()))(),
13218
13257
  outcome.reason,
13219
13258
  promptInputs,
13220
- primaryProvider
13259
+ primaryProvider,
13260
+ "agent",
13261
+ "provider_error"
13221
13262
  );
13222
13263
  results.push(errorResult);
13223
13264
  if (onResult) {
@@ -13229,7 +13270,8 @@ async function runEvaluation(options) {
13229
13270
  const scriptContext = {
13230
13271
  workspacePath: sharedWorkspacePath,
13231
13272
  testId: "__after_all__",
13232
- evalRunId
13273
+ evalRunId,
13274
+ evalDir
13233
13275
  };
13234
13276
  try {
13235
13277
  const afterAllOutput = await executeWorkspaceScript(
@@ -13359,7 +13401,14 @@ async function runBatchEvaluation(options) {
13359
13401
  availableTargets
13360
13402
  });
13361
13403
  if (providerError) {
13362
- result = { ...result, error: providerError };
13404
+ result = {
13405
+ ...result,
13406
+ error: providerError,
13407
+ executionStatus: "execution_error",
13408
+ failureStage: "agent",
13409
+ failureReasonCode: "provider_error",
13410
+ executionError: { message: providerError, stage: "agent" }
13411
+ };
13363
13412
  }
13364
13413
  } catch (error) {
13365
13414
  const errorResult = buildErrorResult(
@@ -13368,7 +13417,9 @@ async function runBatchEvaluation(options) {
13368
13417
  nowFn(),
13369
13418
  error,
13370
13419
  promptInputs,
13371
- provider
13420
+ provider,
13421
+ "evaluator",
13422
+ "evaluator_error"
13372
13423
  );
13373
13424
  results.push(errorResult);
13374
13425
  if (onResult) {
@@ -13424,7 +13475,8 @@ async function runEvalCase(options) {
13424
13475
  sharedBaselineCommit,
13425
13476
  suiteWorkspaceFile,
13426
13477
  typeRegistry: providedTypeRegistry,
13427
- repoManager
13478
+ repoManager,
13479
+ evalDir
13428
13480
  } = options;
13429
13481
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
13430
13482
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -13457,7 +13509,9 @@ async function runEvalCase(options) {
13457
13509
  nowFn(),
13458
13510
  new Error(`Failed to create workspace: ${message}`),
13459
13511
  promptInputs,
13460
- provider
13512
+ provider,
13513
+ "setup",
13514
+ "template_error"
13461
13515
  );
13462
13516
  }
13463
13517
  }
@@ -13477,7 +13531,9 @@ async function runEvalCase(options) {
13477
13531
  nowFn(),
13478
13532
  new Error(`Failed to materialize repos: ${message}`),
13479
13533
  promptInputs,
13480
- provider
13534
+ provider,
13535
+ "repo_setup",
13536
+ "clone_error"
13481
13537
  );
13482
13538
  }
13483
13539
  }
@@ -13487,7 +13543,8 @@ async function runEvalCase(options) {
13487
13543
  testId: evalCase.id,
13488
13544
  evalRunId: evalRunId ?? "",
13489
13545
  caseInput: evalCase.question,
13490
- caseMetadata: evalCase.metadata
13546
+ caseMetadata: evalCase.metadata,
13547
+ evalDir
13491
13548
  };
13492
13549
  try {
13493
13550
  beforeAllOutput = await executeWorkspaceScript(
@@ -13506,7 +13563,9 @@ async function runEvalCase(options) {
13506
13563
  nowFn(),
13507
13564
  new Error(`before_all script failed: ${message}`),
13508
13565
  promptInputs,
13509
- provider
13566
+ provider,
13567
+ "setup",
13568
+ "script_error"
13510
13569
  );
13511
13570
  }
13512
13571
  }
@@ -13517,7 +13576,8 @@ async function runEvalCase(options) {
13517
13576
  testId: evalCase.id,
13518
13577
  evalRunId: evalRunId ?? "",
13519
13578
  caseInput: evalCase.question,
13520
- caseMetadata: evalCase.metadata
13579
+ caseMetadata: evalCase.metadata,
13580
+ evalDir
13521
13581
  };
13522
13582
  try {
13523
13583
  beforeEachOutput = await executeWorkspaceScript(
@@ -13532,7 +13592,9 @@ async function runEvalCase(options) {
13532
13592
  nowFn(),
13533
13593
  new Error(`before_each script failed: ${message}`),
13534
13594
  promptInputs,
13535
- provider
13595
+ provider,
13596
+ "setup",
13597
+ "script_error"
13536
13598
  );
13537
13599
  }
13538
13600
  }
@@ -13573,7 +13635,9 @@ async function runEvalCase(options) {
13573
13635
  nowFn(),
13574
13636
  error,
13575
13637
  promptInputs,
13576
- provider
13638
+ provider,
13639
+ "agent",
13640
+ "provider_error"
13577
13641
  );
13578
13642
  if (workspacePath) {
13579
13643
  if (forceCleanup) {
@@ -13592,7 +13656,9 @@ async function runEvalCase(options) {
13592
13656
  nowFn(),
13593
13657
  lastError ?? new Error("Provider did not return a response"),
13594
13658
  promptInputs,
13595
- provider
13659
+ provider,
13660
+ "agent",
13661
+ "provider_error"
13596
13662
  );
13597
13663
  if (workspacePath) {
13598
13664
  if (forceCleanup) {
@@ -13648,7 +13714,8 @@ async function runEvalCase(options) {
13648
13714
  testId: evalCase.id,
13649
13715
  evalRunId: evalRunId ?? "",
13650
13716
  caseInput: evalCase.question,
13651
- caseMetadata: evalCase.metadata
13717
+ caseMetadata: evalCase.metadata,
13718
+ evalDir
13652
13719
  };
13653
13720
  try {
13654
13721
  afterEachOutput = await executeWorkspaceScript(
@@ -13684,7 +13751,18 @@ async function runEvalCase(options) {
13684
13751
  fileChanges,
13685
13752
  workspacePath
13686
13753
  });
13687
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
13754
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
13755
+ const finalResult = providerError ? {
13756
+ ...result,
13757
+ error: providerError,
13758
+ executionStatus,
13759
+ failureStage: "agent",
13760
+ failureReasonCode: "provider_error",
13761
+ executionError: { message: providerError, stage: "agent" },
13762
+ beforeAllOutput,
13763
+ beforeEachOutput,
13764
+ afterEachOutput
13765
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
13688
13766
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
13689
13767
  if (workspacePath && !isSharedWorkspace) {
13690
13768
  if (forceCleanup) {
@@ -13705,7 +13783,9 @@ async function runEvalCase(options) {
13705
13783
  nowFn(),
13706
13784
  error,
13707
13785
  promptInputs,
13708
- provider
13786
+ provider,
13787
+ "evaluator",
13788
+ "evaluator_error"
13709
13789
  );
13710
13790
  if (workspacePath && !isSharedWorkspace) {
13711
13791
  if (forceCleanup) {
@@ -13743,7 +13823,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13743
13823
  verdict: trialVerdict,
13744
13824
  scores: result.scores,
13745
13825
  error: result.error,
13746
- costUsd: trialCost
13826
+ costUsd: trialCost,
13827
+ executionStatus: result.executionStatus,
13828
+ failureStage: result.failureStage,
13829
+ failureReasonCode: result.failureReasonCode
13747
13830
  };
13748
13831
  trialResults.push(trial);
13749
13832
  if (trialCost !== void 0) {
@@ -13768,12 +13851,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13768
13851
  0
13769
13852
  );
13770
13853
  const baseResult = allResults[bestTrialIndex];
13854
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
13855
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
13856
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
13857
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
13858
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
13859
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
13771
13860
  return {
13772
13861
  ...baseResult,
13773
13862
  score,
13774
13863
  trials: trialResults,
13775
13864
  aggregation,
13776
- costLimited: costLimited || void 0
13865
+ costLimited: costLimited || void 0,
13866
+ executionStatus: aggregateExecutionStatus,
13867
+ failureStage: aggregateFailureStage,
13868
+ failureReasonCode: aggregateFailureReasonCode,
13869
+ executionError: aggregateExecutionError
13777
13870
  };
13778
13871
  }
13779
13872
  async function evaluateCandidate(options) {
@@ -13874,7 +13967,8 @@ async function evaluateCandidate(options) {
13874
13967
  scores,
13875
13968
  trace,
13876
13969
  output,
13877
- fileChanges
13970
+ fileChanges,
13971
+ executionStatus: classifyQualityStatus(score.score)
13878
13972
  };
13879
13973
  }
13880
13974
  async function runEvaluatorsForCase(options) {
@@ -14179,7 +14273,7 @@ async function invokeProvider(provider, options) {
14179
14273
  }
14180
14274
  }
14181
14275
  }
14182
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
14276
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
14183
14277
  const message = error instanceof Error ? error.message : String(error);
14184
14278
  let agentRequest;
14185
14279
  let lmRequest;
@@ -14222,7 +14316,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
14222
14316
  target: targetName,
14223
14317
  requests,
14224
14318
  input,
14225
- error: message
14319
+ error: message,
14320
+ executionStatus: "execution_error",
14321
+ failureStage,
14322
+ failureReasonCode,
14323
+ executionError: { message, stage: failureStage }
14226
14324
  };
14227
14325
  }
14228
14326
  function extractProviderError(response) {
@@ -14482,7 +14580,15 @@ var AgentVConfigSchema = z5.object({
14482
14580
  /** Maximum retries on failure (default: 2) */
14483
14581
  maxRetries: z5.number().int().min(0).optional(),
14484
14582
  /** Agent timeout in milliseconds (default: 120000) */
14485
- agentTimeoutMs: z5.number().int().min(0).optional()
14583
+ agentTimeoutMs: z5.number().int().min(0).optional(),
14584
+ /** Enable verbose logging */
14585
+ verbose: z5.boolean().optional(),
14586
+ /** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
14587
+ traceFile: z5.string().optional(),
14588
+ /** Always keep temp workspaces after eval */
14589
+ keepWorkspaces: z5.boolean().optional(),
14590
+ /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
14591
+ otelFile: z5.string().optional()
14486
14592
  }).optional(),
14487
14593
  /** Output settings */
14488
14594
  output: z5.object({