agentv 2.11.2 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
148
148
  }
149
149
  });
150
150
 
151
- // ../../packages/core/dist/chunk-REN5PS7B.js
151
+ // ../../packages/core/dist/chunk-7HPKTRFZ.js
152
152
  import { constants } from "node:fs";
153
153
  import { access, readFile } from "node:fs/promises";
154
154
  import path from "node:path";
@@ -4195,7 +4195,7 @@ var coerce = {
4195
4195
  };
4196
4196
  var NEVER = INVALID;
4197
4197
 
4198
- // ../../packages/core/dist/chunk-REN5PS7B.js
4198
+ // ../../packages/core/dist/chunk-7HPKTRFZ.js
4199
4199
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
4200
4200
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
4201
4201
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -46293,14 +46293,24 @@ var RepoManager = class {
46293
46293
  * Creates on first access, fetches updates on subsequent calls.
46294
46294
  * Returns the absolute path to the cache directory.
46295
46295
  */
46296
- async ensureCache(source, depth) {
46296
+ async ensureCache(source, depth, resolve2) {
46297
46297
  const key = cacheKey(source);
46298
46298
  const cachePath = path35.join(this.cacheDir, key);
46299
46299
  const lockPath = `${cachePath}.lock`;
46300
+ const cacheExists = existsSync2(path35.join(cachePath, "HEAD"));
46301
+ if (resolve2 === "local") {
46302
+ if (cacheExists) {
46303
+ return cachePath;
46304
+ }
46305
+ const url2 = getSourceUrl(source);
46306
+ throw new Error(
46307
+ `No cache found for \`${url2}\`. Run \`agentv cache add --url ${url2} --from <local-path>\` to seed it.`
46308
+ );
46309
+ }
46300
46310
  await mkdir11(this.cacheDir, { recursive: true });
46301
46311
  await acquireLock(lockPath);
46302
46312
  try {
46303
- if (existsSync2(path35.join(cachePath, "HEAD"))) {
46313
+ if (cacheExists) {
46304
46314
  const fetchArgs = ["fetch", "--prune"];
46305
46315
  if (depth) {
46306
46316
  fetchArgs.push("--depth", String(depth));
@@ -46327,7 +46337,11 @@ var RepoManager = class {
46327
46337
  */
46328
46338
  async materialize(repo, workspacePath) {
46329
46339
  const targetDir = path35.join(workspacePath, repo.path);
46330
- const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
46340
+ const cachePath = await this.ensureCache(
46341
+ repo.source,
46342
+ repo.clone?.depth,
46343
+ repo.checkout?.resolve
46344
+ );
46331
46345
  const cloneArgs = ["clone"];
46332
46346
  if (repo.clone?.depth) {
46333
46347
  cloneArgs.push("--depth", String(repo.clone.depth));
@@ -46467,6 +46481,16 @@ async function resolveWorkspaceTemplate(templatePath) {
46467
46481
  }
46468
46482
  return { dir: resolved };
46469
46483
  }
46484
/**
 * Expands `{{name}}` placeholders in each command argument using values
 * taken from the script context.
 *
 * Recognized placeholders: `workspace_path`, `test_id`, `eval_run_id`,
 * `case_input` and `case_metadata` (the latter JSON-serialized; both
 * `case_*` values fall back to an empty string when absent).
 *
 * Any other placeholder, or a recognized one whose context value is
 * null/undefined, is left in the argument verbatim.
 *
 * @param {string[]} args - Command arguments that may contain placeholders.
 * @param {object} context - Script context supplying the substitution values.
 * @returns {string[]} A new array with placeholders substituted.
 */
function interpolateArgs(args, context) {
  // A Map is used instead of an object literal so that placeholder names which
  // collide with inherited Object.prototype members (`constructor`, `toString`,
  // `__proto__`, ...) are treated as unknown rather than being stringified
  // into the command line.
  const vars = /* @__PURE__ */ new Map([
    ["workspace_path", context.workspacePath],
    ["test_id", context.testId],
    ["eval_run_id", context.evalRunId],
    ["case_input", context.caseInput ?? ""],
    ["case_metadata", context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""]
  ]);
  return args.map(
    (arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name16) => vars.get(name16) ?? match)
  );
}
46470
46494
  async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46471
46495
  const stdin = JSON.stringify({
46472
46496
  workspace_path: context.workspacePath,
@@ -46476,8 +46500,9 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46476
46500
  case_metadata: context.caseMetadata ?? null
46477
46501
  });
46478
46502
  const timeoutMs = config2.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
46479
- const cwd = config2.cwd;
46480
- const commandArray = config2.command ?? config2.script ?? [];
46503
+ const cwd = config2.cwd ?? context.evalDir;
46504
+ const rawCommand = config2.command ?? config2.script ?? [];
46505
+ const commandArray = interpolateArgs(rawCommand, context);
46481
46506
  const result = await execFileWithStdin(commandArray, stdin, {
46482
46507
  timeoutMs,
46483
46508
  cwd
@@ -46492,6 +46517,10 @@ async function executeWorkspaceScript(config2, context, failureMode = "fatal") {
46492
46517
  }
46493
46518
  return result.stdout;
46494
46519
  }
46520
// Scores at or above this value are classified as "ok".
var QUALITY_PASS_THRESHOLD = 0.8;
/**
 * Maps a score onto an execution status label.
 *
 * @param {number} score - Score to classify.
 * @returns {"ok" | "quality_failure"} "ok" when the score is at least
 *   QUALITY_PASS_THRESHOLD; "quality_failure" otherwise (this includes any
 *   value for which the `>=` comparison is false, such as NaN or undefined).
 */
function classifyQualityStatus(score) {
  if (score >= QUALITY_PASS_THRESHOLD) {
    return "ok";
  }
  return "quality_failure";
}
46495
46524
  function usesFileReferencePrompt(provider) {
46496
46525
  return isAgentProvider(provider) || provider.kind === "cli";
46497
46526
  }
@@ -46599,6 +46628,7 @@ async function runEvaluation(options) {
46599
46628
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
46600
46629
  const typeRegistry = createBuiltinRegistry();
46601
46630
  const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
46631
+ const evalDir = discoveryBaseDir;
46602
46632
  await discoverAssertions(typeRegistry, discoveryBaseDir);
46603
46633
  const providerRegistry = createBuiltinProviderRegistry();
46604
46634
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -46694,7 +46724,8 @@ async function runEvaluation(options) {
46694
46724
  const scriptContext = {
46695
46725
  workspacePath: sharedWorkspacePath,
46696
46726
  testId: "__before_all__",
46697
- evalRunId
46727
+ evalRunId,
46728
+ evalDir
46698
46729
  };
46699
46730
  try {
46700
46731
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -46733,7 +46764,14 @@ async function runEvaluation(options) {
46733
46764
  answer: "",
46734
46765
  target: target.name,
46735
46766
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
46736
- budgetExceeded: true
46767
+ budgetExceeded: true,
46768
+ executionStatus: "execution_error",
46769
+ failureStage: "setup",
46770
+ failureReasonCode: "budget_exceeded",
46771
+ executionError: {
46772
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
46773
+ stage: "setup"
46774
+ }
46737
46775
  };
46738
46776
  if (onProgress) {
46739
46777
  await onProgress({
@@ -46780,7 +46818,8 @@ async function runEvaluation(options) {
46780
46818
  suiteWorkspaceFile,
46781
46819
  streamCallbacks,
46782
46820
  typeRegistry,
46783
- repoManager
46821
+ repoManager,
46822
+ evalDir
46784
46823
  };
46785
46824
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
46786
46825
  if (totalBudgetUsd !== void 0) {
@@ -46849,7 +46888,9 @@ async function runEvaluation(options) {
46849
46888
  (now ?? (() => /* @__PURE__ */ new Date()))(),
46850
46889
  outcome.reason,
46851
46890
  promptInputs,
46852
- primaryProvider
46891
+ primaryProvider,
46892
+ "agent",
46893
+ "provider_error"
46853
46894
  );
46854
46895
  results.push(errorResult);
46855
46896
  if (onResult) {
@@ -46861,7 +46902,8 @@ async function runEvaluation(options) {
46861
46902
  const scriptContext = {
46862
46903
  workspacePath: sharedWorkspacePath,
46863
46904
  testId: "__after_all__",
46864
- evalRunId
46905
+ evalRunId,
46906
+ evalDir
46865
46907
  };
46866
46908
  try {
46867
46909
  const afterAllOutput = await executeWorkspaceScript(
@@ -46991,7 +47033,14 @@ async function runBatchEvaluation(options) {
46991
47033
  availableTargets
46992
47034
  });
46993
47035
  if (providerError) {
46994
- result = { ...result, error: providerError };
47036
+ result = {
47037
+ ...result,
47038
+ error: providerError,
47039
+ executionStatus: "execution_error",
47040
+ failureStage: "agent",
47041
+ failureReasonCode: "provider_error",
47042
+ executionError: { message: providerError, stage: "agent" }
47043
+ };
46995
47044
  }
46996
47045
  } catch (error40) {
46997
47046
  const errorResult = buildErrorResult(
@@ -47000,7 +47049,9 @@ async function runBatchEvaluation(options) {
47000
47049
  nowFn(),
47001
47050
  error40,
47002
47051
  promptInputs,
47003
- provider
47052
+ provider,
47053
+ "evaluator",
47054
+ "evaluator_error"
47004
47055
  );
47005
47056
  results.push(errorResult);
47006
47057
  if (onResult) {
@@ -47056,7 +47107,8 @@ async function runEvalCase(options) {
47056
47107
  sharedBaselineCommit,
47057
47108
  suiteWorkspaceFile,
47058
47109
  typeRegistry: providedTypeRegistry,
47059
- repoManager
47110
+ repoManager,
47111
+ evalDir
47060
47112
  } = options;
47061
47113
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
47062
47114
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -47089,7 +47141,9 @@ async function runEvalCase(options) {
47089
47141
  nowFn(),
47090
47142
  new Error(`Failed to create workspace: ${message}`),
47091
47143
  promptInputs,
47092
- provider
47144
+ provider,
47145
+ "setup",
47146
+ "template_error"
47093
47147
  );
47094
47148
  }
47095
47149
  }
@@ -47109,7 +47163,9 @@ async function runEvalCase(options) {
47109
47163
  nowFn(),
47110
47164
  new Error(`Failed to materialize repos: ${message}`),
47111
47165
  promptInputs,
47112
- provider
47166
+ provider,
47167
+ "repo_setup",
47168
+ "clone_error"
47113
47169
  );
47114
47170
  }
47115
47171
  }
@@ -47119,7 +47175,8 @@ async function runEvalCase(options) {
47119
47175
  testId: evalCase.id,
47120
47176
  evalRunId: evalRunId ?? "",
47121
47177
  caseInput: evalCase.question,
47122
- caseMetadata: evalCase.metadata
47178
+ caseMetadata: evalCase.metadata,
47179
+ evalDir
47123
47180
  };
47124
47181
  try {
47125
47182
  beforeAllOutput = await executeWorkspaceScript(
@@ -47138,7 +47195,9 @@ async function runEvalCase(options) {
47138
47195
  nowFn(),
47139
47196
  new Error(`before_all script failed: ${message}`),
47140
47197
  promptInputs,
47141
- provider
47198
+ provider,
47199
+ "setup",
47200
+ "script_error"
47142
47201
  );
47143
47202
  }
47144
47203
  }
@@ -47149,7 +47208,8 @@ async function runEvalCase(options) {
47149
47208
  testId: evalCase.id,
47150
47209
  evalRunId: evalRunId ?? "",
47151
47210
  caseInput: evalCase.question,
47152
- caseMetadata: evalCase.metadata
47211
+ caseMetadata: evalCase.metadata,
47212
+ evalDir
47153
47213
  };
47154
47214
  try {
47155
47215
  beforeEachOutput = await executeWorkspaceScript(
@@ -47164,7 +47224,9 @@ async function runEvalCase(options) {
47164
47224
  nowFn(),
47165
47225
  new Error(`before_each script failed: ${message}`),
47166
47226
  promptInputs,
47167
- provider
47227
+ provider,
47228
+ "setup",
47229
+ "script_error"
47168
47230
  );
47169
47231
  }
47170
47232
  }
@@ -47205,7 +47267,9 @@ async function runEvalCase(options) {
47205
47267
  nowFn(),
47206
47268
  error40,
47207
47269
  promptInputs,
47208
- provider
47270
+ provider,
47271
+ "agent",
47272
+ "provider_error"
47209
47273
  );
47210
47274
  if (workspacePath) {
47211
47275
  if (forceCleanup) {
@@ -47224,7 +47288,9 @@ async function runEvalCase(options) {
47224
47288
  nowFn(),
47225
47289
  lastError ?? new Error("Provider did not return a response"),
47226
47290
  promptInputs,
47227
- provider
47291
+ provider,
47292
+ "agent",
47293
+ "provider_error"
47228
47294
  );
47229
47295
  if (workspacePath) {
47230
47296
  if (forceCleanup) {
@@ -47280,7 +47346,8 @@ async function runEvalCase(options) {
47280
47346
  testId: evalCase.id,
47281
47347
  evalRunId: evalRunId ?? "",
47282
47348
  caseInput: evalCase.question,
47283
- caseMetadata: evalCase.metadata
47349
+ caseMetadata: evalCase.metadata,
47350
+ evalDir
47284
47351
  };
47285
47352
  try {
47286
47353
  afterEachOutput = await executeWorkspaceScript(
@@ -47316,7 +47383,18 @@ async function runEvalCase(options) {
47316
47383
  fileChanges,
47317
47384
  workspacePath
47318
47385
  });
47319
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
47386
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
47387
+ const finalResult = providerError ? {
47388
+ ...result,
47389
+ error: providerError,
47390
+ executionStatus,
47391
+ failureStage: "agent",
47392
+ failureReasonCode: "provider_error",
47393
+ executionError: { message: providerError, stage: "agent" },
47394
+ beforeAllOutput,
47395
+ beforeEachOutput,
47396
+ afterEachOutput
47397
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
47320
47398
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
47321
47399
  if (workspacePath && !isSharedWorkspace) {
47322
47400
  if (forceCleanup) {
@@ -47337,7 +47415,9 @@ async function runEvalCase(options) {
47337
47415
  nowFn(),
47338
47416
  error40,
47339
47417
  promptInputs,
47340
- provider
47418
+ provider,
47419
+ "evaluator",
47420
+ "evaluator_error"
47341
47421
  );
47342
47422
  if (workspacePath && !isSharedWorkspace) {
47343
47423
  if (forceCleanup) {
@@ -47375,7 +47455,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
47375
47455
  verdict: trialVerdict,
47376
47456
  scores: result.scores,
47377
47457
  error: result.error,
47378
- costUsd: trialCost
47458
+ costUsd: trialCost,
47459
+ executionStatus: result.executionStatus,
47460
+ failureStage: result.failureStage,
47461
+ failureReasonCode: result.failureReasonCode
47379
47462
  };
47380
47463
  trialResults.push(trial);
47381
47464
  if (trialCost !== void 0) {
@@ -47400,12 +47483,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
47400
47483
  0
47401
47484
  );
47402
47485
  const baseResult = allResults[bestTrialIndex];
47486
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
47487
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
47488
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
47489
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
47490
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
47491
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
47403
47492
  return {
47404
47493
  ...baseResult,
47405
47494
  score,
47406
47495
  trials: trialResults,
47407
47496
  aggregation,
47408
- costLimited: costLimited || void 0
47497
+ costLimited: costLimited || void 0,
47498
+ executionStatus: aggregateExecutionStatus,
47499
+ failureStage: aggregateFailureStage,
47500
+ failureReasonCode: aggregateFailureReasonCode,
47501
+ executionError: aggregateExecutionError
47409
47502
  };
47410
47503
  }
47411
47504
  async function evaluateCandidate(options) {
@@ -47506,7 +47599,8 @@ async function evaluateCandidate(options) {
47506
47599
  scores,
47507
47600
  trace: trace2,
47508
47601
  output,
47509
- fileChanges
47602
+ fileChanges,
47603
+ executionStatus: classifyQualityStatus(score.score)
47510
47604
  };
47511
47605
  }
47512
47606
  async function runEvaluatorsForCase(options) {
@@ -47811,7 +47905,7 @@ async function invokeProvider(provider, options) {
47811
47905
  }
47812
47906
  }
47813
47907
  }
47814
- function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider) {
47908
+ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs, provider, failureStage, failureReasonCode) {
47815
47909
  const message = error40 instanceof Error ? error40.message : String(error40);
47816
47910
  let agentRequest;
47817
47911
  let lmRequest;
@@ -47854,7 +47948,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs
47854
47948
  target: targetName,
47855
47949
  requests,
47856
47950
  input,
47857
- error: message
47951
+ error: message,
47952
+ executionStatus: "execution_error",
47953
+ failureStage,
47954
+ failureReasonCode,
47955
+ executionError: { message, stage: failureStage }
47858
47956
  };
47859
47957
  }
47860
47958
  function extractProviderError(response) {
@@ -48107,7 +48205,15 @@ var AgentVConfigSchema = external_exports.object({
48107
48205
  /** Maximum retries on failure (default: 2) */
48108
48206
  maxRetries: external_exports.number().int().min(0).optional(),
48109
48207
  /** Agent timeout in milliseconds (default: 120000) */
48110
- agentTimeoutMs: external_exports.number().int().min(0).optional()
48208
+ agentTimeoutMs: external_exports.number().int().min(0).optional(),
48209
+ /** Enable verbose logging */
48210
+ verbose: external_exports.boolean().optional(),
48211
+ /** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
48212
+ traceFile: external_exports.string().optional(),
48213
+ /** Always keep temp workspaces after eval */
48214
+ keepWorkspaces: external_exports.boolean().optional(),
48215
+ /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
48216
+ otelFile: external_exports.string().optional()
48111
48217
  }).optional(),
48112
48218
  /** Output settings */
48113
48219
  output: external_exports.object({
@@ -48844,4 +48950,4 @@ export {
48844
48950
  OtelStreamingObserver,
48845
48951
  createAgentKernel
48846
48952
  };
48847
- //# sourceMappingURL=chunk-MQIQH5LB.js.map
48953
+ //# sourceMappingURL=chunk-LUHCYBMD.js.map