agentv 4.38.1 → 4.40.1-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dist/{artifact-writer-MK5X5MSO.js → artifact-writer-GIAIMGPQ.js} +14 -11
  2. package/dist/{chunk-QOBQ5XYF.js → chunk-76FOHROU.js} +16 -4
  3. package/dist/chunk-76FOHROU.js.map +1 -0
  4. package/dist/{chunk-VBHHZQS6.js → chunk-BLXYBUU4.js} +1825 -333
  5. package/dist/chunk-BLXYBUU4.js.map +1 -0
  6. package/dist/{chunk-NLTIK3LV.js → chunk-I3SC4FOT.js} +499 -347
  7. package/dist/chunk-I3SC4FOT.js.map +1 -0
  8. package/dist/{chunk-OIN3MVOD.js → chunk-S2JJCLHV.js} +67 -68
  9. package/dist/chunk-S2JJCLHV.js.map +1 -0
  10. package/dist/chunk-TWQP7JYQ.js +494 -0
  11. package/dist/chunk-TWQP7JYQ.js.map +1 -0
  12. package/dist/{chunk-6M5S4IJW.js → chunk-WKA5QDNQ.js} +586 -183
  13. package/dist/chunk-WKA5QDNQ.js.map +1 -0
  14. package/dist/cli.js +6 -6
  15. package/dist/dashboard/assets/index-BnYCCJ7O.css +1 -0
  16. package/dist/dashboard/assets/index-DaueD7GO.js +118 -0
  17. package/dist/dashboard/assets/{index-SIl6NbIJ.js → index-_jpKSzIf.js} +1 -1
  18. package/dist/dashboard/index.html +2 -2
  19. package/dist/{dist-HVLBDG5F.js → dist-6Z4OSITR.js} +54 -16
  20. package/dist/index.js +6 -6
  21. package/dist/{interactive-45LPG2YJ.js → interactive-OUB3GZRC.js} +6 -6
  22. package/dist/{otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js → otlp-json-file-exporter-RY63S3IG-PZBQPVYY.js} +2 -2
  23. package/dist/skills/agentv-eval-writer/SKILL.md +49 -24
  24. package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +21 -15
  25. package/dist/{ts-eval-loader-TJT6BGFF-DI7XNSO4.js → ts-eval-loader-NWH3B4HG-UXXCZKLP.js} +2 -2
  26. package/package.json +1 -1
  27. package/dist/chunk-6M5S4IJW.js.map +0 -1
  28. package/dist/chunk-DKUAETXE.js +0 -1362
  29. package/dist/chunk-DKUAETXE.js.map +0 -1
  30. package/dist/chunk-NLTIK3LV.js.map +0 -1
  31. package/dist/chunk-OIN3MVOD.js.map +0 -1
  32. package/dist/chunk-QOBQ5XYF.js.map +0 -1
  33. package/dist/chunk-VBHHZQS6.js.map +0 -1
  34. package/dist/dashboard/assets/index-BpnllKET.css +0 -1
  35. package/dist/dashboard/assets/index-Cm9SUopp.js +0 -118
  36. /package/dist/{artifact-writer-MK5X5MSO.js.map → artifact-writer-GIAIMGPQ.js.map} +0 -0
  37. /package/dist/{dist-HVLBDG5F.js.map → dist-6Z4OSITR.js.map} +0 -0
  38. /package/dist/{interactive-45LPG2YJ.js.map → interactive-OUB3GZRC.js.map} +0 -0
  39. /package/dist/{otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js.map → otlp-json-file-exporter-RY63S3IG-PZBQPVYY.js.map} +0 -0
  40. /package/dist/{ts-eval-loader-TJT6BGFF-DI7XNSO4.js.map → ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map} +0 -0
@@ -493,8 +493,8 @@ function getErrorMap() {
493
493
 
494
494
  // ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
495
495
  var makeIssue = (params) => {
496
- const { data, path: path49, errorMaps, issueData } = params;
497
- const fullPath = [...path49, ...issueData.path || []];
496
+ const { data, path: path50, errorMaps, issueData } = params;
497
+ const fullPath = [...path50, ...issueData.path || []];
498
498
  const fullIssue = {
499
499
  ...issueData,
500
500
  path: fullPath
@@ -610,11 +610,11 @@ var errorUtil;
610
610
 
611
611
  // ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
612
612
  var ParseInputLazyPath = class {
613
- constructor(parent, value, path49, key) {
613
+ constructor(parent, value, path50, key) {
614
614
  this._cachedPath = [];
615
615
  this.parent = parent;
616
616
  this.data = value;
617
- this._path = path49;
617
+ this._path = path50;
618
618
  this._key = key;
619
619
  }
620
620
  get path() {
@@ -4056,7 +4056,7 @@ var coerce = {
4056
4056
  };
4057
4057
  var NEVER = INVALID;
4058
4058
 
4059
- // ../../packages/core/dist/chunk-M54RBDXI.js
4059
+ // ../../packages/core/dist/chunk-5JNFEE7J.js
4060
4060
  import { parse } from "yaml";
4061
4061
  import os from "node:os";
4062
4062
  import path from "node:path";
@@ -5146,6 +5146,7 @@ function resolveStreamLog(target, envFallback) {
5146
5146
  function resolveCopilotSdkConfig(target, env, _evalFilePath) {
5147
5147
  const cliUrlSource = target.cli_url;
5148
5148
  const cliPathSource = target.cli_path;
5149
+ const argsSource = target.args ?? target.arguments;
5149
5150
  const githubTokenSource = target.github_token;
5150
5151
  const modelSource = target.model;
5151
5152
  const cwdSource = target.cwd;
@@ -5166,6 +5167,7 @@ function resolveCopilotSdkConfig(target, env, _evalFilePath) {
5166
5167
  allowLiteral: true,
5167
5168
  optionalEnv: true
5168
5169
  });
5170
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} copilot-sdk args`);
5169
5171
  const githubToken = resolveOptionalString(
5170
5172
  githubTokenSource,
5171
5173
  env,
@@ -5195,12 +5197,11 @@ function resolveCopilotSdkConfig(target, env, _evalFilePath) {
5195
5197
  );
5196
5198
  const logFormat = normalizeCopilotLogFormat(logFormatSource);
5197
5199
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5198
- const customProvider = resolveCopilotCustomProviderConfig(target, env, {
5199
- includeByokAlias: true
5200
- });
5200
+ const customProvider = resolveCopilotFlatProviderConfig(target, env);
5201
5201
  return {
5202
5202
  cliUrl,
5203
5203
  cliPath,
5204
+ args,
5204
5205
  githubToken,
5205
5206
  model,
5206
5207
  cwd,
@@ -5209,86 +5210,52 @@ function resolveCopilotSdkConfig(target, env, _evalFilePath) {
5209
5210
  logFormat,
5210
5211
  streamLog: streamLogResult.streamLog,
5211
5212
  systemPrompt,
5212
- ...customProvider ? {
5213
- customProvider,
5214
- byokType: customProvider.type,
5215
- byokBaseUrl: customProvider.baseUrl,
5216
- byokApiKey: customProvider.apiKey,
5217
- byokBearerToken: customProvider.bearerToken,
5218
- byokApiVersion: customProvider.apiVersion,
5219
- byokWireApi: customProvider.wireApi
5220
- } : {}
5221
- };
5222
- }
5223
- function resolveCopilotCustomProviderConfig(target, env, options = {}) {
5224
- const hasCustomProvider = target.custom_provider !== void 0;
5225
- const hasByokAlias = options.includeByokAlias === true && target.byok !== void 0;
5226
- if (!hasCustomProvider && !hasByokAlias) {
5227
- return void 0;
5228
- }
5229
- const sourceName = hasCustomProvider ? "custom_provider" : "byok";
5230
- const raw = sourceName === "custom_provider" ? target.custom_provider : target.byok;
5231
- if (raw === null) {
5232
- return void 0;
5233
- }
5234
- if (typeof raw !== "object" || Array.isArray(raw)) {
5235
- throw new Error(`${target.name}: '${sourceName}' must be an object`);
5236
- }
5237
- const provider = raw;
5238
- const type = resolveOptionalString(provider.type, env, `${target.name} ${sourceName} type`, {
5213
+ ...customProvider ? { customProvider } : {}
5214
+ };
5215
+ }
5216
+ function resolveCopilotFlatProviderConfig(target, env) {
5217
+ const baseUrlSource = target.base_url;
5218
+ if (!baseUrlSource) return void 0;
5219
+ const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} copilot base URL`, {
5239
5220
  allowLiteral: true,
5240
5221
  optionalEnv: true
5241
5222
  });
5242
- const baseUrl = resolveOptionalString(
5243
- provider.base_url,
5223
+ if (!baseUrl) return void 0;
5224
+ const type = resolveOptionalString(
5225
+ target.subprovider,
5244
5226
  env,
5245
- `${target.name} ${sourceName} base URL`,
5227
+ `${target.name} copilot provider type`,
5246
5228
  {
5247
5229
  allowLiteral: true,
5248
5230
  optionalEnv: true
5249
5231
  }
5250
5232
  );
5251
- const apiKey = resolveOptionalString(
5252
- provider.api_key,
5253
- env,
5254
- `${target.name} ${sourceName} API key`,
5255
- {
5256
- allowLiteral: false,
5257
- optionalEnv: true
5258
- }
5259
- );
5233
+ const apiKey = resolveOptionalString(target.api_key, env, `${target.name} copilot API key`, {
5234
+ allowLiteral: false,
5235
+ optionalEnv: true
5236
+ });
5260
5237
  const bearerToken = resolveOptionalString(
5261
- provider.bearer_token,
5238
+ target.bearer_token,
5262
5239
  env,
5263
- `${target.name} ${sourceName} bearer token`,
5240
+ `${target.name} copilot bearer token`,
5264
5241
  {
5265
5242
  allowLiteral: false,
5266
5243
  optionalEnv: true
5267
5244
  }
5268
5245
  );
5269
5246
  const apiVersion = resolveOptionalString(
5270
- provider.api_version,
5247
+ target.api_version,
5271
5248
  env,
5272
- `${target.name} ${sourceName} API version`,
5249
+ `${target.name} copilot API version`,
5273
5250
  {
5274
5251
  allowLiteral: true,
5275
5252
  optionalEnv: true
5276
5253
  }
5277
5254
  );
5278
- const wireApi = resolveOptionalString(
5279
- provider.wire_api,
5280
- env,
5281
- `${target.name} ${sourceName} wire API`,
5282
- {
5283
- allowLiteral: true,
5284
- optionalEnv: true
5285
- }
5286
- );
5287
- if (!baseUrl) {
5288
- throw new Error(
5289
- `${target.name}: '${sourceName}.base_url' is required when '${sourceName}' is specified`
5290
- );
5291
- }
5255
+ const wireApi = resolveOptionalString(target.wire_api, env, `${target.name} copilot wire API`, {
5256
+ allowLiteral: true,
5257
+ optionalEnv: true
5258
+ });
5292
5259
  return {
5293
5260
  ...type ? { type } : {},
5294
5261
  baseUrl,
@@ -5337,7 +5304,7 @@ function resolveCopilotCliConfig(target, env, _evalFilePath) {
5337
5304
  );
5338
5305
  const logFormat = normalizeCopilotLogFormat(logFormatSource);
5339
5306
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5340
- const customProvider = resolveCopilotCustomProviderConfig(target, env);
5307
+ const customProvider = resolveCopilotFlatProviderConfig(target, env);
5341
5308
  return {
5342
5309
  executable,
5343
5310
  model,
@@ -5600,22 +5567,22 @@ function resolveReplayConfig(target, env, evalFilePath) {
5600
5567
  const fixtures = resolveOptionalString(target.fixtures, env, `${target.name} replay fixtures`, {
5601
5568
  allowLiteral: true
5602
5569
  });
5603
- const traceEnvelopes = resolveOptionalString(
5604
- target.trace_envelopes,
5570
+ const executionTraces = resolveOptionalString(
5571
+ target.execution_traces,
5605
5572
  env,
5606
- `${target.name} replay trace_envelopes`,
5573
+ `${target.name} replay execution_traces`,
5607
5574
  {
5608
5575
  allowLiteral: true
5609
5576
  }
5610
5577
  );
5611
- if ((fixtures ? 1 : 0) + (traceEnvelopes ? 1 : 0) !== 1) {
5578
+ if ((fixtures ? 1 : 0) + (executionTraces ? 1 : 0) !== 1) {
5612
5579
  throw new Error(
5613
- `Target "${target.name}" (provider: replay) requires exactly one replay source: "fixtures" or "trace_envelopes"`
5580
+ `Target "${target.name}" (provider: replay) requires exactly one replay source: "fixtures" or "execution_traces"`
5614
5581
  );
5615
5582
  }
5616
5583
  const fixturesPath = fixtures ? resolveReplaySourcePath(fixtures, evalFilePath) : void 0;
5617
- const traceEnvelopesPath = traceEnvelopes ? resolveReplaySourcePath(traceEnvelopes, evalFilePath) : void 0;
5618
- const source = fixturesPath ? { kind: "fixtures", path: fixturesPath } : { kind: "trace_envelopes", path: traceEnvelopesPath };
5584
+ const executionTracesPath = executionTraces ? resolveReplaySourcePath(executionTraces, evalFilePath) : void 0;
5585
+ const source = fixturesPath ? { kind: "fixtures", path: fixturesPath } : { kind: "execution_traces", path: executionTracesPath };
5619
5586
  const sourceTarget = resolveString(
5620
5587
  target.source_target,
5621
5588
  env,
@@ -6184,11 +6151,11 @@ async function expandFileReferences(tests, evalFileDir) {
6184
6151
  return expanded;
6185
6152
  }
6186
6153
 
6187
- // ../../packages/core/dist/chunk-RH5LAMMU.js
6188
- import path48 from "node:path";
6154
+ // ../../packages/core/dist/chunk-M6LF2BEU.js
6155
+ import path49 from "node:path";
6189
6156
  import { pathToFileURL as pathToFileURL2 } from "node:url";
6190
6157
  import { existsSync as existsSync7 } from "node:fs";
6191
- import path47 from "node:path";
6158
+ import path48 from "node:path";
6192
6159
  import micromatch4 from "micromatch";
6193
6160
  import { mkdir, readFile as readFile3, writeFile } from "node:fs/promises";
6194
6161
  import path5 from "node:path";
@@ -6916,10 +6883,10 @@ function assignProp(target, prop, value) {
6916
6883
  configurable: true
6917
6884
  });
6918
6885
  }
6919
- function getElementAtPath(obj, path49) {
6920
- if (!path49)
6886
+ function getElementAtPath(obj, path50) {
6887
+ if (!path50)
6921
6888
  return obj;
6922
- return path49.reduce((acc, key) => acc?.[key], obj);
6889
+ return path50.reduce((acc, key) => acc?.[key], obj);
6923
6890
  }
6924
6891
  function promiseAllObject(promisesObj) {
6925
6892
  const keys = Object.keys(promisesObj);
@@ -7239,11 +7206,11 @@ function aborted(x, startIndex = 0) {
7239
7206
  }
7240
7207
  return false;
7241
7208
  }
7242
- function prefixIssues(path49, issues) {
7209
+ function prefixIssues(path50, issues) {
7243
7210
  return issues.map((iss) => {
7244
7211
  var _a;
7245
7212
  (_a = iss).path ?? (_a.path = []);
7246
- iss.path.unshift(path49);
7213
+ iss.path.unshift(path50);
7247
7214
  return iss;
7248
7215
  });
7249
7216
  }
@@ -7380,7 +7347,7 @@ function treeifyError(error40, _mapper) {
7380
7347
  return issue2.message;
7381
7348
  };
7382
7349
  const result = { errors: [] };
7383
- const processError = (error41, path49 = []) => {
7350
+ const processError = (error41, path50 = []) => {
7384
7351
  var _a, _b;
7385
7352
  for (const issue2 of error41.issues) {
7386
7353
  if (issue2.code === "invalid_union" && issue2.errors.length) {
@@ -7390,7 +7357,7 @@ function treeifyError(error40, _mapper) {
7390
7357
  } else if (issue2.code === "invalid_element") {
7391
7358
  processError({ issues: issue2.issues }, issue2.path);
7392
7359
  } else {
7393
- const fullpath = [...path49, ...issue2.path];
7360
+ const fullpath = [...path50, ...issue2.path];
7394
7361
  if (fullpath.length === 0) {
7395
7362
  result.errors.push(mapper(issue2));
7396
7363
  continue;
@@ -7420,9 +7387,9 @@ function treeifyError(error40, _mapper) {
7420
7387
  processError(error40);
7421
7388
  return result;
7422
7389
  }
7423
- function toDotPath(path49) {
7390
+ function toDotPath(path50) {
7424
7391
  const segs = [];
7425
- for (const seg of path49) {
7392
+ for (const seg of path50) {
7426
7393
  if (typeof seg === "number")
7427
7394
  segs.push(`[${seg}]`);
7428
7395
  else if (typeof seg === "symbol")
@@ -18852,7 +18819,7 @@ var RequestError = class _RequestError extends Error {
18852
18819
  }
18853
18820
  };
18854
18821
 
18855
- // ../../packages/core/dist/chunk-RH5LAMMU.js
18822
+ // ../../packages/core/dist/chunk-M6LF2BEU.js
18856
18823
  import { exec as execCallback } from "node:child_process";
18857
18824
  import { readdirSync, statSync } from "node:fs";
18858
18825
  import { readFile as readFile32, readdir as readdir2, stat as stat2 } from "node:fs/promises";
@@ -18962,6 +18929,9 @@ import path44 from "node:path";
18962
18929
  import micromatch from "micromatch";
18963
18930
  import { readFile as readFile16 } from "node:fs/promises";
18964
18931
  import path43 from "node:path";
18932
+ import { mkdir as mkdir18, readFile as readFile20, writeFile as writeFile10 } from "node:fs/promises";
18933
+ import path47 from "node:path";
18934
+ import { readFile as readFile19 } from "node:fs/promises";
18965
18935
  var DEFAULT_CACHE_PATH = ".agentv/cache";
18966
18936
  var ResponseCache = class {
18967
18937
  cachePath;
@@ -19602,7 +19572,6 @@ var CodeGrader = class {
19602
19572
  getImageDir
19603
19573
  ),
19604
19574
  output: outputForPayload,
19605
- answer: context.candidate,
19606
19575
  messages: materializedMessages ?? [],
19607
19576
  outputPath,
19608
19577
  inputFiles: context.evalCase.file_paths,
@@ -19950,24 +19919,13 @@ var TEMPLATE_VARIABLES = {
19950
19919
  INPUT: "input",
19951
19920
  OUTPUT: "output",
19952
19921
  FILE_CHANGES: "file_changes",
19953
- TOOL_CALLS: "tool_calls",
19954
- /** @deprecated Use INPUT instead — resolves to the same text value. */
19955
- INPUT_TEXT: "input_text",
19956
- /** @deprecated Use OUTPUT instead — resolves to the same text value. */
19957
- OUTPUT_TEXT: "output_text",
19958
- /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
19959
- EXPECTED_OUTPUT_TEXT: "expected_output_text"
19922
+ TOOL_CALLS: "tool_calls"
19960
19923
  };
19961
19924
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
19962
19925
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
19963
19926
  TEMPLATE_VARIABLES.OUTPUT,
19964
19927
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
19965
19928
  ]);
19966
- var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
19967
- [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
19968
- [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
19969
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
19970
- ]);
19971
19929
  var OPERATOR_GUIDANCE = {
19972
19930
  correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
19973
19931
  contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
@@ -20087,11 +20045,7 @@ function buildTemplateVariables(context) {
20087
20045
  [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
20088
20046
  [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
20089
20047
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
20090
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
20091
- // Deprecated aliases — same values as the primary variables above
20092
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
20093
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
20094
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
20048
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? ""
20095
20049
  };
20096
20050
  }
20097
20051
  function resolveContentBasePath(context) {
@@ -20168,7 +20122,6 @@ var LlmGrader = class {
20168
20122
  const variables = buildTemplateVariables(context);
20169
20123
  const systemPrompt = buildOutputSchema();
20170
20124
  const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
20171
- warnDeprecatedTemplateVars(graderTemplate);
20172
20125
  let userPrompt = substituteVariables(graderTemplate, variables);
20173
20126
  if (context.fileChanges && !context.graderTemplateOverride && !this.graderTemplate) {
20174
20127
  userPrompt += `
@@ -20499,7 +20452,6 @@ ${context.toolCalls}`;
20499
20452
  const variables = buildTemplateVariables(context);
20500
20453
  const template = context.graderTemplateOverride ?? this.graderTemplate;
20501
20454
  if (template) {
20502
- warnDeprecatedTemplateVars(template);
20503
20455
  return substituteVariables(template, variables);
20504
20456
  }
20505
20457
  const config2 = context.evaluator;
@@ -20553,7 +20505,6 @@ ${context.toolCalls}`;
20553
20505
  const template = context.graderTemplateOverride ?? this.graderTemplate;
20554
20506
  if (template) {
20555
20507
  const variables = buildTemplateVariables(context);
20556
- warnDeprecatedTemplateVars(template);
20557
20508
  const customPrompt = substituteVariables(template, variables);
20558
20509
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
20559
20510
  return `${customPrompt}
@@ -20707,7 +20658,6 @@ ${outputSchema}`;
20707
20658
  }
20708
20659
  buildCustomPrompt(context) {
20709
20660
  const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
20710
- warnDeprecatedTemplateVars(template);
20711
20661
  return substituteVariables(template, buildTemplateVariables(context));
20712
20662
  }
20713
20663
  buildRubricPrompt(context, rubrics) {
@@ -20890,26 +20840,6 @@ function substituteVariables(template, variables) {
20890
20840
  return variables[varName] ?? match;
20891
20841
  });
20892
20842
  }
20893
- var ANSI_YELLOW2 = "\x1B[33m";
20894
- var ANSI_RESET2 = "\x1B[0m";
20895
- var warnedTemplateStrings = /* @__PURE__ */ new Set();
20896
- function warnDeprecatedTemplateVars(template) {
20897
- if (warnedTemplateStrings.has(template)) return;
20898
- const used = [];
20899
- for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
20900
- if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
20901
- used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
20902
- }
20903
- }
20904
- if (used.length > 0) {
20905
- warnedTemplateStrings.add(template);
20906
- console.warn(
20907
- `${ANSI_YELLOW2}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
20908
- ${used.join("\n ")}
20909
- Update your custom grader template to use the new names.${ANSI_RESET2}`
20910
- );
20911
- }
20912
- }
20913
20843
  function calculateRubricScore(result, rubrics) {
20914
20844
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
20915
20845
  const assertions = [];
@@ -21473,7 +21403,7 @@ var CostGrader = class {
21473
21403
  };
21474
21404
  }
21475
21405
  };
21476
- var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trace.v1";
21406
+ var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trajectory.v1";
21477
21407
  var NORMALIZED_TRACE_SOURCE_KINDS = [
21478
21408
  "agentv_run",
21479
21409
  "otlp",
@@ -22532,115 +22462,115 @@ var FieldAccuracyGrader = class {
22532
22462
  * Evaluate a single field against the expected value.
22533
22463
  */
22534
22464
  evaluateField(fieldConfig, candidateData, expectedData) {
22535
- const { path: path49, match, required: required2 = true, weight = 1 } = fieldConfig;
22536
- const candidateValue = resolvePath(candidateData, path49);
22537
- const expectedValue = resolvePath(expectedData, path49);
22465
+ const { path: path50, match, required: required2 = true, weight = 1 } = fieldConfig;
22466
+ const candidateValue = resolvePath(candidateData, path50);
22467
+ const expectedValue = resolvePath(expectedData, path50);
22538
22468
  if (expectedValue === void 0) {
22539
22469
  return {
22540
- path: path49,
22470
+ path: path50,
22541
22471
  score: 1,
22542
22472
  // No expected value means no comparison needed
22543
22473
  weight,
22544
22474
  hit: true,
22545
- message: `${path49}: no expected value`
22475
+ message: `${path50}: no expected value`
22546
22476
  };
22547
22477
  }
22548
22478
  if (candidateValue === void 0) {
22549
22479
  if (required2) {
22550
22480
  return {
22551
- path: path49,
22481
+ path: path50,
22552
22482
  score: 0,
22553
22483
  weight,
22554
22484
  hit: false,
22555
- message: `${path49} (required, missing)`
22485
+ message: `${path50} (required, missing)`
22556
22486
  };
22557
22487
  }
22558
22488
  return {
22559
- path: path49,
22489
+ path: path50,
22560
22490
  score: 1,
22561
22491
  // Don't penalize missing optional fields
22562
22492
  weight: 0,
22563
22493
  // Zero weight means it won't affect the score
22564
22494
  hit: true,
22565
- message: `${path49}: optional field missing`
22495
+ message: `${path50}: optional field missing`
22566
22496
  };
22567
22497
  }
22568
22498
  switch (match) {
22569
22499
  case "exact":
22570
- return this.compareExact(path49, candidateValue, expectedValue, weight);
22500
+ return this.compareExact(path50, candidateValue, expectedValue, weight);
22571
22501
  case "numeric_tolerance":
22572
22502
  return this.compareNumericTolerance(
22573
- path49,
22503
+ path50,
22574
22504
  candidateValue,
22575
22505
  expectedValue,
22576
22506
  fieldConfig,
22577
22507
  weight
22578
22508
  );
22579
22509
  case "date":
22580
- return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
22510
+ return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
22581
22511
  default:
22582
22512
  return {
22583
- path: path49,
22513
+ path: path50,
22584
22514
  score: 0,
22585
22515
  weight,
22586
22516
  hit: false,
22587
- message: `${path49}: unknown match type "${match}"`
22517
+ message: `${path50}: unknown match type "${match}"`
22588
22518
  };
22589
22519
  }
22590
22520
  }
22591
22521
  /**
22592
22522
  * Exact equality comparison.
22593
22523
  */
22594
- compareExact(path49, candidateValue, expectedValue, weight) {
22524
+ compareExact(path50, candidateValue, expectedValue, weight) {
22595
22525
  if (deepEqual(candidateValue, expectedValue)) {
22596
22526
  return {
22597
- path: path49,
22527
+ path: path50,
22598
22528
  score: 1,
22599
22529
  weight,
22600
22530
  hit: true,
22601
- message: path49
22531
+ message: path50
22602
22532
  };
22603
22533
  }
22604
22534
  if (typeof candidateValue !== typeof expectedValue) {
22605
22535
  return {
22606
- path: path49,
22536
+ path: path50,
22607
22537
  score: 0,
22608
22538
  weight,
22609
22539
  hit: false,
22610
- message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
22540
+ message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
22611
22541
  };
22612
22542
  }
22613
22543
  return {
22614
- path: path49,
22544
+ path: path50,
22615
22545
  score: 0,
22616
22546
  weight,
22617
22547
  hit: false,
22618
- message: `${path49} (value mismatch)`
22548
+ message: `${path50} (value mismatch)`
22619
22549
  };
22620
22550
  }
22621
22551
  /**
22622
22552
  * Numeric comparison with absolute or relative tolerance.
22623
22553
  */
22624
- compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
22554
+ compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
22625
22555
  const { tolerance = 0, relative = false } = fieldConfig;
22626
22556
  const candidateNum = toNumber(candidateValue);
22627
22557
  const expectedNum = toNumber(expectedValue);
22628
22558
  if (candidateNum === null || expectedNum === null) {
22629
22559
  return {
22630
- path: path49,
22560
+ path: path50,
22631
22561
  score: 0,
22632
22562
  weight,
22633
22563
  hit: false,
22634
- message: `${path49} (non-numeric value)`
22564
+ message: `${path50} (non-numeric value)`
22635
22565
  };
22636
22566
  }
22637
22567
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
22638
22568
  return {
22639
- path: path49,
22569
+ path: path50,
22640
22570
  score: 0,
22641
22571
  weight,
22642
22572
  hit: false,
22643
- message: `${path49} (invalid numeric value)`
22573
+ message: `${path50} (invalid numeric value)`
22644
22574
  };
22645
22575
  }
22646
22576
  const diff = Math.abs(candidateNum - expectedNum);
@@ -22653,61 +22583,61 @@ var FieldAccuracyGrader = class {
22653
22583
  }
22654
22584
  if (withinTolerance) {
22655
22585
  return {
22656
- path: path49,
22586
+ path: path50,
22657
22587
  score: 1,
22658
22588
  weight,
22659
22589
  hit: true,
22660
- message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
22590
+ message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
22661
22591
  };
22662
22592
  }
22663
22593
  return {
22664
- path: path49,
22594
+ path: path50,
22665
22595
  score: 0,
22666
22596
  weight,
22667
22597
  hit: false,
22668
- message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
22598
+ message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
22669
22599
  };
22670
22600
  }
22671
22601
  /**
22672
22602
  * Date comparison with format normalization.
22673
22603
  */
22674
- compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
22604
+ compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
22675
22605
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
22676
22606
  const candidateDate = parseDate(String(candidateValue), formats);
22677
22607
  const expectedDate = parseDate(String(expectedValue), formats);
22678
22608
  if (candidateDate === null) {
22679
22609
  return {
22680
- path: path49,
22610
+ path: path50,
22681
22611
  score: 0,
22682
22612
  weight,
22683
22613
  hit: false,
22684
- message: `${path49} (unparseable candidate date)`
22614
+ message: `${path50} (unparseable candidate date)`
22685
22615
  };
22686
22616
  }
22687
22617
  if (expectedDate === null) {
22688
22618
  return {
22689
- path: path49,
22619
+ path: path50,
22690
22620
  score: 0,
22691
22621
  weight,
22692
22622
  hit: false,
22693
- message: `${path49} (unparseable expected date)`
22623
+ message: `${path50} (unparseable expected date)`
22694
22624
  };
22695
22625
  }
22696
22626
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
22697
22627
  return {
22698
- path: path49,
22628
+ path: path50,
22699
22629
  score: 1,
22700
22630
  weight,
22701
22631
  hit: true,
22702
- message: path49
22632
+ message: path50
22703
22633
  };
22704
22634
  }
22705
22635
  return {
22706
- path: path49,
22636
+ path: path50,
22707
22637
  score: 0,
22708
22638
  weight,
22709
22639
  hit: false,
22710
- message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
22640
+ message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
22711
22641
  };
22712
22642
  }
22713
22643
  /**
@@ -22740,11 +22670,11 @@ var FieldAccuracyGrader = class {
22740
22670
  };
22741
22671
  }
22742
22672
  };
22743
- function resolvePath(obj, path49) {
22744
- if (!path49 || !obj) {
22673
+ function resolvePath(obj, path50) {
22674
+ if (!path50 || !obj) {
22745
22675
  return void 0;
22746
22676
  }
22747
- const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
22677
+ const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
22748
22678
  let current = obj;
22749
22679
  for (const part of parts) {
22750
22680
  if (current === null || current === void 0) {
@@ -22999,10 +22929,7 @@ function buildTemplateVariables2(input) {
22999
22929
  [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(input.rubrics),
23000
22930
  [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(input.rubrics),
23001
22931
  [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? "",
23002
- [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? "",
23003
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
23004
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(),
23005
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? "").trim()
22932
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? ""
23006
22933
  };
23007
22934
  }
23008
22935
  function assembleLlmGraderPrompt(input) {
@@ -23278,8 +23205,8 @@ var TokenUsageGrader = class {
23278
23205
  };
23279
23206
  }
23280
23207
  };
23281
- function getNestedValue(obj, path49) {
23282
- const parts = path49.split(".");
23208
+ function getNestedValue(obj, path50) {
23209
+ const parts = path50.split(".");
23283
23210
  let current = obj;
23284
23211
  for (const part of parts) {
23285
23212
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -26420,11 +26347,13 @@ var CopilotCliProvider = class {
26420
26347
  supportsBatch = false;
26421
26348
  config;
26422
26349
  runPromptMode;
26423
- constructor(targetName, config2, promptRunner = defaultCopilotCliPromptRunner) {
26350
+ spawnAcpProcess;
26351
+ constructor(targetName, config2, promptRunner = defaultCopilotCliPromptRunner, spawnAcpProcess = spawn2) {
26424
26352
  this.id = `copilot-cli:${targetName}`;
26425
26353
  this.targetName = targetName;
26426
26354
  this.config = config2;
26427
26355
  this.runPromptMode = promptRunner;
26356
+ this.spawnAcpProcess = spawnAcpProcess;
26428
26357
  }
26429
26358
  async invoke(request) {
26430
26359
  if (request.signal?.aborted) {
@@ -26432,14 +26361,12 @@ var CopilotCliProvider = class {
26432
26361
  }
26433
26362
  const startTime = (/* @__PURE__ */ new Date()).toISOString();
26434
26363
  const startMs = Date.now();
26435
- if (this.config.customProvider) {
26436
- return await this.invokePromptMode(request, startTime, startMs);
26437
- }
26438
26364
  const logger = await this.createStreamLogger(request, "acp").catch(() => void 0);
26439
26365
  const executable = this.resolveExecutable();
26440
26366
  const args = this.buildCliArgs();
26441
- const agentProcess = spawn2(executable, args, {
26367
+ const agentProcess = this.spawnAcpProcess(executable, args, {
26442
26368
  env: buildCopilotCliProviderEnv(process.env, this.config.customProvider),
26369
+ cwd: this.resolveCwd(request.cwd) ?? process.cwd(),
26443
26370
  stdio: ["pipe", "pipe", "inherit"]
26444
26371
  });
26445
26372
  trackChild(agentProcess);
@@ -26455,6 +26382,7 @@ var CopilotCliProvider = class {
26455
26382
  const input = Writable.toWeb(agentProcess.stdin);
26456
26383
  const output = Readable.toWeb(agentProcess.stdout);
26457
26384
  const stream = ndJsonStream(input, output);
26385
+ const customProvider = this.config.customProvider;
26458
26386
  const client = {
26459
26387
  async requestPermission() {
26460
26388
  return {
@@ -26464,7 +26392,7 @@ var CopilotCliProvider = class {
26464
26392
  async sessionUpdate(params) {
26465
26393
  const update = params.update;
26466
26394
  const sessionUpdate = update.sessionUpdate;
26467
- logger?.handleEvent(sessionUpdate, update);
26395
+ logger?.handleEvent(sessionUpdate, sanitizeSensitiveValue(update, customProvider));
26468
26396
  if (sessionUpdate === "tool_call") {
26469
26397
  const callId = update.toolCallId ?? randomUUID5();
26470
26398
  if (!update.status || update.status === "pending" || update.status === "in_progress") {
@@ -26956,6 +26884,26 @@ function sanitizeSensitiveText(text, customProvider) {
26956
26884
  }
26957
26885
  return sanitized;
26958
26886
  }
26887
+ function sanitizeSensitiveValue(value, customProvider) {
26888
+ if (!customProvider) {
26889
+ return value;
26890
+ }
26891
+ if (typeof value === "string") {
26892
+ return sanitizeSensitiveText(value, customProvider);
26893
+ }
26894
+ if (Array.isArray(value)) {
26895
+ return value.map((item) => sanitizeSensitiveValue(item, customProvider));
26896
+ }
26897
+ if (value && typeof value === "object") {
26898
+ return Object.fromEntries(
26899
+ Object.entries(value).map(([key, entry]) => [
26900
+ key,
26901
+ sanitizeSensitiveValue(entry, customProvider)
26902
+ ])
26903
+ );
26904
+ }
26905
+ return value;
26906
+ }
26959
26907
  async function defaultCopilotCliPromptRunner(options) {
26960
26908
  return await new Promise((resolve, reject) => {
26961
26909
  const child = spawn2(options.executable, options.args, {
@@ -27322,12 +27270,14 @@ async function loadCopilotSdk() {
27322
27270
  const message = error40 instanceof Error ? error40.message : String(error40);
27323
27271
  if (message.includes("vscode-jsonrpc")) {
27324
27272
  throw new Error(
27325
- `Failed to load @github/copilot-sdk due to a known ESM compatibility issue with vscode-jsonrpc (https://github.com/github/copilot-sdk/issues/710).
27273
+ `@github/copilot-sdk failed to load: vscode-jsonrpc ESM import specifier mismatch.
27274
+ The package imports 'vscode-jsonrpc/node' but the installed version exposes 'node.js'.
27326
27275
 
27327
- Workarounds:
27328
- - Use the copilot-cli target instead (recommended): set target type to "copilot-cli" in your eval YAML
27329
- - If running under Node.js 24+: set NODE_OPTIONS="--experimental-specifier-resolution=node"
27330
- - Wait for vscode-jsonrpc@9.0.0 stable to be released upstream`
27276
+ Repair (run once in your project root):
27277
+ node -e "const p=require.resolve('vscode-jsonrpc/package.json').replace('/package.json',''); require('fs').symlinkSync(p+'/node.js',p+'/node','file')" 2>/dev/null || true
27278
+
27279
+ Or switch to the copilot-cli target (no SDK dependency):
27280
+ Set provider: copilot-cli in your eval YAML`
27331
27281
  );
27332
27282
  }
27333
27283
  throw new Error(
@@ -27358,7 +27308,8 @@ var CopilotSdkProvider = class {
27358
27308
  throw new Error("Copilot SDK request was aborted before execution");
27359
27309
  }
27360
27310
  const sdk = await loadCopilotSdk();
27361
- const client = await this.getOrCreateClient(sdk);
27311
+ const evalCwd = this.resolveCwd(request.cwd);
27312
+ const client = await this.getOrCreateClient(sdk, evalCwd ?? void 0);
27362
27313
  const startTime = (/* @__PURE__ */ new Date()).toISOString();
27363
27314
  const startMs = Date.now();
27364
27315
  const logger = await this.createStreamLogger(request).catch(() => void 0);
@@ -27368,10 +27319,9 @@ var CopilotSdkProvider = class {
27368
27319
  if (this.config.model) {
27369
27320
  sessionOptions.model = this.config.model;
27370
27321
  }
27371
- const cwd = this.resolveCwd(request.cwd);
27372
- if (cwd) {
27373
- sessionOptions.workingDirectory = cwd;
27374
- sessionOptions.skillDirectories = resolveSkillDirectories(cwd);
27322
+ if (evalCwd) {
27323
+ sessionOptions.workingDirectory = evalCwd;
27324
+ sessionOptions.skillDirectories = resolveSkillDirectories(evalCwd);
27375
27325
  }
27376
27326
  const systemPrompt = this.config.systemPrompt;
27377
27327
  if (systemPrompt) {
@@ -27380,12 +27330,12 @@ var CopilotSdkProvider = class {
27380
27330
  content: systemPrompt
27381
27331
  };
27382
27332
  }
27383
- const customProvider = resolveCustomProviderConfig(this.config);
27333
+ const customProvider = this.config.customProvider;
27384
27334
  if (customProvider) {
27385
27335
  const providerType = customProvider.type ?? "openai";
27386
27336
  const provider = {
27387
27337
  type: providerType,
27388
- baseUrl: normalizeByokBaseUrl(customProvider.baseUrl, providerType)
27338
+ baseUrl: normalizeProviderBaseUrl(customProvider.baseUrl, providerType)
27389
27339
  };
27390
27340
  if (customProvider.bearerToken) {
27391
27341
  provider.bearerToken = customProvider.bearerToken;
@@ -27532,7 +27482,7 @@ var CopilotSdkProvider = class {
27532
27482
  }
27533
27483
  }
27534
27484
  // biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
27535
- async getOrCreateClient(sdk) {
27485
+ async getOrCreateClient(sdk, evalCwd) {
27536
27486
  if (!this.client) {
27537
27487
  const clientOptions = {};
27538
27488
  if (this.config.cliUrl) {
@@ -27546,6 +27496,11 @@ var CopilotSdkProvider = class {
27546
27496
  clientOptions.cliPath = nativePath;
27547
27497
  }
27548
27498
  }
27499
+ const resolvedCwd = evalCwd ?? process.cwd();
27500
+ clientOptions.cwd = resolvedCwd;
27501
+ if (this.config.args && this.config.args.length > 0) {
27502
+ clientOptions.cliArgs = [...this.config.args];
27503
+ }
27549
27504
  if (this.config.githubToken) {
27550
27505
  clientOptions.githubToken = this.config.githubToken;
27551
27506
  }
@@ -27629,22 +27584,6 @@ var CopilotSdkProvider = class {
27629
27584
  }
27630
27585
  }
27631
27586
  };
27632
- function resolveCustomProviderConfig(config2) {
27633
- if (config2.customProvider) {
27634
- return config2.customProvider;
27635
- }
27636
- if (!config2.byokBaseUrl) {
27637
- return void 0;
27638
- }
27639
- return {
27640
- ...config2.byokType ? { type: config2.byokType } : {},
27641
- baseUrl: config2.byokBaseUrl,
27642
- ...config2.byokApiKey ? { apiKey: config2.byokApiKey } : {},
27643
- ...config2.byokBearerToken ? { bearerToken: config2.byokBearerToken } : {},
27644
- ...config2.byokApiVersion ? { apiVersion: config2.byokApiVersion } : {},
27645
- ...config2.byokWireApi ? { wireApi: config2.byokWireApi } : {}
27646
- };
27647
- }
27648
27587
  function resolveSkillDirectories(cwd) {
27649
27588
  const candidates = [
27650
27589
  path14.join(cwd, ".claude", "skills"),
@@ -27653,7 +27592,7 @@ function resolveSkillDirectories(cwd) {
27653
27592
  ];
27654
27593
  return candidates.filter((dir) => existsSync22(dir));
27655
27594
  }
27656
- function normalizeByokBaseUrl(baseUrl, type) {
27595
+ function normalizeProviderBaseUrl(baseUrl, type) {
27657
27596
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
27658
27597
  if (/^https?:\/\//i.test(trimmed)) {
27659
27598
  return trimmed;
@@ -29690,8 +29629,9 @@ function extractTranscript(raw) {
29690
29629
  const transcript = raw.transcript;
29691
29630
  return transcript;
29692
29631
  }
29693
- var TRACE_ENVELOPE_SCHEMA_VERSION = "agentv.trace_envelope.v1";
29632
+ var EXECUTION_TRACE_SCHEMA_VERSION = "agentv.trace.v1";
29694
29633
  var TRACE_ENVELOPE_FORMAT = "otlp_openinference_spans";
29634
+ var TRANSCRIPT_MESSAGE_EVENT_NAME = "agentv.transcript.message";
29695
29635
  var CAPTURE_CONTENT_VALUES = ["none", "metadata", "full"];
29696
29636
  var REDACTION_LEVEL_VALUES = ["none", "partial", "full"];
29697
29637
  var WARNING_SEVERITY_VALUES = ["info", "warning", "error"];
@@ -29793,8 +29733,8 @@ var TraceEnvelopeScoreWireSchema = external_exports.object({
29793
29733
  evidence: AttributeMapWireSchema.optional()
29794
29734
  }).strict();
29795
29735
  var TraceEnvelopeWireSchema = external_exports.object({
29796
- schema_version: external_exports.literal(TRACE_ENVELOPE_SCHEMA_VERSION),
29797
- envelope_id: external_exports.string(),
29736
+ schema_version: external_exports.literal(EXECUTION_TRACE_SCHEMA_VERSION),
29737
+ artifact_id: external_exports.string(),
29798
29738
  created_at: external_exports.string(),
29799
29739
  eval: TraceEnvelopeEvalWireSchema,
29800
29740
  replay: TraceEnvelopeReplayWireSchema.optional(),
@@ -29808,6 +29748,9 @@ var TraceEnvelopeWireSchema = external_exports.object({
29808
29748
  function dropUndefined2(value) {
29809
29749
  return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== void 0));
29810
29750
  }
29751
+ function isRecord(value) {
29752
+ return typeof value === "object" && value !== null && !Array.isArray(value);
29753
+ }
29811
29754
  function definedStringRecord(value) {
29812
29755
  if (!value) {
29813
29756
  return void 0;
@@ -29831,6 +29774,32 @@ function parseTimeMs(timestamp) {
29831
29774
  function msToUnixNano(ms) {
29832
29775
  return String(BigInt(Math.round(ms)) * 1000000n);
29833
29776
  }
29777
+ function compareUnixNanoStrings(first, second) {
29778
+ try {
29779
+ const left = BigInt(first);
29780
+ const right = BigInt(second);
29781
+ return left < right ? -1 : left > right ? 1 : 0;
29782
+ } catch {
29783
+ return first.localeCompare(second);
29784
+ }
29785
+ }
29786
+ function compareSpanTime(first, second) {
29787
+ const byStart = compareUnixNanoStrings(first.startTimeUnixNano, second.startTimeUnixNano);
29788
+ if (byStart !== 0) {
29789
+ return byStart;
29790
+ }
29791
+ if (first.spanId === second.parentSpanId) {
29792
+ return -1;
29793
+ }
29794
+ if (second.spanId === first.parentSpanId) {
29795
+ return 1;
29796
+ }
29797
+ const byEnd = compareUnixNanoStrings(first.endTimeUnixNano, second.endTimeUnixNano);
29798
+ return byEnd !== 0 ? byEnd : first.spanId.localeCompare(second.spanId);
29799
+ }
29800
+ function orderedSpans(spans) {
29801
+ return [...spans].sort(compareSpanTime);
29802
+ }
29834
29803
  function unixNanoToIso(value) {
29835
29804
  if (!value) {
29836
29805
  return void 0;
@@ -29919,6 +29888,41 @@ function maybeToolContentAttributes(toolCall, capture) {
29919
29888
  "gen_ai.tool.call.result": toolCall.output
29920
29889
  });
29921
29890
  }
29891
+ function toTranscriptToolCallWire(toolCall, capture) {
29892
+ return dropUndefined2({
29893
+ tool: toolCall.tool,
29894
+ input: capture.content === "full" ? toolCall.input : void 0,
29895
+ output: capture.content === "full" ? toolCall.output : void 0,
29896
+ id: toolCall.id,
29897
+ start_time: toolCall.startTime,
29898
+ end_time: toolCall.endTime,
29899
+ duration_ms: toolCall.durationMs
29900
+ });
29901
+ }
29902
+ function toTranscriptMessageWire(message, capture) {
29903
+ return dropUndefined2({
29904
+ role: message.role,
29905
+ name: message.name,
29906
+ content: capture.content === "full" ? message.content : void 0,
29907
+ tool_calls: message.toolCalls?.map((toolCall) => toTranscriptToolCallWire(toolCall, capture)),
29908
+ start_time: message.startTime,
29909
+ end_time: message.endTime,
29910
+ duration_ms: message.durationMs,
29911
+ metadata: message.metadata,
29912
+ token_usage: message.tokenUsage
29913
+ });
29914
+ }
29915
+ function transcriptMessageEvent(message, index, capture) {
29916
+ const startMs = parseTimeMs(message.startTime);
29917
+ return {
29918
+ name: TRANSCRIPT_MESSAGE_EVENT_NAME,
29919
+ timeUnixNano: startMs !== void 0 ? msToUnixNano(startMs) : void 0,
29920
+ attributes: dropUndefined2({
29921
+ "agentv.transcript.message.index": index,
29922
+ "agentv.transcript.message": toTranscriptMessageWire(message, capture)
29923
+ })
29924
+ };
29925
+ }
29922
29926
  function spanStatusFromResult(result) {
29923
29927
  if (result.executionStatus === "execution_error" || result.error) {
29924
29928
  return { code: "ERROR", message: result.error };
@@ -29965,7 +29969,14 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
29965
29969
  const capture = capturePolicy(options);
29966
29970
  const source = sourceFromResult(result, options);
29967
29971
  const traceId = hashHex(
29968
- ["trace-envelope", result.timestamp, result.suite, result.testId, result.target, options.runId],
29972
+ [
29973
+ "execution-trace",
29974
+ result.timestamp,
29975
+ result.suite,
29976
+ result.testId,
29977
+ result.target,
29978
+ options.runId
29979
+ ],
29969
29980
  32
29970
29981
  );
29971
29982
  const rootSpanId = hashHex([traceId, "root"], 16);
@@ -29974,6 +29985,9 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
29974
29985
  const rootStatus = spanStatusFromResult(result);
29975
29986
  const conversionWarnings = [];
29976
29987
  const spans = [];
29988
+ const rootEvents = result.trace.messages.map(
29989
+ (message, index) => transcriptMessageEvent(message, index, capture)
29990
+ );
29977
29991
  const rootAttributes = dropUndefined2({
29978
29992
  "gen_ai.operation.name": "invoke_agent",
29979
29993
  "gen_ai.provider.name": "agentv",
@@ -30005,12 +30019,13 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
30005
30019
  status: rootStatus,
30006
30020
  attributes: rootAttributes,
30007
30021
  events: result.error ? [
30022
+ ...rootEvents,
30008
30023
  {
30009
30024
  name: "exception",
30010
30025
  timeUnixNano: msToUnixNano(Math.max(rootStartMs, rootEndMs)),
30011
30026
  attributes: { "exception.message": result.error }
30012
30027
  }
30013
- ] : []
30028
+ ] : rootEvents
30014
30029
  });
30015
30030
  const assistantEntries = assistantMessages(result.trace.messages);
30016
30031
  const chatEntries = assistantEntries.length > 0 ? assistantEntries : result.output.length > 0 ? [{ message: { role: "assistant", content: result.output }, index: 0 }] : [];
@@ -30094,7 +30109,7 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
30094
30109
  });
30095
30110
  }
30096
30111
  }
30097
- const envelopeId = `trace-env-${hashHex([traceId, result.timestamp, result.score], 20)}`;
30112
+ const artifactId = `execution-trace-${hashHex([traceId, result.timestamp, result.score], 20)}`;
30098
30113
  const evalIdentity = {
30099
30114
  evalId: options.evalId,
30100
30115
  evalPath: options.evalPath,
@@ -30109,8 +30124,8 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
30109
30124
  experiment: options.experiment
30110
30125
  };
30111
30126
  return {
30112
- schemaVersion: TRACE_ENVELOPE_SCHEMA_VERSION,
30113
- envelopeId,
30127
+ schemaVersion: EXECUTION_TRACE_SCHEMA_VERSION,
30128
+ artifactId,
30114
30129
  createdAt: now.toISOString(),
30115
30130
  eval: evalIdentity,
30116
30131
  replay: options.replay,
@@ -30133,7 +30148,7 @@ function toTraceEnvelopeWire(envelope) {
30133
30148
  return TraceEnvelopeWireSchema.parse(
30134
30149
  dropUndefined2({
30135
30150
  schema_version: envelope.schemaVersion,
30136
- envelope_id: envelope.envelopeId,
30151
+ artifact_id: envelope.artifactId,
30137
30152
  created_at: envelope.createdAt,
30138
30153
  eval: toTraceEnvelopeEvalWire(envelope.eval),
30139
30154
  replay: envelope.replay ? toTraceEnvelopeReplayWire(envelope.replay) : void 0,
@@ -30150,7 +30165,7 @@ function fromTraceEnvelopeWire(input) {
30150
30165
  const wire = TraceEnvelopeWireSchema.parse(input);
30151
30166
  return {
30152
30167
  schemaVersion: wire.schema_version,
30153
- envelopeId: wire.envelope_id,
30168
+ artifactId: wire.artifact_id,
30154
30169
  createdAt: wire.created_at,
30155
30170
  eval: fromTraceEnvelopeEvalWire(wire.eval),
30156
30171
  replay: wire.replay ? fromTraceEnvelopeReplayWire(wire.replay) : void 0,
@@ -30433,10 +30448,68 @@ function toolCallFromSpan(span) {
30433
30448
  durationMs: durationMsFromSpan(span)
30434
30449
  };
30435
30450
  }
30436
- function traceEnvelopeToMessages(envelope) {
30437
- const spans = [...envelope.trace.spans].sort(
30438
- (first, second) => first.startTimeUnixNano.localeCompare(second.startTimeUnixNano)
30439
- );
30451
+ function buildSpanMap(spans) {
30452
+ return new Map(spans.map((span) => [span.spanId, span]));
30453
+ }
30454
+ function ancestorSpanIds(span, spansById) {
30455
+ const ancestors = [];
30456
+ const seen = /* @__PURE__ */ new Set();
30457
+ let parentSpanId = span.parentSpanId ?? void 0;
30458
+ while (parentSpanId && !seen.has(parentSpanId)) {
30459
+ seen.add(parentSpanId);
30460
+ ancestors.push(parentSpanId);
30461
+ parentSpanId = spansById.get(parentSpanId)?.parentSpanId ?? void 0;
30462
+ }
30463
+ return ancestors;
30464
+ }
30465
+ function nearestAncestorToolCallId(ancestorIds, spansById) {
30466
+ for (const ancestorId of ancestorIds) {
30467
+ const ancestor = spansById.get(ancestorId);
30468
+ if (ancestor && isToolSpan(ancestor)) {
30469
+ return toolCallFromSpan(ancestor).id;
30470
+ }
30471
+ }
30472
+ return void 0;
30473
+ }
30474
+ function fromTranscriptToolCallWire(wire) {
30475
+ if (!isRecord(wire) || typeof wire.tool !== "string") {
30476
+ return void 0;
30477
+ }
30478
+ return {
30479
+ tool: wire.tool,
30480
+ input: wire.input,
30481
+ output: wire.output,
30482
+ id: typeof wire.id === "string" ? wire.id : void 0,
30483
+ startTime: typeof wire.start_time === "string" ? wire.start_time : void 0,
30484
+ endTime: typeof wire.end_time === "string" ? wire.end_time : void 0,
30485
+ durationMs: numberAttribute(wire, "duration_ms")
30486
+ };
30487
+ }
30488
+ function fromTranscriptMessageWire(wire) {
30489
+ if (!isRecord(wire) || typeof wire.role !== "string") {
30490
+ return void 0;
30491
+ }
30492
+ const toolCalls = Array.isArray(wire.tool_calls) ? wire.tool_calls.map(fromTranscriptToolCallWire).filter((toolCall) => toolCall !== void 0) : void 0;
30493
+ return dropUndefined2({
30494
+ role: wire.role,
30495
+ name: typeof wire.name === "string" ? wire.name : void 0,
30496
+ content: wire.content,
30497
+ toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : void 0,
30498
+ startTime: typeof wire.start_time === "string" ? wire.start_time : void 0,
30499
+ endTime: typeof wire.end_time === "string" ? wire.end_time : void 0,
30500
+ durationMs: numberAttribute(wire, "duration_ms"),
30501
+ metadata: isRecord(wire.metadata) ? wire.metadata : void 0,
30502
+ tokenUsage: isRecord(wire.token_usage) ? tokenUsageFromAttributes({
30503
+ "gen_ai.usage.input_tokens": wire.token_usage.input,
30504
+ "gen_ai.usage.output_tokens": wire.token_usage.output,
30505
+ "gen_ai.usage.cache_read.input_tokens": wire.token_usage.cached,
30506
+ "gen_ai.usage.reasoning.output_tokens": wire.token_usage.reasoning
30507
+ }) : void 0
30508
+ });
30509
+ }
30510
+ function traceEnvelopeToMessageEntries(envelope) {
30511
+ const spans = orderedSpans(envelope.trace.spans);
30512
+ const spansById = buildSpanMap(spans);
30440
30513
  const toolSpansByParent = /* @__PURE__ */ new Map();
30441
30514
  for (const span of spans.filter(isToolSpan)) {
30442
30515
  const parentSpanId = span.parentSpanId ?? envelope.trace.rootSpanId;
@@ -30444,20 +30517,97 @@ function traceEnvelopeToMessages(envelope) {
30444
30517
  existing.push(span);
30445
30518
  toolSpansByParent.set(parentSpanId, existing);
30446
30519
  }
30447
- return spans.filter(isChatSpan).map((span) => ({
30448
- role: "assistant",
30449
- content: span.attributes["gen_ai.output.messages"],
30450
- toolCalls: toolSpansByParent.get(span.spanId)?.map(toolCallFromSpan),
30451
- startTime: unixNanoToIso(span.startTimeUnixNano),
30452
- endTime: unixNanoToIso(span.endTimeUnixNano),
30453
- durationMs: durationMsFromSpan(span),
30454
- tokenUsage: tokenUsageFromAttributes(span.attributes),
30455
- metadata: {
30456
- span_id: span.spanId,
30457
- trace_id: span.traceId
30520
+ return spans.filter(isChatSpan).map((span, fallbackIndex) => ({
30521
+ index: numberAttribute(span.attributes, "agentv.message.index") ?? fallbackIndex,
30522
+ timeUnixNano: span.startTimeUnixNano,
30523
+ message: {
30524
+ role: "assistant",
30525
+ content: span.attributes["gen_ai.output.messages"],
30526
+ toolCalls: toolSpansByParent.get(span.spanId)?.map(toolCallFromSpan),
30527
+ startTime: unixNanoToIso(span.startTimeUnixNano),
30528
+ endTime: unixNanoToIso(span.endTimeUnixNano),
30529
+ durationMs: durationMsFromSpan(span),
30530
+ tokenUsage: tokenUsageFromAttributes(span.attributes),
30531
+ metadata: dropUndefined2({
30532
+ span_id: span.spanId,
30533
+ trace_id: span.traceId,
30534
+ parent_span_id: span.parentSpanId ?? void 0,
30535
+ parent_tool_call_id: nearestAncestorToolCallId(ancestorSpanIds(span, spansById), spansById)
30536
+ })
30458
30537
  }
30459
30538
  }));
30460
30539
  }
30540
+ function traceEnvelopeToMessages(envelope) {
30541
+ return traceEnvelopeToMessageEntries(envelope).map((entry) => entry.message);
30542
+ }
30543
+ function transcriptMessageEntries(envelope) {
30544
+ const entries = [];
30545
+ for (const span of orderedSpans(envelope.trace.spans)) {
30546
+ for (const event of span.events ?? []) {
30547
+ if (event.name !== TRANSCRIPT_MESSAGE_EVENT_NAME) {
30548
+ continue;
30549
+ }
30550
+ const attributes = event.attributes ?? {};
30551
+ const message = fromTranscriptMessageWire(attributes["agentv.transcript.message"]);
30552
+ if (!message) {
30553
+ continue;
30554
+ }
30555
+ entries.push({
30556
+ index: numberAttribute(attributes, "agentv.transcript.message.index") ?? entries.length,
30557
+ timeUnixNano: event.timeUnixNano,
30558
+ message
30559
+ });
30560
+ }
30561
+ }
30562
+ return entries;
30563
+ }
30564
+ function traceEnvelopeToTranscriptMessages(envelope) {
30565
+ const entries = transcriptMessageEntries(envelope);
30566
+ if (entries.length === 0) {
30567
+ return traceEnvelopeToMessages(envelope);
30568
+ }
30569
+ return [...entries].sort((first, second) => {
30570
+ const byIndex = first.index - second.index;
30571
+ if (byIndex !== 0) {
30572
+ return byIndex;
30573
+ }
30574
+ if (first.timeUnixNano && second.timeUnixNano) {
30575
+ return compareUnixNanoStrings(first.timeUnixNano, second.timeUnixNano);
30576
+ }
30577
+ return 0;
30578
+ }).map((entry) => entry.message);
30579
+ }
30580
+ function traceEnvelopeToToolTrajectoryView(envelope) {
30581
+ const spans = orderedSpans(envelope.trace.spans);
30582
+ const spansById = buildSpanMap(spans);
30583
+ const tools = spans.filter(isToolSpan).map((span, position) => {
30584
+ const toolCall = toolCallFromSpan(span);
30585
+ const toolCallId = toolCall.id ?? span.spanId;
30586
+ const ancestorIds = ancestorSpanIds(span, spansById);
30587
+ return {
30588
+ position,
30589
+ traceId: span.traceId,
30590
+ spanId: span.spanId,
30591
+ parentSpanId: span.parentSpanId ?? void 0,
30592
+ ancestorSpanIds: ancestorIds,
30593
+ tool: toolCall.tool,
30594
+ toolCallId,
30595
+ parentToolCallId: nearestAncestorToolCallId(ancestorIds, spansById),
30596
+ input: toolCall.input,
30597
+ output: toolCall.output,
30598
+ status: span.status.code === "ERROR" ? "error" : "ok",
30599
+ startTime: toolCall.startTime,
30600
+ endTime: toolCall.endTime,
30601
+ durationMs: toolCall.durationMs
30602
+ };
30603
+ });
30604
+ return {
30605
+ schemaVersion: NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
30606
+ traceId: envelope.trace.traceId,
30607
+ rootSpanId: envelope.trace.rootSpanId,
30608
+ tools
30609
+ };
30610
+ }
30461
30611
  function traceEnvelopeToTraceSummary(envelope) {
30462
30612
  const toolCallCounts = {};
30463
30613
  const toolDurations = {};
@@ -30509,7 +30659,7 @@ function traceEnvelopeToTraceSummary(envelope) {
30509
30659
  function traceEnvelopeToTraceArtifact(envelope) {
30510
30660
  const events = [];
30511
30661
  let ordinal = 0;
30512
- for (const span of envelope.trace.spans) {
30662
+ for (const span of orderedSpans(envelope.trace.spans)) {
30513
30663
  if (isChatSpan(span)) {
30514
30664
  events.push({
30515
30665
  eventId: `span-${span.spanId}`,
@@ -30578,6 +30728,95 @@ function traceEnvelopeToTraceArtifact(envelope) {
30578
30728
  function getTraceEnvelopeSummary(envelope) {
30579
30729
  return traceEnvelopeToTraceSummary(envelope).trace;
30580
30730
  }
30731
+ function traceEnvelopeToOtlpJson(envelope) {
30732
+ return {
30733
+ resourceSpans: [
30734
+ {
30735
+ resource: {
30736
+ attributes: attributesToOtlp(envelope.trace.resource?.attributes)
30737
+ },
30738
+ scopeSpans: [
30739
+ {
30740
+ scope: dropUndefined2({
30741
+ name: envelope.trace.scope?.name,
30742
+ version: envelope.trace.scope?.version
30743
+ }),
30744
+ spans: orderedSpans(envelope.trace.spans).map(
30745
+ (span) => dropUndefined2({
30746
+ traceId: span.traceId,
30747
+ spanId: span.spanId,
30748
+ parentSpanId: span.parentSpanId ?? void 0,
30749
+ name: span.name,
30750
+ kind: spanKindToOtlp(span.kind),
30751
+ startTimeUnixNano: span.startTimeUnixNano,
30752
+ endTimeUnixNano: span.endTimeUnixNano,
30753
+ attributes: attributesToOtlp(span.attributes),
30754
+ status: spanStatusToOtlp(span.status),
30755
+ events: span.events?.map(
30756
+ (event) => dropUndefined2({
30757
+ name: event.name,
30758
+ timeUnixNano: event.timeUnixNano,
30759
+ attributes: attributesToOtlp(event.attributes)
30760
+ })
30761
+ )
30762
+ })
30763
+ )
30764
+ }
30765
+ ]
30766
+ }
30767
+ ]
30768
+ };
30769
+ }
30770
+ function spanKindToOtlp(kind) {
30771
+ if (kind === "SERVER") {
30772
+ return 1;
30773
+ }
30774
+ if (kind === "CLIENT") {
30775
+ return 2;
30776
+ }
30777
+ if (kind === "PRODUCER") {
30778
+ return 3;
30779
+ }
30780
+ if (kind === "CONSUMER") {
30781
+ return 4;
30782
+ }
30783
+ return 0;
30784
+ }
30785
+ function spanStatusToOtlp(status) {
30786
+ const code = status.code === "OK" ? 1 : status.code === "ERROR" ? 2 : 0;
30787
+ return dropUndefined2({ code, message: status.message });
30788
+ }
30789
+ function attributesToOtlp(attributes) {
30790
+ return Object.entries(attributes ?? {}).map(([key, value]) => ({
30791
+ key,
30792
+ value: toOtlpAnyValue(value)
30793
+ }));
30794
+ }
30795
+ function toOtlpAnyValue(value) {
30796
+ if (typeof value === "string") {
30797
+ return { stringValue: value };
30798
+ }
30799
+ if (typeof value === "number") {
30800
+ return Number.isInteger(value) ? { intValue: value } : { doubleValue: value };
30801
+ }
30802
+ if (typeof value === "boolean") {
30803
+ return { boolValue: value };
30804
+ }
30805
+ if (Array.isArray(value)) {
30806
+ return { arrayValue: { values: value.map(toOtlpAnyValue) } };
30807
+ }
30808
+ return { stringValue: stringifyOtlpAttribute(value) };
30809
+ }
30810
+ function stringifyOtlpAttribute(value) {
30811
+ if (value === void 0) {
30812
+ return "";
30813
+ }
30814
+ try {
30815
+ return JSON.stringify(value);
30816
+ } catch {
30817
+ return String(value);
30818
+ }
30819
+ }
30581
30820
  async function readTraceEnvelopeReplayRecords(sourcePath) {
30582
30821
  let raw;
30583
30822
  try {
@@ -30585,7 +30824,7 @@ async function readTraceEnvelopeReplayRecords(sourcePath) {
30585
30824
  } catch (error40) {
30586
30825
  const reason = error40 instanceof Error ? error40.message : String(error40);
30587
30826
  throw new Error(
30588
- `Trace envelope replay source not found or unreadable: ${sourcePath}: ${reason}`
30827
+ `Execution trace replay source not found or unreadable: ${sourcePath}: ${reason}`
30589
30828
  );
30590
30829
  }
30591
30830
  const documents = parseTraceEnvelopeDocuments(raw, sourcePath);
@@ -30604,10 +30843,10 @@ function findTraceEnvelopeReplayRecord(records, lookup) {
30604
30843
  }
30605
30844
  const key = formatReplayLookupKey(lookup);
30606
30845
  if (matches.length === 0) {
30607
- throw new Error(`Trace envelope replay lookup found no record for ${key}`);
30846
+ throw new Error(`Execution trace replay lookup found no record for ${key}`);
30608
30847
  }
30609
30848
  throw new Error(
30610
- `Trace envelope replay lookup found ${matches.length} duplicate records for ${key}`
30849
+ `Execution trace replay lookup found ${matches.length} duplicate records for ${key}`
30611
30850
  );
30612
30851
  }
30613
30852
  function traceEnvelopeReplayRecordToProviderResponse(record2) {
@@ -30624,8 +30863,8 @@ function traceEnvelopeReplayRecordToProviderResponse(record2) {
30624
30863
  startTime: summary.startTime,
30625
30864
  endTime: summary.endTime,
30626
30865
  raw: {
30627
- replay_trace_envelope: dropUndefined3({
30628
- envelope_id: record2.envelope.envelopeId,
30866
+ replay_execution_trace: dropUndefined3({
30867
+ artifact_id: record2.envelope.artifactId,
30629
30868
  source_path: record2.sourcePath,
30630
30869
  line_number: record2.lineNumber,
30631
30870
  suite: identity.suite,
@@ -30668,7 +30907,7 @@ function parseTraceEnvelopeDocuments(raw, sourcePath) {
30668
30907
  documents.push({ value: JSON.parse(line), lineNumber: i + 1 });
30669
30908
  } catch (error40) {
30670
30909
  const reason = error40 instanceof Error ? error40.message : String(error40);
30671
- throw new Error(`Invalid trace envelope JSONL at ${sourcePath}:${i + 1}: ${reason}`);
30910
+ throw new Error(`Invalid execution trace JSONL at ${sourcePath}:${i + 1}: ${reason}`);
30672
30911
  }
30673
30912
  }
30674
30913
  return documents;
@@ -30680,7 +30919,7 @@ function parseTraceEnvelopeDocument(value, sourcePath, lineNumber) {
30680
30919
  } catch (error40) {
30681
30920
  const location = lineNumber === void 0 ? sourcePath : `${sourcePath}:${lineNumber}`;
30682
30921
  const reason = error40 instanceof Error ? error40.message : String(error40);
30683
- throw new Error(`Invalid trace envelope replay record at ${location}: ${reason}`);
30922
+ throw new Error(`Invalid execution trace replay record at ${location}: ${reason}`);
30684
30923
  }
30685
30924
  }
30686
30925
  function traceEnvelopeReplayIdentity(envelope) {
@@ -30705,13 +30944,13 @@ function lookupKeyNumber(lookupKey, key) {
30705
30944
  function assertReplayableMessages(output, record2) {
30706
30945
  if (output.length === 0) {
30707
30946
  throw new Error(
30708
- `Trace envelope replay source ${formatRecordLocation(record2)} cannot project to provider Message[]: no chat spans found`
30947
+ `Execution trace replay source ${formatRecordLocation(record2)} cannot project to provider Message[]: no chat spans found`
30709
30948
  );
30710
30949
  }
30711
30950
  const lastAssistant = [...output].reverse().find((message) => message.role === "assistant");
30712
30951
  if (!lastAssistant || lastAssistant.content === void 0) {
30713
30952
  throw new Error(
30714
- `Trace envelope replay source ${formatRecordLocation(record2)} is missing assistant output content; replay requires a full-content trace envelope before grading`
30953
+ `Execution trace replay source ${formatRecordLocation(record2)} is missing assistant output content; replay requires a full-content execution trace before grading`
30715
30954
  );
30716
30955
  }
30717
30956
  }
@@ -30740,7 +30979,7 @@ var ReplayProvider = class {
30740
30979
  const record2 = findReplayFixtureRecord(records, this.lookupForRequest(request));
30741
30980
  return replayFixtureRecordToProviderResponse(record2);
30742
30981
  }
30743
- case "trace_envelopes": {
30982
+ case "execution_traces": {
30744
30983
  const records = await readTraceEnvelopeReplayRecords(source.path);
30745
30984
  const record2 = findTraceEnvelopeReplayRecord(records, this.lookupForRequest(request));
30746
30985
  return traceEnvelopeReplayRecordToProviderResponse(record2);
@@ -30758,7 +30997,7 @@ var ReplayProvider = class {
30758
30997
  )
30759
30998
  );
30760
30999
  }
30761
- case "trace_envelopes": {
31000
+ case "execution_traces": {
30762
31001
  const records = await readTraceEnvelopeReplayRecords(source.path);
30763
31002
  return requests.map(
30764
31003
  (request) => traceEnvelopeReplayRecordToProviderResponse(
@@ -30791,7 +31030,7 @@ function resolveReplaySource(config2) {
30791
31030
  return { kind: "fixtures", path: config2.fixturesPath };
30792
31031
  }
30793
31032
  throw new Error(
30794
- "Replay provider requires exactly one replay source: fixtures or trace_envelopes"
31033
+ "Replay provider requires exactly one replay source: fixtures or execution_traces"
30795
31034
  );
30796
31035
  }
30797
31036
  async function pathExists(target) {
@@ -32132,7 +32371,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
32132
32371
  };
32133
32372
  }
32134
32373
  }
32135
- function isRecord(value) {
32374
+ function isRecord2(value) {
32136
32375
  return typeof value === "object" && value !== null && !Array.isArray(value);
32137
32376
  }
32138
32377
  function extractTargetsArray(parsed, absolutePath) {
@@ -32143,7 +32382,7 @@ function extractTargetsArray(parsed, absolutePath) {
32143
32382
  return targets;
32144
32383
  }
32145
32384
  function assertTargetDefinition(value, index, filePath) {
32146
- if (!isRecord(value)) {
32385
+ if (!isRecord2(value)) {
32147
32386
  throw new Error(`targets.yaml entry at index ${index} in ${filePath} must be an object`);
32148
32387
  }
32149
32388
  const name = value.name;
@@ -32176,7 +32415,7 @@ async function readTargetDefinitions(filePath) {
32176
32415
  }
32177
32416
  const raw = await readFile10(absolutePath, "utf8");
32178
32417
  const parsed = parseYamlValue(raw);
32179
- if (!isRecord(parsed)) {
32418
+ if (!isRecord2(parsed)) {
32180
32419
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
32181
32420
  }
32182
32421
  const targets = extractTargetsArray(parsed, absolutePath);
@@ -33248,13 +33487,22 @@ function fromYaml(raw) {
33248
33487
  }
33249
33488
  if (e.results && typeof e.results === "object") {
33250
33489
  const r = e.results;
33251
- if (typeof r.repo_url === "string" && r.repo_url.trim().length > 0) {
33490
+ const repoUrl = typeof r.repo_url === "string" && r.repo_url.trim().length > 0 ? r.repo_url.trim() : void 0;
33491
+ const repoPath = typeof r.repo_path === "string" && r.repo_path.trim().length > 0 ? r.repo_path.trim() : void 0;
33492
+ if (repoUrl || repoPath) {
33252
33493
  const sync = r.sync && typeof r.sync === "object" ? r.sync : void 0;
33253
33494
  entry.results = {
33254
- repoUrl: r.repo_url.trim(),
33495
+ ...repoUrl ? { repoUrl } : {},
33496
+ ...repoPath ? { repoPath } : {},
33255
33497
  ...typeof r.branch === "string" && r.branch.trim().length > 0 ? { branch: r.branch.trim() } : {},
33498
+ ...typeof r.remote === "string" && r.remote.trim().length > 0 ? { remote: r.remote.trim() } : {},
33256
33499
  ...typeof r.path === "string" && r.path.trim().length > 0 ? { path: r.path.trim() } : {},
33257
- ...sync && typeof sync.auto_push === "boolean" ? { sync: { autoPush: sync.auto_push } } : {},
33500
+ ...sync && (typeof sync.auto_push === "boolean" || typeof sync.require_push === "boolean") ? {
33501
+ sync: {
33502
+ ...typeof sync.auto_push === "boolean" ? { autoPush: sync.auto_push } : {},
33503
+ ...typeof sync.require_push === "boolean" ? { requirePush: sync.require_push } : {}
33504
+ }
33505
+ } : {},
33258
33506
  ...typeof r.branch_prefix === "string" && r.branch_prefix.trim().length > 0 ? { branchPrefix: r.branch_prefix.trim() } : {}
33259
33507
  };
33260
33508
  }
@@ -33273,11 +33521,20 @@ function toYaml(entry) {
33273
33521
  };
33274
33522
  if (entry.results) {
33275
33523
  yaml.results = {
33276
- repo_url: entry.results.repoUrl,
33524
+ ...entry.results.repoUrl !== void 0 && { repo_url: entry.results.repoUrl },
33525
+ ...entry.results.repoPath !== void 0 && { repo_path: entry.results.repoPath },
33277
33526
  ...entry.results.branch !== void 0 && { branch: entry.results.branch },
33527
+ ...entry.results.remote !== void 0 && { remote: entry.results.remote },
33278
33528
  ...entry.results.path !== void 0 && { path: entry.results.path },
33279
- ...entry.results.sync?.autoPush !== void 0 && {
33280
- sync: { auto_push: entry.results.sync.autoPush }
33529
+ ...(entry.results.sync?.autoPush !== void 0 || entry.results.sync?.requirePush !== void 0) && {
33530
+ sync: {
33531
+ ...entry.results.sync?.autoPush !== void 0 && {
33532
+ auto_push: entry.results.sync.autoPush
33533
+ },
33534
+ ...entry.results.sync?.requirePush !== void 0 && {
33535
+ require_push: entry.results.sync.requirePush
33536
+ }
33537
+ }
33281
33538
  },
33282
33539
  ...entry.results.branchPrefix !== void 0 && {
33283
33540
  branch_prefix: entry.results.branchPrefix
@@ -34031,9 +34288,9 @@ function cloneJsonValue(value) {
34031
34288
  return value;
34032
34289
  }
34033
34290
  var ANSI_RED = "\x1B[31m";
34034
- var ANSI_RESET22 = "\x1B[0m";
34291
+ var ANSI_RESET2 = "\x1B[0m";
34035
34292
  function logError(msg) {
34036
- console.error(`${ANSI_RED}Error: ${msg}${ANSI_RESET22}`);
34293
+ console.error(`${ANSI_RED}Error: ${msg}${ANSI_RESET2}`);
34037
34294
  }
34038
34295
  function isAgentSkillsFormat(parsed) {
34039
34296
  if (typeof parsed !== "object" || parsed === null) return false;
@@ -34201,8 +34458,8 @@ async function resolveFileReference3(rawValue, searchRoots) {
34201
34458
  }
34202
34459
  return { displayPath, attempted };
34203
34460
  }
34204
- var ANSI_YELLOW22 = "\x1B[33m";
34205
- var ANSI_RESET3 = "\x1B[0m";
34461
+ var ANSI_YELLOW2 = "\x1B[33m";
34462
+ var ANSI_RESET22 = "\x1B[0m";
34206
34463
  var DEFAULT_EVAL_PATTERNS = [
34207
34464
  "**/evals/**/*.eval.yaml",
34208
34465
  "**/evals/**/eval.yaml",
@@ -34566,13 +34823,20 @@ function parseResultsConfig(raw, configPath2) {
34566
34823
  return void 0;
34567
34824
  }
34568
34825
  const obj = raw;
34569
- if (obj.mode !== "github") {
34826
+ if (obj.mode !== void 0 && obj.mode !== "github") {
34570
34827
  logWarning(`Invalid results.mode in ${configPath2}, expected 'github'`);
34571
34828
  return void 0;
34572
34829
  }
34573
- const repo = typeof obj.repo === "string" ? obj.repo.trim() : "";
34574
- if (!repo) {
34575
- logWarning(`Invalid results.repo in ${configPath2}, expected non-empty string`);
34830
+ const legacyRepo = typeof obj.repo === "string" ? obj.repo.trim() : "";
34831
+ const repoUrl = typeof obj.repo_url === "string" ? obj.repo_url.trim() : "";
34832
+ const repoPath = typeof obj.repo_path === "string" ? obj.repo_path.trim() : "";
34833
+ const repo = legacyRepo || repoUrl;
34834
+ if (!repo && !repoPath) {
34835
+ logWarning(`Invalid results in ${configPath2}, expected repo_url/repo or repo_path`);
34836
+ return void 0;
34837
+ }
34838
+ if (repo && repoPath) {
34839
+ logWarning(`Invalid results in ${configPath2}, set only one of repo_url/repo or repo_path`);
34576
34840
  return void 0;
34577
34841
  }
34578
34842
  let branch;
@@ -34583,6 +34847,14 @@ function parseResultsConfig(raw, configPath2) {
34583
34847
  }
34584
34848
  branch = obj.branch.trim();
34585
34849
  }
34850
+ let remote;
34851
+ if (obj.remote !== void 0) {
34852
+ if (typeof obj.remote !== "string" || obj.remote.trim().length === 0) {
34853
+ logWarning(`Invalid results.remote in ${configPath2}, expected non-empty string`);
34854
+ return void 0;
34855
+ }
34856
+ remote = obj.remote.trim();
34857
+ }
34586
34858
  let resultsPath;
34587
34859
  if (obj.path !== void 0) {
34588
34860
  if (typeof obj.path !== "string" || obj.path.trim().length === 0) {
@@ -34602,6 +34874,26 @@ function parseResultsConfig(raw, configPath2) {
34602
34874
  logWarning(`Invalid results.auto_push in ${configPath2}, expected boolean`);
34603
34875
  return void 0;
34604
34876
  }
34877
+ let sync;
34878
+ if (obj.sync !== void 0) {
34879
+ if (typeof obj.sync !== "object" || obj.sync === null || Array.isArray(obj.sync)) {
34880
+ logWarning(`Invalid results.sync in ${configPath2}, expected object`);
34881
+ return void 0;
34882
+ }
34883
+ const syncObj = obj.sync;
34884
+ if (syncObj.auto_push !== void 0 && typeof syncObj.auto_push !== "boolean") {
34885
+ logWarning(`Invalid results.sync.auto_push in ${configPath2}, expected boolean`);
34886
+ return void 0;
34887
+ }
34888
+ if (syncObj.require_push !== void 0 && typeof syncObj.require_push !== "boolean") {
34889
+ logWarning(`Invalid results.sync.require_push in ${configPath2}, expected boolean`);
34890
+ return void 0;
34891
+ }
34892
+ sync = {
34893
+ ...typeof syncObj.auto_push === "boolean" && { auto_push: syncObj.auto_push },
34894
+ ...typeof syncObj.require_push === "boolean" && { require_push: syncObj.require_push }
34895
+ };
34896
+ }
34605
34897
  let branchPrefix;
34606
34898
  if (obj.branch_prefix !== void 0) {
34607
34899
  if (typeof obj.branch_prefix !== "string" || obj.branch_prefix.trim().length === 0) {
@@ -34612,10 +34904,14 @@ function parseResultsConfig(raw, configPath2) {
34612
34904
  }
34613
34905
  return {
34614
34906
  mode: "github",
34615
- repo,
34907
+ ...repo && { repo },
34908
+ ...repoUrl && { repo_url: repoUrl },
34909
+ ...repoPath && { repo_path: repoPath },
34616
34910
  ...branch !== void 0 && { branch },
34911
+ ...remote !== void 0 && { remote },
34617
34912
  ...resultsPath !== void 0 && { path: resultsPath },
34618
34913
  ...typeof obj.auto_push === "boolean" && { auto_push: obj.auto_push },
34914
+ ...sync && { sync },
34619
34915
  ...branchPrefix && { branch_prefix: branchPrefix }
34620
34916
  };
34621
34917
  }
@@ -34645,10 +34941,8 @@ function parseHooksConfig(raw, configPath2) {
34645
34941
  return void 0;
34646
34942
  }
34647
34943
  function logWarning(message) {
34648
- console.warn(`${ANSI_YELLOW22}Warning: ${message}${ANSI_RESET3}`);
34944
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET22}`);
34649
34945
  }
34650
- var ANSI_YELLOW3 = "\x1B[33m";
34651
- var ANSI_RESET4 = "\x1B[0m";
34652
34946
  async function validateCustomPromptContent(promptPath) {
34653
34947
  const content = await readFile14(promptPath, "utf8");
34654
34948
  validateTemplateVariables(content, promptPath);
@@ -34666,8 +34960,8 @@ function validateTemplateVariables(content, source) {
34666
34960
  }
34667
34961
  match = variablePattern.exec(content);
34668
34962
  }
34669
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
34670
- const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
34963
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT);
34964
+ const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
34671
34965
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
34672
34966
  if (!hasRequiredFields) {
34673
34967
  throw new Error(
@@ -34676,28 +34970,15 @@ function validateTemplateVariables(content, source) {
34676
34970
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
34677
34971
  );
34678
34972
  }
34679
- const deprecatedUsed = [];
34680
- for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
34681
- if (foundVariables.has(deprecated)) {
34682
- deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
34683
- }
34684
- }
34685
- if (deprecatedUsed.length > 0) {
34686
- console.warn(
34687
- `${ANSI_YELLOW3}Warning: Template at ${source} uses deprecated variable names:
34688
- ${deprecatedUsed.join("\n ")}
34689
- These still work but will be removed in a future version.${ANSI_RESET4}`
34690
- );
34691
- }
34692
34973
  if (invalidVariables.length > 0) {
34693
- const warningMessage = `${ANSI_YELLOW3}Warning: Custom grader template at ${source}
34974
+ const warningMessage = `Warning: Custom grader template at ${source}
34694
34975
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
34695
- Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET4}`;
34976
+ Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}`;
34696
34977
  console.warn(warningMessage);
34697
34978
  }
34698
34979
  }
34699
- var ANSI_YELLOW4 = "\x1B[33m";
34700
- var ANSI_RESET5 = "\x1B[0m";
34980
+ var ANSI_YELLOW22 = "\x1B[33m";
34981
+ var ANSI_RESET3 = "\x1B[0m";
34701
34982
  var MAX_ASSERTION_INCLUDE_DEPTH = 3;
34702
34983
  var PROMPT_FILE_PREFIX = "file://";
34703
34984
  function normalizeGraderType(type) {
@@ -35027,7 +35308,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
35027
35308
  let command;
35028
35309
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
35029
35310
  console.warn(
35030
- `${ANSI_YELLOW4}Warning: 'script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'command' instead.${ANSI_RESET5}`
35311
+ `${ANSI_YELLOW22}Warning: 'script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'command' instead.${ANSI_RESET3}`
35031
35312
  );
35032
35313
  }
35033
35314
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
@@ -35909,7 +36190,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
35909
36190
  if (isJsonObject2(rawPrompt)) {
35910
36191
  if (rawPrompt.script !== void 0 && rawPrompt.command === void 0) {
35911
36192
  console.warn(
35912
- `${ANSI_YELLOW4}Warning: 'prompt.script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'prompt.command' instead.${ANSI_RESET5}`
36193
+ `${ANSI_YELLOW22}Warning: 'prompt.script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'prompt.command' instead.${ANSI_RESET3}`
35913
36194
  );
35914
36195
  }
35915
36196
  const commandArray = asStringArray(
@@ -36210,10 +36491,10 @@ function warnUnconsumedCriteria(_criteria, _evaluators, _testId) {
36210
36491
  function logWarning2(message, details) {
36211
36492
  if (details && details.length > 0) {
36212
36493
  const detailBlock = details.join("\n");
36213
- console.warn(`${ANSI_YELLOW4}Warning: ${message}
36214
- ${detailBlock}${ANSI_RESET5}`);
36494
+ console.warn(`${ANSI_YELLOW22}Warning: ${message}
36495
+ ${detailBlock}${ANSI_RESET3}`);
36215
36496
  } else {
36216
- console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
36497
+ console.warn(`${ANSI_YELLOW22}Warning: ${message}${ANSI_RESET3}`);
36217
36498
  }
36218
36499
  }
36219
36500
  function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
@@ -36584,8 +36865,8 @@ function detectImageMediaType(filePath) {
36584
36865
  const ext = path43.extname(filePath).toLowerCase();
36585
36866
  return IMAGE_MEDIA_TYPES[ext];
36586
36867
  }
36587
- var ANSI_YELLOW5 = "\x1B[33m";
36588
- var ANSI_RESET6 = "\x1B[0m";
36868
+ var ANSI_YELLOW3 = "\x1B[33m";
36869
+ var ANSI_RESET4 = "\x1B[0m";
36589
36870
  async function processMessages(options) {
36590
36871
  const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
36591
36872
  const processedMessages = [];
@@ -36716,10 +36997,10 @@ function asString3(value) {
36716
36997
  function logWarning3(message, details) {
36717
36998
  if (details && details.length > 0) {
36718
36999
  const detailBlock = details.join("\n");
36719
- console.warn(`${ANSI_YELLOW5}Warning: ${message}
36720
- ${detailBlock}${ANSI_RESET6}`);
37000
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
37001
+ ${detailBlock}${ANSI_RESET4}`);
36721
37002
  } else {
36722
- console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET6}`);
37003
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
36723
37004
  }
36724
37005
  }
36725
37006
  async function processExpectedMessages(options) {
@@ -36908,9 +37189,9 @@ function resolveInputMessages(raw, suiteInputFiles) {
36908
37189
  function resolveExpectedMessages(raw) {
36909
37190
  return expandExpectedOutputShorthand(raw.expected_output);
36910
37191
  }
36911
- var ANSI_YELLOW6 = "\x1B[33m";
37192
+ var ANSI_YELLOW4 = "\x1B[33m";
36912
37193
  var ANSI_RED2 = "\x1B[31m";
36913
- var ANSI_RESET7 = "\x1B[0m";
37194
+ var ANSI_RESET5 = "\x1B[0m";
36914
37195
  function matchesFilter(id, filter) {
36915
37196
  return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
36916
37197
  }
@@ -37100,19 +37381,19 @@ function asString4(value) {
37100
37381
  function logWarning4(message, details) {
37101
37382
  if (details && details.length > 0) {
37102
37383
  const detailBlock = details.join("\n");
37103
- console.warn(`${ANSI_YELLOW6}Warning: ${message}
37104
- ${detailBlock}${ANSI_RESET7}`);
37384
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}
37385
+ ${detailBlock}${ANSI_RESET5}`);
37105
37386
  } else {
37106
- console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET7}`);
37387
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
37107
37388
  }
37108
37389
  }
37109
37390
  function logError2(message, details) {
37110
37391
  if (details && details.length > 0) {
37111
37392
  const detailBlock = details.join("\n");
37112
37393
  console.error(`${ANSI_RED2}Error: ${message}
37113
- ${detailBlock}${ANSI_RESET7}`);
37394
+ ${detailBlock}${ANSI_RESET5}`);
37114
37395
  } else {
37115
- console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
37396
+ console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET5}`);
37116
37397
  }
37117
37398
  }
37118
37399
  var MetadataSchema = external_exports.object({
@@ -37324,9 +37605,9 @@ function buildChatPromptFromSegments(options) {
37324
37605
  }
37325
37606
  return chatPrompt.length > 0 ? chatPrompt : void 0;
37326
37607
  }
37327
- var ANSI_YELLOW7 = "\x1B[33m";
37608
+ var ANSI_YELLOW5 = "\x1B[33m";
37328
37609
  var ANSI_RED3 = "\x1B[31m";
37329
- var ANSI_RESET8 = "\x1B[0m";
37610
+ var ANSI_RESET6 = "\x1B[0m";
37330
37611
  function matchesFilter2(id, filter) {
37331
37612
  return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
37332
37613
  }
@@ -37404,7 +37685,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
37404
37685
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
37405
37686
  }
37406
37687
  if (format === "typescript") {
37407
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-TJT6BGFF-DI7XNSO4.js");
37688
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-NWH3B4HG-UXXCZKLP.js");
37408
37689
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
37409
37690
  }
37410
37691
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -37439,7 +37720,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
37439
37720
  return loadTestsFromAgentSkills(evalFilePath);
37440
37721
  }
37441
37722
  if (format === "typescript") {
37442
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-TJT6BGFF-DI7XNSO4.js");
37723
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-NWH3B4HG-UXXCZKLP.js");
37443
37724
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
37444
37725
  return suite.tests;
37445
37726
  }
@@ -38151,19 +38432,19 @@ function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
38151
38432
  function logWarning5(message, details) {
38152
38433
  if (details && details.length > 0) {
38153
38434
  const detailBlock = details.join("\n");
38154
- console.warn(`${ANSI_YELLOW7}Warning: ${message}
38155
- ${detailBlock}${ANSI_RESET8}`);
38435
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
38436
+ ${detailBlock}${ANSI_RESET6}`);
38156
38437
  } else {
38157
- console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET8}`);
38438
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET6}`);
38158
38439
  }
38159
38440
  }
38160
38441
  function logError3(message, details) {
38161
38442
  if (details && details.length > 0) {
38162
38443
  const detailBlock = details.join("\n");
38163
38444
  console.error(`${ANSI_RED3}Error: ${message}
38164
- ${detailBlock}${ANSI_RESET8}`);
38445
+ ${detailBlock}${ANSI_RESET6}`);
38165
38446
  } else {
38166
- console.error(`${ANSI_RED3}Error: ${message}${ANSI_RESET8}`);
38447
+ console.error(`${ANSI_RED3}Error: ${message}${ANSI_RESET6}`);
38167
38448
  }
38168
38449
  }
38169
38450
  var execFileAsync2 = promisify6(execFile2);
@@ -38266,18 +38547,18 @@ function validateDependencyGraph(tests) {
38266
38547
  }
38267
38548
  const visited = /* @__PURE__ */ new Set();
38268
38549
  const visiting = /* @__PURE__ */ new Set();
38269
- function visit(id, path49) {
38550
+ function visit(id, path50) {
38270
38551
  if (visiting.has(id)) {
38271
- const cycle = [...path49.slice(path49.indexOf(id)), id];
38552
+ const cycle = [...path50.slice(path50.indexOf(id)), id];
38272
38553
  throw new Error(`Circular dependency detected: ${cycle.join(" \u2192 ")}`);
38273
38554
  }
38274
38555
  if (visited.has(id)) return;
38275
38556
  visiting.add(id);
38276
- path49.push(id);
38557
+ path50.push(id);
38277
38558
  for (const dep of depMap.get(id) ?? []) {
38278
- visit(dep, path49);
38559
+ visit(dep, path50);
38279
38560
  }
38280
- path49.pop();
38561
+ path50.pop();
38281
38562
  visiting.delete(id);
38282
38563
  visited.add(id);
38283
38564
  }
@@ -41201,6 +41482,1179 @@ function createFunctionProvider(taskFn) {
41201
41482
  }
41202
41483
  };
41203
41484
  }
41485
+ function dropUndefined4(value) {
41486
+ return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== void 0));
41487
+ }
41488
+ function toTranscriptTokenUsage(usage) {
41489
+ if (!usage) {
41490
+ return void 0;
41491
+ }
41492
+ return dropUndefined4({
41493
+ input: usage.input,
41494
+ output: usage.output,
41495
+ cached: usage.cached,
41496
+ reasoning: usage.reasoning
41497
+ });
41498
+ }
41499
+ function toTranscriptToolCall(toolCall) {
41500
+ return dropUndefined4({
41501
+ tool: toolCall.tool,
41502
+ input: toolCall.input,
41503
+ output: toolCall.output,
41504
+ id: toolCall.id,
41505
+ start_time: toolCall.startTime,
41506
+ end_time: toolCall.endTime,
41507
+ duration_ms: toolCall.durationMs
41508
+ });
41509
+ }
41510
+ function toTranscriptMessageFields(message) {
41511
+ return dropUndefined4({
41512
+ role: message.role,
41513
+ name: message.name,
41514
+ content: message.content,
41515
+ tool_calls: message.toolCalls?.map(toTranscriptToolCall),
41516
+ start_time: message.startTime,
41517
+ end_time: message.endTime,
41518
+ duration_ms: message.durationMs,
41519
+ metadata: message.metadata,
41520
+ token_usage: toTranscriptTokenUsage(message.tokenUsage)
41521
+ });
41522
+ }
41523
+ function toTranscriptJsonLines(entry, options) {
41524
+ const source = {
41525
+ provider: entry.source.provider,
41526
+ session_id: entry.source.sessionId,
41527
+ model: entry.source.model,
41528
+ timestamp: entry.source.startedAt,
41529
+ git_branch: entry.source.gitBranch,
41530
+ cwd: entry.source.cwd ?? entry.source.projectPath,
41531
+ version: entry.source.version
41532
+ };
41533
+ const transcriptTokenUsage = entry.tokenUsage ? {
41534
+ input: entry.tokenUsage.input,
41535
+ output: entry.tokenUsage.output,
41536
+ cached: entry.tokenUsage.cached,
41537
+ reasoning: entry.tokenUsage.reasoning
41538
+ } : void 0;
41539
+ const testId = options?.testId ?? entry.source.sessionId;
41540
+ const target = options?.target ?? entry.source.provider;
41541
+ return entry.messages.map((message, index) => ({
41542
+ test_id: testId,
41543
+ target,
41544
+ message_index: index,
41545
+ ...toTranscriptMessageFields(message),
41546
+ transcript_token_usage: transcriptTokenUsage,
41547
+ transcript_duration_ms: entry.durationMs,
41548
+ transcript_cost_usd: entry.costUsd,
41549
+ source
41550
+ }));
41551
+ }
41552
+ function traceToTranscriptJsonLines(trace, options) {
41553
+ const provider = (typeof trace.metadata?.provider === "string" ? trace.metadata.provider : void 0) ?? options?.target ?? "agentv";
41554
+ const sessionId = (typeof trace.metadata?.provider_session_id === "string" ? trace.metadata.provider_session_id : void 0) ?? (typeof trace.metadata?.eval_case_id === "string" ? trace.metadata.eval_case_id : void 0) ?? options?.testId ?? "trace";
41555
+ return toTranscriptJsonLines(
41556
+ {
41557
+ messages: [...trace.messages],
41558
+ source: {
41559
+ provider,
41560
+ sessionId,
41561
+ startedAt: trace.startTime
41562
+ },
41563
+ tokenUsage: trace.tokenUsage,
41564
+ durationMs: trace.durationMs,
41565
+ costUsd: trace.costUsd
41566
+ },
41567
+ options
41568
+ );
41569
+ }
41570
+ function traceFromTranscriptJsonLines(lines) {
41571
+ const [entry] = groupTranscriptJsonLines(lines);
41572
+ if (!entry) {
41573
+ return buildTraceFromMessages();
41574
+ }
41575
+ return buildTraceFromMessages({
41576
+ output: entry.messages,
41577
+ tokenUsage: entry.tokenUsage,
41578
+ durationMs: entry.durationMs,
41579
+ costUsd: entry.costUsd ?? void 0,
41580
+ startTime: entry.source.startedAt,
41581
+ provider: entry.source.provider,
41582
+ target: entry.target,
41583
+ testId: entry.testId,
41584
+ conversationId: entry.source.sessionId
41585
+ });
41586
+ }
41587
+ function fromTranscriptTokenUsage(usage) {
41588
+ if (!usage) {
41589
+ return void 0;
41590
+ }
41591
+ return {
41592
+ input: usage.input,
41593
+ output: usage.output,
41594
+ cached: usage.cached,
41595
+ reasoning: usage.reasoning
41596
+ };
41597
+ }
41598
+ function readOptionalString(record2, key) {
41599
+ const value = record2[key];
41600
+ return typeof value === "string" ? value : void 0;
41601
+ }
41602
+ function readOptionalNumber(record2, key) {
41603
+ const value = record2[key];
41604
+ return typeof value === "number" && Number.isFinite(value) ? value : void 0;
41605
+ }
41606
+ function fromTranscriptToolCall(wire) {
41607
+ const tool = readOptionalString(wire, "tool");
41608
+ if (!tool) {
41609
+ return void 0;
41610
+ }
41611
+ return {
41612
+ tool,
41613
+ input: wire.input,
41614
+ output: wire.output,
41615
+ id: readOptionalString(wire, "id"),
41616
+ startTime: readOptionalString(wire, "start_time"),
41617
+ endTime: readOptionalString(wire, "end_time"),
41618
+ durationMs: readOptionalNumber(wire, "duration_ms")
41619
+ };
41620
+ }
41621
+ function buildReplayMessage(line) {
41622
+ return {
41623
+ role: line.role,
41624
+ name: line.name,
41625
+ content: line.content,
41626
+ toolCalls: line.tool_calls?.map(fromTranscriptToolCall).filter((toolCall) => toolCall !== void 0),
41627
+ startTime: line.start_time,
41628
+ endTime: line.end_time,
41629
+ durationMs: line.duration_ms,
41630
+ metadata: line.metadata,
41631
+ tokenUsage: fromTranscriptTokenUsage(line.token_usage)
41632
+ };
41633
+ }
41634
+ function groupTranscriptJsonLines(lines) {
41635
+ const grouped = /* @__PURE__ */ new Map();
41636
+ for (const line of lines) {
41637
+ const existing = grouped.get(line.test_id);
41638
+ const source = {
41639
+ provider: line.source.provider,
41640
+ sessionId: line.source.session_id,
41641
+ startedAt: line.source.timestamp,
41642
+ model: line.source.model,
41643
+ gitBranch: line.source.git_branch,
41644
+ cwd: line.source.cwd,
41645
+ version: line.source.version
41646
+ };
41647
+ const transcriptTokenUsage = line.transcript_token_usage ? {
41648
+ input: line.transcript_token_usage.input,
41649
+ output: line.transcript_token_usage.output,
41650
+ cached: line.transcript_token_usage.cached,
41651
+ reasoning: line.transcript_token_usage.reasoning
41652
+ } : void 0;
41653
+ if (existing) {
41654
+ existing.messages.push({ index: line.message_index, message: buildReplayMessage(line) });
41655
+ continue;
41656
+ }
41657
+ grouped.set(line.test_id, {
41658
+ target: line.target,
41659
+ tokenUsage: transcriptTokenUsage,
41660
+ durationMs: line.transcript_duration_ms,
41661
+ costUsd: line.transcript_cost_usd,
41662
+ source,
41663
+ messages: [{ index: line.message_index, message: buildReplayMessage(line) }]
41664
+ });
41665
+ }
41666
+ return [...grouped.entries()].map(([testId, entry]) => ({
41667
+ testId,
41668
+ target: entry.target,
41669
+ tokenUsage: entry.tokenUsage,
41670
+ durationMs: entry.durationMs,
41671
+ costUsd: entry.costUsd,
41672
+ source: entry.source,
41673
+ messages: entry.messages.sort((first, second) => first.index - second.index).map((item) => item.message)
41674
+ }));
41675
+ }
41676
+ async function readTranscriptJsonl(filePath) {
41677
+ const text = await readFile19(filePath, "utf8");
41678
+ return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
41679
+ }
41680
+ async function readTranscriptFile(filePath) {
41681
+ return readFile19(filePath, "utf8");
41682
+ }
41683
+ var ResultRowSchemaError = class extends Error {
41684
+ constructor(message) {
41685
+ super(message);
41686
+ this.name = "ResultRowSchemaError";
41687
+ }
41688
+ };
41689
+ var MIGRATION_GUIDANCE = "Expected an AgentV result row with a numeric score. Eval-case JSONL is input data, not a results artifact. Run `agentv eval <eval-file> --output <run-dir>` and pass the run workspace or its index.jsonl manifest.";
41690
// camelCase -> snake_case aliases recognised on a top-level result row.
// Entry order is preserved because the table is iterated in insertion order.
var RESULT_ROW_ALIASES = Object.fromEntries([
  ["answerPath", "answer_path"],
  ["artifactDir", "artifact_dir"],
  ["conversationId", "conversation_id"],
  ["costUsd", "cost_usd"],
  ["durationMs", "duration_ms"],
  ["endTime", "end_time"],
  ["evalPath", "eval_path"],
  ["executionStatus", "execution_status"],
  ["failureReasonCode", "failure_reason_code"],
  ["failureStage", "failure_stage"],
  ["filesPath", "files_path"],
  ["gradersPath", "graders_path"],
  ["gradingPath", "grading_path"],
  ["inputPath", "input_path"],
  ["outputPath", "output_path"],
  ["responsePath", "response_path"],
  ["startTime", "start_time"],
  ["targetsPath", "targets_path"],
  ["taskDir", "task_dir"],
  ["testId", "test_id"],
  ["timingPath", "timing_path"],
  ["tokenUsage", "token_usage"],
  ["transcriptPath", "transcript_path"],
  ["workspacePath", "workspace_path"]
]);
41716
// camelCase -> snake_case aliases recognised on a trace summary object.
var TRACE_SUMMARY_ALIASES = Object.fromEntries([
  ["costUsd", "cost_usd"],
  ["durationMs", "duration_ms"],
  ["errorCount", "error_count"],
  ["eventCount", "event_count"],
  ["llmCallCount", "llm_call_count"],
  ["tokenUsage", "token_usage"],
  ["toolCalls", "tool_calls"],
  ["toolDurations", "tool_durations"]
]);
41726
// camelCase -> snake_case aliases recognised on a message object.
var MESSAGE_ALIASES = Object.fromEntries([
  ["durationMs", "duration_ms"],
  ["endTime", "end_time"],
  ["startTime", "start_time"],
  ["tokenUsage", "token_usage"],
  ["toolCalls", "tool_calls"]
]);
41733
// camelCase -> snake_case aliases recognised on a tool-call object.
var TOOL_CALL_ALIASES = Object.fromEntries([
  ["durationMs", "duration_ms"],
  ["endTime", "end_time"],
  ["startTime", "start_time"]
]);
41738
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for objects that are neither null nor arrays.
 */
function isRecord3(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
41741
/**
 * Returns a shallow copy of `value` in which every known camelCase alias has
 * been folded into its snake_case key.
 *
 * An existing (non-undefined) snake_case value wins over the camelCase one.
 * The camelCase key is always removed from the copy unless both names are
 * identical. The input object is not modified.
 *
 * @param {object} value - Source object.
 * @param {Record<string, string>} aliases - Map of camelCase key to snake_case key.
 * @returns {object} The normalized copy.
 */
function normalizeKnownAliases(value, aliases) {
  const copy = { ...value };
  Object.entries(aliases).forEach(([camelKey, snakeKey]) => {
    const snakeMissing = copy[snakeKey] === undefined;
    const camelPresent = copy[camelKey] !== undefined;
    if (snakeMissing && camelPresent) {
      copy[snakeKey] = copy[camelKey];
    }
    if (camelKey === snakeKey) {
      return;
    }
    delete copy[camelKey];
  });
  return copy;
}
41753
/**
 * Applies the tool-call alias table to a tool-call object.
 *
 * @param {unknown} value - Candidate tool call.
 * @returns {unknown} A normalized copy for objects; any other value unchanged.
 */
function normalizeToolCall2(value) {
  return isRecord3(value) ? normalizeKnownAliases(value, TOOL_CALL_ALIASES) : value;
}
41759
/**
 * Applies the message alias table to a message object and, when it carries a
 * `tool_calls` array, normalizes each tool call as well.
 *
 * @param {unknown} value - Candidate message.
 * @returns {unknown} A normalized copy for objects; any other value unchanged.
 */
function normalizeMessage(value) {
  if (isRecord3(value)) {
    const message = normalizeKnownAliases(value, MESSAGE_ALIASES);
    const { tool_calls: toolCalls } = message;
    if (Array.isArray(toolCalls)) {
      message.tool_calls = toolCalls.map((toolCall) => normalizeToolCall2(toolCall));
    }
    return message;
  }
  return value;
}
41769
/**
 * Applies the trace-summary alias table to a trace object and, when it carries
 * a `messages` array, normalizes each message as well.
 *
 * @param {unknown} value - Candidate trace summary.
 * @returns {unknown} A normalized copy for objects; any other value unchanged.
 */
function normalizeTraceSummary(value) {
  if (isRecord3(value)) {
    const summary = normalizeKnownAliases(value, TRACE_SUMMARY_ALIASES);
    const { messages } = summary;
    if (Array.isArray(messages)) {
      summary.messages = messages.map((message) => normalizeMessage(message));
    }
    return summary;
  }
  return value;
}
41779
/**
 * Normalizes every message of an output array.
 *
 * @param {unknown} value - Candidate output value.
 * @returns {unknown} A new array of normalized messages, or the value unchanged
 *   when it is not an array.
 */
function normalizeOutput(value) {
  return Array.isArray(value) ? value.map((message) => normalizeMessage(message)) : value;
}
41785
/**
 * Builds the error reported for a row that is not a supported result row.
 *
 * @param {{ sourceLabel?: string, lineNumber?: number }} context - Optional
 *   source label and line number included in the message when present.
 * @returns {ResultRowSchemaError} The error (not thrown here).
 */
function buildSchemaError(context) {
  let location = "";
  if (context.sourceLabel) {
    location += ` in ${context.sourceLabel}`;
  }
  if (context.lineNumber !== undefined) {
    location += ` at line ${context.lineNumber}`;
  }
  return new ResultRowSchemaError(`Unsupported result row${location}. ${MIGRATION_GUIDANCE}`);
}
41792
/**
 * Builds the error reported for a result row whose score is missing or invalid.
 *
 * @param {{ sourceLabel?: string, lineNumber?: number }} context - Optional
 *   source label and line number included in the message when present.
 * @returns {ResultRowSchemaError} The error (not thrown here).
 */
function buildInvalidScoreError(context) {
  let location = "";
  if (context.sourceLabel) {
    location += ` in ${context.sourceLabel}`;
  }
  if (context.lineNumber !== undefined) {
    location += ` at line ${context.lineNumber}`;
  }
  return new ResultRowSchemaError(`Missing or invalid score in result row${location}.`);
}
41799
/**
 * Heuristic: does this object carry any field that marks it as a result row?
 *
 * A string `test_id` qualifies, as does the mere presence (own property) of
 * `score`, `trace`, `spans`, `target`, `grading_path` or `timing_path`.
 *
 * @param {object} value - Normalized (snake_case) row.
 * @returns {boolean} True when at least one marker is present.
 */
function looksLikeResultRow(value) {
  if (typeof value.test_id === "string") {
    return true;
  }
  const markerKeys = ["score", "trace", "spans", "target", "grading_path", "timing_path"];
  return markerKeys.some((key) => Object.hasOwn(value, key));
}
41802
/**
 * Validates a parsed row and converts it to the canonical snake_case shape.
 *
 * @param {unknown} value - Parsed row.
 * @param {{ sourceLabel?: string, lineNumber?: number }} [context] - Location
 *   details used in error messages.
 * @returns {object} Normalized copy of the row, with `trace` and `output`
 *   normalized when present.
 * @throws {ResultRowSchemaError} When the value is not an object, or when its
 *   `score` is not a finite number.
 */
function normalizeResultRow(value, context = {}) {
  if (!isRecord3(value)) {
    throw buildSchemaError(context);
  }
  const row = normalizeKnownAliases(value, RESULT_ROW_ALIASES);
  if (row.trace !== undefined) {
    row.trace = normalizeTraceSummary(row.trace);
  }
  if (row.output !== undefined) {
    row.output = normalizeOutput(row.output);
  }
  const hasFiniteScore = typeof row.score === "number" && Number.isFinite(row.score);
  if (hasFiniteScore) {
    return row;
  }
  // Rows that look like results get the specific score error; anything else
  // gets the generic schema error.
  throw looksLikeResultRow(row) ? buildInvalidScoreError(context) : buildSchemaError(context);
}
41821
// File name of the per-run JSONL result index.
var RESULT_INDEX_FILENAME = "index.jsonl";
41822
/**
 * Builds the `<testId>::<target>` key used to identify a result.
 *
 * @param {string | null | undefined} testId - Test id; nullish becomes "unknown".
 * @param {string | null | undefined} target - Target name; nullish becomes "unknown".
 * @returns {string} The composite key.
 */
function buildTestTargetKey(testId, target) {
  const idPart = testId ?? "unknown";
  const targetPart = target ?? "unknown";
  return `${idPart}::${targetPart}`;
}
41825
/**
 * Keeps only the last result for each (testId, target) pair.
 *
 * Survivors keep their relative order from the input.
 *
 * @param {Array<{ testId?: string, target?: string }>} results - Results in
 *   file order.
 * @returns {Array<object>} New array without superseded entries.
 */
function deduplicateByTestIdTarget(results) {
  const keyOf = (result) => buildTestTargetKey(result.testId, result.target);
  // First pass: remember the index of the last occurrence of every key.
  const lastIndexByKey = new Map();
  results.forEach((result, index) => {
    lastIndexByKey.set(keyOf(result), index);
  });
  // Second pass: keep an entry only when it is that last occurrence.
  return results.filter((result, index) => lastIndexByKey.get(keyOf(result)) === index);
}
41839
/**
 * Rebuilds the run-level `timing.json` and `benchmark.json` from the run's
 * result index.
 *
 * The index is parsed and de-duplicated per (testId, target) before
 * aggregation. When `options.plannedTestCount` is not given, the value is read
 * from the existing `benchmark.json` before that file is overwritten.
 *
 * @param {string} runDir - Directory containing the result index.
 * @param {{ plannedTestCount?: number, evalFile?: string, experiment?: string }} [options]
 * @returns {Promise<{ benchmarkPath: string, timingPath: string, testCount: number, targetCount: number }>}
 */
async function aggregateRunDir(runDir, options) {
  const inRunDir = (fileName) => path47.join(runDir, fileName);
  const toJsonText = (value) => `${JSON.stringify(value, null, 2)}\n`;

  const indexContent = await readFile20(inRunDir(RESULT_INDEX_FILENAME), "utf8");
  const results = deduplicateByTestIdTarget(parseJsonlResults(indexContent));

  const timingPath = inRunDir("timing.json");
  await writeFile10(timingPath, toJsonText(buildTimingArtifact(results)), "utf8");

  const benchmarkPath = inRunDir("benchmark.json");
  const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(benchmarkPath);
  const benchmark = buildBenchmarkArtifact(
    results,
    options?.evalFile,
    options?.experiment,
    plannedTestCount
  );
  await writeFile10(benchmarkPath, toJsonText(benchmark), "utf8");

  const distinctTargets = new Set(results.map((r) => r.target ?? "unknown"));
  return {
    benchmarkPath,
    timingPath,
    testCount: results.length,
    targetCount: distinctTargets.size
  };
}
41861
/**
 * Best-effort read of `metadata.planned_test_count` from a benchmark JSON file.
 *
 * @param {string} benchmarkPath - Path of the benchmark file.
 * @returns {Promise<number | undefined>} The stored count when it is a finite
 *   number; undefined when the file cannot be read or parsed, or the value is
 *   missing or not a finite number.
 */
async function readPlannedTestCount(benchmarkPath) {
  let planned;
  try {
    const parsed = JSON.parse(await readFile20(benchmarkPath, "utf8"));
    planned = parsed.metadata?.planned_test_count;
  } catch {
    // Any read/parse failure is treated as "no planned count available".
    return undefined;
  }
  if (typeof planned !== "number") {
    return undefined;
  }
  return Number.isFinite(planned) ? planned : undefined;
}
41871
/**
 * Computes the mean and population standard deviation of a list of numbers,
 * each rounded to three decimals.
 *
 * @param {number[]} values - Samples.
 * @returns {{ mean: number, stddev: number }} Both are 0 for an empty list.
 */
function computeStats(values) {
  const count = values.length;
  if (count === 0) {
    return { mean: 0, stddev: 0 };
  }
  const roundTo3 = (x) => Math.round(x * 1e3) / 1e3;

  let total = 0;
  for (const v of values) {
    total += v;
  }
  const mean = total / count;

  let squaredDeviationSum = 0;
  for (const v of values) {
    squaredDeviationSum += (v - mean) ** 2;
  }
  // Divides by N (not N - 1): population variance.
  const variance = squaredDeviationSum / count;

  return {
    mean: roundTo3(mean),
    stddev: roundTo3(Math.sqrt(variance))
  };
}
41882
/**
 * Computes the pass rate of one result.
 *
 * With per-grader scores present, returns the fraction of graders whose score
 * reaches DEFAULT_THRESHOLD. Otherwise returns 1 or 0 depending on whether the
 * overall score (nullish treated as 0) reaches the threshold.
 *
 * @param {{ scores?: Array<{ score: number }>, score?: number }} result
 * @returns {number} A value between 0 and 1.
 */
function computePassRate(result) {
  const graderScores = result.scores;
  const hasGraders = Boolean(graderScores) && graderScores.length > 0;
  if (!hasGraders) {
    return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
  }
  let passing = 0;
  for (const grader of graderScores) {
    if (grader.score >= DEFAULT_THRESHOLD) {
      passing += 1;
    }
  }
  return passing / graderScores.length;
}
41890
/**
 * Tells whether a result is marked as an execution error.
 *
 * @param {{ executionStatus?: string }} result
 * @returns {boolean} True when `executionStatus` is "execution_error".
 */
function isExecutionError(result) {
  const { executionStatus } = result;
  return executionStatus === "execution_error";
}
41893
/**
 * Copies the per-tool call counts of a result's trace and sums them.
 *
 * @param {{ trace?: { toolCalls?: Record<string, number> } }} result
 * @returns {{ toolCalls: Record<string, number>, total: number }} A shallow
 *   copy of the counts (empty when absent) and their sum.
 */
function countToolCalls(result) {
  const source = result.trace?.toolCalls ?? {};
  const toolCalls = { ...source };
  let total = 0;
  for (const count of Object.values(toolCalls)) {
    total += count;
  }
  return { toolCalls, total };
}
41898
/**
 * Summarizes a diff text: counts created and modified files and keeps a
 * shortened copy of the text.
 *
 * A line starting with "--- /dev/null" counts as a created file; a line
 * starting with "--- a/" counts as a modified file. The summary keeps the
 * first 20 lines and appends a "... (N more lines)" marker when the text is
 * longer.
 *
 * @param {string | undefined | null} fileChanges - Diff text.
 * @returns {{ files_modified: number, files_created: number, diff_summary: string } | undefined}
 *   Undefined when `fileChanges` is empty or missing.
 */
function parseWorkspaceChanges(fileChanges) {
  if (!fileChanges) {
    return undefined;
  }
  const lines = fileChanges.split("\n");
  const filesCreated = lines.filter((line) => line.startsWith("--- /dev/null")).length;
  const filesModified = lines.filter((line) => line.startsWith("--- a/")).length;

  const hiddenLineCount = lines.length - 20;
  const diffSummary = hiddenLineCount > 0
    ? `${lines.slice(0, 20).join("\n")}\n... (${hiddenLineCount} more lines)`
    : fileChanges;

  return {
    files_modified: filesModified,
    files_created: filesCreated,
    diff_summary: diffSummary
  };
}
41921
/**
 * Projects a result's assertions to `{ text, passed, evidence }` entries.
 *
 * @param {{ assertions?: Array<{ text: string, passed: boolean, evidence?: string }> }} result
 * @returns {Array<{ text: string, passed: boolean, evidence: string }>} Empty
 *   when the result has no assertions; nullish evidence becomes "".
 */
function buildAssertions(result) {
  const source = result.assertions;
  if (!source) {
    return [];
  }
  return source.map(({ text, passed, evidence }) => ({
    text,
    passed,
    evidence: evidence ?? ""
  }));
}
41929
/**
 * Projects grader scores to the grader entries stored in a grading artifact.
 *
 * @param {Array<object> | undefined} scores - Per-grader scores.
 * @returns {Array<object> | undefined} One entry per score with an empty
 *   `reasoning` string; undefined when there are no scores.
 */
function buildEvaluators(scores) {
  if (!scores || scores.length === 0) {
    return undefined;
  }
  return scores.map(({ name, type, score, weight, verdict, assertions, details }) => ({
    name,
    type,
    score,
    reasoning: "",
    weight,
    verdict,
    assertions,
    details
  }));
}
41944
/**
 * Picks the fields of an assertion that are written to the result index.
 *
 * @param {{ text: string, passed: boolean, evidence?: string }} assertion
 * @returns {{ text: string, passed: boolean, evidence: string | undefined }}
 */
function toIndexAssertion(assertion) {
  const { text, passed, evidence } = assertion;
  return { text, passed, evidence };
}
41951
/**
 * Converts one grader score to the snake_case shape written to the result
 * index, recursing into nested `scores`.
 *
 * A score without an `assertions` array is tolerated and yields
 * `assertions: undefined`, mirroring how nested `scores` are already treated,
 * instead of throwing while the index is being written.
 *
 * @param {object} score - Grader score in camelCase form.
 * @returns {object} The index representation of the score.
 */
function toIndexScore(score) {
  return {
    name: score.name,
    type: score.type,
    score: score.score,
    weight: score.weight,
    verdict: score.verdict,
    // Optional: scores read back from older or hand-written files may omit it.
    assertions: score.assertions?.map((assertion) => toIndexAssertion(assertion)),
    raw_request: score.rawRequest,
    input: score.input,
    target: score.target,
    scores: score.scores?.map((nested) => toIndexScore(nested)),
    details: score.details,
    token_usage: score.tokenUsage,
    duration_ms: score.durationMs,
    started_at: score.startedAt,
    ended_at: score.endedAt
  };
}
41970
/**
 * Converts a list of grader scores to their index representation.
 *
 * @param {Array<object> | null | undefined} scores
 * @returns {Array<object> | undefined} Undefined when `scores` is nullish.
 */
function toIndexScores(scores) {
  if (scores == null) {
    return undefined;
  }
  return scores.map(toIndexScore);
}
41973
/**
 * Returns a new object holding only the entries whose value is not undefined.
 *
 * @param {object} value - Source object.
 * @returns {object} Filtered copy; `null` and other falsy values are kept.
 */
function dropUndefined5(value) {
  const definedPairs = [];
  for (const pair of Object.entries(value)) {
    if (pair[1] !== undefined) {
      definedPairs.push(pair);
    }
  }
  return Object.fromEntries(definedPairs);
}
41976
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for objects that are neither null nor arrays.
 */
function isRecord4(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
41979
/**
 * Converts a rerun-source descriptor to its snake_case index form, leaving out
 * fields that are undefined.
 *
 * @param {unknown} value - Candidate rerun-source object (camelCase).
 * @returns {object | undefined} Undefined when `value` is not an object.
 */
function toIndexRerunSource(value) {
  if (!isRecord4(value)) {
    return undefined;
  }
  const {
    mode,
    sourceRunDir,
    sourceIndexPath,
    sourceArtifactDir,
    sourceTaskDir,
    sourceTestId,
    sourceTarget,
    sourceTimestamp
  } = value;
  return dropUndefined5({
    mode,
    source_run_dir: sourceRunDir,
    source_index_path: sourceIndexPath,
    source_artifact_dir: sourceArtifactDir,
    source_task_dir: sourceTaskDir,
    source_test_id: sourceTestId,
    source_target: sourceTarget,
    source_timestamp: sourceTimestamp
  });
}
41994
/**
 * Prepares result metadata for the index.
 *
 * When `metadata.rerunSource` converts to an index rerun source, the
 * `rerunSource` key is replaced by `rerun_source`; otherwise a shallow copy of
 * the metadata is returned unchanged.
 *
 * @param {object | null | undefined} metadata
 * @returns {object | undefined} Undefined when `metadata` is falsy.
 */
function toIndexMetadata(metadata) {
  if (!metadata) {
    return undefined;
  }
  const rerunSource = toIndexRerunSource(metadata.rerunSource);
  if (!rerunSource) {
    return { ...metadata };
  }
  const remainingEntries = Object.entries(metadata).filter(([key]) => key !== "rerunSource");
  return {
    ...Object.fromEntries(remainingEntries),
    rerun_source: rerunSource
  };
}
42007
/**
 * Builds the per-test grading artifact for one result.
 *
 * It combines the assertion list and its pass/fail summary, tool-call
 * execution metrics, grader entries, a workspace-change summary and, when the
 * result has a conversation id, the number of assistant messages in its trace.
 *
 * @param {object} result - Evaluation result.
 * @returns {object} The grading artifact.
 */
function buildGradingArtifact(result) {
  const assertions = buildAssertions(result);
  const total = assertions.length;
  let passed = 0;
  for (const assertion of assertions) {
    if (assertion.passed) {
      passed += 1;
    }
  }
  const failed = total - passed;
  const passRate = total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0;

  const { toolCalls, total: totalToolCalls } = countToolCalls(result);
  const errorsEncountered = result.error ? 1 : 0;

  const graders = buildEvaluators(result.scores);
  const workspaceChanges = parseWorkspaceChanges(result.fileChanges);

  let conversation;
  if (result.conversationId) {
    const assistantTurns = result.trace?.messages.filter((message) => message.role === "assistant").length;
    conversation = {
      turns: assistantTurns ?? 0,
      conversation_id: result.conversationId
    };
  }

  return {
    assertions,
    summary: { passed, failed, total, pass_rate: passRate },
    execution_metrics: {
      tool_calls: toolCalls,
      total_tool_calls: totalToolCalls,
      errors_encountered: errorsEncountered
    },
    graders,
    workspace_changes: workspaceChanges,
    conversation
  };
}
42035
/**
 * Sums token usage and duration over a list of results.
 *
 * `total_tokens` is input plus output tokens; reasoning tokens are reported
 * separately under `token_usage.reasoning`.
 *
 * @param {Array<{ tokenUsage?: { input?: number, output?: number, reasoning?: number }, durationMs?: number }>} results
 * @returns {{ total_tokens: number, duration_ms: number, total_duration_seconds: number,
 *   token_usage: { input: number, output: number, reasoning: number } }}
 */
function buildTimingArtifact(results) {
  const totals = { input: 0, output: 0, reasoning: 0, durationMs: 0 };
  results.forEach(({ tokenUsage, durationMs }) => {
    if (tokenUsage) {
      totals.input += tokenUsage.input ?? 0;
      totals.output += tokenUsage.output ?? 0;
      totals.reasoning += tokenUsage.reasoning ?? 0;
    }
    if (durationMs != null) {
      totals.durationMs += durationMs;
    }
  });
  const seconds = totals.durationMs / 1e3;
  return {
    total_tokens: totals.input + totals.output,
    duration_ms: totals.durationMs,
    // Seconds rounded to millisecond precision.
    total_duration_seconds: Math.round(seconds * 1e3) / 1e3,
    token_usage: {
      input: totals.input,
      output: totals.output,
      reasoning: totals.reasoning
    }
  };
}
42062
/**
 * Builds the run-level benchmark artifact: per-target statistics, per-grader
 * score statistics, notes and run metadata.
 *
 * Results are grouped by `target ?? "unknown"`, so results without a target
 * contribute to the "unknown" summary. (Previously they were listed as the
 * "unknown" target but never matched when that target's rows were selected,
 * which produced an all-zero summary.)
 *
 * Execution errors are excluded from `pass_rate` and from per-grader
 * statistics, but still count toward timing, token, tool-call and cost stats.
 *
 * @param {Array<object>} results - Evaluation results.
 * @param {string} [evalFile=""] - Eval file recorded in the metadata.
 * @param {string} [experiment] - Experiment name recorded in the metadata.
 * @param {number} [plannedTestCount] - Planned test count recorded in the metadata.
 * @returns {{ metadata: object, run_summary: object, per_grader_summary: object | undefined, notes: string[] }}
 */
function buildBenchmarkArtifact(results, evalFile = "", experiment, plannedTestCount) {
  // Group once by normalized target so the listed targets and the rows used
  // for their statistics always agree.
  const resultsByTarget = /* @__PURE__ */ new Map();
  const testIdSet = /* @__PURE__ */ new Set();
  for (const result of results) {
    const target = result.target ?? "unknown";
    const bucket = resultsByTarget.get(target);
    if (bucket) {
      bucket.push(result);
    } else {
      resultsByTarget.set(target, [result]);
    }
    testIdSet.add(result.testId ?? "unknown");
  }
  const targets = [...resultsByTarget.keys()].sort();
  const testIds = [...testIdSet].sort();

  const runSummary = {};
  const notes = [];
  for (const target of targets) {
    const targetResults = resultsByTarget.get(target);
    const qualityResults = targetResults.filter((r) => !isExecutionError(r));
    const passRates = qualityResults.map((r) => computePassRate(r));
    const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
    const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
      const usage = r.tokenUsage;
      return (usage.input ?? 0) + (usage.output ?? 0);
    });
    const entry = {
      pass_rate: computeStats(passRates),
      time_seconds: computeStats(timings),
      tokens: computeStats(tokens)
    };
    // Tool-call and cost stats are only reported when there is data for them.
    const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
    if (toolCallCounts.some((count) => count > 0)) {
      entry.tool_calls = computeStats(toolCallCounts);
    }
    const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
    if (costs.length > 0) {
      entry.cost_usd = computeStats(costs);
    }
    runSummary[target] = entry;
  }

  // Collect grader scores across all targets, keyed by "<name>:<type>".
  const evaluatorScores = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (isExecutionError(result)) {
      continue;
    }
    for (const score of result.scores ?? []) {
      const key = `${score.name}:${score.type}`;
      const collected = evaluatorScores.get(key);
      if (collected) {
        collected.push(score.score);
      } else {
        evaluatorScores.set(key, [score.score]);
      }
    }
  }
  let perEvaluatorSummary;
  if (evaluatorScores.size > 0) {
    perEvaluatorSummary = {};
    for (const [key, scores] of evaluatorScores) {
      perEvaluatorSummary[key] = computeStats(scores);
    }
  }

  const errorCount = results.filter((r) => isExecutionError(r)).length;
  if (errorCount > 0) {
    notes.push(
      `${errorCount} test(s) had execution errors and are excluded from quality pass_rate`
    );
  }
  if (results.length === 0) {
    notes.push("No results to summarize");
  }

  // The first result's timestamp stands for the run; fall back to "now".
  const firstResult = results[0];
  const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
  return {
    metadata: {
      eval_file: evalFile,
      timestamp,
      targets,
      tests_run: testIds,
      experiment,
      planned_test_count: plannedTestCount
    },
    run_summary: runSummary,
    per_grader_summary: perEvaluatorSummary,
    notes
  };
}
42142
/**
 * Creates the run directory (if needed) and writes a `benchmark.json` stub
 * built from zero results.
 *
 * @param {string} runDir - Run directory.
 * @param {{ evalFile?: string, experiment?: string, plannedTestCount?: number }} options
 * @returns {Promise<void>}
 */
async function writeInitialBenchmarkArtifact(runDir, options) {
  await mkdir18(runDir, { recursive: true });
  const { evalFile, experiment, plannedTestCount } = options;
  const stub = buildBenchmarkArtifact([], evalFile, experiment, plannedTestCount);
  const serialized = `${JSON.stringify(stub, null, 2)}\n`;
  await writeFile10(path47.join(runDir, "benchmark.json"), serialized, "utf8");
}
42154
/**
 * Flattens the assertions of all results that are not execution errors into
 * one list tagged with their test id, and summarizes pass/fail counts.
 *
 * @param {Array<object>} results - Evaluation results.
 * @returns {{ assertions: Array<object>, summary: { passed: number, failed: number, total: number, pass_rate: number } }}
 */
function buildAggregateGradingArtifact(results) {
  const assertions = results
    .filter((result) => !isExecutionError(result))
    .flatMap((result) => {
      const testId = result.testId ?? "unknown";
      return (result.assertions ?? []).map(({ text, passed, evidence }) => ({
        test_id: testId,
        text,
        passed,
        evidence: evidence ?? ""
      }));
    });

  const total = assertions.length;
  const passed = assertions.reduce((count, entry) => (entry.passed ? count + 1 : count), 0);
  const failed = total - passed;
  const passRate = total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0;

  return {
    assertions,
    summary: { passed, failed, total, pass_rate: passRate }
  };
}
42180
/**
 * Turns an arbitrary label into a single safe path segment.
 *
 * The value is trimmed and the characters / \ : * ? " < > | are replaced by
 * "_". The special segments "." and ".." are rewritten to "_" and "__": they
 * contain no reserved character, but they would otherwise make a later path
 * join resolve to the parent or current directory instead of a new
 * sub-directory.
 *
 * @param {string | null | undefined} value - Raw label.
 * @param {string} fallback - Returned when the label is missing or blank.
 * @returns {string} A segment usable as a directory name.
 */
function safeArtifactPathSegment(value, fallback) {
  const trimmed = value?.trim();
  if (!trimmed) {
    return fallback;
  }
  const sanitized = trimmed.replace(/[/\\:*?"<>|]/g, "_");
  if (sanitized === "." || sanitized === "..") {
    return sanitized.replace(/\./g, "_");
  }
  return sanitized;
}
42187
/**
 * Converts a test id to a safe path segment, using "unknown" when it is
 * missing or blank.
 *
 * @param {string | null | undefined} testId
 * @returns {string} Path-safe test id.
 */
function safeTestId(testId) {
  const fallbackSegment = "unknown";
  return safeArtifactPathSegment(testId, fallbackSegment);
}
42190
/**
 * Returns the `suite` field of a result.
 *
 * @param {{ suite?: string }} result
 * @returns {string | undefined}
 */
function getSuite(result) {
  const { suite } = result;
  return suite;
}
42193
/**
 * Builds the POSIX-style relative directory for a result's artifacts:
 * `<suite>/<testId>` when the result has a suite, otherwise `<testId>`.
 *
 * @param {{ suite?: string, testId?: string }} result
 * @returns {string} Relative artifact directory using "/" separators.
 */
function buildArtifactSubdir(result) {
  const suite = getSuite(result);
  const testSegment = safeTestId(result.testId);
  if (!suite) {
    return path47.posix.join(testSegment);
  }
  return path47.posix.join(safeArtifactPathSegment(suite, "default"), testSegment);
}
42202
/**
 * Renders messages as text sections of the form `@[role]:` followed by the
 * content on the next line, separated by blank lines.
 *
 * @param {Array<{ role: string, content?: unknown }>} output - Messages.
 * @returns {string} The rendered text; nullish content renders as "".
 */
function formatOutputMarkdown(output) {
  const sections = [];
  for (const msg of output) {
    const body = String(msg.content ?? "");
    sections.push(`@[${msg.role}]:\n${body}`);
  }
  return sections.join("\n\n");
}
42206
/**
 * Extracts the input of a result as text.
 *
 * @param {{ input?: unknown }} result
 * @returns {string | null} A non-empty string input as-is, a non-empty message
 *   array rendered via formatOutputMarkdown, or null for anything else.
 */
function extractInput(result) {
  const { input } = result;
  if (typeof input === "string" && input.length > 0) {
    return input;
  }
  if (Array.isArray(input) && input.length > 0) {
    return formatOutputMarkdown(input);
  }
  return null;
}
42215
/**
 * Expresses `filePath` relative to `outputDir`, always with "/" separators.
 *
 * @param {string} outputDir - Base directory.
 * @param {string} filePath - Path to relativize.
 * @returns {string} Relative path with platform separators replaced by "/".
 */
function toRelativeArtifactPath(outputDir, filePath) {
  const relativePath = path47.relative(outputDir, filePath);
  return relativePath.replaceAll(path47.sep, "/");
}
42218
/**
 * Looks up the source test of a result by its test id.
 *
 * @param {{ testId?: string }} result
 * @param {Map<string, object>} testByTestId - Source tests keyed by id.
 * @returns {object | undefined} The matching test; a result without a test id
 *   is looked up under "unknown".
 */
function findResultSourceTest(result, testByTestId) {
  const lookupId = result.testId ?? "unknown";
  return testByTestId.get(lookupId);
}
42221
/**
 * Chooses the eval path recorded in a trace envelope.
 *
 * Preference order: the source test's `evalFileRepoPath`, then its
 * `evalFilePath`, then `fallbackEvalFile`.
 *
 * @param {{ testId?: string }} result
 * @param {Map<string, object>} testByTestId - Source tests keyed by id.
 * @param {string | undefined} fallbackEvalFile
 * @returns {string | undefined}
 */
function resolveEnvelopeEvalPath(result, testByTestId, fallbackEvalFile) {
  const sourceTest = findResultSourceTest(result, testByTestId);
  const source = sourceTest?.source;
  const fromSource = source?.evalFileRepoPath ?? source?.evalFilePath;
  return fromSource ?? fallbackEvalFile;
}
42225
/**
 * Tells whether a result has a non-empty output or at least one trace message.
 *
 * @param {{ output: { length: number }, trace: { messages: Array<unknown> } }} result
 * @returns {boolean}
 */
function resultHasExecutionTraceTranscript(result) {
  if (result.output.length > 0) {
    return true;
  }
  return result.trace.messages.length > 0;
}
42228
/**
 * Builds the trace envelope for a result and writes its wire form to
 * `execution-trace.json` inside the outputs directory.
 *
 * Answer, response and transcript artifact paths are only recorded in the
 * envelope when the result has the corresponding content.
 *
 * @param {{ result: object, outputDir: string, outputsDir: string, evalPath?: string, experiment?: string }} params
 * @returns {Promise<object>} The envelope that was written.
 */
async function writeTraceEnvelopeSidecar(params) {
  const { result } = params;
  const hasTranscript = resultHasExecutionTraceTranscript(result);
  const hasAnswer = result.output.length > 0;
  const artifacts = {
    execution_trace_path: "outputs/execution-trace.json",
    answer_path: hasAnswer ? "outputs/answer.md" : undefined,
    response_path: hasAnswer ? "outputs/response.md" : undefined,
    transcript_path: hasTranscript ? "outputs/transcript.jsonl" : undefined
  };
  const envelope = buildTraceEnvelopeFromEvaluationResult(result, {
    evalPath: params.evalPath,
    // The run id is the name of the output directory.
    runId: path47.basename(params.outputDir),
    experiment: params.experiment,
    source: { path: RESULT_INDEX_FILENAME },
    capture: { content: "full", redactionLevel: "none", redactedFields: [] },
    artifacts
  });
  const serialized = `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`;
  await writeFile10(path47.join(params.outputsDir, "execution-trace.json"), serialized, "utf8");
  return envelope;
}
42251
/**
 * Builds the index record of a result from explicit artifact file paths.
 *
 * All paths are stored relative to `options.outputDir` with "/" separators;
 * optional paths are omitted (undefined) when not provided. Extra index fields
 * are spread in after the standard fields, and `metadata` is set last.
 *
 * @param {object} result - Evaluation result.
 * @param {{ outputDir: string, gradingPath: string, timingPath: string, artifactDir?: string,
 *   outputPath?: string, answerPath?: string, transcriptPath?: string, inputPath?: string,
 *   responsePath?: string, extraIndexFields?: object }} options
 * @returns {object} The index record.
 */
function buildIndexArtifactEntry(result, options) {
  const relativeToOutput = (filePath) => toRelativeArtifactPath(options.outputDir, filePath);
  const optionalRelative = (filePath) => (filePath ? relativeToOutput(filePath) : undefined);
  return {
    timestamp: result.timestamp,
    test_id: result.testId ?? "unknown",
    suite: getSuite(result),
    category: result.category,
    conversation_id: result.conversationId,
    score: result.score,
    target: result.target ?? "unknown",
    token_usage: result.tokenUsage,
    cost_usd: result.costUsd,
    duration_ms: result.durationMs,
    start_time: result.startTime,
    end_time: result.endTime,
    scores: toIndexScores(result.scores),
    execution_status: result.executionStatus,
    error: result.error,
    failure_stage: result.failureStage,
    failure_reason_code: result.failureReasonCode,
    workspace_path: result.workspacePath,
    artifact_dir: optionalRelative(options.artifactDir),
    grading_path: relativeToOutput(options.gradingPath),
    timing_path: relativeToOutput(options.timingPath),
    output_path: optionalRelative(options.outputPath),
    answer_path: optionalRelative(options.answerPath),
    transcript_path: optionalRelative(options.transcriptPath),
    input_path: optionalRelative(options.inputPath),
    response_path: optionalRelative(options.responsePath),
    ...options.extraIndexFields,
    metadata: toIndexMetadata(result.metadata)
  };
}
42283
/**
 * Builds the index record of a result using the conventional artifact layout
 * under its artifact sub-directory.
 *
 * Input, answer/output/response and transcript paths are only included when
 * the result has the corresponding content. Extra index fields are spread in
 * after the standard fields, and `metadata` is set last.
 *
 * @param {object} result - Evaluation result.
 * @param {object} [extraIndexFields] - Additional fields merged into the record.
 * @returns {object} The index record.
 */
function buildResultIndexArtifact(result, extraIndexFields) {
  const artifactSubdir = buildArtifactSubdir(result);
  const inArtifactDir = (...parts) => path47.posix.join(artifactSubdir, ...parts);
  const input = extractInput(result);
  const hasAnswer = result.output.length > 0;
  const hasTranscript = resultHasExecutionTraceTranscript(result);
  // output_path and answer_path intentionally point at the same file.
  const answerFile = hasAnswer ? inArtifactDir("outputs", "answer.md") : undefined;
  return {
    timestamp: result.timestamp,
    test_id: result.testId ?? "unknown",
    suite: getSuite(result),
    category: result.category,
    conversation_id: result.conversationId,
    score: result.score,
    target: result.target ?? "unknown",
    token_usage: result.tokenUsage,
    cost_usd: result.costUsd,
    duration_ms: result.durationMs,
    start_time: result.startTime,
    end_time: result.endTime,
    scores: toIndexScores(result.scores),
    execution_status: result.executionStatus,
    error: result.error,
    failure_stage: result.failureStage,
    failure_reason_code: result.failureReasonCode,
    workspace_path: result.workspacePath,
    artifact_dir: artifactSubdir,
    grading_path: inArtifactDir("grading.json"),
    timing_path: inArtifactDir("timing.json"),
    input_path: input ? inArtifactDir("input.md") : undefined,
    output_path: answerFile,
    answer_path: answerFile,
    transcript_path: hasTranscript ? inArtifactDir("outputs", "transcript.jsonl") : undefined,
    response_path: hasAnswer ? inArtifactDir("outputs", "response.md") : undefined,
    ...extraIndexFields,
    metadata: toIndexMetadata(result.metadata)
  };
}
42319
/**
 * Writes records to a file as JSON Lines: one JSON document per line, with a
 * trailing newline. An empty record list produces an empty file.
 *
 * @param {string} filePath - Destination file.
 * @param {Array<unknown>} records - Values to serialize.
 * @returns {Promise<void>}
 */
async function writeJsonlFile(filePath, records) {
  let content = "";
  if (records.length !== 0) {
    const serialized = records.map((record2) => JSON.stringify(record2));
    content = `${serialized.join("\n")}\n`;
  }
  await writeFile10(filePath, content, "utf8");
}
42324
/**
 * Returns a shallow copy of the result's trace whose `messages` are replaced
 * by the transcript messages derived from the envelope.
 *
 * @param {{ trace?: object }} result
 * @param {object} envelope - Trace envelope.
 * @returns {object} The projected trace.
 */
function traceProjectionForTranscript(result, envelope) {
  const projection = { ...result.trace };
  projection.messages = traceEnvelopeToTranscriptMessages(envelope);
  return projection;
}
42330
/**
 * Tells whether the result has a non-empty output or the envelope yields at
 * least one transcript message.
 *
 * @param {{ output: { length: number } }} result
 * @param {object} envelope - Trace envelope.
 * @returns {boolean}
 */
function hasTranscriptProjection(result, envelope) {
  if (result.output.length > 0) {
    return true;
  }
  return traceEnvelopeToTranscriptMessages(envelope).length > 0;
}
42333
/**
 * Writes the transcript of a result as JSON Lines, using the messages
 * projected from the envelope. Writes an empty file when there are no lines.
 *
 * @param {string} filePath - Destination file.
 * @param {object} result - Evaluation result.
 * @param {object} envelope - Trace envelope.
 * @returns {Promise<void>}
 */
async function writeTranscriptJsonl(filePath, result, envelope) {
  const projectedTrace = traceProjectionForTranscript(result, envelope);
  const { testId, target } = result;
  const lines = traceToTranscriptJsonLines(projectedTrace, { testId, target });
  let content = "";
  if (lines.length > 0) {
    content = `${lines.map((line) => JSON.stringify(line)).join("\n")}\n`;
  }
  await writeFile10(filePath, content, "utf8");
}
42342
/**
 * Derives the (testId, target) key of a raw index record.
 *
 * The test id is taken from `test_id`, falling back to `testId`; non-string
 * values are ignored.
 *
 * @param {unknown} record2 - Parsed index line.
 * @returns {string | undefined} The key, or undefined when the record is not
 *   an object or has no usable test id.
 */
function indexRecordKey(record2) {
  if (!isRecord4(record2)) {
    return undefined;
  }
  let testId;
  if (typeof record2.test_id === "string") {
    testId = record2.test_id;
  } else if (typeof record2.testId === "string") {
    testId = record2.testId;
  }
  if (!testId) {
    return undefined;
  }
  const target = typeof record2.target === "string" ? record2.target : undefined;
  return buildTestTargetKey(testId, target);
}
42350
/**
 * Rewrites the result index so that existing records matching a replacement's
 * (test_id, target) key are swapped for that replacement; replacements without
 * a matching record are appended.
 *
 * Does nothing when there are no replacements or the index cannot be read.
 * Blank lines and lines that are not valid JSON are left out of the rewritten
 * file.
 *
 * @param {string} outputDir - Directory containing the index file.
 * @param {Array<{ test_id?: string, target?: string }>} replacements
 * @returns {Promise<void>}
 */
async function rewriteExistingIndexRecords(outputDir, replacements) {
  if (replacements.length === 0) {
    return;
  }
  const indexPath = path47.join(outputDir, RESULT_INDEX_FILENAME);
  const content = await readFile20(indexPath, "utf8").catch(() => undefined);
  if (content === undefined) {
    return;
  }

  const keyOfReplacement = (record2) => buildTestTargetKey(record2.test_id, record2.target);
  const replacementsByKey = new Map();
  for (const replacement of replacements) {
    replacementsByKey.set(keyOfReplacement(replacement), replacement);
  }

  const replacedKeys = /* @__PURE__ */ new Set();
  const records = [];
  for (const line of content.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Unparseable lines are skipped rather than aborting the rewrite.
      continue;
    }
    const key = indexRecordKey(parsed);
    const replacement = key ? replacementsByKey.get(key) : undefined;
    if (key && replacement) {
      records.push(replacement);
      replacedKeys.add(key);
    } else {
      records.push(parsed);
    }
  }

  // Append replacements that had no counterpart in the existing index.
  for (const replacement of replacements) {
    if (!replacedKeys.has(keyOfReplacement(replacement))) {
      records.push(replacement);
    }
  }
  await writeJsonlFile(indexPath, records);
}
42389
/**
 * Converts a snake_case identifier to camelCase by replacing every "_" that is
 * followed by a lowercase ASCII letter with that letter in upper case.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toCamelCase2(str) {
  const underscoreLetter = /_([a-z])/g;
  return str.replace(underscoreLetter, (_match, letter) => letter.toUpperCase());
}
42392
/**
 * Recursively converts the keys of objects (including objects nested in
 * arrays) from snake_case to camelCase.
 *
 * @param {unknown} obj - Value to convert.
 * @returns {unknown} A converted copy for arrays and objects; primitives,
 *   null and undefined are returned as-is.
 */
function toCamelCaseDeep2(obj) {
  if (obj == null || typeof obj !== "object") {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toCamelCaseDeep2(item));
  }
  const converted = {};
  Object.keys(obj).forEach((key) => {
    converted[toCamelCase2(key)] = toCamelCaseDeep2(obj[key]);
  });
  return converted;
}
42408
// Execution status values accepted when reading a result back from disk.
var EXECUTION_STATUSES = /* @__PURE__ */ new Set(["ok", "quality_failure", "execution_error"]);
42413
/**
 * Checks that a value has the shape of an assertion entry: a string `text`, a
 * boolean `passed`, and an `evidence` that is either absent or a string.
 *
 * @param {unknown} value - Candidate entry.
 * @returns {boolean}
 */
function isAssertionEntry(value) {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return false;
  }
  const { text, passed, evidence } = value;
  if (typeof text !== "string" || typeof passed !== "boolean") {
    return false;
  }
  return evidence === undefined || typeof evidence === "string";
}
42420
/**
 * Checks that a value is a non-array object with a string `role`.
 *
 * @param {unknown} value - Candidate message.
 * @returns {boolean}
 */
function isOutputMessage(value) {
  const isPlainObject = typeof value === "object" && value !== null && !Array.isArray(value);
  if (!isPlainObject) {
    return false;
  }
  return typeof value.role === "string";
}
42426
/**
 * Checks that a value is one of the strings in EXECUTION_STATUSES.
 *
 * @param {unknown} value - Candidate status.
 * @returns {boolean}
 */
function isExecutionStatus(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EXECUTION_STATUSES.has(value);
}
42429
/**
 * Checks that a value is a non-array object whose `messages` and `events` are
 * both arrays.
 *
 * @param {unknown} value - Candidate trace.
 * @returns {boolean}
 */
function isTraceRecord(value) {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return false;
  }
  return Array.isArray(value.messages) && Array.isArray(value.events);
}
42432
/**
 * Converts a parsed, camelCased result row into an evaluation result with the
 * fields downstream code relies on filled in.
 *
 * - `output` stays as-is when it is a string; otherwise it is derived from the
 *   message-array form via extractLastAssistantContent.
 * - `trace` is kept when it already has `messages` and `events` arrays;
 *   otherwise it is rebuilt with buildTraceFromMessages.
 * - Missing or mistyped `timestamp`, `testId`, `score`, `assertions`, `target`
 *   and `executionStatus` get defaults (epoch timestamp, "unknown", 0, [],
 *   "unknown", "ok").
 *
 * @param {unknown} value - Parsed row.
 * @returns {object | undefined} Undefined when `value` is not a non-array object.
 */
function normalizeParsedResult(value) {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  const result = value;
  const stringOr = (candidate, fallback) => (typeof candidate === "string" ? candidate : fallback);
  const numberOrUndefined = (candidate) => (typeof candidate === "number" ? candidate : undefined);

  // Older rows stored `output` as an array of messages.
  const legacyOutputMessages = Array.isArray(result.output) ? result.output.filter(isOutputMessage) : undefined;
  const output = typeof result.output === "string" ? result.output : extractLastAssistantContent(legacyOutputMessages);

  let trace;
  if (isTraceRecord(result.trace)) {
    trace = result.trace;
  } else {
    const traceIsObject = result.trace && typeof result.trace === "object" && !Array.isArray(result.trace);
    const legacySummary = traceIsObject ? result.trace : undefined;
    trace = buildTraceFromMessages({
      input: Array.isArray(result.input) ? result.input : [],
      output: legacyOutputMessages,
      summary: legacySummary,
      finalOutput: output,
      tokenUsage: result.tokenUsage,
      costUsd: numberOrUndefined(result.costUsd),
      durationMs: numberOrUndefined(result.durationMs),
      target: stringOr(result.target, undefined),
      testId: stringOr(result.testId, undefined)
    });
  }

  return {
    ...result,
    timestamp: stringOr(result.timestamp, (/* @__PURE__ */ new Date(0)).toISOString()),
    testId: stringOr(result.testId, "unknown"),
    score: typeof result.score === "number" ? result.score : 0,
    assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
    target: stringOr(result.target, "unknown"),
    output,
    trace,
    executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : "ok"
  };
}
42463
/**
 * Parses the text of a JSONL result file into evaluation results.
 *
 * Blank lines and lines that are not valid JSON are skipped. Every parsed row
 * is validated by normalizeResultRow (which receives the 1-based line number),
 * camelCased, and normalized.
 *
 * @param {string} content - File content.
 * @returns {Array<object>} Results in file order.
 * @throws {ResultRowSchemaError} Propagated from normalizeResultRow for rows
 *   that parse as JSON but are not valid result rows.
 */
function parseJsonlResults(content) {
  const results = [];
  content.split("\n").forEach((rawLine, index) => {
    const trimmed = rawLine.trim();
    if (!trimmed) {
      return;
    }
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Malformed JSON lines are ignored.
      return;
    }
    const canonicalRow = normalizeResultRow(parsed, { lineNumber: index + 1 });
    const normalized = normalizeParsedResult(toCamelCaseDeep2(canonicalRow));
    if (normalized) {
      results.push(normalized);
    }
  });
  return results;
}
42486
/**
 * Reads a JSONL result file and writes artifacts for the results it contains.
 *
 * @param {string} jsonlPath - Path of the JSONL result file.
 * @param {string} outputDir - Directory to write artifacts into.
 * @param {object} [options] - Passed through to writeArtifactsFromResults.
 * @returns {Promise<unknown>} Whatever writeArtifactsFromResults returns.
 */
async function writeArtifacts(jsonlPath, outputDir, options) {
  const results = parseJsonlResults(await readFile20(jsonlPath, "utf8"));
  return writeArtifactsFromResults(results, outputDir, options);
}
42491
/**
 * Serializes the transcript lines of all results as one JSON Lines string.
 *
 * @param {Array<{ trace: object, testId?: string, target?: string }>} results
 * @returns {string} JSONL text ending in a newline, or "" when there are no lines.
 */
function buildTranscriptMessageLines(results) {
  const serializedLines = results.flatMap((result) => {
    const { testId, target } = result;
    const transcriptLines = traceToTranscriptJsonLines(result.trace, { testId, target });
    return transcriptLines.map((line) => JSON.stringify(line));
  });
  if (serializedLines.length === 0) {
    return "";
  }
  return `${serializedLines.join("\n")}\n`;
}
42503
/**
 * Invokes the optional `additionalArtifacts` hook for one result and returns
 * whatever it produces.
 *
 * @param {{ testId?: string }} result - Evaluation result.
 * @param {string} outputDir - Run output directory.
 * @param {string} testDir - Directory of this result's artifacts.
 * @param {Map<string, object>} testByTestId - Source tests keyed by id.
 * @param {Function} [additionalArtifacts] - Hook receiving
 *   `{ result, outputDir, testDir, sourceTest, sourceTestsById }`.
 * @returns {Promise<unknown>} The hook's result, or undefined without a hook.
 */
async function collectAdditionalIndexFields(result, outputDir, testDir, testByTestId, additionalArtifacts) {
  if (additionalArtifacts) {
    const sourceTest = testByTestId.get(result.testId ?? "unknown");
    return additionalArtifacts({
      result,
      outputDir,
      testDir,
      sourceTest,
      sourceTestsById: testByTestId
    });
  }
  return undefined;
}
42515
/**
 * Write the per-test artifact files for each result and then update the
 * existing index via `rewriteExistingIndexRecords`.
 *
 * For every result a sub-directory (named by `buildArtifactSubdir`) is created
 * under `outputDir` and populated, in this order, with:
 *   - `grading.json` and `timing.json` (pretty-printed, trailing newline),
 *   - `input.md` when `extractInput` yields a truthy value,
 *   - `outputs/answer.md` and `outputs/response.md` when `result.output` is non-empty,
 *   - the trace envelope sidecar written by `writeTraceEnvelopeSidecar`,
 *   - `outputs/transcript.jsonl` when `hasTranscriptProjection` says so.
 *
 * @param {Array} results - Results to write artifacts for.
 * @param {string} outputDir - Root directory for the artifacts (created if missing).
 * @param {object} [options] - Optional `sourceTests`, `evalFile`, `experiment`
 *   and `additionalArtifacts` settings read by this function.
 * @returns {Promise<void>}
 */
async function writePerTestArtifacts(results, outputDir, options) {
  const toPrettyJson = (value) => `${JSON.stringify(value, null, 2)}\n`;

  await mkdir18(outputDir, { recursive: true });
  const sourceTests = options?.sourceTests ?? [];
  const testByTestId = new Map(sourceTests.map((test) => [test.id, test]));
  const indexRecords = [];

  for (const result of results) {
    const gradingArtifact = buildGradingArtifact(result);
    const timingArtifact = buildTimingArtifact([result]);
    const testDir = path47.join(outputDir, buildArtifactSubdir(result));
    await mkdir18(testDir, { recursive: true });

    await writeFile10(path47.join(testDir, "grading.json"), toPrettyJson(gradingArtifact), "utf8");
    await writeFile10(path47.join(testDir, "timing.json"), toPrettyJson(timingArtifact), "utf8");

    const inputText = extractInput(result);
    if (inputText) {
      await writeFile10(path47.join(testDir, "input.md"), inputText, "utf8");
    }

    const outputsDir = path47.join(testDir, "outputs");
    await mkdir18(outputsDir, { recursive: true });
    if (result.output.length > 0) {
      // The same output text is written under both file names.
      for (const fileName of ["answer.md", "response.md"]) {
        await writeFile10(path47.join(outputsDir, fileName), result.output, "utf8");
      }
    }

    const evalPath = resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile);
    const envelope = await writeTraceEnvelopeSidecar({
      result,
      outputDir,
      outputsDir,
      evalPath,
      experiment: options?.experiment
    });
    if (hasTranscriptProjection(result, envelope)) {
      const transcriptFile = path47.join(outputsDir, "transcript.jsonl");
      await writeTranscriptJsonl(transcriptFile, result, envelope);
    }

    const extraIndexFields = await collectAdditionalIndexFields(
      result,
      outputDir,
      testDir,
      testByTestId,
      options?.additionalArtifacts
    );
    const indexRecord = {
      ...buildResultIndexArtifact(result, extraIndexFields),
      experiment: options?.experiment
    };
    indexRecords.push(indexRecord);
  }

  await rewriteExistingIndexRecords(outputDir, indexRecords);
}
42571
/**
 * Write the full artifact set for a run: per-test files, run-level
 * `timing.json` and `benchmark.json`, the result index, and a combined
 * `transcript.jsonl`.
 *
 * Per result, a sub-directory (named by `buildArtifactSubdir`) receives
 * `grading.json`, `timing.json`, optionally `input.md`, and an `outputs/`
 * directory with `answer.md` / `response.md` (when `result.output` is
 * non-empty), the trace envelope sidecar, and optionally `transcript.jsonl`.
 * One index record per result is built with `buildIndexArtifactEntry`.
 *
 * When `options.plannedTestCount` is nullish, the planned count is obtained
 * from `readPlannedTestCount(benchmarkPath)` before the benchmark is rewritten.
 *
 * @param {Array} results - Results to write artifacts for.
 * @param {string} outputDir - Root directory for the artifacts (created if missing).
 * @param {object} [options] - Optional `sourceTests`, `evalFile`, `experiment`,
 *   `additionalArtifacts` and `plannedTestCount` settings read by this function.
 * @returns {Promise<{testArtifactDir: string, timingPath: string, benchmarkPath: string, indexPath: string}>}
 *   Paths of the written artifacts; `testArtifactDir` is `outputDir` itself.
 */
async function writeArtifactsFromResults(results, outputDir, options) {
  const toPrettyJson = (value) => `${JSON.stringify(value, null, 2)}\n`;

  const testArtifactDir = outputDir;
  const timingPath = path47.join(outputDir, "timing.json");
  const benchmarkPath = path47.join(outputDir, "benchmark.json");
  const indexPath = path47.join(outputDir, RESULT_INDEX_FILENAME);
  await mkdir18(outputDir, { recursive: true });

  const indexRecords = [];
  const sourceTests = options?.sourceTests ?? [];
  const testByTestId = new Map(sourceTests.map((test) => [test.id, test]));

  for (const result of results) {
    const gradingArtifact = buildGradingArtifact(result);
    const perTestTiming = buildTimingArtifact([result]);
    const testDir = path47.join(outputDir, buildArtifactSubdir(result));
    const gradingPath = path47.join(testDir, "grading.json");
    const perTestTimingPath = path47.join(testDir, "timing.json");
    await mkdir18(testDir, { recursive: true });
    await writeFile10(gradingPath, toPrettyJson(gradingArtifact), "utf8");
    await writeFile10(perTestTimingPath, toPrettyJson(perTestTiming), "utf8");

    // Optional artifacts keep an undefined path when the file is not written,
    // so the index entry only references files that exist.
    const inputText = extractInput(result);
    let inputPath;
    if (inputText) {
      inputPath = path47.join(testDir, "input.md");
      await writeFile10(inputPath, inputText, "utf8");
    }

    const outputsDir = path47.join(testDir, "outputs");
    await mkdir18(outputsDir, { recursive: true });
    let answerPath;
    let responsePath;
    if (result.output.length > 0) {
      answerPath = path47.join(outputsDir, "answer.md");
      responsePath = path47.join(outputsDir, "response.md");
      await writeFile10(answerPath, result.output, "utf8");
      await writeFile10(responsePath, result.output, "utf8");
    }

    const evalPath = resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile);
    const envelope = await writeTraceEnvelopeSidecar({
      result,
      outputDir,
      outputsDir,
      evalPath,
      experiment: options?.experiment
    });

    let transcriptPath;
    if (hasTranscriptProjection(result, envelope)) {
      transcriptPath = path47.join(outputsDir, "transcript.jsonl");
      await writeTranscriptJsonl(transcriptPath, result, envelope);
    }

    const extraIndexFields = await collectAdditionalIndexFields(
      result,
      outputDir,
      testDir,
      testByTestId,
      options?.additionalArtifacts
    );
    const indexEntry = buildIndexArtifactEntry(result, {
      outputDir,
      artifactDir: testDir,
      gradingPath,
      timingPath: perTestTimingPath,
      outputPath: answerPath,
      answerPath,
      transcriptPath,
      inputPath,
      responsePath,
      extraIndexFields
    });
    indexRecords.push({ ...indexEntry, experiment: options?.experiment });
  }

  const runTiming = buildTimingArtifact(results);
  await writeFile10(timingPath, toPrettyJson(runTiming), "utf8");

  // Read the planned count before benchmark.json is overwritten below.
  const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(benchmarkPath);
  const benchmark = buildBenchmarkArtifact(
    results,
    options?.evalFile,
    options?.experiment,
    plannedTestCount
  );
  await writeFile10(benchmarkPath, toPrettyJson(benchmark), "utf8");

  await writeJsonlFile(indexPath, indexRecords);
  const combinedTranscriptPath = path47.join(outputDir, "transcript.jsonl");
  await writeFile10(combinedTranscriptPath, buildTranscriptMessageLines(results), "utf8");

  return { testArtifactDir, timingPath, benchmarkPath, indexPath };
}
41204
42658
  async function evaluate(config2) {
41205
42659
  const startTime = Date.now();
41206
42660
  if (config2.tests && config2.specFile) {
@@ -41246,7 +42700,7 @@ async function evaluate(config2) {
41246
42700
  cliNoCache: false,
41247
42701
  yamlCache: config2.cache === void 0 ? materialized.cache : void 0
41248
42702
  });
41249
- const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path47.resolve(materialized.cachePath) : void 0) : void 0;
42703
+ const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path48.resolve(materialized.cachePath) : void 0) : void 0;
41250
42704
  const results = await runEvaluation({
41251
42705
  testFilePath,
41252
42706
  repoRoot,
@@ -41269,15 +42723,27 @@ async function evaluate(config2) {
41269
42723
  });
41270
42724
  const allResults = collectedResults.length > 0 ? collectedResults : [...results];
41271
42725
  const durationMs = Date.now() - startTime;
42726
+ const outputDir = config2.outputDir ? path48.resolve(config2.outputDir) : void 0;
42727
+ const artifacts = outputDir ? await writeArtifactsFromResults(allResults, outputDir, {
42728
+ evalFile: config2.specFile ? testFilePath : "",
42729
+ experiment: config2.experiment,
42730
+ sourceTests: materialized.tests
42731
+ }).then(({ benchmarkPath, indexPath, timingPath }) => ({
42732
+ runDir: outputDir,
42733
+ benchmarkPath,
42734
+ indexPath,
42735
+ timingPath
42736
+ })) : void 0;
41272
42737
  return {
41273
42738
  results: allResults,
41274
- summary: computeSummary(allResults, durationMs, config2.threshold)
42739
+ summary: computeSummary(allResults, durationMs, config2.threshold),
42740
+ artifacts
41275
42741
  };
41276
42742
  }
41277
42743
  async function materializeEvalConfig(config2, options) {
41278
42744
  const baseDir = options?.baseDir ?? process.cwd();
41279
42745
  const repoRoot = options?.repoRoot ?? await findGitRoot(baseDir) ?? baseDir;
41280
- const testFilePath = config2.specFile ? path47.resolve(baseDir, config2.specFile) : path47.join(baseDir, "__programmatic__.yaml");
42746
+ const testFilePath = config2.specFile ? path48.resolve(baseDir, config2.specFile) : path48.join(baseDir, "__programmatic__.yaml");
41281
42747
  const effectiveFilter = options?.filter ?? config2.filter;
41282
42748
  if (config2.specFile) {
41283
42749
  const suite = await loadTestSuite(testFilePath, repoRoot, {
@@ -41354,7 +42820,7 @@ function convertAssertions(entries) {
41354
42820
  }
41355
42821
  function buildInlineEvalTests(config2, options) {
41356
42822
  const suiteWorkspace = config2.beforeAll ? { hooks: { before_all: toBeforeAllHook(config2.beforeAll) } } : void 0;
41357
- const derivedSuiteName = path47.basename(options.testFilePath).replace(/\.eval\.[cm]?ts$/i, "").replace(/\.[cm]?ts$/i, "");
42823
+ const derivedSuiteName = path48.basename(options.testFilePath).replace(/\.eval\.[cm]?ts$/i, "").replace(/\.[cm]?ts$/i, "");
41358
42824
  const suiteName = config2.metadata?.name ?? (derivedSuiteName || "eval");
41359
42825
  return (config2.tests ?? []).filter((test) => !options.filter || matchesFilter4(test.id, options.filter)).map((test) => {
41360
42826
  const isConversation = test.mode === "conversation" || test.turns && test.turns.length > 0;
@@ -41450,10 +42916,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
41450
42916
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
41451
42917
  async function discoverDefaultTarget(repoRoot) {
41452
42918
  const cwd = process.cwd();
41453
- const chain = buildDirectoryChain(path47.join(cwd, "_placeholder"), repoRoot);
42919
+ const chain = buildDirectoryChain(path48.join(cwd, "_placeholder"), repoRoot);
41454
42920
  for (const dir of chain) {
41455
42921
  for (const candidate of TARGET_FILE_CANDIDATES) {
41456
- const targetsPath = path47.join(dir, candidate);
42922
+ const targetsPath = path48.join(dir, candidate);
41457
42923
  if (!existsSync7(targetsPath)) continue;
41458
42924
  try {
41459
42925
  const definitions = await readTargetDefinitions(targetsPath);
@@ -41470,7 +42936,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
41470
42936
  const chain = buildDirectoryChain(startPath, repoRoot);
41471
42937
  const envFiles = [];
41472
42938
  for (const dir of chain) {
41473
- const envPath = path47.join(dir, ".env");
42939
+ const envPath = path48.join(dir, ".env");
41474
42940
  if (existsSync7(envPath)) envFiles.push(envPath);
41475
42941
  }
41476
42942
  for (let i = 0; i < envFiles.length; i++) {
@@ -41496,7 +42962,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
41496
42962
  }
41497
42963
  var EXPORT_NAMES = ["default", "config", "evalConfig"];
41498
42964
  async function loadTsEvalFile(filePath) {
41499
- const absolutePath = path48.resolve(filePath);
42965
+ const absolutePath = path49.resolve(filePath);
41500
42966
  const moduleUrl = pathToFileURL2(absolutePath).href;
41501
42967
  const module = await import(moduleUrl);
41502
42968
  let config2;
@@ -41518,7 +42984,7 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
41518
42984
  const { config: config2, filePath: absolutePath } = await loadTsEvalFile(filePath);
41519
42985
  const materialized = await materializeEvalConfig(config2, {
41520
42986
  repoRoot,
41521
- baseDir: path48.dirname(absolutePath),
42987
+ baseDir: path49.dirname(absolutePath),
41522
42988
  filter: options?.filter,
41523
42989
  category: options?.category
41524
42990
  });
@@ -41718,7 +43184,7 @@ export {
41718
43184
  formatReplayLookupKey,
41719
43185
  replayFixtureRecordToProviderResponse,
41720
43186
  buildReplayFixtureRecord,
41721
- TRACE_ENVELOPE_SCHEMA_VERSION,
43187
+ EXECUTION_TRACE_SCHEMA_VERSION,
41722
43188
  TraceEnvelopeEvalWireSchema,
41723
43189
  TraceEnvelopeReplayWireSchema,
41724
43190
  TraceEnvelopeSpanStatusWireSchema,
@@ -41735,9 +43201,12 @@ export {
41735
43201
  toTraceEnvelopeWire,
41736
43202
  fromTraceEnvelopeWire,
41737
43203
  traceEnvelopeToMessages,
43204
+ traceEnvelopeToTranscriptMessages,
43205
+ traceEnvelopeToToolTrajectoryView,
41738
43206
  traceEnvelopeToTraceSummary,
41739
43207
  traceEnvelopeToTraceArtifact,
41740
43208
  getTraceEnvelopeSummary,
43209
+ traceEnvelopeToOtlpJson,
41741
43210
  readTraceEnvelopeReplayRecords,
41742
43211
  findTraceEnvelopeReplayRecord,
41743
43212
  traceEnvelopeReplayRecordToProviderResponse,
@@ -41804,8 +43273,31 @@ export {
41804
43273
  loadEvalCaseById,
41805
43274
  runEvaluation,
41806
43275
  runEvalCase,
43276
+ toTranscriptJsonLines,
43277
+ traceToTranscriptJsonLines,
43278
+ traceFromTranscriptJsonLines,
43279
+ groupTranscriptJsonLines,
43280
+ readTranscriptJsonl,
43281
+ readTranscriptFile,
43282
+ ResultRowSchemaError,
43283
+ normalizeResultRow,
43284
+ RESULT_INDEX_FILENAME,
43285
+ buildTestTargetKey,
43286
+ deduplicateByTestIdTarget,
43287
+ aggregateRunDir,
43288
+ buildGradingArtifact,
43289
+ buildTimingArtifact,
43290
+ buildBenchmarkArtifact,
43291
+ writeInitialBenchmarkArtifact,
43292
+ buildAggregateGradingArtifact,
43293
+ buildIndexArtifactEntry,
43294
+ buildResultIndexArtifact,
43295
+ parseJsonlResults,
43296
+ writeArtifacts,
43297
+ writePerTestArtifacts,
43298
+ writeArtifactsFromResults,
41807
43299
  evaluate,
41808
43300
  loadTsEvalFile,
41809
43301
  loadTsEvalSuite
41810
43302
  };
41811
- //# sourceMappingURL=chunk-VBHHZQS6.js.map
43303
+ //# sourceMappingURL=chunk-BLXYBUU4.js.map