agentv 4.38.1-next.1 → 4.39.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dist/{artifact-writer-MK5X5MSO.js → artifact-writer-VPRAQSQM.js} +14 -11
  2. package/dist/{chunk-SMZQ7RPW.js → chunk-4NAWRNBL.js} +67 -68
  3. package/dist/chunk-4NAWRNBL.js.map +1 -0
  4. package/dist/{chunk-Z4BVJJXA.js → chunk-5JWECTVJ.js} +586 -183
  5. package/dist/chunk-5JWECTVJ.js.map +1 -0
  6. package/dist/chunk-5VQPWWUI.js +494 -0
  7. package/dist/chunk-5VQPWWUI.js.map +1 -0
  8. package/dist/{chunk-QOBQ5XYF.js → chunk-76FOHROU.js} +16 -4
  9. package/dist/chunk-76FOHROU.js.map +1 -0
  10. package/dist/{chunk-VBHHZQS6.js → chunk-DR2ZHSBE.js} +1827 -332
  11. package/dist/chunk-DR2ZHSBE.js.map +1 -0
  12. package/dist/{chunk-NLTIK3LV.js → chunk-RLMXZDDC.js} +499 -347
  13. package/dist/chunk-RLMXZDDC.js.map +1 -0
  14. package/dist/cli.js +6 -6
  15. package/dist/dashboard/assets/index-BnYCCJ7O.css +1 -0
  16. package/dist/dashboard/assets/index-DaueD7GO.js +118 -0
  17. package/dist/dashboard/assets/{index-SIl6NbIJ.js → index-_jpKSzIf.js} +1 -1
  18. package/dist/dashboard/index.html +2 -2
  19. package/dist/{dist-HVLBDG5F.js → dist-QAMAJMAH.js} +54 -16
  20. package/dist/index.js +6 -6
  21. package/dist/{interactive-A7JNS2MT.js → interactive-V2GW7A25.js} +6 -6
  22. package/dist/{otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js → otlp-json-file-exporter-RY63S3IG-PZBQPVYY.js} +2 -2
  23. package/dist/skills/agentv-eval-writer/SKILL.md +15 -10
  24. package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +23 -15
  25. package/dist/{ts-eval-loader-TJT6BGFF-DI7XNSO4.js → ts-eval-loader-MQFJ5AEM-6V7TAKWK.js} +2 -2
  26. package/package.json +1 -1
  27. package/dist/chunk-DKUAETXE.js +0 -1362
  28. package/dist/chunk-DKUAETXE.js.map +0 -1
  29. package/dist/chunk-NLTIK3LV.js.map +0 -1
  30. package/dist/chunk-QOBQ5XYF.js.map +0 -1
  31. package/dist/chunk-SMZQ7RPW.js.map +0 -1
  32. package/dist/chunk-VBHHZQS6.js.map +0 -1
  33. package/dist/chunk-Z4BVJJXA.js.map +0 -1
  34. package/dist/dashboard/assets/index-BpnllKET.css +0 -1
  35. package/dist/dashboard/assets/index-Cm9SUopp.js +0 -118
  36. /package/dist/{artifact-writer-MK5X5MSO.js.map → artifact-writer-VPRAQSQM.js.map} +0 -0
  37. /package/dist/{dist-HVLBDG5F.js.map → dist-QAMAJMAH.js.map} +0 -0
  38. /package/dist/{interactive-A7JNS2MT.js.map → interactive-V2GW7A25.js.map} +0 -0
  39. /package/dist/{otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js.map → otlp-json-file-exporter-RY63S3IG-PZBQPVYY.js.map} +0 -0
  40. /package/dist/{ts-eval-loader-TJT6BGFF-DI7XNSO4.js.map → ts-eval-loader-MQFJ5AEM-6V7TAKWK.js.map} +0 -0
@@ -1,28 +1,31 @@
1
1
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
2
  import {
3
+ buildIndexArtifactEntry,
4
+ buildResultIndexArtifact,
5
+ writeArtifactsFromResults,
6
+ writePerTestArtifacts
7
+ } from "./chunk-5VQPWWUI.js";
8
+ import "./chunk-RLMXZDDC.js";
9
+ import "./chunk-76FOHROU.js";
10
+ import "./chunk-BPGJ4HBU.js";
11
+ import {
12
+ RESULT_INDEX_FILENAME,
3
13
  aggregateRunDir,
4
14
  buildAggregateGradingArtifact,
5
15
  buildBenchmarkArtifact,
6
16
  buildGradingArtifact,
7
- buildIndexArtifactEntry,
8
- buildResultIndexArtifact,
9
17
  buildTestTargetKey,
10
18
  buildTimingArtifact,
11
19
  deduplicateByTestIdTarget,
12
20
  parseJsonlResults,
13
21
  writeArtifacts,
14
- writeArtifactsFromResults,
15
- writeInitialBenchmarkArtifact,
16
- writePerTestArtifacts
17
- } from "./chunk-DKUAETXE.js";
18
- import "./chunk-NLTIK3LV.js";
19
- import "./chunk-QOBQ5XYF.js";
20
- import "./chunk-BPGJ4HBU.js";
21
- import "./chunk-VBHHZQS6.js";
22
+ writeInitialBenchmarkArtifact
23
+ } from "./chunk-DR2ZHSBE.js";
22
24
  import "./chunk-NPVGBFF6.js";
23
25
  import "./chunk-M7BUKBAF.js";
24
26
  import "./chunk-5H446C7X.js";
25
27
  export {
28
+ RESULT_INDEX_FILENAME,
26
29
  aggregateRunDir,
27
30
  buildAggregateGradingArtifact,
28
31
  buildBenchmarkArtifact,
@@ -38,4 +41,4 @@ export {
38
41
  writeInitialBenchmarkArtifact,
39
42
  writePerTestArtifacts
40
43
  };
41
- //# sourceMappingURL=artifact-writer-MK5X5MSO.js.map
44
+ //# sourceMappingURL=artifact-writer-VPRAQSQM.js.map
@@ -1,9 +1,12 @@
1
1
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
2
  import {
3
3
  Mutex,
4
+ RESULT_INDEX_FILENAME,
4
5
  TARGET_FILE_CANDIDATES,
6
+ buildDefaultRunDir,
5
7
  c,
6
8
  clearRemoteRunTags,
9
+ createRunDirName,
7
10
  deleteRunTags,
8
11
  detectFileType,
9
12
  discoverEvalFiles,
@@ -36,6 +39,7 @@ import {
36
39
  resolveEvalPaths,
37
40
  resolveResultSourcePath,
38
41
  resolveRunCacheFile,
42
+ resolveRunManifestPath,
39
43
  runEvalCommand,
40
44
  selectTarget,
41
45
  setRemoteRunTags,
@@ -48,19 +52,11 @@ import {
48
52
  validateTargetsFile,
49
53
  validateWorkspacePaths,
50
54
  writeRunTags
51
- } from "./chunk-Z4BVJJXA.js";
55
+ } from "./chunk-5JWECTVJ.js";
52
56
  import {
53
- RESULT_INDEX_FILENAME,
54
- aggregateRunDir,
55
- buildBenchmarkArtifact,
56
- buildDefaultRunDir,
57
- buildTestTargetKey,
58
- buildTimingArtifact,
59
- createRunDirName,
60
- resolveRunManifestPath,
61
57
  toSnakeCaseDeep as toSnakeCaseDeep2,
62
58
  writeArtifactsFromResults
63
- } from "./chunk-DKUAETXE.js";
59
+ } from "./chunk-5VQPWWUI.js";
64
60
  import {
65
61
  DEFAULT_CATEGORY,
66
62
  deriveCategory,
@@ -69,17 +65,19 @@ import {
69
65
  getOutputFilenames,
70
66
  parseClaudeSession,
71
67
  parseCodexSession,
72
- readTranscriptFile,
73
68
  runBeforeSessionHook,
74
69
  scanRepoDeps,
75
70
  syncProjects,
76
- toTranscriptJsonLines,
77
71
  transpileEvalYamlFile,
78
72
  trimBaselineResult
79
- } from "./chunk-NLTIK3LV.js";
73
+ } from "./chunk-RLMXZDDC.js";
80
74
  import {
81
75
  DEFAULT_THRESHOLD,
82
76
  addProject,
77
+ aggregateRunDir,
78
+ buildBenchmarkArtifact,
79
+ buildTestTargetKey,
80
+ buildTimingArtifact,
83
81
  buildTraceFromMessages,
84
82
  createBuiltinRegistry,
85
83
  discoverCopilotSessions,
@@ -93,10 +91,12 @@ import {
93
91
  loadProjectRegistry,
94
92
  loadTestSuite,
95
93
  normalizeLineEndings,
94
+ normalizeResultRow,
96
95
  parseAgentSkillsEvals,
97
96
  parseCopilotEvents,
98
97
  parseYamlValue,
99
98
  readTargetDefinitions,
99
+ readTranscriptFile,
100
100
  removeProject,
101
101
  runContainsAllAssertion,
102
102
  runContainsAnyAssertion,
@@ -111,8 +111,9 @@ import {
111
111
  runStartsWithAssertion,
112
112
  toCamelCaseDeep,
113
113
  toSnakeCaseDeep,
114
+ toTranscriptJsonLines,
114
115
  touchProject
115
- } from "./chunk-VBHHZQS6.js";
116
+ } from "./chunk-DR2ZHSBE.js";
116
117
  import {
117
118
  __commonJS,
118
119
  __require,
@@ -3955,53 +3956,27 @@ var ASSERTION_TEMPLATES = {
3955
3956
  default: `#!/usr/bin/env bun
3956
3957
  import { defineAssertion } from '@agentv/eval';
3957
3958
 
3958
- /** Extract text from the last message with the given role. */
3959
- function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3960
- for (let i = messages.length - 1; i >= 0; i--) {
3961
- const msg = messages[i];
3962
- if (msg.role !== role) continue;
3963
- if (typeof msg.content === 'string') return msg.content;
3964
- if (Array.isArray(msg.content)) {
3965
- return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3966
- }
3967
- }
3968
- return '';
3969
- }
3970
-
3971
3959
  export default defineAssertion(({ output }) => {
3972
3960
  // TODO: Implement your assertion logic
3973
- const text = getMessageText(output ?? []);
3961
+ const text = output ?? '';
3974
3962
  const pass = text.length > 0;
3975
3963
  return {
3976
3964
  pass,
3977
- reasoning: pass ? 'Output has content' : 'Output is empty',
3965
+ assertions: [{ text: pass ? 'Output has content' : 'Output is empty', passed: pass }],
3978
3966
  };
3979
3967
  });
3980
3968
  `,
3981
3969
  score: `#!/usr/bin/env bun
3982
3970
  import { defineAssertion } from '@agentv/eval';
3983
3971
 
3984
- /** Extract text from the last message with the given role. */
3985
- function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3986
- for (let i = messages.length - 1; i >= 0; i--) {
3987
- const msg = messages[i];
3988
- if (msg.role !== role) continue;
3989
- if (typeof msg.content === 'string') return msg.content;
3990
- if (Array.isArray(msg.content)) {
3991
- return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3992
- }
3993
- }
3994
- return '';
3995
- }
3996
-
3997
3972
  export default defineAssertion(({ output }) => {
3998
3973
  // TODO: Implement your scoring logic (0.0 to 1.0)
3999
- const text = getMessageText(output ?? []);
3974
+ const text = output ?? '';
4000
3975
  const score = text.length > 0 ? 1.0 : 0.0;
4001
3976
  return {
4002
3977
  pass: score >= 0.5,
4003
3978
  score,
4004
- reasoning: \`Score: \${score}\`,
3979
+ assertions: [{ text: 'Output has content', passed: score === 1.0 }],
4005
3980
  };
4006
3981
  });
4007
3982
  `
@@ -4337,7 +4312,6 @@ var evalAssertCommand = command({
4337
4312
  question: resolvedInput,
4338
4313
  criteria: "",
4339
4314
  expected_output: [],
4340
- reference_answer: "",
4341
4315
  input_files: [],
4342
4316
  trace,
4343
4317
  token_usage: null,
@@ -4348,11 +4322,7 @@ var evalAssertCommand = command({
4348
4322
  file_changes: null,
4349
4323
  workspace_path: null,
4350
4324
  config: null,
4351
- metadata: {},
4352
- // Text convenience accessors (new names)
4353
- input_text: resolvedInput,
4354
- output_text: resolvedOutput,
4355
- expected_output_text: ""
4325
+ metadata: {}
4356
4326
  },
4357
4327
  null,
4358
4328
  2
@@ -4440,6 +4410,33 @@ var evalRunCommand = command({
4440
4410
  long: "experiment",
4441
4411
  description: "Experiment label for canonical run output (default: default)"
4442
4412
  }),
4413
+ resultsRepo: option({
4414
+ type: optional(string),
4415
+ long: "results-repo",
4416
+ description: "Results Git repo override: current/. for the source repo, a local path, Git URL, or owner/repo"
4417
+ }),
4418
+ resultsBranch: option({
4419
+ type: optional(string),
4420
+ long: "results-branch",
4421
+ description: "Results storage branch (default: agentv/results/v1)"
4422
+ }),
4423
+ resultsRemote: option({
4424
+ type: optional(string),
4425
+ long: "results-remote",
4426
+ description: "Git remote name for results push/fetch (default: origin)"
4427
+ }),
4428
+ resultsPush: flag({
4429
+ long: "results-push",
4430
+ description: "Push the results branch after publishing the completed local run"
4431
+ }),
4432
+ noResultsPush: flag({
4433
+ long: "no-results-push",
4434
+ description: "Publish to the local results branch without pushing to the remote"
4435
+ }),
4436
+ resultsRequirePush: flag({
4437
+ long: "results-require-push",
4438
+ description: "Fail the eval command if the completed results branch cannot be pushed"
4439
+ }),
4443
4440
  dryRun: flag({
4444
4441
  long: "dry-run",
4445
4442
  description: "Use mock provider responses instead of real LLM calls"
@@ -4515,7 +4512,7 @@ var evalRunCommand = command({
4515
4512
  otelBackend: option({
4516
4513
  type: optional(string),
4517
4514
  long: "otel-backend",
4518
- description: "Use a backend preset (langfuse, braintrust, confident)"
4515
+ description: "Use an OTel backend resolver (langfuse, braintrust, confident, or local)"
4519
4516
  }),
4520
4517
  otelCaptureContent: flag({
4521
4518
  long: "otel-capture-content",
@@ -4600,7 +4597,7 @@ var evalRunCommand = command({
4600
4597
  },
4601
4598
  handler: async (args) => {
4602
4599
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4603
- const { launchInteractiveWizard } = await import("./interactive-A7JNS2MT.js");
4600
+ const { launchInteractiveWizard } = await import("./interactive-V2GW7A25.js");
4604
4601
  await launchInteractiveWizard();
4605
4602
  return;
4606
4603
  }
@@ -4609,6 +4606,10 @@ var evalRunCommand = command({
4609
4606
  console.error("Error: --budget-usd must be a positive number.");
4610
4607
  process.exit(2);
4611
4608
  }
4609
+ if (args.resultsPush && args.noResultsPush) {
4610
+ console.error("Error: --results-push and --no-results-push cannot be used together.");
4611
+ process.exit(2);
4612
+ }
4612
4613
  const rawOptions = {
4613
4614
  target: args.target,
4614
4615
  targets: args.targets,
@@ -4618,6 +4619,12 @@ var evalRunCommand = command({
4618
4619
  output: args.output,
4619
4620
  outputFormat: args.outputFormat,
4620
4621
  experiment: args.experiment,
4622
+ resultsRepo: args.resultsRepo,
4623
+ resultsBranch: args.resultsBranch,
4624
+ resultsRemote: args.resultsRemote,
4625
+ resultsPush: args.resultsPush,
4626
+ noResultsPush: args.noResultsPush,
4627
+ resultsRequirePush: args.resultsRequirePush,
4621
4628
  dryRun: args.dryRun,
4622
4629
  dryRunDelay: args.dryRunDelay,
4623
4630
  dryRunDelayMin: args.dryRunDelayMin,
@@ -6353,13 +6360,15 @@ function parseFilterableRecords(filePath) {
6353
6360
  }
6354
6361
  const lines = content.split("\n").filter((line) => line.trim());
6355
6362
  const records = [];
6356
- for (const line of lines) {
6363
+ for (let i = 0; i < lines.length; i++) {
6364
+ const line = lines[i];
6357
6365
  let raw;
6358
6366
  try {
6359
6367
  raw = JSON.parse(line);
6360
6368
  } catch {
6361
6369
  continue;
6362
6370
  }
6371
+ raw = normalizeResultRow(raw, { lineNumber: i + 1, sourceLabel: filePath });
6363
6372
  let experiment = typeof raw.experiment === "string" ? raw.experiment : void 0;
6364
6373
  if (!experiment) {
6365
6374
  const parts = filePath.split(path14.sep);
@@ -6955,7 +6964,7 @@ function searchJsonlFile(filePath, regex2, targetFilter, experimentFilter) {
6955
6964
  const target = typeof record.target === "string" ? record.target : void 0;
6956
6965
  const experiment = typeof record.experiment === "string" ? record.experiment : void 0;
6957
6966
  const score = typeof record.score === "number" ? record.score : void 0;
6958
- const testId = typeof record.test_id === "string" ? record.test_id : typeof record.source === "object" && record.source !== null ? record.source.session_id : void 0;
6967
+ const testId = typeof record.test_id === "string" ? record.test_id : typeof record.testId === "string" ? record.testId : typeof record.source === "object" && record.source !== null ? record.source.session_id : void 0;
6959
6968
  if (targetFilter && target !== targetFilter) continue;
6960
6969
  if (experimentFilter && experiment !== experimentFilter) continue;
6961
6970
  const match = regex2.exec(line);
@@ -7748,12 +7757,6 @@ function computeStats(values) {
7748
7757
  import { mkdir as mkdir7, readFile as readFile4, readdir as readdir2, writeFile as writeFile8 } from "node:fs/promises";
7749
7758
  import { join as join2 } from "node:path";
7750
7759
  var DEFAULT_CONCURRENCY = 10;
7751
- function extractInputText(input) {
7752
- if (!input || input.length === 0) return "";
7753
- if (input.length === 1) return input[0].content;
7754
- return input.map((m) => `@[${m.role}]:
7755
- ${m.content}`).join("\n\n");
7756
- }
7757
7760
  async function runCodeGraders(tasks, concurrency) {
7758
7761
  let totalGraders = 0;
7759
7762
  let totalPassed = 0;
@@ -7782,7 +7785,6 @@ async function runCodeGraders(tasks, concurrency) {
7782
7785
  const executeCodeGrader = async (graderConfig, task) => {
7783
7786
  const { testId, resultsDir, responseText, inputData } = task;
7784
7787
  const graderName = graderConfig.name;
7785
- const inputText = extractInputText(inputData.input);
7786
7788
  const messages = [{ role: "assistant", content: responseText }];
7787
7789
  const trace = buildTraceFromMessages({
7788
7790
  input: inputData.input,
@@ -7807,10 +7809,7 @@ async function runCodeGraders(tasks, concurrency) {
7807
7809
  file_changes: null,
7808
7810
  workspace_path: null,
7809
7811
  config: graderConfig.config ?? null,
7810
- metadata: inputData.metadata ?? {},
7811
- input_text: inputText,
7812
- output_text: responseText,
7813
- expected_output_text: ""
7812
+ metadata: inputData.metadata ?? {}
7814
7813
  });
7815
7814
  try {
7816
7815
  const stdout = await executeScript(
@@ -8215,7 +8214,7 @@ import { existsSync as existsSync7, readFileSync as readFileSync6, unlinkSync }
8215
8214
  import { mkdir as mkdir9, readFile as readFile6, readdir as readdir3, writeFile as writeFile10 } from "node:fs/promises";
8216
8215
  import { tmpdir } from "node:os";
8217
8216
  import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
8218
- function extractInputText2(input) {
8217
+ function extractInputText(input) {
8219
8218
  if (!input || input.length === 0) return "";
8220
8219
  if (input.length === 1) return input[0].content;
8221
8220
  return input.map((m) => `@[${m.role}]:
@@ -8399,7 +8398,7 @@ var evalRunCommand2 = command({
8399
8398
  const timeoutMs = invoke.timeout_ms ?? 12e4;
8400
8399
  const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
8401
8400
  const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
8402
- const inputText = extractInputText2(inputData.input);
8401
+ const inputText = extractInputText(inputData.input);
8403
8402
  await writeFile10(promptFile, inputText, "utf8");
8404
8403
  let rendered = template;
8405
8404
  rendered = rendered.replace("{PROMPT_FILE}", promptFile);
@@ -15632,4 +15631,4 @@ export {
15632
15631
  preprocessArgv,
15633
15632
  runCli
15634
15633
  };
15635
- //# sourceMappingURL=chunk-SMZQ7RPW.js.map
15634
+ //# sourceMappingURL=chunk-4NAWRNBL.js.map