agentv 4.35.1 → 4.37.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/dist/{artifact-writer-G57MG52C.js → artifact-writer-GFNKYREE.js} +4 -4
  2. package/dist/{chunk-INOKS5LF.js → chunk-M7AMFWBZ.js} +275 -58
  3. package/dist/chunk-M7AMFWBZ.js.map +1 -0
  4. package/dist/{chunk-KJGYL3M3.js → chunk-N6E5XFOM.js} +213 -85
  5. package/dist/chunk-N6E5XFOM.js.map +1 -0
  6. package/dist/{chunk-KNF3AGCI.js → chunk-OYI35QFW.js} +314 -49
  7. package/dist/chunk-OYI35QFW.js.map +1 -0
  8. package/dist/{chunk-CRMGUVRZ.js → chunk-P4LSNFZR.js} +85 -19
  9. package/dist/chunk-P4LSNFZR.js.map +1 -0
  10. package/dist/{chunk-6QEIZ33V.js → chunk-RL4S2FBZ.js} +2700 -456
  11. package/dist/chunk-RL4S2FBZ.js.map +1 -0
  12. package/dist/cli.js +5 -5
  13. package/dist/dashboard/assets/index-9tV-u4HJ.css +1 -0
  14. package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-BDRYJsGF.js} +1 -1
  15. package/dist/dashboard/assets/index-DuESU7zZ.js +118 -0
  16. package/dist/dashboard/index.html +2 -2
  17. package/dist/{dist-M4B77IW4.js → dist-OY3JSP6Z.js} +125 -3
  18. package/dist/index.js +5 -5
  19. package/dist/{interactive-VYQ5SYMR.js → interactive-CQELHITQ.js} +5 -5
  20. package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
  21. package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-RBTB2HG2-H5TRXZLO.js} +2 -2
  22. package/package.json +1 -1
  23. package/dist/chunk-6QEIZ33V.js.map +0 -1
  24. package/dist/chunk-CRMGUVRZ.js.map +0 -1
  25. package/dist/chunk-INOKS5LF.js.map +0 -1
  26. package/dist/chunk-KJGYL3M3.js.map +0 -1
  27. package/dist/chunk-KNF3AGCI.js.map +0 -1
  28. package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
  29. package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
  30. /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-GFNKYREE.js.map} +0 -0
  31. /package/dist/{dist-M4B77IW4.js.map → dist-OY3JSP6Z.js.map} +0 -0
  32. /package/dist/{interactive-VYQ5SYMR.js.map → interactive-CQELHITQ.js.map} +0 -0
  33. /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-RBTB2HG2-H5TRXZLO.js.map} +0 -0
@@ -4,8 +4,8 @@
4
4
  <meta charset="UTF-8" />
5
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
6
  <title>AgentV</title>
7
- <script type="module" crossorigin src="/assets/index-BWO0UcxG.js"></script>
8
- <link rel="stylesheet" crossorigin href="/assets/index-BPMAZqjE.css">
7
+ <script type="module" crossorigin src="/assets/index-DuESU7zZ.js"></script>
8
+ <link rel="stylesheet" crossorigin href="/assets/index-9tV-u4HJ.css">
9
9
  </head>
10
10
  <body class="bg-gray-950 text-gray-100">
11
11
  <div id="root"></div>
@@ -7,11 +7,13 @@ import {
7
7
  RunBudgetTracker,
8
8
  TranscriptProvider,
9
9
  addProject,
10
+ buildWipBranchName,
10
11
  checkoutResultsRepoBranch,
11
12
  commitAndPushResultsBranch,
12
13
  createAgentKernel,
13
14
  createDraftResultsPr,
14
15
  defineConfig,
16
+ deleteWipBranch,
15
17
  deriveCategory,
16
18
  deriveProjectId,
17
19
  directPushResults,
@@ -39,6 +41,7 @@ import {
39
41
  parseEnvOutput,
40
42
  prepareResultsRepoBranch,
41
43
  pushResultsRepoBranch,
44
+ pushWipCheckpoint,
42
45
  readTranscriptFile,
43
46
  readTranscriptJsonl,
44
47
  removeProject,
@@ -47,6 +50,7 @@ import {
47
50
  runBeforeSessionHook,
48
51
  saveProjectRegistry,
49
52
  scanRepoDeps,
53
+ setupWipWorktree,
50
54
  stageResultsArtifacts,
51
55
  syncProject,
52
56
  syncProjects,
@@ -54,10 +58,12 @@ import {
54
58
  syncResultsRepoForProject,
55
59
  toTranscriptJsonLines,
56
60
  touchProject,
61
+ traceFromTranscriptJsonLines,
62
+ traceToTranscriptJsonLines,
57
63
  transpileEvalYaml,
58
64
  transpileEvalYamlFile,
59
65
  trimBaselineResult
60
- } from "./chunk-KNF3AGCI.js";
66
+ } from "./chunk-OYI35QFW.js";
61
67
  import {
62
68
  OtlpJsonFileExporter
63
69
  } from "./chunk-QOBQ5XYF.js";
@@ -98,25 +104,62 @@ import {
98
104
  NormalizedTrajectoryWireSchema,
99
105
  PASS_THRESHOLD,
100
106
  ProviderRegistry,
107
+ REPLAY_FIXTURE_SCHEMA_VERSION,
101
108
  RUBRIC_OPERATOR_VALUES,
109
+ ReplayProvider,
102
110
  RepoManager,
103
111
  ResponseCache,
104
112
  SkillTriggerGrader,
105
113
  TEST_MESSAGE_ROLES,
114
+ TRACE_ENVELOPE_SCHEMA_VERSION,
115
+ TRACE_EVENT_TYPES,
116
+ TRACE_REDACTION_LEVELS,
117
+ TRACE_SCHEMA_VERSION,
118
+ TRACE_SOURCE_KINDS,
119
+ TRACE_TOOL_STATUSES,
106
120
  TemplateNotDirectoryError,
107
121
  TemplateNotFoundError,
108
122
  TokenUsageGrader,
109
123
  ToolTrajectoryGrader,
124
+ TraceArtifactWireSchema,
125
+ TraceBranchWireSchema,
126
+ TraceEnvelopeBodyWireSchema,
127
+ TraceEnvelopeCaptureWireSchema,
128
+ TraceEnvelopeConversionWarningWireSchema,
129
+ TraceEnvelopeEvalWireSchema,
130
+ TraceEnvelopeReplayWireSchema,
131
+ TraceEnvelopeScoreWireSchema,
132
+ TraceEnvelopeSourceRefWireSchema,
133
+ TraceEnvelopeSourceWireSchema,
134
+ TraceEnvelopeSpanEventWireSchema,
135
+ TraceEnvelopeSpanStatusWireSchema,
136
+ TraceEnvelopeSpanWireSchema,
137
+ TraceEnvelopeWireSchema,
138
+ TraceErrorWireSchema,
139
+ TraceEventWireSchema,
140
+ TraceMessageWireSchema,
141
+ TraceModelWireSchema,
142
+ TraceRawEvidenceWireSchema,
143
+ TraceRedactionStateWireSchema,
144
+ TraceSessionWireSchema,
145
+ TraceSourceRefWireSchema,
146
+ TraceSourceWireSchema,
147
+ TraceToolWireSchema,
110
148
  WorkspaceCreationError,
111
149
  WorkspacePoolManager,
150
+ appendErrorEventToTrace,
151
+ appendReplayFixtureRecord,
112
152
  assembleLlmGraderPrompt,
113
153
  avgToolDurationMs,
114
154
  buildDirectoryChain,
115
155
  buildOutputSchema,
116
156
  buildPromptInputs,
157
+ buildReplayFixtureRecord,
117
158
  buildRubricOutputSchema,
118
159
  buildScoreRangeOutputSchema,
119
160
  buildSearchRoots,
161
+ buildTraceEnvelopeFromEvaluationResult,
162
+ buildTraceFromMessages,
120
163
  calculateRubricScore,
121
164
  captureFileChanges,
122
165
  clampScore,
@@ -159,15 +202,21 @@ import {
159
202
  extractWorkersFromSuite,
160
203
  fileExists,
161
204
  findGitRoot,
205
+ findReplayFixtureRecord,
206
+ findTraceEnvelopeReplayRecord,
207
+ formatReplayLookupKey,
162
208
  formatToolCalls,
163
209
  freeformEvaluationSchema,
164
210
  fromNormalizedTrajectoryWire,
211
+ fromTraceArtifactWire,
212
+ fromTraceEnvelopeWire,
165
213
  getAgentvConfigDir,
166
214
  getAgentvDataDir,
167
215
  getAgentvHome,
168
216
  getSelectedTrajectoryEvents,
169
217
  getSubagentsRoot,
170
218
  getTextContent,
219
+ getTraceEnvelopeSummary,
171
220
  getTraceStateRoot,
172
221
  getWorkspacePath,
173
222
  getWorkspacePoolRoot,
@@ -201,9 +250,13 @@ import {
201
250
  parseJsonSafe,
202
251
  parseYamlValue,
203
252
  readJsonFile,
253
+ readReplayFixtureRecords,
204
254
  readTargetDefinitions,
205
255
  readTestSuiteMetadata,
206
256
  readTextFile,
257
+ readTraceEnvelopeReplayRecords,
258
+ replayFixtureRecordToProviderResponse,
259
+ replayLookupIdentityMatches,
207
260
  resolveAndCreateProvider,
208
261
  resolveDelegatedTargetDefinition,
209
262
  resolveFileReference,
@@ -224,8 +277,10 @@ import {
224
277
  runIsJsonAssertion,
225
278
  runRegexAssertion,
226
279
  runStartsWithAssertion,
280
+ sameReplayEvalPath,
227
281
  scoreRangeEvaluationSchema,
228
282
  scoreToVerdict,
283
+ serializeReplayFixtureRecord,
229
284
  shouldEnableCache,
230
285
  shouldSkipCacheForTemperature,
231
286
  subscribeToClaudeLogEntries,
@@ -237,10 +292,16 @@ import {
237
292
  toCamelCaseDeep,
238
293
  toNormalizedTrajectoryWire,
239
294
  toSnakeCaseDeep,
295
+ toTraceArtifactWire,
296
+ toTraceEnvelopeWire,
240
297
  tokensPerTool,
298
+ traceEnvelopeReplayRecordToProviderResponse,
299
+ traceEnvelopeToMessages,
300
+ traceEnvelopeToTraceArtifact,
301
+ traceEnvelopeToTraceSummary,
241
302
  trackChild,
242
303
  trackedChildCount
243
- } from "./chunk-6QEIZ33V.js";
304
+ } from "./chunk-RL4S2FBZ.js";
244
305
  import "./chunk-NPVGBFF6.js";
245
306
  import "./chunk-M7BUKBAF.js";
246
307
  import "./chunk-5H446C7X.js";
@@ -284,28 +345,66 @@ export {
284
345
  OtlpJsonFileExporter,
285
346
  PASS_THRESHOLD,
286
347
  ProviderRegistry,
348
+ REPLAY_FIXTURE_SCHEMA_VERSION,
287
349
  RUBRIC_OPERATOR_VALUES,
350
+ ReplayProvider,
288
351
  RepoManager,
289
352
  ResponseCache,
290
353
  RunBudgetTracker,
291
354
  SkillTriggerGrader,
292
355
  TEST_MESSAGE_ROLES,
356
+ TRACE_ENVELOPE_SCHEMA_VERSION,
357
+ TRACE_EVENT_TYPES,
358
+ TRACE_REDACTION_LEVELS,
359
+ TRACE_SCHEMA_VERSION,
360
+ TRACE_SOURCE_KINDS,
361
+ TRACE_TOOL_STATUSES,
293
362
  TemplateNotDirectoryError,
294
363
  TemplateNotFoundError,
295
364
  TokenUsageGrader,
296
365
  ToolTrajectoryGrader,
366
+ TraceArtifactWireSchema,
367
+ TraceBranchWireSchema,
368
+ TraceEnvelopeBodyWireSchema,
369
+ TraceEnvelopeCaptureWireSchema,
370
+ TraceEnvelopeConversionWarningWireSchema,
371
+ TraceEnvelopeEvalWireSchema,
372
+ TraceEnvelopeReplayWireSchema,
373
+ TraceEnvelopeScoreWireSchema,
374
+ TraceEnvelopeSourceRefWireSchema,
375
+ TraceEnvelopeSourceWireSchema,
376
+ TraceEnvelopeSpanEventWireSchema,
377
+ TraceEnvelopeSpanStatusWireSchema,
378
+ TraceEnvelopeSpanWireSchema,
379
+ TraceEnvelopeWireSchema,
380
+ TraceErrorWireSchema,
381
+ TraceEventWireSchema,
382
+ TraceMessageWireSchema,
383
+ TraceModelWireSchema,
384
+ TraceRawEvidenceWireSchema,
385
+ TraceRedactionStateWireSchema,
386
+ TraceSessionWireSchema,
387
+ TraceSourceRefWireSchema,
388
+ TraceSourceWireSchema,
389
+ TraceToolWireSchema,
297
390
  TranscriptProvider,
298
391
  WorkspaceCreationError,
299
392
  WorkspacePoolManager,
300
393
  addProject,
394
+ appendErrorEventToTrace,
395
+ appendReplayFixtureRecord,
301
396
  assembleLlmGraderPrompt,
302
397
  avgToolDurationMs,
303
398
  buildDirectoryChain,
304
399
  buildOutputSchema,
305
400
  buildPromptInputs,
401
+ buildReplayFixtureRecord,
306
402
  buildRubricOutputSchema,
307
403
  buildScoreRangeOutputSchema,
308
404
  buildSearchRoots,
405
+ buildTraceEnvelopeFromEvaluationResult,
406
+ buildTraceFromMessages,
407
+ buildWipBranchName,
309
408
  calculateRubricScore,
310
409
  captureFileChanges,
311
410
  checkoutResultsRepoBranch,
@@ -329,6 +428,7 @@ export {
329
428
  createTempWorkspace,
330
429
  deepEqual,
331
430
  defineConfig,
431
+ deleteWipBranch,
332
432
  deriveCategory,
333
433
  deriveProjectId,
334
434
  detectFormat,
@@ -361,9 +461,14 @@ export {
361
461
  extractWorkersFromSuite,
362
462
  fileExists,
363
463
  findGitRoot,
464
+ findReplayFixtureRecord,
465
+ findTraceEnvelopeReplayRecord,
466
+ formatReplayLookupKey,
364
467
  formatToolCalls,
365
468
  freeformEvaluationSchema,
366
469
  fromNormalizedTrajectoryWire,
470
+ fromTraceArtifactWire,
471
+ fromTraceEnvelopeWire,
367
472
  generateRubrics,
368
473
  getAgentvConfigDir,
369
474
  getAgentvDataDir,
@@ -378,6 +483,7 @@ export {
378
483
  getSelectedTrajectoryEvents,
379
484
  getSubagentsRoot,
380
485
  getTextContent,
486
+ getTraceEnvelopeSummary,
381
487
  getTraceStateRoot,
382
488
  getWorkspacePath,
383
489
  getWorkspacePoolRoot,
@@ -421,13 +527,18 @@ export {
421
527
  parseYamlValue,
422
528
  prepareResultsRepoBranch,
423
529
  pushResultsRepoBranch,
530
+ pushWipCheckpoint,
424
531
  readJsonFile,
532
+ readReplayFixtureRecords,
425
533
  readTargetDefinitions,
426
534
  readTestSuiteMetadata,
427
535
  readTextFile,
536
+ readTraceEnvelopeReplayRecords,
428
537
  readTranscriptFile,
429
538
  readTranscriptJsonl,
430
539
  removeProject,
540
+ replayFixtureRecordToProviderResponse,
541
+ replayLookupIdentityMatches,
431
542
  resolveAndCreateProvider,
432
543
  resolveDelegatedTargetDefinition,
433
544
  resolveFileReference,
@@ -451,10 +562,13 @@ export {
451
562
  runIsJsonAssertion,
452
563
  runRegexAssertion,
453
564
  runStartsWithAssertion,
565
+ sameReplayEvalPath,
454
566
  saveProjectRegistry,
455
567
  scanRepoDeps,
456
568
  scoreRangeEvaluationSchema,
457
569
  scoreToVerdict,
570
+ serializeReplayFixtureRecord,
571
+ setupWipWorktree,
458
572
  shouldEnableCache,
459
573
  shouldSkipCacheForTemperature,
460
574
  stageResultsArtifacts,
@@ -471,13 +585,21 @@ export {
471
585
  toCamelCaseDeep,
472
586
  toNormalizedTrajectoryWire,
473
587
  toSnakeCaseDeep,
588
+ toTraceArtifactWire,
589
+ toTraceEnvelopeWire,
474
590
  toTranscriptJsonLines,
475
591
  tokensPerTool,
476
592
  touchProject,
593
+ traceEnvelopeReplayRecordToProviderResponse,
594
+ traceEnvelopeToMessages,
595
+ traceEnvelopeToTraceArtifact,
596
+ traceEnvelopeToTraceSummary,
597
+ traceFromTranscriptJsonLines,
598
+ traceToTranscriptJsonLines,
477
599
  trackChild,
478
600
  trackedChildCount,
479
601
  transpileEvalYaml,
480
602
  transpileEvalYamlFile,
481
603
  trimBaselineResult
482
604
  };
483
- //# sourceMappingURL=dist-M4B77IW4.js.map
605
+ //# sourceMappingURL=dist-OY3JSP6Z.js.map
package/dist/index.js CHANGED
@@ -4,13 +4,13 @@ import {
4
4
  preprocessArgv,
5
5
  runCli,
6
6
  usesDeprecatedStudioAlias
7
- } from "./chunk-CRMGUVRZ.js";
8
- import "./chunk-INOKS5LF.js";
9
- import "./chunk-KJGYL3M3.js";
10
- import "./chunk-KNF3AGCI.js";
7
+ } from "./chunk-P4LSNFZR.js";
8
+ import "./chunk-M7AMFWBZ.js";
9
+ import "./chunk-N6E5XFOM.js";
10
+ import "./chunk-OYI35QFW.js";
11
11
  import "./chunk-QOBQ5XYF.js";
12
12
  import "./chunk-BPGJ4HBU.js";
13
- import "./chunk-6QEIZ33V.js";
13
+ import "./chunk-RL4S2FBZ.js";
14
14
  import "./chunk-NPVGBFF6.js";
15
15
  import "./chunk-M7BUKBAF.js";
16
16
  import "./chunk-5H446C7X.js";
@@ -7,16 +7,16 @@ import {
7
7
  findRepoRoot,
8
8
  getCategories,
9
9
  runEvalCommand
10
- } from "./chunk-INOKS5LF.js";
11
- import "./chunk-KJGYL3M3.js";
12
- import "./chunk-KNF3AGCI.js";
10
+ } from "./chunk-M7AMFWBZ.js";
11
+ import "./chunk-N6E5XFOM.js";
12
+ import "./chunk-OYI35QFW.js";
13
13
  import "./chunk-QOBQ5XYF.js";
14
14
  import "./chunk-BPGJ4HBU.js";
15
15
  import {
16
16
  getAgentvConfigDir,
17
17
  listTargetNames,
18
18
  readTargetDefinitions
19
- } from "./chunk-6QEIZ33V.js";
19
+ } from "./chunk-RL4S2FBZ.js";
20
20
  import "./chunk-NPVGBFF6.js";
21
21
  import "./chunk-M7BUKBAF.js";
22
22
  import "./chunk-5H446C7X.js";
@@ -360,4 +360,4 @@ ${ANSI_DIM}Retrying execution errors...${ANSI_RESET}
360
360
  export {
361
361
  launchInteractiveWizard
362
362
  };
363
- //# sourceMappingURL=interactive-VYQ5SYMR.js.map
363
+ //# sourceMappingURL=interactive-CQELHITQ.js.map
@@ -544,6 +544,10 @@ agentv eval <file.yaml> [--test-id <id>] [--target <name>] [--dry-run] [--thresh
544
544
  # Run with OTLP JSON file (importable by OTel backends)
545
545
  agentv eval <file.yaml> --otel-file traces/eval.otlp.json
546
546
 
547
+ # Record live target output for later target substitution
548
+ agentv eval <file.yaml> --target live_agent --record-replay fixtures/target-output.jsonl
549
+ agentv eval <file.yaml> --target replay_agent
550
+
547
551
  # Run a single assertion in isolation (no API keys needed)
548
552
  agentv eval assert <grader-name> --agent-output "..." --agent-input "..."
549
553
 
@@ -567,6 +571,8 @@ agentv compare .agentv/results/runs/<baseline-timestamp>/index.jsonl .agentv/res
567
571
  agentv validate <file.yaml>
568
572
  ```
569
573
 
574
+ **Replay targets:** Add `provider: replay`, `fixtures: <jsonl>`, and `source_target: <live target name>` in `.agentv/targets.yaml`. Optional `suite`, `eval_path`, and `variant` tighten lookup. The eval YAML and graders stay unchanged; replay only substitutes recorded target output, and graders run fresh.
575
+
570
576
  ## Code Judge SDK
571
577
 
572
578
  Use `@agentv/eval` to build custom graders in TypeScript/JavaScript:
@@ -2,7 +2,7 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
2
2
  import {
3
3
  loadTsEvalFile,
4
4
  loadTsEvalSuite
5
- } from "./chunk-6QEIZ33V.js";
5
+ } from "./chunk-RL4S2FBZ.js";
6
6
  import "./chunk-NPVGBFF6.js";
7
7
  import "./chunk-M7BUKBAF.js";
8
8
  import "./chunk-5H446C7X.js";
@@ -10,4 +10,4 @@ export {
10
10
  loadTsEvalFile,
11
11
  loadTsEvalSuite
12
12
  };
13
- //# sourceMappingURL=ts-eval-loader-EQJX3OLT-THE7D3GR.js.map
13
+ //# sourceMappingURL=ts-eval-loader-RBTB2HG2-H5TRXZLO.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentv",
3
- "version": "4.35.1",
3
+ "version": "4.37.0-next.1",
4
4
  "description": "CLI entry point for AgentV",
5
5
  "type": "module",
6
6
  "repository": {