@tangle-network/agent-eval 0.21.0 → 0.23.0

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/CHANGELOG.md +236 -1
  2. package/README.md +17 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  5. package/dist/chunk-4W4NCYM2.js.map +1 -0
  6. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  7. package/dist/chunk-6M774GY6.js +53 -0
  8. package/dist/chunk-6M774GY6.js.map +1 -0
  9. package/dist/chunk-7EAUOUQS.js +495 -0
  10. package/dist/chunk-7EAUOUQS.js.map +1 -0
  11. package/dist/chunk-AXHNWLIX.js +246 -0
  12. package/dist/chunk-AXHNWLIX.js.map +1 -0
  13. package/dist/chunk-EXGR4XEM.js +283 -0
  14. package/dist/chunk-EXGR4XEM.js.map +1 -0
  15. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  16. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  17. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  18. package/dist/chunk-LZKIOBG2.js +2026 -0
  19. package/dist/chunk-LZKIOBG2.js.map +1 -0
  20. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  21. package/dist/chunk-QBW3YBTR.js.map +1 -0
  22. package/dist/chunk-QUKKGHTZ.js +121 -0
  23. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  24. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  25. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  26. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  27. package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
  28. package/dist/cli.js +3 -3
  29. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  30. package/dist/control.d.ts +3 -3
  31. package/dist/control.js +2 -2
  32. package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
  33. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  34. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  35. package/dist/index-ekBXweiQ.d.ts +1894 -0
  36. package/dist/index.d.ts +20 -430
  37. package/dist/index.js +154 -34
  38. package/dist/index.js.map +1 -1
  39. package/dist/integrity-Cr5YodSY.d.ts +210 -0
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +7 -145
  42. package/dist/optimization.js +12 -3
  43. package/dist/reporting.d.ts +294 -4
  44. package/dist/reporting.js +18 -9
  45. package/dist/rl.d.ts +8 -0
  46. package/dist/rl.js +113 -0
  47. package/dist/rl.js.map +1 -0
  48. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  49. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  50. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
  51. package/dist/traces.d.ts +101 -181
  52. package/dist/traces.js +19 -8
  53. package/dist/wire/index.js +3 -3
  54. package/docs/auto-research-loop-end-to-end.md +186 -0
  55. package/docs/research-report-methodology.md +19 -4
  56. package/docs/three-package-architecture.md +180 -0
  57. package/docs/wire-protocol.md +1 -1
  58. package/package.json +7 -2
  59. package/dist/chunk-3IX6QTB7.js.map +0 -1
  60. package/dist/chunk-KRR4VMH7.js +0 -423
  61. package/dist/chunk-KRR4VMH7.js.map +0 -1
  62. package/dist/chunk-WOK2RTWG.js.map +0 -1
  63. package/dist/chunk-YUFXO3TU.js.map +0 -1
  64. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  65. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  66. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
  67. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
  68. /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  stopOnNoProgress,
20
20
  stopOnRepeatedAction,
21
21
  subjectiveEval
22
- } from "./chunk-ARZ6BEV6.js";
22
+ } from "./chunk-V5QSWN7L.js";
23
23
  import {
24
24
  CallbackResearcher,
25
25
  DEFAULT_MUTATION_PRIMITIVES,
@@ -53,49 +53,111 @@ import {
53
53
  summarizePreferenceMemory,
54
54
  trialTraceFromMultiShotTrial,
55
55
  withAssignedFeedbackSplit
56
- } from "./chunk-HRZELXCR.js";
56
+ } from "./chunk-VQQSPGSM.js";
57
57
  import {
58
58
  RunRecordValidationError,
59
59
  isRunRecord,
60
60
  parseRunRecordSafe,
61
61
  roundTripRunRecord,
62
62
  validateRunRecord
63
- } from "./chunk-YUFXO3TU.js";
63
+ } from "./chunk-QBW3YBTR.js";
64
64
  import {
65
- RESEARCH_REPORT_HARD_PAIR_FLOOR,
66
65
  assertReleaseConfidence,
67
66
  bootstrapCi,
68
- canonicalize,
69
- evaluateHypothesis,
70
67
  evaluateReleaseConfidence,
71
- gainHistogram,
72
- hashJson,
73
68
  judgeReplayGate,
74
- paretoChart,
75
69
  releaseTraceEvidenceFromMultiShotTrials,
76
- renderReleaseReport,
77
- researchReport,
78
- signManifest,
79
- summaryTable,
80
- verifyManifest
81
- } from "./chunk-3IX6QTB7.js";
70
+ renderReleaseReport
71
+ } from "./chunk-7EAUOUQS.js";
72
+ import {
73
+ PredictiveValidityResearcher,
74
+ adversarialScenarioSearch,
75
+ analyzeOptimizationResult,
76
+ applyEloUpdate,
77
+ bestOfN,
78
+ buildPairwiseFromCampaign,
79
+ compareAdaptationCurves,
80
+ detectRewardHacking,
81
+ doublyRobust,
82
+ extractPreferences,
83
+ extractStepRewards,
84
+ extractVerifiableReward,
85
+ extractVerifiableRewardsFromRecords,
86
+ filterDeterministicallyRewarded,
87
+ firstPassK,
88
+ fitBradleyTerry,
89
+ injectIrrelevantClause,
90
+ inverseProbabilityWeighting,
91
+ observationsFromRunRecords,
92
+ offPolicyEstimateAll,
93
+ prmTrainingPairs,
94
+ renameVariables,
95
+ runAdaptationCurve,
96
+ runComputeCurve,
97
+ runContaminationProbe,
98
+ runRLCampaign,
99
+ runwiseStepRewardSummary,
100
+ selfConsistency,
101
+ selfNormalizedImportanceWeighting,
102
+ shuffleOrder,
103
+ stepRewardsToJsonl,
104
+ thompsonCurriculum,
105
+ toAnthropicFormat,
106
+ toDpoJsonl,
107
+ toDpoRows,
108
+ toGrpoJsonl,
109
+ toGrpoRows,
110
+ toPrmJsonl,
111
+ toPrmRows,
112
+ toSftJsonl,
113
+ toSftRows,
114
+ toTRLFormat,
115
+ trialToRunRecord,
116
+ trialsToRunRecords,
117
+ varianceBasedCurriculum,
118
+ variantAggregateToRunRecord,
119
+ verificationReportToRunRecord
120
+ } from "./chunk-LZKIOBG2.js";
121
+ import {
122
+ runEvalCampaign
123
+ } from "./chunk-EXGR4XEM.js";
124
+ import {
125
+ LlmCallError,
126
+ LlmClient,
127
+ LlmRouteAssertionError,
128
+ assertLlmRoute,
129
+ callLlm,
130
+ callLlmJson,
131
+ probeLlm,
132
+ stripFencedJson
133
+ } from "./chunk-KAO3Q65R.js";
134
+ import {
135
+ evaluateInterimReleaseConfidence,
136
+ pairedEvalueSequence,
137
+ rubricPredictiveValidity
138
+ } from "./chunk-AXHNWLIX.js";
82
139
  import {
140
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
83
141
  benjaminiHochberg,
84
142
  bhAdjust,
85
143
  bonferroni,
86
144
  cohensD,
87
145
  confidenceInterval,
146
+ gainHistogram,
88
147
  interRaterReliability,
89
148
  mannWhitneyU,
90
149
  normalizeScores,
91
150
  pairedBootstrap,
92
151
  pairedTTest,
93
152
  pairedWilcoxon,
153
+ paretoChart,
94
154
  partialCredit,
95
155
  requiredSampleSize,
156
+ researchReport,
157
+ summaryTable,
96
158
  weightedMean,
97
159
  wilcoxonSignedRank
98
- } from "./chunk-KRR4VMH7.js";
160
+ } from "./chunk-IOXMGMHQ.js";
99
161
  import {
100
162
  DEFAULT_REDACTION_RULES,
101
163
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -105,7 +167,8 @@ import {
105
167
  OTEL_AGENT_EVAL_SCOPE,
106
168
  OtlpFileTraceStore,
107
169
  REDACTION_VERSION,
108
- RunIntegrityError,
170
+ ReplayCache,
171
+ ReplayCacheMissError,
109
172
  SpanNotFoundError,
110
173
  TRACE_ANALYST_ACTOR_DESCRIPTION,
111
174
  TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -117,10 +180,10 @@ import {
117
180
  aggregateLlm,
118
181
  analyzeTraces,
119
182
  argHash,
120
- assertRunCaptured,
121
183
  buildTraceAnalystTools,
122
184
  buildTraceInsightContext,
123
185
  buildTraceInsightPrompt,
186
+ createReplayFetch,
124
187
  defaultTraceInsightPanel,
125
188
  describeTraceInsightScope,
126
189
  domainEvidencePattern,
@@ -132,6 +195,7 @@ import {
132
195
  isRetrievalSpan,
133
196
  isSandboxSpan,
134
197
  isToolSpan,
198
+ iterateRawCalls,
135
199
  judgeSpans,
136
200
  llmSpans,
137
201
  planTraceInsightQuestions,
@@ -140,33 +204,34 @@ import {
140
204
  runFailureClass,
141
205
  runsForScenario,
142
206
  scoreTraceInsightReadiness,
143
- throwIfRunIncomplete,
144
207
  tokenizeDomainWords,
145
208
  toolSpans,
146
209
  traceAnalystFunctionGroup,
147
210
  traceAnalystOnRunComplete
148
- } from "./chunk-WOK2RTWG.js";
149
- import {
150
- TraceEmitter,
151
- llmSpanFromProvider
152
- } from "./chunk-5IIQKMD5.js";
211
+ } from "./chunk-4W4NCYM2.js";
153
212
  import {
154
- LlmCallError,
155
- LlmClient,
156
- LlmRouteAssertionError,
157
- assertLlmRoute,
158
- callLlm,
159
- callLlmJson,
160
- probeLlm,
161
- stripFencedJson
162
- } from "./chunk-3GN6U53I.js";
213
+ RunIntegrityError,
214
+ assertRunCaptured,
215
+ throwIfRunIncomplete
216
+ } from "./chunk-QUKKGHTZ.js";
163
217
  import {
164
218
  FileSystemRawProviderSink,
165
219
  InMemoryRawProviderSink,
166
220
  NoopRawProviderSink,
167
221
  defaultProviderRedactor,
168
222
  providerFromBaseUrl
169
- } from "./chunk-SNUHRBDL.js";
223
+ } from "./chunk-SQQLHODJ.js";
224
+ import {
225
+ TraceEmitter,
226
+ llmSpanFromProvider
227
+ } from "./chunk-5IIQKMD5.js";
228
+ import {
229
+ canonicalize,
230
+ evaluateHypothesis,
231
+ hashJson,
232
+ signManifest,
233
+ verifyManifest
234
+ } from "./chunk-6M774GY6.js";
170
235
  import "./chunk-PZ5AY32C.js";
171
236
 
172
237
  // src/client.ts
@@ -10486,12 +10551,15 @@ export {
10486
10551
  OTEL_AGENT_EVAL_SCOPE,
10487
10552
  OtlpFileTraceStore,
10488
10553
  PairwiseSteeringOptimizer,
10554
+ PredictiveValidityResearcher,
10489
10555
  PrmGrader,
10490
10556
  ProductClient,
10491
10557
  ProjectRegistry,
10492
10558
  PromptRegistry,
10493
10559
  REDACTION_VERSION,
10494
10560
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
10561
+ ReplayCache,
10562
+ ReplayCacheMissError,
10495
10563
  RunCritic,
10496
10564
  RunIntegrityError,
10497
10565
  RunRecordValidationError,
@@ -10513,12 +10581,15 @@ export {
10513
10581
  UNIVERSAL_FINDERS,
10514
10582
  acquisitionPlansForKnowledgeGaps,
10515
10583
  adversarialJudge,
10584
+ adversarialScenarioSearch,
10516
10585
  aggregateLlm,
10517
10586
  aggregateRunScore,
10518
10587
  allCriticalPassed,
10519
10588
  analyzeAntiSlop,
10589
+ analyzeOptimizationResult,
10520
10590
  analyzeSeries,
10521
10591
  analyzeTraces,
10592
+ applyEloUpdate,
10522
10593
  argHash,
10523
10594
  assertLlmRoute,
10524
10595
  assertReleaseConfidence,
@@ -10528,12 +10599,14 @@ export {
10528
10599
  deterministicSplit as benchmarkDeterministicSplit,
10529
10600
  benchmarks_exports as benchmarks,
10530
10601
  benjaminiHochberg,
10602
+ bestOfN,
10531
10603
  bhAdjust,
10532
10604
  bisect,
10533
10605
  blockingKnowledgeEval,
10534
10606
  bonferroni,
10535
10607
  bootstrapCi,
10536
10608
  budgetBreachView,
10609
+ buildPairwiseFromCampaign,
10537
10610
  buildReflectionPrompt,
10538
10611
  buildReviewerPrompt,
10539
10612
  buildTraceAnalystTools,
@@ -10559,6 +10632,7 @@ export {
10559
10632
  coherenceJudge,
10560
10633
  collectionPreserved,
10561
10634
  commitBisect,
10635
+ compareAdaptationCurves,
10562
10636
  compareReferenceReplay,
10563
10637
  compareToBaseline,
10564
10638
  compilerJudge,
@@ -10580,6 +10654,7 @@ export {
10580
10654
  createFeedbackTrajectory,
10581
10655
  createIntentMatchJudge,
10582
10656
  createLlmReviewer,
10657
+ createReplayFetch,
10583
10658
  createSandboxCodeMutator,
10584
10659
  createSandboxPool,
10585
10660
  createSemanticConceptJudge,
@@ -10594,15 +10669,18 @@ export {
10594
10669
  defaultTraceInsightPanel,
10595
10670
  deployGateLayer,
10596
10671
  describeTraceInsightScope,
10672
+ detectRewardHacking,
10597
10673
  distillPlaybook,
10598
10674
  domainEvidencePattern,
10599
10675
  dominates,
10676
+ doublyRobust,
10600
10677
  estimateCost,
10601
10678
  estimateTokens,
10602
10679
  euAiActReport,
10603
10680
  evaluateActionPolicy,
10604
10681
  evaluateContract,
10605
10682
  evaluateHypothesis,
10683
+ evaluateInterimReleaseConfidence,
10606
10684
  evaluateOracles,
10607
10685
  evaluateReleaseConfidence,
10608
10686
  executeScenario,
@@ -10612,6 +10690,10 @@ export {
10612
10690
  exportTrainingData,
10613
10691
  extractAssetUrls,
10614
10692
  extractErrorCount,
10693
+ extractPreferences,
10694
+ extractStepRewards,
10695
+ extractVerifiableReward,
10696
+ extractVerifiableRewardsFromRecords,
10615
10697
  failureClusterView,
10616
10698
  feedbackTrajectoriesToDatasetScenarios,
10617
10699
  feedbackTrajectoriesToOptimizerRows,
@@ -10619,12 +10701,15 @@ export {
10619
10701
  feedbackTrajectoryToOptimizerRow,
10620
10702
  fileContains,
10621
10703
  fileExists,
10704
+ filterDeterministicallyRewarded,
10622
10705
  findAutoMatchNoExpectation,
10623
10706
  findConstructorCwdDropped,
10624
10707
  findFallbackToPass,
10625
10708
  findLiteralTruePass,
10626
10709
  findSkipCountsAsPass,
10627
10710
  firstDivergenceView,
10711
+ firstPassK,
10712
+ fitBradleyTerry,
10628
10713
  flowLayer,
10629
10714
  formatBenchmarkReport,
10630
10715
  formatDriverReport,
@@ -10640,12 +10725,14 @@ export {
10640
10725
  inMemoryReferenceReplayStore,
10641
10726
  inMemoryReviewStore,
10642
10727
  inferDomainKeywords,
10728
+ injectIrrelevantClause,
10643
10729
  integrationAsi,
10644
10730
  integrationGateEvals,
10645
10731
  integrationInvokeFailedPayload,
10646
10732
  integrationManifestResolvedPayload,
10647
10733
  integrationManifestValidatedPayload,
10648
10734
  interRaterReliability,
10735
+ inverseProbabilityWeighting,
10649
10736
  iqr,
10650
10737
  isJudgeSpan,
10651
10738
  isLlmSpan,
@@ -10654,6 +10741,7 @@ export {
10654
10741
  isRunRecord,
10655
10742
  isSandboxSpan,
10656
10743
  isToolSpan,
10744
+ iterateRawCalls,
10657
10745
  jestTestParser,
10658
10746
  jsonHasKeys,
10659
10747
  jsonShape,
@@ -10680,8 +10768,11 @@ export {
10680
10768
  normalizeScores,
10681
10769
  notBlocked,
10682
10770
  objectiveEval,
10771
+ observationsFromRunRecords,
10772
+ offPolicyEstimateAll,
10683
10773
  outputLengthRubric,
10684
10774
  pairedBootstrap,
10775
+ pairedEvalueSequence,
10685
10776
  pairedTTest,
10686
10777
  pairedWilcoxon,
10687
10778
  paraphraseRobustness,
@@ -10701,6 +10792,7 @@ export {
10701
10792
  printDriverSummary,
10702
10793
  prmBestOfN,
10703
10794
  prmEnsembleBestOfN,
10795
+ prmTrainingPairs,
10704
10796
  probeLlm,
10705
10797
  promptBisect,
10706
10798
  proposeSynthesisTargets,
@@ -10716,6 +10808,7 @@ export {
10716
10808
  regexMatches,
10717
10809
  regressionView,
10718
10810
  releaseTraceEvidenceFromMultiShotTrials,
10811
+ renameVariables,
10719
10812
  renderMarkdown,
10720
10813
  renderMarkdownReport,
10721
10814
  renderPlaybookMarkdown,
@@ -10733,12 +10826,17 @@ export {
10733
10826
  roundTripRunRecord,
10734
10827
  rowCount,
10735
10828
  rowWhere,
10829
+ rubricPredictiveValidity,
10830
+ runAdaptationCurve,
10736
10831
  runAgentControlLoop,
10737
10832
  runAssertions,
10738
10833
  runBehavioralCanaries,
10739
10834
  runCanaries,
10835
+ runComputeCurve,
10836
+ runContaminationProbe,
10740
10837
  runCounterfactual,
10741
10838
  runE2EWorkflow,
10839
+ runEvalCampaign,
10742
10840
  runExpectations,
10743
10841
  runFailureClass,
10744
10842
  runHarnessExperiment,
@@ -10751,11 +10849,13 @@ export {
10751
10849
  runPromptEvolution,
10752
10850
  runProposeReview,
10753
10851
  runProposeReviewAsControlLoop,
10852
+ runRLCampaign,
10754
10853
  runReferenceReplay,
10755
10854
  runSelfPlay,
10756
10855
  runSemanticConceptJudge,
10757
10856
  runTestGradedScenario,
10758
10857
  runsForScenario,
10858
+ runwiseStepRewardSummary,
10759
10859
  scalarScore,
10760
10860
  scanForMuffledGates,
10761
10861
  scoreAllProjects,
@@ -10768,12 +10868,16 @@ export {
10768
10868
  scoreTraceInsightReadiness,
10769
10869
  securityJudge,
10770
10870
  selectHarnessVariant,
10871
+ selfConsistency,
10872
+ selfNormalizedImportanceWeighting,
10771
10873
  selfPreference,
10772
10874
  sentenceReorderMutator,
10773
10875
  serializeFeedbackTrajectoriesJsonl,
10876
+ shuffleOrder,
10774
10877
  signManifest,
10775
10878
  soc2Report,
10776
10879
  statusAdvanced,
10880
+ stepRewardsToJsonl,
10777
10881
  stopOnNoProgress,
10778
10882
  stopOnRepeatedAction,
10779
10883
  stripFencedJson,
@@ -10785,10 +10889,21 @@ export {
10785
10889
  summaryTable,
10786
10890
  testJudge,
10787
10891
  textInSnapshot,
10892
+ thompsonCurriculum,
10788
10893
  throwIfRunIncomplete,
10894
+ toAnthropicFormat,
10895
+ toDpoJsonl,
10896
+ toDpoRows,
10897
+ toGrpoJsonl,
10898
+ toGrpoRows,
10789
10899
  toLangfuseEnvelope,
10790
10900
  toNdjson,
10901
+ toPrmJsonl,
10902
+ toPrmRows,
10791
10903
  toPrometheusText,
10904
+ toSftJsonl,
10905
+ toSftRows,
10906
+ toTRLFormat,
10792
10907
  tokenizeDomainWords,
10793
10908
  toolIntentAlignmentRubric,
10794
10909
  toolNamesForRun,
@@ -10798,12 +10913,17 @@ export {
10798
10913
  toolWasteView,
10799
10914
  traceAnalystFunctionGroup,
10800
10915
  traceAnalystOnRunComplete,
10916
+ trialToRunRecord,
10801
10917
  trialTraceFromMultiShotTrial,
10918
+ trialsToRunRecords,
10802
10919
  typoMutator,
10803
10920
  urlContains,
10804
10921
  userQuestionsForKnowledgeGaps,
10805
10922
  validateRunRecord,
10923
+ varianceBasedCurriculum,
10924
+ variantAggregateToRunRecord,
10806
10925
  verbosityBias,
10926
+ verificationReportToRunRecord,
10807
10927
  verifyManifest,
10808
10928
  visualDiff,
10809
10929
  viteDeployRunner,