@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +177 -0
  2. package/README.md +43 -1
  3. package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  6. package/dist/chunk-5IIQKMD5.js.map +1 -0
  7. package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
  8. package/dist/chunk-6M774GY6.js +53 -0
  9. package/dist/chunk-6M774GY6.js.map +1 -0
  10. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  11. package/dist/chunk-IOXMGMHQ.js +1226 -0
  12. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  13. package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
  14. package/dist/chunk-KAO3Q65R.js.map +1 -0
  15. package/dist/chunk-QUKKGHTZ.js +121 -0
  16. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  17. package/dist/chunk-SQQLHODJ.js +163 -0
  18. package/dist/chunk-SQQLHODJ.js.map +1 -0
  19. package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
  20. package/dist/chunk-UAND2LOT.js.map +1 -0
  21. package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
  22. package/dist/chunk-USHQBPMH.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  26. package/dist/control.d.ts +4 -3
  27. package/dist/control.js +2 -2
  28. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  29. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  30. package/dist/index.d.ts +16 -302
  31. package/dist/index.js +70 -62
  32. package/dist/index.js.map +1 -1
  33. package/dist/integrity-K2oVlF57.d.ts +210 -0
  34. package/dist/openapi.json +1 -1
  35. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  36. package/dist/optimization.d.ts +7 -144
  37. package/dist/optimization.js +9 -2
  38. package/dist/reporting-B82RSv9C.d.ts +593 -0
  39. package/dist/reporting.d.ts +5 -426
  40. package/dist/reporting.js +17 -6
  41. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  42. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  43. package/dist/traces.d.ts +179 -3
  44. package/dist/traces.js +35 -4
  45. package/dist/wire/index.js +3 -2
  46. package/docs/research-report-methodology.md +170 -0
  47. package/docs/wire-protocol.md +1 -1
  48. package/package.json +11 -13
  49. package/dist/chunk-75MCTH7P.js.map +0 -1
  50. package/dist/chunk-HKYRWNHV.js.map +0 -1
  51. package/dist/chunk-IKFVX537.js.map +0 -1
  52. package/dist/chunk-KWUAAIHR.js.map +0 -1
  53. package/dist/chunk-ODFINDLQ.js +0 -413
  54. package/dist/chunk-ODFINDLQ.js.map +0 -1
  55. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  56. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
  57. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  stopOnNoProgress,
20
20
  stopOnRepeatedAction,
21
21
  subjectiveEval
22
- } from "./chunk-MCMV7DUL.js";
22
+ } from "./chunk-ARZ6BEV6.js";
23
23
  import {
24
24
  CallbackResearcher,
25
25
  DEFAULT_MUTATION_PRIMITIVES,
@@ -46,6 +46,7 @@ import {
46
46
  renderPreferenceMemoryMarkdown,
47
47
  replayFeedbackTrajectories,
48
48
  replayFeedbackTrajectory,
49
+ runEvalCampaign,
49
50
  runMultiShotOptimization,
50
51
  runPromptEvolution,
51
52
  scalarScore,
@@ -53,7 +54,7 @@ import {
53
54
  summarizePreferenceMemory,
54
55
  trialTraceFromMultiShotTrial,
55
56
  withAssignedFeedbackSplit
56
- } from "./chunk-HKYRWNHV.js";
57
+ } from "./chunk-USHQBPMH.js";
57
58
  import {
58
59
  RunRecordValidationError,
59
60
  isRunRecord,
@@ -64,31 +65,36 @@ import {
64
65
  import {
65
66
  assertReleaseConfidence,
66
67
  bootstrapCi,
68
+ evaluateInterimReleaseConfidence,
67
69
  evaluateReleaseConfidence,
68
- gainHistogram,
69
70
  judgeReplayGate,
70
- paretoChart,
71
+ pairedEvalueSequence,
71
72
  releaseTraceEvidenceFromMultiShotTrials,
72
73
  renderReleaseReport,
73
- summaryTable
74
- } from "./chunk-IKFVX537.js";
74
+ rubricPredictiveValidity
75
+ } from "./chunk-UAND2LOT.js";
75
76
  import {
77
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
76
78
  benjaminiHochberg,
77
79
  bhAdjust,
78
80
  bonferroni,
79
81
  cohensD,
80
82
  confidenceInterval,
83
+ gainHistogram,
81
84
  interRaterReliability,
82
85
  mannWhitneyU,
83
86
  normalizeScores,
84
87
  pairedBootstrap,
85
88
  pairedTTest,
86
89
  pairedWilcoxon,
90
+ paretoChart,
87
91
  partialCredit,
88
92
  requiredSampleSize,
93
+ researchReport,
94
+ summaryTable,
89
95
  weightedMean,
90
96
  wilcoxonSignedRank
91
- } from "./chunk-ODFINDLQ.js";
97
+ } from "./chunk-IOXMGMHQ.js";
92
98
  import {
93
99
  DEFAULT_REDACTION_RULES,
94
100
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -98,6 +104,8 @@ import {
98
104
  OTEL_AGENT_EVAL_SCOPE,
99
105
  OtlpFileTraceStore,
100
106
  REDACTION_VERSION,
107
+ ReplayCache,
108
+ ReplayCacheMissError,
101
109
  SpanNotFoundError,
102
110
  TRACE_ANALYST_ACTOR_DESCRIPTION,
103
111
  TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -112,6 +120,7 @@ import {
112
120
  buildTraceAnalystTools,
113
121
  buildTraceInsightContext,
114
122
  buildTraceInsightPrompt,
123
+ createReplayFetch,
115
124
  defaultTraceInsightPanel,
116
125
  describeTraceInsightScope,
117
126
  domainEvidencePattern,
@@ -123,6 +132,7 @@ import {
123
132
  isRetrievalSpan,
124
133
  isSandboxSpan,
125
134
  isToolSpan,
135
+ iterateRawCalls,
126
136
  judgeSpans,
127
137
  llmSpans,
128
138
  planTraceInsightQuestions,
@@ -133,20 +143,42 @@ import {
133
143
  scoreTraceInsightReadiness,
134
144
  tokenizeDomainWords,
135
145
  toolSpans,
136
- traceAnalystFunctionGroup
137
- } from "./chunk-KWUAAIHR.js";
146
+ traceAnalystFunctionGroup,
147
+ traceAnalystOnRunComplete
148
+ } from "./chunk-4W4NCYM2.js";
149
+ import {
150
+ RunIntegrityError,
151
+ assertRunCaptured,
152
+ throwIfRunIncomplete
153
+ } from "./chunk-QUKKGHTZ.js";
138
154
  import {
139
155
  TraceEmitter,
140
156
  llmSpanFromProvider
141
- } from "./chunk-PKCVBYTQ.js";
157
+ } from "./chunk-5IIQKMD5.js";
158
+ import {
159
+ canonicalize,
160
+ evaluateHypothesis,
161
+ hashJson,
162
+ signManifest,
163
+ verifyManifest
164
+ } from "./chunk-6M774GY6.js";
142
165
  import {
143
166
  LlmCallError,
144
167
  LlmClient,
168
+ LlmRouteAssertionError,
169
+ assertLlmRoute,
145
170
  callLlm,
146
171
  callLlmJson,
147
172
  probeLlm,
148
173
  stripFencedJson
149
- } from "./chunk-75MCTH7P.js";
174
+ } from "./chunk-KAO3Q65R.js";
175
+ import {
176
+ FileSystemRawProviderSink,
177
+ InMemoryRawProviderSink,
178
+ NoopRawProviderSink,
179
+ defaultProviderRedactor,
180
+ providerFromBaseUrl
181
+ } from "./chunk-SQQLHODJ.js";
150
182
  import "./chunk-PZ5AY32C.js";
151
183
 
152
184
  // src/client.ts
@@ -4847,7 +4879,7 @@ var Dataset = class _Dataset {
4847
4879
  * Write to disk for contamination-verifiable archives.
4848
4880
  */
4849
4881
  toJsonl() {
4850
- return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
4882
+ return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
4851
4883
  }
4852
4884
  static fromJsonl(jsonl, manifest) {
4853
4885
  const scenarios = [];
@@ -4860,18 +4892,18 @@ var Dataset = class _Dataset {
4860
4892
  }
4861
4893
  };
4862
4894
  async function hashScenarios(scenarios) {
4863
- const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
4895
+ const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
4864
4896
  const text = JSON.stringify(canonical);
4865
4897
  const bytes = new TextEncoder().encode(text);
4866
4898
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
4867
4899
  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
4868
4900
  }
4869
- function canonicalize(v) {
4901
+ function canonicalize2(v) {
4870
4902
  if (v === null || typeof v !== "object") return v;
4871
- if (Array.isArray(v)) return v.map(canonicalize);
4903
+ if (Array.isArray(v)) return v.map(canonicalize2);
4872
4904
  const keys = Object.keys(v).sort();
4873
4905
  const out = {};
4874
- for (const k of keys) out[k] = canonicalize(v[k]);
4906
+ for (const k of keys) out[k] = canonicalize2(v[k]);
4875
4907
  return out;
4876
4908
  }
4877
4909
  function seededShuffle(items, seed) {
@@ -6978,51 +7010,6 @@ function attributeStep(op, prmA, prmB) {
6978
7010
  };
6979
7011
  }
6980
7012
 
6981
- // src/pre-registration.ts
6982
- function canonicalize2(v) {
6983
- if (v === null || typeof v !== "object") return v;
6984
- if (Array.isArray(v)) return v.map(canonicalize2);
6985
- const keys = Object.keys(v).sort();
6986
- const out = {};
6987
- for (const k of keys) out[k] = canonicalize2(v[k]);
6988
- return out;
6989
- }
6990
- async function hashJson(obj) {
6991
- const canonical = canonicalize2(obj);
6992
- const bytes = new TextEncoder().encode(JSON.stringify(canonical));
6993
- const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
6994
- return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
6995
- }
6996
- async function signManifest(m) {
6997
- const hash = await hashJson(m);
6998
- return { ...m, contentHash: hash, algo: "sha256-content" };
6999
- }
7000
- async function verifyManifest(m) {
7001
- const { contentHash, algo: _algo, ...rest } = m;
7002
- void _algo;
7003
- const resigned = await signManifest(rest);
7004
- return resigned.contentHash === contentHash;
7005
- }
7006
- async function evaluateHypothesis(manifest, observed) {
7007
- if (!await verifyManifest(manifest)) {
7008
- throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
7009
- }
7010
- const reasons = [];
7011
- const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
7012
- if (!directionOk) reasons.push("wrong_direction");
7013
- if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
7014
- if (observed.pValue >= manifest.alpha) reasons.push("not_significant");
7015
- if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
7016
- return {
7017
- manifest,
7018
- observedN: observed.n,
7019
- observedEffect: observed.effect,
7020
- observedPValue: observed.pValue,
7021
- confirmed: reasons.length === 0,
7022
- rejectionReasons: reasons
7023
- };
7024
- }
7025
-
7026
7013
  // src/self-play.ts
7027
7014
  async function runSelfPlay(proposer, scorer, targets, options = {}) {
7028
7015
  if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
@@ -10481,6 +10468,7 @@ export {
10481
10468
  FileSystemExperimentStore,
10482
10469
  FileSystemFeedbackTrajectoryStore,
10483
10470
  FileSystemOutcomeStore,
10471
+ FileSystemRawProviderSink,
10484
10472
  FileSystemTraceStore,
10485
10473
  HeldOutGate,
10486
10474
  HoldoutAuditor,
@@ -10489,6 +10477,7 @@ export {
10489
10477
  InMemoryExperimentStore,
10490
10478
  InMemoryFeedbackTrajectoryStore,
10491
10479
  InMemoryOutcomeStore,
10480
+ InMemoryRawProviderSink,
10492
10481
  InMemoryTraceStore,
10493
10482
  InMemoryTrialCache,
10494
10483
  InMemoryWorkspaceInspector,
@@ -10497,12 +10486,14 @@ export {
10497
10486
  LineageRecorder,
10498
10487
  LlmCallError,
10499
10488
  LlmClient,
10489
+ LlmRouteAssertionError,
10500
10490
  LockedJsonlAppender,
10501
10491
  MODEL_PRICING,
10502
10492
  MetricsCollector,
10503
10493
  MultiLayerVerifier,
10504
10494
  MutationTelemetry,
10505
10495
  Mutex,
10496
+ NoopRawProviderSink,
10506
10497
  NoopResearcher,
10507
10498
  OTEL_AGENT_EVAL_SCOPE,
10508
10499
  OtlpFileTraceStore,
@@ -10512,7 +10503,11 @@ export {
10512
10503
  ProjectRegistry,
10513
10504
  PromptRegistry,
10514
10505
  REDACTION_VERSION,
10506
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
10507
+ ReplayCache,
10508
+ ReplayCacheMissError,
10515
10509
  RunCritic,
10510
+ RunIntegrityError,
10516
10511
  RunRecordValidationError,
10517
10512
  SEMANTIC_CONCEPT_JUDGE_VERSION,
10518
10513
  SandboxHarness,
@@ -10539,7 +10534,9 @@ export {
10539
10534
  analyzeSeries,
10540
10535
  analyzeTraces,
10541
10536
  argHash,
10537
+ assertLlmRoute,
10542
10538
  assertReleaseConfidence,
10539
+ assertRunCaptured,
10543
10540
  assignFeedbackSplit,
10544
10541
  attributeCounterfactuals,
10545
10542
  deterministicSplit as benchmarkDeterministicSplit,
@@ -10563,7 +10560,7 @@ export {
10563
10560
  callLlm,
10564
10561
  callLlmJson,
10565
10562
  canaryLeakView,
10566
- canonicalize2 as canonicalize,
10563
+ canonicalize,
10567
10564
  causalAttribution,
10568
10565
  checkBehavioralCanary,
10569
10566
  checkCanaries,
@@ -10597,6 +10594,7 @@ export {
10597
10594
  createFeedbackTrajectory,
10598
10595
  createIntentMatchJudge,
10599
10596
  createLlmReviewer,
10597
+ createReplayFetch,
10600
10598
  createSandboxCodeMutator,
10601
10599
  createSandboxPool,
10602
10600
  createSemanticConceptJudge,
@@ -10606,6 +10604,7 @@ export {
10606
10604
  decideReferenceReplayRunPromotion,
10607
10605
  defaultJudges,
10608
10606
  defaultMultiShotObjectives,
10607
+ defaultProviderRedactor,
10609
10608
  defaultReferenceReplayMatcher,
10610
10609
  defaultTraceInsightPanel,
10611
10610
  deployGateLayer,
@@ -10619,6 +10618,7 @@ export {
10619
10618
  evaluateActionPolicy,
10620
10619
  evaluateContract,
10621
10620
  evaluateHypothesis,
10621
+ evaluateInterimReleaseConfidence,
10622
10622
  evaluateOracles,
10623
10623
  evaluateReleaseConfidence,
10624
10624
  executeScenario,
@@ -10670,6 +10670,7 @@ export {
10670
10670
  isRunRecord,
10671
10671
  isSandboxSpan,
10672
10672
  isToolSpan,
10673
+ iterateRawCalls,
10673
10674
  jestTestParser,
10674
10675
  jsonHasKeys,
10675
10676
  jsonShape,
@@ -10698,6 +10699,7 @@ export {
10698
10699
  objectiveEval,
10699
10700
  outputLengthRubric,
10700
10701
  pairedBootstrap,
10702
+ pairedEvalueSequence,
10701
10703
  pairedTTest,
10702
10704
  pairedWilcoxon,
10703
10705
  paraphraseRobustness,
@@ -10720,6 +10722,7 @@ export {
10720
10722
  probeLlm,
10721
10723
  promptBisect,
10722
10724
  proposeSynthesisTargets,
10725
+ providerFromBaseUrl,
10723
10726
  pytestTestParser,
10724
10727
  redTeamDataset,
10725
10728
  redTeamReport,
@@ -10742,17 +10745,20 @@ export {
10742
10745
  replayScorerOverCorpus,
10743
10746
  replayTraceThroughJudge,
10744
10747
  requiredSampleSize,
10748
+ researchReport,
10745
10749
  resetLockedAppendersForTesting,
10746
10750
  resumeBuilderSession,
10747
10751
  roundTripRunRecord,
10748
10752
  rowCount,
10749
10753
  rowWhere,
10754
+ rubricPredictiveValidity,
10750
10755
  runAgentControlLoop,
10751
10756
  runAssertions,
10752
10757
  runBehavioralCanaries,
10753
10758
  runCanaries,
10754
10759
  runCounterfactual,
10755
10760
  runE2EWorkflow,
10761
+ runEvalCampaign,
10756
10762
  runExpectations,
10757
10763
  runFailureClass,
10758
10764
  runHarnessExperiment,
@@ -10799,6 +10805,7 @@ export {
10799
10805
  summaryTable,
10800
10806
  testJudge,
10801
10807
  textInSnapshot,
10808
+ throwIfRunIncomplete,
10802
10809
  toLangfuseEnvelope,
10803
10810
  toNdjson,
10804
10811
  toPrometheusText,
@@ -10810,6 +10817,7 @@ export {
10810
10817
  toolSuccessRubric,
10811
10818
  toolWasteView,
10812
10819
  traceAnalystFunctionGroup,
10820
+ traceAnalystOnRunComplete,
10813
10821
  trialTraceFromMultiShotTrial,
10814
10822
  typoMutator,
10815
10823
  urlContains,