@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. package/CHANGELOG.md +80 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -1,304 +0,0 @@
1
- import { R as RunRecord } from './run-record-DNiOMBrZ.js';
2
-
3
- /**
4
- * OutcomeStore — deployment outcomes attached to Run IDs.
5
- *
6
- * Outcomes arrive asynchronously from production telemetry after the
7
- * eval run completed: user ratings, retention flags, conversion events,
8
- * revenue, support-ticket rate, anything a product team can measure.
9
- * The store is a peer to TraceStore — separate lifecycle, same runId
10
- * foreign key.
11
- *
12
- * The whole point of this module is to make the meta-eval correlation
13
- * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
14
- */
15
- interface DeploymentOutcome {
16
- runId: string;
17
- capturedAt: number;
18
- /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
19
- metrics: Record<string, number>;
20
- /** Dimensions for stratified analysis — cohort, region, user_segment. */
21
- labels?: Record<string, string>;
22
- /** Free-form provenance (source system, pipeline version). */
23
- source?: string;
24
- }
25
- interface OutcomeFilter {
26
- runIds?: string[];
27
- since?: number;
28
- until?: number;
29
- label?: {
30
- key: string;
31
- value: string;
32
- };
33
- source?: string;
34
- }
35
- interface OutcomeStore {
36
- append(outcome: DeploymentOutcome): Promise<void>;
37
- /** All outcomes attached to this run (a single run can have many — multiple
38
- * capture windows over deployment time). */
39
- forRun(runId: string): Promise<DeploymentOutcome[]>;
40
- list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
41
- }
42
- declare class InMemoryOutcomeStore implements OutcomeStore {
43
- private items;
44
- append(outcome: DeploymentOutcome): Promise<void>;
45
- forRun(runId: string): Promise<DeploymentOutcome[]>;
46
- list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
47
- }
48
- interface FileSystemOutcomeStoreOptions {
49
- dir: string;
50
- maxBytes?: number;
51
- }
52
- declare class FileSystemOutcomeStore implements OutcomeStore {
53
- private dir;
54
- private maxBytes;
55
- private memo?;
56
- private loaded;
57
- constructor(options: FileSystemOutcomeStoreOptions);
58
- private ensureDir;
59
- append(outcome: DeploymentOutcome): Promise<void>;
60
- private load;
61
- forRun(runId: string): Promise<DeploymentOutcome[]>;
62
- list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
63
- }
64
-
65
- /**
66
- * Rubric predictive validity — does our eval rubric predict deployment
67
- * outcomes?
68
- *
69
- * `correlationStudy` (already in this package) joins a `TraceStore` to an
70
- * `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each
71
- * (eval-metric, outcome-metric) pair. That answers "does X correlate with
72
- * Y at all." `rubricPredictiveValidity` is the campaign-shaped wrapper
73
- * around it: take a sequence of `RunRecord`s (the canonical campaign
74
- * artifact) and an `OutcomeStore`, join on `runId`, return a
75
- * ranked verdict on every rubric whose dimension scores were captured in
76
- * `outcome.raw`.
77
- *
78
- * The point — quoting the methodology doc — is that **without this loop
79
- * every rubric is faith-based**. Once it's wired, you know which rubrics
80
- * have earned their promotion power and which ones are decoration.
81
- *
82
- * const validity = await rubricPredictiveValidity({
83
- * runs: lastQuarter,
84
- * outcomes: shipFlagOutcomeStore,
85
- * outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
86
- * rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],
87
- * })
88
- * for (const r of validity.ranked) {
89
- * console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)
90
- * }
91
- *
92
- * The function is intentionally read-only. Use the verdict to deprecate
93
- * decorative rubrics, re-weight composite scores, or trigger a
94
- * recalibration sweep when predictive validity drops below a threshold.
95
- */
96
-
97
- interface RubricPredictiveValidityInput {
98
- /**
99
- * Canonical campaign output. Each record's `outcome.raw[<rubricId>]`
100
- * provides the eval score; missing keys are silently skipped per pair.
101
- */
102
- runs: RunRecord[];
103
- outcomes: OutcomeStore;
104
- /**
105
- * Outcome metric names to evaluate against. Each must appear in at
106
- * least one `DeploymentOutcome.metrics` keyspace; pairs with too few
107
- * joined samples are excluded from the result.
108
- */
109
- outcomeMetrics: string[];
110
- /**
111
- * Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.
112
- * If omitted, every numeric key in `outcome.raw` across the run set is
113
- * treated as a rubric.
114
- */
115
- rubrics?: string[];
116
- /** Minimum joined-sample count before a pair is reported. Default 8. */
117
- minSamples?: number;
118
- /** Bootstrap resamples for CI. Default 500. */
119
- bootstrapResamples?: number;
120
- /** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */
121
- seed?: number;
122
- /**
123
- * Reduction when multiple outcomes attach to one runId. Default `'latest'`
124
- * (most recently captured).
125
- */
126
- reduction?: 'latest' | 'mean' | 'max';
127
- }
128
- interface RubricOutcomePair {
129
- rubric: string;
130
- outcome: string;
131
- n: number;
132
- pearson: number;
133
- spearman: number;
134
- ci95: {
135
- low: number;
136
- high: number;
137
- };
138
- /**
139
- * Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,
140
- * `decorative` < 0.4 in absolute correlation. A negative correlation
141
- * with a desired outcome is also `decorative` — actively misleading
142
- * is worse than uninformative.
143
- */
144
- verdict: 'load_bearing' | 'informative' | 'decorative';
145
- }
146
- interface RubricRanking {
147
- rubric: string;
148
- /** Outcome metric this rubric correlated best with. */
149
- bestOutcome: string;
150
- spearman: number;
151
- pearson: number;
152
- n: number;
153
- verdict: RubricOutcomePair['verdict'];
154
- }
155
- interface RubricPredictiveValidityReport {
156
- pairs: RubricOutcomePair[];
157
- /** Per-rubric best pair, sorted descending by |spearman|. */
158
- ranked: RubricRanking[];
159
- joinedSamples: number;
160
- skippedRuns: number;
161
- /** Rubrics that were declared but never produced a usable score. */
162
- rubricsWithoutData: string[];
163
- }
164
- declare function rubricPredictiveValidity(input: RubricPredictiveValidityInput): Promise<RubricPredictiveValidityReport>;
165
-
166
- /**
167
- * Always-valid sequential evaluation.
168
- *
169
- * `researchReport` (0.21+) assumes a single pre-specified analysis. Real
170
- * consumers run campaigns weekly / nightly / per-PR; each new run silently
171
- * inflates the false-discovery rate, because the BH-FDR guarantee was for
172
- * the *first* look, not the 47th. Without time-uniform inference,
173
- * launch-decision teams either (a) don't peek, which forfeits the cost
174
- * advantage of stop-when-decisive, or (b) peek and pretend they didn't,
175
- * which forfeits scientific validity.
176
- *
177
- * This module ships **e-value-based confidence sequences** for paired
178
- * bounded outcomes. The methodology is the predictable plug-in betting
179
- * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
180
- * stopping time. Concretely:
181
- *
182
- * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
183
- * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
184
- * plug-in), and the running e-value is
185
- *
186
- * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
187
- *
188
- * E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
189
- * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
190
- * at any time without inflating the type-I error.
191
- *
192
- * Combined with `runEvalCampaign`, every consumer running rolling
193
- * campaigns gains the ability to ship the moment evidence is decisive,
194
- * stop-early on dead-on-arrival variants, and accumulate evidence across
195
- * partial runs without spending the FDR budget. No new sweep is wasted.
196
- *
197
- * References:
198
- * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
199
- * Time-uniform, nonparametric, nonasymptotic confidence sequences.
200
- * Annals of Statistics, 49(2), 1055–1080.
201
- * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
202
- * random variables by betting. JRSS B, 86(1), 1–27.
203
- */
204
- type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
205
- interface PairedEvalueOptions {
206
- /**
207
- * Bound on |delta|. Default 1 (matching most score scales). Must satisfy
208
- * c > 0; deltas outside [-c, c] are clipped with a warning attached to
209
- * the return value.
210
- */
211
- bound?: number;
212
- /** Target Type-I error. Default 0.05. */
213
- alpha?: number;
214
- /**
215
- * Region of Practical Equivalence on the *mean* paired delta. When
216
- * supplied, the verdict can return `'equivalent'` once the running
217
- * confidence sequence on the mean is fully contained in [low, high].
218
- */
219
- rope?: {
220
- low: number;
221
- high: number;
222
- };
223
- /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
224
- initialBetShrinkage?: number;
225
- }
226
- interface PairedEvalueStep {
227
- /** 1-indexed observation count. */
228
- t: number;
229
- delta: number;
230
- /** Running e-value E_t = ∏ (1 + λ_i · D_i). */
231
- evalue: number;
232
- /** Time-uniform p-value at stopping time t. */
233
- pValue: number;
234
- /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
235
- csLow: number;
236
- csHigh: number;
237
- /** Verdict at this stopping time. */
238
- decision: SequentialDecision;
239
- }
240
- interface PairedEvalueSequence {
241
- steps: PairedEvalueStep[];
242
- /** The decision at the final step. */
243
- finalDecision: SequentialDecision;
244
- /** Index (1-based) at which a non-`continue` decision first fired, or null. */
245
- decisionFiredAt: number | null;
246
- /** True if any deltas were clipped to [-bound, bound]. */
247
- clipped: boolean;
248
- }
249
- /**
250
- * Run the paired e-value sequence over an in-order delta stream.
251
- *
252
- * Use for *streaming* / interim analyses: pass the deltas you have so
253
- * far, get the verdict at every prefix length. The decision is
254
- * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
255
- * fires, the verdict at later steps remains decisive (the e-value is a
256
- * non-negative martingale; once it crosses the threshold, it's crossed).
257
- */
258
- declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
259
- interface InterimReleaseConfidenceInput {
260
- /**
261
- * One delta series per candidate (paired deltas vs comparator). Order
262
- * within a series is the order the campaigns were run.
263
- */
264
- deltaSeries: Array<{
265
- candidateId: string;
266
- deltas: number[];
267
- }>;
268
- alpha?: number;
269
- bound?: number;
270
- rope?: {
271
- low: number;
272
- high: number;
273
- };
274
- }
275
- interface InterimReleaseConfidence {
276
- candidates: Array<{
277
- candidateId: string;
278
- decision: SequentialDecision;
279
- decisionFiredAt: number | null;
280
- finalEvalue: number;
281
- finalPValue: number;
282
- pairs: number;
283
- csLow: number;
284
- csHigh: number;
285
- }>;
286
- /**
287
- * Campaign-level recommendation: pick the strongest 'promote_now', else
288
- * 'continue' if any candidate is still live, else 'reject_now' if every
289
- * candidate is dead, else 'equivalent'.
290
- */
291
- recommendation: {
292
- decision: SequentialDecision;
293
- candidateId: string | null;
294
- };
295
- }
296
- /**
297
- * Run interim sequential analyses across many candidates at once,
298
- * preserving the time-uniform α guarantee for each candidate's series and
299
- * synthesising a campaign-level recommendation. Designed to be called on
300
- * every campaign tick — the recommendation is anytime-valid.
301
- */
302
- declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
303
-
304
- export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeFilter as O, type PairedEvalueOptions as P, type RubricOutcomePair as R, type SequentialDecision as S, type OutcomeStore as a, type FileSystemOutcomeStoreOptions as b, type InterimReleaseConfidence as c, type InterimReleaseConfidenceInput as d, type PairedEvalueSequence as e, type PairedEvalueStep as f, type RubricPredictiveValidityInput as g, type RubricPredictiveValidityReport as h, type RubricRanking as i, evaluateInterimReleaseConfidence as j, pairedEvalueSequence as p, rubricPredictiveValidity as r };