@tangle-network/agent-eval 0.49.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/langchain.d.ts +1 -1
  3. package/dist/adapters/otel.d.ts +8 -2
  4. package/dist/campaign/index.d.ts +3 -3
  5. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  6. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  7. package/dist/chunk-EGIPWXHL.js.map +1 -0
  8. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  9. package/dist/chunk-FQK2CCIM.js.map +1 -0
  10. package/dist/chunk-MAZ26DC7.js +99 -0
  11. package/dist/chunk-MAZ26DC7.js.map +1 -0
  12. package/dist/chunk-SHTXZ4O2.js +113 -0
  13. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  14. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  15. package/dist/contract/index.d.ts +206 -9
  16. package/dist/contract/index.js +751 -3
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/governance/index.d.ts +1 -1
  19. package/dist/hosted/index.d.ts +8 -192
  20. package/dist/hosted/index.js +1 -1
  21. package/dist/index-BRxz6qov.d.ts +409 -0
  22. package/dist/index.d.ts +18 -462
  23. package/dist/index.js +14 -106
  24. package/dist/index.js.map +1 -1
  25. package/dist/meta-eval/index.d.ts +3 -3
  26. package/dist/openapi.json +1 -1
  27. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  28. package/dist/registry-8KAs18kY.d.ts +457 -0
  29. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  30. package/dist/reporting.d.ts +6 -4
  31. package/dist/reporting.js +6 -4
  32. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  33. package/dist/rl.d.ts +9 -8
  34. package/dist/rl.js +3 -2
  35. package/dist/rl.js.map +1 -1
  36. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  37. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  38. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  39. package/dist/store-CJbzDxZ2.d.ts +220 -0
  40. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  41. package/dist/traces.d.ts +3 -220
  42. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  43. package/dist/types-DhqpAi_z.d.ts +296 -0
  44. package/package.json +1 -1
  45. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  46. package/dist/chunk-OYI6RZJK.js.map +0 -1
  47. /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  48. /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord } from './run-record-BGY6bHRh.js';
2
- import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
2
+ import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
5
5
  * Rubric predictive validity — does our eval rubric predict deployment
@@ -1,4 +1,4 @@
1
- import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
1
+ import { S as Scenario, j as CampaignResult, n as GateResult, t as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, k as CampaignTraceWriter, M as MutableSurface, p as GenerationRecord } from './types-Dbj5gu8n.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
3
  import { R as RedTeamCase } from './red-team-30II1T4o.js';
4
4
  import { R as RunRecord } from './run-record-BGY6bHRh.js';
@@ -414,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
414
414
  }
415
415
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
416
416
 
417
- export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
417
+ export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type OpenAutoPrResult as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, type RunImprovementLoopResult as d, type RunOptimizationOptions as e, type RunOptimizationResult as f, composeGate as g, defaultProductionGate as h, evolutionaryDriver as i, fsCampaignStorage as j, gepaDriver as k, heldOutGate as l, inMemoryCampaignStorage as m, runEval as n, openAutoPr as o, runImprovementLoop as p, runOptimization as q, runCampaign as r, surfaceHash as s };
@@ -0,0 +1,139 @@
1
+ /**
2
+ * Always-valid sequential evaluation.
3
+ *
4
+ * `researchReport` assumes a single pre-specified analysis. Real
5
+ * consumers run campaigns weekly / nightly / per-PR; each new run silently
6
+ * inflates the false-discovery rate, because the BH-FDR guarantee is for
7
+ * the *first* look, not the 47th. Without time-uniform inference,
8
+ * launch-decision teams either (a) don't peek, which forfeits the cost
9
+ * advantage of stop-when-decisive, or (b) peek and pretend they didn't,
10
+ * which forfeits scientific validity.
11
+ *
12
+ * This module ships **e-value-based confidence sequences** for paired
13
+ * bounded outcomes. The methodology is the predictable plug-in betting
14
+ * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
15
+ * stopping time. Concretely:
16
+ *
17
+ * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
18
+ * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
19
+ * plug-in), and the running e-value is
20
+ *
21
+ * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
22
+ *
23
+ * E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
24
+ * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
25
+ * at any time without inflating the type-I error.
26
+ *
27
+ * Combined with `runEvalCampaign`, every consumer running rolling
28
+ * campaigns gains the ability to ship the moment evidence is decisive,
29
+ * stop-early on dead-on-arrival variants, and accumulate evidence across
30
+ * partial runs without spending the FDR budget. No new sweep is wasted.
31
+ *
32
+ * References:
33
+ * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
34
+ * Time-uniform, nonparametric, nonasymptotic confidence sequences.
35
+ * Annals of Statistics, 49(2), 1055–1080.
36
+ * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
37
+ * random variables by betting. JRSS B, 86(1), 1–27.
38
+ */
39
+ type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
40
+ interface PairedEvalueOptions {
41
+ /**
42
+ * Bound on |delta|. Default 1 (matching most score scales). Must satisfy
43
+ * bound > 0; deltas outside [-bound, bound] are clipped with a warning attached to
44
+ * the return value.
45
+ */
46
+ bound?: number;
47
+ /** Target Type-I error. Default 0.05. */
48
+ alpha?: number;
49
+ /**
50
+ * Region of Practical Equivalence on the *mean* paired delta. When
51
+ * supplied, the verdict can return `'equivalent'` once the running
52
+ * confidence sequence on the mean is fully contained in [low, high].
53
+ */
54
+ rope?: {
55
+ low: number;
56
+ high: number;
57
+ };
58
+ /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
59
+ initialBetShrinkage?: number;
60
+ }
61
+ interface PairedEvalueStep {
62
+ /** 1-indexed observation count. */
63
+ t: number;
64
+ delta: number;
65
+ /** Running e-value E_t = ∏ (1 + λ_i · D_i). */
66
+ evalue: number;
67
+ /** Time-uniform p-value at stopping time t. */
68
+ pValue: number;
69
+ /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
70
+ csLow: number;
71
+ csHigh: number;
72
+ /** Verdict at this stopping time. */
73
+ decision: SequentialDecision;
74
+ }
75
+ interface PairedEvalueSequence {
76
+ steps: PairedEvalueStep[];
77
+ /** The decision at the final step. */
78
+ finalDecision: SequentialDecision;
79
+ /** Index (1-based) at which a non-`continue` decision first fired, or null. */
80
+ decisionFiredAt: number | null;
81
+ /** True if any deltas were clipped to [-bound, bound]. */
82
+ clipped: boolean;
83
+ }
84
+ /**
85
+ * Run the paired e-value sequence over an in-order delta stream.
86
+ *
87
+ * Use for *streaming* / interim analyses: pass the deltas you have so
88
+ * far, get the verdict at every prefix length. The decision is
89
+ * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
90
+ * fires, the verdict at later steps remains decisive (the e-value is a
91
+ * non-negative martingale; once it crosses the threshold, it's crossed).
92
+ */
93
+ declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
94
+ interface InterimReleaseConfidenceInput {
95
+ /**
96
+ * One delta series per candidate (paired deltas vs comparator). Order
97
+ * within a series is the order the campaigns were run.
98
+ */
99
+ deltaSeries: Array<{
100
+ candidateId: string;
101
+ deltas: number[];
102
+ }>;
103
+ alpha?: number;
104
+ bound?: number;
105
+ rope?: {
106
+ low: number;
107
+ high: number;
108
+ };
109
+ }
110
+ interface InterimReleaseConfidence {
111
+ candidates: Array<{
112
+ candidateId: string;
113
+ decision: SequentialDecision;
114
+ decisionFiredAt: number | null;
115
+ finalEvalue: number;
116
+ finalPValue: number;
117
+ pairs: number;
118
+ csLow: number;
119
+ csHigh: number;
120
+ }>;
121
+ /**
122
+ * Campaign-level recommendation: pick the strongest 'promote_now', else
123
+ * 'continue' if any candidate is still live, else 'reject_now' if every
124
+ * candidate is dead, else 'equivalent'.
125
+ */
126
+ recommendation: {
127
+ decision: SequentialDecision;
128
+ candidateId: string | null;
129
+ };
130
+ }
131
+ /**
132
+ * Run interim sequential analyses across many candidates at once,
133
+ * preserving the time-uniform α guarantee for each candidate's series and
134
+ * synthesising a campaign-level recommendation. Designed to be called on
135
+ * every campaign tick — the recommendation is anytime-valid.
136
+ */
137
+ declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
138
+
139
+ export { type InterimReleaseConfidence as I, type PairedEvalueOptions as P, type SequentialDecision as S, type InterimReleaseConfidenceInput as a, type PairedEvalueSequence as b, type PairedEvalueStep as c, evaluateInterimReleaseConfidence as e, pairedEvalueSequence as p };
@@ -0,0 +1,220 @@
1
+ /**
2
+ * Shared types for the trace-analyst module.
3
+ *
4
+ * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
5
+ * line per span, OTLP-shaped. We do NOT depend on a specific tracing
6
+ * vendor at the type level. Adapter
7
+ * layers map upstream shapes onto this interface.
8
+ *
9
+ * Design constraint. Every read operation that can return arbitrary
10
+ * payload must carry a byte budget so the agent's tool result stays
11
+ * bounded regardless of input trace size. Oversized responses
12
+ * substitute a deterministic summary instead of bytes — see
13
+ * `ViewTraceOversized`.
14
+ */
15
+ /** OTLP span kind (subset we actually use). */
16
+ type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
17
+ type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
18
+ /** Subset of OTLP span fields the analyst exposes to the agent. The
19
+ * store's job is to project upstream's full span shape down to this
20
+ * view — the analyst never sees vendor extensions directly. */
21
+ interface TraceAnalystSpan {
22
+ trace_id: string;
23
+ span_id: string;
24
+ parent_span_id: string | null;
25
+ name: string;
26
+ kind: TraceAnalystSpanKind;
27
+ start_time: string;
28
+ end_time: string;
29
+ duration_ms: number;
30
+ status: TraceAnalystSpanStatus;
31
+ status_message?: string;
32
+ service_name: string | null;
33
+ agent_name: string | null;
34
+ model_name: string | null;
35
+ tool_name: string | null;
36
+ /** Raw JSON-serialisable attribute map. May contain large strings;
37
+ * callers must respect the per-attribute byte cap. */
38
+ attributes: Record<string, unknown>;
39
+ }
40
+ interface TraceAnalystTraceSummary {
41
+ trace_id: string;
42
+ service_name: string | null;
43
+ agent_name: string | null;
44
+ span_count: number;
45
+ has_errors: boolean;
46
+ start_time: string;
47
+ end_time: string;
48
+ duration_ms: number;
49
+ raw_jsonl_bytes: number;
50
+ models: string[];
51
+ tools: string[];
52
+ }
53
+ interface TraceAnalystFilters {
54
+ /** Restrict to traces that contain at least one error span. */
55
+ has_errors?: boolean;
56
+ /** Match if any span's `service.name` is in this list. */
57
+ service_names?: string[];
58
+ /** Match if any span's `agent.name` is in this list. */
59
+ agent_names?: string[];
60
+ /** Match if any LLM span's `llm.model_name` is in this list. */
61
+ model_names?: string[];
62
+ /** Match if any tool span's `tool.name` is in this list. */
63
+ tool_names?: string[];
64
+ /** ISO-8601 lower bound on the trace's earliest start time. */
65
+ start_time_after?: string;
66
+ /** ISO-8601 upper bound on the trace's earliest start time. */
67
+ start_time_before?: string;
68
+ /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
69
+ * expensive on large datasets. Use the indexed filters above first. */
70
+ regex_pattern?: string;
71
+ }
72
+ interface DatasetOverview {
73
+ total_traces: number;
74
+ raw_jsonl_bytes: number;
75
+ services: string[];
76
+ agents: string[];
77
+ models: string[];
78
+ tool_names: string[];
79
+ /** Up to 20 real trace ids the agent may pass to view/search tools. */
80
+ sample_trace_ids: string[];
81
+ errors: {
82
+ trace_count: number;
83
+ span_count: number;
84
+ };
85
+ time_range: {
86
+ earliest: string;
87
+ latest: string;
88
+ } | null;
89
+ }
90
+ interface QueryTracesPage {
91
+ traces: TraceAnalystTraceSummary[];
92
+ total: number;
93
+ has_more: boolean;
94
+ }
95
+ /** Full-trace view. When the response would exceed the per-call byte
96
+ * budget, `oversized` is populated INSTEAD of `spans` so the agent
97
+ * knows to switch to `searchTrace` / `viewSpans`. */
98
+ interface ViewTraceResult {
99
+ trace_id: string;
100
+ spans?: TraceAnalystSpan[];
101
+ oversized?: ViewTraceOversized;
102
+ }
103
+ interface ViewTraceOversized {
104
+ span_count: number;
105
+ /** Names with their counts, sorted desc. Capped at 20 entries. */
106
+ top_span_names: Array<[string, number]>;
107
+ /** Largest single span body (bytes after attribute-cap projection). */
108
+ span_response_bytes_max: number;
109
+ error_span_count: number;
110
+ }
111
+ interface ViewSpansResult {
112
+ trace_id: string;
113
+ spans: TraceAnalystSpan[];
114
+ /** Number of requested span ids that were not found in the trace. */
115
+ missing_span_ids: string[];
116
+ /** Number of attribute fields truncated to fit the per-attribute cap. */
117
+ truncated_attribute_count: number;
118
+ }
119
+ interface SpanMatchRecord {
120
+ trace_id: string;
121
+ span_id: string;
122
+ span_name: string;
123
+ span_kind: TraceAnalystSpanKind;
124
+ /** JSON pointer-style path to the matched value, e.g.
125
+ * `attributes."llm.input_messages"[2].content`. */
126
+ attribute_path: string;
127
+ matched_text: string;
128
+ context_before: string;
129
+ context_after: string;
130
+ match_offset: number;
131
+ }
132
+ interface SearchTraceResult {
133
+ trace_id: string;
134
+ hits: SpanMatchRecord[];
135
+ total_matches: number;
136
+ has_more: boolean;
137
+ }
138
+ interface SearchSpanResult {
139
+ trace_id: string;
140
+ span_id: string;
141
+ hits: SpanMatchRecord[];
142
+ total_matches: number;
143
+ has_more: boolean;
144
+ }
145
+ /** Tunable byte budgets for bounded RLM tool output. */
146
+ interface TraceAnalystByteBudgets {
147
+ /** Max bytes any single tool response may emit. Hard ceiling enforced
148
+ * by the store; oversized → summary. Default 150_000. */
149
+ perCallByteCeiling: number;
150
+ /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
151
+ * Default 4096. */
152
+ perAttributeViewBudget: number;
153
+ /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
154
+ * Default 16384. */
155
+ perAttributeSpanBudget: number;
156
+ /** Per-attribute cap on a single match record's `matched_text` and
157
+ * context window. Default 1024. */
158
+ perMatchTextBudget: number;
159
+ }
160
+ declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
161
+ /** Marker substituted in place of truncated string payloads. Callers
162
+ * parsing tool output can detect it deterministically. */
163
+ declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
164
+
165
+ /**
166
+ * `TraceAnalysisStore` — read-side interface the trace-analyst calls
167
+ * through. Six operations, all bounded:
168
+ *
169
+ * - `getOverview(filters?)` — dataset rollup + sample trace ids.
170
+ * - `queryTraces(filters?, limit, offset)` — paginated summaries.
171
+ * - `countTraces(filters?)` — cheap count without materialisation.
172
+ * - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
173
+ * - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
174
+ * - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
175
+ * - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
176
+ *
177
+ * Multiple implementations ship in the core (`OtlpFileTraceStore`).
178
+ * Downstream callers can supply their own — e.g. a DuckDB-backed
179
+ * adapter or an in-memory adapter for tests — by implementing this
180
+ * interface.
181
+ *
182
+ * Filters compose with AND semantics. Empty/undefined fields impose
183
+ * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
184
+ * implementations may skip it via `count`/`overview` when not set.
185
+ */
186
+
187
+ interface TraceAnalysisStore {
188
+ getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
189
+ queryTraces(opts: {
190
+ filters?: TraceAnalystFilters;
191
+ limit: number;
192
+ offset?: number;
193
+ }): Promise<QueryTracesPage>;
194
+ countTraces(filters?: TraceAnalystFilters): Promise<number>;
195
+ viewTrace(opts: {
196
+ trace_id: string;
197
+ /** Override per-attribute byte cap. Defaults to discovery budget. */
198
+ per_attribute_byte_cap?: number;
199
+ }): Promise<ViewTraceResult>;
200
+ viewSpans(opts: {
201
+ trace_id: string;
202
+ span_ids: readonly string[];
203
+ /** Override per-attribute byte cap. Defaults to surgical budget. */
204
+ per_attribute_byte_cap?: number;
205
+ }): Promise<ViewSpansResult>;
206
+ searchTrace(opts: {
207
+ trace_id: string;
208
+ regex_pattern: string;
209
+ /** Hard cap on matches returned. Default 50. */
210
+ max_matches?: number;
211
+ }): Promise<SearchTraceResult>;
212
+ searchSpan(opts: {
213
+ trace_id: string;
214
+ span_id: string;
215
+ regex_pattern: string;
216
+ max_matches?: number;
217
+ }): Promise<SearchSpanResult>;
218
+ }
219
+
220
+ export { DEFAULT_TRACE_ANALYST_BUDGETS as D, type QueryTracesPage as Q, type SearchSpanResult as S, type TraceAnalysisStore as T, type ViewSpansResult as V, type DatasetOverview as a, type SearchTraceResult as b, type SpanMatchRecord as c, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as d, type TraceAnalystByteBudgets as e, type TraceAnalystFilters as f, type TraceAnalystSpan as g, type TraceAnalystSpanKind as h, type TraceAnalystSpanStatus as i, type TraceAnalystTraceSummary as j, type ViewTraceOversized as k, type ViewTraceResult as l };
@@ -442,142 +442,4 @@ interface ResearchReport {
442
442
  */
443
443
  declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
444
444
 
445
- /**
446
- * Always-valid sequential evaluation.
447
- *
448
- * `researchReport` assumes a single pre-specified analysis. Real
449
- * consumers run campaigns weekly / nightly / per-PR; each new run silently
450
- * inflates the false-discovery rate, because the BH-FDR guarantee is for
451
- * the *first* look, not the 47th. Without time-uniform inference,
452
- * launch-decision teams either (a) don't peek, which forfeits the cost
453
- * advantage of stop-when-decisive, or (b) peek and pretend they didn't,
454
- * which forfeits scientific validity.
455
- *
456
- * This module ships **e-value-based confidence sequences** for paired
457
- * bounded outcomes. The methodology is the predictable plug-in betting
458
- * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
459
- * stopping time. Concretely:
460
- *
461
- * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
462
- * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
463
- * plug-in), and the running e-value is
464
- *
465
- * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
466
- *
467
- * E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by
468
- * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
469
- * at any time without inflating the type-I error.
470
- *
471
- * Combined with `runEvalCampaign`, every consumer running rolling
472
- * campaigns gains the ability to ship the moment evidence is decisive,
473
- * stop-early on dead-on-arrival variants, and accumulate evidence across
474
- * partial runs without spending the FDR budget. No new sweep is wasted.
475
- *
476
- * References:
477
- * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
478
- * Time-uniform, nonparametric, nonasymptotic confidence sequences.
479
- * Annals of Statistics, 49(2), 1055–1080.
480
- * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
481
- * random variables by betting. JRSS B, 86(1), 1–27.
482
- */
483
- type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
484
- interface PairedEvalueOptions {
485
- /**
486
- * Bound on |delta|. Default 1 (matching most score scales). Must satisfy
487
- * c > 0; deltas outside [-c, c] are clipped with a warning attached to
488
- * the return value.
489
- */
490
- bound?: number;
491
- /** Target Type-I error. Default 0.05. */
492
- alpha?: number;
493
- /**
494
- * Region of Practical Equivalence on the *mean* paired delta. When
495
- * supplied, the verdict can return `'equivalent'` once the running
496
- * confidence sequence on the mean is fully contained in [low, high].
497
- */
498
- rope?: {
499
- low: number;
500
- high: number;
501
- };
502
- /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
503
- initialBetShrinkage?: number;
504
- }
505
- interface PairedEvalueStep {
506
- /** 1-indexed observation count. */
507
- t: number;
508
- delta: number;
509
- /** Running e-value E_t = ∏ (1 + λ_i · D_i). */
510
- evalue: number;
511
- /** Time-uniform p-value at stopping time t. */
512
- pValue: number;
513
- /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
514
- csLow: number;
515
- csHigh: number;
516
- /** Verdict at this stopping time. */
517
- decision: SequentialDecision;
518
- }
519
- interface PairedEvalueSequence {
520
- steps: PairedEvalueStep[];
521
- /** The decision at the final step. */
522
- finalDecision: SequentialDecision;
523
- /** Index (1-based) at which a non-`continue` decision first fired, or null. */
524
- decisionFiredAt: number | null;
525
- /** True if any deltas were clipped to [-bound, bound]. */
526
- clipped: boolean;
527
- }
528
- /**
529
- * Run the paired e-value sequence over an in-order delta stream.
530
- *
531
- * Use for *streaming* / interim analyses: pass the deltas you have so
532
- * far, get the verdict at every prefix length. The decision is
533
- * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
534
- * fires, the verdict at later steps remains decisive (the e-value is a
535
- * non-negative martingale; once it crosses the threshold, it's crossed).
536
- */
537
- declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
538
- interface InterimReleaseConfidenceInput {
539
- /**
540
- * One delta series per candidate (paired deltas vs comparator). Order
541
- * within a series is the order the campaigns were run.
542
- */
543
- deltaSeries: Array<{
544
- candidateId: string;
545
- deltas: number[];
546
- }>;
547
- alpha?: number;
548
- bound?: number;
549
- rope?: {
550
- low: number;
551
- high: number;
552
- };
553
- }
554
- interface InterimReleaseConfidence {
555
- candidates: Array<{
556
- candidateId: string;
557
- decision: SequentialDecision;
558
- decisionFiredAt: number | null;
559
- finalEvalue: number;
560
- finalPValue: number;
561
- pairs: number;
562
- csLow: number;
563
- csHigh: number;
564
- }>;
565
- /**
566
- * Campaign-level recommendation: pick the strongest 'promote_now', else
567
- * 'continue' if any candidate is still live, else 'reject_now' if every
568
- * candidate is dead, else 'equivalent'.
569
- */
570
- recommendation: {
571
- decision: SequentialDecision;
572
- candidateId: string | null;
573
- };
574
- }
575
- /**
576
- * Run interim sequential analyses across many candidates at once,
577
- * preserving the time-uniform α guarantee for each candidate's series and
578
- * synthesising a campaign-level recommendation. Designed to be called on
579
- * every campaign tick — the recommendation is anytime-valid.
580
- */
581
- declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
582
-
583
- export { type GainDistributionBin as G, HeldOutGate as H, type InterimReleaseConfidence as I, type PairedEvalueOptions as P, RESEARCH_REPORT_HARD_PAIR_FLOOR as R, type SequentialDecision as S, type GainDistributionFigureSpec as a, type GainDistributionOptions as b, type InterimReleaseConfidenceInput as c, type PairedEvalueSequence as d, type PairedEvalueStep as e, type ParetoFigureSpec as f, type ParetoPoint as g, type ResearchReport as h, type ResearchReportCandidate as i, type ResearchReportDecision as j, type ResearchReportMethodology as k, type ResearchReportOptions as l, type ResearchReportRecommendation as m, type SummaryTable as n, type SummaryTableOptions as o, type SummaryTableRow as p, evaluateInterimReleaseConfidence as q, gainHistogram as r, pairedEvalueSequence as s, paretoChart as t, researchReport as u, summaryTable as v, type GateDecision as w, type GateEvidence as x, type HeldOutGateConfig as y, type HeldOutGateRejectionCode as z };
445
+ export { type GainDistributionBin as G, HeldOutGate as H, type ParetoFigureSpec as P, RESEARCH_REPORT_HARD_PAIR_FLOOR as R, type SummaryTable as S, type GainDistributionFigureSpec as a, type GainDistributionOptions as b, type ParetoPoint as c, type ResearchReport as d, type ResearchReportCandidate as e, type ResearchReportDecision as f, type ResearchReportMethodology as g, type ResearchReportOptions as h, type ResearchReportRecommendation as i, type SummaryTableOptions as j, type SummaryTableRow as k, gainHistogram as l, type GateDecision as m, type GateEvidence as n, type HeldOutGateConfig as o, paretoChart as p, type HeldOutGateRejectionCode as q, researchReport as r, summaryTable as s };