@tangle-network/agent-eval 0.49.0 → 0.50.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +135 -0
- package/README.md +235 -331
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +8 -2
- package/dist/campaign/index.d.ts +3 -3
- package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
- package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
- package/dist/chunk-EGIPWXHL.js.map +1 -0
- package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
- package/dist/chunk-FQK2CCIM.js.map +1 -0
- package/dist/chunk-MAZ26DC7.js +99 -0
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
- package/dist/contract/index.d.ts +206 -9
- package/dist/contract/index.js +751 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +8 -192
- package/dist/hosted/index.js +1 -1
- package/dist/index-BRxz6qov.d.ts +409 -0
- package/dist/index.d.ts +18 -462
- package/dist/index.js +14 -106
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
- package/dist/registry-8KAs18kY.d.ts +457 -0
- package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +6 -4
- package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
- package/dist/rl.d.ts +9 -8
- package/dist/rl.js +3 -2
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
- package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
- package/dist/sequential-5iSVfzl2.d.ts +139 -0
- package/dist/store-CJbzDxZ2.d.ts +220 -0
- package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
- package/dist/traces.d.ts +3 -220
- package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
- package/dist/types-DhqpAi_z.d.ts +296 -0
- package/docs/concepts.md +20 -0
- package/docs/customer-journeys.md +208 -0
- package/docs/insight-report.md +337 -0
- package/package.json +1 -1
- package/dist/chunk-MNL6LXGQ.js.map +0 -1
- package/dist/chunk-OYI6RZJK.js.map +0 -1
- /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
- /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts}
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
|
|
2
|
+
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Rubric predictive validity — does our eval rubric predict deployment
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario,
|
|
1
|
+
import { S as Scenario, j as CampaignResult, n as GateResult, t as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, k as CampaignTraceWriter, M as MutableSurface, p as GenerationRecord } from './types-Dbj5gu8n.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
3
|
import { R as RedTeamCase } from './red-team-30II1T4o.js';
|
|
4
4
|
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
@@ -414,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
|
|
|
414
414
|
}
|
|
415
415
|
declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
|
|
416
416
|
|
|
417
|
-
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type
|
|
417
|
+
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type OpenAutoPrResult as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, type RunImprovementLoopResult as d, type RunOptimizationOptions as e, type RunOptimizationResult as f, composeGate as g, defaultProductionGate as h, evolutionaryDriver as i, fsCampaignStorage as j, gepaDriver as k, heldOutGate as l, inMemoryCampaignStorage as m, runEval as n, openAutoPr as o, runImprovementLoop as p, runOptimization as q, runCampaign as r, surfaceHash as s };
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Always-valid sequential evaluation.
|
|
3
|
+
*
|
|
4
|
+
* `researchReport` assumes a single pre-specified analysis. Real
|
|
5
|
+
* consumers run campaigns weekly / nightly / per-PR; each new run silently
|
|
6
|
+
* inflates the false-discovery rate, because the BH-FDR guarantee is for
|
|
7
|
+
* the *first* look, not the 47th. Without time-uniform inference,
|
|
8
|
+
* launch-decision teams either (a) don't peek, which forfeits the cost
|
|
9
|
+
* advantage of stop-when-decisive, or (b) peek and pretend they didn't,
|
|
10
|
+
* which forfeits scientific validity.
|
|
11
|
+
*
|
|
12
|
+
* This module ships **e-value-based confidence sequences** for paired
|
|
13
|
+
* bounded outcomes. The methodology is the predictable plug-in betting
|
|
14
|
+
* martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
|
|
15
|
+
* stopping time. Concretely:
|
|
16
|
+
*
|
|
17
|
+
* For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
|
|
18
|
+
* a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
|
|
19
|
+
* plug-in), and the running e-value is
|
|
20
|
+
*
|
|
21
|
+
* E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
|
|
22
|
+
*
|
|
23
|
+
* E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by
|
|
24
|
+
* Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
|
|
25
|
+
* at any time without inflating the type-I error.
|
|
26
|
+
*
|
|
27
|
+
* Combined with `runEvalCampaign`, every consumer running rolling
|
|
28
|
+
* campaigns gains the ability to ship the moment evidence is decisive,
|
|
29
|
+
* stop-early on dead-on-arrival variants, and accumulate evidence across
|
|
30
|
+
* partial runs without spending the FDR budget. No new sweep is wasted.
|
|
31
|
+
*
|
|
32
|
+
* References:
|
|
33
|
+
* - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
|
|
34
|
+
* Time-uniform, nonparametric, nonasymptotic confidence sequences.
|
|
35
|
+
* Annals of Statistics, 49(2), 1055–1080.
|
|
36
|
+
* - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
|
|
37
|
+
* random variables by betting. JRSS B, 86(1), 1–27.
|
|
38
|
+
*/
|
|
39
|
+
type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
|
|
40
|
+
interface PairedEvalueOptions {
|
|
41
|
+
/**
|
|
42
|
+
* Bound on |delta|. Default 1 (matching most score scales). Must satisfy
|
|
43
|
+
* c > 0; deltas outside [-c, c] are clipped with a warning attached to
|
|
44
|
+
* the return value.
|
|
45
|
+
*/
|
|
46
|
+
bound?: number;
|
|
47
|
+
/** Target Type-I error. Default 0.05. */
|
|
48
|
+
alpha?: number;
|
|
49
|
+
/**
|
|
50
|
+
* Region of Practical Equivalence on the *mean* paired delta. When
|
|
51
|
+
* supplied, the verdict can return `'equivalent'` once the running
|
|
52
|
+
* confidence sequence on the mean is fully contained in [low, high].
|
|
53
|
+
*/
|
|
54
|
+
rope?: {
|
|
55
|
+
low: number;
|
|
56
|
+
high: number;
|
|
57
|
+
};
|
|
58
|
+
/** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
|
|
59
|
+
initialBetShrinkage?: number;
|
|
60
|
+
}
|
|
61
|
+
interface PairedEvalueStep {
|
|
62
|
+
/** 1-indexed observation count. */
|
|
63
|
+
t: number;
|
|
64
|
+
delta: number;
|
|
65
|
+
/** Running e-value E_t = ∏ (1 + λ_i · D_i). */
|
|
66
|
+
evalue: number;
|
|
67
|
+
/** Time-uniform p-value at stopping time t. */
|
|
68
|
+
pValue: number;
|
|
69
|
+
/** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
|
|
70
|
+
csLow: number;
|
|
71
|
+
csHigh: number;
|
|
72
|
+
/** Verdict at this stopping time. */
|
|
73
|
+
decision: SequentialDecision;
|
|
74
|
+
}
|
|
75
|
+
interface PairedEvalueSequence {
|
|
76
|
+
steps: PairedEvalueStep[];
|
|
77
|
+
/** The decision at the final step. */
|
|
78
|
+
finalDecision: SequentialDecision;
|
|
79
|
+
/** Index (1-based) at which a non-`continue` decision first fired, or null. */
|
|
80
|
+
decisionFiredAt: number | null;
|
|
81
|
+
/** True if any deltas were clipped to [-bound, bound]. */
|
|
82
|
+
clipped: boolean;
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Run the paired e-value sequence over an in-order delta stream.
|
|
86
|
+
*
|
|
87
|
+
* Use for *streaming* / interim analyses: pass the deltas you have so
|
|
88
|
+
* far, get the verdict at every prefix length. The decision is
|
|
89
|
+
* monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
|
|
90
|
+
* fires, the verdict at later steps remains decisive (the e-value is a
|
|
91
|
+
* non-negative martingale; once it crosses the threshold, it's crossed).
|
|
92
|
+
*/
|
|
93
|
+
declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
|
|
94
|
+
interface InterimReleaseConfidenceInput {
|
|
95
|
+
/**
|
|
96
|
+
* One delta series per candidate (paired deltas vs comparator). Order
|
|
97
|
+
* within a series is the order the campaigns were run.
|
|
98
|
+
*/
|
|
99
|
+
deltaSeries: Array<{
|
|
100
|
+
candidateId: string;
|
|
101
|
+
deltas: number[];
|
|
102
|
+
}>;
|
|
103
|
+
alpha?: number;
|
|
104
|
+
bound?: number;
|
|
105
|
+
rope?: {
|
|
106
|
+
low: number;
|
|
107
|
+
high: number;
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
interface InterimReleaseConfidence {
|
|
111
|
+
candidates: Array<{
|
|
112
|
+
candidateId: string;
|
|
113
|
+
decision: SequentialDecision;
|
|
114
|
+
decisionFiredAt: number | null;
|
|
115
|
+
finalEvalue: number;
|
|
116
|
+
finalPValue: number;
|
|
117
|
+
pairs: number;
|
|
118
|
+
csLow: number;
|
|
119
|
+
csHigh: number;
|
|
120
|
+
}>;
|
|
121
|
+
/**
|
|
122
|
+
* Campaign-level recommendation: pick the strongest 'promote_now', else
|
|
123
|
+
* 'continue' if any candidate is still live, else 'reject_now' if every
|
|
124
|
+
* candidate is dead, else 'equivalent'.
|
|
125
|
+
*/
|
|
126
|
+
recommendation: {
|
|
127
|
+
decision: SequentialDecision;
|
|
128
|
+
candidateId: string | null;
|
|
129
|
+
};
|
|
130
|
+
}
|
|
131
|
+
/**
|
|
132
|
+
* Run interim sequential analyses across many candidates at once,
|
|
133
|
+
* preserving the time-uniform α guarantee for each candidate's series and
|
|
134
|
+
* synthesising a campaign-level recommendation. Designed to be called on
|
|
135
|
+
* every campaign tick — the recommendation is anytime-valid.
|
|
136
|
+
*/
|
|
137
|
+
declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
|
|
138
|
+
|
|
139
|
+
export { type InterimReleaseConfidence as I, type PairedEvalueOptions as P, type SequentialDecision as S, type InterimReleaseConfidenceInput as a, type PairedEvalueSequence as b, type PairedEvalueStep as c, evaluateInterimReleaseConfidence as e, pairedEvalueSequence as p };
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the trace-analyst module.
|
|
3
|
+
*
|
|
4
|
+
* Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
|
|
5
|
+
* line per span, OTLP-shaped. We do NOT depend on a specific tracing
|
|
6
|
+
* vendor at the type level. Adapter
|
|
7
|
+
* layers map upstream shapes onto this interface.
|
|
8
|
+
*
|
|
9
|
+
* Design constraint. Every read operation that can return arbitrary
|
|
10
|
+
* payload must carry a byte budget so the agent's tool result stays
|
|
11
|
+
* bounded regardless of input trace size. Oversized responses
|
|
12
|
+
* substitute a deterministic summary instead of bytes — see
|
|
13
|
+
* `ViewTraceOversized`.
|
|
14
|
+
*/
|
|
15
|
+
/** OTLP span kind (subset we actually use). */
|
|
16
|
+
type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
|
|
17
|
+
type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
|
|
18
|
+
/** Subset of OTLP span fields the analyst exposes to the agent. The
|
|
19
|
+
* store's job is to project upstream's full span shape down to this
|
|
20
|
+
* view — the analyst never sees vendor extensions directly. */
|
|
21
|
+
interface TraceAnalystSpan {
|
|
22
|
+
trace_id: string;
|
|
23
|
+
span_id: string;
|
|
24
|
+
parent_span_id: string | null;
|
|
25
|
+
name: string;
|
|
26
|
+
kind: TraceAnalystSpanKind;
|
|
27
|
+
start_time: string;
|
|
28
|
+
end_time: string;
|
|
29
|
+
duration_ms: number;
|
|
30
|
+
status: TraceAnalystSpanStatus;
|
|
31
|
+
status_message?: string;
|
|
32
|
+
service_name: string | null;
|
|
33
|
+
agent_name: string | null;
|
|
34
|
+
model_name: string | null;
|
|
35
|
+
tool_name: string | null;
|
|
36
|
+
/** Raw JSON-serialisable attribute map. May contain large strings;
|
|
37
|
+
* callers must respect the per-attribute byte cap. */
|
|
38
|
+
attributes: Record<string, unknown>;
|
|
39
|
+
}
|
|
40
|
+
interface TraceAnalystTraceSummary {
|
|
41
|
+
trace_id: string;
|
|
42
|
+
service_name: string | null;
|
|
43
|
+
agent_name: string | null;
|
|
44
|
+
span_count: number;
|
|
45
|
+
has_errors: boolean;
|
|
46
|
+
start_time: string;
|
|
47
|
+
end_time: string;
|
|
48
|
+
duration_ms: number;
|
|
49
|
+
raw_jsonl_bytes: number;
|
|
50
|
+
models: string[];
|
|
51
|
+
tools: string[];
|
|
52
|
+
}
|
|
53
|
+
interface TraceAnalystFilters {
|
|
54
|
+
/** Restrict to traces that contain at least one error span. */
|
|
55
|
+
has_errors?: boolean;
|
|
56
|
+
/** Match if any span's `service.name` is in this list. */
|
|
57
|
+
service_names?: string[];
|
|
58
|
+
/** Match if any span's `agent.name` is in this list. */
|
|
59
|
+
agent_names?: string[];
|
|
60
|
+
/** Match if any LLM span's `llm.model_name` is in this list. */
|
|
61
|
+
model_names?: string[];
|
|
62
|
+
/** Match if any tool span's `tool.name` is in this list. */
|
|
63
|
+
tool_names?: string[];
|
|
64
|
+
/** ISO-8601 lower bound on the trace's earliest start time. */
|
|
65
|
+
start_time_after?: string;
|
|
66
|
+
/** ISO-8601 upper bound on the trace's earliest start time. */
|
|
67
|
+
start_time_before?: string;
|
|
68
|
+
/** Single regex applied to raw JSONL bytes for the trace. Opt-in;
|
|
69
|
+
* expensive on large datasets. Use the indexed filters above first. */
|
|
70
|
+
regex_pattern?: string;
|
|
71
|
+
}
|
|
72
|
+
interface DatasetOverview {
|
|
73
|
+
total_traces: number;
|
|
74
|
+
raw_jsonl_bytes: number;
|
|
75
|
+
services: string[];
|
|
76
|
+
agents: string[];
|
|
77
|
+
models: string[];
|
|
78
|
+
tool_names: string[];
|
|
79
|
+
/** Up to 20 real trace ids the agent may pass to view/search tools. */
|
|
80
|
+
sample_trace_ids: string[];
|
|
81
|
+
errors: {
|
|
82
|
+
trace_count: number;
|
|
83
|
+
span_count: number;
|
|
84
|
+
};
|
|
85
|
+
time_range: {
|
|
86
|
+
earliest: string;
|
|
87
|
+
latest: string;
|
|
88
|
+
} | null;
|
|
89
|
+
}
|
|
90
|
+
interface QueryTracesPage {
|
|
91
|
+
traces: TraceAnalystTraceSummary[];
|
|
92
|
+
total: number;
|
|
93
|
+
has_more: boolean;
|
|
94
|
+
}
|
|
95
|
+
/** Full-trace view. When the response would exceed the per-call byte
|
|
96
|
+
* budget, `oversized` is populated INSTEAD of `spans` so the agent
|
|
97
|
+
* knows to switch to `searchTrace` / `viewSpans`. */
|
|
98
|
+
interface ViewTraceResult {
|
|
99
|
+
trace_id: string;
|
|
100
|
+
spans?: TraceAnalystSpan[];
|
|
101
|
+
oversized?: ViewTraceOversized;
|
|
102
|
+
}
|
|
103
|
+
interface ViewTraceOversized {
|
|
104
|
+
span_count: number;
|
|
105
|
+
/** Names with their counts, sorted desc. Capped at 20 entries. */
|
|
106
|
+
top_span_names: Array<[string, number]>;
|
|
107
|
+
/** Largest single span body (bytes after attribute-cap projection). */
|
|
108
|
+
span_response_bytes_max: number;
|
|
109
|
+
error_span_count: number;
|
|
110
|
+
}
|
|
111
|
+
interface ViewSpansResult {
|
|
112
|
+
trace_id: string;
|
|
113
|
+
spans: TraceAnalystSpan[];
|
|
114
|
+
/** Requested span ids that were not found in the trace. */
|
|
115
|
+
missing_span_ids: string[];
|
|
116
|
+
/** Number of attribute fields truncated to fit the per-attribute cap. */
|
|
117
|
+
truncated_attribute_count: number;
|
|
118
|
+
}
|
|
119
|
+
interface SpanMatchRecord {
|
|
120
|
+
trace_id: string;
|
|
121
|
+
span_id: string;
|
|
122
|
+
span_name: string;
|
|
123
|
+
span_kind: TraceAnalystSpanKind;
|
|
124
|
+
/** JSON pointer-style path to the matched value, e.g.
|
|
125
|
+
* `attributes."llm.input_messages"[2].content`. */
|
|
126
|
+
attribute_path: string;
|
|
127
|
+
matched_text: string;
|
|
128
|
+
context_before: string;
|
|
129
|
+
context_after: string;
|
|
130
|
+
match_offset: number;
|
|
131
|
+
}
|
|
132
|
+
interface SearchTraceResult {
|
|
133
|
+
trace_id: string;
|
|
134
|
+
hits: SpanMatchRecord[];
|
|
135
|
+
total_matches: number;
|
|
136
|
+
has_more: boolean;
|
|
137
|
+
}
|
|
138
|
+
interface SearchSpanResult {
|
|
139
|
+
trace_id: string;
|
|
140
|
+
span_id: string;
|
|
141
|
+
hits: SpanMatchRecord[];
|
|
142
|
+
total_matches: number;
|
|
143
|
+
has_more: boolean;
|
|
144
|
+
}
|
|
145
|
+
/** Tunable byte budgets for bounded RLM tool output. */
|
|
146
|
+
interface TraceAnalystByteBudgets {
|
|
147
|
+
/** Max bytes any single tool response may emit. Hard ceiling enforced
|
|
148
|
+
* by the store; oversized → summary. Default 150_000. */
|
|
149
|
+
perCallByteCeiling: number;
|
|
150
|
+
/** Per-attribute string truncation cap on `viewTrace` (discovery scan).
|
|
151
|
+
* Default 4096. */
|
|
152
|
+
perAttributeViewBudget: number;
|
|
153
|
+
/** Per-attribute string truncation cap on `viewSpans` (surgical reads).
|
|
154
|
+
* Default 16384. */
|
|
155
|
+
perAttributeSpanBudget: number;
|
|
156
|
+
/** Per-attribute cap on a single match record's `matched_text` and
|
|
157
|
+
* context window. Default 1024. */
|
|
158
|
+
perMatchTextBudget: number;
|
|
159
|
+
}
|
|
160
|
+
declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
|
|
161
|
+
/** Marker substituted in place of truncated string payloads. Callers
|
|
162
|
+
* parsing tool output can detect it deterministically. */
|
|
163
|
+
declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* `TraceAnalysisStore` — read-side interface the trace-analyst calls
|
|
167
|
+
* through. Seven operations, all bounded:
|
|
168
|
+
*
|
|
169
|
+
* - `getOverview(filters?)` — dataset rollup + sample trace ids.
|
|
170
|
+
* - `queryTraces(filters?, limit, offset)` — paginated summaries.
|
|
171
|
+
* - `countTraces(filters?)` — cheap count without materialisation.
|
|
172
|
+
* - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
|
|
173
|
+
* - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
|
|
174
|
+
* - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
|
|
175
|
+
* - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
|
|
176
|
+
*
|
|
177
|
+
* Multiple implementations ship in the core (`OtlpFileTraceStore`).
|
|
178
|
+
* Downstream callers can supply their own — e.g. a DuckDB-backed
|
|
179
|
+
* adapter or an in-memory adapter for tests — by implementing this
|
|
180
|
+
* interface.
|
|
181
|
+
*
|
|
182
|
+
* Filters compose with AND semantics. Empty/undefined fields impose
|
|
183
|
+
* no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
|
|
184
|
+
* implementations may skip the scan in `countTraces`/`getOverview` when it is not set.
|
|
185
|
+
*/
|
|
186
|
+
|
|
187
|
+
interface TraceAnalysisStore {
|
|
188
|
+
getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
|
|
189
|
+
queryTraces(opts: {
|
|
190
|
+
filters?: TraceAnalystFilters;
|
|
191
|
+
limit: number;
|
|
192
|
+
offset?: number;
|
|
193
|
+
}): Promise<QueryTracesPage>;
|
|
194
|
+
countTraces(filters?: TraceAnalystFilters): Promise<number>;
|
|
195
|
+
viewTrace(opts: {
|
|
196
|
+
trace_id: string;
|
|
197
|
+
/** Override per-attribute byte cap. Defaults to discovery budget. */
|
|
198
|
+
per_attribute_byte_cap?: number;
|
|
199
|
+
}): Promise<ViewTraceResult>;
|
|
200
|
+
viewSpans(opts: {
|
|
201
|
+
trace_id: string;
|
|
202
|
+
span_ids: readonly string[];
|
|
203
|
+
/** Override per-attribute byte cap. Defaults to surgical budget. */
|
|
204
|
+
per_attribute_byte_cap?: number;
|
|
205
|
+
}): Promise<ViewSpansResult>;
|
|
206
|
+
searchTrace(opts: {
|
|
207
|
+
trace_id: string;
|
|
208
|
+
regex_pattern: string;
|
|
209
|
+
/** Hard cap on matches returned. Default 50. */
|
|
210
|
+
max_matches?: number;
|
|
211
|
+
}): Promise<SearchTraceResult>;
|
|
212
|
+
searchSpan(opts: {
|
|
213
|
+
trace_id: string;
|
|
214
|
+
span_id: string;
|
|
215
|
+
regex_pattern: string;
|
|
216
|
+
max_matches?: number;
|
|
217
|
+
}): Promise<SearchSpanResult>;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
export { DEFAULT_TRACE_ANALYST_BUDGETS as D, type QueryTracesPage as Q, type SearchSpanResult as S, type TraceAnalysisStore as T, type ViewSpansResult as V, type DatasetOverview as a, type SearchTraceResult as b, type SpanMatchRecord as c, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as d, type TraceAnalystByteBudgets as e, type TraceAnalystFilters as f, type TraceAnalystSpan as g, type TraceAnalystSpanKind as h, type TraceAnalystSpanStatus as i, type TraceAnalystTraceSummary as j, type ViewTraceOversized as k, type ViewTraceResult as l };
|
|
@@ -442,142 +442,4 @@ interface ResearchReport {
|
|
|
442
442
|
*/
|
|
443
443
|
declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
|
|
444
444
|
|
|
445
|
-
|
|
446
|
-
* Always-valid sequential evaluation.
|
|
447
|
-
*
|
|
448
|
-
* `researchReport` assumes a single pre-specified analysis. Real
|
|
449
|
-
* consumers run campaigns weekly / nightly / per-PR; each new run silently
|
|
450
|
-
* inflates the false-discovery rate, because the BH-FDR guarantee is for
|
|
451
|
-
* the *first* look, not the 47th. Without time-uniform inference,
|
|
452
|
-
* launch-decision teams either (a) don't peek, which forfeits the cost
|
|
453
|
-
* advantage of stop-when-decisive, or (b) peek and pretend they didn't,
|
|
454
|
-
* which forfeits scientific validity.
|
|
455
|
-
*
|
|
456
|
-
* This module ships **e-value-based confidence sequences** for paired
|
|
457
|
-
* bounded outcomes. The methodology is the predictable plug-in betting
|
|
458
|
-
* martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
|
|
459
|
-
* stopping time. Concretely:
|
|
460
|
-
*
|
|
461
|
-
* For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
|
|
462
|
-
* a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
|
|
463
|
-
* plug-in), and the running e-value is
|
|
464
|
-
*
|
|
465
|
-
* E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
|
|
466
|
-
*
|
|
467
|
-
* E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by
|
|
468
|
-
* Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
|
|
469
|
-
* at any time without inflating the type-I error.
|
|
470
|
-
*
|
|
471
|
-
* Combined with `runEvalCampaign`, every consumer running rolling
|
|
472
|
-
* campaigns gains the ability to ship the moment evidence is decisive,
|
|
473
|
-
* stop-early on dead-on-arrival variants, and accumulate evidence across
|
|
474
|
-
* partial runs without spending the FDR budget. No new sweep is wasted.
|
|
475
|
-
*
|
|
476
|
-
* References:
|
|
477
|
-
* - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
|
|
478
|
-
* Time-uniform, nonparametric, nonasymptotic confidence sequences.
|
|
479
|
-
* Annals of Statistics, 49(2), 1055–1080.
|
|
480
|
-
* - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
|
|
481
|
-
* random variables by betting. JRSS B, 86(1), 1–27.
|
|
482
|
-
*/
|
|
483
|
-
type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
|
|
484
|
-
interface PairedEvalueOptions {
|
|
485
|
-
/**
|
|
486
|
-
* Bound on |delta|. Default 1 (matching most score scales). Must satisfy
|
|
487
|
-
* c > 0; deltas outside [-c, c] are clipped with a warning attached to
|
|
488
|
-
* the return value.
|
|
489
|
-
*/
|
|
490
|
-
bound?: number;
|
|
491
|
-
/** Target Type-I error. Default 0.05. */
|
|
492
|
-
alpha?: number;
|
|
493
|
-
/**
|
|
494
|
-
* Region of Practical Equivalence on the *mean* paired delta. When
|
|
495
|
-
* supplied, the verdict can return `'equivalent'` once the running
|
|
496
|
-
* confidence sequence on the mean is fully contained in [low, high].
|
|
497
|
-
*/
|
|
498
|
-
rope?: {
|
|
499
|
-
low: number;
|
|
500
|
-
high: number;
|
|
501
|
-
};
|
|
502
|
-
/** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
|
|
503
|
-
initialBetShrinkage?: number;
|
|
504
|
-
}
|
|
505
|
-
interface PairedEvalueStep {
|
|
506
|
-
/** 1-indexed observation count. */
|
|
507
|
-
t: number;
|
|
508
|
-
delta: number;
|
|
509
|
-
/** Running e-value E_t = ∏ (1 + λ_i · D_i). */
|
|
510
|
-
evalue: number;
|
|
511
|
-
/** Time-uniform p-value at stopping time t. */
|
|
512
|
-
pValue: number;
|
|
513
|
-
/** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
|
|
514
|
-
csLow: number;
|
|
515
|
-
csHigh: number;
|
|
516
|
-
/** Verdict at this stopping time. */
|
|
517
|
-
decision: SequentialDecision;
|
|
518
|
-
}
|
|
519
|
-
interface PairedEvalueSequence {
|
|
520
|
-
steps: PairedEvalueStep[];
|
|
521
|
-
/** The decision at the final step. */
|
|
522
|
-
finalDecision: SequentialDecision;
|
|
523
|
-
/** Index (1-based) at which a non-`continue` decision first fired, or null. */
|
|
524
|
-
decisionFiredAt: number | null;
|
|
525
|
-
/** True if any deltas were clipped to [-bound, bound]. */
|
|
526
|
-
clipped: boolean;
|
|
527
|
-
}
|
|
528
|
-
/**
|
|
529
|
-
* Run the paired e-value sequence over an in-order delta stream.
|
|
530
|
-
*
|
|
531
|
-
* Use for *streaming* / interim analyses: pass the deltas you have so
|
|
532
|
-
* far, get the verdict at every prefix length. The decision is
|
|
533
|
-
* monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
|
|
534
|
-
* fires, the verdict at later steps remains decisive (the e-value is a
|
|
535
|
-
* non-negative martingale; once it crosses the threshold, it's crossed).
|
|
536
|
-
*/
|
|
537
|
-
declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
|
|
538
|
-
interface InterimReleaseConfidenceInput {
|
|
539
|
-
/**
|
|
540
|
-
* One delta series per candidate (paired deltas vs comparator). Order
|
|
541
|
-
* within a series is the order the campaigns were run.
|
|
542
|
-
*/
|
|
543
|
-
deltaSeries: Array<{
|
|
544
|
-
candidateId: string;
|
|
545
|
-
deltas: number[];
|
|
546
|
-
}>;
|
|
547
|
-
alpha?: number;
|
|
548
|
-
bound?: number;
|
|
549
|
-
rope?: {
|
|
550
|
-
low: number;
|
|
551
|
-
high: number;
|
|
552
|
-
};
|
|
553
|
-
}
|
|
554
|
-
interface InterimReleaseConfidence {
|
|
555
|
-
candidates: Array<{
|
|
556
|
-
candidateId: string;
|
|
557
|
-
decision: SequentialDecision;
|
|
558
|
-
decisionFiredAt: number | null;
|
|
559
|
-
finalEvalue: number;
|
|
560
|
-
finalPValue: number;
|
|
561
|
-
pairs: number;
|
|
562
|
-
csLow: number;
|
|
563
|
-
csHigh: number;
|
|
564
|
-
}>;
|
|
565
|
-
/**
|
|
566
|
-
* Campaign-level recommendation: pick the strongest 'promote_now', else
|
|
567
|
-
* 'continue' if any candidate is still live, else 'reject_now' if every
|
|
568
|
-
* candidate is dead, else 'equivalent'.
|
|
569
|
-
*/
|
|
570
|
-
recommendation: {
|
|
571
|
-
decision: SequentialDecision;
|
|
572
|
-
candidateId: string | null;
|
|
573
|
-
};
|
|
574
|
-
}
|
|
575
|
-
/**
|
|
576
|
-
* Run interim sequential analyses across many candidates at once,
|
|
577
|
-
* preserving the time-uniform α guarantee for each candidate's series and
|
|
578
|
-
* synthesising a campaign-level recommendation. Designed to be called on
|
|
579
|
-
* every campaign tick — the recommendation is anytime-valid.
|
|
580
|
-
*/
|
|
581
|
-
declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
|
|
582
|
-
|
|
583
|
-
export { type GainDistributionBin as G, HeldOutGate as H, type InterimReleaseConfidence as I, type PairedEvalueOptions as P, RESEARCH_REPORT_HARD_PAIR_FLOOR as R, type SequentialDecision as S, type GainDistributionFigureSpec as a, type GainDistributionOptions as b, type InterimReleaseConfidenceInput as c, type PairedEvalueSequence as d, type PairedEvalueStep as e, type ParetoFigureSpec as f, type ParetoPoint as g, type ResearchReport as h, type ResearchReportCandidate as i, type ResearchReportDecision as j, type ResearchReportMethodology as k, type ResearchReportOptions as l, type ResearchReportRecommendation as m, type SummaryTable as n, type SummaryTableOptions as o, type SummaryTableRow as p, evaluateInterimReleaseConfidence as q, gainHistogram as r, pairedEvalueSequence as s, paretoChart as t, researchReport as u, summaryTable as v, type GateDecision as w, type GateEvidence as x, type HeldOutGateConfig as y, type HeldOutGateRejectionCode as z };
|
|
445
|
+
export { type GainDistributionBin as G, HeldOutGate as H, type ParetoFigureSpec as P, RESEARCH_REPORT_HARD_PAIR_FLOOR as R, type SummaryTable as S, type GainDistributionFigureSpec as a, type GainDistributionOptions as b, type ParetoPoint as c, type ResearchReport as d, type ResearchReportCandidate as e, type ResearchReportDecision as f, type ResearchReportMethodology as g, type ResearchReportOptions as h, type ResearchReportRecommendation as i, type SummaryTableOptions as j, type SummaryTableRow as k, gainHistogram as l, type GateDecision as m, type GateEvidence as n, type HeldOutGateConfig as o, paretoChart as p, type HeldOutGateRejectionCode as q, researchReport as r, summaryTable as s };
|