@tangle-network/agent-eval 0.20.10 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +129 -126
  2. package/dist/benchmarks/index.d.ts +2 -1
  3. package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
  4. package/dist/chunk-75MCTH7P.js.map +1 -0
  5. package/dist/chunk-HKYRWNHV.js +1354 -0
  6. package/dist/chunk-HKYRWNHV.js.map +1 -0
  7. package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
  8. package/dist/chunk-IKFVX537.js +717 -0
  9. package/dist/chunk-IKFVX537.js.map +1 -0
  10. package/dist/chunk-KWUAAIHR.js +1764 -0
  11. package/dist/chunk-KWUAAIHR.js.map +1 -0
  12. package/dist/chunk-MCMV7DUL.js +1310 -0
  13. package/dist/chunk-MCMV7DUL.js.map +1 -0
  14. package/dist/chunk-ODFINDLQ.js +413 -0
  15. package/dist/chunk-ODFINDLQ.js.map +1 -0
  16. package/dist/chunk-PKCVBYTQ.js +200 -0
  17. package/dist/chunk-PKCVBYTQ.js.map +1 -0
  18. package/dist/chunk-YUFXO3TU.js +148 -0
  19. package/dist/chunk-YUFXO3TU.js.map +1 -0
  20. package/dist/cli.js +2 -2
  21. package/dist/control-C8NKbF3w.d.ts +258 -0
  22. package/dist/control.d.ts +5 -0
  23. package/dist/control.js +30 -0
  24. package/dist/control.js.map +1 -0
  25. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  26. package/dist/emitter-BYO2nSDA.d.ts +387 -0
  27. package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
  28. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  29. package/dist/index.d.ts +115 -2870
  30. package/dist/index.js +1049 -6156
  31. package/dist/index.js.map +1 -1
  32. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  33. package/dist/openapi.json +1 -1
  34. package/dist/optimization.d.ts +145 -0
  35. package/dist/optimization.js +60 -0
  36. package/dist/optimization.js.map +1 -0
  37. package/dist/reporting.d.ts +426 -0
  38. package/dist/reporting.js +32 -0
  39. package/dist/reporting.js.map +1 -0
  40. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  41. package/dist/traces.d.ts +658 -0
  42. package/dist/traces.js +100 -0
  43. package/dist/traces.js.map +1 -0
  44. package/dist/wire/index.js +2 -2
  45. package/docs/concepts.md +16 -11
  46. package/docs/feature-guide.md +10 -17
  47. package/docs/integration-launch-gates.md +77 -0
  48. package/docs/product-eval-adoption.md +221 -0
  49. package/docs/trace-analysis.md +75 -0
  50. package/package.json +21 -1
  51. package/dist/chunk-JAOLXRIA.js.map +0 -1
  52. /package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
@@ -0,0 +1,145 @@
1
+ import { G as GateDecision } from './multi-shot-optimization-Bvtz294B.js';
2
+ export { A as ActionableSideInfo, b as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, e as GenerationReport, I as InMemoryTrialCache, h as MultiShotGateConfig, i as MultiShotGateResult, j as MultiShotMutateAdapter, k as MultiShotOptimizationConfig, l as MultiShotOptimizationResult, m as MultiShotRun, n as MultiShotRunInput, o as MultiShotRunner, p as MultiShotScore, q as MultiShotScorer, r as MultiShotSplit, s as MultiShotTrace, t as MultiShotTrialResult, u as MultiShotVariant, M as MutateAdapter, v as PromptEvolutionConfig, w as PromptEvolutionEvent, x as PromptEvolutionResult, R as ReflectionContext, y as ReflectionProposal, S as ScenarioAggregate, z as ScoreAdapter, T as TrialCache, a as TrialResult, B as TrialTrace, V as VariantAggregate, C as buildReflectionPrompt, J as defaultMultiShotObjectives, Q as parseReflectionResponse, U as runMultiShotOptimization, W as runPromptEvolution, Y as trialTraceFromMultiShotTrial } from './multi-shot-optimization-Bvtz294B.js';
3
+ import { a as RunRecord } from './run-record-CX_jcAyr.js';
4
+ export { n as FeedbackArtifactType, o as FeedbackAttempt, F as FeedbackLabel, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, b as FeedbackTrajectory, y as FeedbackTrajectoryFilter, a as FeedbackTrajectoryStore, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, U as serializeFeedbackTrajectoriesJsonl, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-BGQ_ANCN.js';
5
+ import './dataset-B9qvlm_o.js';
6
+ import './emitter-BYO2nSDA.js';
7
+
8
+ /**
9
+ * Researcher interface — stable hook for an external autonomous-research
10
+ * agent to drive the meta-loop.
11
+ *
12
+ * Implementations live downstream (typically in a private repo that
13
+ * runs the actual LLM). This package ships only the contract + a
14
+ * `NoopResearcher` so consumers can wire the surface without being
15
+ * forced to implement every method up front.
16
+ *
17
+ * The four methods mirror the four stages of the paper "Two Loops,
18
+ * Three Roles":
19
+ *
20
+ * inspectFailures — given the observed runs, what failure modes
21
+ * are present? (data → diagnosis)
22
+ * proposeChange — given diagnosed failure modes, what
23
+ * structural changes should we try?
24
+ * (diagnosis → plan delta)
25
+ * applyChange — fold the proposed deltas into a concrete
26
+ * experiment plan against an existing baseline.
27
+ * (plan delta → executable plan)
28
+ * evaluateChange — run the plan, return runs + the gate verdict.
29
+ * (executable plan → verdict)
30
+ *
31
+ * Composition is the discipline: a Researcher implementation MUST
32
+ * keep these four steps separate and inspectable. Conflating
33
+ * "diagnose + propose + run" into a single LLM call defeats the
34
+ * point of the framework — you can't audit which step lied.
35
+ *
36
+ * THIS INTERFACE IS STABLE. Breaking changes require a new module
37
+ * (e.g. `Researcher2`) so existing implementations keep working.
38
+ */
39
+
40
+ /** A diagnosed failure mode with the run-IDs that exhibit it. */
41
+ interface FailureMode {
42
+ /** Short machine-readable code. Must be stable across runs of the
43
+ * same researcher to enable longitudinal tracking. */
44
+ code: string;
45
+ /** Human-readable description for the paper / dashboard. */
46
+ description: string;
47
+ evidence: {
48
+ /** Run IDs (from `RunRecord.runId`) where this failure mode was
49
+ * observed. */
50
+ runIds: string[];
51
+ /** Number of run samples that informed the diagnosis. */
52
+ samples: number;
53
+ };
54
+ }
55
+ /** A single steering change the researcher wants to try. */
56
+ interface SteeringChange {
57
+ kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
58
+ /** Implementation-specific payload. Researcher implementations
59
+ * define the schema — keep this `unknown` here to avoid coupling
60
+ * the public interface to any one researcher's internal model. */
61
+ payload: unknown;
62
+ /** Why the researcher proposed this change. Goes into the audit
63
+ * trail next to the failure-mode evidence. */
64
+ rationale: string;
65
+ /** Optional self-reported expected delta on the headline metric. */
66
+ expectedDelta?: number;
67
+ }
68
+ /** A single experiment plan, mapped onto the search/holdout splits. */
69
+ interface ExperimentPlan {
70
+ baselineCandidateId: string;
71
+ proposedCandidateId: string;
72
+ changes: SteeringChange[];
73
+ /** USD ceiling for the entire experiment. The runner must stop
74
+ * before exceeding this and report a partial result. */
75
+ evaluationBudgetUsd: number;
76
+ /** Item IDs (your dataset keys) for the search vs holdout splits. */
77
+ splits: {
78
+ search: string[];
79
+ holdout: string[];
80
+ };
81
+ }
82
+ /** Result of running a plan: every run, plus the gate verdict. */
83
+ interface ExperimentResult {
84
+ plan: ExperimentPlan;
85
+ runs: RunRecord[];
86
+ gateDecision: GateDecision;
87
+ }
88
+ /**
89
+ * The researcher loop. Stable, four-step, inspectable.
90
+ *
91
+ * ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
92
+ * │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
93
+ * └──────────┘ └──────────┘ └────┬─────┘
94
+ * │
95
+ * ▼
96
+ * ┌────────────────┐ applyChange ┌────────┐
97
+ * │ ExperimentPlan │ ◀────────────│ base │
98
+ * └────────┬───────┘ └────────┘
99
+ * │
100
+ * evaluateChange ▼
101
+ * ┌────────────────┐
102
+ * │ ExperimentResult│
103
+ * └────────────────┘
104
+ */
105
+ interface Researcher {
106
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
107
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
108
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
109
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
110
+ }
111
+ interface CallbackResearcherOptions {
112
+ inspectFailures: Researcher['inspectFailures'];
113
+ proposeChange: Researcher['proposeChange'];
114
+ applyChange: Researcher['applyChange'];
115
+ evaluateChange: Researcher['evaluateChange'];
116
+ }
117
+ /**
118
+ * Minimal concrete researcher for tests, scripts, and small integrations.
119
+ * Larger autonomous researchers can still implement `Researcher` directly.
120
+ */
121
+ declare class CallbackResearcher implements Researcher {
122
+ private readonly callbacks;
123
+ constructor(callbacks: CallbackResearcherOptions);
124
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
125
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
126
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
127
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
128
+ }
129
+ /**
130
+ * No-op researcher — fails loudly on every method. Use as a placeholder
131
+ * in code paths that wire the interface but don't have an implementation
132
+ * yet. Importantly, this does NOT silently succeed: a no-op researcher
133
+ * that returned empty arrays would muffle the loop's signal that
134
+ * nobody implemented the brain.
135
+ */
136
+ declare class NoopResearcher implements Researcher {
137
+ private readonly hint;
138
+ constructor(hint?: string);
139
+ inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
140
+ proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
141
+ applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
142
+ evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
143
+ }
144
+
145
+ export { CallbackResearcher, type CallbackResearcherOptions, type ExperimentPlan, type ExperimentResult, type FailureMode, NoopResearcher, type Researcher, type SteeringChange };
@@ -0,0 +1,60 @@
1
+ import {
2
+ CallbackResearcher,
3
+ DEFAULT_MUTATION_PRIMITIVES,
4
+ FileSystemFeedbackTrajectoryStore,
5
+ InMemoryFeedbackTrajectoryStore,
6
+ InMemoryTrialCache,
7
+ NoopResearcher,
8
+ assignFeedbackSplit,
9
+ buildReflectionPrompt,
10
+ controlRunToFeedbackTrajectory,
11
+ createFeedbackTrajectory,
12
+ defaultMultiShotObjectives,
13
+ feedbackTrajectoriesToDatasetScenarios,
14
+ feedbackTrajectoriesToOptimizerRows,
15
+ feedbackTrajectoryToDatasetScenario,
16
+ feedbackTrajectoryToOptimizerRow,
17
+ parseFeedbackTrajectoriesJsonl,
18
+ parseReflectionResponse,
19
+ renderPreferenceMemoryMarkdown,
20
+ replayFeedbackTrajectories,
21
+ replayFeedbackTrajectory,
22
+ runMultiShotOptimization,
23
+ runPromptEvolution,
24
+ serializeFeedbackTrajectoriesJsonl,
25
+ summarizePreferenceMemory,
26
+ trialTraceFromMultiShotTrial,
27
+ withAssignedFeedbackSplit
28
+ } from "./chunk-HKYRWNHV.js";
29
+ import "./chunk-YUFXO3TU.js";
30
+ import "./chunk-ODFINDLQ.js";
31
+ import "./chunk-PZ5AY32C.js";
32
+ export {
33
+ CallbackResearcher,
34
+ DEFAULT_MUTATION_PRIMITIVES,
35
+ FileSystemFeedbackTrajectoryStore,
36
+ InMemoryFeedbackTrajectoryStore,
37
+ InMemoryTrialCache,
38
+ NoopResearcher,
39
+ assignFeedbackSplit,
40
+ buildReflectionPrompt,
41
+ controlRunToFeedbackTrajectory,
42
+ createFeedbackTrajectory,
43
+ defaultMultiShotObjectives,
44
+ feedbackTrajectoriesToDatasetScenarios,
45
+ feedbackTrajectoriesToOptimizerRows,
46
+ feedbackTrajectoryToDatasetScenario,
47
+ feedbackTrajectoryToOptimizerRow,
48
+ parseFeedbackTrajectoriesJsonl,
49
+ parseReflectionResponse,
50
+ renderPreferenceMemoryMarkdown,
51
+ replayFeedbackTrajectories,
52
+ replayFeedbackTrajectory,
53
+ runMultiShotOptimization,
54
+ runPromptEvolution,
55
+ serializeFeedbackTrajectoriesJsonl,
56
+ summarizePreferenceMemory,
57
+ trialTraceFromMultiShotTrial,
58
+ withAssignedFeedbackSplit
59
+ };
60
+ //# sourceMappingURL=optimization.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -0,0 +1,426 @@
1
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B9qvlm_o.js';
2
+ import { G as GateDecision, A as ActionableSideInfo, t as MultiShotTrialResult } from './multi-shot-optimization-Bvtz294B.js';
3
+ import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
4
+
5
+ /**
6
+ * Release confidence gate.
7
+ *
8
+ * This is the production-facing composition layer over the lower-level
9
+ * primitives:
10
+ * - Dataset manifests prove corpus/version coverage.
11
+ * - RunRecord rows prove reproducible search/holdout outcomes.
12
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
13
+ * - HeldOutGate decisions remain the paired promotion authority.
14
+ *
15
+ * The gate is intentionally pure and conservative. Missing declared evidence
16
+ * fails closed instead of being treated as a neutral zero.
17
+ */
18
+
19
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
20
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
21
+ interface ReleaseTraceEvidence {
22
+ scenarioId: string;
23
+ candidateId?: string;
24
+ split?: RunSplitTag;
25
+ score?: number;
26
+ ok?: boolean;
27
+ turnCount?: number;
28
+ costUsd?: number;
29
+ durationMs?: number;
30
+ failureMode?: string;
31
+ asi?: ActionableSideInfo[];
32
+ metadata?: Record<string, unknown>;
33
+ }
34
+ interface ReleaseConfidenceThresholds {
35
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
36
+ requireCorpus?: boolean;
37
+ minScenarioCount?: number;
38
+ minSearchRuns?: number;
39
+ minHoldoutRuns?: number;
40
+ /** Require at least one holdout scenario/run. Default true. */
41
+ requireHoldout?: boolean;
42
+ minPassRate?: number;
43
+ minMeanScore?: number;
44
+ /** Search mean may exceed holdout mean by at most this much. */
45
+ maxOverfitGap?: number;
46
+ maxMeanCostUsd?: number;
47
+ maxP95WallMs?: number;
48
+ /** Low-score/failed rows must carry ASI. Default true. */
49
+ requireAsiForFailures?: boolean;
50
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
51
+ failureScoreThreshold?: number;
52
+ }
53
+ interface ReleaseConfidenceInput {
54
+ target: string;
55
+ candidateId?: string;
56
+ baselineId?: string;
57
+ dataset?: DatasetManifest;
58
+ scenarios?: readonly DatasetScenario[];
59
+ runs?: readonly RunRecord[];
60
+ traces?: readonly ReleaseTraceEvidence[];
61
+ gateDecision?: GateDecision | null;
62
+ thresholds?: ReleaseConfidenceThresholds;
63
+ }
64
+ interface ReleaseConfidenceAxis {
65
+ name: ReleaseConfidenceAxisName;
66
+ status: ReleaseConfidenceStatus;
67
+ score: number;
68
+ detail: string;
69
+ }
70
+ interface ReleaseConfidenceIssue {
71
+ axis: ReleaseConfidenceAxisName;
72
+ severity: 'critical' | 'warning';
73
+ code: string;
74
+ detail: string;
75
+ }
76
+ interface ReleaseConfidenceMetrics {
77
+ scenarioCount: number;
78
+ searchRuns: number;
79
+ holdoutRuns: number;
80
+ passRate: number;
81
+ meanScore: number;
82
+ searchMeanScore: number;
83
+ holdoutMeanScore: number;
84
+ overfitGap: number;
85
+ meanCostUsd: number;
86
+ p95WallMs: number;
87
+ failedRows: number;
88
+ failuresWithAsi: number;
89
+ singleShotTraces: number;
90
+ multiShotTraces: number;
91
+ splitCounts: Record<DatasetSplit, number>;
92
+ domainCounts: Record<string, number>;
93
+ failureModeCounts: Record<string, number>;
94
+ responsibleSurfaceCounts: Record<string, number>;
95
+ }
96
+ interface ReleaseConfidenceScorecard {
97
+ target: string;
98
+ candidateId: string | null;
99
+ baselineId: string | null;
100
+ status: ReleaseConfidenceStatus;
101
+ promote: boolean;
102
+ axes: ReleaseConfidenceAxis[];
103
+ issues: ReleaseConfidenceIssue[];
104
+ metrics: ReleaseConfidenceMetrics;
105
+ dataset: DatasetManifest | null;
106
+ gateDecision: GateDecision | null;
107
+ summary: string;
108
+ }
109
+ declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
110
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
111
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
112
+
113
+ /**
114
+ * Paper-grade paired statistics for held-out promotion gates.
115
+ *
116
+ * The promotion gate (`HeldOutGate`) needs three things:
117
+ *
118
+ * 1. A bootstrap confidence interval on the per-item paired delta
119
+ * (`pairedBootstrap`). Median delta is the headline number; the
120
+ * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
121
+ * 2. A non-parametric significance test on the paired deltas
122
+ * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
123
+ * paper-style name).
124
+ * 3. False-discovery-rate correction across simultaneously-tested
125
+ * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
126
+ *
127
+ * Why a separate file: every existing primitive lives in `statistics.ts`
128
+ * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
129
+ * paired-only, paper-grade, and load-bearing for the promotion gate.
130
+ * Putting it next to `statistics.ts` would require editing that file;
131
+ * the brief forbids that. New file, new exports, no surface change.
132
+ */
133
+ interface PairedBootstrapResult {
134
+ /** Number of paired observations (after dropping unequal lengths is rejected). */
135
+ n: number;
136
+ /** Median of paired deltas (after − before). */
137
+ median: number;
138
+ /** Mean of paired deltas. */
139
+ mean: number;
140
+ /** Lower bound of the bootstrap CI on the median delta. */
141
+ low: number;
142
+ /** Upper bound of the bootstrap CI on the median delta. */
143
+ high: number;
144
+ /** Confidence level used (e.g. 0.95). */
145
+ confidence: number;
146
+ /** Number of bootstrap resamples used. */
147
+ resamples: number;
148
+ }
149
+ interface PairedBootstrapOptions {
150
+ /** Confidence level. Default 0.95. */
151
+ confidence?: number;
152
+ /** Bootstrap resample count. Default 2000. */
153
+ resamples?: number;
154
+ /** Statistic to bootstrap. Default 'median'. */
155
+ statistic?: 'median' | 'mean';
156
+ /** Deterministic seed. If omitted, uses Math.random(). */
157
+ seed?: number;
158
+ }
159
+ /**
160
+ * Paired bootstrap on (after - before) deltas. Returns a CI on the
161
+ * chosen statistic (median by default). Pairs are resampled with
162
+ * replacement. The lower bound is what the promotion gate checks: if
163
+ * `low > pairedDeltaThreshold`, the gain is real at the chosen
164
+ * confidence level.
165
+ *
166
+ * Throws on unequal sample sizes — caller must align pairs upstream.
167
+ */
168
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
169
+ /**
170
+ * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
171
+ * paired deltas is the standard non-parametric significance test for
172
+ * "candidate beats baseline on matched items." Use alongside the
173
+ * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
174
+ */
175
+ declare function pairedWilcoxon(before: number[], after: number[]): {
176
+ w: number;
177
+ p: number;
178
+ };
179
+ /**
180
+ * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
181
+ * across multiple candidate-vs-baseline comparisons run in the same
182
+ * promotion sweep. Returns BH-adjusted q-values and significance at
183
+ * the requested FDR (default 0.05).
184
+ */
185
+ declare function bhAdjust(pValues: number[], fdr?: number): {
186
+ qValues: number[];
187
+ significant: boolean[];
188
+ };
189
+
190
+ /**
191
+ * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
192
+ * than replacing it.
193
+ *
194
+ * Three artefacts:
195
+ *
196
+ * - `summaryTable` Markdown table of per-candidate means,
197
+ * 95% bootstrap CIs, BH-adjusted Wilcoxon
198
+ * p-values, and Cohen's d versus a
199
+ * comparator candidate.
200
+ * - `paretoChart` Abstract spec for a cost vs quality
201
+ * scatter, with gate decisions overlaid.
202
+ * Returns numbers + labels — caller
203
+ * chooses the plotting library.
204
+ * - `gainHistogram`
205
+ * Per-item paired holdout deltas as a
206
+ * histogram spec (bins + counts + median +
207
+ * CI). Same "data, not images" contract.
208
+ *
209
+ * The figure types are PlotSpecs — JSON-friendly, library-agnostic.
210
+ * They aren't React components and they aren't PNGs; they are
211
+ * what you'd hand to vega-lite, plotly, matplotlib, or your own
212
+ * Canvas renderer to draw the actual figure.
213
+ */
214
+
215
+ interface SummaryTableOptions {
216
+ /** Comparator candidate id. Wilcoxon + Cohen's d are computed
217
+ * versus this candidate. Required for paired stats columns. */
218
+ comparator?: string;
219
+ /** Which split to read scores from. Default 'holdout'. */
220
+ split?: 'search' | 'holdout';
221
+ /** Confidence level for the bootstrap CI on the mean. Default 0.95. */
222
+ confidence?: number;
223
+ /** FDR for BH adjustment of the comparison p-values. Default 0.05. */
224
+ fdr?: number;
225
+ }
226
+ interface SummaryTableRow {
227
+ candidateId: string;
228
+ n: number;
229
+ mean: number;
230
+ ciLow: number;
231
+ ciHigh: number;
232
+ /** BH-adjusted q-value vs comparator. NaN if no comparator. */
233
+ qValue: number;
234
+ /** Cohen's d vs comparator. NaN if no comparator. */
235
+ cohensD: number;
236
+ }
237
+ interface SummaryTable {
238
+ rows: SummaryTableRow[];
239
+ comparator: string | null;
240
+ split: 'search' | 'holdout';
241
+ /** Pre-rendered markdown — drop into a paper or PR. */
242
+ markdown: string;
243
+ }
244
+ /**
245
+ * Table 1 helper. Buckets runs by `candidateId`, computes mean +
246
+ * bootstrap CI on the chosen split, and (when a comparator is given)
247
+ * BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
248
+ */
249
+ declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
250
+ interface ParetoPoint {
251
+ candidateId: string;
252
+ /** Mean USD cost per run on the chosen split. */
253
+ cost: number;
254
+ /** Mean score on the chosen split. */
255
+ quality: number;
256
+ /** Number of runs that informed this point. */
257
+ n: number;
258
+ /** Whether this candidate is on the Pareto frontier — high
259
+ * quality, low cost, no dominator. */
260
+ onFrontier: boolean;
261
+ /** Optional gate verdict for this candidate, if a `GateDecision`
262
+ * for it was passed in. */
263
+ gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
264
+ }
265
+ interface ParetoFigureSpec {
266
+ kind: 'pareto-cost-quality';
267
+ split: 'search' | 'holdout';
268
+ points: ParetoPoint[];
269
+ axes: {
270
+ x: 'costUsd';
271
+ y: 'score';
272
+ };
273
+ }
274
+ /**
275
+ * Cost vs quality scatter spec. `gateDecisions` is keyed by
276
+ * candidate id; if present, every point picks up the gate verdict
277
+ * for overlay.
278
+ */
279
+ declare function paretoChart(runs: RunRecord[], opts?: {
280
+ split?: 'search' | 'holdout';
281
+ gateDecisions?: Record<string, GateDecision>;
282
+ }): ParetoFigureSpec;
283
+ interface GainDistributionBin {
284
+ /** Inclusive lower edge. */
285
+ lo: number;
286
+ /** Exclusive upper edge (or inclusive if it's the last bin). */
287
+ hi: number;
288
+ /** Number of pairs whose delta lands in this bin. */
289
+ count: number;
290
+ }
291
+ interface GainDistributionFigureSpec {
292
+ kind: 'gain-distribution';
293
+ candidateId: string;
294
+ comparator: string;
295
+ split: 'search' | 'holdout';
296
+ /** Number of pairs used. */
297
+ n: number;
298
+ bins: GainDistributionBin[];
299
+ median: number;
300
+ ci: {
301
+ low: number;
302
+ high: number;
303
+ };
304
+ }
305
+ interface GainDistributionOptions {
306
+ /** Number of histogram bins. Default 11 (so the centre is exact at 0). */
307
+ bins?: number;
308
+ /** Which split to use. Default 'holdout'. */
309
+ split?: 'search' | 'holdout';
310
+ /** Confidence level for the CI. Default 0.95. */
311
+ confidence?: number;
312
+ /** Bootstrap resamples. Default 2000. */
313
+ resamples?: number;
314
+ /** Deterministic seed. */
315
+ seed?: number;
316
+ }
317
+ /**
318
+ * Held-out improvement distribution: per-pair delta (candidate −
319
+ * comparator), histogrammed. Includes the bootstrap CI on the median
320
+ * delta — same primitive the promotion gate uses.
321
+ */
322
+ declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
323
+
324
+ interface RenderReleaseReportOptions {
325
+ title?: string;
326
+ runs?: readonly RunRecord[];
327
+ comparator?: string;
328
+ traceAnalystFindings?: readonly string[];
329
+ nextActions?: readonly string[];
330
+ }
331
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
332
+
333
+ /**
334
+ * Bootstrap-CI promotion gate.
335
+ *
336
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
337
+ * curation), the question is "did this generation actually improve, or are
338
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
339
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
340
+ * delta is real before code or prompts get promoted.
341
+ *
342
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
343
+ * and to compose into any verdict gate.
344
+ *
345
+ * Default gate:
346
+ * - Bootstrap mean baseline vs candidate (1k resamples).
347
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
348
+ * - Tunable confidence (default 95%) and resample count.
349
+ *
350
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
351
+ * vocabulary:
352
+ * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
353
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
354
+ * - REVERT: candidate's CI upper bound < baseline mean (real regression)
355
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
356
+ */
357
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
358
+ interface BootstrapResult {
359
+ baselineMean: number;
360
+ candidateMean: number;
361
+ /** candidateMean - baselineMean, point estimate. */
362
+ delta: number;
363
+ /** Lower bound of the (1 - alpha) CI on the delta. */
364
+ ciLower: number;
365
+ /** Upper bound of the (1 - alpha) CI on the delta. */
366
+ ciUpper: number;
367
+ /** Number of bootstrap resamples used. */
368
+ iterations: number;
369
+ alpha: number;
370
+ verdict: Verdict;
371
+ }
372
+ interface BootstrapOptions {
373
+ /** Significance level alpha; the CI has confidence 1 - alpha (default 0.05 → 95% CI). */
374
+ alpha?: number;
375
+ /** Number of resamples (default 1000). */
376
+ iterations?: number;
377
+ /**
378
+ * Minimum total samples (baseline + candidate) below which we always
379
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
380
+ * Default 6 (combined).
381
+ */
382
+ minTotalSamples?: number;
383
+ /** RNG seed for reproducibility. If omitted, Math.random is used. */
384
+ seed?: number;
385
+ }
386
+ /**
387
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
388
+ *
389
+ * Uses simple percentile bootstrap on the difference of resampled means.
390
+ * That's the standard non-parametric primitive — no distributional
391
+ * assumptions, robust to skew, easy to reason about.
392
+ */
393
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
394
+ /**
395
+ * Judge-replay promotion gate.
396
+ *
397
+ * The cheap inner-loop judge that drives an evolution run is by definition
398
+ * fast and noisy. When you're about to promote a winning variant to the
399
+ * canonical default, you want a STRONGER judge (a more expensive model, a
400
+ * human grader, a separately-trained reward model) to confirm the win
401
+ * generalises beyond the inner loop.
402
+ *
403
+ * This helper takes raw winner + baseline outputs, scores both through the
404
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
405
+ * judge agrees the winner is real with the configured confidence. Doesn't
406
+ * matter what shape your "output" is — pass a string, an object, anything
407
+ * the judge can read.
408
+ */
409
+ interface JudgeReplayGateArgs<TOutput> {
410
+ baselineOutputs: TOutput[];
411
+ candidateOutputs: TOutput[];
412
+ /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
413
+ judge: (output: TOutput) => Promise<number> | number;
414
+ alpha?: number;
415
+ iterations?: number;
416
+ /** RNG seed for reproducibility. */
417
+ seed?: number;
418
+ /** Maximum concurrent judge calls. Default 4. */
419
+ judgeConcurrency?: number;
420
+ }
421
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
422
+ baselineSamples: number;
423
+ candidateSamples: number;
424
+ }>;
425
+
426
+ export { type BootstrapOptions, type BootstrapResult, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type JudgeReplayGateArgs, type PairedBootstrapOptions, type PairedBootstrapResult, type ParetoFigureSpec, type ParetoPoint, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type RenderReleaseReportOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, gainHistogram, judgeReplayGate, pairedBootstrap, pairedWilcoxon, paretoChart, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport, summaryTable };
@@ -0,0 +1,32 @@
1
+ import {
2
+ assertReleaseConfidence,
3
+ bootstrapCi,
4
+ evaluateReleaseConfidence,
5
+ gainHistogram,
6
+ judgeReplayGate,
7
+ paretoChart,
8
+ releaseTraceEvidenceFromMultiShotTrials,
9
+ renderReleaseReport,
10
+ summaryTable
11
+ } from "./chunk-IKFVX537.js";
12
+ import {
13
+ bhAdjust,
14
+ pairedBootstrap,
15
+ pairedWilcoxon
16
+ } from "./chunk-ODFINDLQ.js";
17
+ import "./chunk-PZ5AY32C.js";
18
+ export {
19
+ assertReleaseConfidence,
20
+ bhAdjust,
21
+ bootstrapCi,
22
+ evaluateReleaseConfidence,
23
+ gainHistogram,
24
+ judgeReplayGate,
25
+ pairedBootstrap,
26
+ pairedWilcoxon,
27
+ paretoChart,
28
+ releaseTraceEvidenceFromMultiShotTrials,
29
+ renderReleaseReport,
30
+ summaryTable
31
+ };
32
+ //# sourceMappingURL=reporting.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}