@tangle-network/agent-eval 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +134 -0
  2. package/README.md +13 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
  5. package/dist/chunk-7EAUOUQS.js.map +1 -0
  6. package/dist/chunk-AXHNWLIX.js +246 -0
  7. package/dist/chunk-AXHNWLIX.js.map +1 -0
  8. package/dist/chunk-EXGR4XEM.js +283 -0
  9. package/dist/chunk-EXGR4XEM.js.map +1 -0
  10. package/dist/chunk-LZKIOBG2.js +2026 -0
  11. package/dist/chunk-LZKIOBG2.js.map +1 -0
  12. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  13. package/dist/chunk-QBW3YBTR.js.map +1 -0
  14. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  15. package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
  16. package/dist/chunk-VQQSPGSM.js.map +1 -0
  17. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  18. package/dist/control.d.ts +3 -3
  19. package/dist/control.js +2 -2
  20. package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
  21. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  22. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  23. package/dist/index-ekBXweiQ.d.ts +1894 -0
  24. package/dist/index.d.ts +18 -154
  25. package/dist/index.js +125 -25
  26. package/dist/index.js.map +1 -1
  27. package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
  28. package/dist/openapi.json +1 -1
  29. package/dist/optimization.d.ts +5 -5
  30. package/dist/optimization.js +7 -5
  31. package/dist/reporting.d.ts +294 -4
  32. package/dist/reporting.js +6 -4
  33. package/dist/rl.d.ts +8 -0
  34. package/dist/rl.js +113 -0
  35. package/dist/rl.js.map +1 -0
  36. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  37. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  38. package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
  39. package/dist/traces.d.ts +2 -2
  40. package/dist/traces.js +5 -5
  41. package/docs/auto-research-loop-end-to-end.md +186 -0
  42. package/docs/three-package-architecture.md +180 -0
  43. package/package.json +6 -1
  44. package/dist/chunk-UAND2LOT.js.map +0 -1
  45. package/dist/chunk-USHQBPMH.js.map +0 -1
  46. package/dist/chunk-YUFXO3TU.js.map +0 -1
  47. package/dist/reporting-B82RSv9C.d.ts +0 -593
  48. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
@@ -0,0 +1,304 @@
1
+ import { R as RunRecord } from './run-record-DNiOMBrZ.js';
2
+
3
+ /**
4
+ * OutcomeStore — deployment outcomes attached to Run IDs.
5
+ *
6
+ * Outcomes arrive asynchronously from production telemetry after the
7
+ * eval run completed: user ratings, retention flags, conversion events,
8
+ * revenue, support-ticket rate, anything a product team can measure.
9
+ * The store is a peer to TraceStore — separate lifecycle, same runId
10
+ * foreign key.
11
+ *
12
+ * The whole point of this module is to make the meta-eval correlation
13
+ * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
14
+ */
15
+ interface DeploymentOutcome {
16
+ runId: string;
17
+ capturedAt: number;
18
+ /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
19
+ metrics: Record<string, number>;
20
+ /** Dimensions for stratified analysis — cohort, region, user_segment. */
21
+ labels?: Record<string, string>;
22
+ /** Free-form provenance (source system, pipeline version). */
23
+ source?: string;
24
+ }
25
+ interface OutcomeFilter {
26
+ runIds?: string[];
27
+ since?: number;
28
+ until?: number;
29
+ label?: {
30
+ key: string;
31
+ value: string;
32
+ };
33
+ source?: string;
34
+ }
35
+ interface OutcomeStore {
36
+ append(outcome: DeploymentOutcome): Promise<void>;
37
+ /** All outcomes attached to this run (a single run can have many — multiple
38
+ * capture windows over deployment time). */
39
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
40
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
41
+ }
42
+ declare class InMemoryOutcomeStore implements OutcomeStore {
43
+ private items;
44
+ append(outcome: DeploymentOutcome): Promise<void>;
45
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
46
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
47
+ }
48
+ interface FileSystemOutcomeStoreOptions {
49
+ dir: string;
50
+ maxBytes?: number;
51
+ }
52
+ declare class FileSystemOutcomeStore implements OutcomeStore {
53
+ private dir;
54
+ private maxBytes;
55
+ private memo?;
56
+ private loaded;
57
+ constructor(options: FileSystemOutcomeStoreOptions);
58
+ private ensureDir;
59
+ append(outcome: DeploymentOutcome): Promise<void>;
60
+ private load;
61
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
62
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
63
+ }
64
+
65
+ /**
66
+ * Rubric predictive validity — does our eval rubric predict deployment
67
+ * outcomes?
68
+ *
69
+ * `correlationStudy` (already in this package) joins a `TraceStore` to an
70
+ * `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each
71
+ * (eval-metric, outcome-metric) pair. That answers "does X correlate with
72
+ * Y at all." `rubricPredictiveValidity` is the campaign-shaped wrapper
73
+ * around it: take a sequence of `RunRecord`s (the canonical campaign
74
+ * artifact) and an `OutcomeStore`, join on `runId`, return a
75
+ * ranked verdict on every rubric whose dimension scores were captured in
76
+ * `outcome.raw`.
77
+ *
78
+ * The point — quoting the methodology doc — is that **without this loop
79
+ * every rubric is faith-based**. Once it's wired, you know which rubrics
80
+ * have earned their promotion power and which ones are decoration.
81
+ *
82
+ * const validity = await rubricPredictiveValidity({
83
+ * runs: lastQuarter,
84
+ * outcomes: shipFlagOutcomeStore,
85
+ * outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
86
+ * rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],
87
+ * })
88
+ * for (const r of validity.ranked) {
89
+ * console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)
90
+ * }
91
+ *
92
+ * The function is intentionally read-only. Use the verdict to deprecate
93
+ * decorative rubrics, re-weight composite scores, or trigger a
94
+ * recalibration sweep when predictive validity drops below a threshold.
95
+ */
96
+
97
+ interface RubricPredictiveValidityInput {
98
+ /**
99
+ * Canonical campaign output. Each record's `outcome.raw[<rubricId>]`
100
+ * provides the eval score; missing keys are silently skipped per pair.
101
+ */
102
+ runs: RunRecord[];
103
+ outcomes: OutcomeStore;
104
+ /**
105
+ * Outcome metric names to evaluate against. Each must appear in at
106
+ * least one `DeploymentOutcome.metrics` keyspace; pairs with too few
107
+ * joined samples are excluded from the result.
108
+ */
109
+ outcomeMetrics: string[];
110
+ /**
111
+ * Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.
112
+ * If omitted, every numeric key in `outcome.raw` across the run set is
113
+ * treated as a rubric.
114
+ */
115
+ rubrics?: string[];
116
+ /** Minimum joined-sample count before a pair is reported. Default 8. */
117
+ minSamples?: number;
118
+ /** Bootstrap resamples for CI. Default 500. */
119
+ bootstrapResamples?: number;
120
+ /** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */
121
+ seed?: number;
122
+ /**
123
+ * Reduction when multiple outcomes attach to one runId. Default `'latest'`
124
+ * (most recently captured).
125
+ */
126
+ reduction?: 'latest' | 'mean' | 'max';
127
+ }
128
+ interface RubricOutcomePair {
129
+ rubric: string;
130
+ outcome: string;
131
+ n: number;
132
+ pearson: number;
133
+ spearman: number;
134
+ ci95: {
135
+ low: number;
136
+ high: number;
137
+ };
138
+ /**
139
+ * Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,
140
+ * `decorative` < 0.4 in absolute correlation. A negative correlation
141
+ * with a desired outcome is also `decorative` — actively misleading
142
+ * is worse than uninformative.
143
+ */
144
+ verdict: 'load_bearing' | 'informative' | 'decorative';
145
+ }
146
+ interface RubricRanking {
147
+ rubric: string;
148
+ /** Outcome metric this rubric correlated best with. */
149
+ bestOutcome: string;
150
+ spearman: number;
151
+ pearson: number;
152
+ n: number;
153
+ verdict: RubricOutcomePair['verdict'];
154
+ }
155
+ interface RubricPredictiveValidityReport {
156
+ pairs: RubricOutcomePair[];
157
+ /** Per-rubric best pair, sorted descending by |spearman|. */
158
+ ranked: RubricRanking[];
159
+ joinedSamples: number;
160
+ skippedRuns: number;
161
+ /** Rubrics that were declared but never produced a usable score. */
162
+ rubricsWithoutData: string[];
163
+ }
164
+ declare function rubricPredictiveValidity(input: RubricPredictiveValidityInput): Promise<RubricPredictiveValidityReport>;
165
+
166
+ /**
167
+ * Always-valid sequential evaluation.
168
+ *
169
+ * `researchReport` (0.21+) assumes a single pre-specified analysis. Real
170
+ * consumers run campaigns weekly / nightly / per-PR; each new run silently
171
+ * inflates the false-discovery rate, because the BH-FDR guarantee was for
172
+ * the *first* look, not the 47th. Without time-uniform inference,
173
+ * launch-decision teams either (a) don't peek, which forfeits the cost
174
+ * advantage of stop-when-decisive, or (b) peek and pretend they didn't,
175
+ * which forfeits scientific validity.
176
+ *
177
+ * This module ships **e-value-based confidence sequences** for paired
178
+ * bounded outcomes. The methodology is the predictable plug-in betting
179
+ * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
180
+ * stopping time. Concretely:
181
+ *
182
+ * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
183
+ * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
184
+ * plug-in), and the running e-value is
185
+ *
186
+ * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
187
+ *
188
+ * E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
189
+ * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
190
+ * at any time without inflating the type-I error.
191
+ *
192
+ * Combined with `runEvalCampaign`, every consumer running rolling
193
+ * campaigns gains the ability to ship the moment evidence is decisive,
194
+ * stop-early on dead-on-arrival variants, and accumulate evidence across
195
+ * partial runs without spending the FDR budget. No new sweep is wasted.
196
+ *
197
+ * References:
198
+ * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
199
+ * Time-uniform, nonparametric, nonasymptotic confidence sequences.
200
+ * Annals of Statistics, 49(2), 1055–1080.
201
+ * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
202
+ * random variables by betting. JRSS B, 86(1), 1–27.
203
+ */
204
+ type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
205
+ interface PairedEvalueOptions {
206
+ /**
207
+ * Bound c on |delta|. Default 1 (matching most score scales). Must satisfy
208
+ * c > 0; deltas outside [-c, c] are clipped, and the returned sequence's
209
+ * `clipped` flag is set to true.
210
+ */
211
+ bound?: number;
212
+ /** Target Type-I error. Default 0.05. */
213
+ alpha?: number;
214
+ /**
215
+ * Region of Practical Equivalence on the *mean* paired delta. When
216
+ * supplied, the verdict can return `'equivalent'` once the running
217
+ * confidence sequence on the mean is fully contained in [low, high].
218
+ */
219
+ rope?: {
220
+ low: number;
221
+ high: number;
222
+ };
223
+ /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
224
+ initialBetShrinkage?: number;
225
+ }
226
+ interface PairedEvalueStep {
227
+ /** 1-indexed observation count. */
228
+ t: number;
229
+ delta: number;
230
+ /** Running e-value E_t = ∏ (1 + λ_i · D_i). */
231
+ evalue: number;
232
+ /** Time-uniform p-value at stopping time t. */
233
+ pValue: number;
234
+ /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
235
+ csLow: number;
236
+ csHigh: number;
237
+ /** Verdict at this stopping time. */
238
+ decision: SequentialDecision;
239
+ }
240
+ interface PairedEvalueSequence {
241
+ steps: PairedEvalueStep[];
242
+ /** The decision at the final step. */
243
+ finalDecision: SequentialDecision;
244
+ /** Index (1-based) at which a non-`continue` decision first fired, or null. */
245
+ decisionFiredAt: number | null;
246
+ /** True if any deltas were clipped to [-bound, bound]. */
247
+ clipped: boolean;
248
+ }
249
+ /**
250
+ * Run the paired e-value sequence over an in-order delta stream.
251
+ *
252
+ * Use for *streaming* / interim analyses: pass the deltas you have so
253
+ * far, get the verdict at every prefix length. The decision is
254
+ * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
255
+ * fires, the verdict at later steps remains decisive (the e-value is a
256
+ * non-negative martingale; once it crosses the threshold, it's crossed).
257
+ */
258
+ declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
259
+ interface InterimReleaseConfidenceInput {
260
+ /**
261
+ * One delta series per candidate (paired deltas vs comparator). Order
262
+ * within a series is the order the campaigns were run.
263
+ */
264
+ deltaSeries: Array<{
265
+ candidateId: string;
266
+ deltas: number[];
267
+ }>;
268
+ alpha?: number;
269
+ bound?: number;
270
+ rope?: {
271
+ low: number;
272
+ high: number;
273
+ };
274
+ }
275
+ interface InterimReleaseConfidence {
276
+ candidates: Array<{
277
+ candidateId: string;
278
+ decision: SequentialDecision;
279
+ decisionFiredAt: number | null;
280
+ finalEvalue: number;
281
+ finalPValue: number;
282
+ pairs: number;
283
+ csLow: number;
284
+ csHigh: number;
285
+ }>;
286
+ /**
287
+ * Campaign-level recommendation: pick the strongest 'promote_now', else
288
+ * 'continue' if any candidate is still live, else 'reject_now' if every
289
+ * candidate is dead, else 'equivalent'.
290
+ */
291
+ recommendation: {
292
+ decision: SequentialDecision;
293
+ candidateId: string | null;
294
+ };
295
+ }
296
+ /**
297
+ * Run interim sequential analyses across many candidates at once,
298
+ * preserving the time-uniform α guarantee for each candidate's series and
299
+ * synthesising a campaign-level recommendation. Designed to be called on
300
+ * every campaign tick — the recommendation is anytime-valid.
301
+ */
302
+ declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
303
+
304
+ export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeFilter as O, type PairedEvalueOptions as P, type RubricOutcomePair as R, type SequentialDecision as S, type OutcomeStore as a, type FileSystemOutcomeStoreOptions as b, type InterimReleaseConfidence as c, type InterimReleaseConfidenceInput as d, type PairedEvalueSequence as e, type PairedEvalueStep as f, type RubricPredictiveValidityInput as g, type RubricPredictiveValidityReport as h, type RubricRanking as i, evaluateInterimReleaseConfidence as j, pairedEvalueSequence as p, rubricPredictiveValidity as r };
@@ -1,4 +1,4 @@
1
- import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-DNiOMBrZ.js';
2
2
  import { a as Run, S as Span, f as TraceEvent, F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
3
3
 
4
4
  /**
@@ -975,4 +975,4 @@ interface ResearchReport {
975
975
  */
976
976
  declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
977
977
 
978
- export { type ResearchReportOptions as $, type ActionableSideInfo as A, type MultiShotTrace as B, type MultiShotTrialResult as C, DEFAULT_RULES as D, type EvolvableVariant as E, type FailureClassification as F, type GainDistributionBin as G, HeldOutGate as H, InMemoryTrialCache as I, type MultiShotVariant as J, type ParetoFigureSpec as K, type ParetoPoint as L, type MutateAdapter as M, type PromptEvolutionConfig as N, type Objective as O, type ParetoResult as P, type PromptEvolutionEvent as Q, type PromptEvolutionResult as R, RESEARCH_REPORT_HARD_PAIR_FLOOR as S, type TrialCache as T, type ReflectionContext as U, type VariantAggregate as V, type ReflectionProposal as W, type ResearchReport as X, type ResearchReportCandidate as Y, type ResearchReportDecision as Z, type ResearchReportMethodology as _, type TrialResult as a, type ResearchReportRecommendation as a0, type ScenarioAggregate as a1, type ScoreAdapter as a2, type SummaryTable as a3, type SummaryTableOptions as a4, type SummaryTableRow as a5, type TrialTrace as a6, buildReflectionPrompt as a7, classifyFailure as a8, crowdingDistance as a9, defaultMultiShotObjectives as aa, dominates as ab, failureClusterView as ac, gainHistogram as ad, paretoChart as ae, paretoFrontier as af, paretoFrontierWithCrowding as ag, parseReflectionResponse as ah, researchReport as ai, runMultiShotOptimization as aj, runPromptEvolution as ak, scalarScore as al, summaryTable as am, trialTraceFromMultiShotTrial as an, type AsiSeverity as b, DEFAULT_MUTATION_PRIMITIVES as c, type Direction as d, type FailureCluster as e, type FailureClusterReport as f, type FailureContext as g, type FailureRule as h, type GainDistributionFigureSpec as i, type GainDistributionOptions as j, type GateDecision as k, type GateEvidence as l, type GenerationReport as m, type HeldOutGateConfig as n, type HeldOutGateRejectionCode as o, type MultiShotGateConfig as p, type MultiShotGateResult as q, type MultiShotMutateAdapter as r, type 
MultiShotOptimizationConfig as s, type MultiShotOptimizationResult as t, type MultiShotRun as u, type MultiShotRunInput as v, type MultiShotRunner as w, type MultiShotScore as x, type MultiShotScorer as y, type MultiShotSplit as z };
978
+ export { type GateEvidence as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GateDecision as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type ResearchReportOptions as F, type GenerationReport as G, type ResearchReport as H, InMemoryTrialCache as I, type ParetoResult as J, DEFAULT_RULES as K, type Direction as L, type MultiShotGateConfig as M, type FailureClassification as N, type Objective as O, type PromptEvolutionConfig as P, type FailureCluster as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type FailureClusterReport as U, type VariantAggregate as V, type FailureContext as W, type FailureRule as X, type GainDistributionBin as Y, type GainDistributionFigureSpec as Z, type GainDistributionOptions as _, type AsiSeverity as a, HeldOutGate as a0, type HeldOutGateConfig as a1, type HeldOutGateRejectionCode as a2, type ParetoFigureSpec as a3, type ParetoPoint as a4, RESEARCH_REPORT_HARD_PAIR_FLOOR as a5, type ResearchReportCandidate as a6, type ResearchReportDecision as a7, type ResearchReportMethodology as a8, type ResearchReportRecommendation as a9, type SummaryTable as aa, type SummaryTableOptions as ab, type SummaryTableRow as ac, classifyFailure as ad, crowdingDistance as ae, dominates as af, failureClusterView as ag, gainHistogram as ah, paretoChart as ai, paretoFrontier as aj, paretoFrontierWithCrowding as ak, researchReport as al, scalarScore as am, summaryTable as an, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type 
ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };
package/dist/traces.d.ts CHANGED
@@ -2,8 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
2
2
  export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
3
3
  import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
4
4
  export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
5
- import { d as RawProviderSink, c as RawProviderEvent } from './integrity-K2oVlF57.js';
6
- export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
5
+ import { R as RawProviderSink, f as RawProviderEvent } from './integrity-Cr5YodSY.js';
6
+ export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
7
7
  import { AxAIService, AxFunction } from '@ax-llm/ax';
8
8
 
9
9
  /**
package/dist/traces.js CHANGED
@@ -54,11 +54,6 @@ import {
54
54
  assertRunCaptured,
55
55
  throwIfRunIncomplete
56
56
  } from "./chunk-QUKKGHTZ.js";
57
- import {
58
- TraceEmitter,
59
- llmSpanFromProvider
60
- } from "./chunk-5IIQKMD5.js";
61
- import "./chunk-6M774GY6.js";
62
57
  import {
63
58
  FileSystemRawProviderSink,
64
59
  InMemoryRawProviderSink,
@@ -66,6 +61,11 @@ import {
66
61
  defaultProviderRedactor,
67
62
  providerFromBaseUrl
68
63
  } from "./chunk-SQQLHODJ.js";
64
+ import {
65
+ TraceEmitter,
66
+ llmSpanFromProvider
67
+ } from "./chunk-5IIQKMD5.js";
68
+ import "./chunk-6M774GY6.js";
69
69
  import "./chunk-PZ5AY32C.js";
70
70
  export {
71
71
  DEFAULT_REDACTION_RULES,
@@ -0,0 +1,186 @@
1
+ # Auto-research loop end-to-end
2
+
3
+ This is the runnable composition pattern that closes the loop the package
4
+ was originally designed for: capture-integrity → eval → preferences →
5
+ mutation → improved candidate → repeat.
6
+
7
+ There's no new orchestrator primitive that runs this for you (and we
8
+ deliberately resisted shipping one — every consumer's loop has different
9
+ invariants). What this doc gives you is **the integration recipe**: the
10
+ imports, the wiring, and the explicit invariants every iteration must
11
+ preserve.
12
+
13
+ A working version of this recipe lives at
14
+ [`examples/auto-research-with-agent-builder/`](../examples/auto-research-with-agent-builder/) —
15
+ runnable, ~250 lines, demonstrates the score climbing across iterations.
16
+
17
+ ## The pattern
18
+
19
+ ```ts
20
+ import {
21
+ runEvalCampaign,
22
+ analyzeOptimizationResult,
23
+ trialsToRunRecords,
24
+ PredictiveValidityResearcher,
25
+ } from '@tangle-network/agent-eval'
26
+ import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
27
+
28
+ async function runAutoResearchLoop(opts: {
29
+ task: string
30
+ initialVariants: Variant[]
31
+ scenarios: Scenario[]
32
+ iterations: number
33
+ // The thing that turns a Variant into a scoreable artifact.
34
+ // For agent-builder this is `runForgeBuilderSim`; for tax-agent it's
35
+ // their domain runner; for the multi-shot prompt evolution case it's
36
+ // already wired inside `runPromptEvolution`.
37
+ candidateRunner: CandidateRunner<Variant>
38
+ // The thing that proposes the next variants given the analysis output.
39
+ // For prompt-only optimization, this is `reflective-mutation` against
40
+ // the top/bottom trials. For code+prompt, this is `createCompositeMutator`.
41
+ // For agent-builder, this can be a hand-rolled "edit the system prompt"
42
+ // function — the example shows one.
43
+ mutator: (champion: Variant, analysis: AnalysisReport) => Promise<Variant[]>
44
+ // Optional: outcome store for predictive validity. When present, the
45
+ // loop learns which scoring rubrics actually predict deployment outcomes
46
+ // and reweights the composite score accordingly.
47
+ outcomes?: { store: OutcomeStore; metrics: string[] }
48
+ }): Promise<IterationReport[]> {
49
+ const reports: IterationReport[] = []
50
+ let variants = opts.initialVariants
51
+
52
+ // (Optional) standing researcher that drives rubric reweighting.
53
+ const researcher = opts.outcomes
54
+ ? new PredictiveValidityResearcher({
55
+ outcomes: opts.outcomes.store,
56
+ outcomeMetrics: opts.outcomes.metrics,
57
+ })
58
+ : null
59
+
60
+ for (let iter = 0; iter < opts.iterations; iter++) {
61
+ // 1. Capture-integrity-by-construction matrix run.
62
+ const campaign = await runEvalCampaign({
63
+ campaignId: `auto-research-iter-${iter}`,
64
+ commitSha: opts.task,
65
+ variants: variants.map((v) => ({ id: v.id, payload: v })),
66
+ scenarios: opts.scenarios,
67
+ seeds: [0, 1, 2],
68
+ llmOpts: { ... },
69
+ storeFactory: () => new InMemoryTraceStore(),
70
+ rawSinkFactory: () => new InMemoryRawProviderSink(),
71
+ runner: makeCampaignRunner(opts.candidateRunner),
72
+ onRunComplete: opts.outcomes
73
+ ? [traceAnalystOnRunComplete({ analyze: ..., save: ... })]
74
+ : [],
75
+ report: { comparator: variants[0]!.id },
76
+ })
77
+
78
+ // 2. RL-bridge analysis: preferences, verifiable rewards, sequential
79
+ // interim verdict, reward-hacking diagnosis.
80
+ const analysis = await analyzeOptimizationResult({
81
+ result: pretendItsAPromptEvolution(campaign),
82
+ ctx: { experimentId: 'task', model: '...', commitSha: '...', promptHash: '...', configHash: '...' },
83
+ comparator: variants[0]!.id,
84
+ outcomes: opts.outcomes,
85
+ })
86
+
87
+ // 3. Periodic rubric recalibration via predictive validity.
88
+ if (researcher && iter > 0 && iter % 5 === 0) {
89
+ await researcher.runValidityCheck(campaign.runs)
90
+ // The researcher's `proposeChange` output can be folded into the
91
+ // mutator as a steering signal in the next iteration.
92
+ }
93
+
94
+ // 4. Pick champion + record this iteration.
95
+ const champion = pickChampion(campaign.runs)
96
+ reports.push({ iter, champion, score: champion.score, analysis })
97
+
98
+ // 5. Sequential stop: the anytime-valid e-value can decisively call
99
+ // 'promote_now' or 'reject_now' before the iterations are exhausted.
100
+ if (analysis.interimConfidence?.recommendation.decision === 'promote_now') {
101
+ break
102
+ }
103
+
104
+ // 6. Propose next variants via the mutator.
105
+ if (iter < opts.iterations - 1) {
106
+ variants = await opts.mutator(champion.variant, analysis)
107
+ }
108
+ }
109
+
110
+ return reports
111
+ }
112
+ ```
113
+
114
+ ## Invariants every iteration must preserve
115
+
116
+ 1. **The campaign produces RunRecord[] with `scenarioId` populated.** Every
117
+ downstream primitive (preferences, sequential, predictive validity,
118
+ tournament) keys on this. `runEvalCampaign` populates it canonically;
119
+ if you adapt from `runPromptEvolution` use `trialsToRunRecords`.
120
+
121
+ 2. **Capture is wired by construction.** Don't pass `NoopRawProviderSink`
122
+ to `rawSinkFactory` unless the iteration is exploratory. Every
123
+ captured run is replayable, and every replayable run is free judge-iteration
124
+ data for the next loop.
125
+
126
+ 3. **`commitSha` is real.** It's how downstream tooling (predictive
127
+ validity, contamination probe, tournament) ties iterations together.
128
+
129
+ 4. **The comparator is stable across iterations.** Either the original
130
+ `baseline` or whichever champion you froze. Shifting the comparator
131
+ between iterations corrupts the paired-delta semantics.
132
+
133
+ 5. **The mutator is deterministic given the analysis output.** Otherwise
134
+ the iteration isn't reproducible and the auto-research artifacts
135
+ become unfalsifiable. If you need stochastic mutation, seed the
136
+ mutator and emit the seed onto the run record.
137
+
138
+ ## When to run each primitive
139
+
140
+ | Frequency | Primitive | Why |
141
+ |---|---|---|
142
+ | Every iteration | `runEvalCampaign` | core measurement |
143
+ | Every iteration | `analyzeOptimizationResult` | preferences + verifiable rewards + reward-hacking |
144
+ | Every iteration | `evaluateInterimReleaseConfidence` (via `analyzeOptimizationResult`) | anytime-valid stop signal |
145
+ | Every 5–10 iterations | `rubricPredictiveValidity` | rubric weights drift; recalibrate |
146
+ | Every release | `runContaminationProbe` | scenario set freshness |
147
+ | Once per task | `runComputeCurve` | cost-quality frontier |
148
+ | As-needed | `adversarialScenarioSearch` | discover failure modes the curated set missed |
149
+
150
+ ## When to drop into the smaller primitives
151
+
152
+ Two cases:
153
+
154
+ 1. **Trajectory-shaped optimization with steering.** Use
155
+ `runMultiShotOptimization` directly — it already runs the inner
156
+ search-vs-holdout loop. Wrap it with `analyzeOptimizationResult`
157
+ afterwards for the RL bridge.
158
+
159
+ 2. **Prompt + code evolution with sandboxed code mutation.** Use
160
+ `runPromptEvolution` + `createCompositeMutator` directly. Same wrap
161
+ pattern.
162
+
163
+ The auto-research loop above wraps these primitives in a higher-level
164
+ loop that runs them across multiple campaigns. They're each one tick of
165
+ the bigger loop.
166
+
167
+ ## What this does NOT do
168
+
169
+ - It doesn't fine-tune model weights. That's the
170
+ [`fine-tune-with-prime-rl`](../examples/fine-tune-with-prime-rl/) example
171
+ — separate concern, separate trainer.
172
+ - It doesn't drive a production deployment decision on its own. The
173
+ artifacts feed a launch-review process (humans, the `researchReport`
174
+ output, the `assertReleaseConfidence` gate). Loop ≠ promotion gate.
175
+ - It doesn't substitute for a real preregistration trail. The
176
+ `preregistrationHash` field on the report exists so iterations can be
177
+ audited, but the auto-research loop *is* iterative and post-hoc by
178
+ definition. Use the standing `assertReleaseConfidence` gate at the
179
+ release boundary; use the auto-research loop everywhere upstream of it.
180
+
181
+ ## Reading order for the example
182
+
183
+ 1. [`examples/auto-research-with-agent-builder/README.md`](../examples/auto-research-with-agent-builder/README.md) — architectural picture.
184
+ 2. [`examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`](../examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts) — runnable demo.
185
+ 3. Run it: `npx tsx examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`.
186
+ It prints the iteration progression and the score climbing.