@tangle-network/agent-eval 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +134 -0
- package/README.md +13 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
- package/dist/chunk-VQQSPGSM.js.map +1 -0
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +18 -154
- package/dist/index.js +125 -25
- package/dist/index.js.map +1 -1
- package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +7 -5
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +6 -4
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +5 -5
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/three-package-architecture.md +180 -0
- package/package.json +6 -1
- package/dist/chunk-UAND2LOT.js.map +0 -1
- package/dist/chunk-USHQBPMH.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-B82RSv9C.d.ts +0 -593
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
import { R as RunRecord } from './run-record-DNiOMBrZ.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* OutcomeStore — deployment outcomes attached to Run IDs.
|
|
5
|
+
*
|
|
6
|
+
* Outcomes arrive asynchronously from production telemetry after the
|
|
7
|
+
* eval run completed: user ratings, retention flags, conversion events,
|
|
8
|
+
* revenue, support-ticket rate, anything a product team can measure.
|
|
9
|
+
* The store is a peer to TraceStore — separate lifecycle, same runId
|
|
10
|
+
* foreign key.
|
|
11
|
+
*
|
|
12
|
+
* The whole point of this module is to make the meta-eval correlation
|
|
13
|
+
* question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
|
|
14
|
+
*/
|
|
15
|
+
interface DeploymentOutcome {
|
|
16
|
+
runId: string;
|
|
17
|
+
capturedAt: number;
|
|
18
|
+
/** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
|
|
19
|
+
metrics: Record<string, number>;
|
|
20
|
+
/** Dimensions for stratified analysis — cohort, region, user_segment. */
|
|
21
|
+
labels?: Record<string, string>;
|
|
22
|
+
/** Free-form provenance (source system, pipeline version). */
|
|
23
|
+
source?: string;
|
|
24
|
+
}
|
|
25
|
+
interface OutcomeFilter {
|
|
26
|
+
runIds?: string[];
|
|
27
|
+
since?: number;
|
|
28
|
+
until?: number;
|
|
29
|
+
label?: {
|
|
30
|
+
key: string;
|
|
31
|
+
value: string;
|
|
32
|
+
};
|
|
33
|
+
source?: string;
|
|
34
|
+
}
|
|
35
|
+
interface OutcomeStore {
|
|
36
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
37
|
+
/** All outcomes attached to this run (a single run can have many — multiple
|
|
38
|
+
* capture windows over deployment time). */
|
|
39
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
40
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
41
|
+
}
|
|
42
|
+
declare class InMemoryOutcomeStore implements OutcomeStore {
|
|
43
|
+
private items;
|
|
44
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
45
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
46
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
47
|
+
}
|
|
48
|
+
interface FileSystemOutcomeStoreOptions {
|
|
49
|
+
dir: string;
|
|
50
|
+
maxBytes?: number;
|
|
51
|
+
}
|
|
52
|
+
declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
53
|
+
private dir;
|
|
54
|
+
private maxBytes;
|
|
55
|
+
private memo?;
|
|
56
|
+
private loaded;
|
|
57
|
+
constructor(options: FileSystemOutcomeStoreOptions);
|
|
58
|
+
private ensureDir;
|
|
59
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
60
|
+
private load;
|
|
61
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
62
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Rubric predictive validity — does our eval rubric predict deployment
|
|
67
|
+
* outcomes?
|
|
68
|
+
*
|
|
69
|
+
* `correlationStudy` (already in this package) joins a `TraceStore` to an
|
|
70
|
+
* `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each
|
|
71
|
+
* (eval-metric, outcome-metric) pair. That answers "does X correlate with
|
|
72
|
+
* Y at all." `rubricPredictiveValidity` is the campaign-shaped wrapper
|
|
73
|
+
* around it: take a sequence of `RunRecord`s (the canonical campaign
|
|
74
|
+
* artifact) and an `OutcomeStore`, join on `runId`, return a
|
|
75
|
+
* ranked verdict on every rubric whose dimension scores were captured in
|
|
76
|
+
* `outcome.raw`.
|
|
77
|
+
*
|
|
78
|
+
* The point — quoting the methodology doc — is that **without this loop
|
|
79
|
+
* every rubric is faith-based**. Once it's wired, you know which rubrics
|
|
80
|
+
* have earned their promotion power and which ones are decoration.
|
|
81
|
+
*
|
|
82
|
+
* const validity = await rubricPredictiveValidity({
|
|
83
|
+
* runs: lastQuarter,
|
|
84
|
+
* outcomes: shipFlagOutcomeStore,
|
|
85
|
+
* outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
|
|
86
|
+
* rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],
|
|
87
|
+
* })
|
|
88
|
+
* for (const r of validity.ranked) {
|
|
89
|
+
* console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)
|
|
90
|
+
* }
|
|
91
|
+
*
|
|
92
|
+
* The function is intentionally read-only. Use the verdict to deprecate
|
|
93
|
+
* decorative rubrics, re-weight composite scores, or trigger a
|
|
94
|
+
* recalibration sweep when predictive validity drops below a threshold.
|
|
95
|
+
*/
|
|
96
|
+
|
|
97
|
+
interface RubricPredictiveValidityInput {
|
|
98
|
+
/**
|
|
99
|
+
* Canonical campaign output. Each record's `outcome.raw[<rubricId>]`
|
|
100
|
+
* provides the eval score; missing keys are silently skipped per pair.
|
|
101
|
+
*/
|
|
102
|
+
runs: RunRecord[];
|
|
103
|
+
outcomes: OutcomeStore;
|
|
104
|
+
/**
|
|
105
|
+
* Outcome metric names to evaluate against. Each must appear in at
|
|
106
|
+
* least one `DeploymentOutcome.metrics` keyspace; pairs with too few
|
|
107
|
+
* joined samples are excluded from the result.
|
|
108
|
+
*/
|
|
109
|
+
outcomeMetrics: string[];
|
|
110
|
+
/**
|
|
111
|
+
* Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.
|
|
112
|
+
* If omitted, every numeric key in `outcome.raw` across the run set is
|
|
113
|
+
* treated as a rubric.
|
|
114
|
+
*/
|
|
115
|
+
rubrics?: string[];
|
|
116
|
+
/** Minimum joined-sample count before a pair is reported. Default 8. */
|
|
117
|
+
minSamples?: number;
|
|
118
|
+
/** Bootstrap resamples for CI. Default 500. */
|
|
119
|
+
bootstrapResamples?: number;
|
|
120
|
+
/** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */
|
|
121
|
+
seed?: number;
|
|
122
|
+
/**
|
|
123
|
+
* Reduction when multiple outcomes attach to one runId. Default `'latest'`
|
|
124
|
+
* (most recently captured).
|
|
125
|
+
*/
|
|
126
|
+
reduction?: 'latest' | 'mean' | 'max';
|
|
127
|
+
}
|
|
128
|
+
interface RubricOutcomePair {
|
|
129
|
+
rubric: string;
|
|
130
|
+
outcome: string;
|
|
131
|
+
n: number;
|
|
132
|
+
pearson: number;
|
|
133
|
+
spearman: number;
|
|
134
|
+
ci95: {
|
|
135
|
+
low: number;
|
|
136
|
+
high: number;
|
|
137
|
+
};
|
|
138
|
+
/**
|
|
139
|
+
* Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,
|
|
140
|
+
* `decorative` < 0.4 in absolute correlation. A negative correlation
|
|
141
|
+
* with a desired outcome is also `decorative` — actively misleading
|
|
142
|
+
* is worse than uninformative.
|
|
143
|
+
*/
|
|
144
|
+
verdict: 'load_bearing' | 'informative' | 'decorative';
|
|
145
|
+
}
|
|
146
|
+
interface RubricRanking {
|
|
147
|
+
rubric: string;
|
|
148
|
+
/** Outcome metric this rubric correlated best with. */
|
|
149
|
+
bestOutcome: string;
|
|
150
|
+
spearman: number;
|
|
151
|
+
pearson: number;
|
|
152
|
+
n: number;
|
|
153
|
+
verdict: RubricOutcomePair['verdict'];
|
|
154
|
+
}
|
|
155
|
+
interface RubricPredictiveValidityReport {
|
|
156
|
+
pairs: RubricOutcomePair[];
|
|
157
|
+
/** Per-rubric best pair, sorted descending by |spearman|. */
|
|
158
|
+
ranked: RubricRanking[];
|
|
159
|
+
joinedSamples: number;
|
|
160
|
+
skippedRuns: number;
|
|
161
|
+
/** Rubrics that were declared but never produced a usable score. */
|
|
162
|
+
rubricsWithoutData: string[];
|
|
163
|
+
}
|
|
164
|
+
declare function rubricPredictiveValidity(input: RubricPredictiveValidityInput): Promise<RubricPredictiveValidityReport>;
|
|
165
|
+
|
|
166
|
+
/**
|
|
167
|
+
* Always-valid sequential evaluation.
|
|
168
|
+
*
|
|
169
|
+
* `researchReport` (0.21+) assumes a single pre-specified analysis. Real
|
|
170
|
+
* consumers run campaigns weekly / nightly / per-PR; each new run silently
|
|
171
|
+
* inflates the false-discovery rate, because the BH-FDR guarantee was for
|
|
172
|
+
* the *first* look, not the 47th. Without time-uniform inference,
|
|
173
|
+
* launch-decision teams either (a) don't peek, which forfeits the cost
|
|
174
|
+
* advantage of stop-when-decisive, or (b) peek and pretend they didn't,
|
|
175
|
+
* which forfeits scientific validity.
|
|
176
|
+
*
|
|
177
|
+
* This module ships **e-value-based confidence sequences** for paired
|
|
178
|
+
* bounded outcomes. The methodology is the predictable plug-in betting
|
|
179
|
+
* martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
|
|
180
|
+
* stopping time. Concretely:
|
|
181
|
+
*
|
|
182
|
+
* For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
|
|
183
|
+
* a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
|
|
184
|
+
* plug-in), and the running e-value is
|
|
185
|
+
*
|
|
186
|
+
* E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
|
|
187
|
+
*
|
|
188
|
+
* E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
|
|
189
|
+
* Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
|
|
190
|
+
* at any time without inflating the type-I error.
|
|
191
|
+
*
|
|
192
|
+
* Combined with `runEvalCampaign`, every consumer running rolling
|
|
193
|
+
* campaigns gains the ability to ship the moment evidence is decisive,
|
|
194
|
+
* stop-early on dead-on-arrival variants, and accumulate evidence across
|
|
195
|
+
* partial runs without spending the FDR budget. No new sweep is wasted.
|
|
196
|
+
*
|
|
197
|
+
* References:
|
|
198
|
+
* - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
|
|
199
|
+
* Time-uniform, nonparametric, nonasymptotic confidence sequences.
|
|
200
|
+
* Annals of Statistics, 49(2), 1055–1080.
|
|
201
|
+
* - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
|
|
202
|
+
* random variables by betting. JRSS B, 86(1), 1–27.
|
|
203
|
+
*/
|
|
204
|
+
type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
|
|
205
|
+
interface PairedEvalueOptions {
|
|
206
|
+
/**
|
|
207
|
+
* Bound on |delta|. Default 1 (matching most score scales). Must satisfy
|
|
208
|
+
* c > 0; deltas outside [-c, c] are clipped with a warning attached to
|
|
209
|
+
* the return value (the `clipped` flag on `PairedEvalueSequence`).
|
|
210
|
+
*/
|
|
211
|
+
bound?: number;
|
|
212
|
+
/** Target Type-I error. Default 0.05. */
|
|
213
|
+
alpha?: number;
|
|
214
|
+
/**
|
|
215
|
+
* Region of Practical Equivalence on the *mean* paired delta. When
|
|
216
|
+
* supplied, the verdict can return `'equivalent'` once the running
|
|
217
|
+
* confidence sequence on the mean is fully contained in [low, high].
|
|
218
|
+
*/
|
|
219
|
+
rope?: {
|
|
220
|
+
low: number;
|
|
221
|
+
high: number;
|
|
222
|
+
};
|
|
223
|
+
/** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
|
|
224
|
+
initialBetShrinkage?: number;
|
|
225
|
+
}
|
|
226
|
+
interface PairedEvalueStep {
|
|
227
|
+
/** 1-indexed observation count. */
|
|
228
|
+
t: number;
|
|
229
|
+
delta: number;
|
|
230
|
+
/** Running e-value E_t = ∏ (1 + λ_i · D_i). */
|
|
231
|
+
evalue: number;
|
|
232
|
+
/** Time-uniform p-value at stopping time t. */
|
|
233
|
+
pValue: number;
|
|
234
|
+
/** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
|
|
235
|
+
csLow: number;
|
|
236
|
+
csHigh: number;
|
|
237
|
+
/** Verdict at this stopping time. */
|
|
238
|
+
decision: SequentialDecision;
|
|
239
|
+
}
|
|
240
|
+
interface PairedEvalueSequence {
|
|
241
|
+
steps: PairedEvalueStep[];
|
|
242
|
+
/** The decision at the final step. */
|
|
243
|
+
finalDecision: SequentialDecision;
|
|
244
|
+
/** Index (1-based) at which a non-`continue` decision first fired, or null. */
|
|
245
|
+
decisionFiredAt: number | null;
|
|
246
|
+
/** True if any deltas were clipped to [-bound, bound]. */
|
|
247
|
+
clipped: boolean;
|
|
248
|
+
}
|
|
249
|
+
/**
|
|
250
|
+
* Run the paired e-value sequence over an in-order delta stream.
|
|
251
|
+
*
|
|
252
|
+
* Use for *streaming* / interim analyses: pass the deltas you have so
|
|
253
|
+
* far, get the verdict at every prefix length. The decision is
|
|
254
|
+
* monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
|
|
255
|
+
* fires, the verdict at later steps remains decisive (the e-value is a
|
|
256
|
+
* non-negative martingale; once it crosses the threshold, it's crossed).
|
|
257
|
+
*/
|
|
258
|
+
declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
|
|
259
|
+
interface InterimReleaseConfidenceInput {
|
|
260
|
+
/**
|
|
261
|
+
* One delta series per candidate (paired deltas vs comparator). Order
|
|
262
|
+
* within a series is the order the campaigns were run.
|
|
263
|
+
*/
|
|
264
|
+
deltaSeries: Array<{
|
|
265
|
+
candidateId: string;
|
|
266
|
+
deltas: number[];
|
|
267
|
+
}>;
|
|
268
|
+
alpha?: number;
|
|
269
|
+
bound?: number;
|
|
270
|
+
rope?: {
|
|
271
|
+
low: number;
|
|
272
|
+
high: number;
|
|
273
|
+
};
|
|
274
|
+
}
|
|
275
|
+
interface InterimReleaseConfidence {
|
|
276
|
+
candidates: Array<{
|
|
277
|
+
candidateId: string;
|
|
278
|
+
decision: SequentialDecision;
|
|
279
|
+
decisionFiredAt: number | null;
|
|
280
|
+
finalEvalue: number;
|
|
281
|
+
finalPValue: number;
|
|
282
|
+
pairs: number;
|
|
283
|
+
csLow: number;
|
|
284
|
+
csHigh: number;
|
|
285
|
+
}>;
|
|
286
|
+
/**
|
|
287
|
+
* Campaign-level recommendation: pick the strongest 'promote_now', else
|
|
288
|
+
* 'continue' if any candidate is still live, else 'reject_now' if every
|
|
289
|
+
* candidate is dead, else 'equivalent'.
|
|
290
|
+
*/
|
|
291
|
+
recommendation: {
|
|
292
|
+
decision: SequentialDecision;
|
|
293
|
+
candidateId: string | null;
|
|
294
|
+
};
|
|
295
|
+
}
|
|
296
|
+
/**
|
|
297
|
+
* Run interim sequential analyses across many candidates at once,
|
|
298
|
+
* preserving the time-uniform α guarantee for each candidate's series and
|
|
299
|
+
* synthesising a campaign-level recommendation. Designed to be called on
|
|
300
|
+
* every campaign tick — the recommendation is anytime-valid.
|
|
301
|
+
*/
|
|
302
|
+
declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
|
|
303
|
+
|
|
304
|
+
export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeFilter as O, type PairedEvalueOptions as P, type RubricOutcomePair as R, type SequentialDecision as S, type OutcomeStore as a, type FileSystemOutcomeStoreOptions as b, type InterimReleaseConfidence as c, type InterimReleaseConfidenceInput as d, type PairedEvalueSequence as e, type PairedEvalueStep as f, type RubricPredictiveValidityInput as g, type RubricPredictiveValidityReport as h, type RubricRanking as i, evaluateInterimReleaseConfidence as j, pairedEvalueSequence as p, rubricPredictiveValidity as r };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-DNiOMBrZ.js';
|
|
2
2
|
import { a as Run, S as Span, f as TraceEvent, F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -975,4 +975,4 @@ interface ResearchReport {
|
|
|
975
975
|
*/
|
|
976
976
|
declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
|
|
977
977
|
|
|
978
|
-
export { type
|
|
978
|
+
export { type GateEvidence as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GateDecision as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type ResearchReportOptions as F, type GenerationReport as G, type ResearchReport as H, InMemoryTrialCache as I, type ParetoResult as J, DEFAULT_RULES as K, type Direction as L, type MultiShotGateConfig as M, type FailureClassification as N, type Objective as O, type PromptEvolutionConfig as P, type FailureCluster as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type FailureClusterReport as U, type VariantAggregate as V, type FailureContext as W, type FailureRule as X, type GainDistributionBin as Y, type GainDistributionFigureSpec as Z, type GainDistributionOptions as _, type AsiSeverity as a, HeldOutGate as a0, type HeldOutGateConfig as a1, type HeldOutGateRejectionCode as a2, type ParetoFigureSpec as a3, type ParetoPoint as a4, RESEARCH_REPORT_HARD_PAIR_FLOOR as a5, type ResearchReportCandidate as a6, type ResearchReportDecision as a7, type ResearchReportMethodology as a8, type ResearchReportRecommendation as a9, type SummaryTable as aa, type SummaryTableOptions as ab, type SummaryTableRow as ac, classifyFailure as ad, crowdingDistance as ae, dominates as af, failureClusterView as ag, gainHistogram as ah, paretoChart as ai, paretoFrontier as aj, paretoFrontierWithCrowding as ak, researchReport as al, scalarScore as am, summaryTable as an, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type 
ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };
|
package/dist/traces.d.ts
CHANGED
|
@@ -2,8 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
|
|
|
2
2
|
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
|
|
3
3
|
import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
|
|
4
4
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
|
|
5
|
-
import {
|
|
6
|
-
export { F as FileSystemRawProviderSink,
|
|
5
|
+
import { R as RawProviderSink, f as RawProviderEvent } from './integrity-Cr5YodSY.js';
|
|
6
|
+
export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
|
|
7
7
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
8
8
|
|
|
9
9
|
/**
|
package/dist/traces.js
CHANGED
|
@@ -54,11 +54,6 @@ import {
|
|
|
54
54
|
assertRunCaptured,
|
|
55
55
|
throwIfRunIncomplete
|
|
56
56
|
} from "./chunk-QUKKGHTZ.js";
|
|
57
|
-
import {
|
|
58
|
-
TraceEmitter,
|
|
59
|
-
llmSpanFromProvider
|
|
60
|
-
} from "./chunk-5IIQKMD5.js";
|
|
61
|
-
import "./chunk-6M774GY6.js";
|
|
62
57
|
import {
|
|
63
58
|
FileSystemRawProviderSink,
|
|
64
59
|
InMemoryRawProviderSink,
|
|
@@ -66,6 +61,11 @@ import {
|
|
|
66
61
|
defaultProviderRedactor,
|
|
67
62
|
providerFromBaseUrl
|
|
68
63
|
} from "./chunk-SQQLHODJ.js";
|
|
64
|
+
import {
|
|
65
|
+
TraceEmitter,
|
|
66
|
+
llmSpanFromProvider
|
|
67
|
+
} from "./chunk-5IIQKMD5.js";
|
|
68
|
+
import "./chunk-6M774GY6.js";
|
|
69
69
|
import "./chunk-PZ5AY32C.js";
|
|
70
70
|
export {
|
|
71
71
|
DEFAULT_REDACTION_RULES,
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# Auto-research loop end-to-end
|
|
2
|
+
|
|
3
|
+
This is the runnable composition pattern that closes the loop the package
|
|
4
|
+
was originally designed for: capture-integrity → eval → preferences →
|
|
5
|
+
mutation → improved candidate → repeat.
|
|
6
|
+
|
|
7
|
+
There's no new orchestrator primitive that runs this for you (and we
|
|
8
|
+
deliberately resisted shipping one — every consumer's loop has different
|
|
9
|
+
invariants). What this doc gives you is **the integration recipe**: the
|
|
10
|
+
imports, the wiring, and the explicit invariants every iteration must
|
|
11
|
+
preserve.
|
|
12
|
+
|
|
13
|
+
A working version of this recipe lives at
|
|
14
|
+
[`examples/auto-research-with-agent-builder/`](../examples/auto-research-with-agent-builder/) —
|
|
15
|
+
runnable, ~250 lines, demonstrates the score climbing across iterations.
|
|
16
|
+
|
|
17
|
+
## The pattern
|
|
18
|
+
|
|
19
|
+
```ts
|
|
20
|
+
import {
|
|
21
|
+
runEvalCampaign,
|
|
22
|
+
analyzeOptimizationResult,
|
|
23
|
+
trialsToRunRecords,
|
|
24
|
+
PredictiveValidityResearcher,
|
|
25
|
+
} from '@tangle-network/agent-eval'
|
|
26
|
+
import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
|
|
27
|
+
|
|
28
|
+
async function runAutoResearchLoop(opts: {
|
|
29
|
+
task: string
|
|
30
|
+
initialVariants: Variant[]
|
|
31
|
+
scenarios: Scenario[]
|
|
32
|
+
iterations: number
|
|
33
|
+
// The thing that turns a Variant into a scoreable artifact.
|
|
34
|
+
// For agent-builder this is `runForgeBuilderSim`; for tax-agent it's
|
|
35
|
+
// their domain runner; for the multi-shot prompt evolution case it's
|
|
36
|
+
// already wired inside `runPromptEvolution`.
|
|
37
|
+
candidateRunner: CandidateRunner<Variant>
|
|
38
|
+
// The thing that proposes the next variants given the analysis output.
|
|
39
|
+
// For prompt-only optimization, this is `reflective-mutation` against
|
|
40
|
+
// the top/bottom trials. For code+prompt, this is `createCompositeMutator`.
|
|
41
|
+
// For agent-builder, this can be a hand-rolled "edit the system prompt"
|
|
42
|
+
// function — the example shows one.
|
|
43
|
+
mutator: (champion: Variant, analysis: AnalysisReport) => Promise<Variant[]>
|
|
44
|
+
// Optional: outcome store for predictive validity. When present, the
|
|
45
|
+
// loop learns which scoring rubrics actually predict deployment outcomes
|
|
46
|
+
// and reweights the composite score accordingly.
|
|
47
|
+
outcomes?: { store: OutcomeStore; metrics: string[] }
|
|
48
|
+
}): Promise<IterationReport[]> {
|
|
49
|
+
const reports: IterationReport[] = []
|
|
50
|
+
let variants = opts.initialVariants
|
|
51
|
+
|
|
52
|
+
// (Optional) standing researcher that drives rubric reweighting.
|
|
53
|
+
const researcher = opts.outcomes
|
|
54
|
+
? new PredictiveValidityResearcher({
|
|
55
|
+
outcomes: opts.outcomes.store,
|
|
56
|
+
outcomeMetrics: opts.outcomes.metrics,
|
|
57
|
+
})
|
|
58
|
+
: null
|
|
59
|
+
|
|
60
|
+
for (let iter = 0; iter < opts.iterations; iter++) {
|
|
61
|
+
// 1. Capture-integrity-by-construction matrix run.
|
|
62
|
+
const campaign = await runEvalCampaign({
|
|
63
|
+
campaignId: `auto-research-iter-${iter}`,
|
|
64
|
+
commitSha: opts.task,
|
|
65
|
+
variants: variants.map((v) => ({ id: v.id, payload: v })),
|
|
66
|
+
scenarios: opts.scenarios,
|
|
67
|
+
seeds: [0, 1, 2],
|
|
68
|
+
llmOpts: { ... },
|
|
69
|
+
storeFactory: () => new InMemoryTraceStore(),
|
|
70
|
+
rawSinkFactory: () => new InMemoryRawProviderSink(),
|
|
71
|
+
runner: makeCampaignRunner(opts.candidateRunner),
|
|
72
|
+
onRunComplete: opts.outcomes
|
|
73
|
+
? [traceAnalystOnRunComplete({ analyze: ..., save: ... })]
|
|
74
|
+
: [],
|
|
75
|
+
report: { comparator: variants[0]!.id },
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
// 2. RL-bridge analysis: preferences, verifiable rewards, sequential
|
|
79
|
+
// interim verdict, reward-hacking diagnosis.
|
|
80
|
+
const analysis = await analyzeOptimizationResult({
|
|
81
|
+
result: pretendItsAPromptEvolution(campaign),
|
|
82
|
+
ctx: { experimentId: 'task', model: '...', commitSha: '...', promptHash: '...', configHash: '...' },
|
|
83
|
+
comparator: variants[0]!.id,
|
|
84
|
+
outcomes: opts.outcomes,
|
|
85
|
+
})
|
|
86
|
+
|
|
87
|
+
// 3. Periodic rubric recalibration via predictive validity.
|
|
88
|
+
if (researcher && iter > 0 && iter % 5 === 0) {
|
|
89
|
+
await researcher.runValidityCheck(campaign.runs)
|
|
90
|
+
// The researcher's `proposeChange` output can be folded into the
|
|
91
|
+
// mutator as a steering signal in the next iteration.
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// 4. Pick champion + record this iteration.
|
|
95
|
+
const champion = pickChampion(campaign.runs)
|
|
96
|
+
reports.push({ iter, champion, score: champion.score, analysis })
|
|
97
|
+
|
|
98
|
+
// 5. Sequential stop: the anytime-valid e-value can decisively call
|
|
99
|
+
// 'promote_now' or 'reject_now' before the iterations are exhausted (this sketch stops early only on 'promote_now').
|
|
100
|
+
if (analysis.interimConfidence?.recommendation.decision === 'promote_now') {
|
|
101
|
+
break
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// 6. Propose next variants via the mutator.
|
|
105
|
+
if (iter < opts.iterations - 1) {
|
|
106
|
+
variants = await opts.mutator(champion.variant, analysis)
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return reports
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Invariants every iteration must preserve
|
|
115
|
+
|
|
116
|
+
1. **The campaign produces RunRecord[] with `scenarioId` populated.** Every
|
|
117
|
+
downstream primitive (preferences, sequential, predictive validity,
|
|
118
|
+
tournament) keys on this. `runEvalCampaign` populates it canonically;
|
|
119
|
+
if you adapt from `runPromptEvolution` use `trialsToRunRecords`.
|
|
120
|
+
|
|
121
|
+
2. **Capture is wired by construction.** Don't pass `NoopRawProviderSink`
|
|
122
|
+
to `rawSinkFactory` unless the iteration is exploratory. Every
|
|
123
|
+
captured run is replayable, and every replayable run is free judge-iteration
|
|
124
|
+
data for the next loop.
|
|
125
|
+
|
|
126
|
+
3. **`commitSha` is real.** It's how downstream tooling (predictive
|
|
127
|
+
validity, contamination probe, tournament) ties iterations together.
|
|
128
|
+
|
|
129
|
+
4. **The comparator is stable across iterations.** Either the original
|
|
130
|
+
`baseline` or whichever champion you froze. Shifting the comparator
|
|
131
|
+
between iterations corrupts the paired-delta semantics.
|
|
132
|
+
|
|
133
|
+
5. **The mutator is deterministic given the analysis output.** Otherwise
|
|
134
|
+
the iteration isn't reproducible and the auto-research artifacts
|
|
135
|
+
become unfalsifiable. If you need stochastic mutation, seed the
|
|
136
|
+
mutator and emit the seed onto the run record.
|
|
137
|
+
|
|
138
|
+
## When to run each primitive
|
|
139
|
+
|
|
140
|
+
| Frequency | Primitive | Why |
|
|
141
|
+
|---|---|---|
|
|
142
|
+
| Every iteration | `runEvalCampaign` | core measurement |
|
|
143
|
+
| Every iteration | `analyzeOptimizationResult` | preferences + verifiable rewards + reward-hacking |
|
|
144
|
+
| Every iteration | `evaluateInterimReleaseConfidence` (via `analyzeOptimizationResult`) | anytime-valid stop signal |
|
|
145
|
+
| Every 5–10 iterations | `rubricPredictiveValidity` | rubric weights drift; recalibrate |
|
|
146
|
+
| Every release | `runContaminationProbe` | scenario set freshness |
|
|
147
|
+
| Once per task | `runComputeCurve` | cost-quality frontier |
|
|
148
|
+
| As needed | `adversarialScenarioSearch` | discover failure modes the curated set missed |
|
|
149
|
+
|
|
150
|
+
## When to drop into the smaller primitives
|
|
151
|
+
|
|
152
|
+
Two cases:
|
|
153
|
+
|
|
154
|
+
1. **Trajectory-shaped optimization with steering.** Use
|
|
155
|
+
`runMultiShotOptimization` directly — it already runs the inner
|
|
156
|
+
search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
|
|
157
|
+
for the RL bridge.
|
|
158
|
+
|
|
159
|
+
2. **Prompt + code evolution with sandboxed code mutation.** Use
|
|
160
|
+
`runPromptEvolution` + `createCompositeMutator` directly. Same wrap
|
|
161
|
+
pattern.
|
|
162
|
+
|
|
163
|
+
The auto-research loop above wraps these primitives in a higher-level
|
|
164
|
+
loop that runs them across multiple campaigns. They're each one tick of
|
|
165
|
+
the bigger loop.
|
|
166
|
+
|
|
167
|
+
## What this does NOT do
|
|
168
|
+
|
|
169
|
+
- It doesn't fine-tune model weights. That's the
|
|
170
|
+
[`fine-tune-with-prime-rl`](../examples/fine-tune-with-prime-rl/) example
|
|
171
|
+
— separate concern, separate trainer.
|
|
172
|
+
- It doesn't drive a production deployment decision on its own. The
|
|
173
|
+
artifacts feed a launch-review process (humans, the `researchReport`
|
|
174
|
+
output, the `assertReleaseConfidence` gate). Loop ≠ promotion gate.
|
|
175
|
+
- It doesn't substitute for a real preregistration trail. The
|
|
176
|
+
`preregistrationHash` field on the report exists so iterations can be
|
|
177
|
+
audited, but the auto-research loop *is* iterative and post-hoc by
|
|
178
|
+
definition. Use the standing `assertReleaseConfidence` gate at the
|
|
179
|
+
release boundary; use the auto-research loop everywhere upstream of it.
|
|
180
|
+
|
|
181
|
+
## Reading order for the example
|
|
182
|
+
|
|
183
|
+
1. [`examples/auto-research-with-agent-builder/README.md`](../examples/auto-research-with-agent-builder/README.md) — architectural picture.
|
|
184
|
+
2. [`examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`](../examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts) — runnable demo.
|
|
185
|
+
3. Run it: `npx tsx examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`.
|
|
186
|
+
It prints the iteration progression and the score climbing.
|