darwin-agents 0.6.0-alpha.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +97 -0
- package/README.md +43 -1
- package/dist/src/cli/run.js +1 -1
- package/dist/src/cli/run.js.map +1 -1
- package/dist/src/evolution/alignment.d.ts +44 -0
- package/dist/src/evolution/alignment.d.ts.map +1 -1
- package/dist/src/evolution/alignment.js +124 -0
- package/dist/src/evolution/alignment.js.map +1 -1
- package/dist/src/evolution/loop.d.ts +56 -0
- package/dist/src/evolution/loop.d.ts.map +1 -1
- package/dist/src/evolution/loop.js +168 -16
- package/dist/src/evolution/loop.js.map +1 -1
- package/dist/src/evolution/multi-critic.d.ts +30 -1
- package/dist/src/evolution/multi-critic.d.ts.map +1 -1
- package/dist/src/evolution/multi-critic.js +57 -2
- package/dist/src/evolution/multi-critic.js.map +1 -1
- package/dist/src/evolution/optimizer-gepa.d.ts +47 -4
- package/dist/src/evolution/optimizer-gepa.d.ts.map +1 -1
- package/dist/src/evolution/optimizer-gepa.js +44 -5
- package/dist/src/evolution/optimizer-gepa.js.map +1 -1
- package/dist/src/evolution/pareto.d.ts +98 -3
- package/dist/src/evolution/pareto.d.ts.map +1 -1
- package/dist/src/evolution/pareto.js +193 -30
- package/dist/src/evolution/pareto.js.map +1 -1
- package/dist/src/evolution/safety.d.ts +35 -1
- package/dist/src/evolution/safety.d.ts.map +1 -1
- package/dist/src/evolution/safety.js +56 -2
- package/dist/src/evolution/safety.js.map +1 -1
- package/dist/src/evolution/sequential.d.ts +149 -0
- package/dist/src/evolution/sequential.d.ts.map +1 -0
- package/dist/src/evolution/sequential.js +239 -0
- package/dist/src/evolution/sequential.js.map +1 -0
- package/dist/src/evolution/tracker.d.ts +12 -0
- package/dist/src/evolution/tracker.d.ts.map +1 -1
- package/dist/src/evolution/tracker.js +24 -0
- package/dist/src/evolution/tracker.js.map +1 -1
- package/dist/src/index.d.ts +5 -3
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +10 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/types.d.ts +84 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
* rollback triggers, and A/B test evaluation rules.
|
|
7
7
|
*/
|
|
8
8
|
import { DEFAULT_SAFETY } from '../types.js';
|
|
9
|
+
import { msprtTwoSample, hoeffdingTwoSample } from './sequential.js';
|
|
9
10
|
/** Default minRuns range for dynamic sizing */
|
|
10
11
|
const DYNAMIC_MIN_RUNS_FLOOR = 10;
|
|
11
12
|
const DYNAMIC_MIN_RUNS_CEIL = 30;
|
|
@@ -21,6 +22,17 @@ export class SafetyGate {
|
|
|
21
22
|
canEvolve(_agentName, stats) {
|
|
22
23
|
return stats.totalRuns >= this.thresholds.minDataPoints;
|
|
23
24
|
}
|
|
25
|
+
/**
|
|
26
|
+
* v0.7.0 — True iff the peeking guard is configured to use a sequential
|
|
27
|
+
* method (mSPRT / Hoeffding), which needs the per-arm composite samples.
|
|
28
|
+
* The loop calls this to decide whether to load that (slightly more
|
|
29
|
+
* expensive) per-sample data before calling {@link evaluateABTest}.
|
|
30
|
+
*/
|
|
31
|
+
usesSequentialConfidence() {
|
|
32
|
+
return (this.thresholds.requireConfidence === true &&
|
|
33
|
+
(this.thresholds.confidenceMethod === 'msprt' ||
|
|
34
|
+
this.thresholds.confidenceMethod === 'hoeffding'));
|
|
35
|
+
}
|
|
24
36
|
/**
|
|
25
37
|
* Check whether score B is NOT a regression beyond the allowed threshold.
|
|
26
38
|
*
|
|
@@ -58,7 +70,7 @@ export class SafetyGate {
|
|
|
58
70
|
* @param overrideMinRuns — Per-test minimum runs (from ABTest.minRuns).
|
|
59
71
|
* Falls back to SafetyThresholds.minDataPoints if not provided.
|
|
60
72
|
*/
|
|
61
|
-
evaluateABTest(compositeA, compositeB, runsA, runsB, failsA = 0, failsB = 0, overrideMinRuns) {
|
|
73
|
+
evaluateABTest(compositeA, compositeB, runsA, runsB, failsA = 0, failsB = 0, overrideMinRuns, samples) {
|
|
62
74
|
const minRuns = overrideMinRuns ?? this.thresholds.minDataPoints;
|
|
63
75
|
const totalA = runsA + failsA;
|
|
64
76
|
const totalB = runsB + failsB;
|
|
@@ -113,7 +125,7 @@ export class SafetyGate {
|
|
|
113
125
|
// tie-break below so the test still terminates (an early 'continue' here
|
|
114
126
|
// would loop forever on a persistent small-margin challenger).
|
|
115
127
|
if (!this.thresholds.requireConfidence ||
|
|
116
|
-
this.
|
|
128
|
+
this.isConfident(adjustedA, adjustedB, runsA, runsB, minRuns, marginOutcome, samples)) {
|
|
117
129
|
return marginOutcome;
|
|
118
130
|
}
|
|
119
131
|
}
|
|
@@ -165,6 +177,48 @@ export class SafetyGate {
|
|
|
165
177
|
const effectSize = Math.abs(scoreA - scoreB) / pooled;
|
|
166
178
|
return effectSize >= 0.2 && runsA + runsB >= minRuns * 2;
|
|
167
179
|
}
|
|
180
|
+
/**
|
|
181
|
+
* v0.7.0 — Dispatch the peeking-resistant confidence gate to the
|
|
182
|
+
* configured {@link SafetyThresholds.confidenceMethod}.
|
|
183
|
+
*
|
|
184
|
+
* - `'effect-size'` (default): the v0.6.0 heuristic ({@link meetsConfidence}).
|
|
185
|
+
* Byte-for-byte unchanged when no method is set.
|
|
186
|
+
* - `'msprt'` / `'hoeffding'`: an always-valid sequential test over the
|
|
187
|
+
* RAW per-arm composite samples (reliability is already handled by the
|
|
188
|
+
* auto-loss rule upstream, so the statistical test uses the unadjusted
|
|
189
|
+
* scores). The verdict must be `decisive` AND point in the SAME
|
|
190
|
+
* direction as the score margin — a sequential test that fires for the
|
|
191
|
+
* opposite arm does not confirm this margin.
|
|
192
|
+
*
|
|
193
|
+
* Falls back to the effect-size heuristic when a sequential method is set
|
|
194
|
+
* but no per-sample data was supplied (graceful — never throws).
|
|
195
|
+
*/
|
|
196
|
+
isConfident(adjustedA, adjustedB, runsA, runsB, minRuns, marginOutcome, samples) {
|
|
197
|
+
const method = this.thresholds.confidenceMethod ?? 'effect-size';
|
|
198
|
+
if (method === 'effect-size' || !samples) {
|
|
199
|
+
return this.meetsConfidence(adjustedA, adjustedB, runsA, runsB, minRuns);
|
|
200
|
+
}
|
|
201
|
+
const opts = {
|
|
202
|
+
alpha: this.thresholds.confidenceAlpha,
|
|
203
|
+
minSamplesPerArm: this.thresholds.confidenceMinSamples,
|
|
204
|
+
};
|
|
205
|
+
const verdict = method === 'hoeffding'
|
|
206
|
+
? hoeffdingTwoSample(samples.a, samples.b, {
|
|
207
|
+
...opts,
|
|
208
|
+
lo: this.thresholds.confidenceScoreRange?.[0],
|
|
209
|
+
hi: this.thresholds.confidenceScoreRange?.[1],
|
|
210
|
+
})
|
|
211
|
+
: msprtTwoSample(samples.a, samples.b, {
|
|
212
|
+
...opts,
|
|
213
|
+
tau: this.thresholds.confidenceTau,
|
|
214
|
+
});
|
|
215
|
+
if (!verdict.decisive)
|
|
216
|
+
return false;
|
|
217
|
+
// The sequential test must confirm the SAME winner as the score margin.
|
|
218
|
+
// direction +1 = B>A (b_wins), −1 = A>B (a_wins).
|
|
219
|
+
const expected = marginOutcome === 'b_wins' ? 1 : -1;
|
|
220
|
+
return verdict.direction === expected;
|
|
221
|
+
}
|
|
168
222
|
/**
|
|
169
223
|
* Compute dynamic minRuns based on observed quality score variance.
|
|
170
224
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"safety.js","sourceRoot":"","sources":["../../../src/evolution/safety.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"safety.js","sourceRoot":"","sources":["../../../src/evolution/safety.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAC7C,OAAO,EAAE,cAAc,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAsBrE,+CAA+C;AAC/C,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAClC,MAAM,qBAAqB,GAAG,EAAE,CAAC;AAEjC,MAAM,OAAO,UAAU;IACb,UAAU,CAAmB;IAErC,YAAY,aAA+B,cAAc;QACvD,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED;;;OAGG;IACH,SAAS,CAAC,UAAkB,EAAE,KAAyB;QACrD,OAAO,KAAK,CAAC,SAAS,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;IAC1D,CAAC;IAED;;;;;OAKG;IACH,wBAAwB;QACtB,OAAO,CACL,IAAI,CAAC,UAAU,CAAC,iBAAiB,KAAK,IAAI;YAC1C,CAAC,IAAI,CAAC,UAAU,CAAC,gBAAgB,KAAK,OAAO;gBAC3C,IAAI,CAAC,UAAU,CAAC,gBAAgB,KAAK,WAAW,CAAC,CACpD,CAAC;IACJ,CAAC;IAED;;;;;;;;;OASG;IACH,eAAe,CAAC,MAAc,EAAE,MAAc;QAC5C,8DAA8D;QAC9D,IAAI,MAAM,IAAI,CAAC,EAAE,CAAC;YAChB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,MAAM,CAAC;QACxC,OAAO,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;IAC/C,CAAC;IAED;;;OAGG;IACH,cAAc,CAAC,mBAA2B;QACxC,OAAO,mBAAmB,IAAI,IAAI,CAAC,UAAU,CAAC,wBAAwB,CAAC;IACzE,CAAC;IAED;;;;;;;;;;;OAWG;IACH,cAAc,CACZ,UAAkB,EAClB,UAAkB,EAClB,KAAa,EACb,KAAa,EACb,SAAiB,CAAC,EAClB,SAAiB,CAAC,EAClB,eAAwB,EACxB,OAAuB;QAEvB,MAAM,OAAO,GAAG,eAAe,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;QACjE,MAAM,MAAM,GAAG,KAAK,GAAG,MAAM,CAAC;QAC9B,MAAM,MAAM,GAAG,KAAK,GAAG,MAAM,CAAC;QAE9B,mFAAmF;QACnF,MAAM,yBAAyB,GAAG,CAAC,CAAC;QACpC,IAAI,MAAM,IAAI,yBAAyB,IAAI,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE,CAAC;YACjE,OAAO,QAAQ,CAAC,CAAC,kBAAkB;QACrC,CAAC;QACD,IAAI,MAAM,IAAI,yBAAyB,IAAI,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE,CAAC;YACjE,OAAO,QAAQ,CAAC,CAAC,kBAAkB;QACrC,CAAC;QAED,2DAA2D;QAC3D,IAAI,KAAK,GAAG,OAAO,IAAI,KAAK,GAAG,OAAO,EAAE,CAAC;YACvC,OAAO,UAAU,CAAC;QACpB,CAAC;QAED,MAAM,oBAAoB,GAAG,IAAI,CAAC,CAAC,iCAAiC;QAEpE,qEAAqE;QACrE,MAAM,YAAY,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QACrD,MAAM,YAAY,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,UAAU,GAAG,YAAY,CAAC;
QAC5C,MAAM,SAAS,GAAG,UAAU,GAAG,YAAY,CAAC;QAE5C,yBAAyB;QACzB,IAAI,SAAS,KAAK,CAAC,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;YACvC,OAAO,UAAU,CAAC;QACpB,CAAC;QAED,yEAAyE;QACzE,IAAI,aAAa,GAAyB,IAAI,CAAC;QAC/C,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;YAClB,MAAM,MAAM,GAAG,CAAC,SAAS,GAAG,SAAS,CAAC,GAAG,SAAS,CAAC;YACnD,IAAI,MAAM,GAAG,oBAAoB,EAAE,CAAC;gBAClC,aAAa,GAAG,QAAQ,CAAC;YAC3B,CAAC;QACH,CAAC;aAAM,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;YACzB,aAAa,GAAG,QAAQ,CAAC;QAC3B,CAAC;QACD,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;YAC3B,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;gBAClB,MAAM,MAAM,GAAG,CAAC,SAAS,GAAG,SAAS,CAAC,GAAG,SAAS,CAAC;gBACnD,IAAI,MAAM,GAAG,oBAAoB,EAAE,CAAC;oBAClC,aAAa,GAAG,QAAQ,CAAC;gBAC3B,CAAC;YACH,CAAC;iBAAM,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;gBACzB,aAAa,GAAG,QAAQ,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;YAC3B,uEAAuE;YACvE,wEAAwE;YACxE,uEAAuE;YACvE,yEAAyE;YACzE,+DAA+D;YAC/D,IACE,CAAC,IAAI,CAAC,UAAU,CAAC,iBAAiB;gBAClC,IAAI,CAAC,WAAW,CAAC,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO,EAAE,aAAa,EAAE,OAAO,CAAC,EACrF,CAAC;gBACD,OAAO,aAAa,CAAC;YACvB,CAAC;QACH,CAAC;QAED,wDAAwD;QACxD,6EAA6E;QAC7E,4EAA4E;QAC5E,6DAA6D;QAC7D,MAAM,cAAc,GAAG,OAAO,GAAG,CAAC,CAAC;QACnC,IAAI,KAAK,IAAI,cAAc,IAAI,KAAK,IAAI,cAAc,EAAE,CAAC;YACvD,OAAO,QAAQ,CAAC,CAAC,qEAAqE;QACxF,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAED;;;;OAIG;IACH,mBAAmB,CACjB,UAAkB,EAClB,UAAkB,EAClB,KAAa,EACb,KAAa;QAEb,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;QAE9C,IAAI,KAAK,GAAG,OAAO,IAAI,KAAK,GAAG,OAAO,EAAE,CAAC;YACvC,OAAO,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;QAC7C,CAAC;QAED,qDAAqD;QACrD,MAAM,MAAM,GAAG,CAAC,UAAU,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;QAC7C,IAAI,MAAM,KAAK,CAAC,EAAE,CAAC;YACjB,OAAO,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;QAC7C,CAAC;QAED,6DAA6D;QAC7D,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,GAAG,UAAU,CAAC,GAAG,MAAM,CAAC;QAE9D,gEAAgE;QAChE,MAAM,YAAY,GAAG,KAAK,GAAG,KAAK,CAAC;QACnC,MAAM,SAAS,GAAG,UAAU,IAAI,GAAG,IAAI,YAAY,IAAI,OAAO,GAAG,CAAC,CAAC;QAEnE,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC;IACnC,CAAC;IAED;;;;;;;OAOG;IACK,eAAe,CACrB,MAAc,EACd,MA
Ac,EACd,KAAa,EACb,KAAa,EACb,OAAe;QAEf,MAAM,MAAM,GAAG,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC;QACrC,IAAI,MAAM,KAAK,CAAC,EAAE,CAAC;YACjB,OAAO,KAAK,CAAC;QACf,CAAC;QACD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,MAAM,CAAC;QACtD,OAAO,UAAU,IAAI,GAAG,IAAI,KAAK,GAAG,KAAK,IAAI,OAAO,GAAG,CAAC,CAAC;IAC3D,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACK,WAAW,CACjB,SAAiB,EACjB,SAAiB,EACjB,KAAa,EACb,KAAa,EACb,OAAe,EACf,aAA4B,EAC5B,OAAuB;QAEvB,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,gBAAgB,IAAI,aAAa,CAAC;QAEjE,IAAI,MAAM,KAAK,aAAa,IAAI,CAAC,OAAO,EAAE,CAAC;YACzC,OAAO,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;QAC3E,CAAC;QAED,MAAM,IAAI,GAAG;YACX,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,eAAe;YACtC,gBAAgB,EAAE,IAAI,CAAC,UAAU,CAAC,oBAAoB;SACvD,CAAC;QAEF,MAAM,OAAO,GACX,MAAM,KAAK,WAAW;YACpB,CAAC,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,EAAE;gBACvC,GAAG,IAAI;gBACP,EAAE,EAAE,IAAI,CAAC,UAAU,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC;gBAC7C,EAAE,EAAE,IAAI,CAAC,UAAU,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC;aAC9C,CAAC;YACJ,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,EAAE;gBACnC,GAAG,IAAI;gBACP,GAAG,EAAE,IAAI,CAAC,UAAU,CAAC,aAAa;aACnC,CAAC,CAAC;QAET,IAAI,CAAC,OAAO,CAAC,QAAQ;YAAE,OAAO,KAAK,CAAC;QACpC,wEAAwE;QACxE,kDAAkD;QAClD,MAAM,QAAQ,GAAG,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACrD,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC;IACxC,CAAC;IAED;;;;;;;;;;;;;;OAcG;IACH,qBAAqB,CACnB,WAA+B,EAC/B,aAAsB;QAEtB,MAAM,KAAK,GAAG,aAAa,IAAI,sBAAsB,CAAC;QACtD,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,qBAAqB,CAAC,CAAC;QAEpD,sDAAsD;QACtD,MAAM,aAAa,GAAG,WAAW;aAC9B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC;aAClC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;QAE1C,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7B,OAAO,KAAK,CAAC;QACf,CAAC;QAED,MAAM,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC;QAC7E,kFAAkF;QAClF,oFAAoF;QACpF,MAAM,QAAQ,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,GAAG,EAA
E,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzG,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEhC,mEAAmE;QACnE,IAAI,GAAG,IAAI,GAAG,EAAE,CAAC;YACf,OAAO,KAAK,CAAC;QACf,CAAC;QAED,qDAAqD;QACrD,IAAI,GAAG,GAAG,GAAG,EAAE,CAAC;YACd,OAAO,IAAI,CAAC;QACd,CAAC;QAED,yDAAyD;QACzD,kCAAkC;QAClC,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,6BAA6B;QAC1D,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Darwin — Always-Valid Sequential Testing (v0.7.0)
|
|
3
|
+
*
|
|
4
|
+
* Pure statistical primitives for peeking-resistant A/B decisions during
|
|
5
|
+
* prompt evolution. This module exists because Darwin's safety gate calls
|
|
6
|
+
* `evaluateABTest` after EVERY run — continuous monitoring with a fixed
|
|
7
|
+
* relative-improvement threshold inflates the false-positive rate (the
|
|
8
|
+
* classic "peeking problem"). v0.6.0 shipped a first-step effect-size
|
|
9
|
+
* heuristic (`SafetyGate.calculateConfidence`, |Δ| / pooled-mean ≥ 0.2);
|
|
10
|
+
* this module is the rigorous upgrade promised in the v0.6 roadmap notes.
|
|
11
|
+
*
|
|
12
|
+
* Two methods, both **always-valid** (the decision stays statistically
|
|
13
|
+
* sound no matter how many times you peek):
|
|
14
|
+
*
|
|
15
|
+
* 1. {@link msprtTwoSample} — Mixture Sequential Probability Ratio Test
|
|
16
|
+
* (Johari, Pekelis & Walsh 2017, arXiv:1512.04922; the engine behind
|
|
17
|
+
* Optimizely/Statsig's "stats engine"). Gaussian mixture prior over
|
|
18
|
+
the effect size; uses the observed Welch (unpooled) variance. Most powerful
|
|
19
|
+
* when the per-arm sample variance is meaningful — i.e. once each arm
|
|
20
|
+
* has accumulated a handful of runs (see {@link MsprtOptions.minSamplesPerArm}).
|
|
21
|
+
*
|
|
22
|
+
* 2. {@link hoeffdingTwoSample} — a σ-free time-uniform confidence
|
|
23
|
+
* sequence for variables bounded to a known range (Darwin composite
|
|
24
|
+
* scores live in [0, 1]). Valid at ANY sample size with no variance
|
|
25
|
+
* estimate, so it is the honest choice when only a few runs exist.
|
|
26
|
+
* More conservative than mSPRT (wider intervals) by design.
|
|
27
|
+
*
|
|
28
|
+
* **Pure** — no LLM calls, no I/O, no `Date.now()`, no `Math.random()`.
|
|
29
|
+
* Fully deterministic, so tests pin exact statistic values.
|
|
30
|
+
*
|
|
31
|
+
* Caveat on warmup (documented, not hidden): mSPRT with an *estimated*
|
|
32
|
+
* variance is only asymptotically always-valid; with very few samples the
|
|
33
|
+
* variance estimate is noisy. Darwin's A/B sample sizes (minRuns 10–30) sit
|
|
34
|
+
* below the ~100-sample comfort zone for tight σ-estimation, so we expose
|
|
35
|
+
* `minSamplesPerArm` (default 5) below which mSPRT abstains (`decisive:false`)
|
|
36
|
+
* rather than fire on noise, and we offer Hoeffding as the σ-free fallback.
|
|
37
|
+
*/
|
|
38
|
+
/** Which confidence method the safety gate uses for the peeking guard. */
|
|
39
|
+
export type ConfidenceMethod = "effect-size" | "msprt" | "hoeffding";
|
|
40
|
+
/** Verdict from a sequential test. `decisive` answers "is the gap real?". */
|
|
41
|
+
export interface SequentialVerdict {
|
|
42
|
+
/** True iff the test crossed its always-valid threshold (reject H0: equal means). */
|
|
43
|
+
decisive: boolean;
|
|
44
|
+
/** Which method produced this verdict. */
|
|
45
|
+
method: ConfidenceMethod;
|
|
46
|
+
/** Sign of the effect (mean B − mean A): +1 if B>A, −1 if A>B, 0 if tie/undecided. */
|
|
47
|
+
direction: -1 | 0 | 1;
|
|
48
|
+
/**
|
|
49
|
+
* The test statistic: for mSPRT the mixture likelihood ratio Λ (compare to
|
|
50
|
+
* `threshold = 1/alpha`); for Hoeffding the absolute mean gap |Δ| (compare
|
|
51
|
+
* to `threshold` = summed CS half-widths). NaN-free.
|
|
52
|
+
*/
|
|
53
|
+
statistic: number;
|
|
54
|
+
/** The threshold `statistic` must exceed for `decisive` to be true. */
|
|
55
|
+
threshold: number;
|
|
56
|
+
/** Effective per-arm sample counts after NaN filtering. */
|
|
57
|
+
nA: number;
|
|
58
|
+
nB: number;
|
|
59
|
+
/** Human-readable reason, e.g. "warmup: 3<5 samples on arm A". */
|
|
60
|
+
reason: string;
|
|
61
|
+
}
|
|
62
|
+
export interface MeanVar {
|
|
63
|
+
mean: number;
|
|
64
|
+
/** Sample variance with Bessel's correction (n−1). 0 when n<2. */
|
|
65
|
+
variance: number;
|
|
66
|
+
n: number;
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Mean + Bessel-corrected sample variance over finite values. Non-finite
|
|
70
|
+
* entries (NaN/Infinity) are dropped — a single bad score never poisons the
|
|
71
|
+
* estimate. Returns `{mean:0, variance:0, n:0}` for an all-invalid/empty input.
|
|
72
|
+
*/
|
|
73
|
+
export declare function meanVar(samples: ReadonlyArray<number>): MeanVar;
|
|
74
|
+
export interface MsprtOptions {
|
|
75
|
+
/** Significance level. Reject H0 when Λ ≥ 1/alpha. Default 0.05. */
|
|
76
|
+
alpha?: number;
|
|
77
|
+
/**
|
|
78
|
+
* Mixing-prior standard deviation over the true mean DIFFERENCE δ (in raw
|
|
79
|
+
* score units, since the test runs in estimator coordinates). Larger τ ⇒
|
|
80
|
+
* optimised for bigger effects (fires faster on large gaps, slower on small
|
|
81
|
+
* ones). Default 0.1 — tuned for composite scores in [0,1] where a
|
|
82
|
+
* "meaningful" lift in the mean difference is on the order of ~0.1.
|
|
83
|
+
*/
|
|
84
|
+
tau?: number;
|
|
85
|
+
/**
|
|
86
|
+
* Per-arm warmup floor. Below this many valid samples on EITHER arm the
|
|
87
|
+
* test abstains (`decisive:false`) instead of firing on a noisy variance
|
|
88
|
+
* estimate. Default 5.
|
|
89
|
+
*/
|
|
90
|
+
minSamplesPerArm?: number;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Two-sample mixture SPRT for a difference in means with always-valid
|
|
94
|
+
* inference. Models H0: μ_A = μ_B against a Gaussian mixture alternative on
|
|
95
|
+
* the effect (prior δ ~ N(0, τ²) on the true mean difference). Returns
|
|
96
|
+
* `decisive:true` when the mixture likelihood ratio Λ crosses 1/alpha — a
|
|
97
|
+
* threshold valid at every n (no peeking penalty).
|
|
98
|
+
*
|
|
99
|
+
* Closed form in ESTIMATOR coordinates. Let δ̂ = x̄_B − x̄_A be the observed
|
|
100
|
+
* mean difference and v = Var(δ̂) its variance. Integrating the per-θ Gaussian
|
|
101
|
+
* likelihood ratio against the N(0, τ²) mixture prior (Johari, Pekelis &
|
|
102
|
+
* Walsh 2017) gives:
|
|
103
|
+
*
|
|
104
|
+
* Λ = sqrt( v / (v + τ²) ) · exp( τ²·δ̂² / (2·v·(v + τ²)) ), Λ ≥ 1/α ⇒ reject H0
|
|
105
|
+
*
|
|
106
|
+
* We estimate v with the WELCH variance of the difference of means,
|
|
107
|
+
* v = s²_A/n_A + s²_B/n_B (Bessel-corrected per-arm sample variances). Welch
|
|
108
|
+
* (rather than a pooled within-arm variance) keeps the form unambiguous and
|
|
109
|
+
* robust to unequal arm variances — it does not assume homoscedasticity. In
|
|
110
|
+
* estimator coordinates no `nEff` factor appears: the sample sizes enter only
|
|
111
|
+
* through v (a larger n shrinks v, which grows Λ), so the historical
|
|
112
|
+
* "n² vs n" ambiguity of the sample-mean form is avoided entirely.
|
|
113
|
+
*
|
|
114
|
+
* Defensive: empty/below-warmup arms ⇒ abstain; zero variance on BOTH arms
|
|
115
|
+
* with a non-zero gap AND ≥2 samples per arm ⇒ decisive (deterministic arms
|
|
116
|
+
* differ); <2 samples ⇒ abstain (cannot estimate variance); NaN-free.
|
|
117
|
+
*/
|
|
118
|
+
export declare function msprtTwoSample(samplesA: ReadonlyArray<number>, samplesB: ReadonlyArray<number>, opts?: MsprtOptions): SequentialVerdict;
|
|
119
|
+
export interface HoeffdingOptions {
|
|
120
|
+
/** Significance level for the confidence sequence. Default 0.05. */
|
|
121
|
+
alpha?: number;
|
|
122
|
+
/** Lower bound of the score range. Default 0 (Darwin composite scores). */
|
|
123
|
+
lo?: number;
|
|
124
|
+
/** Upper bound of the score range. Default 1 (Darwin composite scores). */
|
|
125
|
+
hi?: number;
|
|
126
|
+
/** Per-arm warmup floor (≥1). Default 2 — Hoeffding is valid at any n≥1
|
|
127
|
+
* but a 1-sample arm gives a useless [lo,hi]-wide interval. */
|
|
128
|
+
minSamplesPerArm?: number;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Two-sample, variance-free, always-valid decision via per-arm time-uniform
|
|
132
|
+
* Hoeffding confidence sequences for bounded variables.
|
|
133
|
+
*
|
|
134
|
+
* Each arm's mean is bracketed by a half-width that shrinks with n while
|
|
135
|
+
* staying valid under continuous monitoring:
|
|
136
|
+
*
|
|
137
|
+
* w(n) = (hi − lo) · sqrt( ln( (n+1)/alpha ) / (2n) )
|
|
138
|
+
*
|
|
139
|
+
* (a standard union-bound / Cramér–Chernoff time-uniform Hoeffding bound).
|
|
140
|
+
* The arms are declared decisively different when their mean gap exceeds the
|
|
141
|
+
* sum of the two half-widths — i.e. the confidence intervals no longer
|
|
142
|
+
* overlap. No variance estimate needed, so this is the honest method when
|
|
143
|
+
* only a handful of runs exist or the score distribution is skewed/bounded.
|
|
144
|
+
*
|
|
145
|
+
* Conservative by construction (wider than mSPRT) — prefer mSPRT once both
|
|
146
|
+
* arms have enough runs for a stable variance estimate.
|
|
147
|
+
*/
|
|
148
|
+
export declare function hoeffdingTwoSample(samplesA: ReadonlyArray<number>, samplesB: ReadonlyArray<number>, opts?: HoeffdingOptions): SequentialVerdict;
|
|
149
|
+
//# sourceMappingURL=sequential.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sequential.d.ts","sourceRoot":"","sources":["../../../src/evolution/sequential.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AAEH,0EAA0E;AAC1E,MAAM,MAAM,gBAAgB,GAAG,aAAa,GAAG,OAAO,GAAG,WAAW,CAAC;AAErE,6EAA6E;AAC7E,MAAM,WAAW,iBAAiB;IAChC,qFAAqF;IACrF,QAAQ,EAAE,OAAO,CAAC;IAClB,0CAA0C;IAC1C,MAAM,EAAE,gBAAgB,CAAC;IACzB,sFAAsF;IACtF,SAAS,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IACtB;;;;OAIG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,uEAAuE;IACvE,SAAS,EAAE,MAAM,CAAC;IAClB,2DAA2D;IAC3D,EAAE,EAAE,MAAM,CAAC;IACX,EAAE,EAAE,MAAM,CAAC;IACX,kEAAkE;IAClE,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,kEAAkE;IAClE,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,EAAE,MAAM,CAAC;CACX;AAED;;;;GAIG;AACH,wBAAgB,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,MAAM,CAAC,GAAG,OAAO,CAmB/D;AAED,MAAM,WAAW,YAAY;IAC3B,oEAAoE;IACpE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;;;;OAMG;IACH,GAAG,CAAC,EAAE,MAAM,CAAC;IACb;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAMD;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAgB,cAAc,CAC5B,QAAQ,EAAE,aAAa,CAAC,MAAM,CAAC,EAC/B,QAAQ,EAAE,aAAa,CAAC,MAAM,CAAC,EAC/B,IAAI,GAAE,YAAiB,GACtB,iBAAiB,CAmFnB;AAED,MAAM,WAAW,gBAAgB;IAC/B,oEAAoE;IACpE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,2EAA2E;IAC3E,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,2EAA2E;IAC3E,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ;oEACgE;IAChE,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,kBAAkB,CAChC,QAAQ,EAAE,aAAa,CAAC,MAAM,CAAC,EAC/B,QAAQ,EAAE,aAAa,CAAC,MAAM,CAAC,EAC/B,IAAI,GAAE,gBAAqB,GAC1B,iBAAiB,CA8CnB"}
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Darwin — Always-Valid Sequential Testing (v0.7.0)
|
|
3
|
+
*
|
|
4
|
+
* Pure statistical primitives for peeking-resistant A/B decisions during
|
|
5
|
+
* prompt evolution. This module exists because Darwin's safety gate calls
|
|
6
|
+
* `evaluateABTest` after EVERY run — continuous monitoring with a fixed
|
|
7
|
+
* relative-improvement threshold inflates the false-positive rate (the
|
|
8
|
+
* classic "peeking problem"). v0.6.0 shipped a first-step effect-size
|
|
9
|
+
* heuristic (`SafetyGate.calculateConfidence`, |Δ| / pooled-mean ≥ 0.2);
|
|
10
|
+
* this module is the rigorous upgrade promised in the v0.6 roadmap notes.
|
|
11
|
+
*
|
|
12
|
+
* Two methods, both **always-valid** (the decision stays statistically
|
|
13
|
+
* sound no matter how many times you peek):
|
|
14
|
+
*
|
|
15
|
+
* 1. {@link msprtTwoSample} — Mixture Sequential Probability Ratio Test
|
|
16
|
+
* (Johari, Pekelis & Walsh 2017, arXiv:1512.04922; the engine behind
|
|
17
|
+
* Optimizely/Statsig's "stats engine"). Gaussian mixture prior over
|
|
18
|
+
the effect size; uses the observed Welch (unpooled) variance. Most powerful
|
|
19
|
+
* when the per-arm sample variance is meaningful — i.e. once each arm
|
|
20
|
+
* has accumulated a handful of runs (see {@link MsprtOptions.minSamplesPerArm}).
|
|
21
|
+
*
|
|
22
|
+
* 2. {@link hoeffdingTwoSample} — a σ-free time-uniform confidence
|
|
23
|
+
* sequence for variables bounded to a known range (Darwin composite
|
|
24
|
+
* scores live in [0, 1]). Valid at ANY sample size with no variance
|
|
25
|
+
* estimate, so it is the honest choice when only a few runs exist.
|
|
26
|
+
* More conservative than mSPRT (wider intervals) by design.
|
|
27
|
+
*
|
|
28
|
+
* **Pure** — no LLM calls, no I/O, no `Date.now()`, no `Math.random()`.
|
|
29
|
+
* Fully deterministic, so tests pin exact statistic values.
|
|
30
|
+
*
|
|
31
|
+
* Caveat on warmup (documented, not hidden): mSPRT with an *estimated*
|
|
32
|
+
* variance is only asymptotically always-valid; with very few samples the
|
|
33
|
+
* variance estimate is noisy. Darwin's A/B sample sizes (minRuns 10–30) sit
|
|
34
|
+
* below the ~100-sample comfort zone for tight σ-estimation, so we expose
|
|
35
|
+
* `minSamplesPerArm` (default 5) below which mSPRT abstains (`decisive:false`)
|
|
36
|
+
* rather than fire on noise, and we offer Hoeffding as the σ-free fallback.
|
|
37
|
+
*/
|
|
38
|
+
/**
 * Mean and Bessel-corrected sample variance over the finite numeric entries
 * of `samples`.
 *
 * Entries that are not of type `number`, or are NaN / ±Infinity, are dropped,
 * so a single bad score never poisons the estimate. A `null` / `undefined`
 * input is treated like an empty one.
 *
 * @param samples - Array (or any iterable) of scores; may be null/undefined.
 * @returns `{ mean, variance, n }` where `n` counts the retained entries.
 *   `{ mean: 0, variance: 0, n: 0 }` when nothing valid remains; `variance`
 *   is 0 when `n < 2` (a single point has no spread to estimate).
 */
export function meanVar(samples) {
    // Collect the valid entries once. This keeps the two passes below in sync
    // even for one-shot iterables, and lets a nullish input fall through to
    // the documented empty result instead of throwing.
    const finite = [];
    if (samples != null) {
        for (const s of samples) {
            if (typeof s === "number" && Number.isFinite(s)) {
                finite.push(s);
            }
        }
    }
    const n = finite.length;
    if (n === 0) {
        return { mean: 0, variance: 0, n: 0 };
    }
    // Same left-to-right accumulation order as a plain loop over the input,
    // so results are bit-for-bit stable for array inputs.
    let sum = 0;
    for (const s of finite) {
        sum += s;
    }
    const mean = sum / n;
    if (n < 2) {
        return { mean, variance: 0, n };
    }
    let sse = 0;
    for (const s of finite) {
        sse += (s - mean) ** 2;
    }
    // Bessel's correction: divide by n − 1, not n.
    return { mean, variance: sse / (n - 1), n };
}
|
|
65
|
+
const DEFAULT_ALPHA = 0.05;
|
|
66
|
+
const DEFAULT_TAU = 0.1;
|
|
67
|
+
const DEFAULT_MIN_SAMPLES = 5;
|
|
68
|
+
/**
 * Two-sample mixture SPRT (mSPRT) for a difference in means.
 *
 * Models H0: μ_A = μ_B against a Gaussian mixture alternative on the effect
 * (prior δ ~ N(0, τ²) on the true mean difference). Returns `decisive:true`
 * when the mixture likelihood ratio Λ reaches 1/alpha, a threshold that does
 * not need to be adjusted for repeated looks (Johari, Pekelis & Walsh 2017).
 *
 * Closed form in ESTIMATOR coordinates. With δ̂ = x̄_B − x̄_A the observed mean
 * difference and v = Var(δ̂) its variance:
 *
 *   Λ = sqrt( v / (v + τ²) ) · exp( τ²·δ̂² / (2·v·(v + τ²)) ),   Λ ≥ 1/α ⇒ reject H0
 *
 * v is estimated with the WELCH variance of the difference of means,
 * v = s²_A/n_A + s²_B/n_B (Bessel-corrected per-arm sample variances), which
 * does not assume equal arm variances. Sample sizes enter only through v.
 *
 * Abstains (`decisive:false`, `direction:0`, `statistic:0`) when:
 *  - either arm has fewer than `minSamplesPerArm` finite samples (warmup);
 *  - either arm has fewer than 2 finite samples — its variance cannot be
 *    estimated, and the 0 reported for a single point would understate v;
 *  - both arms have zero variance and identical means.
 * When BOTH arms have zero variance and the means differ (≥2 samples each)
 * the verdict is decisive with `statistic: +Infinity`.
 *
 * @param samplesA - Scores for arm A; non-finite entries are ignored.
 * @param samplesB - Scores for arm B; non-finite entries are ignored.
 * @param opts - Optional settings:
 *   `alpha` is normalised by `clampAlpha` (defined elsewhere in this module);
 *   `tau` must be finite and > 0, otherwise 0.1 is used;
 *   `minSamplesPerArm` must be finite and ≥ 1 (it is floored), otherwise 5.
 * @returns A verdict with `method: "msprt"`, `threshold = 1/alpha`, the
 *   per-arm valid sample counts, and a human-readable `reason`.
 */
export function msprtTwoSample(samplesA, samplesB, opts = {}) {
    const alpha = clampAlpha(opts.alpha);
    const tau = Number.isFinite(opts.tau) && opts.tau > 0 ? opts.tau : DEFAULT_TAU;
    const minSamples = Number.isFinite(opts.minSamplesPerArm) && opts.minSamplesPerArm >= 1
        ? Math.floor(opts.minSamplesPerArm)
        : DEFAULT_MIN_SAMPLES;
    const threshold = 1 / alpha;
    const a = meanVar(samplesA);
    const b = meanVar(samplesB);
    const base = {
        method: "msprt",
        threshold,
        nA: a.n,
        nB: b.n,
    };
    // Shared shape of every "keep testing" verdict.
    const abstain = (reason) => ({
        ...base,
        decisive: false,
        direction: 0,
        statistic: 0,
        reason,
    });
    if (a.n < minSamples || b.n < minSamples) {
        return abstain(`warmup: need ≥${minSamples} samples/arm, have A=${a.n} B=${b.n}`);
    }
    // A caller may lower the warmup floor to 1. A single-sample arm reports
    // variance 0, which would silently shrink v and let Λ fire on noise, so
    // abstain before the variance is used at all.
    if (a.n < 2 || b.n < 2) {
        return abstain("insufficient samples to estimate variance");
    }
    const delta = b.mean - a.mean;
    const direction = delta > 0 ? 1 : delta < 0 ? -1 : 0;
    // Welch variance of the difference of means: v = Var(δ̂) = s²_A/n_A + s²_B/n_B.
    // This is the noise scale the mixture SPRT runs against.
    const varDelta = a.variance / a.n + b.variance / b.n;
    // Degenerate branch: zero observed variance on the difference, i.e. both
    // arms are constant. Equal constants carry no evidence; different
    // constants are fully decisive.
    if (!(varDelta > 0)) {
        if (delta === 0) {
            return abstain("identical constant arms");
        }
        return {
            ...base,
            decisive: true,
            direction,
            statistic: Number.POSITIVE_INFINITY,
            reason: "deterministic arms differ (zero variance)",
        };
    }
    // Mixture SPRT closed form (estimator coordinates, prior δ ~ N(0, τ²)):
    //   Λ = √(v/(v+τ²)) · exp( τ²·δ̂² / (2·v·(v+τ²)) ),  v = Var(δ̂)
    const denom = varDelta + tau * tau;
    const logLambda = 0.5 * Math.log(varDelta / denom) +
        (tau * tau * delta * delta) / (2 * varDelta * denom);
    const lambda = Math.exp(logLambda);
    // Decide in log-space so an astronomically large Λ (exp overflow to
    // Infinity) cannot disturb the comparison.
    const decisive = logLambda >= Math.log(threshold);
    return {
        ...base,
        decisive,
        // Direction is only reported once the test has actually fired.
        direction: decisive ? direction : 0,
        statistic: lambda,
        reason: decisive
            ? `Λ=${fmt(lambda)} ≥ 1/α=${fmt(threshold)}`
            : `Λ=${fmt(lambda)} < 1/α=${fmt(threshold)} (keep testing)`,
    };
}
|
|
167
|
+
/**
 * Variance-free, two-sample sequential decision built from one Hoeffding
 * confidence sequence per arm, for scores bounded in [lo, hi].
 *
 * The half-width assigned to an arm with n usable samples is
 *
 *     w(n) = (hi − lo) · sqrt( ln( (n+1)/alpha ) / (2n) )
 *
 * (a union-bound / Cramér–Chernoff style time-uniform Hoeffding bound). The
 * arms are reported as decisively different only when the absolute gap
 * between their means is strictly greater than w(n_A) + w(n_B), i.e. the two
 * intervals no longer overlap. Because no variance estimate is involved, this
 * is the safer choice with very few runs or a skewed/bounded score
 * distribution; it is wider (more conservative) than mSPRT, which is
 * preferable once both arms support a stable variance estimate.
 *
 * @param samplesA Scores observed for arm A (summarised via `meanVar`).
 * @param samplesB Scores observed for arm B (summarised via `meanVar`).
 * @param opts Optional settings:
 *   - `alpha`: error level; anything non-finite or outside (0,1) is replaced
 *     by the module default (see `clampAlpha`).
 *   - `lo` / `hi`: score bounds, defaulting to 0 and 1. If `hi` is not
 *     strictly above `lo` the range falls back to 1.
 *   - `minSamplesPerArm`: warm-up size per arm (floored); defaults to 2 when
 *     missing, non-finite, or below 1.
 * @returns A verdict `{ method: "hoeffding", nA, nB, decisive, direction,
 *   statistic, threshold, reason }`. `direction` is 1 when B's mean is higher,
 *   -1 when A's is higher, and 0 whenever the verdict is not decisive.
 *   `statistic` is the absolute mean gap and `threshold` the summed
 *   half-widths (during warm-up: statistic 0 and threshold = range).
 */
export function hoeffdingTwoSample(samplesA, samplesB, opts = {}) {
    const alpha = clampAlpha(opts.alpha);
    const lower = Number.isFinite(opts.lo) ? opts.lo : 0;
    const upper = Number.isFinite(opts.hi) ? opts.hi : 1;
    // An inverted or zero-width [lo, hi] interval degrades to a unit range.
    const range = upper > lower ? upper - lower : 1;
    let minSamples = 2;
    if (Number.isFinite(opts.minSamplesPerArm) && opts.minSamplesPerArm >= 1) {
        minSamples = Math.floor(opts.minSamplesPerArm);
    }
    const armA = meanVar(samplesA);
    const armB = meanVar(samplesB);
    const common = { method: "hoeffding", nA: armA.n, nB: armB.n };
    // Warm-up: refuse to decide until both arms reach the minimum sample count.
    if (armA.n < minSamples || armB.n < minSamples) {
        return {
            ...common,
            decisive: false,
            direction: 0,
            statistic: 0,
            threshold: range,
            reason: `warmup: need ≥${minSamples} samples/arm, have A=${armA.n} B=${armB.n}`,
        };
    }
    // Per-arm confidence-sequence half-width for `count` samples.
    const halfWidthFor = (count) => range * Math.sqrt(Math.log((count + 1) / alpha) / (2 * count));
    const threshold = halfWidthFor(armA.n) + halfWidthFor(armB.n);
    const delta = armB.mean - armA.mean;
    const gap = Math.abs(delta);
    // Strict inequality: intervals that merely touch still count as overlapping.
    const decisive = gap > threshold;
    let direction = 0;
    if (decisive) {
        direction = delta > 0 ? 1 : -1;
    }
    let reason;
    if (decisive) {
        reason = `|Δ|=${fmt(gap)} > CS half-widths ${fmt(threshold)} (non-overlap)`;
    }
    else {
        reason = `|Δ|=${fmt(gap)} ≤ CS half-widths ${fmt(threshold)} (overlap)`;
    }
    return {
        ...common,
        decisive,
        direction,
        statistic: gap,
        threshold,
        reason,
    };
}
|
|
225
|
+
/**
 * Sanitise a caller-supplied significance level.
 *
 * @param alpha Requested error level (may be undefined or otherwise invalid).
 * @returns `alpha` unchanged when it is a finite number strictly inside
 *   (0, 1); otherwise `DEFAULT_ALPHA`.
 */
function clampAlpha(alpha) {
    // Number.isFinite rejects non-numbers, NaN and ±Infinity in one check.
    const usable = Number.isFinite(alpha) && alpha > 0 && alpha < 1;
    return usable ? alpha : DEFAULT_ALPHA;
}
|
|
234
|
+
/**
 * Format a number for the human-readable `reason` strings.
 *
 * @param x Value to render.
 * @returns `"NaN"` for NaN, `"∞"` / `"-∞"` for the infinities, otherwise the
 *   value rounded to three decimal places.
 */
function fmt(x) {
    // NaN is non-finite and fails `x > 0`, so without this guard it would be
    // rendered as "-∞" and misreport an undefined statistic as a real one.
    if (Number.isNaN(x))
        return "NaN";
    if (!Number.isFinite(x))
        return x > 0 ? "∞" : "-∞";
    return x.toFixed(3);
}
|
|
239
|
+
//# sourceMappingURL=sequential.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sequential.js","sourceRoot":"","sources":["../../../src/evolution/sequential.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AAmCH;;;;GAIG;AACH,MAAM,UAAU,OAAO,CAAC,OAA8B;IACpD,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YAChD,CAAC,EAAE,CAAC;YACJ,GAAG,IAAI,CAAC,CAAC;QACX,CAAC;IACH,CAAC;IACD,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC;IACnD,MAAM,IAAI,GAAG,GAAG,GAAG,CAAC,CAAC;IACrB,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC;IAC3C,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YAChD,GAAG,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC;AAC9C,CAAC;AAqBD,MAAM,aAAa,GAAG,IAAI,CAAC;AAC3B,MAAM,WAAW,GAAG,GAAG,CAAC;AACxB,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAE9B;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,UAAU,cAAc,CAC5B,QAA+B,EAC/B,QAA+B,EAC/B,OAAqB,EAAE;IAEvB,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACrC,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,IAAK,IAAI,CAAC,GAAc,GAAG,CAAC,CAAC,CAAC,CAAE,IAAI,CAAC,GAAc,CAAC,CAAC,CAAC,WAAW,CAAC;IACvG,MAAM,UAAU,GACd,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAK,IAAI,CAAC,gBAA2B,IAAI,CAAC;QAC9E,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,gBAA0B,CAAC;QAC7C,CAAC,CAAC,mBAAmB,CAAC;IAC1B,MAAM,SAAS,GAAG,CAAC,GAAG,KAAK,CAAC;IAE5B,MAAM,CAAC,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC5B,MAAM,CAAC,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC5B,MAAM,IAAI,GAAG;QACX,MAAM,EAAE,OAAgB;QACxB,SAAS;QACT,EAAE,EAAE,CAAC,CAAC,CAAC;QACP,EAAE,EAAE,CAAC,CAAC,CAAC;KACR,CAAC;IAEF,IAAI,CAAC,CAAC,CAAC,GAAG,UAAU,IAAI,CAAC,CAAC,CAAC,GAAG,UAAU,EAAE,CAAC;QACzC,OAAO;YACL,GAAG,IAAI;YACP,QAAQ,EAAE,KAAK;YACf,SAAS,EAAE,CAAC;YACZ,SAAS,E
AAE,CAAC;YACZ,MAAM,EAAE,iBAAiB,UAAU,wBAAwB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;SAC1E,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;IAC9B,MAAM,SAAS,GAAe,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEjE,gFAAgF;IAChF,2EAA2E;IAC3E,uEAAuE;IACvE,2DAA2D;IAC3D,MAAM,QAAQ,GAAG,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC,CAAC;IAErD,2EAA2E;IAC3E,4EAA4E;IAC5E,0EAA0E;IAC1E,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,EAAE,CAAC;QACpB,IAAI,KAAK,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACtC,OAAO;gBACL,GAAG,IAAI;gBACP,QAAQ,EAAE,KAAK;gBACf,SAAS,EAAE,CAAC;gBACZ,SAAS,EAAE,CAAC;gBACZ,MAAM,EACJ,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC;oBAChB,CAAC,CAAC,2CAA2C;oBAC7C,CAAC,CAAC,yBAAyB;aAChC,CAAC;QACJ,CAAC;QACD,OAAO;YACL,GAAG,IAAI;YACP,QAAQ,EAAE,IAAI;YACd,SAAS;YACT,SAAS,EAAE,MAAM,CAAC,iBAAiB;YACnC,MAAM,EAAE,2CAA2C;SACpD,CAAC;IACJ,CAAC;IAED,wEAAwE;IACxE,iEAAiE;IACjE,MAAM,KAAK,GAAG,QAAQ,GAAG,GAAG,GAAG,GAAG,CAAC;IACnC,MAAM,SAAS,GACb,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,GAAG,KAAK,CAAC;QAChC,CAAC,GAAG,GAAG,GAAG,GAAG,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,GAAG,KAAK,CAAC,CAAC;IACvD,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAEnC,4EAA4E;IAC5E,0EAA0E;IAC1E,MAAM,QAAQ,GAAG,SAAS,IAAI,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAElD,OAAO;QACL,GAAG,IAAI;QACP,QAAQ;QACR,SAAS,EAAE,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;QACnC,SAAS,EAAE,MAAM;QACjB,MAAM,EAAE,QAAQ;YACd,CAAC,CAAC,KAAK,GAAG,CAAC,MAAM,CAAC,UAAU,GAAG,CAAC,SAAS,CAAC,EAAE;YAC5C,CAAC,CAAC,KAAK,GAAG,CAAC,MAAM,CAAC,UAAU,GAAG,CAAC,SAAS,CAAC,iBAAiB;KAC9D,CAAC;AACJ,CAAC;AAcD;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,kBAAkB,CAChC,QAA+B,EAC/B,QAA+B,EAC/B,OAAyB,EAAE;IAE3B,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACrC,MAAM,EAAE,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAE,IAAI,CAAC,EAAa,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9D,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI
,CAAC,EAAE,CAAC,CAAC,CAAC,CAAE,IAAI,CAAC,EAAa,CAAC,CAAC,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,KAAK,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,4BAA4B;IACvE,MAAM,UAAU,GACd,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAK,IAAI,CAAC,gBAA2B,IAAI,CAAC;QAC9E,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,gBAA0B,CAAC;QAC7C,CAAC,CAAC,CAAC,CAAC;IAER,MAAM,CAAC,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC5B,MAAM,CAAC,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC5B,MAAM,IAAI,GAAG,EAAE,MAAM,EAAE,WAAoB,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAEhE,IAAI,CAAC,CAAC,CAAC,GAAG,UAAU,IAAI,CAAC,CAAC,CAAC,GAAG,UAAU,EAAE,CAAC;QACzC,OAAO;YACL,GAAG,IAAI;YACP,QAAQ,EAAE,KAAK;YACf,SAAS,EAAE,CAAC;YACZ,SAAS,EAAE,CAAC;YACZ,SAAS,EAAE,KAAK;YAChB,MAAM,EAAE,iBAAiB,UAAU,wBAAwB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;SAC1E,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,CAAC,CAAS,EAAU,EAAE,CACtC,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IAEzD,MAAM,EAAE,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,EAAE,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;IACtC,MAAM,SAAS,GAAG,EAAE,GAAG,EAAE,CAAC;IAC1B,MAAM,QAAQ,GAAG,GAAG,GAAG,SAAS,CAAC;IACjC,MAAM,KAAK,GAAG,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;IAC9B,MAAM,SAAS,GAAe,QAAQ,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAElE,OAAO;QACL,GAAG,IAAI;QACP,QAAQ;QACR,SAAS;QACT,SAAS,EAAE,GAAG;QACd,SAAS;QACT,MAAM,EAAE,QAAQ;YACd,CAAC,CAAC,OAAO,GAAG,CAAC,GAAG,CAAC,qBAAqB,GAAG,CAAC,SAAS,CAAC,gBAAgB;YACpE,CAAC,CAAC,OAAO,GAAG,CAAC,GAAG,CAAC,qBAAqB,GAAG,CAAC,SAAS,CAAC,YAAY;KACnE,CAAC;AACJ,CAAC;AAED,SAAS,UAAU,CAAC,KAAyB;IAC3C,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;QAAE,OAAO,aAAa,CAAC;IAClD,MAAM,CAAC,GAAG,KAAe,CAAC;IAC1B,qEAAqE;IACrE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,aAAa,CAAC;IAC3C,OAAO,CAAC,CAAC;AACX,CAAC;AAED,SAAS,GAAG,C
AAC,CAAS;IACpB,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC;IACnD,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;AACtB,CAAC"}
|
|
@@ -44,6 +44,18 @@ export declare class ExperimentTracker {
|
|
|
44
44
|
* (otherwise the incumbent version's historical data skews the comparison).
|
|
45
45
|
*/
|
|
46
46
|
getAverageComposite(agentName: string, version: string, weights?: MetricWeights, since?: string): Promise<number>;
|
|
47
|
+
/**
|
|
48
|
+
* v0.7.0 — Per-experiment composite scores for a specific agent + prompt
|
|
49
|
+
* version, in chronological order. Unlike {@link getAverageComposite} this
|
|
50
|
+
* does NOT collapse to a scalar — it feeds the always-valid sequential
|
|
51
|
+
* confidence gate (mSPRT / Hoeffding), which needs the individual samples
|
|
52
|
+
* (and therefore their variance), not just the mean.
|
|
53
|
+
*
|
|
54
|
+
* If `since` is provided, only experiments at/after that ISO timestamp are
|
|
55
|
+
* included — pass the A/B test start so the incumbent's historical runs do
|
|
56
|
+
* not skew the comparison (same convention as {@link getAverageComposite}).
|
|
57
|
+
*/
|
|
58
|
+
getCompositeScores(agentName: string, version: string, weights?: MetricWeights, since?: string): Promise<number[]>;
|
|
47
59
|
/**
|
|
48
60
|
* v0.6.0 — Average raw metric vector for a specific agent + prompt version,
|
|
49
61
|
* keyed by the names in `DarwinMetrics` / `DARWIN_DEFAULT_OBJECTIVES`
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tracker.d.ts","sourceRoot":"","sources":["../../../src/evolution/tracker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,gBAAgB,EAEhB,cAAc,EACd,aAAa,EACb,kBAAkB,EACnB,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGpD,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,MAAM,CAAiB;gBAEnB,MAAM,EAAE,cAAc;IAIlC;;;;OAIG;IACG,gBAAgB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IA+B5D;;;OAGG;IACG,QAAQ,CACZ,SAAS,EAAE,MAAM,EACjB,OAAO,CAAC,EAAE,MAAM,GACf,OAAO,CAAC,kBAAkB,CAAC;IA6C9B;;;;;;;;;OASG;IACH,iBAAiB,CACf,GAAG,EAAE,gBAAgB,EACrB,OAAO,GAAE,aAA+B,GACvC,MAAM;IA8BT;;;OAGG;IACG,kBAAkB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IA6BrE;;;;;;OAMG;IACG,mBAAmB,CACvB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,EACf,OAAO,GAAE,aAA+B,EACxC,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,MAAM,CAAC;IAoBlB;;;;;;;;;;;;;;;;OAgBG;IACG,iBAAiB,CACrB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,EACf,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CA2BnC"}
|
|
1
|
+
{"version":3,"file":"tracker.d.ts","sourceRoot":"","sources":["../../../src/evolution/tracker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,gBAAgB,EAEhB,cAAc,EACd,aAAa,EACb,kBAAkB,EACnB,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGpD,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,MAAM,CAAiB;gBAEnB,MAAM,EAAE,cAAc;IAIlC;;;;OAIG;IACG,gBAAgB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IA+B5D;;;OAGG;IACG,QAAQ,CACZ,SAAS,EAAE,MAAM,EACjB,OAAO,CAAC,EAAE,MAAM,GACf,OAAO,CAAC,kBAAkB,CAAC;IA6C9B;;;;;;;;;OASG;IACH,iBAAiB,CACf,GAAG,EAAE,gBAAgB,EACrB,OAAO,GAAE,aAA+B,GACvC,MAAM;IA8BT;;;OAGG;IACG,kBAAkB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IA6BrE;;;;;;OAMG;IACG,mBAAmB,CACvB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,EACf,OAAO,GAAE,aAA+B,EACxC,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,MAAM,CAAC;IAoBlB;;;;;;;;;;OAUG;IACG,kBAAkB,CACtB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,EACf,OAAO,GAAE,aAA+B,EACxC,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,MAAM,EAAE,CAAC;IAgBpB;;;;;;;;;;;;;;;;OAgBG;IACG,iBAAiB,CACrB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,EACf,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CA2BnC"}
|
|
@@ -159,6 +159,30 @@ export class ExperimentTracker {
|
|
|
159
159
|
const total = filtered.reduce((sum, exp) => sum + this.getCompositeScore(exp, weights), 0);
|
|
160
160
|
return total / filtered.length;
|
|
161
161
|
}
|
|
162
|
+
/**
|
|
163
|
+
* v0.7.0 — Per-experiment composite scores for a specific agent + prompt
|
|
164
|
+
* version, in chronological order. Unlike {@link getAverageComposite} this
|
|
165
|
+
* does NOT collapse to a scalar — it feeds the always-valid sequential
|
|
166
|
+
* confidence gate (mSPRT / Hoeffding), which needs the individual samples
|
|
167
|
+
* (and therefore their variance), not just the mean.
|
|
168
|
+
*
|
|
169
|
+
* If `since` is provided, only experiments at/after that ISO timestamp are
|
|
170
|
+
* included — pass the A/B test start so the incumbent's historical runs do
|
|
171
|
+
* not skew the comparison (same convention as {@link getAverageComposite}).
|
|
172
|
+
*/
|
|
173
|
+
async getCompositeScores(agentName, version, weights = DEFAULT_WEIGHTS, since) {
|
|
174
|
+
const experiments = await this.memory.loadExperiments(agentName);
|
|
175
|
+
let filtered = experiments.filter((e) => e.promptVersion === version);
|
|
176
|
+
if (since) {
|
|
177
|
+
filtered = filtered.filter((e) => e.startedAt >= since);
|
|
178
|
+
}
|
|
179
|
+
// loadExperiments() returns newest-first; reverse to chronological order
|
|
180
|
+
// so the sequence mirrors how the data actually accrued during the test.
|
|
181
|
+
return filtered
|
|
182
|
+
.slice()
|
|
183
|
+
.reverse()
|
|
184
|
+
.map((exp) => this.getCompositeScore(exp, weights));
|
|
185
|
+
}
|
|
162
186
|
/**
|
|
163
187
|
* v0.6.0 — Average raw metric vector for a specific agent + prompt version,
|
|
164
188
|
* keyed by the names in `DarwinMetrics` / `DARWIN_DEFAULT_OBJECTIVES`
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tracker.js","sourceRoot":"","sources":["../../../src/evolution/tracker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAE9C,MAAM,OAAO,iBAAiB;IACpB,MAAM,CAAiB;IAE/B,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,gBAAgB,CAAC,GAAqB;QAC1C,gCAAgC;QAChC,MAAM,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;QAEtC,oDAAoD;QACpD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvE,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,GAAG,CAAC,aAAa,CAAC,CAAC;QACtE,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,SAAS,EAAE,GAAG,CAAC,aAAa,CAAC,CAAC;YAC3E,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC;YAC7B,MAAM,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QAC/C,CAAC;QAED,+EAA+E;QAC/E,qEAAqE;QACrE,+EAA+E;QAC/E,MAAM,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,KAAK,EAAE,EAAE;YACtC,KAAK,CAAC,gBAAgB,CAAC,GAAG,CAAC,SAAS,CAAC;gBACnC,CAAC,KAAK,CAAC,gBAAgB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YAEnD,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;gBAChB,KAAK,CAAC,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC/C,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC;oBACtC,CAAC,KAAK,CAAC,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YACxD,CAAC;YAED,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,SAAiB,EACjB,OAAgB;QAEhB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QAEjE,MAAM,QAAQ,GAAG,OAAO;YACtB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC;YACxD,CAAC,CAAC,WAAW,CAAC;QAEhB,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO;gBACL,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,CAAC;gBACb,WAAW,EAAE,CAAC;gBACd,WAAW,EAAE,CAAC;gBACd,cAAc,EAAE,CAAC;aAClB,CAAC;QACJ,CAAC;QAED,MAAM,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC;QAClC,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAE9D,4DAA4D;QAC5D,MAAM,WAAW,GAAG,QAAQ,C
AAC,MAAM,CACjC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CACvC,CAAC;QACF,MAAM,UAAU,GACd,WAAW,CAAC,MAAM,GAAG,CAAC;YACpB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBACtE,WAAW,CAAC,MAAM;YACpB,CAAC,CAAC,CAAC,CAAC;QAER,MAAM,WAAW,GACf,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC;QAEzE,MAAM,cAAc,GAClB,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC;QAE1E,OAAO;YACL,SAAS;YACT,UAAU;YACV,WAAW;YACX,WAAW,EAAE,YAAY,GAAG,SAAS;YACrC,cAAc;SACf,CAAC;IACJ,CAAC;IAED;;;;;;;;;OASG;IACH,iBAAiB,CACf,GAAqB,EACrB,UAAyB,eAAe;QAExC,iFAAiF;QACjF,iEAAiE;QACjE,MAAM,UAAU,GAAG,GAAG,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CAAC;QACrD,MAAM,WAAW,GAAG,UAAU,CAAC,CAAC,CAAE,GAAG,CAAC,OAAO,CAAC,YAAuB,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAE/E,oEAAoE;QACpE,MAAM,sBAAsB,GAAG,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;QAChE,MAAM,SAAS,GAAG,sBAAsB,GAAG,OAAO,CAAC,WAAW,GAAG,OAAO,CAAC,YAAY,GAAG,OAAO,CAAC,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC;QAC3H,MAAM,KAAK,GAAG,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,UAAU,GAAG;YACjB,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,GAAG,EAAE,EAAE,CAAC,CAAC;YACtD,YAAY,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,GAAG,KAAK,EAAE,CAAC,CAAC;YAC3D,QAAQ,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,UAAU,GAAG,MAAM,EAAE,CAAC,CAAC;YAC1D,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;SAC7B,CAAC;QAEF,MAAM,KAAK,GAAG,CACZ,UAAU,CAAC,OAAO,GAAG,sBAAsB;YAC3C,UAAU,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW;YAC5C,UAAU,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY;YAC9C,UAAU,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ;YACtC,UAAU,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,CACrC,GAAG,KAAK,CAAC;QAEV,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,kBAAkB,CAAC,SAAiB;QACxC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC
,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,MAAM,UAAU,GAAG,IAAI,GAAG,EAA8B,CAAC;QAEzD,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;YAC9B,MAAM,GAAG,GAAG,GAAG,CAAC,QAAQ,IAAI,SAAS,CAAC;YACtC,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACjC,IAAI,IAAI,EAAE,CAAC;gBACT,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACjB,CAAC;iBAAM,CAAC;gBACN,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,KAAK,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,UAAU,EAAE,CAAC;YAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CAAC,CAAC;YACxE,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC;gBACvC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,WAAW,CAAC,MAAM;gBACzF,CAAC,CAAC,CAAC,CAAC;YACN,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;YACzF,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;YAEvE,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,WAAW,EAAE,CAAC,CAAC;QAC7F,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;IAC1D,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,mBAAmB,CACvB,SAAiB,EACjB,OAAe,EACf,UAAyB,eAAe,EACxC,KAAc;QAEd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,IAAI,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC,CAAC;QAEtE,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,CAAC;QACX,CAAC;QAED,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAC3B,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,EACxD,CAAC,CACF,CAAC;QAEF,OAAO,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC;IACjC,
CAAC;IAED;;;;;;;;;;;;;;;;OAgBG;IACH,KAAK,CAAC,iBAAiB,CACrB,SAAiB,EACjB,OAAe,EACf,KAAc;QAEd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,IAAI,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC,CAAC;QAEtE,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC;QAC1B,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CAAC,CAAC;QAC5E,MAAM,UAAU,GACd,WAAW,CAAC,MAAM,GAAG,CAAC;YACpB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBAClE,WAAW,CAAC,MAAM;YACpB,CAAC,CAAC,CAAC,CAAC;QAER,OAAO;YACL,YAAY,EAAE,UAAU;YACxB,WAAW,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,CAAC;YACxE,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC,GAAG,CAAC;YAC1E,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,CAAC;SACvE,CAAC;IACJ,CAAC;CACF"}
|
|
1
|
+
{"version":3,"file":"tracker.js","sourceRoot":"","sources":["../../../src/evolution/tracker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAE9C,MAAM,OAAO,iBAAiB;IACpB,MAAM,CAAiB;IAE/B,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,gBAAgB,CAAC,GAAqB;QAC1C,gCAAgC;QAChC,MAAM,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;QAEtC,oDAAoD;QACpD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvE,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,GAAG,CAAC,aAAa,CAAC,CAAC;QACtE,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,SAAS,EAAE,GAAG,CAAC,aAAa,CAAC,CAAC;YAC3E,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC;YAC7B,MAAM,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QAC/C,CAAC;QAED,+EAA+E;QAC/E,qEAAqE;QACrE,+EAA+E;QAC/E,MAAM,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,KAAK,EAAE,EAAE;YACtC,KAAK,CAAC,gBAAgB,CAAC,GAAG,CAAC,SAAS,CAAC;gBACnC,CAAC,KAAK,CAAC,gBAAgB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YAEnD,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;gBAChB,KAAK,CAAC,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC/C,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC;oBACtC,CAAC,KAAK,CAAC,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YACxD,CAAC;YAED,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,SAAiB,EACjB,OAAgB;QAEhB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QAEjE,MAAM,QAAQ,GAAG,OAAO;YACtB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC;YACxD,CAAC,CAAC,WAAW,CAAC;QAEhB,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO;gBACL,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,CAAC;gBACb,WAAW,EAAE,CAAC;gBACd,WAAW,EAAE,CAAC;gBACd,cAAc,EAAE,CAAC;aAClB,CAAC;QACJ,CAAC;QAED,MAAM,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC;QAClC,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAE9D,4DAA4D;QAC5D,MAAM,WAAW,GAAG,QAAQ,C
AAC,MAAM,CACjC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CACvC,CAAC;QACF,MAAM,UAAU,GACd,WAAW,CAAC,MAAM,GAAG,CAAC;YACpB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBACtE,WAAW,CAAC,MAAM;YACpB,CAAC,CAAC,CAAC,CAAC;QAER,MAAM,WAAW,GACf,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC;QAEzE,MAAM,cAAc,GAClB,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC;QAE1E,OAAO;YACL,SAAS;YACT,UAAU;YACV,WAAW;YACX,WAAW,EAAE,YAAY,GAAG,SAAS;YACrC,cAAc;SACf,CAAC;IACJ,CAAC;IAED;;;;;;;;;OASG;IACH,iBAAiB,CACf,GAAqB,EACrB,UAAyB,eAAe;QAExC,iFAAiF;QACjF,iEAAiE;QACjE,MAAM,UAAU,GAAG,GAAG,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CAAC;QACrD,MAAM,WAAW,GAAG,UAAU,CAAC,CAAC,CAAE,GAAG,CAAC,OAAO,CAAC,YAAuB,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAE/E,oEAAoE;QACpE,MAAM,sBAAsB,GAAG,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;QAChE,MAAM,SAAS,GAAG,sBAAsB,GAAG,OAAO,CAAC,WAAW,GAAG,OAAO,CAAC,YAAY,GAAG,OAAO,CAAC,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC;QAC3H,MAAM,KAAK,GAAG,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,UAAU,GAAG;YACjB,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,GAAG,EAAE,EAAE,CAAC,CAAC;YACtD,YAAY,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,GAAG,KAAK,EAAE,CAAC,CAAC;YAC3D,QAAQ,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,UAAU,GAAG,MAAM,EAAE,CAAC,CAAC;YAC1D,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;SAC7B,CAAC;QAEF,MAAM,KAAK,GAAG,CACZ,UAAU,CAAC,OAAO,GAAG,sBAAsB;YAC3C,UAAU,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW;YAC5C,UAAU,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY;YAC9C,UAAU,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ;YACtC,UAAU,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,CACrC,GAAG,KAAK,CAAC;QAEV,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,kBAAkB,CAAC,SAAiB;QACxC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC
,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,MAAM,UAAU,GAAG,IAAI,GAAG,EAA8B,CAAC;QAEzD,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;YAC9B,MAAM,GAAG,GAAG,GAAG,CAAC,QAAQ,IAAI,SAAS,CAAC;YACtC,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACjC,IAAI,IAAI,EAAE,CAAC;gBACT,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACjB,CAAC;iBAAM,CAAC;gBACN,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,KAAK,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,UAAU,EAAE,CAAC;YAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CAAC,CAAC;YACxE,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC;gBACvC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,WAAW,CAAC,MAAM;gBACzF,CAAC,CAAC,CAAC,CAAC;YACN,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;YACzF,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;YAEvE,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,WAAW,EAAE,CAAC,CAAC;QAC7F,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;IAC1D,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,mBAAmB,CACvB,SAAiB,EACjB,OAAe,EACf,UAAyB,eAAe,EACxC,KAAc;QAEd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,IAAI,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC,CAAC;QAEtE,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,CAAC,CAAC;QACX,CAAC;QAED,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAC3B,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,EACxD,CAAC,CACF,CAAC;QAEF,OAAO,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC;IACjC,
CAAC;IAED;;;;;;;;;;OAUG;IACH,KAAK,CAAC,kBAAkB,CACtB,SAAiB,EACjB,OAAe,EACf,UAAyB,eAAe,EACxC,KAAc;QAEd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,IAAI,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC,CAAC;QAEtE,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC;QAC1D,CAAC;QAED,yEAAyE;QACzE,yEAAyE;QACzE,OAAO,QAAQ;aACZ,KAAK,EAAE;aACP,OAAO,EAAE;aACT,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC;IACxD,CAAC;IAED;;;;;;;;;;;;;;;;OAgBG;IACH,KAAK,CAAC,iBAAiB,CACrB,SAAiB,EACjB,OAAe,EACf,KAAc;QAEd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACjE,IAAI,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,KAAK,OAAO,CAAC,CAAC;QAEtE,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC;QAC1B,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI,CAAC,CAAC;QAC5E,MAAM,UAAU,GACd,WAAW,CAAC,MAAM,GAAG,CAAC;YACpB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBAClE,WAAW,CAAC,MAAM;YACpB,CAAC,CAAC,CAAC,CAAC;QAER,OAAO;YACL,YAAY,EAAE,UAAU;YACxB,WAAW,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,CAAC;YACxE,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC,GAAG,CAAC;YAC1E,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,CAAC;SACvE,CAAC;IACJ,CAAC;CACF"}
|
package/dist/src/index.d.ts
CHANGED
|
@@ -31,9 +31,11 @@ export { ClaudeCliProvider } from './providers/claude-cli.js';
|
|
|
31
31
|
export { createMemory, SqliteMemoryProvider, PostgresMemoryProvider } from './memory/index.js';
|
|
32
32
|
export { loadNotificationConfig } from './evolution/notifications.js';
|
|
33
33
|
export type { NotificationConfig } from './evolution/notifications.js';
|
|
34
|
-
export { dominates, nonDominatedFront, paretoSelect, scalarise, crowdingDistance, DARWIN_DEFAULT_OBJECTIVES, type ParetoObjective, type ParetoTruncationStrategy, } from './evolution/pareto.js';
|
|
34
|
+
export { dominates, dominatesEpsilon, nonDominatedFront, paretoSelect, scalarise, crowdingDistance, coverageFrontier, coverageWeights, selectByCoverage, sampleByCoverage, DARWIN_DEFAULT_OBJECTIVES, type ParetoObjective, type ParetoTruncationStrategy, type FrontierKey, type CoverageScores, } from './evolution/pareto.js';
|
|
35
35
|
export { Reflector, type ReflectiveFeedback, type ReflectOptions, } from './evolution/reflector.js';
|
|
36
36
|
export type { RunPromptFn } from './evolution/run-prompt-fn.js';
|
|
37
|
-
export { GepaOptimizer, type ScoredVariant, type GenerateOptions as GepaGenerateOptions, type NextGenerationOptions as GepaNextGenerationOptions, type GepaOptimizerOptions, type MergeOptions as GepaMergeOptions, } from './evolution/optimizer-gepa.js';
|
|
38
|
-
export { checkAlignmentPreservation, SAFETY_PATTERNS, } from './evolution/alignment.js';
|
|
37
|
+
export { GepaOptimizer, epochShuffledMinibatch, type ScoredVariant, type GenerateOptions as GepaGenerateOptions, type NextGenerationOptions as GepaNextGenerationOptions, type GepaOptimizerOptions, type MergeOptions as GepaMergeOptions, } from './evolution/optimizer-gepa.js';
|
|
38
|
+
export { checkAlignmentPreservation, checkAlignmentPreservationSemantic, SAFETY_PATTERNS, type EmbedFn, type SemanticAlignmentOptions, } from './evolution/alignment.js';
|
|
39
|
+
export { runMultiCritic, stripMarkdownForJudging, getCriticPrompts, type RunCriticFn, type RunMultiCriticOptions, type CriticPromptDef, type CriticScore, type MultiCriticResult, } from './evolution/multi-critic.js';
|
|
40
|
+
export { meanVar, msprtTwoSample, hoeffdingTwoSample, type ConfidenceMethod, type SequentialVerdict, type MeanVar, type MsprtOptions, type HoeffdingOptions, } from './evolution/sequential.js';
|
|
39
41
|
//# sourceMappingURL=index.d.ts.map
|