@tangle-network/agent-eval 0.48.0 → 0.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/{traceai.d.ts → otel.d.ts} +29 -29
- package/dist/adapters/{traceai.js → otel.js} +9 -5
- package/dist/adapters/otel.js.map +1 -0
- package/dist/campaign/index.d.ts +3 -3
- package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
- package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
- package/dist/chunk-EGIPWXHL.js.map +1 -0
- package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
- package/dist/chunk-FQK2CCIM.js.map +1 -0
- package/dist/chunk-MAZ26DC7.js +99 -0
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
- package/dist/contract/index.d.ts +206 -9
- package/dist/contract/index.js +751 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +8 -192
- package/dist/hosted/index.js +1 -1
- package/dist/index-BRxz6qov.d.ts +409 -0
- package/dist/index.d.ts +18 -462
- package/dist/index.js +14 -106
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
- package/dist/registry-8KAs18kY.d.ts +457 -0
- package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +6 -4
- package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
- package/dist/rl.d.ts +9 -8
- package/dist/rl.js +3 -2
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
- package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
- package/dist/sequential-5iSVfzl2.d.ts +139 -0
- package/dist/store-CJbzDxZ2.d.ts +220 -0
- package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
- package/dist/traces.d.ts +3 -220
- package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
- package/dist/types-DhqpAi_z.d.ts +296 -0
- package/docs/adapters-observability.md +3 -3
- package/package.json +5 -5
- package/dist/adapters/traceai.js.map +0 -1
- package/dist/chunk-MNL6LXGQ.js.map +0 -1
- package/dist/chunk-OYI6RZJK.js.map +0 -1
- /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
- /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
- /package/docs/design/{substrate-gaps-2026-05-27.md → substrate-gaps.md} +0 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// src/sequential.ts
|
|
2
|
+
/**
 * Runs a paired e-value betting sequence over an in-order stream of deltas
 * and records the verdict at every prefix length.
 *
 * For each observation the running e-value is multiplied by `1 + lambda * d`,
 * where `lambda` is derived only from the mean/variance of the *previous*
 * observations (so the bet never peeks at the value it is applied to).
 *
 * @param deltas Paired deltas in arrival order. Values outside
 *   `[-bound, bound]` are clipped and `clipped` is set on the result.
 *   NaN entries are not filtered: they zero the e-value and turn every later
 *   running statistic into NaN, so callers should sanitise their input.
 * @param opts Optional settings:
 *   - `bound` (default 1): bound on |delta|, must be > 0.
 *   - `alpha` (default 0.05): target error level, must lie in (0, 1).
 *   - `initialBetShrinkage` (default 0.5): ceiling of the bet scale.
 *   - `rope`: `{ low, high }` region of practical equivalence (finite, low ≤ high).
 * @returns `{ steps, finalDecision, decisionFiredAt, clipped }`.
 *   `finalDecision` is the decision of the LAST step (not of the first step
 *   that fired); `decisionFiredAt` is the 1-based index of the first
 *   non-"continue" step, or null.
 * @throws Error when `bound`, `alpha` or `rope` are invalid (including NaN).
 */
function pairedEvalueSequence(deltas, opts = {}) {
  const c = opts.bound ?? 1;
  const alpha = opts.alpha ?? 0.05;
  const initialShrink = opts.initialBetShrinkage ?? 0.5;
  const rope = opts.rope ?? null;
  // Negated positive checks (instead of `c <= 0`) so that NaN is rejected too:
  // every comparison with NaN is false, which used to let NaN slip through.
  if (!(c > 0)) throw new Error("pairedEvalueSequence: bound must be > 0");
  if (!(alpha > 0 && alpha < 1)) throw new Error("pairedEvalueSequence: alpha must be in (0,1)");
  if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {
    throw new Error("pairedEvalueSequence: rope must satisfy low \u2264 high");
  }
  // Loop invariants: bet cap keeping 1 + lambda * d > 0 for |d| <= c, and the
  // e-value level at which a promote/reject verdict fires.
  const lambdaMax = 0.99 / c;
  const decisionThreshold = 2 / alpha;
  const steps = [];
  let clipped = false;
  let evalue = 1;
  let decisionFiredAt = null;
  // Running sums over the observations seen so far.
  let sum = 0;
  let sumSq = 0;
  let count = 0;
  for (let i = 0; i < deltas.length; i++) {
    let d = deltas[i];
    if (d < -c || d > c) {
      d = Math.max(-c, Math.min(c, d));
      clipped = true;
    }
    // Statistics of observations 1..i only; the current one is added below,
    // after the bet has been placed.
    const muHat = count === 0 ? 0 : sum / count;
    const varHat = count === 0 ? c * c : Math.max(1e-12, sumSq / count - muHat * muHat);
    const t = i + 1;
    // Bet scale ramps linearly from 0 up to `initialShrink` over the first 32
    // observations, so the very first bet is always 0.
    const shrink = initialShrink * Math.min(1, count / 32);
    let lambda = muHat / (varHat + c * c) * shrink;
    if (lambda > lambdaMax) lambda = lambdaMax;
    if (lambda < -lambdaMax) lambda = -lambdaMax;
    evalue = evalue * (1 + lambda * d);
    // Guard against overflow / NaN / negative products.
    if (!Number.isFinite(evalue) || evalue < 0) evalue = 0;
    sum += d;
    sumSq += d * d;
    count += 1;
    const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300));
    const cs = empiricalBernsteinCs(sum, sumSq, count, c, alpha);
    // Precedence: equivalence first, then the e-value verdict (direction taken
    // from the sign of the pre-update mean), then the ROPE-based rejection.
    let decision = "continue";
    if (rope && cs.low >= rope.low && cs.high <= rope.high) decision = "equivalent";
    else if (evalue >= decisionThreshold && muHat > 0) decision = "promote_now";
    else if (evalue >= decisionThreshold && muHat < 0) decision = "reject_now";
    else if (rope && cs.high < rope.low) decision = "reject_now";
    if (decision !== "continue" && decisionFiredAt === null) decisionFiredAt = t;
    steps.push({ t, delta: d, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision });
  }
  const finalDecision = steps.length === 0 ? "continue" : steps[steps.length - 1].decision;
  return { steps, finalDecision, decisionFiredAt, clipped };
}
|
|
51
|
+
/**
 * Runs the paired e-value sequence for every candidate and condenses the
 * per-candidate verdicts into one campaign-level recommendation.
 *
 * Recommendation precedence:
 *   1. "promote_now" — the promoting candidate with the highest final e-value
 *      (the earliest one wins ties);
 *   2. "continue"    — when at least one candidate is still undecided, or when
 *      there are no candidates at all (no evidence means no verdict);
 *   3. "equivalent"  — the first candidate judged equivalent;
 *   4. "reject_now"  — every candidate was rejected.
 *
 * @param input `{ deltaSeries, alpha?, bound?, rope? }`; each `deltaSeries`
 *   entry is `{ candidateId, deltas }`. `alpha`, `bound` and `rope` are
 *   forwarded unchanged to `pairedEvalueSequence`.
 * @returns `{ candidates, recommendation }`. A candidate with an empty delta
 *   series reports e-value 1, p-value 1 and an unbounded interval.
 */
function evaluateInterimReleaseConfidence(input) {
  const candidates = input.deltaSeries.map((s) => {
    const seq = pairedEvalueSequence(s.deltas, {
      alpha: input.alpha,
      bound: input.bound,
      rope: input.rope
    });
    const last = seq.steps[seq.steps.length - 1];
    return {
      candidateId: s.candidateId,
      decision: seq.finalDecision,
      decisionFiredAt: seq.decisionFiredAt,
      finalEvalue: last?.evalue ?? 1,
      finalPValue: last?.pValue ?? 1,
      pairs: seq.steps.length,
      csLow: last?.csLow ?? Number.NEGATIVE_INFINITY,
      csHigh: last?.csHigh ?? Number.POSITIVE_INFINITY
    };
  });
  // Nothing evaluated: without this guard the function would fall through to
  // "reject_now" although there is nothing to reject.
  if (candidates.length === 0) {
    return { candidates, recommendation: { decision: "continue", candidateId: null } };
  }
  // Strongest promoter = largest final e-value; strict `>` keeps the earliest
  // candidate on ties.
  let promote = null;
  for (const candidate of candidates) {
    if (candidate.decision !== "promote_now") continue;
    if (promote === null || candidate.finalEvalue > promote.finalEvalue) promote = candidate;
  }
  if (promote) {
    return {
      candidates,
      recommendation: { decision: "promote_now", candidateId: promote.candidateId }
    };
  }
  if (candidates.some((candidate) => candidate.decision === "continue")) {
    return { candidates, recommendation: { decision: "continue", candidateId: null } };
  }
  const equiv = candidates.find((candidate) => candidate.decision === "equivalent");
  if (equiv) {
    return {
      candidates,
      recommendation: { decision: "equivalent", candidateId: equiv.candidateId }
    };
  }
  return { candidates, recommendation: { decision: "reject_now", candidateId: null } };
}
|
|
86
|
+
/**
 * Confidence interval on the running mean of observations bounded in
 * `[-bound, bound]`, built from the running sum and sum of squares.
 *
 * The radius is `sqrt(2 * variance * psi / n) + 3 * bound * psi / n`, with
 * `psi = ln(2 / alpha) + 1.7 * ln(ln(max(e, n)) + 1)`. Because the mean of
 * values in `[-bound, bound]` can never leave that range, the interval is
 * intersected with it; otherwise small samples would report an interval far
 * wider than the one returned for `n === 0`.
 *
 * @param sum   Sum of the observations.
 * @param sumSq Sum of the squared observations.
 * @param n     Number of observations.
 * @param bound Bound on the absolute value of each observation.
 * @param alpha Error level used in `psi`.
 * @returns `{ low, high }`; exactly `[-bound, bound]` when `n === 0`.
 */
function empiricalBernsteinCs(sum, sumSq, n, bound, alpha) {
  if (n === 0) return { low: -bound, high: bound };
  const mean = sum / n;
  // Clamp at 0: floating-point cancellation can make the difference negative.
  const variance = Math.max(0, sumSq / n - mean * mean);
  const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1);
  const radius = Math.sqrt(2 * variance * psi / n) + 3 * bound * psi / n;
  // Intersect with the a-priori range of the mean.
  return {
    low: Math.max(-bound, mean - radius),
    high: Math.min(bound, mean + radius)
  };
}
|
|
94
|
+
|
|
95
|
+
export {
|
|
96
|
+
pairedEvalueSequence,
|
|
97
|
+
evaluateInterimReleaseConfidence
|
|
98
|
+
};
|
|
99
|
+
//# sourceMappingURL=chunk-MAZ26DC7.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/sequential.ts"],"sourcesContent":["/**\n * Always-valid sequential evaluation.\n *\n * `researchReport` assumes a single pre-specified analysis. Real\n * consumers run campaigns weekly / nightly / per-PR; each new run silently\n * inflates the false-discovery rate, because the BH-FDR guarantee is for\n * the *first* look, not the 47th. Without time-uniform inference,\n * launch-decision teams either (a) don't peek, which forfeits the cost\n * advantage of stop-when-decisive, or (b) peek and pretend they didn't,\n * which forfeits scientific validity.\n *\n * This module ships **e-value-based confidence sequences** for paired\n * bounded outcomes. The methodology is the predictable plug-in betting\n * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*\n * stopping time. Concretely:\n *\n * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,\n * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable\n * plug-in), and the running e-value is\n *\n * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)\n *\n * E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by\n * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null\n * at any time without inflating the type-I error.\n *\n * Combined with `runEvalCampaign`, every consumer running rolling\n * campaigns gains the ability to ship the moment evidence is decisive,\n * stop-early on dead-on-arrival variants, and accumulate evidence across\n * partial runs without spending the FDR budget. No new sweep is wasted.\n *\n * References:\n * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).\n * Time-uniform, nonparametric, nonasymptotic confidence sequences.\n * Annals of Statistics, 49(2), 1055–1080.\n * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded\n * random variables by betting. 
JRSS B, 86(1), 1–27.\n */\n\nexport type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent'\n\nexport interface PairedEvalueOptions {\n /**\n * Bound on |delta|. Default 1 (matching most score scales). Must satisfy\n * c > 0; deltas outside [-c, c] are clipped with a warning attached to\n * the return value.\n */\n bound?: number\n /** Target Type-I error. Default 0.05. */\n alpha?: number\n /**\n * Region of Practical Equivalence on the *mean* paired delta. When\n * supplied, the verdict can return `'equivalent'` once the running\n * confidence sequence on the mean is fully contained in [low, high].\n */\n rope?: { low: number; high: number }\n /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */\n initialBetShrinkage?: number\n}\n\nexport interface PairedEvalueStep {\n /** 1-indexed observation count. */\n t: number\n delta: number\n /** Running e-value E_t = ∏ (1 + λ_i · D_i). */\n evalue: number\n /** Time-uniform p-value at stopping time t. */\n pValue: number\n /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */\n csLow: number\n csHigh: number\n /** Verdict at this stopping time. */\n decision: SequentialDecision\n}\n\nexport interface PairedEvalueSequence {\n steps: PairedEvalueStep[]\n /** The decision at the final step. */\n finalDecision: SequentialDecision\n /** Index (1-based) at which a non-`continue` decision first fired, or null. */\n decisionFiredAt: number | null\n /** True if any deltas were clipped to [-bound, bound]. */\n clipped: boolean\n}\n\n/**\n * Run the paired e-value sequence over an in-order delta stream.\n *\n * Use for *streaming* / interim analyses: pass the deltas you have so\n * far, get the verdict at every prefix length. 
The decision is\n * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`\n * fires, the verdict at later steps remains decisive (the e-value is a\n * non-negative martingale; once it crosses the threshold, it's crossed).\n */\nexport function pairedEvalueSequence(\n deltas: number[],\n opts: PairedEvalueOptions = {},\n): PairedEvalueSequence {\n const c = opts.bound ?? 1\n const alpha = opts.alpha ?? 0.05\n const initialShrink = opts.initialBetShrinkage ?? 0.5\n const rope = opts.rope ?? null\n if (c <= 0) throw new Error('pairedEvalueSequence: bound must be > 0')\n if (alpha <= 0 || alpha >= 1) throw new Error('pairedEvalueSequence: alpha must be in (0,1)')\n if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {\n throw new Error('pairedEvalueSequence: rope must satisfy low ≤ high')\n }\n\n const steps: PairedEvalueStep[] = []\n let clipped = false\n let evalue = 1\n let decisionFiredAt: number | null = null\n\n // Running statistics (using only D_{1..i-1} for the bet → predictable plug-in).\n let sum = 0\n let sumSq = 0\n let count = 0\n\n for (let i = 0; i < deltas.length; i++) {\n let d = deltas[i]!\n if (d < -c || d > c) {\n d = Math.max(-c, Math.min(c, d))\n clipped = true\n }\n\n // Predictable plug-in bet (positive λ tests for E[D] > 0; we run a two-sided\n // test by tracking the symmetric e-value via |bet|).\n // λ_i ∝ mean / (variance + bound^2). Shrink early to avoid overbetting.\n const muHat = count === 0 ? 0 : sum / count\n const varHat = count === 0 ? 
c * c : Math.max(1e-12, sumSq / count - muHat * muHat)\n const t = i + 1\n const shrink = initialShrink * Math.min(1, count / 32) // anneal toward 1\n let lambda = (muHat / (varHat + c * c)) * shrink\n // Clip to ensure 1 + λ·D > 0 for all |D| ≤ c (so the e-value stays non-negative).\n const lambdaMax = 0.99 / c\n if (lambda > lambdaMax) lambda = lambdaMax\n if (lambda < -lambdaMax) lambda = -lambdaMax\n\n evalue = evalue * (1 + lambda * d)\n if (!Number.isFinite(evalue) || evalue < 0) evalue = 0\n\n sum += d\n sumSq += d * d\n count += 1\n\n const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300))\n\n // Empirical Bernstein confidence sequence on the mean. Howard et al.\n // (2021), Theorem 4.4 with σ̂² the running sample variance and a\n // calibration constant tuned for two-sided coverage at level 1 - α.\n const cs = empiricalBernsteinCs(sum, sumSq, count, c, alpha)\n\n let decision: SequentialDecision = 'continue'\n if (rope && cs.low >= rope.low && cs.high <= rope.high) decision = 'equivalent'\n else if (evalue >= 2 / alpha && muHat > 0) decision = 'promote_now'\n else if (evalue >= 2 / alpha && muHat < 0) decision = 'reject_now'\n else if (rope && cs.high < rope.low) decision = 'reject_now'\n\n if (decision !== 'continue' && decisionFiredAt === null) decisionFiredAt = t\n\n steps.push({ t, delta: d, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision })\n }\n\n const finalDecision = steps.length === 0 ? 'continue' : steps[steps.length - 1]!.decision\n return { steps, finalDecision, decisionFiredAt, clipped }\n}\n\nexport interface InterimReleaseConfidenceInput {\n /**\n * One delta series per candidate (paired deltas vs comparator). 
Order\n * within a series is the order the campaigns were run.\n */\n deltaSeries: Array<{ candidateId: string; deltas: number[] }>\n alpha?: number\n bound?: number\n rope?: { low: number; high: number }\n}\n\nexport interface InterimReleaseConfidence {\n candidates: Array<{\n candidateId: string\n decision: SequentialDecision\n decisionFiredAt: number | null\n finalEvalue: number\n finalPValue: number\n pairs: number\n csLow: number\n csHigh: number\n }>\n /**\n * Campaign-level recommendation: pick the strongest 'promote_now', else\n * 'continue' if any candidate is still live, else 'reject_now' if every\n * candidate is dead, else 'equivalent'.\n */\n recommendation: { decision: SequentialDecision; candidateId: string | null }\n}\n\n/**\n * Run interim sequential analyses across many candidates at once,\n * preserving the time-uniform α guarantee for each candidate's series and\n * synthesising a campaign-level recommendation. Designed to be called on\n * every campaign tick — the recommendation is anytime-valid.\n */\nexport function evaluateInterimReleaseConfidence(\n input: InterimReleaseConfidenceInput,\n): InterimReleaseConfidence {\n const candidates = input.deltaSeries.map((s) => {\n const seq = pairedEvalueSequence(s.deltas, {\n alpha: input.alpha,\n bound: input.bound,\n rope: input.rope,\n })\n const last = seq.steps[seq.steps.length - 1]\n return {\n candidateId: s.candidateId,\n decision: seq.finalDecision,\n decisionFiredAt: seq.decisionFiredAt,\n finalEvalue: last?.evalue ?? 1,\n finalPValue: last?.pValue ?? 1,\n pairs: seq.steps.length,\n csLow: last?.csLow ?? Number.NEGATIVE_INFINITY,\n csHigh: last?.csHigh ?? 
Number.POSITIVE_INFINITY,\n }\n })\n\n const promote = candidates.find((c) => c.decision === 'promote_now')\n if (promote)\n return {\n candidates,\n recommendation: { decision: 'promote_now', candidateId: promote.candidateId },\n }\n const live = candidates.find((c) => c.decision === 'continue')\n if (live) return { candidates, recommendation: { decision: 'continue', candidateId: null } }\n const equiv = candidates.find((c) => c.decision === 'equivalent')\n if (equiv)\n return {\n candidates,\n recommendation: { decision: 'equivalent', candidateId: equiv.candidateId },\n }\n return { candidates, recommendation: { decision: 'reject_now', candidateId: null } }\n}\n\n// ── Internals ────────────────────────────────────────────────────────────\n\n/**\n * Empirical Bernstein confidence sequence on the mean of bounded variables.\n * Adapted from Howard et al. (2021) §4.4. Provides a time-uniform CI on\n * the running mean; valid at every stopping time.\n */\nfunction empiricalBernsteinCs(\n sum: number,\n sumSq: number,\n n: number,\n bound: number,\n alpha: number,\n): { low: number; high: number } {\n if (n === 0) return { low: -bound, high: bound }\n const mean = sum / n\n const variance = Math.max(0, sumSq / n - mean * mean)\n // Iterated-log calibration constant. The 1.7 exponent matches the\n // recommended choice in Howard et al. 
for two-sided coverage at level\n // 1 - α with mild log-corrections; tightening further requires a\n // tuned mixture and is out of scope.\n const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1)\n const radius = Math.sqrt((2 * variance * psi) / n) + (3 * bound * psi) / n\n return { low: mean - radius, high: mean + radius }\n}\n"],"mappings":";AA8FO,SAAS,qBACd,QACA,OAA4B,CAAC,GACP;AACtB,QAAM,IAAI,KAAK,SAAS;AACxB,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,gBAAgB,KAAK,uBAAuB;AAClD,QAAM,OAAO,KAAK,QAAQ;AAC1B,MAAI,KAAK,EAAG,OAAM,IAAI,MAAM,yCAAyC;AACrE,MAAI,SAAS,KAAK,SAAS,EAAG,OAAM,IAAI,MAAM,8CAA8C;AAC5F,MAAI,QAAQ,EAAE,OAAO,SAAS,KAAK,GAAG,KAAK,OAAO,SAAS,KAAK,IAAI,KAAK,KAAK,OAAO,KAAK,OAAO;AAC/F,UAAM,IAAI,MAAM,yDAAoD;AAAA,EACtE;AAEA,QAAM,QAA4B,CAAC;AACnC,MAAI,UAAU;AACd,MAAI,SAAS;AACb,MAAI,kBAAiC;AAGrC,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,QAAQ;AAEZ,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,QAAI,IAAI,OAAO,CAAC;AAChB,QAAI,IAAI,CAAC,KAAK,IAAI,GAAG;AACnB,UAAI,KAAK,IAAI,CAAC,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;AAC/B,gBAAU;AAAA,IACZ;AAKA,UAAM,QAAQ,UAAU,IAAI,IAAI,MAAM;AACtC,UAAM,SAAS,UAAU,IAAI,IAAI,IAAI,KAAK,IAAI,OAAO,QAAQ,QAAQ,QAAQ,KAAK;AAClF,UAAM,IAAI,IAAI;AACd,UAAM,SAAS,gBAAgB,KAAK,IAAI,GAAG,QAAQ,EAAE;AACrD,QAAI,SAAU,SAAS,SAAS,IAAI,KAAM;AAE1C,UAAM,YAAY,OAAO;AACzB,QAAI,SAAS,UAAW,UAAS;AACjC,QAAI,SAAS,CAAC,UAAW,UAAS,CAAC;AAEnC,aAAS,UAAU,IAAI,SAAS;AAChC,QAAI,CAAC,OAAO,SAAS,MAAM,KAAK,SAAS,EAAG,UAAS;AAErD,WAAO;AACP,aAAS,IAAI;AACb,aAAS;AAET,UAAM,SAAS,KAAK,IAAI,GAAG,IAAI,KAAK,IAAI,QAAQ,MAAM,CAAC;AAKvD,UAAM,KAAK,qBAAqB,KAAK,OAAO,OAAO,GAAG,KAAK;AAE3D,QAAI,WAA+B;AACnC,QAAI,QAAQ,GAAG,OAAO,KAAK,OAAO,GAAG,QAAQ,KAAK,KAAM,YAAW;AAAA,aAC1D,UAAU,IAAI,SAAS,QAAQ,EAAG,YAAW;AAAA,aAC7C,UAAU,IAAI,SAAS,QAAQ,EAAG,YAAW;AAAA,aAC7C,QAAQ,GAAG,OAAO,KAAK,IAAK,YAAW;AAEhD,QAAI,aAAa,cAAc,oBAAoB,KAAM,mBAAkB;AAE3E,UAAM,KAAK,EAAE,GAAG,OAAO,GAAG,QAAQ,QAAQ,OAAO,GAAG,KAAK,QAAQ,GAAG,MAAM,SAAS,CAAC;AAAA,EACtF;AAEA,QAAM,gBAAgB,MAAM,WAAW,IAAI,aAAa,MAAM,MAAM,SAAS,CAAC,EAAG;AACjF,SAAO,EAAE,OAAO,eAAe,iBAAiB,QAAQ;AAC1D
;AAsCO,SAAS,iCACd,OAC0B;AAC1B,QAAM,aAAa,MAAM,YAAY,IAAI,CAAC,MAAM;AAC9C,UAAM,MAAM,qBAAqB,EAAE,QAAQ;AAAA,MACzC,OAAO,MAAM;AAAA,MACb,OAAO,MAAM;AAAA,MACb,MAAM,MAAM;AAAA,IACd,CAAC;AACD,UAAM,OAAO,IAAI,MAAM,IAAI,MAAM,SAAS,CAAC;AAC3C,WAAO;AAAA,MACL,aAAa,EAAE;AAAA,MACf,UAAU,IAAI;AAAA,MACd,iBAAiB,IAAI;AAAA,MACrB,aAAa,MAAM,UAAU;AAAA,MAC7B,aAAa,MAAM,UAAU;AAAA,MAC7B,OAAO,IAAI,MAAM;AAAA,MACjB,OAAO,MAAM,SAAS,OAAO;AAAA,MAC7B,QAAQ,MAAM,UAAU,OAAO;AAAA,IACjC;AAAA,EACF,CAAC;AAED,QAAM,UAAU,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,aAAa;AACnE,MAAI;AACF,WAAO;AAAA,MACL;AAAA,MACA,gBAAgB,EAAE,UAAU,eAAe,aAAa,QAAQ,YAAY;AAAA,IAC9E;AACF,QAAM,OAAO,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,UAAU;AAC7D,MAAI,KAAM,QAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,YAAY,aAAa,KAAK,EAAE;AAC3F,QAAM,QAAQ,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,YAAY;AAChE,MAAI;AACF,WAAO;AAAA,MACL;AAAA,MACA,gBAAgB,EAAE,UAAU,cAAc,aAAa,MAAM,YAAY;AAAA,IAC3E;AACF,SAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,cAAc,aAAa,KAAK,EAAE;AACrF;AASA,SAAS,qBACP,KACA,OACA,GACA,OACA,OAC+B;AAC/B,MAAI,MAAM,EAAG,QAAO,EAAE,KAAK,CAAC,OAAO,MAAM,MAAM;AAC/C,QAAM,OAAO,MAAM;AACnB,QAAM,WAAW,KAAK,IAAI,GAAG,QAAQ,IAAI,OAAO,IAAI;AAKpD,QAAM,MAAM,KAAK,IAAI,IAAI,KAAK,IAAI,MAAM,KAAK,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC;AAClF,QAAM,SAAS,KAAK,KAAM,IAAI,WAAW,MAAO,CAAC,IAAK,IAAI,QAAQ,MAAO;AACzE,SAAO,EAAE,KAAK,OAAO,QAAQ,MAAM,OAAO,OAAO;AACnD;","names":[]}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import {
|
|
2
|
+
llmSpans
|
|
3
|
+
} from "./chunk-47X6LRCE.js";
|
|
4
|
+
|
|
5
|
+
// src/contamination-guard.ts
|
|
6
|
+
/**
 * Reports every scenario whose canary string occurs verbatim in `output`.
 *
 * @param output    Text to scan.
 * @param scenarios Scenarios to check; entries without a (non-empty) `canary`
 *   are ignored.
 * @returns One `{ scenarioId, canary, evidence }` record per leaking scenario,
 *   in scenario order; `evidence` is the excerpt around the first occurrence.
 */
function checkCanaries(output, scenarios) {
  return scenarios
    .filter((scenario) => scenario.canary && output.includes(scenario.canary))
    .map((scenario) => ({
      scenarioId: scenario.id,
      canary: scenario.canary,
      evidence: excerpt(output, scenario.canary)
    }));
}
|
|
16
|
+
/**
 * Checks whether `output` contains the scenario's forbidden content.
 *
 * The pattern is `scenario.forbiddenPattern`, falling back to
 * `scenario.canary` only when `forbiddenPattern` is null/undefined.
 *
 * @param output   Text produced for the scenario.
 * @param scenario Scenario carrying `id` and optionally `forbiddenPattern` /
 *   `canary`.
 * @returns `{ scenarioId, canary, evidence }` (with `canary` set to the
 *   pattern that was used) when the pattern matched, otherwise null — also
 *   when the scenario declares no usable pattern.
 */
function checkBehavioralCanary(output, scenario) {
  const forbidden = scenario.forbiddenPattern ?? scenario.canary;
  const matched = forbidden ? matchForbidden(output, forbidden) : null;
  return matched
    ? { scenarioId: scenario.id, canary: forbidden, evidence: excerpt(output, matched) }
    : null;
}
|
|
27
|
+
/**
 * Applies `checkBehavioralCanary` to many `{ scenario, output, runId? }`
 * cases and collects the hits.
 *
 * @param cases Cases to check, in order.
 * @returns One leak record per case whose output matched; each record
 *   carries the case's `runId` when one was supplied.
 */
function runBehavioralCanaries(cases) {
  return cases.flatMap((entry) => {
    const found = checkBehavioralCanary(entry.output, entry.scenario);
    return found ? [{ ...found, runId: entry.runId ?? found.runId }] : [];
  });
}
|
|
35
|
+
/**
 * Resolves a forbidden-pattern string to the substring of `output` it
 * matched.
 *
 * A pattern that `tryParseRegex` accepts is applied as a regular expression;
 * anything else is treated as a literal substring.
 *
 * @param output  Text to scan.
 * @param pattern Pattern string (regex notation or literal).
 * @returns The first NON-EMPTY match, or null when there is none. Zero-length
 *   matches are never reported as hits.
 */
function matchForbidden(output, pattern) {
  const re = tryParseRegex(pattern);
  if (!re) return output.includes(pattern) ? pattern : null;
  // Scan all matches rather than only the first one: a pattern that can match
  // the empty string at position 0 (e.g. /x*/) would otherwise hide a real,
  // non-empty match further into the text. `matchAll` needs the global flag
  // and works on a clone, so the parsed regex is left untouched.
  const scanner = re.global ? re : new RegExp(re.source, re.flags + "g");
  for (const m of output.matchAll(scanner)) {
    if (m[0].length > 0) return m[0];
  }
  return null;
}
|
|
43
|
+
/**
 * Parses `/body/flags` notation into a RegExp.
 *
 * @param pattern Candidate pattern string.
 * @returns The compiled RegExp, or null when the string is not in
 *   `/body/flags` form, has an empty body, uses a flag outside `gimsuy`, or
 *   fails to compile. A null result tells the caller to treat the pattern as
 *   a literal.
 */
function tryParseRegex(pattern) {
  if (pattern.length < 2 || pattern[0] !== "/") return null;
  const last = pattern.lastIndexOf("/");
  // `last === 0` means the leading slash is the only one.
  if (last <= 0) return null;
  const body = pattern.slice(1, last);
  // An empty body ("//", "//i") compiles to a regex that only ever matches
  // the empty string, which can never count as a hit; treat it as a literal.
  if (body.length === 0) return null;
  const flags = pattern.slice(last + 1);
  if (!/^[gimsuy]*$/.test(flags)) return null;
  try {
    return new RegExp(body, flags);
  } catch {
    // Invalid body or flag combination (e.g. a repeated flag).
    return null;
  }
}
|
|
56
|
+
/**
 * Scans the spans returned by `llmSpans(store)` for canary strings.
 *
 * @param store     Store handed to `llmSpans`; it is not queried at all when
 *   no scenario carries a canary.
 * @param scenarios Scenarios whose canaries should be looked for.
 * @returns Promise of one `{ scenarioId, canary, runId, evidence }` record per
 *   (span, scenario) pair whose span output contains the scenario's canary.
 *   A span without output is scanned as the empty string.
 */
async function canaryLeakView(store, scenarios) {
  const watched = scenarios.filter((scenario) => Boolean(scenario.canary));
  if (watched.length === 0) return [];
  const found = [];
  for (const span of await llmSpans(store)) {
    const text = span.output ?? "";
    watched.forEach((scenario) => {
      if (!scenario.canary || !text.includes(scenario.canary)) return;
      found.push({
        scenarioId: scenario.id,
        canary: scenario.canary,
        runId: span.runId,
        evidence: excerpt(text, scenario.canary)
      });
    });
  }
  return found;
}
|
|
76
|
+
/**
 * Gatekeeper for holdout scenarios: every read must declare a purpose and is
 * recorded in an access log.
 */
var HoldoutAuditor = class {
  scenarios;
  accessLog = [];
  /** @param scenarios Holdout scenarios, looked up by `id`. */
  constructor(scenarios) {
    this.scenarios = scenarios;
  }
  /**
   * Retrieves a holdout scenario for a declared purpose and logs the access.
   *
   * @param scenarioId Id of the scenario to fetch.
   * @param purpose    Must be 'evaluation' or 'debugging'.
   * @returns The matching scenario.
   * @throws Error when `purpose` is anything other than 'evaluation' or
   *   'debugging', or when no scenario has the given id. Failed calls are not
   *   logged.
   */
  get(scenarioId, purpose) {
    if (purpose !== "evaluation" && purpose !== "debugging") {
      throw new Error(
        `HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`
      );
    }
    const s = this.scenarios.find((x) => x.id === scenarioId);
    if (!s) throw new Error(`holdout scenario "${scenarioId}" not found`);
    this.accessLog.push({ scenarioId, purpose, at: Date.now() });
    return s;
  }
  /**
   * Returns a snapshot of the access log, oldest entry first.
   *
   * The array and its entries are copies, so callers cannot truncate or
   * rewrite the auditor's own record; call again to observe later accesses.
   */
  getAccessLog() {
    return this.accessLog.map((entry) => ({ ...entry }));
  }
};
|
|
98
|
+
/**
 * Cuts a short excerpt out of `source` around the first occurrence of
 * `needle`: up to 30 characters of context on each side, with an ellipsis
 * marking every side that was truncated.
 *
 * @param source Text to cut from.
 * @param needle Substring to centre the excerpt on.
 * @returns The excerpt, or "" when `needle` does not occur in `source`.
 */
function excerpt(source, needle) {
  const hitIndex = source.indexOf(needle);
  if (hitIndex === -1) return "";
  const from = Math.max(hitIndex - 30, 0);
  const to = Math.min(hitIndex + needle.length + 30, source.length);
  const head = from > 0 ? "\u2026" : "";
  const tail = to < source.length ? "\u2026" : "";
  return head + source.slice(from, to) + tail;
}
|
|
105
|
+
|
|
106
|
+
export {
|
|
107
|
+
checkCanaries,
|
|
108
|
+
checkBehavioralCanary,
|
|
109
|
+
runBehavioralCanaries,
|
|
110
|
+
canaryLeakView,
|
|
111
|
+
HoldoutAuditor
|
|
112
|
+
};
|
|
113
|
+
//# sourceMappingURL=chunk-SHTXZ4O2.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/contamination-guard.ts"],"sourcesContent":["/**\n * ContaminationGuard — ensures held-out scenarios don't leak into\n * training/prompt paths, and flags model memorization.\n *\n * Three probes:\n * 1. `checkCanaries(output, scenario)` — if the scenario carries a\n * canary token, it MUST NOT appear in the agent's output.\n * Canaries are strings that are statistically impossible to\n * reconstruct from the scenario description alone — so if they\n * echo back, the model memorized them.\n * 2. `canaryLeakView(store)` — cross-corpus view of every run whose\n * output contained a canary, with the offending scenario + run.\n * 3. `HoldoutAuditor` — wraps a Dataset and emits a structured error\n * on any code path that reads holdout scenarios but doesn't flag\n * `purpose: 'evaluation'`. Keeps engineers honest.\n */\n\nimport type { DatasetScenario } from './dataset'\nimport { llmSpans } from './trace/query'\nimport type { TraceStore } from './trace/store'\n\nexport interface CanaryLeak {\n scenarioId: string\n canary: string\n runId?: string\n evidence: string\n}\n\nexport function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[] {\n const leaks: CanaryLeak[] = []\n for (const s of scenarios) {\n if (!s.canary) continue\n if (output.includes(s.canary)) {\n leaks.push({ scenarioId: s.id, canary: s.canary, evidence: excerpt(output, s.canary) })\n }\n }\n return leaks\n}\n\n/**\n * Behavioral canary: tests the AGENT, not the eval grep.\n *\n * - `forbiddenPattern` PRESENT in output ⇒ AGENT EMITTED BAD CONTENT ⇒ LEAK.\n * - `forbiddenPattern` ABSENT from output ⇒ AGENT HELD THE LINE ⇒ PASS.\n *\n * Use when running known-bad-prompt scenarios against the agent under\n * test and you want to know if the agent misbehaved. 
The classical\n * {@link checkCanaries} / {@link import('./canary').runCanaries | runCanaries}\n * test whether the eval check fires when the bad output is forced\n * into the eval flow — different question, different answer.\n *\n * Pattern resolution order (first match wins):\n * 1. `scenario.forbiddenPattern` — if it parses as `/body/flags`,\n * treated as a regex; otherwise a literal substring.\n * 2. `scenario.canary` — literal substring fallback so the helper\n * works on existing scenario fixtures.\n *\n * Returns `null` when nothing forbidden was found OR the scenario\n * declared no pattern.\n */\nexport function checkBehavioralCanary(\n output: string,\n scenario: DatasetScenario,\n): CanaryLeak | null {\n const pattern = scenario.forbiddenPattern ?? scenario.canary\n if (!pattern) return null\n const hit = matchForbidden(output, pattern)\n if (!hit) return null\n return {\n scenarioId: scenario.id,\n canary: pattern,\n evidence: excerpt(output, hit),\n }\n}\n\n/**\n * Behavioral canary over many (scenario, output) pairs. Sibling to\n * {@link import('./canary').runCanaries | runCanaries} — same idea\n * (run-many → report) but the question being answered is \"did the\n * AGENT misbehave?\" rather than \"did the EVAL grep fire?\".\n *\n * Returns one `CanaryLeak` per pair where the agent's output\n * contained its scenario's `forbiddenPattern` (or `canary` fallback).\n */\nexport function runBehavioralCanaries(\n cases: Array<{ scenario: DatasetScenario; output: string; runId?: string }>,\n): CanaryLeak[] {\n const leaks: CanaryLeak[] = []\n for (const c of cases) {\n const leak = checkBehavioralCanary(c.output, c.scenario)\n if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId })\n }\n return leaks\n}\n\n/**\n * Resolve a forbidden-pattern string to the matched substring inside\n * `output`. 
`/body/flags` notation is interpreted as a regex; anything\n * else is a literal substring.\n */\nfunction matchForbidden(output: string, pattern: string): string | null {\n const re = tryParseRegex(pattern)\n if (re) {\n const m = output.match(re)\n return m && m[0].length > 0 ? m[0] : null\n }\n return output.includes(pattern) ? pattern : null\n}\n\nfunction tryParseRegex(pattern: string): RegExp | null {\n if (pattern.length < 2 || pattern[0] !== '/') return null\n const last = pattern.lastIndexOf('/')\n if (last <= 0) return null\n const body = pattern.slice(1, last)\n const flags = pattern.slice(last + 1)\n if (!/^[gimsuy]*$/.test(flags)) return null\n try {\n return new RegExp(body, flags)\n } catch {\n return null\n }\n}\n\n/**\n * Scan the LLM-output history in a corpus; returns every case where a\n * canary from a known scenario appeared in agent output. Pass the full\n * set of scenarios whose canaries you care about (typically the whole\n * held-out slice).\n */\nexport async function canaryLeakView(\n store: TraceStore,\n scenarios: DatasetScenario[],\n): Promise<CanaryLeak[]> {\n const targets = scenarios.filter((s) => !!s.canary)\n if (targets.length === 0) return []\n const spans = await llmSpans(store)\n const leaks: CanaryLeak[] = []\n for (const span of spans) {\n const output = span.output ?? ''\n for (const s of targets) {\n if (s.canary && output.includes(s.canary)) {\n leaks.push({\n scenarioId: s.id,\n canary: s.canary,\n runId: span.runId,\n evidence: excerpt(output, s.canary),\n })\n }\n }\n }\n return leaks\n}\n\nexport class HoldoutAuditor {\n private scenarios: DatasetScenario[]\n private accessLog: Array<{ scenarioId: string; purpose: string; at: number }> = []\n\n constructor(scenarios: DatasetScenario[]) {\n this.scenarios = scenarios\n }\n\n /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. 
*/\n get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario {\n if (purpose !== 'evaluation' && purpose !== 'debugging') {\n throw new Error(\n `HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`,\n )\n }\n const s = this.scenarios.find((x) => x.id === scenarioId)\n if (!s) throw new Error(`holdout scenario \"${scenarioId}\" not found`)\n this.accessLog.push({ scenarioId, purpose, at: Date.now() })\n return s\n }\n\n getAccessLog(): ReadonlyArray<{ scenarioId: string; purpose: string; at: number }> {\n return this.accessLog\n }\n}\n\nfunction excerpt(source: string, needle: string): string {\n const at = source.indexOf(needle)\n if (at < 0) return ''\n const start = Math.max(0, at - 30)\n const end = Math.min(source.length, at + needle.length + 30)\n return (start > 0 ? '…' : '') + source.slice(start, end) + (end < source.length ? '…' : '')\n}\n"],"mappings":";;;;;AA4BO,SAAS,cAAc,QAAgB,WAA4C;AACxF,QAAM,QAAsB,CAAC;AAC7B,aAAW,KAAK,WAAW;AACzB,QAAI,CAAC,EAAE,OAAQ;AACf,QAAI,OAAO,SAAS,EAAE,MAAM,GAAG;AAC7B,YAAM,KAAK,EAAE,YAAY,EAAE,IAAI,QAAQ,EAAE,QAAQ,UAAU,QAAQ,QAAQ,EAAE,MAAM,EAAE,CAAC;AAAA,IACxF;AAAA,EACF;AACA,SAAO;AACT;AAuBO,SAAS,sBACd,QACA,UACmB;AACnB,QAAM,UAAU,SAAS,oBAAoB,SAAS;AACtD,MAAI,CAAC,QAAS,QAAO;AACrB,QAAM,MAAM,eAAe,QAAQ,OAAO;AAC1C,MAAI,CAAC,IAAK,QAAO;AACjB,SAAO;AAAA,IACL,YAAY,SAAS;AAAA,IACrB,QAAQ;AAAA,IACR,UAAU,QAAQ,QAAQ,GAAG;AAAA,EAC/B;AACF;AAWO,SAAS,sBACd,OACc;AACd,QAAM,QAAsB,CAAC;AAC7B,aAAW,KAAK,OAAO;AACrB,UAAM,OAAO,sBAAsB,EAAE,QAAQ,EAAE,QAAQ;AACvD,QAAI,KAAM,OAAM,KAAK,EAAE,GAAG,MAAM,OAAO,EAAE,SAAS,KAAK,MAAM,CAAC;AAAA,EAChE;AACA,SAAO;AACT;AAOA,SAAS,eAAe,QAAgB,SAAgC;AACtE,QAAM,KAAK,cAAc,OAAO;AAChC,MAAI,IAAI;AACN,UAAM,IAAI,OAAO,MAAM,EAAE;AACzB,WAAO,KAAK,EAAE,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC,IAAI;AAAA,EACvC;AACA,SAAO,OAAO,SAAS,OAAO,IAAI,UAAU;AAC9C;AAEA,SAAS,cAAc,SAAgC;AACrD,MAAI,QAAQ,SAAS,KAAK,QAAQ,CAAC,MAAM,IAAK,QAAO;AACrD,QAAM,OAAO,QAAQ,YAAY,GAAG;AACpC,MAAI,QAAQ,EAAG,QAAO;AACtB,QAAM,OAAO,QAAQ,MAAM,GAAG,IAAI;AACl
C,QAAM,QAAQ,QAAQ,MAAM,OAAO,CAAC;AACpC,MAAI,CAAC,cAAc,KAAK,KAAK,EAAG,QAAO;AACvC,MAAI;AACF,WAAO,IAAI,OAAO,MAAM,KAAK;AAAA,EAC/B,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAQA,eAAsB,eACpB,OACA,WACuB;AACvB,QAAM,UAAU,UAAU,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,MAAM;AAClD,MAAI,QAAQ,WAAW,EAAG,QAAO,CAAC;AAClC,QAAM,QAAQ,MAAM,SAAS,KAAK;AAClC,QAAM,QAAsB,CAAC;AAC7B,aAAW,QAAQ,OAAO;AACxB,UAAM,SAAS,KAAK,UAAU;AAC9B,eAAW,KAAK,SAAS;AACvB,UAAI,EAAE,UAAU,OAAO,SAAS,EAAE,MAAM,GAAG;AACzC,cAAM,KAAK;AAAA,UACT,YAAY,EAAE;AAAA,UACd,QAAQ,EAAE;AAAA,UACV,OAAO,KAAK;AAAA,UACZ,UAAU,QAAQ,QAAQ,EAAE,MAAM;AAAA,QACpC,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAEO,IAAM,iBAAN,MAAqB;AAAA,EAClB;AAAA,EACA,YAAwE,CAAC;AAAA,EAEjF,YAAY,WAA8B;AACxC,SAAK,YAAY;AAAA,EACnB;AAAA;AAAA,EAGA,IAAI,YAAoB,SAAsD;AAC5E,QAAI,YAAY,gBAAgB,YAAY,aAAa;AACvD,YAAM,IAAI;AAAA,QACR,wEAAwE,OAAO;AAAA,MACjF;AAAA,IACF;AACA,UAAM,IAAI,KAAK,UAAU,KAAK,CAAC,MAAM,EAAE,OAAO,UAAU;AACxD,QAAI,CAAC,EAAG,OAAM,IAAI,MAAM,qBAAqB,UAAU,aAAa;AACpE,SAAK,UAAU,KAAK,EAAE,YAAY,SAAS,IAAI,KAAK,IAAI,EAAE,CAAC;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,eAAmF;AACjF,WAAO,KAAK;AAAA,EACd;AACF;AAEA,SAAS,QAAQ,QAAgB,QAAwB;AACvD,QAAM,KAAK,OAAO,QAAQ,MAAM;AAChC,MAAI,KAAK,EAAG,QAAO;AACnB,QAAM,QAAQ,KAAK,IAAI,GAAG,KAAK,EAAE;AACjC,QAAM,MAAM,KAAK,IAAI,OAAO,QAAQ,KAAK,OAAO,SAAS,EAAE;AAC3D,UAAQ,QAAQ,IAAI,WAAM,MAAM,OAAO,MAAM,OAAO,GAAG,KAAK,MAAM,OAAO,SAAS,WAAM;AAC1F;","names":[]}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
summaryTable
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-EGIPWXHL.js";
|
|
4
4
|
import {
|
|
5
5
|
VerificationError
|
|
6
6
|
} from "./chunk-QYJT52YW.js";
|
|
@@ -574,4 +574,4 @@ export {
|
|
|
574
574
|
judgeReplayGate,
|
|
575
575
|
renderReleaseReport
|
|
576
576
|
};
|
|
577
|
-
//# sourceMappingURL=chunk-
|
|
577
|
+
//# sourceMappingURL=chunk-UBQGWD3O.js.map
|
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,16 +1,24 @@
|
|
|
1
|
-
import { S as Scenario, M as MutableSurface,
|
|
2
|
-
export {
|
|
3
|
-
import { C as CampaignStorage,
|
|
4
|
-
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions,
|
|
5
|
-
export { D as DeploymentOutcome, F as FileSystemOutcomeStore,
|
|
6
|
-
import { HostedTenant } from '../
|
|
1
|
+
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-Dbj5gu8n.js';
|
|
2
|
+
export { f as CampaignAggregates, g as CampaignArtifactWriter, h as CampaignCellResult, i as CampaignCostMeter, j as CampaignResult, k as CampaignTraceWriter, C as CodeSurface, D as Dispatch, l as GateContext, m as GateDecision, n as GateResult, o as GenerationCandidate, p as GenerationRecord, r as JudgeDimension, J as JudgeScore, t as Mutator, O as OptimizerConfig, v as SessionScript } from '../types-Dbj5gu8n.js';
|
|
3
|
+
import { C as CampaignStorage, d as RunImprovementLoopResult } from '../run-improvement-loop-BPMjNKMJ.js';
|
|
4
|
+
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, g as composeGate, h as defaultProductionGate, i as evolutionaryDriver, j as fsCampaignStorage, k as gepaDriver, l as heldOutGate, m as inMemoryCampaignStorage, r as runCampaign, n as runEval, p as runImprovementLoop } from '../run-improvement-loop-BPMjNKMJ.js';
|
|
5
|
+
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
6
|
+
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-BRxz6qov.js';
|
|
7
|
+
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-BRxz6qov.js';
|
|
8
|
+
import { A as AnalystRegistry } from '../registry-8KAs18kY.js';
|
|
9
|
+
import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
|
|
10
|
+
import { R as RunRecord, a as RunSplitTag } from '../run-record-BGY6bHRh.js';
|
|
7
11
|
import '../llm-client-BXVRUZyX.js';
|
|
8
12
|
import '../errors-mje_cKOs.js';
|
|
9
13
|
import '../raw-provider-sink-C46HDghv.js';
|
|
10
14
|
import '../red-team-30II1T4o.js';
|
|
11
|
-
import '../dataset-BlwAtYYf.js';
|
|
12
15
|
import '../store-Db2Bv8Cf.js';
|
|
13
|
-
import '../
|
|
16
|
+
import '../summary-report-B7gNRX-r.js';
|
|
17
|
+
import '../failure-cluster-Cw65_5FY.js';
|
|
18
|
+
import '../judge-calibration-DilmB3Ml.js';
|
|
19
|
+
import '../store-CJbzDxZ2.js';
|
|
20
|
+
import '../types-DhqpAi_z.js';
|
|
21
|
+
import '@tangle-network/tcloud';
|
|
14
22
|
|
|
15
23
|
/**
|
|
16
24
|
* # `selfImprove()` — the LAND-tier one-shot.
|
|
@@ -188,6 +196,13 @@ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
|
|
|
188
196
|
durationMs: number;
|
|
189
197
|
/** Total cost across baseline + every generation. */
|
|
190
198
|
totalCostUsd: number;
|
|
199
|
+
/**
|
|
200
|
+
* Rigor packet: distributional summary, paired-bootstrap lift CI,
|
|
201
|
+
* judge stats, contamination check, recommendations. Wired through
|
|
202
|
+
* `analyzeRuns()` on the baseline + winner cells of the campaign.
|
|
203
|
+
* Hosted-tier dashboards render this as the v3-vs-v4 decision view.
|
|
204
|
+
*/
|
|
205
|
+
insight: InsightReport;
|
|
191
206
|
/**
|
|
192
207
|
* Raw substrate result for advanced inspection — full per-generation
|
|
193
208
|
* candidates, full campaign artifacts, all judge scores. Useful for
|
|
@@ -222,4 +237,186 @@ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
|
|
|
222
237
|
*/
|
|
223
238
|
declare function selfImprove<TScenario extends Scenario, TArtifact>(opts: SelfImproveOptions<TScenario, TArtifact>): Promise<SelfImproveResult<TScenario, TArtifact>>;
|
|
224
239
|
|
|
225
|
-
|
|
240
|
+
/**
|
|
241
|
+
* # `analyzeRuns()` — turn a set of agent runs into an actionable decision packet.
|
|
242
|
+
*
|
|
243
|
+
* Wires the substrate's statistical, calibration, clustering, Pareto, and
|
|
244
|
+
* release-confidence primitives into one `InsightReport`. Two top-level
|
|
245
|
+
* entry points use this function:
|
|
246
|
+
*
|
|
247
|
+
* - `selfImprove()` calls it on the campaign output to attach a packet
|
|
248
|
+
* to every run.
|
|
249
|
+
* - Consumers with observed `RunRecord[]` (production traces, gold
|
|
250
|
+
* corpora, approve/reject tables) call it directly via `analyzeRuns()`
|
|
251
|
+
* for analysis without a closed loop.
|
|
252
|
+
*
|
|
253
|
+
* Every section is opt-in based on what the input data supports — the
|
|
254
|
+
* function never invents signal. If runs carry no judge scores, `judges`
|
|
255
|
+
* is empty. If there's no baseline/candidate split, `lift` is undefined.
|
|
256
|
+
* If no `analyst` is wired, `failureClusters` is undefined.
|
|
257
|
+
*
|
|
258
|
+
* The `recommendations` array is the human-readable layer; everything
|
|
259
|
+
* else is the evidence backing each recommendation.
|
|
260
|
+
*/
|
|
261
|
+
|
|
262
|
+
interface AnalyzeRunsOptions {
|
|
263
|
+
/** The runs to analyze. */
|
|
264
|
+
runs: RunRecord[];
|
|
265
|
+
/** Which split to score against when reading composite from RunOutcome.
|
|
266
|
+
* Default: holdout when ANY run has a `holdoutScore`, else search. */
|
|
267
|
+
split?: 'search' | 'holdout' | 'auto';
|
|
268
|
+
/** Pairwise analysis configuration. When both `baselineCandidateId` and
|
|
269
|
+
* `candidateCandidateId` are present, lift is computed on paired
|
|
270
|
+
* (experimentId, seed) tuples shared between the two sides. */
|
|
271
|
+
baselineCandidateId?: string;
|
|
272
|
+
candidateCandidateId?: string;
|
|
273
|
+
/** Canary scenarios — checked against every run's raw output for
|
|
274
|
+
* holdout contamination. */
|
|
275
|
+
canaryScenarios?: DatasetScenario[];
|
|
276
|
+
/** Analyst registry for failure clustering. When omitted, the
|
|
277
|
+
* `failureClusters` section is left undefined. */
|
|
278
|
+
analyst?: AnalystRegistry;
|
|
279
|
+
/** Downstream outcome metric per run (e.g. engagement rate, approval
|
|
280
|
+
* rate, downstream pass rate). When present, the report includes
|
|
281
|
+
* `outcomeCorrelation` + a simple linear reward model fit. */
|
|
282
|
+
outcomeSignal?: {
|
|
283
|
+
metric: string;
|
|
284
|
+
valueByRunId: Record<string, number>;
|
|
285
|
+
};
|
|
286
|
+
/** Multi-rater feedback for inter-rater agreement. Each entry is one
|
|
287
|
+
* rater's score for one run. Two or more raters → kappa + disagreement
|
|
288
|
+
* triage list. */
|
|
289
|
+
raterScores?: Array<{
|
|
290
|
+
runId: string;
|
|
291
|
+
rater: string;
|
|
292
|
+
score: number;
|
|
293
|
+
}>;
|
|
294
|
+
/** Number of histogram bins for distributional summaries. Default 12. */
|
|
295
|
+
histogramBins?: number;
|
|
296
|
+
/** Decision threshold — the smallest composite lift the caller cares
|
|
297
|
+
* about. Used by the recommendations engine to call ship vs hold.
|
|
298
|
+
* Default 0.02. */
|
|
299
|
+
decisionThreshold?: number;
|
|
300
|
+
}
|
|
301
|
+
declare function analyzeRuns(opts: AnalyzeRunsOptions): Promise<InsightReport>;
|
|
302
|
+
|
|
303
|
+
/**
|
|
304
|
+
* # `intake/feedback-table` — multi-rater approve/reject corpus → `RunRecord[]`.
|
|
305
|
+
*
|
|
306
|
+
* The generic shape behind Obsidian's `#approved` / `#rejected` tags, a
|
|
307
|
+
* Google Sheet, a Postgres `feedback` table, or any CSV with ratings.
|
|
308
|
+
*
|
|
309
|
+
* Caller supplies one row per (run, rater) tuple plus per-run metadata; the
|
|
310
|
+
* adapter rolls them up into the substrate-canonical `RunRecord` shape so
|
|
311
|
+
* `analyzeRuns({ runs, raterScores })` can produce inter-rater agreement,
|
|
312
|
+
* disagreement triage, and downstream recommendations.
|
|
313
|
+
*
|
|
314
|
+
* Per-run `RunRecord.outcome.searchScore` is the rater-mean rating
|
|
315
|
+
* (normalised to 0..1 when scale is supplied); `outcome.raw` carries the
|
|
316
|
+
* per-rater scores keyed by rater id for downstream attribution.
|
|
317
|
+
*/
|
|
318
|
+
|
|
319
|
+
interface FeedbackTableRow {
|
|
320
|
+
/** Stable id for this run — the unit a rater scored. Drives pairing
|
|
321
|
+
* across analysis primitives. */
|
|
322
|
+
runId: string;
|
|
323
|
+
/** Identifier of the rater that produced this rating. */
|
|
324
|
+
rater: string;
|
|
325
|
+
/** The rating itself. Accepts boolean (approve/reject), 0..1 scalar,
|
|
326
|
+
* or any numeric scale — see `scale`. */
|
|
327
|
+
rating: number | boolean;
|
|
328
|
+
/** Optional metadata carried through to `RunRecord.outcome.raw` and the
|
|
329
|
+
* custom-shape metadata bag. */
|
|
330
|
+
metadata?: Record<string, unknown>;
|
|
331
|
+
}
|
|
332
|
+
interface FeedbackTableMeta {
|
|
333
|
+
runId: string;
|
|
334
|
+
/** When omitted, defaults to `'feedback-corpus'`. Used to group related
|
|
335
|
+
* runs in `analyzeRuns()` lift analysis. */
|
|
336
|
+
experimentId?: string;
|
|
337
|
+
/** When omitted, defaults to `runId` — each run is its own candidate. */
|
|
338
|
+
candidateId?: string;
|
|
339
|
+
/** Cost in USD, when available. Set to 0 when unknown — the consumer's
|
|
340
|
+
* cost analysis sections will collapse gracefully. */
|
|
341
|
+
costUsd?: number;
|
|
342
|
+
/** Wall-clock ms, when available. Defaults to 0. */
|
|
343
|
+
wallMs?: number;
|
|
344
|
+
/** Model identifier including snapshot. Default `unknown@unknown`. */
|
|
345
|
+
model?: string;
|
|
346
|
+
/** Optional sha256 of the prompt; default `'sha256:unknown'`. */
|
|
347
|
+
promptHash?: string;
|
|
348
|
+
/** Default `'sha256:unknown'`. */
|
|
349
|
+
configHash?: string;
|
|
350
|
+
/** Default `'unknown'`. */
|
|
351
|
+
commitSha?: string;
|
|
352
|
+
/** Default `'holdout'` — feedback corpora are by nature the holdout
|
|
353
|
+
* signal a closed-loop improvement aims at. */
|
|
354
|
+
splitTag?: RunSplitTag;
|
|
355
|
+
/** Free-form metadata available to consumers via the cast-out path on
|
|
356
|
+
* the resulting RunRecord. */
|
|
357
|
+
extras?: Record<string, unknown>;
|
|
358
|
+
}
|
|
359
|
+
interface FromFeedbackTableOptions {
|
|
360
|
+
/** Per-(run, rater) ratings. */
|
|
361
|
+
ratings: FeedbackTableRow[];
|
|
362
|
+
/** Per-run metadata. When a runId appears in `ratings` but not here, the
|
|
363
|
+
* adapter synthesises minimal metadata with defaults documented above. */
|
|
364
|
+
meta?: FeedbackTableMeta[];
|
|
365
|
+
/** Rating scale. Provide `{ min, max }` for non-0..1 numeric scales.
|
|
366
|
+
* Booleans are normalised: true → 1, false → 0. Default: assumes
|
|
367
|
+
* ratings are already 0..1. */
|
|
368
|
+
scale?: {
|
|
369
|
+
min: number;
|
|
370
|
+
max: number;
|
|
371
|
+
};
|
|
372
|
+
/** When true, the rater scores are emitted into `raterScores` (a sibling
|
|
373
|
+
* array `analyzeRuns()` accepts) instead of being averaged into the
|
|
374
|
+
* run's `outcome.searchScore`. Default `true` — preserves rater-level
|
|
375
|
+
* signal for inter-rater analysis. */
|
|
376
|
+
emitRaterScores?: boolean;
|
|
377
|
+
}
|
|
378
|
+
interface FromFeedbackTableResult {
|
|
379
|
+
runs: RunRecord[];
|
|
380
|
+
/** Rater-level scores ready to pass into `analyzeRuns({ raterScores })`
|
|
381
|
+
* for inter-rater agreement + disagreement triage. */
|
|
382
|
+
raterScores: Array<{
|
|
383
|
+
runId: string;
|
|
384
|
+
rater: string;
|
|
385
|
+
score: number;
|
|
386
|
+
}>;
|
|
387
|
+
}
|
|
388
|
+
declare function fromFeedbackTable(opts: FromFeedbackTableOptions): FromFeedbackTableResult;
|
|
389
|
+
|
|
390
|
+
/**
|
|
391
|
+
* # `intake/otel-spans` — OTel `TraceSpanEvent[]` → `RunRecord[]`.
|
|
392
|
+
*
|
|
393
|
+
* Turns an existing observability stream into the substrate-canonical
|
|
394
|
+
* `RunRecord` shape so consumers with logs but no eval discipline can
|
|
395
|
+
* call `analyzeRuns()` against their production traffic immediately.
|
|
396
|
+
*
|
|
397
|
+
* Pivot rule: spans are grouped by `tangle.runId` (the same attribute the
|
|
398
|
+
* hosted-tier wire format uses) or, when absent, by `traceId`. One group
|
|
399
|
+
* becomes one `RunRecord`. The root span (no `parentSpanId`) supplies:
|
|
400
|
+
*
|
|
401
|
+
* - `runId` (the group key)
|
|
402
|
+
* - `wallMs` from `endTimeUnixNano - startTimeUnixNano`
|
|
403
|
+
* - `model` from `gen_ai.request.model` / `llm.model` / `tangle.model`
|
|
404
|
+
* - cost from `cost.usd` / `gen_ai.usage.cost_usd` / `tangle.cost.usd`
|
|
405
|
+
* - token usage from `gen_ai.usage.{input,output}_tokens`
|
|
406
|
+
* - `outcome.searchScore` from `tangle.score` / `eval.score` when
|
|
407
|
+
* present; `outcome.raw` collects every numeric attribute.
|
|
408
|
+
*
|
|
409
|
+
* Spans that ERRORed (`status.code === 'ERROR'`) populate `failureMode`
|
|
410
|
+
* with their `name` so `analyzeRuns()`'s failure clustering sees them.
|
|
411
|
+
*/
|
|
412
|
+
|
|
413
|
+
interface FromOtelSpansOptions {
|
|
414
|
+
spans: TraceSpanEvent[];
|
|
415
|
+
/** Default split tag for synthesized records. Defaults to `'holdout'`. */
|
|
416
|
+
defaultSplit?: RunSplitTag;
|
|
417
|
+
/** Default `experimentId` when not present on any span. */
|
|
418
|
+
experimentId?: string;
|
|
419
|
+
}
|
|
420
|
+
declare function fromOtelSpans(opts: FromOtelSpansOptions): RunRecord[];
|
|
421
|
+
|
|
422
|
+
export { type AnalyzeRunsOptions, CampaignStorage, DispatchContext, type FeedbackTableMeta, type FeedbackTableRow, type FromFeedbackTableOptions, type FromFeedbackTableResult, type FromOtelSpansOptions, Gate, ImprovementDriver, InsightReport, JudgeConfig, MutableSurface, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, analyzeRuns, fromFeedbackTable, fromOtelSpans, selfImprove };
|