@tangle-network/agent-eval 0.49.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/langchain.d.ts +1 -1
  3. package/dist/adapters/otel.d.ts +8 -2
  4. package/dist/campaign/index.d.ts +3 -3
  5. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  6. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  7. package/dist/chunk-EGIPWXHL.js.map +1 -0
  8. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  9. package/dist/chunk-FQK2CCIM.js.map +1 -0
  10. package/dist/chunk-MAZ26DC7.js +99 -0
  11. package/dist/chunk-MAZ26DC7.js.map +1 -0
  12. package/dist/chunk-SHTXZ4O2.js +113 -0
  13. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  14. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  15. package/dist/contract/index.d.ts +206 -9
  16. package/dist/contract/index.js +751 -3
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/governance/index.d.ts +1 -1
  19. package/dist/hosted/index.d.ts +8 -192
  20. package/dist/hosted/index.js +1 -1
  21. package/dist/index-BRxz6qov.d.ts +409 -0
  22. package/dist/index.d.ts +18 -462
  23. package/dist/index.js +14 -106
  24. package/dist/index.js.map +1 -1
  25. package/dist/meta-eval/index.d.ts +3 -3
  26. package/dist/openapi.json +1 -1
  27. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  28. package/dist/registry-8KAs18kY.d.ts +457 -0
  29. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  30. package/dist/reporting.d.ts +6 -4
  31. package/dist/reporting.js +6 -4
  32. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  33. package/dist/rl.d.ts +9 -8
  34. package/dist/rl.js +3 -2
  35. package/dist/rl.js.map +1 -1
  36. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  37. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  38. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  39. package/dist/store-CJbzDxZ2.d.ts +220 -0
  40. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  41. package/dist/traces.d.ts +3 -220
  42. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  43. package/dist/types-DhqpAi_z.d.ts +296 -0
  44. package/package.json +1 -1
  45. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  46. package/dist/chunk-OYI6RZJK.js.map +0 -1
  47. /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  48. /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
@@ -14,20 +14,538 @@ import {
14
14
  } from "../chunk-J3EIOI3O.js";
15
15
  import {
16
16
  createHostedClient
17
- } from "../chunk-OYI6RZJK.js";
17
+ } from "../chunk-FQK2CCIM.js";
18
+ import {
19
+ checkCanaries
20
+ } from "../chunk-SHTXZ4O2.js";
18
21
  import "../chunk-N4SBKEPJ.js";
19
22
  import "../chunk-YV7J7X5N.js";
20
23
  import {
21
24
  FileSystemOutcomeStore,
22
25
  InMemoryOutcomeStore
23
26
  } from "../chunk-3RF76KTD.js";
24
- import "../chunk-WP7SY7AI.js";
27
+ import {
28
+ paretoChart
29
+ } from "../chunk-EGIPWXHL.js";
30
+ import {
31
+ cohensD,
32
+ pairedBootstrap,
33
+ pairedMde,
34
+ pairedTTest,
35
+ requiredSampleSize
36
+ } from "../chunk-WP7SY7AI.js";
25
37
  import "../chunk-GGE4NNQT.js";
38
+ import "../chunk-47X6LRCE.js";
39
+ import "../chunk-5BKGXME7.js";
40
+ import "../chunk-VSMTAMNK.js";
26
41
  import "../chunk-VXNVVBZO.js";
27
42
  import "../chunk-PC4UYEBM.js";
28
43
  import "../chunk-QYJT52YW.js";
29
44
  import "../chunk-NSBPE2FW.js";
30
45
 
46
+ // src/contract/analyze-runs.ts
47
/**
 * Produce the insight report for a corpus of run records.
 *
 * Always computes the composite distribution, per-dimension distributions,
 * cost/pareto view, per-judge summary, lift, release scorecard and
 * recommendations. Inter-rater agreement, failure clusters, contamination
 * and outcome correlation are only computed when the matching option
 * (`raterScores`, `analyst`, `canaryScenarios`, `outcomeSignal`) is supplied;
 * otherwise those fields are `undefined`.
 *
 * @param {object} opts Options; `runs` is required. `histogramBins` defaults
 *   to 12, `decisionThreshold` to 0.02 and `split` to "auto".
 * @returns {Promise<object>} The assembled report.
 */
async function analyzeRuns(opts) {
  const { runs } = opts;
  const bins = opts.histogramBins ?? 12;
  const threshold = opts.decisionThreshold ?? 0.02;
  const split = resolveSplit(runs, opts.split ?? "auto");

  const compositeValues = runs
    .map((run) => compositeOf(run, split))
    .filter((value) => Number.isFinite(value));
  const composite = distributionOf(compositeValues, bins);
  const perDimension = computePerDimension(runs, bins);

  const costValues = runs
    .map((run) => run.costUsd)
    .filter((value) => Number.isFinite(value));
  const costQuality = {
    cost: distributionOf(costValues, bins),
    pareto: paretoChart(runs, { split })
  };

  const judges = computeJudgeInsights(runs);

  let interRater;
  if (opts.raterScores) {
    interRater = computeInterRater(opts.raterScores);
  }

  const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, split);

  let failureClusters;
  if (opts.analyst) {
    failureClusters = await computeFailureClusters(runs, opts.analyst, split);
  }

  let contamination;
  if (opts.canaryScenarios) {
    contamination = computeContamination(runs, opts.canaryScenarios);
  }

  let outcomeCorrelation;
  if (opts.outcomeSignal) {
    outcomeCorrelation = computeOutcomeCorrelation(runs, opts.outcomeSignal, split);
  }

  const release = buildReleaseScorecard(composite, lift, contamination);
  const recommendations = buildRecommendations({
    composite,
    judges,
    interRater,
    lift,
    failureClusters,
    contamination,
    outcomeCorrelation,
    threshold
  });

  return {
    n: runs.length,
    composite,
    perDimension,
    costQuality,
    judges,
    interRater,
    lift,
    failureClusters,
    contamination,
    outcomeCorrelation,
    release,
    recommendations
  };
}
93
/**
 * Resolve the split to analyse.
 *
 * An explicit preference is returned untouched. For "auto", the result is
 * "holdout" when at least one run has a finite `outcome.holdoutScore`, and
 * "search" otherwise.
 *
 * @param {Array<object>} runs Run records.
 * @param {string} pref Requested split, or "auto".
 * @returns {string} The split to use.
 */
function resolveSplit(runs, pref) {
  if (pref !== "auto") {
    return pref;
  }
  const hasFiniteHoldout = (run) => Number.isFinite(run.outcome.holdoutScore);
  if (runs.some(hasFiniteHoldout)) {
    return "holdout";
  }
  return "search";
}
98
/**
 * Pick the composite score of a run for the given split.
 *
 * The score of the requested split is preferred; when it is not a finite
 * number the other split's score is used instead. If neither is finite the
 * result is NaN.
 *
 * @param {object} run Run record with an `outcome` object.
 * @param {string} split "holdout" selects `holdoutScore` first; anything
 *   else selects `searchScore` first.
 * @returns {number} A finite score, or NaN.
 */
function compositeOf(run, split) {
  const { holdoutScore, searchScore } = run.outcome;
  const preferenceOrder = split === "holdout"
    ? [holdoutScore, searchScore]
    : [searchScore, holdoutScore];
  for (const score of preferenceOrder) {
    if (Number.isFinite(score)) {
      return score;
    }
  }
  return Number.NaN;
}
104
/**
 * Summarise a list of numbers.
 *
 * @param {number[]} values Input values (not mutated; a sorted copy is used).
 * @param {number} bins Number of histogram bins to request.
 * @returns {{n:number, mean:number, p50:number, p95:number, stddev:number,
 *   min:number, max:number, histogram:Array}} Summary statistics. `stddev` is
 *   the population standard deviation (divides by n). For an empty input
 *   every statistic is 0 and the histogram is empty.
 */
function distributionOf(values, bins) {
  const n = values.length;
  if (n === 0) {
    return { n: 0, mean: 0, p50: 0, p95: 0, stddev: 0, min: 0, max: 0, histogram: [] };
  }

  const ascending = Array.from(values).sort((x, y) => x - y);

  let total = 0;
  for (const value of ascending) {
    total = total + value;
  }
  const average = total / n;

  let squaredDeviation = 0;
  for (const value of ascending) {
    squaredDeviation = squaredDeviation + (value - average) ** 2;
  }
  const stddev = Math.sqrt(squaredDeviation / n);

  return {
    n,
    mean: average,
    p50: percentile(ascending, 0.5),
    p95: percentile(ascending, 0.95),
    stddev,
    min: ascending[0],
    max: ascending[n - 1],
    histogram: histogram(ascending, bins)
  };
}
133
/**
 * Linearly interpolated quantile of an ascending-sorted array.
 *
 * @param {number[]} sorted Values in ascending order.
 * @param {number} q Quantile in [0, 1], e.g. 0.5 for the median.
 * @returns {number} The interpolated value; 0 for an empty array.
 */
function percentile(sorted, q) {
  const count = sorted.length;
  if (count === 0) {
    return 0;
  }
  if (count === 1) {
    return sorted[0];
  }
  const position = (count - 1) * q;
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) {
    // The position lands exactly on an element; no interpolation needed.
    return sorted[below];
  }
  const fraction = position - below;
  return sorted[below] * (1 - fraction) + sorted[above] * fraction;
}
143
/**
 * Bucket ascending-sorted values into equal-width bins spanning [min, max].
 *
 * The requested bin count is truncated to an integer. A fractional count
 * would otherwise create more bins than the index clamp allows for and index
 * the result with a non-integer, and a NaN count would index with NaN; both
 * used to throw.
 *
 * @param {number[]} sorted Values in ascending order.
 * @param {number} bins Requested number of bins.
 * @returns {Array<{lo:number, hi:number, count:number}>} One entry per bin.
 *   Empty when there are no values or the bin count is not a finite number
 *   of at least 1. A single bin is returned when all values are equal.
 */
function histogram(sorted, bins) {
  const binCount = Math.floor(bins);
  if (sorted.length === 0 || !Number.isFinite(binCount) || binCount < 1) return [];
  const min = sorted[0];
  const max = sorted[sorted.length - 1];
  if (min === max) return [{ lo: min, hi: max, count: sorted.length }];
  const width = (max - min) / binCount;
  const out = [];
  for (let i = 0; i < binCount; i++) {
    const lo = min + i * width;
    // Pin the last upper edge to max so rounding never leaves a gap.
    const hi = i === binCount - 1 ? max : lo + width;
    out.push({ lo, hi, count: 0 });
  }
  for (const v of sorted) {
    // The maximum value would compute index === binCount; clamp it into the last bin.
    const idx = Math.min(binCount - 1, Math.floor((v - min) / width));
    out[idx].count++;
  }
  return out;
}
161
/**
 * Build one distribution per judged dimension.
 *
 * Collects every finite value found in `run.outcome.judgeScores.perDimMean`
 * across all runs and summarises each dimension with `distributionOf`.
 * Runs without judge scores are ignored.
 *
 * @param {Array<object>} runs Run records.
 * @param {number} bins Histogram bin count passed to `distributionOf`.
 * @returns {Object<string, object>} Distribution keyed by dimension name.
 */
function computePerDimension(runs, bins) {
  const valuesByDimension = /* @__PURE__ */ new Map();
  for (const { outcome } of runs) {
    const perDimMean = outcome.judgeScores?.perDimMean ?? {};
    for (const dimension of Object.keys(perDimMean)) {
      const value = perDimMean[dimension];
      if (!Number.isFinite(value)) continue;
      if (!valuesByDimension.has(dimension)) {
        valuesByDimension.set(dimension, []);
      }
      valuesByDimension.get(dimension).push(value);
    }
  }
  const out = {};
  valuesByDimension.forEach((values, dimension) => {
    out[dimension] = distributionOf(values, bins);
  });
  return out;
}
177
/**
 * Summarise how each judge scored the corpus.
 *
 * For every run, a judge's finite dimension scores are averaged into one
 * per-run mean; the result reports how many runs each judge scored (`n`)
 * and the mean of those per-run means (`meanScore`). Judges with no finite
 * dimension score on a run contribute nothing for that run.
 *
 * @param {Array<object>} runs Run records; reads `outcome.judgeScores.perJudge`.
 * @returns {Object<string, {n:number, meanScore:number}>} Summary per judge id.
 */
function computeJudgeInsights(runs) {
  // judgeId -> running count and sum of per-run means, in first-seen order.
  const totals = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const perJudge = run.outcome.judgeScores?.perJudge;
    if (!perJudge) continue;
    for (const judgeId of Object.keys(perJudge)) {
      const finite = Object.values(perJudge[judgeId]).filter((v) => Number.isFinite(v));
      if (finite.length === 0) continue;
      let dimensionTotal = 0;
      for (const v of finite) dimensionTotal += v;
      const entry = totals.get(judgeId) ?? { n: 0, sum: 0 };
      entry.n += 1;
      entry.sum += dimensionTotal / finite.length;
      totals.set(judgeId, entry);
    }
  }
  const out = {};
  for (const [judgeId, { n, sum }] of totals) {
    out[judgeId] = { n, meanScore: sum / n };
  }
  return out;
}
200
/**
 * Measure agreement between raters on the runs they all scored.
 *
 * Only ratings with a finite score are considered. The rater set is built
 * from those same usable ratings: a rater whose scores are all non-finite
 * is never present in any run's list, so counting them would make every run
 * "not jointly rated" and discard the whole report.
 *
 * NOTE(review): the `kappa` field is the mean of the pairwise Pearson
 * correlations computed below, not Cohen's/Fleiss' kappa. The field name is
 * kept for compatibility.
 *
 * @param {Array<{runId:string, rater:string, score:number}>} ratings Flat rating rows.
 * @returns {{raters:number, jointlyRated:number, kappa:number,
 *   perPair:Object<string, number>, disagreementCases:Array}|undefined}
 *   `undefined` when fewer than two raters have usable scores or no run was
 *   scored by all of them. `disagreementCases` holds at most 20 runs, ordered
 *   by descending score range.
 */
function computeInterRater(ratings) {
  const byRun = /* @__PURE__ */ new Map();
  const raters = /* @__PURE__ */ new Set();
  for (const r of ratings) {
    if (!Number.isFinite(r.score)) continue;
    raters.add(r.rater);
    const list = byRun.get(r.runId) ?? [];
    list.push({ rater: r.rater, score: r.score });
    byRun.set(r.runId, list);
  }
  if (raters.size < 2) return void 0;

  // Per jointly-rated run: rater -> first score given (matches a first-match lookup).
  const firstScores = /* @__PURE__ */ new Map();
  const jointlyRated = [];
  for (const [runId, ratersForRun] of byRun) {
    const scoreByRater = /* @__PURE__ */ new Map();
    for (const entry of ratersForRun) {
      if (!scoreByRater.has(entry.rater)) scoreByRater.set(entry.rater, entry.score);
    }
    // Every rater seen here is in `raters`, so equal sizes means all raters scored this run.
    if (scoreByRater.size === raters.size) {
      jointlyRated.push(runId);
      firstScores.set(runId, scoreByRater);
    }
  }
  if (jointlyRated.length === 0) return void 0;

  const raterList = [...raters].sort();
  const perPair = {};
  for (let i = 0; i < raterList.length; i++) {
    for (let j = i + 1; j < raterList.length; j++) {
      const a = raterList[i];
      const b = raterList[j];
      const aScores = [];
      const bScores = [];
      for (const runId of jointlyRated) {
        const scoreByRater = firstScores.get(runId);
        aScores.push(scoreByRater.get(a));
        bScores.push(scoreByRater.get(b));
      }
      perPair[`${a}::${b}`] = pearson(aScores, bScores);
    }
  }

  const pairValues = Object.values(perPair);
  const kappa = pairValues.length === 0 ? 0 : pairValues.reduce((s, v) => s + v, 0) / pairValues.length;

  const disagreementCases = jointlyRated.map((runId) => {
    const ratersForRun = byRun.get(runId);
    const scores = ratersForRun.map((r) => r.score);
    const range = Math.max(...scores) - Math.min(...scores);
    return { runId, ratings: ratersForRun, range };
  }).sort((a, b) => b.range - a.range).slice(0, 20);

  return {
    raters: raters.size,
    jointlyRated: jointlyRated.length,
    kappa,
    perPair,
    disagreementCases
  };
}
253
/**
 * Pearson correlation coefficient of two equally long samples.
 *
 * @param {number[]} a First sample.
 * @param {number[]} b Second sample.
 * @returns {number} The correlation; 0 when the lengths differ, the samples
 *   are empty, or either sample has zero variance.
 */
function pearson(a, b) {
  const n = a.length;
  if (n !== b.length || n === 0) {
    return 0;
  }

  let sumA = 0;
  for (const v of a) sumA += v;
  let sumB = 0;
  for (const v of b) sumB += v;
  const meanA = sumA / n;
  const meanB = sumB / n;

  let covariance = 0;
  let spreadA = 0;
  let spreadB = 0;
  a.forEach((valueA, i) => {
    const da = valueA - meanA;
    const db = b[i] - meanB;
    covariance += da * db;
    spreadA += da * da;
    spreadB += db * db;
  });

  const denom = Math.sqrt(spreadA * spreadB);
  if (denom === 0) {
    return 0;
  }
  return covariance / denom;
}
271
/**
 * Estimate the quality lift of a candidate over a baseline.
 *
 * Role resolution when ids are missing (requires exactly two distinct
 * `candidateId`s in the corpus, otherwise `undefined` is returned):
 *  - if exactly one id was supplied and it is one of the two, it keeps its
 *    role and the other id takes the remaining role;
 *  - otherwise the id with the lower mean composite becomes the baseline.
 *    Only finite composites are averaged: a single NaN would make the mean
 *    NaN, and every comparison against NaN is false, which used to pick the
 *    baseline arbitrarily.
 *
 * Runs are paired on `pairingKey`. When no key-based pair with two finite
 * composites exists, runs are paired by position instead.
 *
 * @param {Array<object>} runs Run records.
 * @param {string|undefined} baselineId Candidate id of the baseline.
 * @param {string|undefined} candidateId Candidate id of the challenger.
 * @param {string} split Split passed through to `compositeOf`.
 * @returns {object|undefined} `{ baselineMean, candidateMean, delta, ci95,
 *   pValue, n, cohensD, mde, requiredN }`, or `undefined` when roles cannot
 *   be resolved or no finite pair exists.
 */
function computeLift(runs, baselineId, candidateId, split) {
  let bId = baselineId;
  let cId = candidateId;
  if (!bId || !cId) {
    const ids = [...new Set(runs.map((r) => r.candidateId))];
    if (ids.length !== 2) return void 0;
    const [idA, idB] = ids;
    if (bId && ids.includes(bId)) {
      cId = bId === idA ? idB : idA;
    } else if (cId && ids.includes(cId)) {
      bId = cId === idA ? idB : idA;
    } else {
      const finiteMean = (id) => mean(
        runs
          .filter((r) => r.candidateId === id)
          .map((r) => compositeOf(r, split))
          .filter((v) => Number.isFinite(v))
      );
      const meanA = finiteMean(idA);
      const meanB = finiteMean(idB);
      bId = meanA <= meanB ? idA : idB;
      cId = meanA <= meanB ? idB : idA;
    }
  }

  const baseline = runs.filter((r) => r.candidateId === bId);
  const candidate = runs.filter((r) => r.candidateId === cId);
  if (baseline.length === 0 || candidate.length === 0) return void 0;

  const baselineByKey = new Map(baseline.map((r) => [pairingKey(r), r]));
  const pairedBaseline = [];
  const pairedCandidate = [];
  let usedKeyPairing = false;
  for (const cand of candidate) {
    const b = baselineByKey.get(pairingKey(cand));
    if (!b) continue;
    const bC = compositeOf(b, split);
    const cC = compositeOf(cand, split);
    if (Number.isFinite(bC) && Number.isFinite(cC)) {
      pairedBaseline.push(bC);
      pairedCandidate.push(cC);
      usedKeyPairing = true;
    }
  }

  if (!usedKeyPairing) {
    // No usable key match: fall back to pairing the i-th baseline with the i-th candidate.
    const n = Math.min(baseline.length, candidate.length);
    for (let i = 0; i < n; i++) {
      const bC = compositeOf(baseline[i], split);
      const cC = compositeOf(candidate[i], split);
      if (Number.isFinite(bC) && Number.isFinite(cC)) {
        pairedBaseline.push(bC);
        pairedCandidate.push(cC);
      }
    }
  }
  if (pairedBaseline.length === 0) return void 0;

  const baselineMean = mean(pairedBaseline);
  const candidateMean = mean(pairedCandidate);
  const delta = candidateMean - baselineMean;
  const bootstrap = pairedBootstrap(pairedBaseline, pairedCandidate, {
    confidence: 0.95,
    resamples: 2e3,
    statistic: "mean"
  });
  const tTest = pairedTTest(pairedBaseline, pairedCandidate);
  const d = cohensD(pairedBaseline, pairedCandidate);
  const mde = pairedMde({ nPaired: pairedBaseline.length, power: 0.8, alpha: 0.05 });
  const requiredN = requiredSampleSize({
    // Floor the effect at a tiny positive value so a zero delta is never passed.
    effect: Math.max(Math.abs(delta), 1e-6),
    power: 0.8,
    alpha: 0.05
  });

  return {
    baselineMean,
    candidateMean,
    delta,
    ci95: [bootstrap.low, bootstrap.high],
    pValue: tTest.p,
    n: pairedBaseline.length,
    cohensD: d,
    mde,
    requiredN
  };
}
342
/**
 * Key used to match a baseline run with its candidate counterpart.
 *
 * @param {{experimentId:*, seed:*}} r Run record.
 * @returns {string} `experimentId` and `seed` joined by "::".
 */
function pairingKey(r) {
  const { experimentId, seed } = r;
  return `${experimentId}::${seed}`;
}
345
/**
 * Arithmetic mean.
 *
 * @param {number[]} arr Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean(arr) {
  if (arr.length === 0) {
    return 0;
  }
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / arr.length;
}
348
/**
 * Group failed runs into clusters using an analyst.
 *
 * A run counts as failed when its composite is below 0.5 or it carries a
 * `failureMode`. Each failed run is sent to `analyst.run`; every finding
 * files the run under `finding.area`, falling back to `finding.analyst_id`
 * and then "unclassified". If the analyst (or reading its findings) throws,
 * the run is filed under "analyst-error" instead of aborting the analysis.
 *
 * Membership is tracked separately from the exemplar list. `share` used to
 * be derived from the exemplars, which are capped at five, so any cluster
 * with more than five runs was under-reported, and a run with several
 * findings in one area was listed more than once.
 *
 * @param {Array<object>} runs Run records.
 * @param {{run: Function}} analyst Called sequentially as
 *   `run(runId, { kind: "run-record", run })`.
 * @param {string} split Split passed through to `compositeOf`.
 * @returns {Promise<{clusters:Array<{id:string, name:string, share:number,
 *   exemplars:string[]}>, totalFailures:number}>} Clusters sorted by
 *   descending share; `share` is the fraction of failed runs in the cluster
 *   and `exemplars` holds at most five run ids.
 */
async function computeFailureClusters(runs, analyst, split) {
  const MAX_EXEMPLARS = 5;
  const failed = runs.filter((r) => compositeOf(r, split) < 0.5 || r.failureMode !== void 0);
  if (failed.length === 0) return { clusters: [], totalFailures: 0 };

  const clusters = /* @__PURE__ */ new Map();
  const fileUnder = (key, runId) => {
    let cluster = clusters.get(key);
    if (!cluster) {
      cluster = { members: /* @__PURE__ */ new Set(), exemplars: [] };
      clusters.set(key, cluster);
    }
    if (cluster.members.has(runId)) return;
    cluster.members.add(runId);
    if (cluster.exemplars.length < MAX_EXEMPLARS) cluster.exemplars.push(runId);
  };

  for (const run of failed) {
    try {
      const result = await analyst.run(run.runId, {
        kind: "run-record",
        run
      });
      for (const finding of result.findings) {
        fileUnder(finding.area || finding.analyst_id || "unclassified", run.runId);
      }
    } catch {
      // Deliberate best effort: an analyst failure becomes its own cluster.
      fileUnder("analyst-error", run.runId);
    }
  }

  const clusterList = [...clusters.entries()].map(([id, cluster]) => ({
    id,
    name: id,
    share: cluster.members.size / failed.length,
    exemplars: cluster.exemplars
  }));
  clusterList.sort((a, b) => b.share - a.share);
  return { clusters: clusterList, totalFailures: failed.length };
}
379
/**
 * Scan run outputs for canary leaks.
 *
 * Each run's output text (see `stringifyOutput`) is passed to
 * `checkCanaries`; runs without output text are skipped. Every reported leak
 * becomes one detail row.
 *
 * @param {Array<object>} runs Run records.
 * @param {*} canaries Canary definitions, forwarded to `checkCanaries`.
 * @returns {{leaks:number, holdoutAuditPassed:boolean, details:Array<{runId:*,
 *   canary:*, matched:*}>}} Leak count, whether it is zero, and the rows.
 */
function computeContamination(runs, canaries) {
  const details = [];
  runs.forEach((run) => {
    const text = stringifyOutput(run);
    if (!text) {
      return;
    }
    for (const leak of checkCanaries(text, canaries)) {
      details.push({ runId: run.runId, canary: leak.canary, matched: leak.evidence });
    }
  });
  // One detail row is appended per leak, so the row count is the leak count.
  const leaks = details.length;
  return { leaks, holdoutAuditPassed: leaks === 0, details };
}
393
/**
 * Extract the textual output of a run from its metadata.
 *
 * @param {{metadata?: object}} run Run record.
 * @returns {string|undefined} `metadata.output` when it is a string,
 *   otherwise `metadata.text` when it is a string, otherwise `undefined`.
 */
function stringifyOutput(run) {
  const { metadata } = run;
  for (const field of ["output", "text"]) {
    const candidate = metadata?.[field];
    if (typeof candidate === "string") {
      return candidate;
    }
  }
  return void 0;
}
399
/**
 * Relate composite scores to an external outcome metric.
 *
 * Uses the runs that have both a finite outcome value in
 * `outcome.valueByRunId` and a finite composite score. Reports Pearson and
 * Spearman correlation plus a least-squares line predicting the outcome
 * from the composite (intercept, slope and R²).
 *
 * @param {Array<object>} runs Run records.
 * @param {{metric:*, valueByRunId:Object<string, number>}} outcome Outcome signal.
 * @param {string} split Split passed through to `compositeOf`.
 * @returns {object|undefined} `undefined` when fewer than three usable pairs exist.
 */
function computeOutcomeCorrelation(runs, outcome, split) {
  const xs = [];
  const ys = [];
  for (const run of runs) {
    const observed = outcome.valueByRunId[run.runId];
    // Number.isFinite is false for undefined, so missing values are skipped too.
    if (!Number.isFinite(observed)) continue;
    const score = compositeOf(run, split);
    if (!Number.isFinite(score)) continue;
    xs.push(score);
    ys.push(observed);
  }
  const n = xs.length;
  if (n < 3) return void 0;

  const pearsonR = pearson(xs, ys);
  const spearmanRho = spearman(xs, ys);
  const meanX = mean(xs);
  const meanY = mean(ys);

  let crossSum = 0;
  let xSpread = 0;
  xs.forEach((x, i) => {
    crossSum += (x - meanX) * (ys[i] - meanY);
    xSpread += (x - meanX) ** 2;
  });
  const slope = xSpread === 0 ? 0 : crossSum / xSpread;
  const intercept = meanY - slope * meanX;

  let ssTot = 0;
  let ssRes = 0;
  ys.forEach((y) => {
    ssTot = ssTot + (y - meanY) ** 2;
  });
  ys.forEach((y, i) => {
    ssRes = ssRes + (y - (intercept + slope * xs[i])) ** 2;
  });
  const r2 = ssTot === 0 ? 0 : 1 - ssRes / ssTot;

  return {
    metric: outcome.metric,
    n,
    pearson: pearsonR,
    spearman: spearmanRho,
    rewardModel: { intercept, slope, r2 }
  };
}
434
/**
 * Spearman rank correlation: the Pearson correlation of the two samples'
 * ranks.
 *
 * @param {number[]} a First sample.
 * @param {number[]} b Second sample.
 * @returns {number} The correlation; 0 when the lengths differ or the
 *   samples are empty.
 */
function spearman(a, b) {
  const comparable = a.length === b.length && a.length !== 0;
  if (!comparable) {
    return 0;
  }
  const ranksA = rank(a);
  const ranksB = rank(b);
  return pearson(ranksA, ranksB);
}
438
/**
 * Assign 1-based ranks to values, giving tied values the average of the
 * ranks they span.
 *
 * @param {number[]} arr Values to rank (not mutated).
 * @returns {number[]} Rank of each value, aligned with the input positions.
 */
function rank(arr) {
  // Positions of the input ordered by ascending value.
  const order = arr.map((_, position) => position).sort((p, q) => arr[p] - arr[q]);
  const ranks = new Array(arr.length).fill(0);
  let start = 0;
  while (start < order.length) {
    // Extend [start, end] over the run of values equal to the one at `start`.
    let end = start;
    while (end + 1 < order.length && arr[order[end + 1]] === arr[order[start]]) {
      end += 1;
    }
    // Average of the 1-based ranks start+1 .. end+1.
    const tiedRank = (start + end + 2) / 2;
    for (let k = start; k <= end; k += 1) {
      ranks[order[k]] = tiedRank;
    }
    start = end + 1;
  }
  return ranks;
}
452
/**
 * Build the three-axis release scorecard.
 *
 * Axes, in order:
 *  - "quality-lift": pass when no lift is available or the CI lower bound is
 *    above 0; warn when only the point estimate is positive; fail otherwise.
 *  - "contamination": fail as soon as one canary leak is reported.
 *  - "composite-distribution": pass at mean >= 0.5, warn at mean >= 0.3,
 *    fail below that.
 * The overall status is the worst axis status.
 *
 * @param {object} composite Composite distribution (`mean`, `p50`, `p95`, `n`).
 * @param {object|undefined} lift Lift result (`delta`, `ci95`, `n`).
 * @param {object|undefined} contamination Contamination result (`leaks`).
 * @returns {{status:string, axes:Array<{name:string, status:string,
 *   detail:string}>, issues:Array}} The scorecard; `issues` is always empty here.
 */
function buildReleaseScorecard(composite, lift, contamination) {
  let liftStatus;
  if (lift === void 0 || lift.ci95[0] > 0) {
    liftStatus = "pass";
  } else if (lift.delta > 0) {
    liftStatus = "warn";
  } else {
    liftStatus = "fail";
  }
  let liftDetail = "no baseline/candidate pair available";
  if (lift) {
    liftDetail = `delta=${lift.delta.toFixed(3)}, CI95=[${lift.ci95[0].toFixed(3)}, ${lift.ci95[1].toFixed(3)}], n=${lift.n}`;
  }

  const clean = contamination === void 0 || contamination.leaks === 0;
  let contaminationDetail = "no canaries supplied";
  if (contamination) {
    contaminationDetail = `${contamination.leaks} canary leak(s)`;
  }

  let compositeStatus = "fail";
  if (composite.mean >= 0.5) {
    compositeStatus = "pass";
  } else if (composite.mean >= 0.3) {
    compositeStatus = "warn";
  }

  const axes = [
    { name: "quality-lift", status: liftStatus, detail: liftDetail },
    { name: "contamination", status: clean ? "pass" : "fail", detail: contaminationDetail },
    {
      name: "composite-distribution",
      status: compositeStatus,
      detail: `mean=${composite.mean.toFixed(3)}, p50=${composite.p50.toFixed(3)}, p95=${composite.p95.toFixed(3)} over n=${composite.n}`
    }
  ];

  const observed = new Set(axes.map((axis) => axis.status));
  let status = "pass";
  if (observed.has("fail")) {
    status = "fail";
  } else if (observed.has("warn")) {
    status = "warn";
  }

  return { status, axes, issues: [] };
}
478
/**
 * Turn the computed insights into an ordered list of recommendations.
 *
 * Emitted in this order, each only when its input is present:
 *  1. lift verdict — "ship" when the CI lower bound exceeds the threshold,
 *     "expand-corpus" when the CI straddles it, "hold" otherwise;
 *  2. "fix" when at least one canary leaked;
 *  3. "recalibrate" when inter-rater agreement is below 0.5;
 *  4. "investigate" for the largest failure cluster;
 *  5. "recalibrate" when |Spearman| against the outcome metric is below 0.3.
 *
 * @param {object} ctx Insights plus `threshold` (the decision threshold).
 * @returns {Array<{priority:string, kind:string, title:string, detail:string,
 *   evidencePath:string}>} Recommendations.
 */
function buildRecommendations(ctx) {
  const { lift, threshold, contamination, interRater, failureClusters, outcomeCorrelation } = ctx;
  const out = [];

  if (lift) {
    const [ciLow, ciHigh] = [lift.ci95[0], lift.ci95[1]];
    if (ciLow > threshold) {
      out.push({
        priority: "critical",
        kind: "ship",
        title: `Ship \u2014 lift ${lift.delta.toFixed(3)} (95% CI ${ciLow.toFixed(3)}..${ciHigh.toFixed(3)})`,
        detail: `Holdout lift exceeds threshold ${threshold} with 95% bootstrap confidence (n=${lift.n}, p=${lift.pValue.toFixed(4)}, d=${lift.cohensD.toFixed(2)}).`,
        evidencePath: "lift"
      });
    } else if (ciLow <= threshold && ciHigh > threshold) {
      out.push({
        priority: "high",
        kind: "expand-corpus",
        title: `Inconclusive \u2014 need ~${lift.requiredN} paired runs (have ${lift.n}) at current effect size`,
        detail: `CI straddles threshold. Current MDE at 80% power is ${lift.mde.toFixed(3)}; observed delta is ${lift.delta.toFixed(3)}.`,
        evidencePath: "lift"
      });
    } else {
      out.push({
        priority: "critical",
        kind: "hold",
        title: `Hold \u2014 lift CI lower bound ${ciLow.toFixed(3)} is at or below threshold ${threshold}`,
        detail: `Bootstrap CI provides no statistical evidence the candidate is better. Consider tightening the mutation or expanding the holdout.`,
        evidencePath: "lift"
      });
    }
  }

  if (contamination && contamination.leaks > 0) {
    const plural = contamination.leaks === 1 ? "" : "s";
    out.push({
      priority: "critical",
      kind: "fix",
      title: `${contamination.leaks} canary leak${plural} detected`,
      detail: `Holdout integrity is compromised. The lift number is unreliable until you investigate.`,
      evidencePath: "contamination"
    });
  }

  if (interRater && interRater.kappa < 0.5) {
    out.push({
      priority: "high",
      kind: "recalibrate",
      title: `Inter-rater agreement \u03BA=${interRater.kappa.toFixed(2)} is below 0.5`,
      detail: `Raters disagree on what 'good' looks like. Top disagreement cases listed in interRater.disagreementCases \u2014 consider a triage meeting or refining the rubric.`,
      evidencePath: "interRater"
    });
  }

  if (failureClusters && failureClusters.clusters.length > 0) {
    const [top] = failureClusters.clusters;
    const percent = (top.share * 100).toFixed(0);
    out.push({
      priority: "high",
      kind: "investigate",
      title: `Top failure cluster: ${top.name} (${percent}% of failures)`,
      detail: `${failureClusters.totalFailures} runs failed. The largest cluster groups ${top.exemplars.length} exemplars under '${top.name}'.`,
      evidencePath: "failureClusters.clusters[0]"
    });
  }

  if (outcomeCorrelation && Math.abs(outcomeCorrelation.spearman) < 0.3) {
    const { metric, spearman: rho } = outcomeCorrelation;
    out.push({
      priority: "medium",
      kind: "recalibrate",
      title: `Judge scores decoupled from ${metric} (Spearman \u03C1=${rho.toFixed(2)})`,
      detail: `Your judges score what they were trained to score, but it isn't predicting downstream ${metric}. Consider retraining the judge against ${metric} as the gold signal.`,
      evidencePath: "outcomeCorrelation"
    });
  }

  return out;
}
548
+
31
549
  // src/contract/self-improve.ts
32
550
  function splitTrainHoldout(scenarios, fraction) {
33
551
  function hash(s) {
@@ -139,6 +657,14 @@ async function selfImprove(opts) {
139
657
  (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
140
658
  0
141
659
  );
660
+ const insight = await analyzeRuns({
661
+ runs: [
662
+ ...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir),
663
+ ...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir)
664
+ ],
665
+ baselineCandidateId: "baseline",
666
+ candidateCandidateId: "winner"
667
+ });
142
668
  const summary = {
143
669
  baseline,
144
670
  winner: {
@@ -150,6 +676,7 @@ async function selfImprove(opts) {
150
676
  generationsExplored: result.generations.length,
151
677
  durationMs: Date.now() - startedAt,
152
678
  totalCostUsd: totalCost,
679
+ insight,
153
680
  raw: result
154
681
  };
155
682
  if (opts.hostedTenant) {
@@ -212,7 +739,8 @@ async function shipEvalRunToHosted(tenant, opts, summary, raw, runDir) {
212
739
  gateDecision: summary.gateDecision,
213
740
  holdoutLift: summary.lift,
214
741
  totalCostUsd: summary.totalCostUsd,
215
- totalDurationMs: summary.durationMs
742
+ totalDurationMs: summary.durationMs,
743
+ insightReport: summary.insight
216
744
  };
217
745
  await client.ingestEvalRun(event);
218
746
  }
@@ -228,12 +756,232 @@ function hashString(s) {
228
756
  }
229
757
  return h.toString(16).padStart(8, "0");
230
758
  }
759
/**
 * Convert campaign cells into run records that `analyzeRuns` can consume.
 *
 * For every cell the per-judge dimension scores are copied, each dimension is
 * averaged across judges (finite values only), and the judges' finite
 * `composite` scores are averaged into the record's `holdoutScore` (0 when
 * no judge reported one).
 *
 * Changes from the previous revision:
 *  - The synthesized seed uses 2^32 as the rep multiplier. The scenario hash
 *    is a 32-bit unsigned value, so the old multiplier of 1e6 let different
 *    (rep, scenario) pairs produce the same seed.
 *  - Dimension sums are kept in a Map, so dimension names that match
 *    inherited object properties (e.g. "constructor") are handled correctly.
 *  - A cell without `judgeScores`, or a judge without `dimensions`, is
 *    treated as having none instead of throwing.
 *
 * @param {Array<object>} cells Campaign cells (`cellId`, `scenarioId`, `rep`,
 *   `judgeScores`, `durationMs`, `costUsd`, optional `error`).
 * @param {string} candidateId Candidate id stamped on every record.
 * @param {string} runId Used as the `experimentId` and as the run id prefix.
 * @returns {Array<object>} One run record per cell, tagged as "holdout".
 */
function cellsToRunRecords(cells, candidateId, runId) {
  return cells.map((cell) => {
    const perJudge = {};
    const dimTotals = /* @__PURE__ */ new Map();
    let compositeSum = 0;
    let compositeCount = 0;
    for (const [judgeId, score] of Object.entries(cell.judgeScores ?? {})) {
      const dimensions = score.dimensions ?? {};
      perJudge[judgeId] = { ...dimensions };
      for (const [dim, value] of Object.entries(dimensions)) {
        if (!Number.isFinite(value)) continue;
        const totals = dimTotals.get(dim) ?? { sum: 0, n: 0 };
        totals.sum += value;
        totals.n += 1;
        dimTotals.set(dim, totals);
      }
      if (Number.isFinite(score.composite)) {
        compositeSum += score.composite;
        compositeCount += 1;
      }
    }
    const perDimMean = {};
    for (const [dim, { sum, n }] of dimTotals) {
      perDimMean[dim] = sum / n;
    }
    const composite = compositeCount === 0 ? 0 : compositeSum / compositeCount;
    const judgeScores = {
      perJudge,
      perDimMean,
      composite
    };
    // 32-bit unsigned fold of the first six characters of the scenario hash.
    const scenarioHash = hashString(cell.scenarioId)
      .slice(0, 6)
      .split("")
      .reduce((acc, ch) => (acc * 31 + ch.charCodeAt(0)) >>> 0, 0);
    return {
      runId: `${runId}::${candidateId}::${cell.cellId}`,
      experimentId: runId,
      candidateId,
      // Pair on (scenarioId, rep) — analyzeRuns pairs on (experimentId, seed).
      // rep occupies the bits above the 32-bit scenario hash, so two
      // different (rep, hash) combinations never yield the same seed.
      seed: cell.rep * 2 ** 32 + scenarioHash,
      model: "campaign-cell",
      promptHash: "sha256:cell",
      configHash: "sha256:cell",
      commitSha: "cell",
      wallMs: cell.durationMs,
      costUsd: cell.costUsd,
      tokenUsage: { input: 0, output: 0 },
      outcome: {
        holdoutScore: composite,
        raw: {},
        judgeScores
      },
      splitTag: "holdout",
      ...cell.error ? { failureMode: cell.error } : {}
    };
  });
}
813
+
814
+ // src/contract/intake/feedback-table.ts
815
/**
 * Convert a table of human ratings into run records.
 *
 * Ratings are normalised first: booleans become 1/0; numbers are rescaled
 * from `scale.min..scale.max` to 0..1 when a scale with distinct bounds is
 * given and are used as-is otherwise; anything non-finite is dropped. Each
 * run's score is the mean of its usable ratings, stored as
 * `outcome.holdoutScore`. Runs with no usable rating are omitted.
 *
 * @param {object} opts
 * @param {Array<{runId:string, rater:string, rating:number|boolean}>} opts.ratings Rating rows.
 * @param {Array<object>} [opts.meta] Optional per-run metadata, matched on `runId`.
 * @param {{min:number, max:number}} [opts.scale] Rating scale bounds.
 * @param {boolean} [opts.emitRaterScores=true] Whether to also return the flat
 *   normalised rating rows.
 * @returns {{runs:Array<object>, raterScores:Array<{runId:string, rater:string,
 *   score:number}>}} Run records plus (optionally) the rating rows.
 */
function fromFeedbackTable(opts) {
  const { ratings, meta = [], scale, emitRaterScores = true } = opts;

  const metaByRun = /* @__PURE__ */ new Map();
  for (const entry of meta) {
    metaByRun.set(entry.runId, entry);
  }

  function toUnitScore(rating) {
    if (typeof rating === "boolean") {
      return rating ? 1 : 0;
    }
    if (!Number.isFinite(rating)) {
      return Number.NaN;
    }
    if (!scale || scale.max === scale.min) {
      return rating;
    }
    return (rating - scale.min) / (scale.max - scale.min);
  }

  // runId -> rating rows, in first-seen order.
  const rowsByRun = /* @__PURE__ */ new Map();
  for (const row of ratings) {
    if (!rowsByRun.has(row.runId)) {
      rowsByRun.set(row.runId, []);
    }
    rowsByRun.get(row.runId).push(row);
  }

  const runs = [];
  const raterScores = [];
  rowsByRun.forEach((rowsForRun, runId) => {
    const usable = [];
    for (const row of rowsForRun) {
      const score = toUnitScore(row.rating);
      if (Number.isFinite(score)) {
        usable.push({ rater: row.rater, score });
      }
    }
    if (usable.length === 0) {
      return;
    }

    let total = 0;
    for (const { score } of usable) {
      total += score;
    }
    const meanScore = total / usable.length;
    const runMeta = metaByRun.get(runId) ?? { runId };

    const judgeScores = {
      perJudge: Object.fromEntries(usable.map(({ rater, score }) => [rater, { rating: score }])),
      perDimMean: { rating: meanScore },
      composite: meanScore
    };

    runs.push({
      runId,
      experimentId: runMeta.experimentId ?? "feedback-corpus",
      candidateId: runMeta.candidateId ?? runId,
      seed: 0,
      model: runMeta.model ?? "unknown@unknown",
      promptHash: runMeta.promptHash ?? "sha256:unknown",
      configHash: runMeta.configHash ?? "sha256:unknown",
      commitSha: runMeta.commitSha ?? "unknown",
      wallMs: runMeta.wallMs ?? 0,
      costUsd: runMeta.costUsd ?? 0,
      tokenUsage: { input: 0, output: 0 },
      outcome: {
        // The feedback score is written to `holdoutScore`, the field the
        // holdout split reads.
        holdoutScore: meanScore,
        raw: Object.fromEntries(usable.map(({ rater, score }) => [`rater:${rater}`, score])),
        judgeScores
      },
      splitTag: runMeta.splitTag ?? "holdout"
    });

    if (emitRaterScores) {
      for (const { rater, score } of usable) {
        raterScores.push({ runId, rater, score });
      }
    }
  });

  return { runs, raterScores };
}
873
+
874
+ // src/contract/intake/otel-spans.ts
875
// Attribute names searched, in order of preference, when reading each field
// from span attributes. The lists are frozen because they are shared lookup
// tables that must not be altered at runtime.
const SCORE_KEYS = Object.freeze(["tangle.score", "eval.score", "score"]);
const MODEL_KEYS = Object.freeze(["tangle.model", "gen_ai.request.model", "llm.model", "model"]);
const COST_KEYS = Object.freeze(["tangle.cost.usd", "gen_ai.usage.cost_usd", "cost.usd", "cost"]);
const INPUT_TOKEN_KEYS = Object.freeze(["gen_ai.usage.input_tokens", "tangle.tokens.in", "tokens.in"]);
const OUTPUT_TOKEN_KEYS = Object.freeze(["gen_ai.usage.output_tokens", "tangle.tokens.out", "tokens.out"]);
const PROMPT_HASH_KEYS = Object.freeze(["tangle.prompt_hash", "prompt.hash"]);
const CONFIG_HASH_KEYS = Object.freeze(["tangle.config_hash", "config.hash"]);
882
/**
 * Wall-clock duration of a span in milliseconds.
 *
 * Accepts timestamps given as numbers, numeric strings or BigInts. Returns 0
 * when the duration cannot be computed (missing or non-numeric timestamps)
 * or would be negative.
 */
function otelSpanWallMs(span) {
  const start = span.startTimeUnixNano;
  const end = span.endTimeUnixNano;
  // Subtract BigInts before converting so nanosecond precision is not lost first.
  const elapsedNs = typeof start === "bigint" && typeof end === "bigint"
    ? Number(end - start)
    : Number(end) - Number(start);
  return Number.isFinite(elapsedNs) ? Math.max(0, elapsedNs / 1e6) : 0;
}

/**
 * Convert OpenTelemetry-style spans into run records, one per span group.
 *
 * Spans are grouped by `groupSpans`; each group's root span supplies the
 * duration, candidate id and commit sha, while model, cost, token counts,
 * hashes and score are taken from the first span in the group that carries
 * one of the known attribute keys. A group containing a span with status
 * code "ERROR" gets that span's name as `failureMode`.
 *
 * `wallMs` is computed defensively: missing timestamps used to yield NaN and
 * BigInt timestamps used to throw when divided by a number.
 *
 * @param {object} opts
 * @param {Array<object>} opts.spans Spans to convert.
 * @param {string} [opts.defaultSplit="holdout"] Split tag; "search" stores the
 *   score as `searchScore`, anything else as `holdoutScore`.
 * @param {string} [opts.experimentId="otel-corpus"] Experiment id for every record.
 * @returns {Array<object>} Run records.
 */
function fromOtelSpans(opts) {
  const { spans, defaultSplit = "holdout", experimentId = "otel-corpus" } = opts;
  const grouped = groupSpans(spans);
  const runs = [];
  for (const [groupKey, spansInGroup] of grouped) {
    const root = findRoot(spansInGroup);
    if (!root) continue;
    const wallMs = otelSpanWallMs(root);
    const model = readAttrString(spansInGroup, MODEL_KEYS) ?? "unknown@unknown";
    const costUsd = readAttrNumber(spansInGroup, COST_KEYS) ?? 0;
    const inputTokens = readAttrNumber(spansInGroup, INPUT_TOKEN_KEYS) ?? 0;
    const outputTokens = readAttrNumber(spansInGroup, OUTPUT_TOKEN_KEYS) ?? 0;
    const promptHash = readAttrString(spansInGroup, PROMPT_HASH_KEYS) ?? "sha256:unknown";
    const configHash = readAttrString(spansInGroup, CONFIG_HASH_KEYS) ?? "sha256:unknown";
    const score = readAttrNumber(spansInGroup, SCORE_KEYS);
    const rawNumeric = collectNumericAttrs(spansInGroup);
    const tokenUsage = {
      input: inputTokens,
      output: outputTokens
    };
    const judgeScores = score !== void 0 ? {
      perJudge: { "otel-derived": { score } },
      perDimMean: { score },
      composite: score
    } : void 0;
    const errorSpan = spansInGroup.find((s) => s.status?.code === "ERROR");
    const outcome = {
      ...defaultSplit === "search" ? { searchScore: score } : { holdoutScore: score },
      raw: rawNumeric,
      ...judgeScores ? { judgeScores } : {}
    };
    runs.push({
      runId: groupKey,
      experimentId,
      candidateId: root.attributes["tangle.candidateId"] ?? "otel-default",
      seed: 0,
      model,
      promptHash,
      configHash,
      commitSha: root.attributes["tangle.commit_sha"] ?? "unknown",
      wallMs,
      costUsd,
      tokenUsage,
      outcome,
      splitTag: defaultSplit,
      ...errorSpan ? { failureMode: errorSpan.name } : {}
    });
  }
  return runs;
}
932
/**
 * Group spans into runs.
 *
 * The group key is the span's run id when it has one, and its `traceId`
 * otherwise. The run id is looked up as a top-level `"tangle.runId"`
 * property first (the original behaviour) and then in
 * `span.attributes["tangle.runId"]`, which is where every other `tangle.*`
 * value in this module is read from; previously a run id set as an
 * attribute was ignored.
 *
 * @param {Array<object>} spans Spans to group.
 * @returns {Map<*, Array<object>>} Spans per group key, in first-seen order.
 */
function groupSpans(spans) {
  const groups = /* @__PURE__ */ new Map();
  for (const span of spans) {
    const key = span["tangle.runId"] ?? span.attributes?.["tangle.runId"] ?? span.traceId;
    const members = groups.get(key) ?? [];
    members.push(span);
    groups.set(key, members);
  }
  return groups;
}
942
/**
 * Pick the root span of a group.
 *
 * @param {Array<object>} group Spans belonging to one run.
 * @returns {object|undefined} The first span without a `parentSpanId`; when
 *   every span has a parent, the first span of the group (`undefined` for an
 *   empty group).
 */
function findRoot(group) {
  for (const span of group) {
    if (!span.parentSpanId) {
      return span;
    }
  }
  return group[0];
}
945
/**
 * Read the first non-empty string attribute found under any of the keys.
 *
 * Spans are searched in order; within a span the keys are tried in order.
 *
 * @param {Array<{attributes: object}>} spans Spans to search.
 * @param {string[]} keys Attribute names, most preferred first.
 * @returns {string|undefined} The value, or `undefined` when none matches.
 */
function readAttrString(spans, keys) {
  const isUsable = (value) => typeof value === "string" && value.length > 0;
  for (const { attributes } of spans) {
    const match = keys.map((key) => attributes[key]).find(isUsable);
    if (match !== void 0) {
      return match;
    }
  }
  return void 0;
}
954
/**
 * Read the first finite numeric attribute found under any of the keys.
 *
 * Spans are searched in order; within a span the keys are tried in order.
 * Numeric strings are parsed. Empty or whitespace-only strings are skipped:
 * `Number("")` evaluates to 0, so a blank attribute used to be reported as a
 * genuine 0 (for example a score of zero) instead of "no value".
 *
 * @param {Array<{attributes: object}>} spans Spans to search.
 * @param {string[]} keys Attribute names, most preferred first.
 * @returns {number|undefined} The value, or `undefined` when none matches.
 */
function readAttrNumber(spans, keys) {
  for (const span of spans) {
    for (const key of keys) {
      const v = span.attributes[key];
      if (typeof v === "number" && Number.isFinite(v)) return v;
      if (typeof v === "string" && v.trim() !== "") {
        const parsed = Number(v);
        if (Number.isFinite(parsed)) return parsed;
      }
    }
  }
  return void 0;
}
967
/**
 * Gather every finite numeric attribute across a group of spans.
 *
 * When the same attribute name appears on several spans, the value from the
 * last span wins.
 *
 * @param {Array<{attributes: object}>} spans Spans to scan.
 * @returns {Object<string, number>} Attribute name to numeric value.
 */
function collectNumericAttrs(spans) {
  const raw = {};
  spans.forEach(({ attributes }) => {
    for (const name of Object.keys(attributes)) {
      const value = attributes[name];
      if (typeof value === "number" && Number.isFinite(value)) {
        raw[name] = value;
      }
    }
  });
  return raw;
}
231
976
  export {
232
977
  FileSystemOutcomeStore,
233
978
  InMemoryOutcomeStore,
979
+ analyzeRuns,
234
980
  composeGate,
235
981
  defaultProductionGate,
236
982
  evolutionaryDriver,
983
+ fromFeedbackTable,
984
+ fromOtelSpans,
237
985
  fsCampaignStorage,
238
986
  gepaDriver,
239
987
  heldOutGate,