@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +2 -2
  3. package/dist/adapters/langchain.d.ts +2 -2
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  6. package/dist/analyst/index.d.ts +42 -8
  7. package/dist/analyst/index.js +32 -2
  8. package/dist/analyst/index.js.map +1 -1
  9. package/dist/authenticity/index.d.ts +54 -1
  10. package/dist/authenticity/index.js +88 -1
  11. package/dist/authenticity/index.js.map +1 -1
  12. package/dist/belief-state/index.d.ts +188 -0
  13. package/dist/belief-state/index.js +486 -0
  14. package/dist/belief-state/index.js.map +1 -0
  15. package/dist/benchmarks/index.d.ts +2 -2
  16. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  17. package/dist/campaign/index.d.ts +11 -11
  18. package/dist/campaign/index.js +4 -4
  19. package/dist/chunk-4DIJWVUT.js +131 -0
  20. package/dist/chunk-4DIJWVUT.js.map +1 -0
  21. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  22. package/dist/chunk-5LVWPNS5.js.map +1 -0
  23. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  24. package/dist/chunk-CF67I6QY.js.map +1 -0
  25. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  26. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  27. package/dist/chunk-KWRRMR3J.js.map +1 -0
  28. package/dist/chunk-NPCTHQIO.js +91 -0
  29. package/dist/chunk-NPCTHQIO.js.map +1 -0
  30. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  31. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  32. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  33. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  34. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  35. package/dist/contract/index.d.ts +128 -15
  36. package/dist/contract/index.js +118 -2
  37. package/dist/contract/index.js.map +1 -1
  38. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  39. package/dist/control.d.ts +2 -2
  40. package/dist/control.js +2 -2
  41. package/dist/governance/index.d.ts +1 -1
  42. package/dist/hosted/index.d.ts +4 -4
  43. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  44. package/dist/index.d.ts +127 -26
  45. package/dist/index.js +32 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  48. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  49. package/dist/meta-eval/index.d.ts +6 -99
  50. package/dist/meta-eval/index.js +7 -76
  51. package/dist/meta-eval/index.js.map +1 -1
  52. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  53. package/dist/openapi.json +1 -1
  54. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  55. package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
  56. package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
  57. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  60. package/dist/rl.d.ts +10 -140
  61. package/dist/rl.js +8 -122
  62. package/dist/rl.js.map +1 -1
  63. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
  64. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
  65. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  66. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
  67. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  68. package/dist/traces.d.ts +1 -1
  69. package/dist/traces.js +2 -2
  70. package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
  71. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  72. package/dist/workflow/index.d.ts +4 -4
  73. package/dist/workflow/index.js +1 -1
  74. package/docs/auto-research-loop-end-to-end.md +1 -1
  75. package/docs/feature-guide.md +4 -4
  76. package/docs/multi-shot-optimization.md +61 -115
  77. package/docs/product-eval-adoption.md +1 -1
  78. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  79. package/docs/research/research-roadmap.md +1 -0
  80. package/docs/three-package-architecture.md +1 -1
  81. package/docs/trace-analysis.md +19 -0
  82. package/package.json +7 -2
  83. package/dist/chunk-7W4SM7FD.js.map +0 -1
  84. package/dist/chunk-F3SRAAZO.js.map +0 -1
  85. package/dist/chunk-JYE3WOTE.js.map +0 -1
  86. package/dist/chunk-WYIHD6EB.js.map +0 -1
  87. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  88. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  89. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  90. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -1,4 +1,4 @@
1
- import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-ByiOUrHj.js';
1
+ import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-BTaXq1TS.js';
2
2
  import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
3
3
 
4
4
  /**
@@ -1,7 +1,7 @@
1
1
  import { AxAIService, AxFunction } from '@ax-llm/ax';
2
2
  import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
3
3
  import { z } from 'zod';
4
- import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-CRD68aH7.js';
4
+ import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-DRvV0zRo.js';
5
5
 
6
6
  /**
7
7
  * Typed Ax output for analyst findings.
@@ -1,100 +1,7 @@
1
- import { T as TraceStore } from '../store-CKUAgsJz.js';
2
- import { R as Run } from '../schema-m0gsnbt3.js';
3
- import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
- export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
5
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
6
- import '../run-record-BgTFzO2r.js';
1
+ export { C as CalibrationBin, a as CalibrationOptions, b as CalibrationPair, c as CalibrationReport, d as CorrelationResult, e as CorrelationStudyOptions, f as CorrelationStudyResult, E as EvalMetricSpec, O as OutcomePair, g as calibrationCurve, h as calibrationFromPairs, i as correlationStudy } from '../calibration-Cpr3WaX3.js';
2
+ export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeFilter, b as OutcomeStore } from '../outcome-store-rnXLEqSn.js';
3
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CLPuwiUw.js';
4
+ import '../store-CKUAgsJz.js';
5
+ import '../schema-m0gsnbt3.js';
6
+ import '../run-record-sItO5ftF.js';
7
7
  import '../errors-Dwqw-T_m.js';
8
-
9
- /**
10
- * Correlation study — "does our eval score predict real-world outcomes?"
11
- *
12
- * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
13
- * joins on runId, computes Pearson + Spearman + bootstrap CI for every
14
- * (evalMetric, outcomeMetric) pair the caller declares.
15
- *
16
- * Without this number the framework is ornamental. With it and r > 0.6
17
- * the framework is a moat — no other agent-eval tool publishes one.
18
- */
19
-
20
- interface EvalMetricSpec {
21
- id: string;
22
- /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
23
- extract?: (run: Run, store: TraceStore) => Promise<number | null>;
24
- }
25
- interface OutcomePair {
26
- evalMetric: string;
27
- outcomeMetric: string;
28
- }
29
- interface CorrelationResult {
30
- evalMetric: string;
31
- outcomeMetric: string;
32
- n: number;
33
- pearson: number;
34
- spearman: number;
35
- /** 95% bootstrap CI for Pearson. */
36
- pearsonCi95: {
37
- lower: number;
38
- upper: number;
39
- };
40
- /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */
41
- verdict: 'strong' | 'moderate' | 'weak';
42
- }
43
- interface CorrelationStudyResult {
44
- pairs: CorrelationResult[];
45
- joinedSamples: number;
46
- skippedRuns: number;
47
- }
48
- interface CorrelationStudyOptions {
49
- /** Only join outcomes captured within this window after run.startedAt. */
50
- maxCaptureLagMs?: number;
51
- /** Restrict to a subset of outcomes (cohort, region, source). */
52
- outcomeFilter?: OutcomeFilter;
53
- /** Which outcome per run to use when multiple exist. Default 'latest'. */
54
- reduction?: 'latest' | 'mean' | 'max';
55
- /** Bootstrap iterations for the CI. Default 500. */
56
- bootstrapIterations?: number;
57
- }
58
- declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
59
-
60
- /**
61
- * Calibration curve — binned "if eval says X, what does reality show?"
62
- *
63
- * Companion to correlationStudy. Raw correlation is a single number;
64
- * the calibration curve shows *where* the eval is well-calibrated vs
65
- * overconfident / underconfident. Buckets the eval metric, computes
66
- * mean outcome per bucket, reports expected-calibration-error (ECE).
67
- */
68
-
69
- interface CalibrationBin {
70
- lower: number;
71
- upper: number;
72
- n: number;
73
- evalMean: number;
74
- outcomeMean: number;
75
- /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
76
- gap: number;
77
- }
78
- interface CalibrationReport {
79
- evalMetric: string;
80
- outcomeMetric: string;
81
- n: number;
82
- bins: CalibrationBin[];
83
- /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
84
- ece: number;
85
- /** Max bin gap — upper bound on miscalibration. */
86
- maxGap: number;
87
- }
88
- interface CalibrationOptions {
89
- bins?: number;
90
- /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
91
- binning?: 'equal-width' | 'equal-frequency';
92
- /** Clip eval values to [lo, hi] before binning. */
93
- range?: {
94
- lo: number;
95
- hi: number;
96
- };
97
- }
98
- declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
99
-
100
- export { type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type EvalMetricSpec, OutcomeFilter, type OutcomePair, OutcomeStore, calibrationCurve, correlationStudy };
@@ -1,3 +1,7 @@
1
+ import {
2
+ calibrationCurve,
3
+ calibrationFromPairs
4
+ } from "../chunk-NPCTHQIO.js";
1
5
  import {
2
6
  FileSystemOutcomeStore,
3
7
  InMemoryOutcomeStore
@@ -12,80 +16,6 @@ import {
12
16
  import "../chunk-5BKGXME7.js";
13
17
  import "../chunk-PZ5AY32C.js";
14
18
 
15
- // src/meta-eval/calibration.ts
16
- async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
17
- const runs = await traceStore.listRuns();
18
- const outcomes = await outcomeStore.list();
19
- const byRun = /* @__PURE__ */ new Map();
20
- for (const o of outcomes) {
21
- const arr = byRun.get(o.runId) ?? [];
22
- arr.push(o);
23
- byRun.set(o.runId, arr);
24
- }
25
- const extract = evalMetric.extract ?? defaultExtract(evalMetric.id);
26
- const pairs = [];
27
- for (const run of runs) {
28
- const os = byRun.get(run.runId);
29
- if (!os?.length) continue;
30
- const x = await extract(run, traceStore);
31
- if (x === null || !Number.isFinite(x)) continue;
32
- const latest = [...os].sort((a, b) => b.capturedAt - a.capturedAt)[0];
33
- const y = latest.metrics[outcomeMetric];
34
- if (typeof y !== "number" || !Number.isFinite(y)) continue;
35
- pairs.push({ x, y });
36
- }
37
- if (pairs.length < 2) return null;
38
- const numBins = options.bins ?? 10;
39
- const binning = options.binning ?? "equal-width";
40
- const xs = pairs.map((p) => p.x);
41
- const lo = options.range?.lo ?? Math.min(...xs);
42
- const hi = options.range?.hi ?? Math.max(...xs);
43
- const bins = [];
44
- if (binning === "equal-frequency") {
45
- const sorted = [...pairs].sort((a, b) => a.x - b.x);
46
- const perBin = Math.max(1, Math.floor(sorted.length / numBins));
47
- for (let i = 0; i < sorted.length; i += perBin) {
48
- const chunk = sorted.slice(i, i + perBin);
49
- if (chunk.length === 0) continue;
50
- bins.push(toBin(chunk));
51
- }
52
- } else {
53
- const width = (hi - lo) / numBins;
54
- if (width === 0) return null;
55
- for (let i = 0; i < numBins; i++) {
56
- const binLo = lo + i * width;
57
- const binHi = i === numBins - 1 ? hi + 1e-9 : lo + (i + 1) * width;
58
- const chunk = pairs.filter((p) => p.x >= binLo && p.x < binHi);
59
- if (chunk.length === 0) continue;
60
- bins.push(toBin(chunk, binLo, binHi));
61
- }
62
- }
63
- const total = bins.reduce((a, b) => a + b.n, 0);
64
- const ece = bins.reduce((a, b) => a + b.n / total * b.gap, 0);
65
- const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0);
66
- return { evalMetric: evalMetric.id, outcomeMetric, n: pairs.length, bins, ece, maxGap };
67
- }
68
- function toBin(chunk, lower, upper) {
69
- const xs = chunk.map((c) => c.x);
70
- const ys = chunk.map((c) => c.y);
71
- const evalMean = mean(xs);
72
- const outcomeMean = mean(ys);
73
- return {
74
- lower: lower ?? Math.min(...xs),
75
- upper: upper ?? Math.max(...xs),
76
- n: chunk.length,
77
- evalMean,
78
- outcomeMean,
79
- gap: Math.abs(outcomeMean - evalMean)
80
- };
81
- }
82
- function mean(xs) {
83
- return xs.reduce((a, b) => a + b, 0) / xs.length;
84
- }
85
- function defaultExtract(metric) {
86
- return async (run) => run.outcome?.score ?? (metric === "pass" ? run.outcome?.pass === true ? 1 : 0 : null);
87
- }
88
-
89
19
  // src/meta-eval/correlation-study.ts
90
20
  async function correlationStudy(traceStore, outcomeStore, evalMetrics, outcomeMetricNames, options = {}) {
91
21
  const runs = await traceStore.listRuns();
@@ -118,7 +48,7 @@ async function correlationStudy(traceStore, outcomeStore, evalMetrics, outcomeMe
118
48
  continue;
119
49
  }
120
50
  for (const em of evalMetrics) {
121
- const extract = em.extract ?? defaultExtract2(em.id);
51
+ const extract = em.extract ?? defaultExtract(em.id);
122
52
  const x = await extract(run, traceStore);
123
53
  if (x === null || !Number.isFinite(x)) continue;
124
54
  for (const om of outcomeMetricNames) {
@@ -216,7 +146,7 @@ function bootstrapPearsonCi(xs, ys, iterations) {
216
146
  upper: rs[Math.min(rs.length - 1, Math.floor(0.975 * rs.length))]
217
147
  };
218
148
  }
219
- function defaultExtract2(metric) {
149
+ function defaultExtract(metric) {
220
150
  return async (run, store) => {
221
151
  switch (metric) {
222
152
  case "score":
@@ -243,6 +173,7 @@ export {
243
173
  FileSystemOutcomeStore,
244
174
  InMemoryOutcomeStore,
245
175
  calibrationCurve,
176
+ calibrationFromPairs,
246
177
  correlationStudy,
247
178
  rubricPredictiveValidity
248
179
  };
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/meta-eval/calibration.ts","../../src/meta-eval/correlation-study.ts"],"sourcesContent":["/**\n * Calibration curve — binned \"if eval says X, what does reality show?\"\n *\n * Companion to correlationStudy. Raw correlation is a single number;\n * the calibration curve shows *where* the eval is well-calibrated vs\n * overconfident / underconfident. Buckets the eval metric, computes\n * mean outcome per bucket, reports expected-calibration-error (ECE).\n */\n\nimport type { Run } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport type { EvalMetricSpec } from './correlation-study'\nimport type { DeploymentOutcome, OutcomeStore } from './outcome-store'\n\nexport interface CalibrationBin {\n lower: number\n upper: number\n n: number\n evalMean: number\n outcomeMean: number\n /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */\n gap: number\n}\n\nexport interface CalibrationReport {\n evalMetric: string\n outcomeMetric: string\n n: number\n bins: CalibrationBin[]\n /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */\n ece: number\n /** Max bin gap — upper bound on miscalibration. */\n maxGap: number\n}\n\nexport interface CalibrationOptions {\n bins?: number\n /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */\n binning?: 'equal-width' | 'equal-frequency'\n /** Clip eval values to [lo, hi] before binning. */\n range?: { lo: number; hi: number }\n}\n\nexport async function calibrationCurve(\n traceStore: TraceStore,\n outcomeStore: OutcomeStore,\n evalMetric: EvalMetricSpec,\n outcomeMetric: string,\n options: CalibrationOptions = {},\n): Promise<CalibrationReport | null> {\n const runs = await traceStore.listRuns()\n const outcomes = await outcomeStore.list()\n const byRun = new Map<string, DeploymentOutcome[]>()\n for (const o of outcomes) {\n const arr = byRun.get(o.runId) ?? 
[]\n arr.push(o)\n byRun.set(o.runId, arr)\n }\n\n const extract = evalMetric.extract ?? defaultExtract(evalMetric.id)\n const pairs: Array<{ x: number; y: number }> = []\n for (const run of runs) {\n const os = byRun.get(run.runId)\n if (!os?.length) continue\n const x = await extract(run, traceStore)\n if (x === null || !Number.isFinite(x)) continue\n const latest = [...os].sort((a, b) => b.capturedAt - a.capturedAt)[0]!\n const y = latest.metrics[outcomeMetric]\n if (typeof y !== 'number' || !Number.isFinite(y)) continue\n pairs.push({ x, y })\n }\n if (pairs.length < 2) return null\n\n const numBins = options.bins ?? 10\n const binning = options.binning ?? 'equal-width'\n const xs = pairs.map((p) => p.x)\n const lo = options.range?.lo ?? Math.min(...xs)\n const hi = options.range?.hi ?? Math.max(...xs)\n\n const bins: CalibrationBin[] = []\n if (binning === 'equal-frequency') {\n const sorted = [...pairs].sort((a, b) => a.x - b.x)\n const perBin = Math.max(1, Math.floor(sorted.length / numBins))\n for (let i = 0; i < sorted.length; i += perBin) {\n const chunk = sorted.slice(i, i + perBin)\n if (chunk.length === 0) continue\n bins.push(toBin(chunk))\n }\n } else {\n const width = (hi - lo) / numBins\n if (width === 0) return null\n for (let i = 0; i < numBins; i++) {\n const binLo = lo + i * width\n const binHi = i === numBins - 1 ? 
hi + 1e-9 : lo + (i + 1) * width\n const chunk = pairs.filter((p) => p.x >= binLo && p.x < binHi)\n if (chunk.length === 0) continue\n bins.push(toBin(chunk, binLo, binHi))\n }\n }\n\n const total = bins.reduce((a, b) => a + b.n, 0)\n const ece = bins.reduce((a, b) => a + (b.n / total) * b.gap, 0)\n const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0)\n\n return { evalMetric: evalMetric.id, outcomeMetric, n: pairs.length, bins, ece, maxGap }\n}\n\nfunction toBin(\n chunk: Array<{ x: number; y: number }>,\n lower?: number,\n upper?: number,\n): CalibrationBin {\n const xs = chunk.map((c) => c.x)\n const ys = chunk.map((c) => c.y)\n const evalMean = mean(xs)\n const outcomeMean = mean(ys)\n return {\n lower: lower ?? Math.min(...xs),\n upper: upper ?? Math.max(...xs),\n n: chunk.length,\n evalMean,\n outcomeMean,\n gap: Math.abs(outcomeMean - evalMean),\n }\n}\n\nfunction mean(xs: number[]): number {\n return xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run) =>\n run.outcome?.score ?? (metric === 'pass' ? (run.outcome?.pass === true ? 1 : 0) : null)\n}\n","/**\n * Correlation study — \"does our eval score predict real-world outcomes?\"\n *\n * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,\n * joins on runId, computes Pearson + Spearman + bootstrap CI for every\n * (evalMetric, outcomeMetric) pair the caller declares.\n *\n * Without this number the framework is ornamental. 
With it and r > 0.6\n * the framework is a moat — no other agent-eval tool publishes one.\n */\n\nimport { aggregateLlm, llmSpans } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport type { DeploymentOutcome, OutcomeFilter, OutcomeStore } from './outcome-store'\n\nexport interface EvalMetricSpec {\n id: string\n /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface OutcomePair {\n evalMetric: string\n outcomeMetric: string\n}\n\nexport interface CorrelationResult {\n evalMetric: string\n outcomeMetric: string\n n: number\n pearson: number\n spearman: number\n /** 95% bootstrap CI for Pearson. */\n pearsonCi95: { lower: number; upper: number }\n /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */\n verdict: 'strong' | 'moderate' | 'weak'\n}\n\nexport interface CorrelationStudyResult {\n pairs: CorrelationResult[]\n joinedSamples: number\n skippedRuns: number\n}\n\nexport interface CorrelationStudyOptions {\n /** Only join outcomes captured within this window after run.startedAt. */\n maxCaptureLagMs?: number\n /** Restrict to a subset of outcomes (cohort, region, source). */\n outcomeFilter?: OutcomeFilter\n /** Which outcome per run to use when multiple exist. Default 'latest'. */\n reduction?: 'latest' | 'mean' | 'max'\n /** Bootstrap iterations for the CI. Default 500. 
*/\n bootstrapIterations?: number\n}\n\nexport async function correlationStudy(\n traceStore: TraceStore,\n outcomeStore: OutcomeStore,\n evalMetrics: EvalMetricSpec[],\n outcomeMetricNames: string[],\n options: CorrelationStudyOptions = {},\n): Promise<CorrelationStudyResult> {\n const runs = await traceStore.listRuns()\n const outcomes = await outcomeStore.list(options.outcomeFilter)\n const outcomesByRun = new Map<string, DeploymentOutcome[]>()\n for (const o of outcomes) {\n const arr = outcomesByRun.get(o.runId) ?? []\n arr.push(o)\n outcomesByRun.set(o.runId, arr)\n }\n\n const reduction = options.reduction ?? 'latest'\n const maxLag = options.maxCaptureLagMs ?? Infinity\n\n const pairs: Array<{ evalMetric: string; outcomeMetric: string; xs: number[]; ys: number[] }> = []\n for (const em of evalMetrics) {\n for (const om of outcomeMetricNames) {\n pairs.push({ evalMetric: em.id, outcomeMetric: om, xs: [], ys: [] })\n }\n }\n\n let joined = 0\n let skipped = 0\n for (const run of runs) {\n const os = outcomesByRun.get(run.runId)\n if (!os || os.length === 0) {\n skipped++\n continue\n }\n const eligible = os.filter((o) => o.capturedAt - run.startedAt <= maxLag)\n if (eligible.length === 0) {\n skipped++\n continue\n }\n\n for (const em of evalMetrics) {\n const extract = em.extract ?? 
defaultExtract(em.id)\n const x = await extract(run, traceStore)\n if (x === null || !Number.isFinite(x)) continue\n\n for (const om of outcomeMetricNames) {\n const values = eligible\n .map((o) => o.metrics[om])\n .filter((v): v is number => typeof v === 'number' && Number.isFinite(v))\n if (values.length === 0) continue\n const y = reduce(values, reduction, eligible)\n if (y === null) continue\n const pair = pairs.find((p) => p.evalMetric === em.id && p.outcomeMetric === om)!\n pair.xs.push(x)\n pair.ys.push(y)\n }\n }\n joined++\n }\n\n const results: CorrelationResult[] = pairs\n .filter((p) => p.xs.length >= 3)\n .map((p) => {\n const pearson = pearsonR(p.xs, p.ys)\n const spearman = pearsonR(ranks(p.xs), ranks(p.ys))\n const pearsonCi95 = bootstrapPearsonCi(p.xs, p.ys, options.bootstrapIterations ?? 500)\n const verdict: CorrelationResult['verdict'] =\n Math.abs(pearson) >= 0.7 ? 'strong' : Math.abs(pearson) >= 0.4 ? 'moderate' : 'weak'\n return {\n evalMetric: p.evalMetric,\n outcomeMetric: p.outcomeMetric,\n n: p.xs.length,\n pearson,\n spearman,\n pearsonCi95,\n verdict,\n }\n })\n\n return { pairs: results, joinedSamples: joined, skippedRuns: skipped }\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────\n\nfunction reduce(\n values: number[],\n kind: 'latest' | 'mean' | 'max',\n outcomes: DeploymentOutcome[],\n): number | null {\n if (values.length === 0) return null\n if (kind === 'mean') return values.reduce((a, b) => a + b, 0) / values.length\n if (kind === 'max') return Math.max(...values)\n // 'latest': pick the outcome captured last, then lookup its metric\n const latest = [...outcomes].sort((a, b) => b.capturedAt - a.capturedAt)[0]\n if (!latest) return null\n const latestKey = Object.keys(latest.metrics)[0]\n const v = latestKey !== undefined ? 
latest.metrics[latestKey] : undefined\n // For 'latest' we already have `values` aligned; use the last-captured one\n const paired = outcomes\n .map((o) => {\n const k = Object.keys(o.metrics)[0]\n return {\n at: o.capturedAt,\n v: k !== undefined ? values.find((x) => o.metrics[k] === x) : undefined,\n }\n })\n .filter((p) => p.v !== undefined)\n if (paired.length === 0) return v ?? null\n return paired.sort((a, b) => b.at - a.at)[0]?.v ?? null\n}\n\nfunction pearsonR(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n dA = 0,\n dB = 0\n for (let i = 0; i < a.length; i++) {\n const da = a[i]! - mA,\n db = b[i]! - mB\n num += da * db\n dA += da * da\n dB += db * db\n }\n if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0\n return num / Math.sqrt(dA * dB)\n}\n\nfunction ranks(xs: number[]): number[] {\n const indexed = xs.map((v, i) => ({ v, i })).sort((x, y) => x.v - y.v)\n const r = new Array<number>(xs.length)\n for (let i = 0; i < indexed.length; i++) {\n let j = i\n while (j + 1 < indexed.length && indexed[j + 1]!.v === indexed[i]!.v) j++\n const avg = (i + j + 2) / 2\n for (let k = i; k <= j; k++) r[indexed[k]!.i] = avg\n i = j\n }\n return r\n}\n\nfunction bootstrapPearsonCi(\n xs: number[],\n ys: number[],\n iterations: number,\n): { lower: number; upper: number } {\n const n = xs.length\n if (n < 3) return { lower: NaN, upper: NaN }\n const rs: number[] = []\n for (let b = 0; b < iterations; b++) {\n const rx: number[] = new Array(n)\n const ry: number[] = new Array(n)\n for (let i = 0; i < n; i++) {\n const idx = Math.floor(Math.random() * n)\n rx[i] = xs[idx]!\n ry[i] = ys[idx]!\n }\n const r = pearsonR(rx, ry)\n if (Number.isFinite(r)) rs.push(r)\n }\n rs.sort((a, b) => a - b)\n if (rs.length === 0) return { lower: NaN, upper: NaN }\n return {\n lower: rs[Math.floor(0.025 * 
rs.length)]!,\n upper: rs[Math.min(rs.length - 1, Math.floor(0.975 * rs.length))]!,\n }\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n default:\n return null\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;AA2CA,eAAsB,iBACpB,YACA,cACA,YACA,eACA,UAA8B,CAAC,GACI;AACnC,QAAM,OAAO,MAAM,WAAW,SAAS;AACvC,QAAM,WAAW,MAAM,aAAa,KAAK;AACzC,QAAM,QAAQ,oBAAI,IAAiC;AACnD,aAAW,KAAK,UAAU;AACxB,UAAM,MAAM,MAAM,IAAI,EAAE,KAAK,KAAK,CAAC;AACnC,QAAI,KAAK,CAAC;AACV,UAAM,IAAI,EAAE,OAAO,GAAG;AAAA,EACxB;AAEA,QAAM,UAAU,WAAW,WAAW,eAAe,WAAW,EAAE;AAClE,QAAM,QAAyC,CAAC;AAChD,aAAW,OAAO,MAAM;AACtB,UAAM,KAAK,MAAM,IAAI,IAAI,KAAK;AAC9B,QAAI,CAAC,IAAI,OAAQ;AACjB,UAAM,IAAI,MAAM,QAAQ,KAAK,UAAU;AACvC,QAAI,MAAM,QAAQ,CAAC,OAAO,SAAS,CAAC,EAAG;AACvC,UAAM,SAAS,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAAE,CAAC;AACpE,UAAM,IAAI,OAAO,QAAQ,aAAa;AACtC,QAAI,OAAO,MAAM,YAAY,CAAC,OAAO,SAAS,CAAC,EAAG;AAClD,UAAM,KAAK,EAAE,GAAG,EAAE,CAAC;AAAA,EACrB;AACA,MAAI,MAAM,SAAS,EAAG,QAAO;AAE7B,QAAM,UAAU,QAAQ,QAAQ;AAChC,QAAM,UAAU,QAAQ,WAAW;AACnC,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;AAC/B,QAAM,KAAK,QAAQ,OAAO,MAAM,KAAK,IAAI,GAAG,EAAE;AAC9C,QAAM,KAAK,QAAQ,OAAO,MAAM,KAAK,IAAI,GAAG,EAAE;AAE9C,QAAM,OAAyB,CAAC;AAChC,MAAI,YAAY,mBAAmB;AACjC,UAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AAClD,UAAM,SAAS,KAAK,IAAI,GAAG,KAAK,MAAM,OAAO,SAAS,OAAO,CAAC;AAC9D,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK,QAAQ;AAC9C,YAAM,QAAQ,OAAO,MAAM,GAAG,IAAI,MAAM;AACxC,UAAI,MAAM,WAAW,E
AAG;AACxB,WAAK,KAAK,MAAM,KAAK,CAAC;AAAA,IACxB;AAAA,EACF,OAAO;AACL,UAAM,SAAS,KAAK,MAAM;AAC1B,QAAI,UAAU,EAAG,QAAO;AACxB,aAAS,IAAI,GAAG,IAAI,SAAS,KAAK;AAChC,YAAM,QAAQ,KAAK,IAAI;AACvB,YAAM,QAAQ,MAAM,UAAU,IAAI,KAAK,OAAO,MAAM,IAAI,KAAK;AAC7D,YAAM,QAAQ,MAAM,OAAO,CAAC,MAAM,EAAE,KAAK,SAAS,EAAE,IAAI,KAAK;AAC7D,UAAI,MAAM,WAAW,EAAG;AACxB,WAAK,KAAK,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,QAAM,QAAQ,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,GAAG,CAAC;AAC9C,QAAM,MAAM,KAAK,OAAO,CAAC,GAAG,MAAM,IAAK,EAAE,IAAI,QAAS,EAAE,KAAK,CAAC;AAC9D,QAAM,SAAS,KAAK,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,GAAG,EAAE,GAAG,GAAG,CAAC;AAE1D,SAAO,EAAE,YAAY,WAAW,IAAI,eAAe,GAAG,MAAM,QAAQ,MAAM,KAAK,OAAO;AACxF;AAEA,SAAS,MACP,OACA,OACA,OACgB;AAChB,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;AAC/B,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;AAC/B,QAAM,WAAW,KAAK,EAAE;AACxB,QAAM,cAAc,KAAK,EAAE;AAC3B,SAAO;AAAA,IACL,OAAO,SAAS,KAAK,IAAI,GAAG,EAAE;AAAA,IAC9B,OAAO,SAAS,KAAK,IAAI,GAAG,EAAE;AAAA,IAC9B,GAAG,MAAM;AAAA,IACT;AAAA,IACA;AAAA,IACA,KAAK,KAAK,IAAI,cAAc,QAAQ;AAAA,EACtC;AACF;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAC5C;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,QACZ,IAAI,SAAS,UAAU,WAAW,SAAU,IAAI,SAAS,SAAS,OAAO,IAAI,IAAK;AACtF;;;AC7EA,eAAsB,iBACpB,YACA,cACA,aACA,oBACA,UAAmC,CAAC,GACH;AACjC,QAAM,OAAO,MAAM,WAAW,SAAS;AACvC,QAAM,WAAW,MAAM,aAAa,KAAK,QAAQ,aAAa;AAC9D,QAAM,gBAAgB,oBAAI,IAAiC;AAC3D,aAAW,KAAK,UAAU;AACxB,UAAM,MAAM,cAAc,IAAI,EAAE,KAAK,KAAK,CAAC;AAC3C,QAAI,KAAK,CAAC;AACV,kBAAc,IAAI,EAAE,OAAO,GAAG;AAAA,EAChC;AAEA,QAAM,YAAY,QAAQ,aAAa;AACvC,QAAM,SAAS,QAAQ,mBAAmB;AAE1C,QAAM,QAA0F,CAAC;AACjG,aAAW,MAAM,aAAa;AAC5B,eAAW,MAAM,oBAAoB;AACnC,YAAM,KAAK,EAAE,YAAY,GAAG,IAAI,eAAe,IAAI,IAAI,CAAC,GAAG,IAAI,CAAC,EAAE,CAAC;AAAA,IACrE;AAAA,EACF;AAEA,MAAI,SAAS;AACb,MAAI,UAAU;AACd,aAAW,OAAO,MAAM;AACtB,UAAM,KAAK,cAAc,IAAI,IAAI,KAAK;AACtC,QAAI,CAAC,MAAM,GAAG,WAAW,GAAG;AAC1B;AACA;AAAA,IACF;AACA,UAAM,WAAW,GAAG,OAAO,CAAC,MAAM,EAAE,aAAa,IAAI,aAAa,MAAM;AACxE,QAAI,SAAS,WAAW,GAAG;AACzB;AACA;AAAA,IACF;AAEA,eAAW,MAAM,aAAa;A
AC5B,YAAM,UAAU,GAAG,WAAWA,gBAAe,GAAG,EAAE;AAClD,YAAM,IAAI,MAAM,QAAQ,KAAK,UAAU;AACvC,UAAI,MAAM,QAAQ,CAAC,OAAO,SAAS,CAAC,EAAG;AAEvC,iBAAW,MAAM,oBAAoB;AACnC,cAAM,SAAS,SACZ,IAAI,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,EACxB,OAAO,CAAC,MAAmB,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,CAAC;AACzE,YAAI,OAAO,WAAW,EAAG;AACzB,cAAM,IAAI,OAAO,QAAQ,WAAW,QAAQ;AAC5C,YAAI,MAAM,KAAM;AAChB,cAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,EAAE,kBAAkB,EAAE;AAC/E,aAAK,GAAG,KAAK,CAAC;AACd,aAAK,GAAG,KAAK,CAAC;AAAA,MAChB;AAAA,IACF;AACA;AAAA,EACF;AAEA,QAAM,UAA+B,MAClC,OAAO,CAAC,MAAM,EAAE,GAAG,UAAU,CAAC,EAC9B,IAAI,CAAC,MAAM;AACV,UAAM,UAAU,SAAS,EAAE,IAAI,EAAE,EAAE;AACnC,UAAM,WAAW,SAAS,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,EAAE,CAAC;AAClD,UAAM,cAAc,mBAAmB,EAAE,IAAI,EAAE,IAAI,QAAQ,uBAAuB,GAAG;AACrF,UAAM,UACJ,KAAK,IAAI,OAAO,KAAK,MAAM,WAAW,KAAK,IAAI,OAAO,KAAK,MAAM,aAAa;AAChF,WAAO;AAAA,MACL,YAAY,EAAE;AAAA,MACd,eAAe,EAAE;AAAA,MACjB,GAAG,EAAE,GAAG;AAAA,MACR;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF,CAAC;AAEH,SAAO,EAAE,OAAO,SAAS,eAAe,QAAQ,aAAa,QAAQ;AACvE;AAIA,SAAS,OACP,QACA,MACA,UACe;AACf,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,SAAS,OAAQ,QAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AACvE,MAAI,SAAS,MAAO,QAAO,KAAK,IAAI,GAAG,MAAM;AAE7C,QAAM,SAAS,CAAC,GAAG,QAAQ,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAAE,CAAC;AAC1E,MAAI,CAAC,OAAQ,QAAO;AACpB,QAAM,YAAY,OAAO,KAAK,OAAO,OAAO,EAAE,CAAC;AAC/C,QAAM,IAAI,cAAc,SAAY,OAAO,QAAQ,SAAS,IAAI;AAEhE,QAAM,SAAS,SACZ,IAAI,CAAC,MAAM;AACV,UAAM,IAAI,OAAO,KAAK,EAAE,OAAO,EAAE,CAAC;AAClC,WAAO;AAAA,MACL,IAAI,EAAE;AAAA,MACN,GAAG,MAAM,SAAY,OAAO,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,IAAI;AAAA,IAChE;AAAA,EACF,CAAC,EACA,OAAO,CAAC,MAAM,EAAE,MAAM,MAAS;AAClC,MAAI,OAAO,WAAW,EAAG,QAAO,KAAK;AACrC,SAAO,OAAO,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,GAAG,KAAK;AACrD;AAEA,SAAS,SAAS,GAAa,GAAqB;AAClD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,KAAK,G
ACL,KAAK;AACP,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK,IACjB,KAAK,EAAE,CAAC,IAAK;AACf,WAAO,KAAK;AACZ,UAAM,KAAK;AACX,UAAM,KAAK;AAAA,EACb;AACA,MAAI,OAAO,KAAK,OAAO,EAAG,QAAO,OAAO,KAAK,OAAO,IAAI,IAAI;AAC5D,SAAO,MAAM,KAAK,KAAK,KAAK,EAAE;AAChC;AAEA,SAAS,MAAM,IAAwB;AACrC,QAAM,UAAU,GAAG,IAAI,CAAC,GAAG,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AACrE,QAAM,IAAI,IAAI,MAAc,GAAG,MAAM;AACrC,WAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,QAAI,IAAI;AACR,WAAO,IAAI,IAAI,QAAQ,UAAU,QAAQ,IAAI,CAAC,EAAG,MAAM,QAAQ,CAAC,EAAG,EAAG;AACtE,UAAM,OAAO,IAAI,IAAI,KAAK;AAC1B,aAAS,IAAI,GAAG,KAAK,GAAG,IAAK,GAAE,QAAQ,CAAC,EAAG,CAAC,IAAI;AAChD,QAAI;AAAA,EACN;AACA,SAAO;AACT;AAEA,SAAS,mBACP,IACA,IACA,YACkC;AAClC,QAAM,IAAI,GAAG;AACb,MAAI,IAAI,EAAG,QAAO,EAAE,OAAO,KAAK,OAAO,IAAI;AAC3C,QAAM,KAAe,CAAC;AACtB,WAAS,IAAI,GAAG,IAAI,YAAY,KAAK;AACnC,UAAM,KAAe,IAAI,MAAM,CAAC;AAChC,UAAM,KAAe,IAAI,MAAM,CAAC;AAChC,aAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,YAAM,MAAM,KAAK,MAAM,KAAK,OAAO,IAAI,CAAC;AACxC,SAAG,CAAC,IAAI,GAAG,GAAG;AACd,SAAG,CAAC,IAAI,GAAG,GAAG;AAAA,IAChB;AACA,UAAM,IAAI,SAAS,IAAI,EAAE;AACzB,QAAI,OAAO,SAAS,CAAC,EAAG,IAAG,KAAK,CAAC;AAAA,EACnC;AACA,KAAG,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACvB,MAAI,GAAG,WAAW,EAAG,QAAO,EAAE,OAAO,KAAK,OAAO,IAAI;AACrD,SAAO;AAAA,IACL,OAAO,GAAG,KAAK,MAAM,QAAQ,GAAG,MAAM,CAAC;AAAA,IACvC,OAAO,GAAG,KAAK,IAAI,GAAG,SAAS,GAAG,KAAK,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC;AAAA,EAClE;AACF;AAEA,SAASA,gBAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;","names":["defaultExtract"]}
1
+ {"version":3,"sources":["../../src/meta-eval/correlation-study.ts"],"sourcesContent":["/**\n * Correlation study — \"does our eval score predict real-world outcomes?\"\n *\n * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,\n * joins on runId, computes Pearson + Spearman + bootstrap CI for every\n * (evalMetric, outcomeMetric) pair the caller declares.\n *\n * Without this number the framework is ornamental. With it and r > 0.6\n * the framework is a moat — no other agent-eval tool publishes one.\n */\n\nimport { aggregateLlm, llmSpans } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport type { DeploymentOutcome, OutcomeFilter, OutcomeStore } from './outcome-store'\n\nexport interface EvalMetricSpec {\n id: string\n /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface OutcomePair {\n evalMetric: string\n outcomeMetric: string\n}\n\nexport interface CorrelationResult {\n evalMetric: string\n outcomeMetric: string\n n: number\n pearson: number\n spearman: number\n /** 95% bootstrap CI for Pearson. */\n pearsonCi95: { lower: number; upper: number }\n /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */\n verdict: 'strong' | 'moderate' | 'weak'\n}\n\nexport interface CorrelationStudyResult {\n pairs: CorrelationResult[]\n joinedSamples: number\n skippedRuns: number\n}\n\nexport interface CorrelationStudyOptions {\n /** Only join outcomes captured within this window after run.startedAt. */\n maxCaptureLagMs?: number\n /** Restrict to a subset of outcomes (cohort, region, source). */\n outcomeFilter?: OutcomeFilter\n /** Which outcome per run to use when multiple exist. Default 'latest'. */\n reduction?: 'latest' | 'mean' | 'max'\n /** Bootstrap iterations for the CI. Default 500. 
*/\n bootstrapIterations?: number\n}\n\nexport async function correlationStudy(\n traceStore: TraceStore,\n outcomeStore: OutcomeStore,\n evalMetrics: EvalMetricSpec[],\n outcomeMetricNames: string[],\n options: CorrelationStudyOptions = {},\n): Promise<CorrelationStudyResult> {\n const runs = await traceStore.listRuns()\n const outcomes = await outcomeStore.list(options.outcomeFilter)\n const outcomesByRun = new Map<string, DeploymentOutcome[]>()\n for (const o of outcomes) {\n const arr = outcomesByRun.get(o.runId) ?? []\n arr.push(o)\n outcomesByRun.set(o.runId, arr)\n }\n\n const reduction = options.reduction ?? 'latest'\n const maxLag = options.maxCaptureLagMs ?? Infinity\n\n const pairs: Array<{ evalMetric: string; outcomeMetric: string; xs: number[]; ys: number[] }> = []\n for (const em of evalMetrics) {\n for (const om of outcomeMetricNames) {\n pairs.push({ evalMetric: em.id, outcomeMetric: om, xs: [], ys: [] })\n }\n }\n\n let joined = 0\n let skipped = 0\n for (const run of runs) {\n const os = outcomesByRun.get(run.runId)\n if (!os || os.length === 0) {\n skipped++\n continue\n }\n const eligible = os.filter((o) => o.capturedAt - run.startedAt <= maxLag)\n if (eligible.length === 0) {\n skipped++\n continue\n }\n\n for (const em of evalMetrics) {\n const extract = em.extract ?? 
defaultExtract(em.id)\n const x = await extract(run, traceStore)\n if (x === null || !Number.isFinite(x)) continue\n\n for (const om of outcomeMetricNames) {\n const values = eligible\n .map((o) => o.metrics[om])\n .filter((v): v is number => typeof v === 'number' && Number.isFinite(v))\n if (values.length === 0) continue\n const y = reduce(values, reduction, eligible)\n if (y === null) continue\n const pair = pairs.find((p) => p.evalMetric === em.id && p.outcomeMetric === om)!\n pair.xs.push(x)\n pair.ys.push(y)\n }\n }\n joined++\n }\n\n const results: CorrelationResult[] = pairs\n .filter((p) => p.xs.length >= 3)\n .map((p) => {\n const pearson = pearsonR(p.xs, p.ys)\n const spearman = pearsonR(ranks(p.xs), ranks(p.ys))\n const pearsonCi95 = bootstrapPearsonCi(p.xs, p.ys, options.bootstrapIterations ?? 500)\n const verdict: CorrelationResult['verdict'] =\n Math.abs(pearson) >= 0.7 ? 'strong' : Math.abs(pearson) >= 0.4 ? 'moderate' : 'weak'\n return {\n evalMetric: p.evalMetric,\n outcomeMetric: p.outcomeMetric,\n n: p.xs.length,\n pearson,\n spearman,\n pearsonCi95,\n verdict,\n }\n })\n\n return { pairs: results, joinedSamples: joined, skippedRuns: skipped }\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────\n\nfunction reduce(\n values: number[],\n kind: 'latest' | 'mean' | 'max',\n outcomes: DeploymentOutcome[],\n): number | null {\n if (values.length === 0) return null\n if (kind === 'mean') return values.reduce((a, b) => a + b, 0) / values.length\n if (kind === 'max') return Math.max(...values)\n // 'latest': pick the outcome captured last, then lookup its metric\n const latest = [...outcomes].sort((a, b) => b.capturedAt - a.capturedAt)[0]\n if (!latest) return null\n const latestKey = Object.keys(latest.metrics)[0]\n const v = latestKey !== undefined ? 
latest.metrics[latestKey] : undefined\n // For 'latest' we already have `values` aligned; use the last-captured one\n const paired = outcomes\n .map((o) => {\n const k = Object.keys(o.metrics)[0]\n return {\n at: o.capturedAt,\n v: k !== undefined ? values.find((x) => o.metrics[k] === x) : undefined,\n }\n })\n .filter((p) => p.v !== undefined)\n if (paired.length === 0) return v ?? null\n return paired.sort((a, b) => b.at - a.at)[0]?.v ?? null\n}\n\nfunction pearsonR(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n dA = 0,\n dB = 0\n for (let i = 0; i < a.length; i++) {\n const da = a[i]! - mA,\n db = b[i]! - mB\n num += da * db\n dA += da * da\n dB += db * db\n }\n if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0\n return num / Math.sqrt(dA * dB)\n}\n\nfunction ranks(xs: number[]): number[] {\n const indexed = xs.map((v, i) => ({ v, i })).sort((x, y) => x.v - y.v)\n const r = new Array<number>(xs.length)\n for (let i = 0; i < indexed.length; i++) {\n let j = i\n while (j + 1 < indexed.length && indexed[j + 1]!.v === indexed[i]!.v) j++\n const avg = (i + j + 2) / 2\n for (let k = i; k <= j; k++) r[indexed[k]!.i] = avg\n i = j\n }\n return r\n}\n\nfunction bootstrapPearsonCi(\n xs: number[],\n ys: number[],\n iterations: number,\n): { lower: number; upper: number } {\n const n = xs.length\n if (n < 3) return { lower: NaN, upper: NaN }\n const rs: number[] = []\n for (let b = 0; b < iterations; b++) {\n const rx: number[] = new Array(n)\n const ry: number[] = new Array(n)\n for (let i = 0; i < n; i++) {\n const idx = Math.floor(Math.random() * n)\n rx[i] = xs[idx]!\n ry[i] = ys[idx]!\n }\n const r = pearsonR(rx, ry)\n if (Number.isFinite(r)) rs.push(r)\n }\n rs.sort((a, b) => a - b)\n if (rs.length === 0) return { lower: NaN, upper: NaN }\n return {\n lower: rs[Math.floor(0.025 * 
rs.length)]!,\n upper: rs[Math.min(rs.length - 1, Math.floor(0.975 * rs.length))]!,\n }\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n default:\n return null\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;AAwDA,eAAsB,iBACpB,YACA,cACA,aACA,oBACA,UAAmC,CAAC,GACH;AACjC,QAAM,OAAO,MAAM,WAAW,SAAS;AACvC,QAAM,WAAW,MAAM,aAAa,KAAK,QAAQ,aAAa;AAC9D,QAAM,gBAAgB,oBAAI,IAAiC;AAC3D,aAAW,KAAK,UAAU;AACxB,UAAM,MAAM,cAAc,IAAI,EAAE,KAAK,KAAK,CAAC;AAC3C,QAAI,KAAK,CAAC;AACV,kBAAc,IAAI,EAAE,OAAO,GAAG;AAAA,EAChC;AAEA,QAAM,YAAY,QAAQ,aAAa;AACvC,QAAM,SAAS,QAAQ,mBAAmB;AAE1C,QAAM,QAA0F,CAAC;AACjG,aAAW,MAAM,aAAa;AAC5B,eAAW,MAAM,oBAAoB;AACnC,YAAM,KAAK,EAAE,YAAY,GAAG,IAAI,eAAe,IAAI,IAAI,CAAC,GAAG,IAAI,CAAC,EAAE,CAAC;AAAA,IACrE;AAAA,EACF;AAEA,MAAI,SAAS;AACb,MAAI,UAAU;AACd,aAAW,OAAO,MAAM;AACtB,UAAM,KAAK,cAAc,IAAI,IAAI,KAAK;AACtC,QAAI,CAAC,MAAM,GAAG,WAAW,GAAG;AAC1B;AACA;AAAA,IACF;AACA,UAAM,WAAW,GAAG,OAAO,CAAC,MAAM,EAAE,aAAa,IAAI,aAAa,MAAM;AACxE,QAAI,SAAS,WAAW,GAAG;AACzB;AACA;AAAA,IACF;AAEA,eAAW,MAAM,aAAa;AAC5B,YAAM,UAAU,GAAG,WAAW,eAAe,GAAG,EAAE;AAClD,YAAM,IAAI,MAAM,QAAQ,KAAK,UAAU;AACvC,UAAI,MAAM,QAAQ,CAAC,OAAO,SAAS,CAAC,EAAG;AAEvC,iBAAW,MAAM,oBAAoB;AACnC,cAAM,SAAS,SACZ,IAAI,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,EACxB,OAAO,CAAC,MAAmB,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,CAAC;AACzE,YAAI,OAAO,WAAW,EAAG;AACzB,cAAM,IAAI,OAAO,QAAQ,WAAW,QAAQ;AAC5C,YAAI,MAAM,KAAM;AAChB,cAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,EAAE,kBAAkB,EAAE;AAC/E,aAAK,GAAG,KAAK,CAAC;AACd,aAAK,GA
AG,KAAK,CAAC;AAAA,MAChB;AAAA,IACF;AACA;AAAA,EACF;AAEA,QAAM,UAA+B,MAClC,OAAO,CAAC,MAAM,EAAE,GAAG,UAAU,CAAC,EAC9B,IAAI,CAAC,MAAM;AACV,UAAM,UAAU,SAAS,EAAE,IAAI,EAAE,EAAE;AACnC,UAAM,WAAW,SAAS,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,EAAE,CAAC;AAClD,UAAM,cAAc,mBAAmB,EAAE,IAAI,EAAE,IAAI,QAAQ,uBAAuB,GAAG;AACrF,UAAM,UACJ,KAAK,IAAI,OAAO,KAAK,MAAM,WAAW,KAAK,IAAI,OAAO,KAAK,MAAM,aAAa;AAChF,WAAO;AAAA,MACL,YAAY,EAAE;AAAA,MACd,eAAe,EAAE;AAAA,MACjB,GAAG,EAAE,GAAG;AAAA,MACR;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF,CAAC;AAEH,SAAO,EAAE,OAAO,SAAS,eAAe,QAAQ,aAAa,QAAQ;AACvE;AAIA,SAAS,OACP,QACA,MACA,UACe;AACf,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,SAAS,OAAQ,QAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AACvE,MAAI,SAAS,MAAO,QAAO,KAAK,IAAI,GAAG,MAAM;AAE7C,QAAM,SAAS,CAAC,GAAG,QAAQ,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAAE,CAAC;AAC1E,MAAI,CAAC,OAAQ,QAAO;AACpB,QAAM,YAAY,OAAO,KAAK,OAAO,OAAO,EAAE,CAAC;AAC/C,QAAM,IAAI,cAAc,SAAY,OAAO,QAAQ,SAAS,IAAI;AAEhE,QAAM,SAAS,SACZ,IAAI,CAAC,MAAM;AACV,UAAM,IAAI,OAAO,KAAK,EAAE,OAAO,EAAE,CAAC;AAClC,WAAO;AAAA,MACL,IAAI,EAAE;AAAA,MACN,GAAG,MAAM,SAAY,OAAO,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,IAAI;AAAA,IAChE;AAAA,EACF,CAAC,EACA,OAAO,CAAC,MAAM,EAAE,MAAM,MAAS;AAClC,MAAI,OAAO,WAAW,EAAG,QAAO,KAAK;AACrC,SAAO,OAAO,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,GAAG,KAAK;AACrD;AAEA,SAAS,SAAS,GAAa,GAAqB;AAClD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,KAAK,GACL,KAAK;AACP,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK,IACjB,KAAK,EAAE,CAAC,IAAK;AACf,WAAO,KAAK;AACZ,UAAM,KAAK;AACX,UAAM,KAAK;AAAA,EACb;AACA,MAAI,OAAO,KAAK,OAAO,EAAG,QAAO,OAAO,KAAK,OAAO,IAAI,IAAI;AAC5D,SAAO,MAAM,KAAK,KAAK,KAAK,EAAE;AAChC;AAEA,SAAS,MAAM,IAAwB;AACrC,QAAM,UAAU,GAAG,IAAI,CAAC,GAAG,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AACrE,QAAM,IAAI,IAAI,MAAc,GAAG,MAAM;AACrC,WAAS,IAAI,GAAG,IAAI,QAA
Q,QAAQ,KAAK;AACvC,QAAI,IAAI;AACR,WAAO,IAAI,IAAI,QAAQ,UAAU,QAAQ,IAAI,CAAC,EAAG,MAAM,QAAQ,CAAC,EAAG,EAAG;AACtE,UAAM,OAAO,IAAI,IAAI,KAAK;AAC1B,aAAS,IAAI,GAAG,KAAK,GAAG,IAAK,GAAE,QAAQ,CAAC,EAAG,CAAC,IAAI;AAChD,QAAI;AAAA,EACN;AACA,SAAO;AACT;AAEA,SAAS,mBACP,IACA,IACA,YACkC;AAClC,QAAM,IAAI,GAAG;AACb,MAAI,IAAI,EAAG,QAAO,EAAE,OAAO,KAAK,OAAO,IAAI;AAC3C,QAAM,KAAe,CAAC;AACtB,WAAS,IAAI,GAAG,IAAI,YAAY,KAAK;AACnC,UAAM,KAAe,IAAI,MAAM,CAAC;AAChC,UAAM,KAAe,IAAI,MAAM,CAAC;AAChC,aAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,YAAM,MAAM,KAAK,MAAM,KAAK,OAAO,IAAI,CAAC;AACxC,SAAG,CAAC,IAAI,GAAG,GAAG;AACd,SAAG,CAAC,IAAI,GAAG,GAAG;AAAA,IAChB;AACA,UAAM,IAAI,SAAS,IAAI,EAAE;AACzB,QAAI,OAAO,SAAS,CAAC,EAAG,IAAG,KAAK,CAAC;AAAA,EACnC;AACA,KAAG,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACvB,MAAI,GAAG,WAAW,EAAG,QAAO,EAAE,OAAO,KAAK,OAAO,IAAI;AACrD,SAAO;AAAA,IACL,OAAO,GAAG,KAAK,MAAM,QAAQ,GAAG,MAAM,CAAC;AAAA,IACvC,OAAO,GAAG,KAAK,IAAI,GAAG,SAAS,GAAG,KAAK,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC;AAAA,EAClE;AACF;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;","names":[]}
@@ -0,0 +1,132 @@
1
+ /**
2
+ * Off-policy evaluation primitives.
3
+ *
4
+ * Standard inverse-probability-weighted (IPS), self-normalized
5
+ * importance-weighted (SNIPS), and doubly-robust (DR) estimators for the
6
+ * value of a *target* policy given trajectories collected under a
7
+ * *behavior* policy. This is the canonical RL eval task: "we have last
8
+ * week's runs, we changed the policy — how would the new one do without
9
+ * re-running?"
10
+ *
11
+ * The math here is textbook (Dudík, Langford, Li 2011 for DR; Swaminathan
12
+ * & Joachims 2015 for SNIPS) but the *application* to LLM-agent
13
+ * evaluation needs care:
14
+ *
15
+ * - The "policy" is the (prompt, tool config, model snapshot) triple.
16
+ * Two policies have the same probability over an action *iff* their
17
+ * LLM call would emit the same token with the same probability —
18
+ * which is generally unknowable without the model log-probs.
19
+ * - For LLM agents, propensity scores must be supplied by the caller
20
+ * (logged in the trace, recovered from token log-probs, or estimated
21
+ * via a learned propensity model). We do NOT estimate propensity here.
22
+ * - Doubly-robust requires a Q-function (model-based reward predictor).
23
+ * We accept any callable; consumers pass either a tabular average,
24
+ * a regression fit, or a learned reward model.
25
+ *
26
+ * Bias / variance tradeoffs:
27
+ * - IPS: unbiased under full overlap; high variance for small overlap, and biased
27
+ * when target has support outside behavior.
29
+ * - SNIPS: lower variance, slight bias; usually preferred in practice.
30
+ * - DR: doubly-robust — unbiased if either propensity OR Q-function is
31
+ * correct. Lowest practical variance when Q is decent. Use this.
32
+ *
33
+ * Caveat a review panel will raise: in the LLM-agent setting, propensity scores
34
+ * recovered from token log-probs are noisy, the action space is enormous,
35
+ * and overlap is often poor. These estimators are useful but not magic;
36
+ * complement with `replayCampaign` (exact replay where the request hashes
37
+ * match) for high-confidence answers and OPE for the gap.
38
+ */
39
+ interface OffPolicyTrajectory {
40
+ /** Stable id, for traceability through the dataset. */
41
+ runId: string;
42
+ /** Reward observed under the behavior policy (the realized outcome). */
43
+ reward: number;
44
+ /**
45
+ * Behavior-policy probability of the action that was taken. For LLM
46
+ * agents this is typically `exp(sum(token_log_probs))` over the chosen
47
+ * trajectory. Must be in (0, 1].
48
+ */
49
+ behaviorProb: number;
50
+ /**
51
+ * Target-policy probability of the same action. For replay-style
52
+ * counterfactual evaluation this is what the *new* policy would have
53
+ * assigned to the *old* trajectory. Must be in [0, 1].
54
+ */
55
+ targetProb: number;
56
+ /**
57
+ * Optional model-based reward prediction at the same context. Used by
58
+ * `doublyRobust`. Set to `null` for IPS-only evaluation.
59
+ */
60
+ qHat?: number | null;
61
+ }
62
+ interface OffPolicyEstimate {
63
+ /** Estimated value of the target policy. */
64
+ value: number;
65
+ /** Standard error of the estimate. */
66
+ standardError: number;
67
+ /** Effective sample size (Kong 1992). Lower = more reliance on a few high-weight samples. */
68
+ effectiveSampleSize: number;
69
+ /** Number of trajectories used. */
70
+ n: number;
71
+ /**
72
+ * Diagnostic: maximum importance weight observed. Large values (>>10x
73
+ * mean) are a red flag — variance is dominated by a few outliers.
74
+ */
75
+ maxImportanceWeight: number;
76
+ }
77
+ interface OffPolicyOptions {
78
+ /**
79
+ * Cap importance weights at this value (Ionides 2008 truncated IS) to
80
+ * trade unbiasedness for variance reduction. Default `Infinity` (no cap).
81
+ * Set e.g. `10` for stable estimates when the policies are close.
82
+ */
83
+ weightCap?: number;
84
+ /** Reward clipping range. Default `[0, 1]`. */
85
+ rewardClip?: {
86
+ low: number;
87
+ high: number;
88
+ };
89
+ }
90
+ /**
91
+ * Inverse Probability Weighting (Horvitz-Thompson). Unbiased estimator
92
+ * of E[reward under target policy]. Variance scales with the spread of
93
+ * target/behavior ratios.
94
+ */
95
+ declare function inverseProbabilityWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
96
+ /**
97
+ * Self-Normalized Importance Sampling. Lower variance than vanilla IPS at
98
+ * the cost of small bias (vanishing as N grows). The right default for
99
+ * LLM-agent evaluation where overlap is often poor.
100
+ */
101
+ declare function selfNormalizedImportanceWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
102
+ /**
103
+ * Doubly-robust off-policy estimator (Dudík, Langford, Li 2011).
104
+ *
105
+ * V_DR = (1/N) * sum_i [ q_hat_i + (target_prob_i / behavior_prob_i) * (r_i - q_hat_i) ]
106
+ *
107
+ * Unbiased if EITHER:
108
+ * - the importance ratios are correct (IPS-style validity), OR
109
+ * - the Q-hat function is correct (model-based validity).
110
+ *
111
+ * In practice both are imperfect, but the residual bias is the *product*
112
+ * of the two errors — much smaller than either alone. This is why DR is the
113
+ * default in production OPE pipelines.
114
+ *
115
+ * Expects `qHat` on every trajectory. If any are `null`, the estimator
116
+ * falls back to SNIPS for those entries (loud-fallback behavior; the
117
+ * report's `n` reflects the full set but `effectiveSampleSize` accounts
118
+ * for the lost variance reduction).
119
+ */
120
+ declare function doublyRobust(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
121
+ /**
122
+ * Convenience: run all three estimators and return them side-by-side.
123
+ * The recommended diagnostic — agreement across estimators is a much
124
+ * stronger signal than any single one.
125
+ */
126
+ declare function offPolicyEstimateAll(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): {
127
+ ips: OffPolicyEstimate;
128
+ snips: OffPolicyEstimate;
129
+ dr: OffPolicyEstimate;
130
+ };
131
+
132
+ export { type OffPolicyEstimate as O, type OffPolicyOptions as a, type OffPolicyTrajectory as b, doublyRobust as d, inverseProbabilityWeighting as i, offPolicyEstimateAll as o, selfNormalizedImportanceWeighting as s };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.77.0",
5
+ "version": "0.80.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -60,4 +60,4 @@ declare class FileSystemOutcomeStore implements OutcomeStore {
60
60
  list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
61
61
  }
62
62
 
63
- export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeStore as O, type OutcomeFilter as a, type FileSystemOutcomeStoreOptions as b };
63
+ export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeFilter as O, type FileSystemOutcomeStoreOptions as a, type OutcomeStore as b };
@@ -1,9 +1,9 @@
1
- import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-Bba0vl1V.js';
1
+ import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, j as GateContext, g as CampaignResult, M as MutableSurface, c as GateDecision } from './types-4mm2msnR.js';
2
2
  import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
3
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
4
4
  import { D as Direction } from './pareto-E-pembql.js';
5
5
  import { a as PairedBootstrapResult } from './statistics-B7yCbi9i.js';
6
- import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BqYH2vCR.js';
6
+ import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BAl_aVOZ.js';
7
7
  import { HostedClient, TraceSpanEvent } from './hosted/index.js';
8
8
 
9
9
  /**
@@ -1,4 +1,4 @@
1
- import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-CRD68aH7.js';
1
+ import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-DRvV0zRo.js';
2
2
 
3
3
  /**
4
4
  * AnalystRegistry — orchestrate N analysts against one run.
@@ -1,6 +1,6 @@
1
1
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
2
- import { m as GateDecision } from './summary-report-ByiOUrHj.js';
3
- import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
2
+ import { m as GateDecision } from './summary-report-BTaXq1TS.js';
3
+ import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
4
4
 
5
5
  /**
6
6
  * Release confidence gate.
@@ -1,12 +1,12 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CLPuwiUw.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CXXZlR8g.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
4
  export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
5
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
6
- import './run-record-BgTFzO2r.js';
5
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BTaXq1TS.js';
6
+ import './run-record-sItO5ftF.js';
7
7
  import './errors-Dwqw-T_m.js';
8
8
  import './schema-m0gsnbt3.js';
9
- import './outcome-store-D6KWmYvj.js';
9
+ import './outcome-store-rnXLEqSn.js';
10
10
  import './dataset-B2kL-fSM.js';
11
11
  import './judge-calibration-DilmB3Ml.js';
12
12
  import './types-Croy5h7V.js';
@@ -1,6 +1,6 @@
1
- import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
3
- import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
3
+ import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BTaXq1TS.js';
4
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
5
5
  import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';