npm - @tangle-network/agent-eval - Versions diffs - 0.79.0 → 0.80.0 - Mend

@tangle-network/agent-eval 0.79.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/README.md +50 -19
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +1 -1
package/dist/analyst/index.d.ts +3 -3
package/dist/belief-state/index.d.ts +188 -0
package/dist/belief-state/index.js +486 -0
package/dist/belief-state/index.js.map +1 -0
package/dist/calibration-Cpr3WaX3.d.ts +101 -0
package/dist/campaign/index.d.ts +5 -5
package/dist/chunk-4DIJWVUT.js +131 -0
package/dist/chunk-4DIJWVUT.js.map +1 -0
package/dist/chunk-NPCTHQIO.js +91 -0
package/dist/chunk-NPCTHQIO.js.map +1 -0
package/dist/contract/index.d.ts +123 -10
package/dist/contract/index.js +116 -0
package/dist/contract/index.js.map +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/hosted/index.d.ts +1 -1
package/dist/index.d.ts +5 -5
package/dist/meta-eval/index.d.ts +5 -98
package/dist/meta-eval/index.js +7 -76
package/dist/meta-eval/index.js.map +1 -1
package/dist/off-policy-DiwuKKg7.d.ts +132 -0
package/dist/openapi.json +1 -1
package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
package/dist/{provenance-CEAJI9rm.d.ts → provenance-jG-Gngg8.d.ts} +2 -2
package/dist/{registry-BmEuU94S.d.ts → registry-BK0Zee01.d.ts} +1 -1
package/dist/reporting.d.ts +2 -2
package/dist/rl.d.ts +6 -136
package/dist/rl.js +6 -120
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CWyWWLBg.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +1 -1
package/dist/{run-improvement-loop-Bgu4C59E.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +1 -1
package/dist/{semantic-concept-judge-Du4ZVyef.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +1 -1
package/dist/{types-QHG0KnkF.d.ts → types-4mm2msnR.d.ts} +1 -1
package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
package/docs/research/research-roadmap.md +1 -0
package/package.json +7 -2

package/dist/campaign/index.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
 import { A as AnalyzeTracesOptions, a as AnalyzeTracesInput, b as AnalyzeTracesResult } from '../analyst-t7zZS3TV.js';
-import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, q as ProposeContext, J as JudgeScore, L as LabeledScenarioStore, r as LabeledScenarioWrite, s as LabeledScenarioSampleArgs, t as LabeledScenarioRecord, u as LabelTrust, v as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-QHG0KnkF.js';
-export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, w as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, x as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-QHG0KnkF.js';
-import { a as RunCampaignOptions, b as RunImprovementLoopOptions, C as CampaignStorage } from '../run-improvement-loop-Bgu4C59E.js';
-export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bgu4C59E.js';
-export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, k as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, l as EmitLoopProvenanceArgs, m as EmitLoopProvenanceResult, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, n as LoopProvenanceBackend, o as LoopProvenanceCandidate, L as LoopProvenanceRecord, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, q as buildLoopProvenanceRecord, f as composeGate, g as defaultProductionGate, s as emitLoopProvenance, h as evolutionaryDriver, i as heldOutGate, t as loopProvenanceSpans, p as paretoPolicy, j as paretoSignificanceGate, u as provenanceRecordPath, v as provenanceSpansPath, r as runEval, w as surfaceContentHash } from '../provenance-CEAJI9rm.js';
+import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, q as ProposeContext, J as JudgeScore, L as LabeledScenarioStore, r as LabeledScenarioWrite, s as LabeledScenarioSampleArgs, t as LabeledScenarioRecord, u as LabelTrust, v as LabeledScenarioSource, g as CampaignResult, i as CodeSurface } from '../types-4mm2msnR.js';
+export { C as CampaignAggregates, d as CampaignArtifactWriter, e as CampaignCellResult, f as CampaignCostMeter, w as CampaignTokenUsage, h as CampaignTraceWriter, D as DispatchFn, G as Gate, j as GateContext, c as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, x as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-4mm2msnR.js';
+import { a as RunCampaignOptions, b as RunImprovementLoopOptions, C as CampaignStorage } from '../run-improvement-loop-BAl_aVOZ.js';
+export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BAl_aVOZ.js';
+export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, k as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, l as EmitLoopProvenanceArgs, m as EmitLoopProvenanceResult, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, n as LoopProvenanceBackend, o as LoopProvenanceCandidate, L as LoopProvenanceRecord, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, q as buildLoopProvenanceRecord, f as composeGate, g as defaultProductionGate, s as emitLoopProvenance, h as evolutionaryDriver, i as heldOutGate, t as loopProvenanceSpans, p as paretoPolicy, j as paretoSignificanceGate, u as provenanceRecordPath, v as provenanceSpansPath, r as runEval, w as surfaceContentHash } from '../provenance-jG-Gngg8.js';
 import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
 import { c as TraceAnalystKindSpec } from '../kind-factory-DqV2t1Xk.js';
 import { c as AnalystFinding } from '../types-DRvV0zRo.js';

package/dist/chunk-4DIJWVUT.js ADDED Viewed

@@ -0,0 +1,131 @@
+import {
+  ValidationError
+} from "./chunk-3BFEG2F6.js";
+// src/rl/off-policy.ts
+/**
+ * Inverse-probability-weighted (IPS / Horvitz-Thompson) estimate of a target
+ * policy's value from trajectories logged under a behavior policy.
+ *
+ * Each trajectory contributes `w * r`, where
+ * `w = min(weightCap, targetProb / behaviorProb)` and `r` is the reward
+ * clamped into `rewardClip`. The estimate is the plain mean of those terms.
+ *
+ * @param {Array<{runId: string, reward: number, behaviorProb: number, targetProb: number}>} trajectories
+ *   Logged trajectories with caller-supplied propensities.
+ * @param {{weightCap?: number, rewardClip?: {low: number, high: number}}} [opts]
+ *   `weightCap` defaults to `Infinity` (no truncation); `rewardClip` defaults
+ *   to `{ low: 0, high: 1 }`.
+ * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
+ *   The all-zero estimate when `trajectories` is empty.
+ * @throws {ValidationError} When `behaviorProb` is not a number > 0, or
+ *   `targetProb` is not a number >= 0.
+ */
+function inverseProbabilityWeighting(trajectories, opts = {}) {
+  const cap = opts.weightCap ?? Infinity;
+  const clip = opts.rewardClip ?? { low: 0, high: 1 };
+  if (trajectories.length === 0) {
+    return zeroEstimate();
+  }
+  const weightedRewards = [];
+  let maxW = 0;
+  let sumW = 0;
+  let sumW2 = 0;
+  for (const t of trajectories) {
+    // Written as `!(p > 0)` instead of `p <= 0` so NaN / undefined are rejected
+    // too: every comparison with NaN is false, which let them slip past the
+    // old guard and silently turn the whole estimate into NaN.
+    if (!(t.behaviorProb > 0)) {
+      throw new ValidationError(
+        `inverseProbabilityWeighting: behaviorProb must be > 0 (runId=${t.runId})`
+      );
+    }
+    if (!(t.targetProb >= 0)) {
+      throw new ValidationError(
+        `inverseProbabilityWeighting: targetProb must be >= 0 (runId=${t.runId})`
+      );
+    }
+    const w = Math.min(cap, t.targetProb / t.behaviorProb);
+    const r = clamp(t.reward, clip.low, clip.high);
+    weightedRewards.push(w * r);
+    sumW += w;
+    sumW2 += w * w;
+    if (w > maxW) maxW = w;
+  }
+  const n = weightedRewards.length;
+  const value = weightedRewards.reduce((s, x) => s + x, 0) / n;
+  // Sample variance of the per-trajectory terms; the denominator is floored at
+  // 1 so a single trajectory yields variance 0 instead of dividing by zero.
+  const variance = weightedRewards.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
+  // Effective sample size (sum w)^2 / sum w^2; defined as 0 when all weights are 0.
+  const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
+  return {
+    value,
+    standardError: Math.sqrt(variance / n),
+    effectiveSampleSize: effN,
+    n,
+    maxImportanceWeight: maxW
+  };
+}
+/**
+ * Self-normalized importance-sampling (SNIPS) estimate of a target policy's
+ * value: `sum(w_i * r_i) / sum(w_i)` with
+ * `w_i = min(weightCap, targetProb_i / behaviorProb_i)` and `r_i` the reward
+ * clamped into `rewardClip`.
+ *
+ * The standard error is `sqrt(sum((w_i * (r_i - value))^2) / (sum w_i)^2)`.
+ * Like the estimate itself it is invariant to rescaling all weights by a
+ * constant.
+ *
+ * @param {Array<{runId: string, reward: number, behaviorProb: number, targetProb: number}>} trajectories
+ *   Logged trajectories with caller-supplied propensities.
+ * @param {{weightCap?: number, rewardClip?: {low: number, high: number}}} [opts]
+ *   `weightCap` defaults to `Infinity`; `rewardClip` defaults to
+ *   `{ low: 0, high: 1 }`.
+ * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
+ *   The all-zero estimate when `trajectories` is empty; value, standard error
+ *   and effective sample size are 0 when the weights sum to 0.
+ * @throws {ValidationError} When `behaviorProb` is not a number > 0, or
+ *   `targetProb` is not a number >= 0.
+ */
+function selfNormalizedImportanceWeighting(trajectories, opts = {}) {
+  const cap = opts.weightCap ?? Infinity;
+  const clip = opts.rewardClip ?? { low: 0, high: 1 };
+  if (trajectories.length === 0) return zeroEstimate();
+  const weights = [];
+  const rewards = [];
+  let maxW = 0;
+  let sumW = 0;
+  let sumW2 = 0;
+  let sumWR = 0;
+  for (const t of trajectories) {
+    // `!(p > 0)` also rejects NaN / undefined, which `p <= 0` lets through.
+    if (!(t.behaviorProb > 0)) {
+      throw new ValidationError(
+        `selfNormalizedImportanceWeighting: behaviorProb must be > 0 (runId=${t.runId})`
+      );
+    }
+    if (!(t.targetProb >= 0)) {
+      throw new ValidationError(
+        `selfNormalizedImportanceWeighting: targetProb must be >= 0 (runId=${t.runId})`
+      );
+    }
+    const w = Math.min(cap, t.targetProb / t.behaviorProb);
+    const r = clamp(t.reward, clip.low, clip.high);
+    weights.push(w);
+    rewards.push(r);
+    sumW += w;
+    sumW2 += w * w;
+    sumWR += w * r;
+    if (w > maxW) maxW = w;
+  }
+  const n = trajectories.length;
+  if (sumW === 0) {
+    // No trajectory carries any weight: the ratio estimator is undefined, so
+    // report zeros rather than dividing by zero.
+    return { value: 0, standardError: 0, effectiveSampleSize: 0, n, maxImportanceWeight: maxW };
+  }
+  const value = sumWR / sumW;
+  const sumSquaredInfluence = weights.reduce(
+    (s, w, i) => s + (w * (rewards[i] - value)) ** 2,
+    0
+  );
+  // Divide by the true (sum w)^2. The previous `Math.max(1, sumW * sumW)`
+  // clamp understated the error whenever the total weight was below 1.
+  const variance = sumSquaredInfluence / (sumW * sumW);
+  return {
+    value,
+    standardError: Math.sqrt(variance),
+    effectiveSampleSize: sumW * sumW / sumW2,
+    n,
+    maxImportanceWeight: maxW
+  };
+}
+/**
+ * Doubly-robust off-policy estimate of a target policy's value.
+ *
+ * For a trajectory with a finite numeric `qHat`, the contribution is
+ * `q + w * (r - q)`, where `w = min(weightCap, targetProb / behaviorProb)` and
+ * both `r` (reward) and `q` (qHat) are clamped into `rewardClip`. A trajectory
+ * whose `qHat` is missing, `null` or non-finite falls back to the plain
+ * importance-weighted term `w * r`. The estimate is the mean contribution.
+ *
+ * @param {Array<{runId: string, reward: number, behaviorProb: number, targetProb: number, qHat?: number | null}>} trajectories
+ *   Logged trajectories with caller-supplied propensities and optional
+ *   model-based reward predictions.
+ * @param {{weightCap?: number, rewardClip?: {low: number, high: number}}} [opts]
+ *   `weightCap` defaults to `Infinity`; `rewardClip` defaults to
+ *   `{ low: 0, high: 1 }`.
+ * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
+ *   The all-zero estimate when `trajectories` is empty.
+ * @throws {ValidationError} When `behaviorProb` is not a number > 0, or
+ *   `targetProb` is not a number >= 0.
+ */
+function doublyRobust(trajectories, opts = {}) {
+  const cap = opts.weightCap ?? Infinity;
+  const clip = opts.rewardClip ?? { low: 0, high: 1 };
+  if (trajectories.length === 0) return zeroEstimate();
+  const contributions = [];
+  let maxW = 0;
+  let sumW = 0;
+  let sumW2 = 0;
+  for (const t of trajectories) {
+    // `!(p > 0)` also rejects NaN / undefined, which `p <= 0` lets through and
+    // which would otherwise poison the estimate with NaN.
+    if (!(t.behaviorProb > 0)) {
+      throw new ValidationError(`doublyRobust: behaviorProb must be > 0 (runId=${t.runId})`);
+    }
+    if (!(t.targetProb >= 0)) {
+      throw new ValidationError(`doublyRobust: targetProb must be >= 0 (runId=${t.runId})`);
+    }
+    const w = Math.min(cap, t.targetProb / t.behaviorProb);
+    const r = clamp(t.reward, clip.low, clip.high);
+    const hasQ = typeof t.qHat === "number" && Number.isFinite(t.qHat);
+    if (hasQ) {
+      const q = clamp(t.qHat, clip.low, clip.high);
+      // Model prediction plus importance-weighted correction of its residual.
+      contributions.push(q + w * (r - q));
+    } else {
+      // No usable model prediction: plain importance-weighted reward.
+      contributions.push(w * r);
+    }
+    if (w > maxW) maxW = w;
+    sumW += w;
+    sumW2 += w * w;
+  }
+  const n = contributions.length;
+  const value = contributions.reduce((s, x) => s + x, 0) / n;
+  // Sample variance; denominator floored at 1 so n === 1 yields 0, not NaN.
+  const variance = contributions.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
+  const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
+  return {
+    value,
+    standardError: Math.sqrt(variance / n),
+    effectiveSampleSize: effN,
+    n,
+    maxImportanceWeight: maxW
+  };
+}
+/**
+ * Runs the IPS, SNIPS and doubly-robust estimators over the same trajectories
+ * with the same options and returns the three results side by side.
+ *
+ * @param {Array<object>} trajectories Trajectories forwarded to each estimator.
+ * @param {object} [opts] Options forwarded unchanged to each estimator.
+ * @returns {{ips: object, snips: object, dr: object}} One estimate per estimator.
+ */
+function offPolicyEstimateAll(trajectories, opts = {}) {
+  const ips = inverseProbabilityWeighting(trajectories, opts);
+  const snips = selfNormalizedImportanceWeighting(trajectories, opts);
+  const dr = doublyRobust(trajectories, opts);
+  return { ips, snips, dr };
+}
+/**
+ * Builds the estimate returned for an empty trajectory set: every field is 0.
+ * A fresh object is created on each call.
+ *
+ * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
+ */
+function zeroEstimate() {
+  const empty = {
+    value: 0,
+    standardError: 0,
+    effectiveSampleSize: 0,
+    n: 0,
+    maxImportanceWeight: 0
+  };
+  return empty;
+}
+/**
+ * Restricts `x` to the closed interval `[lo, hi]`.
+ *
+ * Any non-finite `x` (NaN, +Infinity or -Infinity) maps to `lo`; note that
+ * this includes +Infinity, which does not saturate at `hi`.
+ *
+ * @param {number} x Value to clamp.
+ * @param {number} lo Lower bound, also the result for non-finite input.
+ * @param {number} hi Upper bound.
+ * @returns {number} The clamped value.
+ */
+function clamp(x, lo, hi) {
+  return Number.isFinite(x) ? Math.max(lo, Math.min(hi, x)) : lo;
+}
+export {
+  inverseProbabilityWeighting,
+  selfNormalizedImportanceWeighting,
+  doublyRobust,
+  offPolicyEstimateAll
+};
+//# sourceMappingURL=chunk-4DIJWVUT.js.map

package/dist/chunk-4DIJWVUT.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/rl/off-policy.ts"],"sourcesContent":["/**\n * Off-policy evaluation primitives.\n *\n * Standard inverse-probability-weighted (IPS), self-normalized\n * importance-weighted (SNIPS), and doubly-robust (DR) estimators for the\n * value of a *target* policy given trajectories collected under a\n * *behavior* policy. This is the canonical RL eval task: \"we have last\n * week's runs, we changed the policy — how would the new one do without\n * re-running?\"\n *\n * The math here is textbook (Dudík, Langford, Li 2011 for DR; Swaminathan\n * & Joachims 2015 for SNIPS) but the *application* to LLM-agent\n * evaluation needs care:\n *\n * - The \"policy\" is the (prompt, tool config, model snapshot) triple.\n * Two policies have the same probability over an action *iff* their\n * LLM call would emit the same token with the same probability —\n * which is generally unknowable without the model log-probs.\n * - For LLM agents, propensity scores must be supplied by the caller\n * (logged in the trace, recovered from token log-probs, or estimated\n * via a learned propensity model). We do NOT estimate propensity here.\n * - Doubly-robust requires a Q-function (model-based reward predictor).\n * We accept any callable; consumers pass either a tabular average,\n * a regression fit, or a learned reward model.\n *\n * Bias / variance tradeoffs:\n * - IPS: unbiased; high variance for small overlap, infinite variance\n * when target has support outside behavior.\n * - SNIPS: lower variance, slight bias; usually preferred in practice.\n * - DR: doubly-robust — unbiased if either propensity OR Q-function is\n * correct. Lowest practical variance when Q is decent. Use this.\n *\n * Caveat the panel will land: on the LLM-agent setting, propensity scores\n * recovered from token log-probs are noisy, the action space is enormous,\n * and overlap is often poor. 
These estimators are useful but not magic;\n * complement with `replayCampaign` (exact replay where the request hashes\n * match) for high-confidence answers and OPE for the gap.\n */\n\nimport { ValidationError } from '../errors'\n\nexport interface OffPolicyTrajectory {\n /** Stable id, for traceability through the dataset. */\n runId: string\n /** Reward observed under the behavior policy (the realized outcome). */\n reward: number\n /**\n * Behavior-policy probability of the action that was taken. For LLM\n * agents this is typically `exp(sum(token_log_probs))` over the chosen\n * trajectory. Must be in (0, 1].\n */\n behaviorProb: number\n /**\n * Target-policy probability of the same action. For replay-style\n * counterfactual evaluation this is what the *new* policy would have\n * assigned to the *old* trajectory. Must be in [0, 1].\n */\n targetProb: number\n /**\n * Optional model-based reward prediction at the same context. Used by\n * `doublyRobust`. Set to `null` for IPS-only evaluation.\n */\n qHat?: number | null\n}\n\nexport interface OffPolicyEstimate {\n /** Estimated value of the target policy. */\n value: number\n /** Standard error of the estimate. */\n standardError: number\n /** Effective sample size (Kong 1992). Lower = more reliance on a few high-weight samples. */\n effectiveSampleSize: number\n /** Number of trajectories used. */\n n: number\n /**\n * Diagnostic: maximum importance weight observed. Large values (>>10x\n * mean) are a red flag — variance is dominated by a few outliers.\n */\n maxImportanceWeight: number\n}\n\nexport interface OffPolicyOptions {\n /**\n * Cap importance weights at this value (Ionides 2008 truncated IS) to\n * trade unbiasedness for variance reduction. Default `Infinity` (no cap).\n * Set e.g. `10` for stable estimates when the policies are close.\n */\n weightCap?: number\n /** Reward clipping range. Default `[0, 1]`. 
*/\n rewardClip?: { low: number; high: number }\n}\n\n/**\n * Inverse Probability Weighting (Horvitz-Thompson). Unbiased estimator\n * of E[reward under target policy]. Variance scales with the spread of\n * target/behavior ratios.\n */\nexport function inverseProbabilityWeighting(\n trajectories: OffPolicyTrajectory[],\n opts: OffPolicyOptions = {},\n): OffPolicyEstimate {\n const cap = opts.weightCap ?? Infinity\n const clip = opts.rewardClip ?? { low: 0, high: 1 }\n\n if (trajectories.length === 0) {\n return zeroEstimate()\n }\n\n const weights: number[] = []\n const weightedRewards: number[] = []\n let maxW = 0\n for (const t of trajectories) {\n if (t.behaviorProb <= 0) {\n throw new ValidationError(\n `inverseProbabilityWeighting: behaviorProb must be > 0 (runId=${t.runId})`,\n )\n }\n const w = Math.min(cap, t.targetProb / t.behaviorProb)\n const r = clamp(t.reward, clip.low, clip.high)\n weights.push(w)\n weightedRewards.push(w * r)\n if (w > maxW) maxW = w\n }\n const n = weights.length\n const value = weightedRewards.reduce((s, x) => s + x, 0) / n\n const variance = weightedRewards.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1)\n const sumW = weights.reduce((s, w) => s + w, 0)\n const sumW2 = weights.reduce((s, w) => s + w * w, 0)\n const effN = sumW === 0 ? 0 : (sumW * sumW) / sumW2\n\n return {\n value,\n standardError: Math.sqrt(variance / n),\n effectiveSampleSize: effN,\n n,\n maxImportanceWeight: maxW,\n }\n}\n\n/**\n * Self-Normalized Importance Sampling. Lower variance than vanilla IPS at\n * the cost of small bias (vanishing as N grows). The right default for\n * LLM-agent evaluation where overlap is often poor.\n */\nexport function selfNormalizedImportanceWeighting(\n trajectories: OffPolicyTrajectory[],\n opts: OffPolicyOptions = {},\n): OffPolicyEstimate {\n const cap = opts.weightCap ?? Infinity\n const clip = opts.rewardClip ?? 
{ low: 0, high: 1 }\n if (trajectories.length === 0) return zeroEstimate()\n\n const weights: number[] = []\n const rewards: number[] = []\n let maxW = 0\n for (const t of trajectories) {\n if (t.behaviorProb <= 0) {\n throw new ValidationError(\n `selfNormalizedImportanceWeighting: behaviorProb must be > 0 (runId=${t.runId})`,\n )\n }\n const w = Math.min(cap, t.targetProb / t.behaviorProb)\n weights.push(w)\n rewards.push(clamp(t.reward, clip.low, clip.high))\n if (w > maxW) maxW = w\n }\n const sumW = weights.reduce((s, w) => s + w, 0)\n const sumWR = weights.reduce((s, w, i) => s + w * rewards[i]!, 0)\n const value = sumW === 0 ? 0 : sumWR / sumW\n const sumW2 = weights.reduce((s, w) => s + w * w, 0)\n const effN = sumW === 0 ? 0 : (sumW * sumW) / sumW2\n // Influence-function-based SE for SNIPS (Owen 2013, Ch. 9).\n const phi = weights.map((w, i) => w * (rewards[i]! - value))\n const variance = phi.reduce((s, x) => s + x * x, 0) / Math.max(1, sumW * sumW)\n return {\n value,\n standardError: Math.sqrt(variance),\n effectiveSampleSize: effN,\n n: trajectories.length,\n maxImportanceWeight: maxW,\n }\n}\n\n/**\n * Doubly-robust off-policy estimator (Dudík, Langford, Li 2011).\n *\n * V_DR = (1/N) * sum_i [ q_hat_i + (target_prob_i / behavior_prob_i) * (r_i - q_hat_i) ]\n *\n * Unbiased if EITHER:\n * - the importance ratios are correct (IPS-style validity), OR\n * - the Q-hat function is correct (model-based validity).\n *\n * In practice both are imperfect, but the residual bias is the *product*\n * of both errors — much smaller than either alone. This is why DR is the\n * default in production OPE pipelines.\n *\n * Requires `qHat` on every trajectory. 
If any are `null`, the estimator\n * falls back to SNIPS for those entries (loud-fallback behavior; the\n * report's `n` reflects the full set but `effectiveSampleSize` accounts\n * for the lost variance reduction).\n */\nexport function doublyRobust(\n trajectories: OffPolicyTrajectory[],\n opts: OffPolicyOptions = {},\n): OffPolicyEstimate {\n const cap = opts.weightCap ?? Infinity\n const clip = opts.rewardClip ?? { low: 0, high: 1 }\n if (trajectories.length === 0) return zeroEstimate()\n\n const contributions: number[] = []\n let maxW = 0\n let sumW = 0\n let sumW2 = 0\n for (const t of trajectories) {\n if (t.behaviorProb <= 0) {\n throw new ValidationError(`doublyRobust: behaviorProb must be > 0 (runId=${t.runId})`)\n }\n const w = Math.min(cap, t.targetProb / t.behaviorProb)\n const r = clamp(t.reward, clip.low, clip.high)\n const q =\n typeof t.qHat === 'number' && Number.isFinite(t.qHat)\n ? clamp(t.qHat, clip.low, clip.high)\n : null\n if (q === null) {\n contributions.push(w * r) // fallback: IPS for this entry\n } else {\n contributions.push(q + w * (r - q))\n }\n if (w > maxW) maxW = w\n sumW += w\n sumW2 += w * w\n }\n const n = contributions.length\n const value = contributions.reduce((s, x) => s + x, 0) / n\n const variance = contributions.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1)\n const effN = sumW === 0 ? 
0 : (sumW * sumW) / sumW2\n return {\n value,\n standardError: Math.sqrt(variance / n),\n effectiveSampleSize: effN,\n n,\n maxImportanceWeight: maxW,\n }\n}\n\n/**\n * Convenience: run all three estimators and return them side-by-side.\n * The recommended diagnostic — agreement across estimators is a much\n * stronger signal than any single one.\n */\nexport function offPolicyEstimateAll(\n trajectories: OffPolicyTrajectory[],\n opts: OffPolicyOptions = {},\n): { ips: OffPolicyEstimate; snips: OffPolicyEstimate; dr: OffPolicyEstimate } {\n return {\n ips: inverseProbabilityWeighting(trajectories, opts),\n snips: selfNormalizedImportanceWeighting(trajectories, opts),\n dr: doublyRobust(trajectories, opts),\n }\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────────\n\nfunction zeroEstimate(): OffPolicyEstimate {\n return { value: 0, standardError: 0, effectiveSampleSize: 0, n: 0, maxImportanceWeight: 0 }\n}\n\nfunction clamp(x: number, lo: number, hi: number): number {\n if (!Number.isFinite(x)) return lo\n return Math.max(lo, Math.min(hi, 
x))\n}\n"],"mappings":";;;;;AAiGO,SAAS,4BACd,cACA,OAAyB,CAAC,GACP;AACnB,QAAM,MAAM,KAAK,aAAa;AAC9B,QAAM,OAAO,KAAK,cAAc,EAAE,KAAK,GAAG,MAAM,EAAE;AAElD,MAAI,aAAa,WAAW,GAAG;AAC7B,WAAO,aAAa;AAAA,EACtB;AAEA,QAAM,UAAoB,CAAC;AAC3B,QAAM,kBAA4B,CAAC;AACnC,MAAI,OAAO;AACX,aAAW,KAAK,cAAc;AAC5B,QAAI,EAAE,gBAAgB,GAAG;AACvB,YAAM,IAAI;AAAA,QACR,gEAAgE,EAAE,KAAK;AAAA,MACzE;AAAA,IACF;AACA,UAAM,IAAI,KAAK,IAAI,KAAK,EAAE,aAAa,EAAE,YAAY;AACrD,UAAM,IAAI,MAAM,EAAE,QAAQ,KAAK,KAAK,KAAK,IAAI;AAC7C,YAAQ,KAAK,CAAC;AACd,oBAAgB,KAAK,IAAI,CAAC;AAC1B,QAAI,IAAI,KAAM,QAAO;AAAA,EACvB;AACA,QAAM,IAAI,QAAQ;AAClB,QAAM,QAAQ,gBAAgB,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAC3D,QAAM,WAAW,gBAAgB,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,UAAU,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AAC9F,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC;AAC9C,QAAM,QAAQ,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,IAAI,GAAG,CAAC;AACnD,QAAM,OAAO,SAAS,IAAI,IAAK,OAAO,OAAQ;AAE9C,SAAO;AAAA,IACL;AAAA,IACA,eAAe,KAAK,KAAK,WAAW,CAAC;AAAA,IACrC,qBAAqB;AAAA,IACrB;AAAA,IACA,qBAAqB;AAAA,EACvB;AACF;AAOO,SAAS,kCACd,cACA,OAAyB,CAAC,GACP;AACnB,QAAM,MAAM,KAAK,aAAa;AAC9B,QAAM,OAAO,KAAK,cAAc,EAAE,KAAK,GAAG,MAAM,EAAE;AAClD,MAAI,aAAa,WAAW,EAAG,QAAO,aAAa;AAEnD,QAAM,UAAoB,CAAC;AAC3B,QAAM,UAAoB,CAAC;AAC3B,MAAI,OAAO;AACX,aAAW,KAAK,cAAc;AAC5B,QAAI,EAAE,gBAAgB,GAAG;AACvB,YAAM,IAAI;AAAA,QACR,sEAAsE,EAAE,KAAK;AAAA,MAC/E;AAAA,IACF;AACA,UAAM,IAAI,KAAK,IAAI,KAAK,EAAE,aAAa,EAAE,YAAY;AACrD,YAAQ,KAAK,CAAC;AACd,YAAQ,KAAK,MAAM,EAAE,QAAQ,KAAK,KAAK,KAAK,IAAI,CAAC;AACjD,QAAI,IAAI,KAAM,QAAO;AAAA,EACvB;AACA,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC;AAC9C,QAAM,QAAQ,QAAQ,OAAO,CAAC,GAAG,GAAG,MAAM,IAAI,IAAI,QAAQ,CAAC,GAAI,CAAC;AAChE,QAAM,QAAQ,SAAS,IAAI,IAAI,QAAQ;AACvC,QAAM,QAAQ,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,IAAI,GAAG,CAAC;AACnD,QAAM,OAAO,SAAS,IAAI,IAAK,OAAO,OAAQ;AAE9C,QAAM,MAAM,QAAQ,IAAI,CAAC,GAAG,MAAM,KAAK,QAAQ,CAAC,IAAK,MAAM;AAC3D,QAAM,WAAW,IAAI,OAAO,CAAC,GAAG,MAAM,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,OAAO,IAAI;AAC7E,SAAO;AAAA,IACL;AAAA,IACA,eAAe,KAAK,KAAK,QAAQ;AAAA,IACjC,qBAAqB;AAAA,IACr
B,GAAG,aAAa;AAAA,IAChB,qBAAqB;AAAA,EACvB;AACF;AAoBO,SAAS,aACd,cACA,OAAyB,CAAC,GACP;AACnB,QAAM,MAAM,KAAK,aAAa;AAC9B,QAAM,OAAO,KAAK,cAAc,EAAE,KAAK,GAAG,MAAM,EAAE;AAClD,MAAI,aAAa,WAAW,EAAG,QAAO,aAAa;AAEnD,QAAM,gBAA0B,CAAC;AACjC,MAAI,OAAO;AACX,MAAI,OAAO;AACX,MAAI,QAAQ;AACZ,aAAW,KAAK,cAAc;AAC5B,QAAI,EAAE,gBAAgB,GAAG;AACvB,YAAM,IAAI,gBAAgB,iDAAiD,EAAE,KAAK,GAAG;AAAA,IACvF;AACA,UAAM,IAAI,KAAK,IAAI,KAAK,EAAE,aAAa,EAAE,YAAY;AACrD,UAAM,IAAI,MAAM,EAAE,QAAQ,KAAK,KAAK,KAAK,IAAI;AAC7C,UAAM,IACJ,OAAO,EAAE,SAAS,YAAY,OAAO,SAAS,EAAE,IAAI,IAChD,MAAM,EAAE,MAAM,KAAK,KAAK,KAAK,IAAI,IACjC;AACN,QAAI,MAAM,MAAM;AACd,oBAAc,KAAK,IAAI,CAAC;AAAA,IAC1B,OAAO;AACL,oBAAc,KAAK,IAAI,KAAK,IAAI,EAAE;AAAA,IACpC;AACA,QAAI,IAAI,KAAM,QAAO;AACrB,YAAQ;AACR,aAAS,IAAI;AAAA,EACf;AACA,QAAM,IAAI,cAAc;AACxB,QAAM,QAAQ,cAAc,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AACzD,QAAM,WAAW,cAAc,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,UAAU,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AAC5F,QAAM,OAAO,SAAS,IAAI,IAAK,OAAO,OAAQ;AAC9C,SAAO;AAAA,IACL;AAAA,IACA,eAAe,KAAK,KAAK,WAAW,CAAC;AAAA,IACrC,qBAAqB;AAAA,IACrB;AAAA,IACA,qBAAqB;AAAA,EACvB;AACF;AAOO,SAAS,qBACd,cACA,OAAyB,CAAC,GACmD;AAC7E,SAAO;AAAA,IACL,KAAK,4BAA4B,cAAc,IAAI;AAAA,IACnD,OAAO,kCAAkC,cAAc,IAAI;AAAA,IAC3D,IAAI,aAAa,cAAc,IAAI;AAAA,EACrC;AACF;AAIA,SAAS,eAAkC;AACzC,SAAO,EAAE,OAAO,GAAG,eAAe,GAAG,qBAAqB,GAAG,GAAG,GAAG,qBAAqB,EAAE;AAC5F;AAEA,SAAS,MAAM,GAAW,IAAY,IAAoB;AACxD,MAAI,CAAC,OAAO,SAAS,CAAC,EAAG,QAAO;AAChC,SAAO,KAAK,IAAI,IAAI,KAAK,IAAI,IAAI,CAAC,CAAC;AACrC;","names":[]}

package/dist/chunk-NPCTHQIO.js ADDED Viewed

@@ -0,0 +1,91 @@
+// src/meta-eval/calibration.ts
+/**
+ * Builds a calibration report by pairing each run's eval score with the
+ * requested metric from that run's most recently captured outcome.
+ *
+ * Runs are skipped when they have no outcomes, when the extracted eval score
+ * is not a finite number, or when the outcome metric is not a finite number.
+ *
+ * @param {{listRuns: Function}} traceStore Source of runs; also passed to the extractor.
+ * @param {{list: Function}} outcomeStore Source of outcome records (`runId`, `capturedAt`, `metrics`).
+ * @param {{id: string, extract?: Function}} evalMetric Eval metric; `extract(run, traceStore)`
+ *   overrides the default extractor derived from `id`.
+ * @param {string} outcomeMetric Key looked up in each outcome's `metrics`.
+ * @param {object} [options] Binning options forwarded to `calibrationFromPairs`.
+ * @returns {Promise<object | null>} The report, or `null` with fewer than two usable pairs.
+ */
+async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
+  const runs = await traceStore.listRuns();
+  const outcomes = await outcomeStore.list();
+
+  // Group outcome records by the run they belong to.
+  const outcomesByRun = new Map();
+  for (const outcome of outcomes) {
+    const bucket = outcomesByRun.get(outcome.runId);
+    if (bucket === undefined) {
+      outcomesByRun.set(outcome.runId, [outcome]);
+    } else {
+      bucket.push(outcome);
+    }
+  }
+
+  const extract = evalMetric.extract ?? defaultExtract(evalMetric.id);
+  const scored = [];
+  for (const run of runs) {
+    const runOutcomes = outcomesByRun.get(run.runId);
+    if (!runOutcomes?.length) continue;
+    const evalScore = await extract(run, traceStore);
+    // Number.isFinite is false for null as well, so this also covers "no score".
+    if (!Number.isFinite(evalScore)) continue;
+    // Newest outcome first; the copy keeps the stored array untouched.
+    const byRecency = [...runOutcomes].sort((a, b) => b.capturedAt - a.capturedAt);
+    const outcome = byRecency[0].metrics[outcomeMetric];
+    if (typeof outcome === "number" && Number.isFinite(outcome)) {
+      scored.push({ evalScore, outcome });
+    }
+  }
+
+  return scored.length < 2
+    ? null
+    : calibrationFromPairs(scored, evalMetric.id, outcomeMetric, options);
+}
+/**
+ * Bins (evalScore, outcome) pairs by eval score and reports, per bin, the mean
+ * eval score, the mean outcome and their absolute gap, plus the expected
+ * calibration error (bin gaps weighted by bin size) and the largest gap.
+ *
+ * Pairs with a non-finite `evalScore` or `outcome` are dropped first.
+ *
+ * @param {Array<{evalScore: number, outcome: number}>} inputPairs Raw pairs.
+ * @param {string} evalMetric Label copied into the report.
+ * @param {string} outcomeMetric Label copied into the report.
+ * @param {{bins?: number, binning?: 'equal-width' | 'equal-frequency', range?: {lo: number, hi: number}}} [options]
+ *   `bins` defaults to 10 and `binning` to `"equal-width"`. `range` sets the
+ *   outer bin edges for equal-width binning; by default they are the smallest
+ *   and largest eval score.
+ * @returns {{evalMetric: string, outcomeMetric: string, n: number, bins: object[], ece: number, maxGap: number} | null}
+ *   `null` with fewer than two finite pairs, or when equal-width bins would
+ *   have zero width.
+ */
+function calibrationFromPairs(inputPairs, evalMetric, outcomeMetric, options = {}) {
+  const pairs = inputPairs.filter(
+    (pair) => Number.isFinite(pair.evalScore) && Number.isFinite(pair.outcome)
+  );
+  if (pairs.length < 2) return null;
+  const numBins = options.bins ?? 10;
+  const binning = options.binning ?? "equal-width";
+  const bins = [];
+  if (binning === "equal-frequency") {
+    const sorted = [...pairs].sort((a, b) => a.evalScore - b.evalScore);
+    const perBin = Math.max(1, Math.floor(sorted.length / numBins));
+    for (let i = 0; i < sorted.length; i += perBin) {
+      const chunk = sorted.slice(i, i + perBin);
+      if (chunk.length === 0) continue;
+      bins.push(toBin(chunk));
+    }
+  } else {
+    // Linear scan instead of Math.min(...xs) / Math.max(...xs): spreading one
+    // argument per pair throws RangeError once the dataset is large.
+    let minScore = Infinity;
+    let maxScore = -Infinity;
+    for (const pair of pairs) {
+      minScore = Math.min(minScore, pair.evalScore);
+      maxScore = Math.max(maxScore, pair.evalScore);
+    }
+    const lo = options.range?.lo ?? minScore;
+    const hi = options.range?.hi ?? maxScore;
+    const width = (hi - lo) / numBins;
+    if (width === 0) return null;
+    for (let i = 0; i < numBins; i++) {
+      const binLo = lo + i * width;
+      // Bins are half-open [binLo, binHi); nudge the last edge past `hi` so a
+      // score exactly equal to `hi` still lands in the final bin.
+      const binHi = i === numBins - 1 ? hi + 1e-9 : lo + (i + 1) * width;
+      const chunk = pairs.filter((p) => p.evalScore >= binLo && p.evalScore < binHi);
+      if (chunk.length === 0) continue;
+      bins.push(toBin(chunk, binLo, binHi));
+    }
+  }
+  const total = bins.reduce((a, b) => a + b.n, 0);
+  const ece = bins.reduce((a, b) => a + b.n / total * b.gap, 0);
+  const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0);
+  return { evalMetric, outcomeMetric, n: pairs.length, bins, ece, maxGap };
+}
+/**
+ * Summarizes one bin of (evalScore, outcome) pairs.
+ *
+ * @param {Array<{evalScore: number, outcome: number}>} chunk Pairs in the bin.
+ * @param {number} [lower] Lower bin edge; defaults to the smallest eval score in `chunk`.
+ * @param {number} [upper] Upper bin edge; defaults to the largest eval score in `chunk`.
+ * @returns {{lower: number, upper: number, n: number, evalMean: number, outcomeMean: number, gap: number}}
+ *   `gap` is `|outcomeMean - evalMean|`.
+ */
+function toBin(chunk, lower, upper) {
+  const xs = chunk.map((c) => c.evalScore);
+  const ys = chunk.map((c) => c.outcome);
+  const evalMean = mean(xs);
+  const outcomeMean = mean(ys);
+  return {
+    // Fold instead of Math.min(...xs) / Math.max(...xs): spreading a large bin
+    // as call arguments throws RangeError. The fold is only evaluated when the
+    // caller gave no explicit edge.
+    lower: lower ?? xs.reduce((lowest, x) => Math.min(lowest, x), Infinity),
+    upper: upper ?? xs.reduce((highest, x) => Math.max(highest, x), -Infinity),
+    n: chunk.length,
+    evalMean,
+    outcomeMean,
+    gap: Math.abs(outcomeMean - evalMean)
+  };
+}
+/**
+ * Arithmetic mean of `xs`, summed left to right.
+ *
+ * @param {number[]} xs Values to average.
+ * @returns {number} The mean; `NaN` for an empty array (0 / 0).
+ */
+function mean(xs) {
+  let total = 0;
+  xs.forEach((value) => {
+    total += value;
+  });
+  return total / xs.length;
+}
+/**
+ * Creates the fallback eval-score extractor used when a metric supplies none.
+ *
+ * The returned async function resolves to `run.outcome.score` when that is
+ * neither `null` nor `undefined`. Otherwise, for the `"pass"` metric it
+ * resolves to 1 when `run.outcome.pass === true` and 0 when it is not; for any
+ * other metric it resolves to `null`.
+ *
+ * @param {string} metric Eval metric id.
+ * @returns {(run: object) => Promise<number | null>} Extractor for a single run.
+ */
+function defaultExtract(metric) {
+  return async (run) => {
+    const score = run.outcome?.score;
+    if (score !== undefined && score !== null) {
+      return score;
+    }
+    if (metric !== "pass") {
+      return null;
+    }
+    return run.outcome?.pass === true ? 1 : 0;
+  };
+}
+export {
+  calibrationCurve,
+  calibrationFromPairs
+};
+//# sourceMappingURL=chunk-NPCTHQIO.js.map

package/dist/chunk-NPCTHQIO.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/meta-eval/calibration.ts"],"sourcesContent":["/**\n * Calibration curve — binned \"if eval says X, what does reality show?\"\n *\n * Companion to correlationStudy. Raw correlation is a single number;\n * the calibration curve shows *where* the eval is well-calibrated vs\n * overconfident / underconfident. Buckets the eval metric, computes\n * mean outcome per bucket, reports expected-calibration-error (ECE).\n */\n\nimport type { Run } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport type { EvalMetricSpec } from './correlation-study'\nimport type { DeploymentOutcome, OutcomeStore } from './outcome-store'\n\nexport interface CalibrationBin {\n lower: number\n upper: number\n n: number\n evalMean: number\n outcomeMean: number\n /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */\n gap: number\n}\n\nexport interface CalibrationReport {\n evalMetric: string\n outcomeMetric: string\n n: number\n bins: CalibrationBin[]\n /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */\n ece: number\n /** Max bin gap — upper bound on miscalibration. */\n maxGap: number\n}\n\nexport interface CalibrationOptions {\n bins?: number\n /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */\n binning?: 'equal-width' | 'equal-frequency'\n /** Clip eval values to [lo, hi] before binning. */\n range?: { lo: number; hi: number }\n}\n\nexport interface CalibrationPair {\n evalScore: number\n outcome: number\n}\n\nexport async function calibrationCurve(\n traceStore: TraceStore,\n outcomeStore: OutcomeStore,\n evalMetric: EvalMetricSpec,\n outcomeMetric: string,\n options: CalibrationOptions = {},\n): Promise<CalibrationReport | null> {\n const runs = await traceStore.listRuns()\n const outcomes = await outcomeStore.list()\n const byRun = new Map<string, DeploymentOutcome[]>()\n for (const o of outcomes) {\n const arr = byRun.get(o.runId) ?? 
[]\n arr.push(o)\n byRun.set(o.runId, arr)\n }\n\n const extract = evalMetric.extract ?? defaultExtract(evalMetric.id)\n const pairs: Array<{ x: number; y: number }> = []\n for (const run of runs) {\n const os = byRun.get(run.runId)\n if (!os?.length) continue\n const x = await extract(run, traceStore)\n if (x === null || !Number.isFinite(x)) continue\n const latest = [...os].sort((a, b) => b.capturedAt - a.capturedAt)[0]!\n const y = latest.metrics[outcomeMetric]\n if (typeof y !== 'number' || !Number.isFinite(y)) continue\n pairs.push({ x, y })\n }\n if (pairs.length < 2) return null\n\n return calibrationFromPairs(\n pairs.map((p) => ({ evalScore: p.x, outcome: p.y })),\n evalMetric.id,\n outcomeMetric,\n options,\n )\n}\n\nexport function calibrationFromPairs(\n inputPairs: CalibrationPair[],\n evalMetric: string,\n outcomeMetric: string,\n options: CalibrationOptions = {},\n): CalibrationReport | null {\n const pairs = inputPairs.filter(\n (pair) => Number.isFinite(pair.evalScore) && Number.isFinite(pair.outcome),\n )\n if (pairs.length < 2) return null\n\n const numBins = options.bins ?? 10\n const binning = options.binning ?? 'equal-width'\n const xs = pairs.map((p) => p.evalScore)\n const lo = options.range?.lo ?? Math.min(...xs)\n const hi = options.range?.hi ?? Math.max(...xs)\n\n const bins: CalibrationBin[] = []\n if (binning === 'equal-frequency') {\n const sorted = [...pairs].sort((a, b) => a.evalScore - b.evalScore)\n const perBin = Math.max(1, Math.floor(sorted.length / numBins))\n for (let i = 0; i < sorted.length; i += perBin) {\n const chunk = sorted.slice(i, i + perBin)\n if (chunk.length === 0) continue\n bins.push(toBin(chunk))\n }\n } else {\n const width = (hi - lo) / numBins\n if (width === 0) return null\n for (let i = 0; i < numBins; i++) {\n const binLo = lo + i * width\n const binHi = i === numBins - 1 ? 
hi + 1e-9 : lo + (i + 1) * width\n const chunk = pairs.filter((p) => p.evalScore >= binLo && p.evalScore < binHi)\n if (chunk.length === 0) continue\n bins.push(toBin(chunk, binLo, binHi))\n }\n }\n\n const total = bins.reduce((a, b) => a + b.n, 0)\n const ece = bins.reduce((a, b) => a + (b.n / total) * b.gap, 0)\n const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0)\n\n return { evalMetric, outcomeMetric, n: pairs.length, bins, ece, maxGap }\n}\n\nfunction toBin(chunk: CalibrationPair[], lower?: number, upper?: number): CalibrationBin {\n const xs = chunk.map((c) => c.evalScore)\n const ys = chunk.map((c) => c.outcome)\n const evalMean = mean(xs)\n const outcomeMean = mean(ys)\n return {\n lower: lower ?? Math.min(...xs),\n upper: upper ?? Math.max(...xs),\n n: chunk.length,\n evalMean,\n outcomeMean,\n gap: Math.abs(outcomeMean - evalMean),\n }\n}\n\nfunction mean(xs: number[]): number {\n return xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run) =>\n run.outcome?.score ?? (metric === 'pass' ? (run.outcome?.pass === true ? 
1 : 0) : null)\n}\n"],"mappings":";AAgDA,eAAsB,iBACpB,YACA,cACA,YACA,eACA,UAA8B,CAAC,GACI;AACnC,QAAM,OAAO,MAAM,WAAW,SAAS;AACvC,QAAM,WAAW,MAAM,aAAa,KAAK;AACzC,QAAM,QAAQ,oBAAI,IAAiC;AACnD,aAAW,KAAK,UAAU;AACxB,UAAM,MAAM,MAAM,IAAI,EAAE,KAAK,KAAK,CAAC;AACnC,QAAI,KAAK,CAAC;AACV,UAAM,IAAI,EAAE,OAAO,GAAG;AAAA,EACxB;AAEA,QAAM,UAAU,WAAW,WAAW,eAAe,WAAW,EAAE;AAClE,QAAM,QAAyC,CAAC;AAChD,aAAW,OAAO,MAAM;AACtB,UAAM,KAAK,MAAM,IAAI,IAAI,KAAK;AAC9B,QAAI,CAAC,IAAI,OAAQ;AACjB,UAAM,IAAI,MAAM,QAAQ,KAAK,UAAU;AACvC,QAAI,MAAM,QAAQ,CAAC,OAAO,SAAS,CAAC,EAAG;AACvC,UAAM,SAAS,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,aAAa,EAAE,UAAU,EAAE,CAAC;AACpE,UAAM,IAAI,OAAO,QAAQ,aAAa;AACtC,QAAI,OAAO,MAAM,YAAY,CAAC,OAAO,SAAS,CAAC,EAAG;AAClD,UAAM,KAAK,EAAE,GAAG,EAAE,CAAC;AAAA,EACrB;AACA,MAAI,MAAM,SAAS,EAAG,QAAO;AAE7B,SAAO;AAAA,IACL,MAAM,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE,GAAG,SAAS,EAAE,EAAE,EAAE;AAAA,IACnD,WAAW;AAAA,IACX;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,qBACd,YACA,YACA,eACA,UAA8B,CAAC,GACL;AAC1B,QAAM,QAAQ,WAAW;AAAA,IACvB,CAAC,SAAS,OAAO,SAAS,KAAK,SAAS,KAAK,OAAO,SAAS,KAAK,OAAO;AAAA,EAC3E;AACA,MAAI,MAAM,SAAS,EAAG,QAAO;AAE7B,QAAM,UAAU,QAAQ,QAAQ;AAChC,QAAM,UAAU,QAAQ,WAAW;AACnC,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,SAAS;AACvC,QAAM,KAAK,QAAQ,OAAO,MAAM,KAAK,IAAI,GAAG,EAAE;AAC9C,QAAM,KAAK,QAAQ,OAAO,MAAM,KAAK,IAAI,GAAG,EAAE;AAE9C,QAAM,OAAyB,CAAC;AAChC,MAAI,YAAY,mBAAmB;AACjC,UAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,UAAM,SAAS,KAAK,IAAI,GAAG,KAAK,MAAM,OAAO,SAAS,OAAO,CAAC;AAC9D,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK,QAAQ;AAC9C,YAAM,QAAQ,OAAO,MAAM,GAAG,IAAI,MAAM;AACxC,UAAI,MAAM,WAAW,EAAG;AACxB,WAAK,KAAK,MAAM,KAAK,CAAC;AAAA,IACxB;AAAA,EACF,OAAO;AACL,UAAM,SAAS,KAAK,MAAM;AAC1B,QAAI,UAAU,EAAG,QAAO;AACxB,aAAS,IAAI,GAAG,IAAI,SAAS,KAAK;AAChC,YAAM,QAAQ,KAAK,IAAI;AACvB,YAAM,QAAQ,MAAM,UAAU,IAAI,KAAK,OAAO,MAAM,IAAI,KAAK;AAC7D,YAAM,QAAQ,MAAM,OAAO,CAAC,MAAM,EAAE,aAAa,SAAS,EAAE,YAAY,KAAK;AAC7E,UAAI,MAAM,WAAW,EAAG;AACxB,WAAK,KAAK,MAAM,OAAO,OAAO,KAAK,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,QAAM,QAAQ,KAAK,OAAO,CAAC,GAAG,MA
AM,IAAI,EAAE,GAAG,CAAC;AAC9C,QAAM,MAAM,KAAK,OAAO,CAAC,GAAG,MAAM,IAAK,EAAE,IAAI,QAAS,EAAE,KAAK,CAAC;AAC9D,QAAM,SAAS,KAAK,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,GAAG,EAAE,GAAG,GAAG,CAAC;AAE1D,SAAO,EAAE,YAAY,eAAe,GAAG,MAAM,QAAQ,MAAM,KAAK,OAAO;AACzE;AAEA,SAAS,MAAM,OAA0B,OAAgB,OAAgC;AACvF,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,SAAS;AACvC,QAAM,KAAK,MAAM,IAAI,CAAC,MAAM,EAAE,OAAO;AACrC,QAAM,WAAW,KAAK,EAAE;AACxB,QAAM,cAAc,KAAK,EAAE;AAC3B,SAAO;AAAA,IACL,OAAO,SAAS,KAAK,IAAI,GAAG,EAAE;AAAA,IAC9B,OAAO,SAAS,KAAK,IAAI,GAAG,EAAE;AAAA,IAC9B,GAAG,MAAM;AAAA,IACT;AAAA,IACA;AAAA,IACA,KAAK,KAAK,IAAI,cAAc,QAAQ;AAAA,EACtC;AACF;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAC5C;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,QACZ,IAAI,SAAS,UAAU,WAAW,SAAU,IAAI,SAAS,SAAS,OAAO,IAAI,IAAK;AACtF;","names":[]}

package/dist/contract/index.d.ts CHANGED Viewed

@@ -1,15 +1,15 @@
-import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-QHG0KnkF.js';
-export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, f as CampaignResult, g as CampaignTraceWriter, h as CodeSurface, D as Dispatch, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-QHG0KnkF.js';
-import { L as LoopProvenanceRecord } from '../provenance-CEAJI9rm.js';
-export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, D as DefaultProductionGateOptions, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, f as composeGate, g as defaultProductionGate, h as evolutionaryDriver, i as heldOutGate, p as paretoPolicy, j as paretoSignificanceGate, r as runEval } from '../provenance-CEAJI9rm.js';
-import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-Bgu4C59E.js';
-export { G as GepaDriverOptions, a as RunCampaignOptions, b as RunImprovementLoopOptions, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, r as runCampaign, c as runImprovementLoop } from '../run-improvement-loop-Bgu4C59E.js';
-export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
-import { HostedTenant, TraceSpanEvent } from '../hosted/index.js';
+import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate, c as GateDecision } from '../types-4mm2msnR.js';
+export { C as CampaignAggregates, d as CampaignArtifactWriter, e as CampaignCellResult, f as CampaignCostMeter, g as CampaignResult, h as CampaignTraceWriter, i as CodeSurface, D as Dispatch, j as GateContext, k as GateResult, l as GenerationCandidate, m as GenerationRecord, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-4mm2msnR.js';
+import { L as LoopProvenanceRecord } from '../provenance-jG-Gngg8.js';
+export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, D as DefaultProductionGateOptions, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, f as composeGate, g as defaultProductionGate, h as evolutionaryDriver, i as heldOutGate, p as paretoPolicy, j as paretoSignificanceGate, r as runEval } from '../provenance-jG-Gngg8.js';
+import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-BAl_aVOZ.js';
+export { G as GepaDriverOptions, a as RunCampaignOptions, b as RunImprovementLoopOptions, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, r as runCampaign, c as runImprovementLoop } from '../run-improvement-loop-BAl_aVOZ.js';
+export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, b as OutcomeStore } from '../outcome-store-rnXLEqSn.js';
+import { HostedTenant, EvalRunCellScore, EvalRunGenerationSnapshot, EvalRunEvent, TraceSpanEvent } from '../hosted/index.js';
 import { R as RunRecord, b as RunSplitTag } from '../run-record-sItO5ftF.js';
 import { I as InsightReport } from '../insight-report-dlpEzQDi.js';
 export { F as FailureClusterInsight, a as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, b as ReleaseSummary, S as ScalarDistribution } from '../insight-report-dlpEzQDi.js';
-import { A as AnalystRegistry } from '../registry-BmEuU94S.js';
+import { a as AnalystRegistry } from '../registry-BK0Zee01.js';
 import { a as DatasetScenario } from '../dataset-B2kL-fSM.js';
 import '../red-team-DW9Ca_tj.js';
 import '../store-CKUAgsJz.js';
@@ -351,6 +351,119 @@ interface AnalyzeRunsOptions {
 }
 declare function analyzeRuns(opts: AnalyzeRunsOptions): Promise<InsightReport>;
+/**
+ * # `@tangle-network/agent-eval/contract` — eval-run diff primitive.
+ *
+ * The substrate side of the v-N-versus-v-N+1 dashboard view. Given two
+ * `EvalRunEvent`s (or two `EvalRunGenerationSnapshot`s from one run), this
+ * returns a normalised diff: per-cell composite + per-judge/per-dimension
+ * deltas, surface-hash change, aggregate cost + duration shifts.
+ *
+ * Consumed by:
+ *   - The hosted-tier dashboard (intelligence-web) — renders v3 vs v4
+ *     comparisons of cells × judges × dimensions.
+ *   - CI reporting — emits a "shipped: composite +0.07, cost +$1.20" line
+ *     in PR review for autonomous-improvement runs.
+ *   - Any downstream consumer that needs "what actually changed" without
+ *     reimplementing the matching + arithmetic.
+ *
+ * Cells are matched on the natural composite key `(scenarioId, rep)`.
+ * Unmatched cells surface as `removed` / `added` so callers can tell
+ * "this cell got worse" from "this cell wasn't run."
+ */
+/** Per-dimension delta. `before` / `after` are null when the judge did not
+ *  emit a value for that side, or emitted something that is not a finite
+ *  number. `delta` is `after - before`; null when either side is null. */
+interface EvalDimensionDelta {
+    before: number | null;
+    after: number | null;
+    delta: number | null;
+}
+/** Per-cell delta, keyed on `(scenarioId, rep)`. `compositeBefore` and
+ *  `compositeAfter` are the two matched cells' `compositeMean` values;
+ *  `compositeDelta` is `compositeAfter - compositeBefore`. */
+interface EvalCellScoreDelta {
+    scenarioId: string;
+    rep: number;
+    compositeBefore: number;
+    compositeAfter: number;
+    compositeDelta: number;
+    /** Per-judge → per-dimension deltas. Outer key = judge name from
+     *  `EvalRunCellScore.dimensions`; inner key = dimension name. Judges and
+     *  dimensions present on only one side are still listed, with the
+     *  missing side reported as null. */
+    dimensions: Record<string, Record<string, EvalDimensionDelta>>;
+}
+/** Diff between two generation snapshots — the unit the dashboard renders
+ *  for a single "v3 vs v4" comparison. `beforeIndex` / `afterIndex` and the
+ *  two surface hashes are copied from the snapshots; `surfaceChanged` is
+ *  true when the hashes differ. Every `*Delta` field is `after - before`. */
+interface EvalGenerationDiff {
+    beforeIndex: number;
+    afterIndex: number;
+    beforeSurfaceHash: string;
+    afterSurfaceHash: string;
+    surfaceChanged: boolean;
+    /** Cells present in both snapshots, matched on `(scenarioId, rep)`. */
+    matched: EvalCellScoreDelta[];
+    /** Cells present in `before` but missing from `after`. */
+    removed: EvalRunCellScore[];
+    /** Cells present in `after` but missing from `before`. */
+    added: EvalRunCellScore[];
+    /** Each snapshot's own aggregate `compositeMean`, copied as stored on
+     *  the snapshot rather than recomputed from `matched`. */
+    compositeBefore: number;
+    compositeAfter: number;
+    compositeDelta: number;
+    costUsdBefore: number;
+    costUsdAfter: number;
+    costUsdDelta: number;
+    durationMsBefore: number;
+    durationMsAfter: number;
+    durationMsDelta: number;
+}
+/** Diff between two full eval-runs. Includes both baseline-vs-baseline and
+ *  winner-vs-winner generation diffs when both sides expose them, plus
+ *  run-level metadata. Gate decisions and holdout lifts are null when the
+ *  run does not carry them; `holdoutLiftDelta` is null unless both lifts
+ *  are present. Every `*Delta` field is `after - before`. */
+interface EvalRunDiff {
+    beforeRunId: string;
+    afterRunId: string;
+    beforeTimestamp: string;
+    afterTimestamp: string;
+    beforeGateDecision: GateDecision | null;
+    afterGateDecision: GateDecision | null;
+    beforeHoldoutLift: number | null;
+    afterHoldoutLift: number | null;
+    holdoutLiftDelta: number | null;
+    beforeTotalCostUsd: number;
+    afterTotalCostUsd: number;
+    totalCostUsdDelta: number;
+    beforeTotalDurationMs: number;
+    afterTotalDurationMs: number;
+    totalDurationMsDelta: number;
+    /** Baseline-vs-baseline diff. Null when either run has no baseline. */
+    baselineDiff: EvalGenerationDiff | null;
+    /** Highest-index-generation comparison. Null when either run has no
+     *  recorded generations (e.g. baseline-only or errored before any
+     *  generation completed). */
+    winnersDiff: EvalGenerationDiff | null;
+}
+/**
+ * Diff two generation snapshots. Cells are matched on `(scenarioId, rep)`;
+ * unmatched cells surface in `added` / `removed`. Aggregate fields
+ * (composite mean, cost, duration) are copied from each snapshot's stored
+ * values, not re-derived from cells — this keeps the diff consistent with
+ * whatever aggregation the substrate actually reported.
+ */
+declare function diffGenerations(before: EvalRunGenerationSnapshot, after: EvalRunGenerationSnapshot): EvalGenerationDiff;
+/**
+ * Diff two full eval-runs. Produces baseline-vs-baseline and
+ * winner-vs-winner generation diffs when both sides expose them, plus
+ * run-level cost / duration / holdout-lift deltas and each run's gate
+ * decision. The "winner" of a run is its highest-index generation.
+ */
+declare function diffRuns(before: EvalRunEvent, after: EvalRunEvent): EvalRunDiff;
+/**
+ * Within-run baseline → winning-generation diff. The natural "what did the
+ * improvement loop produce" view for a single run. Returns null when the
+ * run has no baseline, has no recorded generations (e.g. errored early), or
+ * its highest-index generation has the same index as the baseline (e.g. the
+ * gate shipped the baseline as-is).
+ */
+declare function diffRunBaselineToWinner(run: EvalRunEvent): EvalGenerationDiff | null;
 /**
  * `fromAgentTrace` — provenance correlation from Cursor's Agent Trace spec
  * (https://github.com/cursor/agent-trace, RFC v0.1.0).
@@ -567,4 +680,4 @@ interface FromOtelSpansOptions {
 }
 declare function fromOtelSpans(opts: FromOtelSpansOptions): RunRecord[];
-export { type AgentTraceContributor, type AgentTraceContributorType, type AgentTraceConversation, type AgentTraceFile, type AgentTraceIndex, type AgentTraceRange, type AgentTraceRecord, type AnalyzeRunsOptions, type AuthoringProvenance, CampaignStorage, DispatchContext, type FeedbackTableMeta, type FeedbackTableRow, type FromFeedbackTableOptions, type FromFeedbackTableResult, type FromOtelSpansOptions, Gate, ImprovementDriver, InsightReport, JudgeConfig, MutableSurface, type PartitionByAuthoringModelResult, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, analyzeRuns, fromFeedbackTable, fromOtelSpans, parseAgentTrace, partitionRunsByAuthoringModel, selfImprove };
+export { type AgentTraceContributor, type AgentTraceContributorType, type AgentTraceConversation, type AgentTraceFile, type AgentTraceIndex, type AgentTraceRange, type AgentTraceRecord, type AnalyzeRunsOptions, type AuthoringProvenance, CampaignStorage, DispatchContext, type EvalCellScoreDelta, type EvalDimensionDelta, type EvalGenerationDiff, type EvalRunDiff, type FeedbackTableMeta, type FeedbackTableRow, type FromFeedbackTableOptions, type FromFeedbackTableResult, type FromOtelSpansOptions, Gate, GateDecision, ImprovementDriver, InsightReport, JudgeConfig, MutableSurface, type PartitionByAuthoringModelResult, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, analyzeRuns, diffGenerations, diffRunBaselineToWinner, diffRuns, fromFeedbackTable, fromOtelSpans, parseAgentTrace, partitionRunsByAuthoringModel, selfImprove };

package/dist/contract/index.js CHANGED Viewed

@@ -1087,6 +1087,119 @@ function cellsToRunRecords(cells, candidateId, runId, surface) {
   });
 }
+// src/contract/diff.ts
+function keyForCell(cell) {
+  return JSON.stringify([cell.scenarioId, cell.rep]);
+}
+function diffDimensions(before, after) {
+  const out = {};
+  const judges = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
+  for (const judge of judges) {
+    const beforeDims = before[judge] ?? {};
+    const afterDims = after[judge] ?? {};
+    const dims = /* @__PURE__ */ new Set([...Object.keys(beforeDims), ...Object.keys(afterDims)]);
+    const judgeOut = {};
+    for (const dim of dims) {
+      const rawBefore = beforeDims[dim];
+      const rawAfter = afterDims[dim];
+      const b = typeof rawBefore === "number" && Number.isFinite(rawBefore) ? rawBefore : null;
+      const a = typeof rawAfter === "number" && Number.isFinite(rawAfter) ? rawAfter : null;
+      judgeOut[dim] = {
+        before: b,
+        after: a,
+        delta: b !== null && a !== null ? a - b : null
+      };
+    }
+    out[judge] = judgeOut;
+  }
+  return out;
+}
+function diffGenerations(before, after) {
+  const beforeMap = new Map(before.cells.map((c) => [keyForCell(c), c]));
+  const afterMap = new Map(after.cells.map((c) => [keyForCell(c), c]));
+  const matched = [];
+  const removed = [];
+  const added = [];
+  for (const [key, beforeCell] of beforeMap) {
+    const afterCell = afterMap.get(key);
+    if (!afterCell) {
+      removed.push(beforeCell);
+      continue;
+    }
+    matched.push({
+      scenarioId: beforeCell.scenarioId,
+      rep: beforeCell.rep,
+      compositeBefore: beforeCell.compositeMean,
+      compositeAfter: afterCell.compositeMean,
+      compositeDelta: afterCell.compositeMean - beforeCell.compositeMean,
+      dimensions: diffDimensions(beforeCell.dimensions, afterCell.dimensions)
+    });
+  }
+  for (const [key, afterCell] of afterMap) {
+    if (!beforeMap.has(key)) added.push(afterCell);
+  }
+  return {
+    beforeIndex: before.index,
+    afterIndex: after.index,
+    beforeSurfaceHash: before.surfaceHash,
+    afterSurfaceHash: after.surfaceHash,
+    surfaceChanged: before.surfaceHash !== after.surfaceHash,
+    matched,
+    removed,
+    added,
+    compositeBefore: before.compositeMean,
+    compositeAfter: after.compositeMean,
+    compositeDelta: after.compositeMean - before.compositeMean,
+    costUsdBefore: before.costUsd,
+    costUsdAfter: after.costUsd,
+    costUsdDelta: after.costUsd - before.costUsd,
+    durationMsBefore: before.durationMs,
+    durationMsAfter: after.durationMs,
+    durationMsDelta: after.durationMs - before.durationMs
+  };
+}
+function winnerOf(run) {
+  if (run.generations.length === 0) return null;
+  let winner = run.generations[0];
+  for (const gen of run.generations) {
+    if (gen.index > winner.index) winner = gen;
+  }
+  return winner;
+}
+function diffRuns(before, after) {
+  const beforeWinner = winnerOf(before);
+  const afterWinner = winnerOf(after);
+  const baselineDiff = before.baseline && after.baseline ? diffGenerations(before.baseline, after.baseline) : null;
+  const winnersDiff = beforeWinner && afterWinner ? diffGenerations(beforeWinner, afterWinner) : null;
+  const beforeLift = before.holdoutLift ?? null;
+  const afterLift = after.holdoutLift ?? null;
+  return {
+    beforeRunId: before.runId,
+    afterRunId: after.runId,
+    beforeTimestamp: before.timestamp,
+    afterTimestamp: after.timestamp,
+    beforeGateDecision: before.gateDecision ?? null,
+    afterGateDecision: after.gateDecision ?? null,
+    beforeHoldoutLift: beforeLift,
+    afterHoldoutLift: afterLift,
+    holdoutLiftDelta: beforeLift !== null && afterLift !== null ? afterLift - beforeLift : null,
+    beforeTotalCostUsd: before.totalCostUsd,
+    afterTotalCostUsd: after.totalCostUsd,
+    totalCostUsdDelta: after.totalCostUsd - before.totalCostUsd,
+    beforeTotalDurationMs: before.totalDurationMs,
+    afterTotalDurationMs: after.totalDurationMs,
+    totalDurationMsDelta: after.totalDurationMs - before.totalDurationMs,
+    baselineDiff,
+    winnersDiff
+  };
+}
+function diffRunBaselineToWinner(run) {
+  if (!run.baseline) return null;
+  const winner = winnerOf(run);
+  if (!winner || winner.index === run.baseline.index) return null;
+  return diffGenerations(run.baseline, winner);
+}
 // src/contract/intake/agent-trace.ts
 function rangeLines(r) {
   return Math.max(0, r.end_line - r.start_line + 1);
@@ -1328,6 +1441,9 @@ export {
   buildEvidenceVector,
   composeGate,
   defaultProductionGate,
+  diffGenerations,
+  diffRunBaselineToWinner,
+  diffRuns,
   evolutionaryDriver,
   fromFeedbackTable,
   fromOtelSpans,