npm - @tangle-network/agent-eval - Versions diffs - 0.53.0 → 0.55.0 - Mend

@tangle-network/agent-eval 0.53.0 → 0.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +7 -6
package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
package/dist/benchmarks/index.d.ts +3 -2
package/dist/builder-eval/index.d.ts +4 -3
package/dist/campaign/index.d.ts +9 -7
package/dist/campaign/index.js +33 -4
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
package/dist/{chunk-5KSDYBYH.js → chunk-LYL4SOKT.js} +3 -2
package/dist/chunk-LYL4SOKT.js.map +1 -0
package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
package/dist/chunk-NCK5QLGT.js.map +1 -0
package/dist/contract/index.d.ts +13 -12
package/dist/contract/index.js +25 -0
package/dist/contract/index.js.map +1 -1
package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
package/dist/control.d.ts +7 -6
package/dist/control.js +2 -2
package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
package/dist/governance/index.d.ts +3 -2
package/dist/hosted/index.d.ts +7 -6
package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
package/dist/index.d.ts +31 -29
package/dist/index.js +3 -3
package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
package/dist/knowledge/index.d.ts +4 -3
package/dist/meta-eval/index.d.ts +4 -3
package/dist/openapi.json +1 -1
package/dist/pipelines/index.d.ts +7 -6
package/dist/prm/index.d.ts +5 -4
package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
package/dist/reporting.d.ts +7 -6
package/dist/{researcher-LZD0qHEa.d.ts → researcher-JP8EvnLv.d.ts} +11 -6
package/dist/rl.d.ts +11 -10
package/dist/rl.js +2 -2
package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
package/dist/store-CKUAgsJz.d.ts +101 -0
package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
package/dist/traces.d.ts +7 -6
package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
package/dist/wire/index.d.ts +5 -4
package/docs/pilot/README.md +62 -0
package/docs/pilot/customer-checklist.md +90 -0
package/docs/pilot/integration-foreign-stack.md +296 -0
package/docs/pilot/integration-tangle-stack.md +248 -0
package/docs/pilot/one-pager.md +161 -0
package/docs/pilot/sample-insight-report.json +172 -0
package/docs/research/research-roadmap.md +204 -0
package/package.json +1 -1
package/dist/chunk-5KSDYBYH.js.map +0 -1
package/dist/chunk-BWZEGTES.js.map +0 -1
/package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0

package/docs/pilot/one-pager.md ADDED Viewed

@@ -0,0 +1,161 @@
+# Statistical self-improvement for your agent — one-pager
+**For:** teams running an agent on the Tangle stack (sandbox + tcloud), OR any agent emitting OTel traces, OR LangChain / LlamaIndex / Anthropic SDK / OpenAI Assistants / OpenRouter / custom — we meet you where you are.
+**The pitch:** every week, get a statistically-rigorous answer to *"did my last change help?"* + a closed loop that proposes the next improvement + a held-out gate that refuses to ship regressions.
+## What you get
+| Deliverable | Cadence | LLM cost |
+|---|---|---|
+| **Decision packet** — composite distribution, per-dimension judges, cost-quality Pareto, failure clusters, named worst-N runs, ranked recommendations | Whenever you want it. Hosted runs on a 15-min schedule by default. | $0 (deterministic) |
+| **Prior-period comparison** — Welch CI on composite / cost / duration / per-dimension deltas vs your prior week, with regressed + improved metrics named | Same cadence | $0 |
+| **Closed-loop improvement** — `selfImprove()` proposes prompt edits, runs scenarios, gates on paired-bootstrap CI, auto-PRs the winner | On-demand, opt-in | Real $; you set a `maxUsd` ceiling |
+Every claim is falsifiable: `n=`, `CI95=[a, b]`, `p=`, `Cohen's d=`. No vibes, no "score went up." Where the data doesn't support a section, the report says so explicitly instead of inventing signal.
+## Why this is different
+| | LangSmith / Braintrust / Phoenix | Hermes / Claude Code skills | **Tangle** |
+|---|---|---|---|
+| Trace ingest | proprietary | own runtime | universal (sandbox + tcloud + OTel + any custom) |
+| Decision packet | scorecards (no CI) | none | **paired-bootstrap CI on every claim** |
+| Closed loop | none | heuristic, no gate | **statistically-gated; refuses regressions** |
+| Prior-period delta | none | none | **Welch CI on every metric** |
+| Sample-size guidance | none | none | **MDE-aware** |
+| Auto-PR promotion | none | none | **opt-in, on green gate only** |
+## Integration paths — pick your stack
+| Your stack | Intake adapter | LLM provider for closed loop |
+|---|---|---|
+| **Tangle (sandbox + tcloud)** | `fromTangleSandbox` | tcloud (already wired) |
+| Any OTel exporter (Datadog APM, Honeycomb, NewRelic, OpenInference) | `fromOtelSpans` | any OpenAI-compat |
+| LangChain (LangSmith) | LangSmith → OTel export → `fromOtelSpans` today; `fromLangChain` queued for 0.55.0 | OpenAI, Anthropic, OpenRouter, tcloud |
+| LlamaIndex | `OpenInferenceCallbackHandler` → OTel → ingest | any OpenAI-compat |
+| Anthropic SDK direct | OTel wrapping (~20 LOC) → `fromOtelSpans`; `fromAnthropicSDK` queued | Anthropic, OpenRouter |
+| OpenAI Assistants API | Custom mapper (~20 LOC) today; `fromOpenAIAssistants` queued | OpenAI, OpenRouter |
+| OpenRouter (any model on any path) | Whatever you already use for tracing | OpenRouter (OpenAI-compat baseUrl) |
+| vLLM / Ollama / LMStudio / self-hosted | OTel wrap | Your local OpenAI-compat endpoint |
+| Multi-rater human feedback (no automated judge yet) | `fromFeedbackTable` | n/a — gives you κ + disagreement triage |
+| Custom logs / DB rows | ~20-line mapper to `RunRecord` | any OpenAI-compat |
+Full integration walkthroughs:
+- **Tangle stack** → [`integration-tangle-stack.md`](./integration-tangle-stack.md)
+- **Everything else** → [`integration-foreign-stack.md`](./integration-foreign-stack.md)
+## Zero-setup demo first — 30 seconds
+Before any integration, run the demo against synthetic data so you see the output shape live:
+```sh
+npx @tangle-network/intelligence demo
+```
+No install, no key, no data. Synthetic agent runs through synthetic scenarios; the CLI prints a real `InsightReport` with composite distribution + Pareto + prior-period delta + ranked recommendations. Same output shape you'll get on your real data once we integrate.
+When you're ready to integrate, the same CLI scaffolds your repo:
+```sh
+npx @tangle-network/intelligence init           # creates eval/scenarios.json + judges.ts + pnpm scripts + .runs/
+npx @tangle-network/intelligence report          # renders InsightReport from your latest traces
+npx @tangle-network/intelligence improve --max-usd 25    # runs selfImprove with cost ceiling, opens auto-PR on green gate
+```
+Hosted equivalent: **[staging-intelligence.tangle.tools](https://staging-intelligence.tangle.tools)** — open in your browser, ingest your traces, see the dashboard render the same packet your CLI produces.
+## How you integrate (Tangle stack — 4 steps)
+```ts
+import { fromTangleSandbox } from '@tangle-network/agent-eval/adapters/sandbox'
+import { analyzeRuns, selfImprove, gepaDriver } from '@tangle-network/agent-eval/contract'
+// 1. You already emit traces via @tangle-network/sandbox + tcloud.
+//    Pull them into canonical RunRecord[]:
+const runs = fromTangleSandbox({ sessionId, sinceMs: lastReportTime })
+// 2. Get the decision packet — no LLM cost.
+const report = await analyzeRuns({ runs, baselineRuns: priorWeekRuns })
+//    → report.composite + .priorPeriodComparison + .recommendations
+// 3. When you want to actually improve, run the closed loop.
+const result = await selfImprove({
+  scenarios: yourScenarios,                     // we help you build these
+  agent: (surface, scenario) => runYourAgent(scenario, surface),
+  judge: yourJudge,                              // any function (artifact) → JudgeScore
+  baselineSurface: currentSystemPrompt,
+  driver: gepaDriver({ llm: tcloud, model: 'claude-sonnet-4.6', target: 'agent prompt' }),
+  budget: { generations: 3, populationSize: 4, holdoutFraction: 0.3, maxUsd: 25 },
+})
+// 4. Result is a verifiable diff with statistical evidence.
+//    Auto-PR if result.gateDecision === 'ship-substrate'.
+```
+That's it. ~30 lines of integration code; the rest is your existing agent + tcloud setup.
+## What we need from you
+- API key for tcloud (you already have this)
+- Read access to your sandbox session traces
+- A list of 20-50 representative scenarios your agent should handle
+- A judge function — even a simple LLM-as-judge gets you 80% of the value
+- An LLM-cost budget for the closed loop (default: $25/campaign)
+## What you ship back to your customers
+The substrate produces a single JSON `InsightReport` that your dashboard renders. A live demo is embedded in the Tangle Intelligence dashboard. An example follows — every section is optional, depending on what your data supports.
+```json
+{
+  "n": 36,
+  "composite": {
+    "mean": 0.823, "p50": 0.85, "p95": 0.96, "stddev": 0.11,
+    "tailRuns": [
+      { "runId": "scenario::checkout-bug", "score": 0.41 },
+      { "runId": "scenario::refund-policy", "score": 0.48 }
+    ]
+  },
+  "priorPeriodComparison": {
+    "baselineN": 34,
+    "currentN": 36,
+    "windowLabel": "vs prior 7 days",
+    "metrics": {
+      "composite": {
+        "current": 0.823, "baseline": 0.731, "delta": 0.092,
+        "ci95": [0.041, 0.143], "pValue": 0.0008,
+        "cohensD": 0.84, "significant": true
+      }
+    },
+    "improvedMetrics": ["composite"],
+    "regressedMetrics": []
+  },
+  "recommendations": [
+    {
+      "priority": "low",
+      "kind": "ship",
+      "title": "composite improved from 0.731 → 0.823 vs prior 7 days",
+      "detail": "Welch CI95=[0.041, 0.143], p=0.0008, Cohen's d=0.84 (n_current=36, n_baseline=34). Statistically significant improvement worth flagging."
+    },
+    {
+      "priority": "high",
+      "kind": "investigate",
+      "title": "Top failure cluster: refund-policy (75% of failures)",
+      "detail": "4 runs failed. Largest cluster groups by intent — agent missed compliance flag in 3 of 4."
+    }
+  ]
+}
+```
+## Pricing for the pilot
+- Free for the first 30 days
+- Hosted decision-packet generation: included
+- LLM cost on closed-loop campaigns: pass-through to your tcloud account
+- Post-pilot: per-campaign pricing tied to budget cap + per-decision-packet billed monthly
+## Next step
+Reply with: which agent + which week you want to start, and we'll set up the integration on a shared call. ~1 hour to first running report.
+—
+*Tangle Network · @tangle-network/agent-eval @0.53.0 · MIT · Self-hostable*

package/docs/pilot/sample-insight-report.json ADDED Viewed

@@ -0,0 +1,172 @@
+{
+  "n": 36,
+  "composite": {
+    "n": 36,
+    "mean": 0.823,
+    "p50": 0.85,
+    "p95": 0.96,
+    "stddev": 0.114,
+    "min": 0.41,
+    "max": 0.98,
+    "tailRuns": [
+      { "runId": "scenario::checkout-bug", "score": 0.41 },
+      { "runId": "scenario::refund-policy-edge", "score": 0.48 },
+      { "runId": "scenario::multi-tenant-isolation", "score": 0.52 },
+      { "runId": "scenario::stale-cache-invalidation", "score": 0.55 },
+      { "runId": "scenario::partial-payment", "score": 0.61 }
+    ],
+    "histogram": [
+      { "lo": 0.40, "hi": 0.46, "count": 1 },
+      { "lo": 0.46, "hi": 0.51, "count": 1 },
+      { "lo": 0.51, "hi": 0.57, "count": 1 },
+      { "lo": 0.57, "hi": 0.62, "count": 1 },
+      { "lo": 0.62, "hi": 0.68, "count": 2 },
+      { "lo": 0.68, "hi": 0.74, "count": 2 },
+      { "lo": 0.74, "hi": 0.80, "count": 4 },
+      { "lo": 0.80, "hi": 0.85, "count": 9 },
+      { "lo": 0.85, "hi": 0.91, "count": 8 },
+      { "lo": 0.91, "hi": 0.96, "count": 6 },
+      { "lo": 0.96, "hi": 1.0, "count": 1 }
+    ]
+  },
+  "perDimension": {
+    "intent-recognition": {
+      "n": 36, "mean": 0.89, "p50": 0.92, "p95": 0.98, "stddev": 0.08
+    },
+    "compliance-flagging": {
+      "n": 36, "mean": 0.71, "p50": 0.75, "p95": 0.92, "stddev": 0.18
+    },
+    "tone": {
+      "n": 36, "mean": 0.94, "p50": 0.95, "p95": 0.99, "stddev": 0.04
+    }
+  },
+  "costQuality": {
+    "cost": {
+      "n": 36, "mean": 0.087, "p50": 0.082, "p95": 0.124, "stddev": 0.021
+    },
+    "pareto": {
+      "kind": "pareto-cost-quality",
+      "split": "holdout",
+      "axes": { "x": "costUsd", "y": "score" },
+      "points": [
+        { "candidateId": "baseline-v3.1", "cost": 0.087, "quality": 0.823, "n": 36, "onFrontier": true }
+      ]
+    }
+  },
+  "judges": {
+    "claude-sonnet-4.6": {
+      "n": 36,
+      "meanScore": 0.831,
+      "calibration": null
+    }
+  },
+  "lift": null,
+  "failureClusters": {
+    "totalFailures": 4,
+    "clusters": [
+      {
+        "id": "cluster_refund_compliance",
+        "name": "refund-policy missed compliance flag",
+        "share": 0.75,
+        "exemplars": [
+          "scenario::refund-policy-edge",
+          "scenario::partial-payment",
+          "scenario::cross-border-refund"
+        ],
+        "suggestedFix": "Add explicit step to the addendum: when refund amount > $100 OR cross-border, surface compliance flag before responding."
+      }
+    ]
+  },
+  "priorPeriodComparison": {
+    "baselineN": 34,
+    "currentN": 36,
+    "windowLabel": "vs prior 7 days",
+    "metrics": {
+      "composite": {
+        "current": 0.823,
+        "baseline": 0.731,
+        "delta": 0.092,
+        "ci95": [0.041, 0.143],
+        "pValue": 0.0008,
+        "cohensD": 0.84,
+        "baselineN": 34,
+        "currentN": 36,
+        "significant": true
+      },
+      "cost": {
+        "current": 0.087,
+        "baseline": 0.082,
+        "delta": 0.005,
+        "ci95": [-0.003, 0.013],
+        "pValue": 0.21,
+        "cohensD": 0.23,
+        "baselineN": 34,
+        "currentN": 36,
+        "significant": false
+      },
+      "duration": {
+        "current": 4820,
+        "baseline": 5340,
+        "delta": -520,
+        "ci95": [-840, -200],
+        "pValue": 0.002,
+        "cohensD": -0.71,
+        "baselineN": 34,
+        "currentN": 36,
+        "significant": true
+      },
+      "dim.compliance-flagging": {
+        "current": 0.71,
+        "baseline": 0.58,
+        "delta": 0.13,
+        "ci95": [0.06, 0.20],
+        "pValue": 0.0004,
+        "cohensD": 0.79,
+        "baselineN": 34,
+        "currentN": 36,
+        "significant": true
+      }
+    },
+    "improvedMetrics": ["composite", "duration", "dim.compliance-flagging"],
+    "regressedMetrics": []
+  },
+  "release": {
+    "status": "pass",
+    "axes": [
+      { "name": "quality-lift", "status": "pass", "detail": "no candidate/baseline pair within campaign; relying on priorPeriodComparison" },
+      { "name": "contamination", "status": "pass", "detail": "no canaries supplied" },
+      { "name": "composite-distribution", "status": "pass", "detail": "mean=0.823, p50=0.85, p95=0.96 over n=36" }
+    ],
+    "issues": []
+  },
+  "recommendations": [
+    {
+      "priority": "low",
+      "kind": "ship",
+      "title": "composite improved from 0.731 → 0.823 vs prior 7 days",
+      "detail": "Welch CI95=[0.041, 0.143], p=0.0008, Cohen's d=0.84 (n_current=36, n_baseline=34). Statistically significant improvement worth flagging.",
+      "evidencePath": "priorPeriodComparison.metrics.composite"
+    },
+    {
+      "priority": "low",
+      "kind": "ship",
+      "title": "compliance-flagging dimension improved from 0.58 → 0.71",
+      "detail": "Welch CI95=[0.06, 0.20], p=0.0004, Cohen's d=0.79. The fix from last week's PR is statistically validated.",
+      "evidencePath": "priorPeriodComparison.metrics.dim.compliance-flagging"
+    },
+    {
+      "priority": "high",
+      "kind": "investigate",
+      "title": "Top failure cluster: refund-policy missed compliance flag (75% of failures)",
+      "detail": "3 of 4 failed runs cluster here. Suggested fix: add explicit step to addendum for refund > $100 OR cross-border → surface compliance flag.",
+      "evidencePath": "failureClusters.clusters[0]"
+    },
+    {
+      "priority": "low",
+      "kind": "ship",
+      "title": "duration improved from 5340ms → 4820ms vs prior 7 days",
+      "detail": "Welch CI95=[-840, -200]ms, p=0.002, Cohen's d=-0.71. Agent is meaningfully faster — worth keeping the optimization that drove this.",
+      "evidencePath": "priorPeriodComparison.metrics.duration"
+    }
+  ]
+}

package/docs/research/research-roadmap.md ADDED Viewed

@@ -0,0 +1,204 @@
+# Research Roadmap — Agent Self-Improvement as a Research Field
+**Status:** Living. Separate from the product roadmap. Updated when a thesis formalizes, an experiment runs, or a draft posts.
+**Tracking:** task #107.
+**Audience:** Dario, Yann, Ilya, Sam, lab researchers, peer reviewers.
+**Posture:** Honest about what we have, sharp about what we'd need.
+## One-sentence pitch
+**Agent self-improvement is missing its statistical foundation, its formal model of two-writer state, and its standard benchmark. We claim all three.**
+## The three publishable theses
+### Thesis 1 — Branch-benchmark consensus for safe offline+online self-improvement
+**Setup.** Two writers concurrently mutate an agent's behavior surface: an in-sandbox harness (per-turn, online) and an offline substrate (batch, statistically-gated). Both produce divergent versions from a common ancestor. Existing literature handles online RL (single writer = policy) and offline RL (no in-runtime writes) — nobody has formalized the *combined* regime where both writers coexist.
+**Claim to prove.** Given common ancestor `P_anc`, harness branch `P_h`, substrate branch `P_s`, scenarios `S`, and judge `J`, a branch-benchmark consensus procedure produces a winner `P_w` whose regret satisfies `R(P_w) ≤ min(R(P_h), R(P_s)) + ε` with probability `≥ 1−δ` under explicit assumptions about judge calibration + scenario coverage.
+**Why it's publishable.**
+- Genuinely novel regime — the combined offline+online assumption set is uncharted.
+- Maps to a real customer pain point (Hermes-on-our-sandbox drift).
+- Tractable proof structure: paired-bootstrap + Hoeffding + union bound across surface dimensions.
+- Empirical validation: instrument Hermes-on-our-sandbox, measure consensus-vs-naïve-merge regret.
+**Estimated effort.** ~3 months focused. Maps to product task #98 (profile-versioning architecture).
+**Venue.** NeurIPS or ICLR main track. Or workshop at NeurIPS Foundation Models for Decision Making.
+---
+### Thesis 2 — Natural-language corrective feedback as a learnable gradient
+**Setup.** RL provides scalar reward `r ∈ ℝ`. Natural-language feedback ("stop doing X", "you always Y", "this is too verbose") carries strictly more information per bit but no formal model says how to combine it with scalar reward to update policy or skill state. Hermes uses corrective feedback heuristically; GEPA paper claims language is a richer learning medium but doesn't formalize this specific signal. The gap is wide open.
+**Claim to prove.**
+1. Information-theoretic: corrective utterances carry `H(c) > H(r)` bits of policy-relevant information under realistic distributions of user satisfaction.
+2. Algorithmic: an extraction+integration procedure exists that improves sample efficiency over scalar-only RL by `k×` (target k ≥ 5) on the proposed benchmark.
+3. Empirical: validation on multi-turn agent tasks with explicit user-corrective channels.
+**Why it's publishable.**
+- Connects to GEPA paper's central claim, makes it falsifiable for the corrective sub-class.
+- Maps to product task #103 (`extractUserCorrections`).
+- Distinctive — no observability or RL competitor formalizes corrective feedback as a gradient.
+**Estimated effort.** ~4 months. Information-theoretic framing is delicate.
+**Venue.** ICLR or main-track NeurIPS. Possibly EMNLP for the NLP angle.
+---
+### Thesis 3 — Sample-efficient self-improvement under a paired-bootstrap gate
+**Setup.** GEPA paper claims ~35× fewer rollouts than GRPO. They use a binary improvement check. Our substrate uses paired-bootstrap CI + Cohen's d + MDE (strictly stricter gate). The trade-off between gate-strictness and rollout efficiency is unmodeled.
+**Claim to prove.** Given a paired-bootstrap gate with significance level α and minimum detectable effect δ, `selfImprove` requires `O((σ/δ)² · (log(1/α) + log(1/β)))` rollouts to detect a true improvement of at least δ with power `1−β`. Tight constants. Compare empirically to GRPO and to GEPA's simple-improvement gate on identical benchmarks.
+**Why it's publishable.**
+- Closes a gap GEPA left open (their efficiency claim has no power analysis).
+- Maps to product task #101 (real GEPA Pareto + sample-size theory).
+- Provides a tool — power calculator for the field's self-improvement runs.
+**Estimated effort.** ~2 months — the cleanest of the three. Mostly classical sample-size theory + careful experiments.
+**Venue.** ICML or AISTATS. The statistical framing fits both.
+## The fourth thesis (long-horizon, highest prestige)
+### Thesis 4 — A standardized benchmark for self-improvement
+**Setup.** No standard benchmark exists for "did self-improvement help, robustly, across distribution shift?" GAIA + SWE-Bench + AgentBench measure agent capability; nothing measures self-improvement quality. The field is publishing self-improvement results on disparate ad-hoc setups; nobody compares.
+**Claim to ship.** A benchmark with:
+- 100+ scenarios spanning distinct distribution shifts (intra-domain, cross-domain, adversarial corruption)
+- Held-out test split with strict contamination guards
+- Reference baselines (no-driver / random / scalar-only-RL / GEPA / our substrate)
+- Standard scorecard: lift CI, sample efficiency, distribution-shift robustness, cost
+- Public leaderboard
+**Why it matters.** Whoever owns the benchmark owns the measuring stick. ImageNet for vision, GLUE for NLP, GAIA for agent capability — the gap for *self-improvement quality* is open.
+**Estimated effort.** 6 months. Real scenario authoring, contamination engineering, community outreach for leaderboard adoption.
+**Venue.** Datasets + Benchmarks track at NeurIPS. Or workshop debut → main-track followup.
+## 12 open research questions, ranked by signal-to-noise
+Each is a falsifiable claim or unanswered formal question. Each maps to publishable work.
+1. **Information content of corrective feedback.** What's the empirical mutual information `I(correction; preferred_policy)` across realistic agent deployments? Is it consistently `> H(scalar_reward)`?
+2. **Convergence of branch-benchmark consensus.** Under what assumptions on judge calibration does the symmetric-fork merge protocol converge to a global optimum vs a local one?
+3. **The cost of statistical strictness.** How much does a paired-bootstrap gate cost in rollouts vs a literal `>` gate (SkillOpt's choice), as a function of true effect size? Where's the crossover where strictness costs more than it saves?
+4. **Cross-surface attribution.** When `compositeDriver` ships a winner where N surfaces changed, which surface's change drove the lift? Shapley estimators on agent-profile surfaces — tractable? Required sample size?
+5. **Sample-efficient evaluation under distribution shift.** Given a held-out test slice and a known shift class (intra-domain / cross-domain / adversarial), how few held-out scenarios are needed to detect lift with target power? Is it a function of shift magnitude?
+6. **Diminishing returns of recursive self-improvement.** A substrate that optimizes its own SKILL.md against held-out tasks — does it converge or drift? At what point do recursive self-edits become net-negative on a true holdout? Map the loss landscape.
+7. **Skill semantic-duplicate detection.** Substrate's `summarize-pr` vs harness's `pr-summarizer`. What's the right embedding + threshold? Is human review for borderline cases unavoidable or can it be automated?
+8. **Reward-hacking under self-improvement.** When the optimizer can mutate the judge prompt (the recursive surface), what's the formal condition under which it learns to game the judge instead of solving the task? Connect to Goodhart + AIRP.
+9. **Cost-quality Pareto across drivers.** What's the empirical Pareto frontier when you trade off `gepaDriver` (high $/gen) vs `evolutionaryDriver` ($0/gen) vs heuristic mutations? Is it task-dependent or universal?
+10. **Online-offline merge regret.** When harness branch and substrate branch are merged, what's the regret of the merged policy vs the better-of-two? Bounded? Worst-case adversarial?
+11. **Universal trace ingest tax.** Cross-framework adapter coverage (LangChain / LlamaIndex / Anthropic / OpenAI) — how much signal loss is forced by the lowest-common-denominator RunRecord shape? Quantify in terms of recoverable lift CI.
+12. **Foundation-model-as-judge calibration drift.** When the judge LLM updates (Claude → Claude+1), what's the variance in judge scores on a fixed corpus? Is held-out gate validity preserved across judge versions? Empirical study, longitudinal.
+## The processes (how we actually do this)
+**Cadence.**
+- Daily: product work continues (Track A). Research is a separate 30%-time block.
+- Weekly: research log + open-questions revision. One paper-quality paragraph per week.
+- Monthly: experiment milestone — either proof attempt or empirical-run results.
+- Quarterly: paper-draft milestone.
+**Artifacts.**
+- `docs/research/<thesis>/notes.md` — running research log, hypothesis, current status.
+- `docs/research/<thesis>/experiments.md` — every run + numbers + analysis.
+- `docs/research/<thesis>/paper-draft.md` — building toward arXiv submission.
+- `.evolve/research/<thesis>/` — code + data + figures, version-controlled.
+**Quality bar.**
+- Every claim falsifiable. Every number has CI, p, and sample size.
+- Every experiment reproducible — script + seed + commit hash + data hash.
+- Every figure has an underlying CSV the reviewer can download.
+- Every theorem has a proof in the doc, not just a citation.
+**Review cadence.**
+- Internal critique pass before any external sharing — find every weak spot.
+- External review at 80%-draft: one peer in the field, one peer outside.
+- ArXiv submission as the gating event for public claim.
+## What we explicitly will NOT do
+- **Will not pretend product-grade engineering is research.** Architecture docs are not papers. Strategic framing is not contribution.
+- **Will not chase trendy directions (RLHF variants, constitutional AI, scaling laws) where we have no edge.** Our edges are specific: two-writer state, corrective feedback as gradient, statistical strictness. Stay in lane.
+- **Will not publish empirical results without proper baselines.** "Our substrate produces N% lift on dataset X" is meaningless without no-driver/random/GEPA/SkillOpt baselines on identical infrastructure.
+- **Will not optimize for citation count over insight.** One paper that changes how the field thinks > five papers that move a benchmark by 2 points.
+## Where we are right now
+**Track A (product) status as of 2026-05-27:**
+- agent-eval shipped 0.47 → 0.53 in one session
+- Six consumers on substrate 0.50+
+- Honest spec docs landed
+- Product roadmap 0.53 → 1.1 mapped
+**Track B (research) status as of 2026-05-27:**
+- This doc exists
+- Zero experiments run on published benchmarks
+- Zero papers drafted
+- Three theses identified, none formalized
+- Twelve open questions enumerated, none answered
+We are at Track-B day 0. Honesty matters.
+## Deliverables — 12-month plan
+**Q3 2026 — proof of life.**
+- Run our drivers against AgentBench / SWE-Bench Verified / GAIA. Report numbers with CI.
+- Pick one named partner customer who'd validate Thesis 1 with us on their real deployment.
+**Q4 2026 — Thesis 3 paper draft.**
+- Sample-efficient self-improvement is the cleanest claim — fastest to publish, sharpens our gate's edge.
+- Target: arXiv pre-print + AISTATS submission.
+**Q1 2027 — Thesis 1 paper draft.**
+- Branch-benchmark consensus — the deepest claim, the one that needs forcing-function data from a Hermes-on-sandbox deployment.
+- Target: NeurIPS / ICLR submission.
+**Q2 2027 — Thesis 4 benchmark public release.**
+- The benchmark + leaderboard is the highest-prestige play.
+- Target: Datasets + Benchmarks track at NeurIPS 2027.
+**Q3 2027 — Thesis 2 paper draft.**
+- Corrective feedback as gradient — slowest to ripen, hardest to formalize.
+- Target: ICLR submission.
+## How a lab lead would react to this doc
+If you printed this and slid it across Dario's desk:
+**The good.** Specific named theses with falsifiable claims. Honest about gap from product to research. Three publishable directions in clear scope. Twelve open questions readable as a research-program statement.
+**The hostile-reviewer attack.** "Show me one number on one published benchmark from your existing infrastructure. You have a substrate with a paired-bootstrap gate that's never been compared to anything." That is correct. The Q3 2026 deliverable is the answer.
+**The deepest question they'd ask.** "Why does this matter for AGI / safety / capability? Why work on this instead of pretraining / alignment / interpretability?" Honest answer: agents that self-improve in production are a near-term reality. The work to make that *safe* and *measurable* is path-dependent on whether the field formalizes it or accepts ad-hoc product implementations. Our pitch is "be the lab that formalized it before it became a 1000-org engineering mess." That's a defensible answer if backed by the published work.
+## The one-sentence inspirational version per audience
+- **For Dario:** "We're building the statistical foundation that turns 'agents that self-improve' from a marketing slogan into a measurable claim with calibrated error bars."
+- **For Yann:** "Self-improvement is offline-RL with two writers — and nobody has formalized the consensus regime. We will."
+- **For Ilya:** "What's the simplest formalism under which self-improving agents converge to a global optimum vs a local one? Branch-benchmark consensus is our hypothesis."
+- **For Sam:** "We are going to ship the substrate that lets every customer's agent self-improve safely, then publish the science that proves it works. The product builds the data; the data writes the papers; the papers create the moat."
+## The harshest honest sentence
+If we don't run a published benchmark by Q3 2026, this entire doc is fan-fiction. Build the empirical infrastructure first, formalize after, publish last.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.53.0",
+  "version": "0.55.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {

package/dist/chunk-5KSDYBYH.js.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"sources":["../src/eval-campaign.ts"],"sourcesContent":["/**\n * EvalCampaign — opinionated matrix runner that wires the four\n * capture-integrity directives by construction.\n *\n * The canonical benchmark shape — matrix runner → for each\n * (variant, scenario, seed) → start a TraceEmitter → call LLMs → end the\n * run → analyze — has a bug class at the integration boundary: raw\n * events not captured, route silently wrong, integrity not asserted,\n * analyst never run. The directives in `SKILL.md § Capture integrity`\n * are the mitigations.\n *\n * `EvalCampaign` is the structural fix — consumers don't wire the\n * integrity surface themselves; the campaign owns it. Specifically:\n *\n * - calls `assertLlmRoute` once at preflight before any work runs\n * - constructs a per-run `TraceStore` and `RawProviderSink` via factories\n * - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`\n * - hands the runner an `LlmClientOptions` pre-wired with the sink and\n * trace context — the runner can't accidentally call an LLM without\n * capturing the raw HTTP envelope\n * - calls `assertRunCaptured` after every `endRun` and routes failures\n * through a configurable policy (`throw` / `mark_failed` / `log`)\n * - assembles per-run `RunRecord`s and runs `researchReport` at the end\n * so the campaign artifact is launch-decision-grade by default\n * - embeds the campaign fingerprint (a SHA-256 over the canonicalised\n * run set) and optional `preregistrationHash` in the report\n *\n * The runner contract is intentionally narrow: produce a `CampaignRunOutcome`\n * given a fully-wired `CampaignRunContext`. Everything orchestration-shaped\n * lives in the campaign. 
This is the inversion-of-control point — consumers\n * stop writing matrix runners and start writing scenario-runners.\n *\n * Out of scope for v1 (tracked in `docs/research-report-methodology.md`):\n *\n * - Distributed/cluster execution (concurrency is local async)\n * - Adaptive sampling / sequential interim looks\n * - Resume from partial state across crashes\n * - LLM-call retry beyond what `LlmClient` already does\n */\n\nimport {\n type AgentProfileCell,\n type AgentProfileCellInput,\n buildAgentProfileCell,\n verifyAgentProfileCell,\n} from './agent-profile-cell'\nimport { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client'\nimport { canonicalize, hashJson } from './pre-registration'\nimport type {\n JudgeScoresRecord,\n RunJudgeMetadata,\n RunOutcome,\n RunRecord,\n RunSplitTag,\n RunTokenUsage,\n} from './run-record'\nimport { validateRunRecord } from './run-record'\nimport { type ResearchReport, type ResearchReportOptions, researchReport } from './summary-report'\nimport type { RunCompleteHook } from './trace/emitter'\nimport { TraceEmitter } from './trace/emitter'\nimport {\n assertRunCaptured,\n RunIntegrityError,\n type RunIntegrityExpectations,\n type RunIntegrityReport,\n} from './trace/integrity'\nimport { FileSystemRawProviderSink, type RawProviderSink } from './trace/raw-provider-sink'\nimport type { TraceStore } from './trace/store'\n\n// ── Public types ─────────────────────────────────────────────────────────\n\nexport interface CampaignVariant<V> {\n id: string\n payload: V\n}\n\nexport interface CampaignScenario {\n scenarioId: string\n /** Free-form metadata propagated to runs and reports. */\n tags?: Record<string, string>\n}\n\nexport interface CampaignRunContext<V> {\n /** Stable run id. The campaign generates this; the runner does not. */\n runId: string\n /** Logical experiment id (campaignId by default; overridable per-run via opts). 
*/\n experimentId: string\n variant: V\n variantId: string\n scenarioId: string\n scenarioTags: Record<string, string>\n seed: number\n splitTag: RunSplitTag\n /**\n * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired\n * (analyst auto-execution if configured, plus integrity check). The\n * runner MUST call `emitter.startRun` before doing any work and either\n * `emitter.endRun` or `emitter.abortRun` before returning.\n */\n emitter: TraceEmitter\n store: TraceStore\n rawSink: RawProviderSink\n /**\n * Pre-wired LLM client options — `rawSink` and `traceContext` are populated\n * so any `callLlm(req, ctx.llmOpts)` automatically captures raw HTTP. The\n * runner can spread additional fields if needed.\n */\n llmOpts: LlmClientOptions\n}\n\nexport interface CampaignRunOutcome {\n /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */\n pass: boolean\n /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */\n score: number\n /** Mandatory cost in USD. Use 0 + raw.cost_unknown=1 only if truly unknown. */\n costUsd: number\n tokenUsage: RunTokenUsage\n /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */\n model: string\n /** sha256 of the effective prompt sent to the model. */\n promptHash: string\n /** sha256 of the effective config (model, temperature, tools, judges, splits). */\n configHash: string\n /** Optional extra numeric metrics to land in `outcome.raw`. */\n raw?: Record<string, number>\n /** Optional failure-taxonomy tag if the run failed. */\n failureMode?: string\n /** Optional judge metadata when a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /**\n * Optional per-judge / per-dim breakdown for ensemble-judged runs.\n * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.\n * Single-judge or scalar-only runs leave this unset.\n */\n judgeScores?: JudgeScoresRecord\n /**\n * Agent profile cell observed by the runner. 
When supplied, it overrides\n * `EvalCampaignOptions.agentProfile` for this run and must match the\n * outcome's `model` and `promptHash`.\n */\n agentProfile?: AgentProfileCell | AgentProfileCellInput\n}\n\nexport type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>\n\nexport type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log'\n\nexport interface EvalCampaignOptions<V> {\n /**\n * Stable id for the campaign. Used as the default `experimentId` on\n * every run, and folded into the campaign fingerprint.\n */\n campaignId: string\n variants: CampaignVariant<V>[]\n scenarios: CampaignScenario[]\n /** Default `[0, 1, 2]`. */\n seeds?: number[]\n /** Default `'holdout'` — the split that anchors a launch decision. */\n splitTag?: RunSplitTag\n /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects unset. */\n commitSha: string\n /**\n * LLM client config. Augmented per-run with `rawSink` and `traceContext`\n * before being passed to the runner. The campaign asserts this config\n * matches `routeRequirements` once at preflight.\n */\n llmOpts: LlmClientOptions\n /**\n * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail\n * loud if the campaign would silently fall back to the public router or\n * run unauthenticated. Override with an empty object to disable.\n */\n routeRequirements?: LlmRouteRequirements\n /**\n * Per-run TraceStore factory. Common shape: a fresh store per run keyed\n * on `runId`. Implementations that share a store across the campaign\n * are valid — the campaign only writes through `emitter`.\n */\n storeFactory: (params: CampaignFactoryParams) => TraceStore\n /**\n * Per-run RawProviderSink factory. Defaults to `FileSystemRawProviderSink`\n * rooted at `${workDir}/raw-events/${runId}` if `workDir` is supplied;\n * otherwise required. 
Forensic capture is non-negotiable in a campaign\n * run — pass `NoopRawProviderSink` explicitly if you want to opt out.\n */\n rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink\n /**\n * Filesystem root for default `rawSinkFactory`. Ignored if\n * `rawSinkFactory` is supplied.\n */\n workDir?: string\n /**\n * Extra `onRunComplete` hooks the campaign appends (after its own\n * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.\n */\n onRunComplete?: RunCompleteHook[]\n /**\n * Per-run integrity expectations. Defaults to:\n * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.\n * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.\n */\n integrity?: RunIntegrityExpectations\n /** Behaviour when integrity fails. Default `'mark_failed'`. */\n onIntegrityFailure?: CampaignIntegrityPolicy\n /**\n * Per-run runner. Receives a fully-wired context; produces an outcome\n * the campaign converts into a `RunRecord`.\n */\n runner: CampaignRunner<V>\n /**\n * If set, the campaign computes `researchReport` at the end. `comparator`\n * is a `variantId`. Other fields are forwarded verbatim.\n */\n report?: { comparator?: string } & Omit<\n ResearchReportOptions,\n 'comparator' | 'preregistrationHash' | 'generatedAt'\n >\n /**\n * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).\n * Embedded in the campaign fingerprint and the research report.\n */\n preregistrationHash?: string\n /** Local concurrency. Default `1` (sequential). */\n concurrency?: number\n /**\n * Override the time source. Tests pass a mock to make wallMs deterministic.\n */\n now?: () => number\n /** Override the runId generator. Tests pin this. */\n runId?: (params: CampaignFactoryParams) => string\n /**\n * Agent profile cell for campaign runs. Static profiles can pass an object;\n * routers or variant-specific harnesses can pass a factory. 
The campaign\n * stamps the built cell onto every `RunRecord` and rejects profile/model or\n * profile/prompt contradictions.\n */\n agentProfile?:\n | AgentProfileCell\n | AgentProfileCellInput\n | ((\n params: CampaignFactoryParams & {\n variant: V\n scenarioTags: Record<string, string>\n },\n ) =>\n | AgentProfileCell\n | AgentProfileCellInput\n | Promise<AgentProfileCell | AgentProfileCellInput>)\n}\n\nexport interface CampaignFactoryParams {\n campaignId: string\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n}\n\nexport interface FailedRun {\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n reason: string\n error?: string\n}\n\nexport interface EvalCampaignResult {\n campaignId: string\n /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */\n campaignFingerprint: string\n preregistrationHash: string | null\n /** Successful runs only. Failed runs land in `failedRuns`. */\n runs: RunRecord[]\n /** Integrity reports for every successful run. */\n integrityReports: RunIntegrityReport[]\n failedRuns: FailedRun[]\n /** Computed when `report` is set on options. */\n report?: ResearchReport\n startedAt: string\n endedAt: string\n}\n\n// ── Implementation ───────────────────────────────────────────────────────\n\nconst DEFAULT_INTEGRITY: RunIntegrityExpectations = {\n llmSpansMin: 1,\n requireRawCoverageOfLlmSpans: true,\n requireOutcome: true,\n}\n\nconst DEFAULT_ROUTE: LlmRouteRequirements = {\n requireExplicitBaseUrl: true,\n requireAuth: true,\n}\n\nexport async function runEvalCampaign<V>(\n opts: EvalCampaignOptions<V>,\n): Promise<EvalCampaignResult> {\n // ── Preflight ──────────────────────────────────────────────────────\n assertLlmRoute(opts.llmOpts, opts.routeRequirements ?? 
DEFAULT_ROUTE)\n\n if (opts.variants.length === 0) {\n throw new Error('runEvalCampaign: variants must be non-empty.')\n }\n if (opts.scenarios.length === 0) {\n throw new Error('runEvalCampaign: scenarios must be non-empty.')\n }\n const variantIds = new Set<string>()\n for (const v of opts.variants) {\n if (variantIds.has(v.id)) {\n throw new Error(`runEvalCampaign: duplicate variant id \"${v.id}\".`)\n }\n variantIds.add(v.id)\n }\n const scenarioIds = new Set<string>()\n for (const s of opts.scenarios) {\n if (scenarioIds.has(s.scenarioId)) {\n throw new Error(`runEvalCampaign: duplicate scenarioId \"${s.scenarioId}\".`)\n }\n scenarioIds.add(s.scenarioId)\n }\n if (opts.report?.comparator && !variantIds.has(opts.report.comparator)) {\n throw new Error(\n `runEvalCampaign: report.comparator \"${opts.report.comparator}\" is not a configured variantId.`,\n )\n }\n if (!opts.commitSha) {\n throw new Error('runEvalCampaign: commitSha is required (every RunRecord needs it).')\n }\n\n const seeds = opts.seeds ?? [0, 1, 2]\n const splitTag: RunSplitTag = opts.splitTag ?? 'holdout'\n const concurrency = Math.max(1, opts.concurrency ?? 1)\n const integrity = { ...DEFAULT_INTEGRITY, ...(opts.integrity ?? {}) }\n const onIntegrityFailure: CampaignIntegrityPolicy = opts.onIntegrityFailure ?? 'mark_failed'\n const now = opts.now ?? (() => Date.now())\n const baseUrl = (opts.llmOpts.baseUrl ?? '').replace(/\\/+$/, '')\n const provider = opts.llmOpts.provider ?? null\n const preregistrationHash = opts.preregistrationHash ?? null\n\n const rawSinkFactory = opts.rawSinkFactory ?? 
defaultRawSinkFactory(opts.workDir)\n\n // ── Fingerprint ────────────────────────────────────────────────────\n const campaignFingerprint = await hashJson(\n canonicalize({\n campaignId: opts.campaignId,\n variants: opts.variants.map((v) => v.id).sort(),\n scenarios: opts.scenarios.map((s) => s.scenarioId).sort(),\n seeds: [...seeds].sort((a, b) => a - b),\n splitTag,\n comparator: opts.report?.comparator ?? null,\n baseUrl,\n provider,\n preregistrationHash,\n }),\n )\n\n // ── Plan the matrix ────────────────────────────────────────────────\n type Cell = { variant: CampaignVariant<V>; scenario: CampaignScenario; seed: number }\n const cells: Cell[] = []\n for (const variant of opts.variants) {\n for (const scenario of opts.scenarios) {\n for (const seed of seeds) {\n cells.push({ variant, scenario, seed })\n }\n }\n }\n\n const startedAt = new Date(now()).toISOString()\n const runs: RunRecord[] = []\n const integrityReports: RunIntegrityReport[] = []\n const failedRuns: FailedRun[] = []\n\n // ── Execute (bounded-concurrency worker pool) ──────────────────────\n let cursor = 0\n async function worker(): Promise<void> {\n while (true) {\n const i = cursor++\n if (i >= cells.length) return\n const cell = cells[i]!\n try {\n const result = await runOneCell(cell)\n runs.push(result.record)\n integrityReports.push(result.integrity)\n } catch (err) {\n if (err instanceof CellExecutionError) {\n failedRuns.push(err.failed)\n if (err.integrity) integrityReports.push(err.integrity)\n } else {\n // Genuine bug — not a runner failure, not an integrity failure.\n // Surface it; don't silently mask.\n throw err\n }\n }\n }\n }\n\n async function runOneCell(\n cell: Cell,\n ): Promise<{ record: RunRecord; integrity: RunIntegrityReport }> {\n const runId = (opts.runId ?? 
defaultRunId)({\n campaignId: opts.campaignId,\n runId: '', // unused by default generator\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n })\n const factoryParams: CampaignFactoryParams = {\n campaignId: opts.campaignId,\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n }\n const store = opts.storeFactory(factoryParams)\n const rawSink = rawSinkFactory(factoryParams)\n\n const emitter = new TraceEmitter(store, {\n runId,\n now: opts.now,\n onRunComplete: opts.onRunComplete,\n })\n\n const llmOpts: LlmClientOptions = {\n ...opts.llmOpts,\n rawSink,\n traceContext: { runId },\n }\n\n const ctx: CampaignRunContext<V> = {\n runId,\n experimentId: opts.campaignId,\n variant: cell.variant.payload,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n scenarioTags: cell.scenario.tags ?? {},\n seed: cell.seed,\n splitTag,\n emitter,\n store,\n rawSink,\n llmOpts,\n }\n\n const wallStart = now()\n let outcome: CampaignRunOutcome\n try {\n outcome = await opts.runner(ctx)\n } catch (err) {\n const message = err instanceof Error ? 
err.message : String(err)\n // The runner threw mid-execution; give it a chance to have aborted.\n try {\n await emitter.abortRun(message)\n } catch {\n // Already aborted/ended; ignore.\n }\n throw new CellExecutionError({\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'runner_threw',\n error: message,\n })\n }\n const wallMs = now() - wallStart\n\n const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink })\n if (!integrityReport.ok) {\n switch (onIntegrityFailure) {\n case 'throw':\n throw new RunIntegrityError(integrityReport)\n case 'mark_failed':\n throw new CellExecutionError(\n {\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'integrity_failed',\n error: integrityReport.issues.map((i) => i.code).join(', '),\n },\n integrityReport,\n )\n case 'log':\n // Caller wants the run admitted with a flagged report; fall through.\n break\n }\n }\n\n const recordOutcome: RunOutcome = {\n raw: outcome.raw ?? {},\n }\n if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score\n else recordOutcome.searchScore = outcome.score\n if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores\n\n const record: RunRecord = {\n runId,\n experimentId: opts.campaignId,\n candidateId: cell.variant.id,\n seed: cell.seed,\n model: outcome.model,\n promptHash: outcome.promptHash,\n configHash: outcome.configHash,\n commitSha: opts.commitSha,\n wallMs,\n costUsd: outcome.costUsd,\n tokenUsage: outcome.tokenUsage,\n judgeMetadata: outcome.judgeMetadata,\n outcome: recordOutcome,\n failureMode: outcome.failureMode,\n splitTag,\n scenarioId: cell.scenario.scenarioId,\n }\n const profileSource =\n outcome.agentProfile ??\n (typeof opts.agentProfile === 'function'\n ? 
await opts.agentProfile({\n campaignId: opts.campaignId,\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n variant: cell.variant.payload,\n scenarioTags: cell.scenario.tags ?? {},\n })\n : opts.agentProfile)\n if (profileSource !== undefined) {\n const agentProfile = await resolveAgentProfileCell(profileSource)\n assertAgentProfileMatchesRun(agentProfile, outcome.model, outcome.promptHash)\n record.agentProfile = agentProfile\n }\n return { record: validateRunRecord(record), integrity: integrityReport }\n }\n\n const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker())\n await Promise.all(workers)\n\n // ── Optional research report ───────────────────────────────────────\n let report: ResearchReport | undefined\n if (opts.report) {\n const reportOpts: ResearchReportOptions = {\n ...opts.report,\n comparator: opts.report.comparator,\n split: splitTag === 'dev' ? 'search' : splitTag,\n generatedAt: new Date(now()).toISOString(),\n preregistrationHash: preregistrationHash ?? 
undefined,\n }\n report = await researchReport(runs, reportOpts)\n }\n\n const endedAt = new Date(now()).toISOString()\n\n return {\n campaignId: opts.campaignId,\n campaignFingerprint,\n preregistrationHash,\n runs,\n integrityReports,\n failedRuns,\n report,\n startedAt,\n endedAt,\n }\n}\n\n// ── Internal ─────────────────────────────────────────────────────────────\n\nclass CellExecutionError extends Error {\n readonly failed: FailedRun\n readonly integrity?: RunIntegrityReport\n constructor(failed: FailedRun, integrity?: RunIntegrityReport) {\n super(`cell ${failed.variantId}/${failed.scenarioId}@${failed.seed} failed: ${failed.reason}`)\n this.failed = failed\n this.integrity = integrity\n }\n}\n\nfunction defaultRawSinkFactory(workDir: string | undefined) {\n return (params: CampaignFactoryParams): RawProviderSink => {\n if (!workDir) {\n throw new Error(\n 'runEvalCampaign: rawSinkFactory not supplied and workDir not set. Pass either to enable raw provider capture, or pass `new NoopRawProviderSink()` via rawSinkFactory to opt out explicitly.',\n )\n }\n return new FileSystemRawProviderSink({\n dir: `${workDir}/raw-events/${params.runId}`,\n })\n }\n}\n\nasync function resolveAgentProfileCell(\n input: AgentProfileCell | AgentProfileCellInput,\n): Promise<AgentProfileCell> {\n if (isAgentProfileCell(input)) {\n if (!(await verifyAgentProfileCell(input))) {\n throw new Error(`runEvalCampaign: agentProfile.cellId does not match its content`)\n }\n return input\n }\n return buildAgentProfileCell(input)\n}\n\nfunction isAgentProfileCell(\n input: AgentProfileCell | AgentProfileCellInput,\n): input is AgentProfileCell {\n return 'schemaVersion' in input && 'cellId' in input\n}\n\nfunction assertAgentProfileMatchesRun(\n profile: AgentProfileCell,\n model: string,\n promptHash: string,\n): void {\n if (profile.model !== undefined && profile.model !== model) {\n throw new Error(\n `runEvalCampaign: agentProfile.model \"${profile.model}\" does not match 
outcome.model \"${model}\"`,\n )\n }\n if (profile.promptHash !== undefined && profile.promptHash !== promptHash) {\n throw new Error(\n `runEvalCampaign: agentProfile.promptHash \"${profile.promptHash}\" does not match outcome.promptHash \"${promptHash}\"`,\n )\n }\n}\n\nfunction defaultRunId(params: CampaignFactoryParams): string {\n // Stable across re-runs: fingerprint of (campaignId, variantId, scenarioId, seed).\n // Caller can override via opts.runId for non-deterministic IDs.\n const base = `${params.campaignId}::${params.variantId}::${params.scenarioId}::${params.seed}`\n // Lightweight hex: we don't need crypto-grade here, just stability + uniqueness.\n let h1 = 0x811c9dc5\n let h2 = 0x12345678\n for (let i = 0; i < base.length; i++) {\n const c = base.charCodeAt(i)\n h1 = Math.imul(h1 ^ c, 0x01000193) >>> 0\n h2 = Math.imul(h2 ^ c, 0x9e3779b1) >>> 0\n }\n return `run-${h1.toString(16).padStart(8, '0')}${h2.toString(16).padStart(8, '0')}`\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;AA8RA,IAAM,oBAA8C;AAAA,EAClD,aAAa;AAAA,EACb,8BAA8B;AAAA,EAC9B,gBAAgB;AAClB;AAEA,IAAM,gBAAsC;AAAA,EAC1C,wBAAwB;AAAA,EACxB,aAAa;AACf;AAEA,eAAsB,gBACpB,MAC6B;AAE7B,iBAAe,KAAK,SAAS,KAAK,qBAAqB,aAAa;AAEpE,MAAI,KAAK,SAAS,WAAW,GAAG;AAC9B,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAChE;AACA,MAAI,KAAK,UAAU,WAAW,GAAG;AAC/B,UAAM,IAAI,MAAM,+CAA+C;AAAA,EACjE;AACA,QAAM,aAAa,oBAAI,IAAY;AACnC,aAAW,KAAK,KAAK,UAAU;AAC7B,QAAI,WAAW,IAAI,EAAE,EAAE,GAAG;AACxB,YAAM,IAAI,MAAM,0CAA0C,EAAE,EAAE,IAAI;AAAA,IACpE;AACA,eAAW,IAAI,EAAE,EAAE;AAAA,EACrB;AACA,QAAM,cAAc,oBAAI,IAAY;AACpC,aAAW,KAAK,KAAK,WAAW;AAC9B,QAAI,YAAY,IAAI,EAAE,UAAU,GAAG;AACjC,YAAM,IAAI,MAAM,0CAA0C,EAAE,UAAU,IAAI;AAAA,IAC5E;AACA,gBAAY,IAAI,EAAE,UAAU;AAAA,EAC9B;AACA,MAAI,KAAK,QAAQ,cAAc,CAAC,WAAW,IAAI,KAAK,OAAO,UAAU,GAAG;AACtE,UAAM,IAAI;AAAA,MACR,uCAAuC,KAAK,OAAO,UAAU;AAAA,IAC/D;AAAA,EACF;AACA,MAAI,CAAC,KAAK,WAAW;AACnB,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AAEA,QAAM,QAAQ,KAAK,SAAS,CAAC,GAAG,GAAG,CAAC;AACpC,QAAM,WAAwB,KAAK,YAAY;AAC/C,QAAM,cAAc,KAAK,IAAI,GAAG,KAAK,
eAAe,CAAC;AACrD,QAAM,YAAY,EAAE,GAAG,mBAAmB,GAAI,KAAK,aAAa,CAAC,EAAG;AACpE,QAAM,qBAA8C,KAAK,sBAAsB;AAC/E,QAAM,MAAM,KAAK,QAAQ,MAAM,KAAK,IAAI;AACxC,QAAM,WAAW,KAAK,QAAQ,WAAW,IAAI,QAAQ,QAAQ,EAAE;AAC/D,QAAM,WAAW,KAAK,QAAQ,YAAY;AAC1C,QAAM,sBAAsB,KAAK,uBAAuB;AAExD,QAAM,iBAAiB,KAAK,kBAAkB,sBAAsB,KAAK,OAAO;AAGhF,QAAM,sBAAsB,MAAM;AAAA,IAChC,aAAa;AAAA,MACX,YAAY,KAAK;AAAA,MACjB,UAAU,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,KAAK;AAAA,MAC9C,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,KAAK;AAAA,MACxD,OAAO,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAAA,MACtC;AAAA,MACA,YAAY,KAAK,QAAQ,cAAc;AAAA,MACvC;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAIA,QAAM,QAAgB,CAAC;AACvB,aAAW,WAAW,KAAK,UAAU;AACnC,eAAW,YAAY,KAAK,WAAW;AACrC,iBAAW,QAAQ,OAAO;AACxB,cAAM,KAAK,EAAE,SAAS,UAAU,KAAK,CAAC;AAAA,MACxC;AAAA,IACF;AAAA,EACF;AAEA,QAAM,YAAY,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAC9C,QAAM,OAAoB,CAAC;AAC3B,QAAM,mBAAyC,CAAC;AAChD,QAAM,aAA0B,CAAC;AAGjC,MAAI,SAAS;AACb,iBAAe,SAAwB;AACrC,WAAO,MAAM;AACX,YAAM,IAAI;AACV,UAAI,KAAK,MAAM,OAAQ;AACvB,YAAM,OAAO,MAAM,CAAC;AACpB,UAAI;AACF,cAAM,SAAS,MAAM,WAAW,IAAI;AACpC,aAAK,KAAK,OAAO,MAAM;AACvB,yBAAiB,KAAK,OAAO,SAAS;AAAA,MACxC,SAAS,KAAK;AACZ,YAAI,eAAe,oBAAoB;AACrC,qBAAW,KAAK,IAAI,MAAM;AAC1B,cAAI,IAAI,UAAW,kBAAiB,KAAK,IAAI,SAAS;AAAA,QACxD,OAAO;AAGL,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,iBAAe,WACb,MAC+D;AAC/D,UAAM,SAAS,KAAK,SAAS,cAAc;AAAA,MACzC,YAAY,KAAK;AAAA,MACjB,OAAO;AAAA;AAAA,MACP,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb,CAAC;AACD,UAAM,gBAAuC;AAAA,MAC3C,YAAY,KAAK;AAAA,MACjB;AAAA,MACA,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb;AACA,UAAM,QAAQ,KAAK,aAAa,aAAa;AAC7C,UAAM,UAAU,eAAe,aAAa;AAE5C,UAAM,UAAU,IAAI,aAAa,OAAO;AAAA,MACtC;AAAA,MACA,KAAK,KAAK;AAAA,MACV,eAAe,KAAK;AAAA,IACtB,CAAC;AAED,UAAM,UAA4B;AAAA,MAChC,GAAG,KAAK;AAAA,MACR;AAAA,MACA,cAAc,EAAE,MAAM;AAAA,IACxB;AAEA,UAAM,MAA6B;AAAA,MACjC;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,SAAS,KAAK,QAAQ;AAAA,MACtB,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,cAA
c,KAAK,SAAS,QAAQ,CAAC;AAAA,MACrC,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAEA,UAAM,YAAY,IAAI;AACtB,QAAI;AACJ,QAAI;AACF,gBAAU,MAAM,KAAK,OAAO,GAAG;AAAA,IACjC,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE/D,UAAI;AACF,cAAM,QAAQ,SAAS,OAAO;AAAA,MAChC,QAAQ;AAAA,MAER;AACA,YAAM,IAAI,mBAAmB;AAAA,QAC3B;AAAA,QACA,WAAW,KAAK,QAAQ;AAAA,QACxB,YAAY,KAAK,SAAS;AAAA,QAC1B,MAAM,KAAK;AAAA,QACX,QAAQ;AAAA,QACR,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AACA,UAAM,SAAS,IAAI,IAAI;AAEvB,UAAM,kBAAkB,MAAM,kBAAkB,OAAO,OAAO,EAAE,GAAG,WAAW,QAAQ,CAAC;AACvF,QAAI,CAAC,gBAAgB,IAAI;AACvB,cAAQ,oBAAoB;AAAA,QAC1B,KAAK;AACH,gBAAM,IAAI,kBAAkB,eAAe;AAAA,QAC7C,KAAK;AACH,gBAAM,IAAI;AAAA,YACR;AAAA,cACE;AAAA,cACA,WAAW,KAAK,QAAQ;AAAA,cACxB,YAAY,KAAK,SAAS;AAAA,cAC1B,MAAM,KAAK;AAAA,cACX,QAAQ;AAAA,cACR,OAAO,gBAAgB,OAAO,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,IAAI;AAAA,YAC5D;AAAA,YACA;AAAA,UACF;AAAA,QACF,KAAK;AAEH;AAAA,MACJ;AAAA,IACF;AAEA,UAAM,gBAA4B;AAAA,MAChC,KAAK,QAAQ,OAAO,CAAC;AAAA,IACvB;AACA,QAAI,aAAa,UAAW,eAAc,eAAe,QAAQ;AAAA,QAC5D,eAAc,cAAc,QAAQ;AACzC,QAAI,QAAQ,gBAAgB,OAAW,eAAc,cAAc,QAAQ;AAE3E,UAAM,SAAoB;AAAA,MACxB;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,aAAa,KAAK,QAAQ;AAAA,MAC1B,MAAM,KAAK;AAAA,MACX,OAAO,QAAQ;AAAA,MACf,YAAY,QAAQ;AAAA,MACpB,YAAY,QAAQ;AAAA,MACpB,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,SAAS,QAAQ;AAAA,MACjB,YAAY,QAAQ;AAAA,MACpB,eAAe,QAAQ;AAAA,MACvB,SAAS;AAAA,MACT,aAAa,QAAQ;AAAA,MACrB;AAAA,MACA,YAAY,KAAK,SAAS;AAAA,IAC5B;AACA,UAAM,gBACJ,QAAQ,iBACP,OAAO,KAAK,iBAAiB,aAC1B,MAAM,KAAK,aAAa;AAAA,MACtB,YAAY,KAAK;AAAA,MACjB;AAAA,MACA,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,MACX,SAAS,KAAK,QAAQ;AAAA,MACtB,cAAc,KAAK,SAAS,QAAQ,CAAC;AAAA,IACvC,CAAC,IACD,KAAK;AACX,QAAI,kBAAkB,QAAW;AAC/B,YAAM,eAAe,MAAM,wBAAwB,aAAa;AAChE,mCAA6B,cAAc,QAAQ,OAAO,QAAQ,UAAU;AAC5E,aAAO,eAAe;AAAA,IACxB;AACA,WAAO,EAAE,QAAQ,kBAAkB,MAAM,GAAG,WAAW,gBAAgB;AAAA,EACzE;AAEA,QAAM,UAAU,MAAM,KAAK,EAAE,QAAQ,KAAK,IAAI,aAAa,MAAM,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC;AAC1F,QAAM,QAAQ,IAAI,OAAO;AAGzB,MAAI;AACJ,MAAI,KAAK,QAAQ;AACf,U
AAM,aAAoC;AAAA,MACxC,GAAG,KAAK;AAAA,MACR,YAAY,KAAK,OAAO;AAAA,MACxB,OAAO,aAAa,QAAQ,WAAW;AAAA,MACvC,aAAa,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAAA,MACzC,qBAAqB,uBAAuB;AAAA,IAC9C;AACA,aAAS,MAAM,eAAe,MAAM,UAAU;AAAA,EAChD;AAEA,QAAM,UAAU,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAE5C,SAAO;AAAA,IACL,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAIA,IAAM,qBAAN,cAAiC,MAAM;AAAA,EAC5B;AAAA,EACA;AAAA,EACT,YAAY,QAAmB,WAAgC;AAC7D,UAAM,QAAQ,OAAO,SAAS,IAAI,OAAO,UAAU,IAAI,OAAO,IAAI,YAAY,OAAO,MAAM,EAAE;AAC7F,SAAK,SAAS;AACd,SAAK,YAAY;AAAA,EACnB;AACF;AAEA,SAAS,sBAAsB,SAA6B;AAC1D,SAAO,CAAC,WAAmD;AACzD,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,WAAO,IAAI,0BAA0B;AAAA,MACnC,KAAK,GAAG,OAAO,eAAe,OAAO,KAAK;AAAA,IAC5C,CAAC;AAAA,EACH;AACF;AAEA,eAAe,wBACb,OAC2B;AAC3B,MAAI,mBAAmB,KAAK,GAAG;AAC7B,QAAI,CAAE,MAAM,uBAAuB,KAAK,GAAI;AAC1C,YAAM,IAAI,MAAM,iEAAiE;AAAA,IACnF;AACA,WAAO;AAAA,EACT;AACA,SAAO,sBAAsB,KAAK;AACpC;AAEA,SAAS,mBACP,OAC2B;AAC3B,SAAO,mBAAmB,SAAS,YAAY;AACjD;AAEA,SAAS,6BACP,SACA,OACA,YACM;AACN,MAAI,QAAQ,UAAU,UAAa,QAAQ,UAAU,OAAO;AAC1D,UAAM,IAAI;AAAA,MACR,wCAAwC,QAAQ,KAAK,mCAAmC,KAAK;AAAA,IAC/F;AAAA,EACF;AACA,MAAI,QAAQ,eAAe,UAAa,QAAQ,eAAe,YAAY;AACzE,UAAM,IAAI;AAAA,MACR,6CAA6C,QAAQ,UAAU,wCAAwC,UAAU;AAAA,IACnH;AAAA,EACF;AACF;AAEA,SAAS,aAAa,QAAuC;AAG3D,QAAM,OAAO,GAAG,OAAO,UAAU,KAAK,OAAO,SAAS,KAAK,OAAO,UAAU,KAAK,OAAO,IAAI;AAE5F,MAAI,KAAK;AACT,MAAI,KAAK;AACT,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,WAAW,CAAC;AAC3B,SAAK,KAAK,KAAK,KAAK,GAAG,QAAU,MAAM;AACvC,SAAK,KAAK,KAAK,KAAK,GAAG,UAAU,MAAM;AAAA,EACzC;AACA,SAAO,OAAO,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC,GAAG,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC;AACnF;","names":[]}