@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +2 -2
  3. package/dist/adapters/langchain.d.ts +2 -2
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  6. package/dist/analyst/index.d.ts +42 -8
  7. package/dist/analyst/index.js +32 -2
  8. package/dist/analyst/index.js.map +1 -1
  9. package/dist/authenticity/index.d.ts +54 -1
  10. package/dist/authenticity/index.js +88 -1
  11. package/dist/authenticity/index.js.map +1 -1
  12. package/dist/belief-state/index.d.ts +188 -0
  13. package/dist/belief-state/index.js +486 -0
  14. package/dist/belief-state/index.js.map +1 -0
  15. package/dist/benchmarks/index.d.ts +2 -2
  16. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  17. package/dist/campaign/index.d.ts +11 -11
  18. package/dist/campaign/index.js +4 -4
  19. package/dist/chunk-4DIJWVUT.js +131 -0
  20. package/dist/chunk-4DIJWVUT.js.map +1 -0
  21. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  22. package/dist/chunk-5LVWPNS5.js.map +1 -0
  23. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  24. package/dist/chunk-CF67I6QY.js.map +1 -0
  25. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  26. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  27. package/dist/chunk-KWRRMR3J.js.map +1 -0
  28. package/dist/chunk-NPCTHQIO.js +91 -0
  29. package/dist/chunk-NPCTHQIO.js.map +1 -0
  30. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  31. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  32. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  33. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  34. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  35. package/dist/contract/index.d.ts +128 -15
  36. package/dist/contract/index.js +118 -2
  37. package/dist/contract/index.js.map +1 -1
  38. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  39. package/dist/control.d.ts +2 -2
  40. package/dist/control.js +2 -2
  41. package/dist/governance/index.d.ts +1 -1
  42. package/dist/hosted/index.d.ts +4 -4
  43. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  44. package/dist/index.d.ts +127 -26
  45. package/dist/index.js +32 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  48. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  49. package/dist/meta-eval/index.d.ts +6 -99
  50. package/dist/meta-eval/index.js +7 -76
  51. package/dist/meta-eval/index.js.map +1 -1
  52. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  53. package/dist/openapi.json +1 -1
  54. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  55. package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
  56. package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
  57. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  60. package/dist/rl.d.ts +10 -140
  61. package/dist/rl.js +8 -122
  62. package/dist/rl.js.map +1 -1
  63. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
  64. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
  65. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  66. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
  67. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  68. package/dist/traces.d.ts +1 -1
  69. package/dist/traces.js +2 -2
  70. package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
  71. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  72. package/dist/workflow/index.d.ts +4 -4
  73. package/dist/workflow/index.js +1 -1
  74. package/docs/auto-research-loop-end-to-end.md +1 -1
  75. package/docs/feature-guide.md +4 -4
  76. package/docs/multi-shot-optimization.md +61 -115
  77. package/docs/product-eval-adoption.md +1 -1
  78. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  79. package/docs/research/research-roadmap.md +1 -0
  80. package/docs/three-package-architecture.md +1 -1
  81. package/docs/trace-analysis.md +19 -0
  82. package/package.json +7 -2
  83. package/dist/chunk-7W4SM7FD.js.map +0 -1
  84. package/dist/chunk-F3SRAAZO.js.map +0 -1
  85. package/dist/chunk-JYE3WOTE.js.map +0 -1
  86. package/dist/chunk-WYIHD6EB.js.map +0 -1
  87. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  88. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  89. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  90. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -0,0 +1,558 @@
1
+ # Belief-State Work in agent-eval
2
+
3
+ **Status:** Planning and tracking artifact.
4
+ **Created:** 2026-06-04.
5
+ **Owner:** agent-eval research track.
6
+ **Scope:** What belongs in `@tangle-network/agent-eval`, what should stay in runtime/knowledge/graph packages, and what must be proven before stable belief-state APIs ship.
7
+
8
+ ## Executive Summary
9
+
10
+ Belief-state work belongs in `agent-eval` only as an analysis-time and evaluation-time substrate: trace-derived state estimates, calibrated uncertainty, replay and off-policy evaluation (OPE) reports, selective action gates, memory policy evaluation, and causal attribution over mutable agent surfaces. It should not become the runtime's memory system, tool executor, workflow registry, or source of truth for production state.
11
+
12
+ The original "categorize the whole MDP" idea is useful as a research north star but too broad as the first build. The durable characterization is narrower: an agent is a partially observed adaptive control system over mutable surfaces, and `agent-eval` owns the evidence that says whether a proposed state/policy change is valid, calibrated, and worth shipping.
13
+
14
+ The first year should produce hard evidence: selectors that know when to abstain/verify/retry, OPE/replay for policy changes, memory admission gates, surface-level causal attribution, and held-out reports that beat baselines with confidence intervals. The second year should only happen if year one clears those gates: learned state estimators, graph-native provenance adapters, cross-domain transfer, and an externally defensible benchmark/paper package.
15
+
16
+ ## Source Register
17
+
18
+ | Source | Location | Durable Claim | Confidence | Next Check |
19
+ |---|---|---|---:|---|
20
+ | agent-eval current substrate | `README.md`, `src/run-record.ts`, `src/contract/analyze-runs.ts`, `src/trace/schema.ts` | `agent-eval` is the bottom substrate: RunRecord, traces, analysts, OPE, gates, and decision packets belong here. Runtime imports eval; eval must not import runtime. | High | Keep every belief-state primitive trace-derived or consumer-supplied. |
21
+ | Self-improvement roadmap | `docs/design/self-improvement-roadmap.md` | Existing program already frames run -> observe -> diagnose -> propose -> evaluate -> gate -> promote. Belief state must plug into this loop, not replace it. | High | Map each belief-state phase to an existing evidence bus, analyst, RL, gate, or report surface. |
22
+ | Research roadmap | `docs/research/research-roadmap.md` | The field claim is not "we built an architecture"; it is statistical foundation, two-writer state, and standard benchmark. | High | Require publishable experiments before public belief-state package claims. |
23
+ | Empirical proof pursuit | `.evolve/pursuits/2026-06-01-empirical-proof.md` | Mechanism ran end-to-end, but positive lift did not materialize. This is a warning: belief-state work must prove lift under headroom, not just add mechanism. | High | Pre-register baselines and kill criteria. |
24
+ | Belief-state bundle | `/Users/drew/code/belief-state-agents` | Good vocabulary: EvidenceAtom, RuntimeSnapshot, belief variables, hypotheses, memory state, action descriptors. Not production-ready; tests exposed Python/TS issues and some policy/math gaps. | Medium | Port concepts only after validating against agent-eval traces. |
25
+ | agent-runtime PR 155 | `https://github.com/tangle-network/agent-runtime/pull/155` | Docs-only PR, closed unmerged as of 2026-06-04. Useful as thinking, not landed evidence. | Medium | Do not cite it as product substrate. Extract only claims that survive repo-local verification. |
26
+ | PiGraph worldclass kit | `/Users/drew/code/pigraph-worldclass-kit` | Useful as graph/provenance adapter idea. Not core agent-eval: nondeterministic IDs/timestamps and linearized topology need correction before research use. | Medium | Build adapter after trace-derived belief state is measurable. |
27
+
28
+ ## Core Decision
29
+
30
+ Belief-state work should start in `agent-eval`, but not as "the agent's state." It should start as `agent-eval` research infrastructure for answering four questions:
31
+
32
+ 1. What did the agent appear to believe at each decision point, based only on trace evidence?
33
+ 2. Was that belief calibrated against later outcomes?
34
+ 3. Would a different policy over verify/retry/ask/memory/skill/tool/workflow choices have improved the outcome?
35
+ 4. Which mutable surface caused the improvement or regression?
36
+
37
+ That framing maps cleanly to existing code:
38
+
39
+ | Existing Surface | Why It Matters |
40
+ |---|---|
41
+ | `RunRecord` | Paper-grade run projection with pinned model/config/cost/outcome. Belief-state reports must be joinable to it. |
42
+ | `TraceSchema` | Source stream for evidence atoms: LLM spans, tools, retrieval, judges, state mutations, policy violations, budgets. |
43
+ | `AnalystRegistry` | Converts traces into findings; belief-state estimators should consume findings, not duplicate analyst pipelines. |
44
+ | `analyzeRuns` / `InsightReport` | Decision packet where belief calibration, abstention value, memory policy value, and surface attribution should surface. |
45
+ | `/rl/off-policy` | OPE is the correct first tool for "would another policy have done better?" |
46
+ | `counterfactual.ts` | Replay and mutation experiments for causal claims. |
47
+ | `causal-attribution.ts` | Surface-level variance attribution for model/prompt/tool/memory/workflow changes. |
48
+ | `knowledge/readiness.ts` | Existing readiness/gap model for knowledge requirements. Belief state should connect to it, not replace it. |
49
+ | `control-runtime.ts` | Generic observe -> validate -> decide -> act loop. Belief-state evaluation can score decisions produced by control loops without owning execution. |
50
+
51
+ ## Boundary Rules
52
+
53
+ In `agent-eval`:
54
+
55
+ - Trace-derived evidence atoms.
56
+ - State estimator reports.
57
+ - Calibration curves and expected calibration error (ECE) for state confidence.
58
+ - Selective prediction/abstention/retry/verify gates.
59
+ - OPE/replay for policy changes.
60
+ - Memory admission/update evaluation.
61
+ - Skill/tool/workflow selection evaluation.
62
+ - Prompt/directive/subagent surface attribution.
63
+ - Decision packet extensions and held-out promotion gates.
64
+
65
+ Not in `agent-eval`:
66
+
67
+ - Runtime memory writes.
68
+ - Tool execution.
69
+ - Workflow orchestration.
70
+ - Subagent lifecycle.
71
+ - Prompt registry ownership.
72
+ - Production state source of truth.
73
+ - Graph database ownership.
74
+ - Default LLM-backed researcher brain.
75
+
76
+ Downstream packages can own mutation and execution. `agent-eval` owns whether the mutation is supported by evidence.
77
+
78
+ ## First-Principles Characterization
79
+
80
+ The broad MDP frame is not wrong, but it hides the build order. For a real agent, the "state" contains at least:
81
+
82
+ - Environment state: files, APIs, user session, external systems.
83
+ - Agent-observed state: transcript, tool results, retrieved documents, artifacts.
84
+ - Internal policy surfaces: model, prompt, directives, skills, subagents, workflows, tool policies.
85
+ - Learned state: memory, preferences, playbooks, derived knowledge, failure findings.
86
+ - Evaluation state: judges, rubrics, gates, calibration reports, holdouts.
87
+ - Governance state: budgets, approvals, consent, risk class, data sensitivity.
88
+
89
+ The key refinement: do not try to enumerate the full latent state. Build calibrated sufficient statistics for specific decisions.
90
+
91
+ Concrete example:
92
+
93
+ - Bad first target: "model the complete belief state of the agent."
94
+ - Good first target: "given trace evidence before a risky action, estimate whether the agent should continue, verify, ask, retry, or stop, and prove that policy beats baseline on held-out runs."
95
+
96
+ ## Alternative Characterizations
97
+
98
+ | Characterization | What It Explains | Best First Use | Belongs Where | Verdict |
99
+ |---|---|---|---|---|
100
+ | POMDP / belief-state agent | Hidden task state, uncertainty, partial observation | Long-term formalism for papers | `agent-eval` research docs, later optional state-estimator API | Useful but too broad as first API. |
101
+ | Selective prediction | Knowing when not to act | abstain/verify/retry/ask gates | `agent-eval` | Build first. Highest signal-to-cost. |
102
+ | Off-policy evaluation / contextual bandits | Would another decision policy have done better? | replay old runs under candidate decision policy | `agent-eval/rl` | Build first. Existing substrate already supports it. |
103
+ | Typed stochastic computation graph | Provenance and topology of evidence, beliefs, decisions, outcomes | explainability and causal paths | separate graph adapter, fed by `agent-eval` traces | Build later, not core. |
104
+ | Causal credit assignment over mutable surfaces | Which change caused lift/regression? | prompt/model/tool/memory/workflow attribution | `agent-eval` | Build after replay corpus exists. |
105
+ | Resource-rational control | Cost-aware decide/verify/stop policies | budgeted verification and escalation | `agent-eval` + runtime policy consumer | Build early after selective gates. |
106
+ | Memory lifecycle / knowledge governance | When memories should be written, updated, trusted, forgotten | memory admission and poisoning detection | `agent-eval` evaluation; knowledge package execution | Build in year one. |
107
+ | Workflow/skill market | Policy over capabilities and subagents | skill/tool/workflow selection experiments | runtime executes; `agent-eval` evaluates | Build after baseline selectors. |
108
+ | Agent as self-modifying system | Prompts, skills, workflows, memories mutate over time | long-horizon self-improvement | across eval/runtime/knowledge/graph | Two-year target only. |
109
+
110
+ Recommendation: use "trace-grounded adaptive control" as the umbrella. Belief state is one estimator family inside that umbrella, not the entire architecture.
111
+
112
+ ## What I Would Do Differently From the Original Idea
113
+
114
+ Do not start by completely categorizing the MDP. That creates a taxonomy before we know which variables have predictive or causal value.
115
+
116
+ Start with decision points that are observable and valuable:
117
+
118
+ 1. Continue vs verify vs ask vs stop.
119
+ 2. Retry same approach vs change approach.
120
+ 3. Write memory vs skip memory.
121
+ 4. Retrieve memory vs ignore memory.
122
+ 5. Use skill/tool/workflow A vs B.
123
+ 6. Promote prompt/directive/subagent variant vs hold.
124
+
125
+ For each decision point, require:
126
+
127
+ - logged context,
128
+ - candidate actions,
129
+ - outcome,
130
+ - cost,
131
+ - calibration target,
132
+ - baseline policy,
133
+ - held-out split,
134
+ - replay/OPE support diagnostic,
135
+ - failure mode taxonomy,
136
+ - promotion gate.
137
+
138
+ Only after that should we generalize to a broader belief-state estimator.
139
+
140
+ ## Proposed Architecture
141
+
142
+ Current:
143
+
144
+ ```text
145
+ runtime/agents -> traces + RunRecord -> agent-eval analysts/gates/reports
146
+ knowledge -> memory/claims -> agent-eval readiness/evidence checks
147
+ experiments -> runs -> agent-eval OPE/replay/causal reports
148
+ ```
149
+
150
+ Target:
151
+
152
+ ```text
153
+ runtime/agents -> traces + RunRecord ------------------------------+
154
+ knowledge -> claims/memories/readiness -----------------------+
155
+ graph adapter -> optional provenance topology --------------------+
156
+ v
157
+ agent-eval belief-state research layer
158
+ - evidence extraction
159
+ - state estimator report
160
+ - calibration + abstention metrics
161
+ - replay/OPE policy value
162
+ - memory policy value
163
+ - surface attribution
164
+ v
165
+ InsightReport + HeldOutGate + research artifacts
166
+ ```
167
+
168
+ No runtime dependency is added to `agent-eval`. Runtime emits richer traces and consumes gates. Knowledge owns memory writes. A graph adapter consumes traces and emits topology features.
169
+
170
+ ## Minimal Future API Shape
171
+
172
+ Do not add this as a stable public API until the Phase 1 gates pass. The draft dogfood surface is `@tangle-network/agent-eval/experimental/belief-state`; the likely eventual stable subpath is `@tangle-network/agent-eval/belief-state`.
173
+
174
+ ```ts
175
+ export interface BeliefEvidenceAtom {
176
+ id: string
177
+ runId: string
178
+ stepIndex?: number
179
+ source: 'llm' | 'tool' | 'retrieval' | 'judge' | 'memory' | 'policy' | 'runtime'
180
+ subject: string
181
+ signal: string
182
+ value: unknown
183
+ confidence?: number
184
+ timestamp?: number
185
+ metadata?: Record<string, unknown>
186
+ }
187
+
188
+ export interface BeliefStateEstimate {
189
+ runId: string
190
+ stepIndex?: number
191
+ variables: Record<string, { value: unknown; confidence: number }>
192
+ evidenceIds: string[]
193
+ unsupportedVariables: string[]
194
+ }
195
+
196
+ export interface BeliefPolicyEvaluation {
197
+ policyId: string
198
+ baselinePolicyId: string
199
+ decisionKind: 'continue' | 'verify' | 'ask' | 'retry' | 'stop' | 'memory-write' | 'surface-promote'
200
+ n: number
201
+ calibration?: { ece: number; maxGap: number }
202
+ offPolicy?: { value: number; effectiveSampleSize: number; supportWarning?: string }
203
+ lift?: { delta: number; ci95: [number, number] }
204
+ recommendation: 'ship' | 'hold' | 'need_more_data'
205
+ }
206
+ ```
207
+
208
+ The API should report uncertainty and support problems, not hide them behind a scalar score.
209
+
210
+ ## Year 1 Roadmap
211
+
212
+ ### Q3 2026 - Phase 0: Tracking, Corpus, and Decision Inventory
213
+
214
+ - [ ] Keep this document current as the research tracker.
215
+ - [ ] Create a decision inventory over existing traces: continue, verify, retry, ask, stop, memory-write, memory-read, tool-select, skill-select, workflow-select, prompt-promote.
216
+ - [ ] Define trace extraction rules for each decision kind.
217
+ - [ ] Define the minimum event fields needed from runtime and knowledge packages.
218
+ - [ ] Build a replay corpus from existing `RunRecord` and trace stores.
219
+ - [ ] Label at least 200 decision points with outcome, cost, and whether the action was retrospectively correct.
220
+ - [ ] Add support diagnostics: missing candidates, missing outcomes, no cost, no raw trace, no held-out split.
221
+ - [ ] Decide which decision kind has enough data for Phase 1.
222
+
223
+ Completion criteria:
224
+
225
+ - [ ] Corpus has >= 200 labeled decision points across >= 3 projects or task families.
226
+ - [ ] Every row joins to a `RunRecord`.
227
+ - [ ] Every row has train/dev/holdout split.
228
+ - [ ] Backend and capture integrity are checked before analysis.
229
+ - [ ] No producerless schema fields are introduced.
230
+ - [ ] One baseline policy is recorded for every decision kind under study.
231
+
232
+ ### Q4 2026 - Phase 1: Selective Prediction and Abstention
233
+
234
+ - [ ] Build abstain/verify/ask/stop evaluation over the chosen decision kind.
235
+ - [ ] Compute calibration curves for confidence vs actual outcome.
236
+ - [ ] Compare baseline policy vs selective policy on holdout.
237
+ - [ ] Add cost-aware utility: quality lift minus verification/ask/retry cost.
238
+ - [ ] Add report rows into `InsightReport` or an experimental research report.
239
+ - [ ] Run negative controls: shuffled confidence, random abstention, always-verify, never-verify.
240
+ - [ ] Pre-register thresholds before holdout.
241
+
242
+ Completion criteria:
243
+
244
+ - [ ] Selective policy beats baseline utility on holdout with the lower confidence-interval bound on the lift above zero (CI.low > 0), or the result is recorded as an honest negative.
245
+ - [ ] ECE is lower than the uncalibrated baseline.
246
+ - [ ] Always-verify does not trivially dominate once cost is counted.
247
+ - [ ] No metric improves while user-visible outcome regresses.
248
+ - [ ] Report includes abstention coverage, error rate on accepted actions, rejected-action lift, and cost.
249
+
250
+ ### Q1 2027 - Phase 2: OPE and Replay for Decision Policies
251
+
252
+ - [ ] Convert decision logs into off-policy trajectories.
253
+ - [ ] Estimate target policy value using inverse propensity scoring (IPS), self-normalized IPS (SNIPS), and doubly robust (DR) estimators where support allows.
254
+ - [ ] Record effective sample size and max importance weight in every report.
255
+ - [ ] Add exact replay where request hashes match.
256
+ - [ ] Compare OPE predictions to live A/B or replay outcomes.
257
+ - [ ] Add support-mismatch failure mode.
258
+
259
+ Completion criteria:
260
+
261
+ - [ ] OPE only claims value when support diagnostics pass.
262
+ - [ ] SNIPS/DR estimates agree within a pre-registered tolerance, or the disagreement is reported.
263
+ - [ ] Exact replay validates at least one candidate policy.
264
+ - [ ] At least one target policy survives the held-out gate.
265
+ - [ ] Every result includes n, effective sample size, cost, and split.
266
+
267
+ ### Q2 2027 - Phase 3: Memory and Knowledge Policy Evaluation
268
+
269
+ - [ ] Evaluate memory write/admit/update/forget policies.
270
+ - [ ] Link memory policy decisions to `knowledge/readiness.ts` reports.
271
+ - [ ] Add poisoning/staleness/contradiction labels.
272
+ - [ ] Measure whether memory retrieval changes outcome, cost, or failure mode.
273
+ - [ ] Add counterfactual memory ablations where replay is possible.
274
+ - [ ] Define "memory should not have been written" and "memory should have been retrieved" labels.
275
+
276
+ Completion criteria:
277
+
278
+ - [ ] Memory policy improves holdout utility or reduces harmful memory events with CI support.
279
+ - [ ] Poisoning/staleness detection has measured precision/recall on labeled examples.
280
+ - [ ] Memory admission gate is fail-closed for secrets, stale claims, and untrusted sources.
281
+ - [ ] Knowledge readiness reports are present for tasks with declared requirements.
282
+ - [ ] No runtime memory write is owned by `agent-eval`.
283
+
284
+ ## Year 2 Roadmap
285
+
286
+ ### Q3 2027 - Phase 4: Surface Attribution and Graph Adapter
287
+
288
+ - [ ] Factor mutable surfaces: model, prompt, directive, skill, tool policy, workflow, memory policy, analyst, gate.
289
+ - [ ] Run factorial or quasi-factorial experiments where practical.
290
+ - [ ] Extend causal attribution reports to surface families.
291
+ - [ ] Build a PiGraph-style adapter from traces to typed provenance graph.
292
+ - [ ] Make graph IDs deterministic and topology-preserving before use in research claims.
293
+ - [ ] Compare graph features to non-graph features for predictive value.
294
+
295
+ Completion criteria:
296
+
297
+ - [ ] Surface attribution report identifies main effects and interactions with confidence intervals or explicit uncertainty.
298
+ - [ ] Graph adapter preserves branching topology.
299
+ - [ ] Graph features beat a flat trace baseline on at least one prediction task.
300
+ - [ ] Any graph claim is reproducible from trace IDs and commit hashes.
301
+
302
+ ### Q4 2027 - Phase 5: Learned State Estimators
303
+
304
+ - [ ] Train or fit state estimators from trace evidence to outcome-relevant variables.
305
+ - [ ] Compare learned estimators to deterministic baselines.
306
+ - [ ] Measure calibration, transfer, and drift.
307
+ - [ ] Add abstention when estimator confidence is unsupported.
308
+ - [ ] Audit leakage from judge outputs, holdout labels, or future trace spans.
309
+
310
+ Completion criteria:
311
+
312
+ - [ ] Learned estimator improves a downstream decision policy, not just variable prediction.
313
+ - [ ] Calibration holds on a future time split.
314
+ - [ ] Leakage audit passes.
315
+ - [ ] Estimator outputs unsupported variables explicitly.
316
+ - [ ] The estimator can be disabled without breaking existing reports.
317
+
318
+ ### Q1 2028 - Phase 6: Cross-Domain Transfer and Benchmark
319
+
320
+ - [ ] Build a benchmark with multiple domains and distribution shifts.
321
+ - [ ] Include no-belief, heuristic, selective, OPE, graph, and learned-estimator baselines.
322
+ - [ ] Publish scenario construction, splits, contamination checks, and scoring.
323
+ - [ ] Evaluate transfer: state estimator trained on domain A, used on domain B.
324
+ - [ ] Add partner/product traces only if consent, redaction, and governance are solved.
325
+
326
+ Completion criteria:
327
+
328
+ - [ ] Benchmark has >= 100 scenarios or >= 1,000 decision points.
329
+ - [ ] At least three baselines are strong enough to be credible.
330
+ - [ ] Held-out results include confidence intervals.
331
+ - [ ] Contamination guard passes.
332
+ - [ ] Public artifact can be reproduced from scripts, seeds, model snapshots, and data hashes.
333
+
334
+ ### Q2 2028 - Phase 7: Self-Improving Belief-State Policies
335
+
336
+ - [ ] Let the system propose candidate state features or decision policies.
337
+ - [ ] Gate feature/policy changes through the same held-out machinery.
338
+ - [ ] Add reward-hacking checks for estimator/gate manipulation.
339
+ - [ ] Add rollback rules for policies that regress production outcomes.
340
+ - [ ] Prepare paper/product package only if empirical gates pass.
341
+
342
+ Completion criteria:
343
+
344
+ - [ ] Self-proposed policy improves over the previous policy on holdout.
345
+ - [ ] Reward-hacking probes pass.
346
+ - [ ] Production feedback confirms or falsifies offline estimate.
347
+ - [ ] Rollback path is tested.
348
+ - [ ] External writeup can state a falsifiable claim without overstating mechanism.
349
+
350
+ ## Completion Checklist
351
+
352
+ Do not call belief-state work "done" until these are true:
353
+
354
+ - [ ] Source tracking exists and is updated after each material experiment.
355
+ - [ ] Every derived state variable has a producer and evidence path.
356
+ - [ ] Every decision policy has a baseline.
357
+ - [ ] Every reported lift has split, n, CI, cost, and integrity status.
358
+ - [ ] Every OPE result has effective sample size and support diagnostics.
359
+ - [ ] Every calibration claim has ECE or equivalent metric.
360
+ - [ ] Every memory claim has poisoning/staleness handling.
361
+ - [ ] Every graph claim preserves topology and deterministic identity.
362
+ - [ ] Every surface attribution claim distinguishes correlation from causal evidence.
363
+ - [ ] Every public API is disabled or marked experimental until a replay corpus validates it.
364
+ - [ ] No runtime ownership boundary is crossed from `agent-eval`.
365
+ - [ ] Negative results are recorded instead of hidden.
366
+
367
+ ## Kill Criteria
368
+
369
+ Stop or pivot if any of these persist for two consecutive phases:
370
+
371
+ - OPE support is too weak to make policy claims.
372
+ - Selective gates improve eval scores but not real outcomes.
373
+ - The best policy is always "ask/verify everything" after cost accounting.
374
+ - Memory policy evaluation cannot distinguish useful memory from context bloat.
375
+ - Graph features do not beat flat trace features.
376
+ - Belief-state variables are mostly hand-labeled and not trace-derived.
377
+ - The work requires runtime ownership to make sense.
378
+ - The roadmap produces mechanism but no held-out lift or calibrated reduction in risk.
379
+
380
+ ## Immediate Next Build
381
+
382
+ The first code build should be small and internal:
383
+
384
+ 1. Add a decision-point extraction experiment that reads existing traces and emits JSONL rows joined to `RunRecord`.
385
+ 2. Add a selective prediction report over one decision kind.
386
+ 3. Add a calibration report using existing `meta-eval/calibration.ts` patterns.
387
+ 4. Add a holdout gate that reports an honest negative if no utility lift appears.
388
+
389
+ Do not add a stable public `belief-state` subpath until the first selective policy clears its completion criteria.
390
+
391
+ ## Exact Integration Map
392
+
393
+ The most succinct integration is an experimental `src/belief-state/` module that consumes existing substrate data and produces a policy-evaluation report. It should not create a second runtime model.
394
+
395
+ ### Existing Abstractions to Extend
396
+
397
+ | Existing abstraction | Fit | Extension |
398
+ |---|---|---|
399
+ | `TraceStore` + `TraceSchema` | Good source of evidence. Runs, spans, and custom events already carry the data needed for decision extraction. | Use existing `custom`, `state_mutation`, and `policy_violation` events. Do not add event kinds until producers prove a missing field. |
400
+ | `RunRecord` | Good analysis join row: run id, candidate, split, seed, model, hashes, cost, outcome. | Do not change it. Belief decision rows should be sidecar records keyed by `runId`, `scenarioId`, and `stepIndex`. |
401
+ | `AnalystFinding` / evidence refs | Good optional semantic evidence layer. | Reference findings by id when present; do not require analysts for deterministic extraction. |
402
+ | `/rl/off-policy` | Already owns IPS/SNIPS/DR estimators and support diagnostics. | Add a converter from decision rows to `OffPolicyTrajectory[]` using an explicit named target policy; do not reimplement OPE math. |
403
+ | `meta-eval/calibration.ts` | Good calibration shape, but store-bound today. | Add a pure `calibrationFromPairs()` helper and reuse it from both meta-eval and belief-state. |
404
+ | `InsightReport` | Correct eventual home for summary rows. | Do not extend in the first internal slice. Add `beliefPolicies?: BeliefPolicyInsight[]` only after one policy clears holdout gates. |
405
+ | `control-runtime.ts` | Useful producer shape for typed decisions. | Optional adapter from `ControlRunResult` to decision rows. Do not make belief-state depend solely on control loops. |
406
+
407
+ ### Files to Add First
408
+
409
+ | File | Purpose | Notes |
410
+ |---|---|---|
411
+ | `src/belief-state/types.ts` | Defines `BeliefDecisionPoint`, `BeliefDecisionKind`, `BeliefActionChoice`, `BeliefDecisionOutcome`, `BeliefEvidenceRef`, `BeliefPolicyEvaluationReport`, `SupportDiagnostics`. | Pure types. No runtime dependency. |
412
+ | `src/belief-state/extract.ts` | Extracts decision points from `TraceStore` runs/spans/events. | Structural parsing only. Unknown events are skipped with diagnostics. |
413
+ | `src/belief-state/selective.ts` | Evaluates continue/verify/ask/retry/stop policies against observed outcomes. | Computes coverage, accepted-error rate, rejected-action lift, cost-adjusted utility. |
414
+ | `src/belief-state/calibration.ts` | Computes confidence calibration for decision predictions. | Calls shared `calibrationFromPairs()` once added. |
415
+ | `src/belief-state/ope.ts` | Converts decision rows into `OffPolicyTrajectory[]` for an explicit named target policy and calls `offPolicyEstimateAll`. | Must report ESS and support mismatch; no silent value claims. |
416
+ | `src/belief-state/report.ts` | Orchestrates extraction + selective eval + calibration + OPE into one report. | Returns honest negative / need-more-data when unsupported. |
417
+ | `src/belief-state/index.ts` | Experimental barrel for the module. | Keep out of root barrel and expose only through `./experimental/belief-state` while evidence gates are open. |
418
+
419
+ ### Files to Change First
420
+
421
+ | File | Change | Why |
422
+ |---|---|---|
423
+ | `src/meta-eval/calibration.ts` | Extract pure `calibrationFromPairs(pairs, options)` and keep `calibrationCurve()` as the TraceStore/OutcomeStore wrapper. | Avoid duplicate ECE/binning logic. |
424
+ | `src/rl/index.ts` | No change in the first slice unless helper types need re-export. | OPE stays under `/rl`; belief-state imports it internally. |
425
+ | `docs/research/belief-state-agent-eval-roadmap.md` | Keep this tracker updated with real results. | Prevent mechanism drift and unsupported claims. |
426
+ | `.evolve/pursuits/2026-06-04-belief-state-agent-eval.md` | Update phase status and empirical result. | Keeps active research state discoverable. |
427
+
428
+ ### Files Not to Change First
429
+
430
+ | File | Reason |
431
+ |---|---|
432
+ | `src/run-record.ts` | Adding belief fields here would pollute the paper-grade run row before producers exist. Use sidecar rows. |
433
+ | `src/trace/schema.ts` | Existing `custom` events and span attributes are enough for Phase 0. Schema bumps need producer evidence. |
434
+ | `src/contract/index.ts` | `/contract` is the stable LAND-tier surface. Belief-state should not enter it until proven. |
435
+ | `src/contract/insight-report.ts` | Do not add `beliefPolicies` until the module has one validated report shape. |
436
+ | `src/index.ts` | Root barrel is already broad; do not add experimental research APIs there. |
437
+ | `package.json` exports / `tsup.config.ts` | Defer stable `./belief-state` subpath until Phase 1 gates pass. Experimental dogfooding may use `./experimental/belief-state`. |
438
+
439
+ ### Public Export Gate
440
+
441
+ During the draft phase:
442
+
443
+ - `tsup.config.ts`: entry `'belief-state/index': 'src/belief-state/index.ts'` may exist to build the experimental subpath.
444
+ - `package.json`: export `"./experimental/belief-state"` only.
445
+ - Docs and PR bodies must call the surface experimental.
446
+
447
+ Only after Phase 1 succeeds, promote to stable:
448
+
449
+ - `package.json`: export `"./belief-state"` to `dist/belief-state/index.js`.
450
+ - `docs/feature-guide.md` or a dedicated docs page: document the promotion evidence and remaining caveats.
451
+ - `src/contract/insight-report.ts`: optional `beliefPolicies?: BeliefPolicyInsight[]`, only if the report is stable enough for dashboards.
452
+
453
+ Do not add to `/contract` until at least one downstream product uses it without source imports and the report shape survives a second corpus.
454
+
455
+ ### Minimal Data Model
456
+
457
+ ```ts
458
+ export type BeliefDecisionKind =
459
+ | 'continue'
460
+ | 'verify'
461
+ | 'ask'
462
+ | 'retry'
463
+ | 'stop'
464
+ | 'memory-write'
465
+ | 'memory-read'
466
+ | 'tool-select'
467
+ | 'skill-select'
468
+ | 'workflow-select'
469
+ | 'surface-promote'
470
+
471
+ export interface BeliefDecisionPoint {
472
+ id: string
473
+ runId: string
474
+ scenarioId?: string
475
+ stepIndex: number
476
+ kind: BeliefDecisionKind
477
+ chosenAction: string
478
+ candidateActions?: string[]
479
+ confidence?: number
480
+ behaviorProb?: number
481
+ targetProb?: number
482
+ costUsd?: number
483
+ evidence: BeliefEvidenceRef[]
484
+ outcome?: BeliefDecisionOutcome
485
+ metadata?: Record<string, unknown>
486
+ }
487
+ ```
488
+
489
+ This is intentionally a decision-point schema, not a global belief-state schema. Global state estimators can be built later from these rows.
490
+
491
+ ### Evaluation Criteria
492
+
493
+ Phase 0 corpus admission:
494
+
495
+ - [ ] >= 200 decision points or the report returns `need_more_data`.
496
+ - [ ] >= 3 task families or explicit single-domain label.
497
+ - [ ] 100% of decision points join to a `RunRecord.runId`.
498
+ - [ ] Every row has `kind`, `chosenAction`, `stepIndex`, and at least one evidence ref.
499
+ - [ ] Every analyzed run passes backend/capture integrity before scoring.
500
+ - [ ] Train/dev/holdout split is present.
501
+
502
+ Selective policy:
503
+
504
+ - [ ] Cost-adjusted utility beats baseline on holdout with CI.low > 0, or the report records an honest negative.
505
+ - [ ] Accepted-action error rate decreases relative to baseline.
506
+ - [ ] Coverage is reported; low coverage cannot masquerade as high quality.
507
+ - [ ] Always-verify and never-verify baselines are included.
508
+ - [ ] Shuffled-confidence negative control does not pass.
509
+
510
+ Calibration:
511
+
512
+ - [ ] ECE is reported.
513
+ - [ ] Max bin gap is reported.
514
+ - [ ] Calibration improves over uncalibrated confidence or reports failure.
515
+ - [ ] Confidence with fewer than 2 bins of support returns `need_more_data`.
516
+
517
+ OPE:
518
+
519
+ - [ ] ESS >= 30 and ESS/raw n >= 0.25 before making a value claim.
520
+ - [ ] Max importance weight <= configured cap, or report support mismatch.
521
+ - [ ] IPS, SNIPS, and DR either agree within tolerance or disagreement is surfaced.
522
+ - [ ] No propensity defaults are invented. Missing propensities disable OPE.
523
+
524
+ Promotion:
525
+
526
+ - [ ] Recommendation is one of `ship`, `hold`, `need_more_data`.
527
+ - [ ] `ship` requires selective-policy lift plus calibration support, not OPE alone.
528
+ - [ ] Any missing integrity, split, outcome, cost, or support data forces `need_more_data` or `hold`.
529
+
530
+ ### Test Map
531
+
532
+ | Test file | Required cases |
533
+ |---|---|
534
+ | `src/belief-state/extract.test.ts` | extracts from custom trace events; joins run ids; skips malformed events with diagnostics; never throws on unknown payloads. |
535
+ | `src/belief-state/selective.test.ts` | baseline vs selective utility; always-verify cost penalty; shuffled confidence negative control; honest negative. |
536
+ | `src/belief-state/calibration.test.ts` | ECE bins; equal-width/equal-frequency behavior; too-few-pairs returns unsupported. |
537
+ | `src/belief-state/ope.test.ts` | converts to `OffPolicyTrajectory`; explicit target policy required; invalid propensity disables OPE without throwing; low ESS support mismatch; estimator agreement surfaced. |
538
+ | `src/belief-state/report.test.ts` | full report status: `ship`, `hold`, `need_more_data`; recommendation cannot ship on OPE alone. |
539
+ | `src/meta-eval/calibration.test.ts` | existing `calibrationCurve()` still works after extracting pure helper. |
540
+
541
+ ### Verification Commands
542
+
543
+ - `pnpm typecheck`
544
+ - `pnpm test -- src/belief-state src/meta-eval/calibration.test.ts`
545
+ - `pnpm build`
546
+ - `pnpm verify:package`
547
+
548
+ ### One-Sprint Implementation Order
549
+
550
+ 1. Add types and extraction.
551
+ 2. Add pure calibration helper.
552
+ 3. Add selective-policy evaluator.
553
+ 4. Add OPE converter using `/rl/off-policy`.
554
+ 5. Add report orchestrator.
555
+ 6. Add tests.
556
+ 7. Run on one existing trace corpus and update this tracker with the honest result.
557
+
558
+ This is the smallest integration that is useful: it answers "when should the agent continue/verify/ask/retry/stop?" with evidence, and it leaves memory, graph topology, and learned estimators for later phases.
@@ -123,6 +123,7 @@ Each is a falsifiable claim or unanswered formal question. Each maps to publisha
123
123
  - `docs/research/<thesis>/notes.md` — running research log, hypothesis, current status.
124
124
  - `docs/research/<thesis>/experiments.md` — every run + numbers + analysis.
125
125
  - `docs/research/<thesis>/paper-draft.md` — building toward arXiv submission.
126
+ - `docs/research/belief-state-agent-eval-roadmap.md` — belief-state / adaptive-control research tracker and 24-month gate plan.
126
127
  - `.evolve/research/<thesis>/` — code + data + figures, version-controlled.
127
128
 
128
129
  **Quality bar.**
@@ -136,7 +136,7 @@ report, RL bridge).
136
136
 
137
137
  | From → To | Type | What it carries |
138
138
  |---|---|---|
139
- | agent-knowledge → agent-eval | `RunRecord` | (consumed via `runMultiShotOptimization` for knowledge-base optimization) |
139
+ | agent-knowledge → agent-eval | `RunRecord` | (consumed via `runImprovementLoop` for knowledge-base optimization) |
140
140
  | agent-knowledge → agent-eval | `KnowledgeReadinessReport`, `KnowledgeBundle`, `KnowledgeRequirement` | (re-exported from agent-eval; agent-knowledge populates) |
141
141
  | agent-knowledge → agent-eval | `ControlRuntimeConfig<KnowledgeBaseCandidate>` | (knowledge research adapter) |
142
142
  | agent-runtime → agent-eval | `runAgentControlLoop`, `scoreKnowledgeReadiness`, `blockingKnowledgeEval` | (consumed; agent-runtime calls these in its task lifecycle) |
@@ -44,6 +44,25 @@ console.log(result.findings)
44
44
  Products can pass any `TraceAnalysisStore`; they do not need to use the file
45
45
  store in production.
46
46
 
47
+ ## Deterministic failure coverage (no LLM)
48
+
49
+ Before (or alongside) running the LLM analyst, call `OtlpFileTraceStore.getOverview()`: it returns a
50
+ `DatasetOverview` whose `error_clusters` are computed deterministically — error
51
+ spans are grouped by a normalized failure signature (UUIDs / hex IDs / numbers /
52
+ absolute paths / durations collapsed), each cluster carrying its prevalence,
53
+ exemplar `trace_id`/`span_id`, and a verbatim sample. This is a zero-LLM,
54
+ reproducible failure checklist the analyst then explains and closes:
55
+
56
+ ```ts
57
+ const overview = await store.getOverview()
58
+ for (const c of overview.error_clusters) {
59
+ console.log(`${c.trace_count}× ${c.signature} — e.g. trace ${c.exemplar_trace_ids[0]}`)
60
+ }
61
+ ```
62
+
63
+ See `failureClusters` in [insight-report.md](./insight-report.md) and the
64
+ `ErrorCluster` type doc-comments for the field-level contract.
65
+
47
66
  ## Required Trace Shape
48
67
 
49
68
  Every serious product run should include:
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.77.0",
4
- "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
3
+ "version": "0.80.0",
4
+ "description": "Decision-grade evals for agents: one call — selfImprove (closed loop) or analyzeRuns (observed runs) — returns a decision packet: lift CI, judge calibration, contamination check, failure clusters, cost-quality Pareto, ranked actions. The scoring and ship-gate substrate @tangle-network/agent-runtime delegates to; TypeScript and Python over one wire protocol.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
7
7
  "type": "git",
@@ -119,6 +119,11 @@
119
119
  "import": "./dist/authenticity/index.js",
120
120
  "default": "./dist/authenticity/index.js"
121
121
  },
122
+ "./experimental/belief-state": {
123
+ "types": "./dist/belief-state/index.d.ts",
124
+ "import": "./dist/belief-state/index.js",
125
+ "default": "./dist/belief-state/index.js"
126
+ },
122
127
  "./workflow": {
123
128
  "types": "./dist/workflow/index.d.ts",
124
129
  "import": "./dist/workflow/index.js",