@tangle-network/agent-eval 0.79.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +1 -1
  5. package/dist/analyst/index.d.ts +3 -3
  6. package/dist/belief-state/index.d.ts +188 -0
  7. package/dist/belief-state/index.js +486 -0
  8. package/dist/belief-state/index.js.map +1 -0
  9. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  10. package/dist/campaign/index.d.ts +5 -5
  11. package/dist/chunk-4DIJWVUT.js +131 -0
  12. package/dist/chunk-4DIJWVUT.js.map +1 -0
  13. package/dist/chunk-NPCTHQIO.js +91 -0
  14. package/dist/chunk-NPCTHQIO.js.map +1 -0
  15. package/dist/contract/index.d.ts +123 -10
  16. package/dist/contract/index.js +116 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/governance/index.d.ts +1 -1
  19. package/dist/hosted/index.d.ts +1 -1
  20. package/dist/index.d.ts +5 -5
  21. package/dist/meta-eval/index.d.ts +5 -98
  22. package/dist/meta-eval/index.js +7 -76
  23. package/dist/meta-eval/index.js.map +1 -1
  24. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  25. package/dist/openapi.json +1 -1
  26. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  27. package/dist/{provenance-CEAJI9rm.d.ts → provenance-jG-Gngg8.d.ts} +2 -2
  28. package/dist/{registry-BmEuU94S.d.ts → registry-BK0Zee01.d.ts} +1 -1
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/rl.d.ts +6 -136
  31. package/dist/rl.js +6 -120
  32. package/dist/rl.js.map +1 -1
  33. package/dist/{rubric-predictive-validity-CWyWWLBg.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +1 -1
  34. package/dist/{run-improvement-loop-Bgu4C59E.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +1 -1
  35. package/dist/{semantic-concept-judge-Du4ZVyef.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +1 -1
  36. package/dist/{types-QHG0KnkF.d.ts → types-4mm2msnR.d.ts} +1 -1
  37. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  38. package/docs/research/research-roadmap.md +1 -0
  39. package/package.json +7 -2
package/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # `@tangle-network/agent-eval`
2
2
 
3
- **Ship better agent prompts with statistical confidence.** One function call returns a decision packet: lift CI, judge calibration, contamination check, failure clusters, cost-quality Pareto, and a ranked action list. Same shape whether you've got a closed improvement loop or just production logs.
3
+ **Decision-grade evals for agents.** One function call returns a decision packet — lift CI, judge calibration, contamination check, failure clusters, cost-quality Pareto, and a ranked action list — with the same shape whether you have a closed improvement loop or just production logs.
4
+
5
+ It is the **substrate at the bottom of the stack**: [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime) runs agents and captures every run as a trace, then delegates scoring and the ship gate here. The dependency arrow only points up — agent-eval never imports the runtime.
4
6
 
5
7
  [![npm](https://img.shields.io/npm/v/@tangle-network/agent-eval.svg)](https://www.npmjs.com/package/@tangle-network/agent-eval)
6
8
  [![pypi](https://img.shields.io/pypi/v/agent-eval-rpc.svg)](https://pypi.org/project/agent-eval-rpc/)
@@ -207,28 +209,55 @@ Each example: `README.md` + a single `index.ts` runnable via `pnpm tsx`. Prints
207
209
 
208
210
  | Subpath | What it gives you |
209
211
  |---|---|
210
- | `@tangle-network/agent-eval/contract` | **The headline surface.** `selfImprove`, `analyzeRuns`, `runImprovementLoop`, `runCampaign`, `runEval`, `diffRuns`, intake adapters (`fromFeedbackTable`, `fromOtelSpans`), drivers (`gepaDriver`, `evolutionaryDriver`), gates (`defaultProductionGate`, `heldOutGate`, `composeGate`), storage. **New code starts here.** |
211
- | `@tangle-network/agent-eval/hosted` | Hosted-tier wire-format types + `createHostedClient` to ship eval-run events + trace spans to any orchestrator speaking the spec |
212
- | `@tangle-network/agent-eval/adapters/otel` | `createOtelBridge` — forwards OpenTelemetry-shape spans into the hosted-tier ingest |
213
- | `@tangle-network/agent-eval/adapters/langchain` | LangChain runnable `Dispatch` adapter |
214
- | `@tangle-network/agent-eval/adapters/http` | `httpDispatch` + `runDispatchServer` for distributed campaigns across machines |
215
- | `@tangle-network/agent-eval/campaign` | Lower-level campaign primitives (storage, drivers, types) |
216
- | `@tangle-network/agent-eval/multishot` | N-shot persona × shot matrix runner |
217
- | `@tangle-network/agent-eval/control` | Agent control loop primitives (`runAgentControlLoop`, action policy, propose/review) |
218
- | `@tangle-network/agent-eval/traces` | Trace stores, emitters, OTLP-JSONL replay |
219
- | `@tangle-network/agent-eval/reporting` | Release confidence, paired stats, sequential e-values, launch reports |
220
- | `@tangle-network/agent-eval/rl` | RL bridge — verifiable rewards, preferences, OPE, PRM, tournaments, contamination, compute curves, auto-research |
221
- | `@tangle-network/agent-eval/matrix` | N-axis cartesian over substrate types |
222
- | `@tangle-network/agent-eval/wire` | HTTP/RPC server + Zod schemas (same protocol the Python client speaks) |
223
- | `@tangle-network/agent-eval/benchmarks` | Benchmark adapter contracts and reference wrappers |
224
-
225
- The root export remains available for backward compatibility; new code should prefer focused subpaths. Anything under `/rl`, `/pipelines`, `/meta-eval`, `/prm`, or `/builder-eval` is **only** reachable via its subpath.
212
+ | `…/contract` | **The headline, frozen surface — new code starts here.** `selfImprove`, `analyzeRuns`, `runEval`, `runCampaign`, `runImprovementLoop`, `diffRuns`; intake adapters (`fromFeedbackTable`, `fromOtelSpans`); drivers (`gepaDriver`, `evolutionaryDriver`); gates (`defaultProductionGate`, `heldOutGate`, `paretoSignificanceGate`, `composeGate`); the deployment-outcome store; storage; and the five core types `Scenario` / `Dispatch` / `JudgeConfig` / `Mutator` / `Gate`. |
213
+ | `…/hosted` | `createHostedClient` / `hostedClientFromEnv` + the wire types to ship eval-run events + trace spans to a hosted orchestrator (ours or your own implementation of the spec) |
214
+ | `…/adapters/otel` | `createOtelBridge` — forwards OpenTelemetry-shape spans into the hosted-tier ingest, no `@opentelemetry/*` dependency |
215
+ | `…/adapters/langchain` | Wrap any LangChain `Runnable` as a `Dispatch` (or `JudgeConfig`), no `@langchain/core` peer dep |
216
+ | `…/adapters/http` | `httpDispatch` + `runDispatchServer` run a campaign's worker on another machine (multi-region, driver-as-a-service) |
217
+ | `…/campaign` | **The measurement + improvement engine** (`@experimental`): `runProfileMatrix`, `compareDrivers`, every driver (`gepaDriver`, `haloDriver`, `skillOptDriver`, `aceDriver`, `memoryCurationDriver`, …), the gates, storage backends, and loop provenance. `/contract` re-exports the stable subset. |
218
+ | `…/rl` | RL bridge from eval artifacts to training signal: verifiable rewards, preferences, OPE, PRM, tournaments, contamination, compute curves, plus the durable corpus + `buildRlDataset` / datasheet bundle |
219
+ | `…/reporting` | Release-decision statistics: `pairedBootstrap`, `benjaminiHochberg`, anytime-valid sequential e-values, `evaluateReleaseConfidence`, and the report renderers |
220
+ | `…/analyst` | The trace-analyst surface: `AnalystRegistry` + `buildDefaultAnalystRegistry` (run the failure-clustering panel), `FindingsStore`, and the LLM chat transports |
221
+ | `…/traces` | Trace stores + emitters, OTLP-JSONL deterministic replay, `analyzeTraces`, and the `traceAnalystOnRunComplete` hook |
222
+ | `…/control` | Agent control loop: `runAgentControlLoop` (observe → validate → decide → act), action policy, propose/review |
223
+ | `…/matrix` | `runAgentMatrix` — an N-axis cartesian over caller-supplied substrate values, per-axis pass/score/cost/duration |
224
+ | `…/multishot` | N-shot persona × shot matrix runner (`runMultishot` / `runMultishotMatrix`) |
225
+ | `…/wire` | The cross-language HTTP/RPC server + Zod schemas (the source-of-truth protocol the Python client speaks) + the built-in rubric registry |
226
+ | `…/benchmarks` | `BenchmarkAdapter` contract + `deterministicSplit` + the bundled `routing` reference benchmark |
227
+
228
+ **Specialized surfaces** (subpath-only): `…/prm` (process-reward grading + best-of-N), `…/meta-eval` (judge calibration + the deployment-outcome store), `…/pipelines` (trace-diagnostic views: budget breach, failure cluster, stuck loop, …), `…/governance` (EU AI Act / NIST AI RMF / SOC2 reports), `…/knowledge` (knowledge-readiness gating before a run), `…/builder-eval` (code-generator three-layer eval), `…/storyboard` (trace → watchable replay), `…/authenticity` (anti-Goodhart "real or convincing BS" scorer over produced files), `…/workflow` (workflow-trace eval + partner export), `…/telemetry` (Workers-safe telemetry client).
229
+
230
+ The root export remains available for backward compatibility; new code should prefer the focused subpaths above — `/contract` first.
231
+
232
+ ---
233
+
234
+ ## Composition with the stack
235
+
236
+ agent-eval is the bottom of the layering: consumers depend on it, it depends on none of them.
237
+
238
+ ```
239
+ agent-runtime Runs agents (chat turns, one-shot tasks, multi-attempt loops), captures every
240
+ run as a trace, and calls optimizePrompt / runImprovementLoop. Produces the
241
+ RunRecords + traces agent-eval scores. Depends on agent-eval.
242
+
243
+ agent-eval selfImprove, analyzeRuns, runCampaign + drivers (gepaDriver, …), the gates
244
+ (this repo) (heldOutGate, defaultProductionGate, paretoSignificanceGate), the InsightReport
245
+ decision packet, the RL bridge, the wire protocol. Depends on neither consumer.
246
+
247
+ agent-knowledge proposeKnowledgeWrites / applyKnowledgeWriteBlocks. agent-eval's analyst findings
248
+ feed it; the knowledge gate consumes them. Depends on agent-eval.
249
+
250
+ sandbox AgentProfile, Sandbox.create, streamPrompt. The execution surface the runtime's
251
+ loops run on; agent-eval scores what comes back.
252
+ ```
253
+
254
+ The rule: **agent-eval has zero upward dependencies on a consumer.** A concept that makes sense *without* a running agent loop — a verdict, a run record, a scenario, a judge score — is substrate and lives here; a runtime-shaped one (a sandbox profile, a validation context with an abort signal) lives in agent-runtime. When in doubt, lean substrate.
226
255
 
227
256
  ---
228
257
 
229
258
  ## Concepts + design
230
259
 
231
- - [`docs/concepts.md`](./docs/concepts.md) — five types, three top-level functions, the layering rule, the wire protocol contract
260
+ - [`docs/concepts.md`](./docs/concepts.md) — the three top-level functions, the layering rule, and the wire-protocol contract (the five core contract types are documented in the `/contract` barrel itself)
232
261
  - [`docs/insight-report.md`](./docs/insight-report.md) — annotated walkthrough of every section of the decision packet
233
262
  - [`docs/customer-journeys.md`](./docs/customer-journeys.md) — three end-to-end journeys with code + expected output
234
263
  - [`docs/adapters-observability.md`](./docs/adapters-observability.md) — composing agent-eval with LangSmith, Langfuse, Phoenix, OpenLLMetry, TraceAI
@@ -287,7 +316,9 @@ pnpm test
287
316
 
288
317
  ## Stability + versioning
289
318
 
290
- Public exports carry JSDoc stability markers visible in IDE hover + `.d.ts`:
319
+ The `/contract` surface is the **stability contract**: its barrel freezes the API — a `0.x` minor only *adds*; nothing there changes shape or disappears. Depend on `/contract` (and the documented subpaths) rather than the root barrel.
320
+
321
+ In the deeper subpaths, `@stable` / `@experimental` JSDoc markers (visible in IDE hover + `.d.ts`) call out what may still move — most granularly in `/rl` (tagged per export) and `/campaign` (whole barrel `@experimental`, since `/contract` re-exports only its settled subset).
291
322
 
292
323
  | Tag | Meaning |
293
324
  |---|---|
@@ -1,4 +1,4 @@
1
- import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-QHG0KnkF.js';
1
+ import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-4mm2msnR.js';
2
2
  import '../run-record-sItO5ftF.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
@@ -1,4 +1,4 @@
1
- import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-QHG0KnkF.js';
1
+ import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-4mm2msnR.js';
2
2
  import '../run-record-sItO5ftF.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
@@ -1,5 +1,5 @@
1
1
  import { TraceSpanEvent, HostedClient } from '../hosted/index.js';
2
- import '../types-QHG0KnkF.js';
2
+ import '../types-4mm2msnR.js';
3
3
  import '../run-record-sItO5ftF.js';
4
4
  import '../errors-Dwqw-T_m.js';
5
5
  import '../schema-m0gsnbt3.js';
@@ -1,8 +1,8 @@
1
1
  import { AxAIService, AxFunction } from '@ax-llm/ax';
2
2
  import { M as MultiLayerVerifier, V as VerifyOptions, S as Severity } from '../multi-layer-verifier-DlWCXuxL.js';
3
3
  import { c as RunCritic, a as RunTrace } from '../run-critic-BAIjX99r.js';
4
- import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-Du4ZVyef.js';
5
- export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-Du4ZVyef.js';
4
+ import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-qXEUV2w7.js';
5
+ export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-qXEUV2w7.js';
6
6
  import { A as AnalyzeTracesOptions } from '../analyst-t7zZS3TV.js';
7
7
  import { T as TraceAnalysisStore } from '../store-GmBE2pZZ.js';
8
8
  import { b as JudgeFn, a as JudgeInput } from '../types-Croy5h7V.js';
@@ -10,7 +10,7 @@ import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../type
10
10
  export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-DRvV0zRo.js';
11
11
  import { TCloud } from '@tangle-network/tcloud';
12
12
  export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DqV2t1Xk.js';
13
- export { a as AnalystHooks, A as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-BmEuU94S.js';
13
+ export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-BK0Zee01.js';
14
14
  import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
15
15
  import '../schema-m0gsnbt3.js';
16
16
  import '../store-CKUAgsJz.js';
@@ -0,0 +1,188 @@
1
+ import { c as CalibrationReport } from '../calibration-Cpr3WaX3.js';
2
+ import { O as OffPolicyEstimate, a as OffPolicyOptions, b as OffPolicyTrajectory } from '../off-policy-DiwuKKg7.js';
3
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
4
+ import '../schema-m0gsnbt3.js';
5
+ import '../outcome-store-rnXLEqSn.js';
6
+
7
+ type BeliefDecisionKind = 'continue' | 'verify' | 'ask' | 'retry' | 'stop' | 'memory-write' | 'memory-read' | 'tool-select' | 'skill-select' | 'workflow-select' | 'surface-promote';
8
+ type BeliefEvidenceSource = 'run' | 'span' | 'event' | 'finding' | 'memory' | 'knowledge' | 'policy';
9
+ interface BeliefEvidenceRef {
10
+ source: BeliefEvidenceSource;
11
+ id: string;
12
+ runId?: string;
13
+ spanId?: string;
14
+ eventId?: string;
15
+ detail?: string;
16
+ metadata?: Record<string, unknown>;
17
+ }
18
+ interface BeliefDecisionOutcome {
19
+ success?: boolean;
20
+ score?: number;
21
+ reward?: number;
22
+ costUsd?: number;
23
+ observedAt?: string;
24
+ metadata?: Record<string, unknown>;
25
+ }
26
+ interface BeliefDecisionPoint {
27
+ id: string;
28
+ runId: string;
29
+ scenarioId?: string;
30
+ stepIndex: number;
31
+ kind: BeliefDecisionKind;
32
+ chosenAction: string;
33
+ candidateActions?: string[];
34
+ confidence?: number;
35
+ behaviorProb?: number;
36
+ targetProb?: number;
37
+ qHat?: number | null;
38
+ costUsd?: number;
39
+ evidence: BeliefEvidenceRef[];
40
+ outcome?: BeliefDecisionOutcome;
41
+ metadata?: Record<string, unknown>;
42
+ }
43
+ interface BeliefDecisionExtractionDiagnostic {
44
+ runId: string;
45
+ eventId?: string;
46
+ severity: 'info' | 'warning' | 'error';
47
+ reason: string;
48
+ }
49
+ interface BeliefDecisionExtractionReport {
50
+ decisions: BeliefDecisionPoint[];
51
+ diagnostics: BeliefDecisionExtractionDiagnostic[];
52
+ }
53
+ type BeliefPolicyAction = 'accept' | 'defer' | 'verify' | 'ask' | 'retry' | 'stop';
54
+ interface BeliefPolicyDecision {
55
+ action: BeliefPolicyAction;
56
+ confidence?: number;
57
+ targetProb?: number;
58
+ qHat?: number | null;
59
+ reason?: string;
60
+ }
61
+ interface BeliefSelectivePolicy {
62
+ id: string;
63
+ decide(point: BeliefDecisionPoint): BeliefPolicyDecision;
64
+ }
65
+ interface BeliefOpeTargetPolicy {
66
+ id: string;
67
+ targetProbOf(point: BeliefDecisionPoint): number | null | undefined;
68
+ qHatOf?(point: BeliefDecisionPoint): number | null | undefined;
69
+ }
70
+ interface BeliefUtilityOptions {
71
+ successUtility?: number;
72
+ failureUtility?: number;
73
+ deferUtility?: number;
74
+ verifyCost?: number;
75
+ askCost?: number;
76
+ retryCost?: number;
77
+ stopUtility?: number;
78
+ costWeight?: number;
79
+ }
80
+ interface BeliefSelectivePolicyMetrics {
81
+ policyId: string;
82
+ n: number;
83
+ accepted: number;
84
+ rejected: number;
85
+ coverage: number;
86
+ acceptedErrorRate: number;
87
+ baselineUtility: number;
88
+ policyUtility: number;
89
+ utilityDelta: number;
90
+ utilityCi95: {
91
+ mean: number;
92
+ lower: number;
93
+ upper: number;
94
+ };
95
+ rejectedMeanReward: number | null;
96
+ recommendation: 'ship' | 'hold' | 'need_more_data';
97
+ reasons: string[];
98
+ }
99
+ interface BeliefOpeSupportDiagnostics {
100
+ supported: boolean;
101
+ n: number;
102
+ dropped: number;
103
+ effectiveSampleSize: number;
104
+ effectiveSampleRatio: number;
105
+ maxImportanceWeight: number;
106
+ reasons: string[];
107
+ }
108
+ interface BeliefOpeReport {
109
+ targetPolicyId: string;
110
+ ips: OffPolicyEstimate;
111
+ snips: OffPolicyEstimate;
112
+ dr: OffPolicyEstimate;
113
+ support: BeliefOpeSupportDiagnostics;
114
+ }
115
+ type BeliefEvaluationStatus = 'ship' | 'hold' | 'need_more_data';
116
+ type BeliefCalibrationStatus = 'supported' | 'unsupported';
117
+ type BeliefOpeStatus = 'supported' | 'unsupported' | 'not_requested';
118
+ interface BeliefPolicyEvaluationReport {
119
+ policyId: string;
120
+ n: number;
121
+ status: BeliefEvaluationStatus;
122
+ selectiveStatus: BeliefEvaluationStatus;
123
+ calibrationStatus: BeliefCalibrationStatus;
124
+ opeStatus: BeliefOpeStatus;
125
+ opeTargetPolicyId?: string;
126
+ selective: BeliefSelectivePolicyMetrics;
127
+ calibration?: CalibrationReport;
128
+ ope?: BeliefOpeReport;
129
+ diagnostics: string[];
130
+ }
131
+
132
+ type BeliefCalibrationRegion = 'all' | 'accepted' | 'rejected';
133
+ interface BeliefCalibrationOptions {
134
+ bins?: number;
135
+ minPairs?: number;
136
+ policy?: BeliefSelectivePolicy;
137
+ region?: BeliefCalibrationRegion;
138
+ }
139
+ declare function calibrateBeliefDecisions(points: BeliefDecisionPoint[], options?: BeliefCalibrationOptions): CalibrationReport | null;
140
+
141
+ interface ExtractBeliefDecisionPointsOptions {
142
+ runIds?: string[];
143
+ }
144
+ declare function extractBeliefDecisionPoints(store: TraceStore, options?: ExtractBeliefDecisionPointsOptions): Promise<BeliefDecisionExtractionReport>;
145
+
146
+ interface BeliefOpeOptions extends OffPolicyOptions {
147
+ minEffectiveSampleSize?: number;
148
+ minEffectiveSampleRatio?: number;
149
+ maxDiagnostics?: number;
150
+ }
151
+ interface BeliefOffPolicyTrajectoryReport {
152
+ targetPolicyId: string;
153
+ trajectories: OffPolicyTrajectory[];
154
+ dropped: number;
155
+ diagnostics: string[];
156
+ }
157
+ declare function embeddedBeliefOpeTargetPolicy(id?: string): BeliefOpeTargetPolicy;
158
+ declare function beliefDecisionsToOffPolicyTrajectories(points: BeliefDecisionPoint[], targetPolicy: BeliefOpeTargetPolicy, options?: Pick<BeliefOpeOptions, 'maxDiagnostics'>): BeliefOffPolicyTrajectoryReport;
159
+ declare function evaluateBeliefOffPolicy(points: BeliefDecisionPoint[], targetPolicy: BeliefOpeTargetPolicy, options?: BeliefOpeOptions): BeliefOpeReport;
160
+
161
+ interface EvaluateBeliefSelectivePolicyOptions {
162
+ utility?: BeliefUtilityOptions;
163
+ minN?: number;
164
+ minAccepted?: number;
165
+ minUtilityDelta?: number;
166
+ seed?: number;
167
+ }
168
+ declare function thresholdSelectivePolicy(options: {
169
+ id?: string;
170
+ confidenceThreshold: number;
171
+ belowThresholdAction?: Exclude<BeliefPolicyAction, 'accept'>;
172
+ }): BeliefSelectivePolicy;
173
+ declare function evaluateBeliefSelectivePolicy(points: BeliefDecisionPoint[], policy: BeliefSelectivePolicy, options?: EvaluateBeliefSelectivePolicyOptions): BeliefSelectivePolicyMetrics;
174
+
175
+ interface AnalyzeBeliefPolicyOpeOptions extends BeliefOpeOptions {
176
+ targetPolicy?: BeliefOpeTargetPolicy;
177
+ }
178
+ interface AnalyzeBeliefPolicyOptions {
179
+ points: BeliefDecisionPoint[];
180
+ policy: BeliefSelectivePolicy;
181
+ selective?: EvaluateBeliefSelectivePolicyOptions;
182
+ calibration?: BeliefCalibrationOptions;
183
+ ope?: AnalyzeBeliefPolicyOpeOptions;
184
+ requireOpe?: boolean;
185
+ }
186
+ declare function analyzeBeliefPolicy(options: AnalyzeBeliefPolicyOptions): BeliefPolicyEvaluationReport;
187
+
188
+ export { type AnalyzeBeliefPolicyOpeOptions, type AnalyzeBeliefPolicyOptions, type BeliefCalibrationOptions, type BeliefCalibrationRegion, type BeliefCalibrationStatus, type BeliefDecisionExtractionDiagnostic, type BeliefDecisionExtractionReport, type BeliefDecisionKind, type BeliefDecisionOutcome, type BeliefDecisionPoint, type BeliefEvaluationStatus, type BeliefEvidenceRef, type BeliefEvidenceSource, type BeliefOffPolicyTrajectoryReport, type BeliefOpeOptions, type BeliefOpeReport, type BeliefOpeStatus, type BeliefOpeSupportDiagnostics, type BeliefOpeTargetPolicy, type BeliefPolicyAction, type BeliefPolicyDecision, type BeliefPolicyEvaluationReport, type BeliefSelectivePolicy, type BeliefSelectivePolicyMetrics, type BeliefUtilityOptions, type EvaluateBeliefSelectivePolicyOptions, type ExtractBeliefDecisionPointsOptions, analyzeBeliefPolicy, beliefDecisionsToOffPolicyTrajectories, calibrateBeliefDecisions, embeddedBeliefOpeTargetPolicy, evaluateBeliefOffPolicy, evaluateBeliefSelectivePolicy, extractBeliefDecisionPoints, thresholdSelectivePolicy };