@tangle-network/agent-eval 0.79.0 → 0.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +1 -1
- package/dist/analyst/index.d.ts +3 -3
- package/dist/belief-state/index.d.ts +188 -0
- package/dist/belief-state/index.js +486 -0
- package/dist/belief-state/index.js.map +1 -0
- package/dist/calibration-Cpr3WaX3.d.ts +101 -0
- package/dist/campaign/index.d.ts +5 -5
- package/dist/chunk-4DIJWVUT.js +131 -0
- package/dist/chunk-4DIJWVUT.js.map +1 -0
- package/dist/chunk-NPCTHQIO.js +91 -0
- package/dist/chunk-NPCTHQIO.js.map +1 -0
- package/dist/contract/index.d.ts +123 -10
- package/dist/contract/index.js +116 -0
- package/dist/contract/index.js.map +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +1 -1
- package/dist/index.d.ts +5 -5
- package/dist/meta-eval/index.d.ts +5 -98
- package/dist/meta-eval/index.js +7 -76
- package/dist/meta-eval/index.js.map +1 -1
- package/dist/off-policy-DiwuKKg7.d.ts +132 -0
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
- package/dist/{provenance-CEAJI9rm.d.ts → provenance-jG-Gngg8.d.ts} +2 -2
- package/dist/{registry-BmEuU94S.d.ts → registry-BK0Zee01.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/rl.d.ts +6 -136
- package/dist/rl.js +6 -120
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CWyWWLBg.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +1 -1
- package/dist/{run-improvement-loop-Bgu4C59E.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +1 -1
- package/dist/{semantic-concept-judge-Du4ZVyef.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +1 -1
- package/dist/{types-QHG0KnkF.d.ts → types-4mm2msnR.d.ts} +1 -1
- package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
- package/docs/research/research-roadmap.md +1 -0
- package/package.json +7 -2
package/README.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# `@tangle-network/agent-eval`
|
|
2
2
|
|
|
3
|
-
**
|
|
3
|
+
**Decision-grade evals for agents.** One function call returns a decision packet — lift CI, judge calibration, contamination check, failure clusters, cost-quality Pareto, and a ranked action list — with the same shape whether you have a closed improvement loop or just production logs.
|
|
4
|
+
|
|
5
|
+
It is the **substrate at the bottom of the stack**: [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime) runs agents and captures every run as a trace, then delegates scoring and the ship gate here. The dependency arrow only points up — agent-eval never imports the runtime.
|
|
4
6
|
|
|
5
7
|
[npm package](https://www.npmjs.com/package/@tangle-network/agent-eval)
|
|
6
8
|
[PyPI package](https://pypi.org/project/agent-eval-rpc/)
|
|
@@ -207,28 +209,55 @@ Each example: `README.md` + a single `index.ts` runnable via `pnpm tsx`. Prints
|
|
|
207
209
|
|
|
208
210
|
| Subpath | What it gives you |
|
|
209
211
|
|---|---|
|
|
210
|
-
|
|
|
211
|
-
|
|
|
212
|
-
|
|
|
213
|
-
|
|
|
214
|
-
|
|
|
215
|
-
|
|
|
216
|
-
|
|
|
217
|
-
|
|
|
218
|
-
|
|
|
219
|
-
|
|
|
220
|
-
|
|
|
221
|
-
|
|
|
222
|
-
|
|
|
223
|
-
|
|
|
224
|
-
|
|
225
|
-
|
|
212
|
+
| `…/contract` | **The headline, frozen surface — new code starts here.** `selfImprove`, `analyzeRuns`, `runEval`, `runCampaign`, `runImprovementLoop`, `diffRuns`; intake adapters (`fromFeedbackTable`, `fromOtelSpans`); drivers (`gepaDriver`, `evolutionaryDriver`); gates (`defaultProductionGate`, `heldOutGate`, `paretoSignificanceGate`, `composeGate`); the deployment-outcome store; storage; and the five core types `Scenario` / `Dispatch` / `JudgeConfig` / `Mutator` / `Gate`. |
|
|
213
|
+
| `…/hosted` | `createHostedClient` / `hostedClientFromEnv` + the wire types to ship eval-run events + trace spans to a hosted orchestrator (ours or your own implementation of the spec) |
|
|
214
|
+
| `…/adapters/otel` | `createOtelBridge` — forwards OpenTelemetry-shape spans into the hosted-tier ingest, no `@opentelemetry/*` dependency |
|
|
215
|
+
| `…/adapters/langchain` | Wrap any LangChain `Runnable` as a `Dispatch` (or `JudgeConfig`), no `@langchain/core` peer dep |
|
|
216
|
+
| `…/adapters/http` | `httpDispatch` + `runDispatchServer` — run a campaign's worker on another machine (multi-region, driver-as-a-service) |
|
|
217
|
+
| `…/campaign` | **The measurement + improvement engine** (`@experimental`): `runProfileMatrix`, `compareDrivers`, every driver (`gepaDriver`, `haloDriver`, `skillOptDriver`, `aceDriver`, `memoryCurationDriver`, …), the gates, storage backends, and loop provenance. `/contract` re-exports the stable subset. |
|
|
218
|
+
| `…/rl` | RL bridge from eval artifacts to training signal: verifiable rewards, preferences, OPE, PRM, tournaments, contamination, compute curves, plus the durable corpus + `buildRlDataset` / datasheet bundle |
|
|
219
|
+
| `…/reporting` | Release-decision statistics: `pairedBootstrap`, `benjaminiHochberg`, anytime-valid sequential e-values, `evaluateReleaseConfidence`, and the report renderers |
|
|
220
|
+
| `…/analyst` | The trace-analyst surface: `AnalystRegistry` + `buildDefaultAnalystRegistry` (run the failure-clustering panel), `FindingsStore`, and the LLM chat transports |
|
|
221
|
+
| `…/traces` | Trace stores + emitters, OTLP-JSONL deterministic replay, `analyzeTraces`, and the `traceAnalystOnRunComplete` hook |
|
|
222
|
+
| `…/control` | Agent control loop: `runAgentControlLoop` (observe → validate → decide → act), action policy, propose/review |
|
|
223
|
+
| `…/matrix` | `runAgentMatrix` — an N-axis cartesian over caller-supplied substrate values, per-axis pass/score/cost/duration |
|
|
224
|
+
| `…/multishot` | N-shot persona × shot matrix runner (`runMultishot` / `runMultishotMatrix`) |
|
|
225
|
+
| `…/wire` | The cross-language HTTP/RPC server + Zod schemas (the source-of-truth protocol the Python client speaks) + the built-in rubric registry |
|
|
226
|
+
| `…/benchmarks` | `BenchmarkAdapter` contract + `deterministicSplit` + the bundled `routing` reference benchmark |
|
|
227
|
+
|
|
228
|
+
**Specialized surfaces** (subpath-only): `…/prm` (process-reward grading + best-of-N), `…/meta-eval` (judge calibration + the deployment-outcome store), `…/pipelines` (trace-diagnostic views: budget breach, failure cluster, stuck loop, …), `…/governance` (EU AI Act / NIST AI RMF / SOC2 reports), `…/knowledge` (knowledge-readiness gating before a run), `…/builder-eval` (code-generator three-layer eval), `…/storyboard` (trace → watchable replay), `…/authenticity` (anti-Goodhart "real or convincing BS" scorer over produced files), `…/workflow` (workflow-trace eval + partner export), `…/telemetry` (Workers-safe telemetry client).
|
|
229
|
+
|
|
230
|
+
The root export remains available for backward compatibility; new code should prefer the focused subpaths above — `/contract` first.
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Composition with the stack
|
|
235
|
+
|
|
236
|
+
agent-eval is the bottom of the layering: consumers depend on it, it depends on none of them.
|
|
237
|
+
|
|
238
|
+
```
|
|
239
|
+
agent-runtime Runs agents (chat turns, one-shot tasks, multi-attempt loops), captures every
|
|
240
|
+
run as a trace, and calls optimizePrompt / runImprovementLoop. Produces the
|
|
241
|
+
RunRecords + traces agent-eval scores. Depends on agent-eval.
|
|
242
|
+
|
|
243
|
+
agent-eval selfImprove, analyzeRuns, runCampaign + drivers (gepaDriver, …), the gates
|
|
244
|
+
(this repo) (heldOutGate, defaultProductionGate, paretoSignificanceGate), the InsightReport
|
|
245
|
+
decision packet, the RL bridge, the wire protocol. Depends on neither consumer.
|
|
246
|
+
|
|
247
|
+
agent-knowledge proposeKnowledgeWrites / applyKnowledgeWriteBlocks. agent-eval's analyst findings
|
|
248
|
+
feed it; the knowledge gate consumes them. Depends on agent-eval.
|
|
249
|
+
|
|
250
|
+
sandbox AgentProfile, Sandbox.create, streamPrompt. The execution surface the runtime's
|
|
251
|
+
loops run on; agent-eval scores what comes back.
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
The rule: **agent-eval has zero upward dependencies on a consumer.** A concept that makes sense *without* a running agent loop — a verdict, a run record, a scenario, a judge score — is substrate and lives here; a runtime-shaped one (a sandbox profile, a validation context with an abort signal) lives in agent-runtime. When in doubt, lean substrate.
|
|
226
255
|
|
|
227
256
|
---
|
|
228
257
|
|
|
229
258
|
## Concepts + design
|
|
230
259
|
|
|
231
|
-
- [`docs/concepts.md`](./docs/concepts.md) —
|
|
260
|
+
- [`docs/concepts.md`](./docs/concepts.md) — the three top-level functions, the layering rule, and the wire-protocol contract (the five core contract types are documented in the `/contract` barrel itself)
|
|
232
261
|
- [`docs/insight-report.md`](./docs/insight-report.md) — annotated walkthrough of every section of the decision packet
|
|
233
262
|
- [`docs/customer-journeys.md`](./docs/customer-journeys.md) — three end-to-end journeys with code + expected output
|
|
234
263
|
- [`docs/adapters-observability.md`](./docs/adapters-observability.md) — composing agent-eval with LangSmith, Langfuse, Phoenix, OpenLLMetry, TraceAI
|
|
@@ -287,7 +316,9 @@ pnpm test
|
|
|
287
316
|
|
|
288
317
|
## Stability + versioning
|
|
289
318
|
|
|
290
|
-
|
|
319
|
+
The `/contract` surface is the **stability contract**: its barrel freezes the API — a `0.x` minor only *adds*; nothing there changes shape or disappears. Depend on `/contract` (and the documented subpaths) rather than the root barrel.
|
|
320
|
+
|
|
321
|
+
In the deeper subpaths, `@stable` / `@experimental` JSDoc markers (visible in IDE hover + `.d.ts`) call out what may still move — most granularly in `/rl` (tagged per export) and `/campaign` (whole barrel `@experimental`, since `/contract` re-exports only its settled subset).
|
|
291
322
|
|
|
292
323
|
| Tag | Meaning |
|
|
293
324
|
|---|---|
|
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-QHG0KnkF.js';
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-4mm2msnR.js';
|
|
2
2
|
import '../run-record-sItO5ftF.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
|
package/dist/adapters/langchain.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-QHG0KnkF.js';
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-4mm2msnR.js';
|
|
2
2
|
import '../run-record-sItO5ftF.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
package/dist/adapters/otel.d.ts
CHANGED
package/dist/analyst/index.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
2
2
|
import { M as MultiLayerVerifier, V as VerifyOptions, S as Severity } from '../multi-layer-verifier-DlWCXuxL.js';
|
|
3
3
|
import { c as RunCritic, a as RunTrace } from '../run-critic-BAIjX99r.js';
|
|
4
|
-
import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-Du4ZVyef.js';
|
|
5
|
-
export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-Du4ZVyef.js';
|
|
4
|
+
import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-qXEUV2w7.js';
|
|
5
|
+
export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-qXEUV2w7.js';
|
|
6
6
|
import { A as AnalyzeTracesOptions } from '../analyst-t7zZS3TV.js';
|
|
7
7
|
import { T as TraceAnalysisStore } from '../store-GmBE2pZZ.js';
|
|
8
8
|
import { b as JudgeFn, a as JudgeInput } from '../types-Croy5h7V.js';
|
|
@@ -10,7 +10,7 @@ import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../type
|
|
|
10
10
|
export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-DRvV0zRo.js';
|
|
11
11
|
import { TCloud } from '@tangle-network/tcloud';
|
|
12
12
|
export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DqV2t1Xk.js';
|
|
13
|
-
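export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-BmEuU94S.js';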
|
|
13
|
+
export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-BK0Zee01.js';
|
|
14
14
|
import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
|
|
15
15
|
import '../schema-m0gsnbt3.js';
|
|
16
16
|
import '../store-CKUAgsJz.js';
|
|
package/dist/belief-state/index.d.ts
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import { c as CalibrationReport } from '../calibration-Cpr3WaX3.js';
|
|
2
|
+
import { O as OffPolicyEstimate, a as OffPolicyOptions, b as OffPolicyTrajectory } from '../off-policy-DiwuKKg7.js';
|
|
3
|
+
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
4
|
+
import '../schema-m0gsnbt3.js';
|
|
5
|
+
import '../outcome-store-rnXLEqSn.js';
|
|
6
|
+
|
|
7
|
+
type BeliefDecisionKind = 'continue' | 'verify' | 'ask' | 'retry' | 'stop' | 'memory-write' | 'memory-read' | 'tool-select' | 'skill-select' | 'workflow-select' | 'surface-promote';
|
|
8
|
+
type BeliefEvidenceSource = 'run' | 'span' | 'event' | 'finding' | 'memory' | 'knowledge' | 'policy';
|
|
9
|
+
interface BeliefEvidenceRef {
|
|
10
|
+
source: BeliefEvidenceSource;
|
|
11
|
+
id: string;
|
|
12
|
+
runId?: string;
|
|
13
|
+
spanId?: string;
|
|
14
|
+
eventId?: string;
|
|
15
|
+
detail?: string;
|
|
16
|
+
metadata?: Record<string, unknown>;
|
|
17
|
+
}
|
|
18
|
+
interface BeliefDecisionOutcome {
|
|
19
|
+
success?: boolean;
|
|
20
|
+
score?: number;
|
|
21
|
+
reward?: number;
|
|
22
|
+
costUsd?: number;
|
|
23
|
+
observedAt?: string;
|
|
24
|
+
metadata?: Record<string, unknown>;
|
|
25
|
+
}
|
|
26
|
+
interface BeliefDecisionPoint {
|
|
27
|
+
id: string;
|
|
28
|
+
runId: string;
|
|
29
|
+
scenarioId?: string;
|
|
30
|
+
stepIndex: number;
|
|
31
|
+
kind: BeliefDecisionKind;
|
|
32
|
+
chosenAction: string;
|
|
33
|
+
candidateActions?: string[];
|
|
34
|
+
confidence?: number;
|
|
35
|
+
behaviorProb?: number;
|
|
36
|
+
targetProb?: number;
|
|
37
|
+
qHat?: number | null;
|
|
38
|
+
costUsd?: number;
|
|
39
|
+
evidence: BeliefEvidenceRef[];
|
|
40
|
+
outcome?: BeliefDecisionOutcome;
|
|
41
|
+
metadata?: Record<string, unknown>;
|
|
42
|
+
}
|
|
43
|
+
interface BeliefDecisionExtractionDiagnostic {
|
|
44
|
+
runId: string;
|
|
45
|
+
eventId?: string;
|
|
46
|
+
severity: 'info' | 'warning' | 'error';
|
|
47
|
+
reason: string;
|
|
48
|
+
}
|
|
49
|
+
interface BeliefDecisionExtractionReport {
|
|
50
|
+
decisions: BeliefDecisionPoint[];
|
|
51
|
+
diagnostics: BeliefDecisionExtractionDiagnostic[];
|
|
52
|
+
}
|
|
53
|
+
type BeliefPolicyAction = 'accept' | 'defer' | 'verify' | 'ask' | 'retry' | 'stop';
|
|
54
|
+
interface BeliefPolicyDecision {
|
|
55
|
+
action: BeliefPolicyAction;
|
|
56
|
+
confidence?: number;
|
|
57
|
+
targetProb?: number;
|
|
58
|
+
qHat?: number | null;
|
|
59
|
+
reason?: string;
|
|
60
|
+
}
|
|
61
|
+
interface BeliefSelectivePolicy {
|
|
62
|
+
id: string;
|
|
63
|
+
decide(point: BeliefDecisionPoint): BeliefPolicyDecision;
|
|
64
|
+
}
|
|
65
|
+
interface BeliefOpeTargetPolicy {
|
|
66
|
+
id: string;
|
|
67
|
+
targetProbOf(point: BeliefDecisionPoint): number | null | undefined;
|
|
68
|
+
qHatOf?(point: BeliefDecisionPoint): number | null | undefined;
|
|
69
|
+
}
|
|
70
|
+
interface BeliefUtilityOptions {
|
|
71
|
+
successUtility?: number;
|
|
72
|
+
failureUtility?: number;
|
|
73
|
+
deferUtility?: number;
|
|
74
|
+
verifyCost?: number;
|
|
75
|
+
askCost?: number;
|
|
76
|
+
retryCost?: number;
|
|
77
|
+
stopUtility?: number;
|
|
78
|
+
costWeight?: number;
|
|
79
|
+
}
|
|
80
|
+
interface BeliefSelectivePolicyMetrics {
|
|
81
|
+
policyId: string;
|
|
82
|
+
n: number;
|
|
83
|
+
accepted: number;
|
|
84
|
+
rejected: number;
|
|
85
|
+
coverage: number;
|
|
86
|
+
acceptedErrorRate: number;
|
|
87
|
+
baselineUtility: number;
|
|
88
|
+
policyUtility: number;
|
|
89
|
+
utilityDelta: number;
|
|
90
|
+
utilityCi95: {
|
|
91
|
+
mean: number;
|
|
92
|
+
lower: number;
|
|
93
|
+
upper: number;
|
|
94
|
+
};
|
|
95
|
+
rejectedMeanReward: number | null;
|
|
96
|
+
recommendation: 'ship' | 'hold' | 'need_more_data';
|
|
97
|
+
reasons: string[];
|
|
98
|
+
}
|
|
99
|
+
interface BeliefOpeSupportDiagnostics {
|
|
100
|
+
supported: boolean;
|
|
101
|
+
n: number;
|
|
102
|
+
dropped: number;
|
|
103
|
+
effectiveSampleSize: number;
|
|
104
|
+
effectiveSampleRatio: number;
|
|
105
|
+
maxImportanceWeight: number;
|
|
106
|
+
reasons: string[];
|
|
107
|
+
}
|
|
108
|
+
interface BeliefOpeReport {
|
|
109
|
+
targetPolicyId: string;
|
|
110
|
+
ips: OffPolicyEstimate;
|
|
111
|
+
snips: OffPolicyEstimate;
|
|
112
|
+
dr: OffPolicyEstimate;
|
|
113
|
+
support: BeliefOpeSupportDiagnostics;
|
|
114
|
+
}
|
|
115
|
+
type BeliefEvaluationStatus = 'ship' | 'hold' | 'need_more_data';
|
|
116
|
+
type BeliefCalibrationStatus = 'supported' | 'unsupported';
|
|
117
|
+
type BeliefOpeStatus = 'supported' | 'unsupported' | 'not_requested';
|
|
118
|
+
interface BeliefPolicyEvaluationReport {
|
|
119
|
+
policyId: string;
|
|
120
|
+
n: number;
|
|
121
|
+
status: BeliefEvaluationStatus;
|
|
122
|
+
selectiveStatus: BeliefEvaluationStatus;
|
|
123
|
+
calibrationStatus: BeliefCalibrationStatus;
|
|
124
|
+
opeStatus: BeliefOpeStatus;
|
|
125
|
+
opeTargetPolicyId?: string;
|
|
126
|
+
selective: BeliefSelectivePolicyMetrics;
|
|
127
|
+
calibration?: CalibrationReport;
|
|
128
|
+
ope?: BeliefOpeReport;
|
|
129
|
+
diagnostics: string[];
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
type BeliefCalibrationRegion = 'all' | 'accepted' | 'rejected';
|
|
133
|
+
interface BeliefCalibrationOptions {
|
|
134
|
+
bins?: number;
|
|
135
|
+
minPairs?: number;
|
|
136
|
+
policy?: BeliefSelectivePolicy;
|
|
137
|
+
region?: BeliefCalibrationRegion;
|
|
138
|
+
}
|
|
139
|
+
declare function calibrateBeliefDecisions(points: BeliefDecisionPoint[], options?: BeliefCalibrationOptions): CalibrationReport | null;
|
|
140
|
+
|
|
141
|
+
interface ExtractBeliefDecisionPointsOptions {
|
|
142
|
+
runIds?: string[];
|
|
143
|
+
}
|
|
144
|
+
declare function extractBeliefDecisionPoints(store: TraceStore, options?: ExtractBeliefDecisionPointsOptions): Promise<BeliefDecisionExtractionReport>;
|
|
145
|
+
|
|
146
|
+
interface BeliefOpeOptions extends OffPolicyOptions {
|
|
147
|
+
minEffectiveSampleSize?: number;
|
|
148
|
+
minEffectiveSampleRatio?: number;
|
|
149
|
+
maxDiagnostics?: number;
|
|
150
|
+
}
|
|
151
|
+
interface BeliefOffPolicyTrajectoryReport {
|
|
152
|
+
targetPolicyId: string;
|
|
153
|
+
trajectories: OffPolicyTrajectory[];
|
|
154
|
+
dropped: number;
|
|
155
|
+
diagnostics: string[];
|
|
156
|
+
}
|
|
157
|
+
declare function embeddedBeliefOpeTargetPolicy(id?: string): BeliefOpeTargetPolicy;
|
|
158
|
+
declare function beliefDecisionsToOffPolicyTrajectories(points: BeliefDecisionPoint[], targetPolicy: BeliefOpeTargetPolicy, options?: Pick<BeliefOpeOptions, 'maxDiagnostics'>): BeliefOffPolicyTrajectoryReport;
|
|
159
|
+
declare function evaluateBeliefOffPolicy(points: BeliefDecisionPoint[], targetPolicy: BeliefOpeTargetPolicy, options?: BeliefOpeOptions): BeliefOpeReport;
|
|
160
|
+
|
|
161
|
+
interface EvaluateBeliefSelectivePolicyOptions {
|
|
162
|
+
utility?: BeliefUtilityOptions;
|
|
163
|
+
minN?: number;
|
|
164
|
+
minAccepted?: number;
|
|
165
|
+
minUtilityDelta?: number;
|
|
166
|
+
seed?: number;
|
|
167
|
+
}
|
|
168
|
+
declare function thresholdSelectivePolicy(options: {
|
|
169
|
+
id?: string;
|
|
170
|
+
confidenceThreshold: number;
|
|
171
|
+
belowThresholdAction?: Exclude<BeliefPolicyAction, 'accept'>;
|
|
172
|
+
}): BeliefSelectivePolicy;
|
|
173
|
+
declare function evaluateBeliefSelectivePolicy(points: BeliefDecisionPoint[], policy: BeliefSelectivePolicy, options?: EvaluateBeliefSelectivePolicyOptions): BeliefSelectivePolicyMetrics;
|
|
174
|
+
|
|
175
|
+
interface AnalyzeBeliefPolicyOpeOptions extends BeliefOpeOptions {
|
|
176
|
+
targetPolicy?: BeliefOpeTargetPolicy;
|
|
177
|
+
}
|
|
178
|
+
interface AnalyzeBeliefPolicyOptions {
|
|
179
|
+
points: BeliefDecisionPoint[];
|
|
180
|
+
policy: BeliefSelectivePolicy;
|
|
181
|
+
selective?: EvaluateBeliefSelectivePolicyOptions;
|
|
182
|
+
calibration?: BeliefCalibrationOptions;
|
|
183
|
+
ope?: AnalyzeBeliefPolicyOpeOptions;
|
|
184
|
+
requireOpe?: boolean;
|
|
185
|
+
}
|
|
186
|
+
declare function analyzeBeliefPolicy(options: AnalyzeBeliefPolicyOptions): BeliefPolicyEvaluationReport;
|
|
187
|
+
|
|
188
|
+
export { type AnalyzeBeliefPolicyOpeOptions, type AnalyzeBeliefPolicyOptions, type BeliefCalibrationOptions, type BeliefCalibrationRegion, type BeliefCalibrationStatus, type BeliefDecisionExtractionDiagnostic, type BeliefDecisionExtractionReport, type BeliefDecisionKind, type BeliefDecisionOutcome, type BeliefDecisionPoint, type BeliefEvaluationStatus, type BeliefEvidenceRef, type BeliefEvidenceSource, type BeliefOffPolicyTrajectoryReport, type BeliefOpeOptions, type BeliefOpeReport, type BeliefOpeStatus, type BeliefOpeSupportDiagnostics, type BeliefOpeTargetPolicy, type BeliefPolicyAction, type BeliefPolicyDecision, type BeliefPolicyEvaluationReport, type BeliefSelectivePolicy, type BeliefSelectivePolicyMetrics, type BeliefUtilityOptions, type EvaluateBeliefSelectivePolicyOptions, type ExtractBeliefDecisionPointsOptions, analyzeBeliefPolicy, beliefDecisionsToOffPolicyTrajectories, calibrateBeliefDecisions, embeddedBeliefOpeTargetPolicy, evaluateBeliefOffPolicy, evaluateBeliefSelectivePolicy, extractBeliefDecisionPoints, thresholdSelectivePolicy };
|