@tangle-network/agent-eval 0.48.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +7 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/{traceai.d.ts → otel.d.ts} +29 -29
  5. package/dist/adapters/{traceai.js → otel.js} +9 -5
  6. package/dist/adapters/otel.js.map +1 -0
  7. package/dist/campaign/index.d.ts +3 -3
  8. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  9. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  10. package/dist/chunk-EGIPWXHL.js.map +1 -0
  11. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  12. package/dist/chunk-FQK2CCIM.js.map +1 -0
  13. package/dist/chunk-MAZ26DC7.js +99 -0
  14. package/dist/chunk-MAZ26DC7.js.map +1 -0
  15. package/dist/chunk-SHTXZ4O2.js +113 -0
  16. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  17. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  18. package/dist/contract/index.d.ts +206 -9
  19. package/dist/contract/index.js +751 -3
  20. package/dist/contract/index.js.map +1 -1
  21. package/dist/governance/index.d.ts +1 -1
  22. package/dist/hosted/index.d.ts +8 -192
  23. package/dist/hosted/index.js +1 -1
  24. package/dist/index-BRxz6qov.d.ts +409 -0
  25. package/dist/index.d.ts +18 -462
  26. package/dist/index.js +14 -106
  27. package/dist/index.js.map +1 -1
  28. package/dist/meta-eval/index.d.ts +3 -3
  29. package/dist/openapi.json +1 -1
  30. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  31. package/dist/registry-8KAs18kY.d.ts +457 -0
  32. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  33. package/dist/reporting.d.ts +6 -4
  34. package/dist/reporting.js +6 -4
  35. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  36. package/dist/rl.d.ts +9 -8
  37. package/dist/rl.js +3 -2
  38. package/dist/rl.js.map +1 -1
  39. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  40. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  41. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  42. package/dist/store-CJbzDxZ2.d.ts +220 -0
  43. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  44. package/dist/traces.d.ts +3 -220
  45. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  46. package/dist/types-DhqpAi_z.d.ts +296 -0
  47. package/docs/adapters-observability.md +3 -3
  48. package/package.json +5 -5
  49. package/dist/adapters/traceai.js.map +0 -1
  50. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  51. package/dist/chunk-OYI6RZJK.js.map +0 -1
  52. /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  53. /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
  54. /package/docs/design/{substrate-gaps-2026-05-27.md → substrate-gaps.md} +0 -0
@@ -0,0 +1,409 @@
1
+ import { M as MutableSurface, m as GateDecision } from './types-Dbj5gu8n.js';
2
+ import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-B7gNRX-r.js';
3
+ import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
4
+
5
+ /**
6
+ * # InsightReport — the rigorous decision packet for any set of agent runs.
7
+ *
8
+ * Returned by `analyzeRuns()` and embedded in `SelfImproveResult.insight` +
9
+ * the hosted-tier `EvalRunEvent.insightReport`. One shape across two surfaces:
10
+ *
11
+ * - **Customer who has a closed loop** (`selfImprove`): the report ships
12
+ * with the loop output. Their dashboard renders ship/hold + lift CI +
13
+ * calibration + cluster + Pareto in one packet.
14
+ * - **Customer who has observed runs but no loop** (`analyzeRuns` directly):
15
+ * same packet from a `RunRecord[]` they already have — production traces,
16
+ * approve/reject corpus, CSV gold set.
17
+ *
18
+ * The distributional summary, `release`, and `recommendations` are always
19
+ * present; the optional sections are populated when the input data supports them:
20
+ *
21
+ * - `lift` requires both baseline and candidate splits to be present.
22
+ * - `interRater` requires multi-rater feedback (≥2 raters per run).
23
+ * - `judges` populates per-judge stats only when the run records carry
24
+ * `outcome.judgeScores`.
25
+ * - `failureClusters` requires the optional `analystRegistry` to be wired.
26
+ * - `contamination` requires canary scenarios to be passed in.
27
+ * - `outcomeCorrelation` requires a downstream outcome signal.
28
+ * - `sequential` requires the run set to be ordered (treats them as a
29
+ * stream and emits an anytime-valid interim decision). NOTE(review): `InsightReport` declares no `sequential` field — confirm where this is surfaced.
30
+ *
31
+ * Consumers read the `recommendations` array first — that's the
32
+ * actionable layer, ranked by priority. The numeric sections back it up.
33
+ */
34
+
35
+ interface InsightReport {
36
+ /** Number of runs analyzed. */
37
+ n: number;
38
+ /** Composite-score distribution across all runs. Always present. */
39
+ composite: ScalarDistribution;
40
+ /** Per-dimension distributions for every dimension that appeared in any
41
+ * run's judge scores. Empty when no judge scores were recorded. */
42
+ perDimension: Record<string, ScalarDistribution>;
43
+ /** Cost/quality distribution and Pareto frontier. */
44
+ costQuality: {
45
+ cost: ScalarDistribution;
46
+ pareto: ParetoFigureSpec;
47
+ };
48
+ /** Per-judge calibration + bias detection. Populated for every judge name
49
+ * that appears in `outcome.judgeScores`. Bias fields require either a
50
+ * gold reference or multi-rater data. */
51
+ judges: Record<string, JudgeInsight>;
52
+ /** Inter-rater agreement when multiple judges scored the same runs.
53
+ * Includes pairwise kappa and the specific run ids where raters
54
+ * disagree — the cases worth a human meeting. */
55
+ interRater?: InterRaterInsight;
56
+ /** Pairwise lift (baseline → candidate) with bootstrap CI. Present when
57
+ * `RunRecord.splitTag` includes both `holdout` and search/dev splits,
58
+ * or when caller passes an explicit baseline/candidate split. */
59
+ lift?: LiftInsight;
60
+ /** Failure clusters with exemplars. Populated when an AnalystRegistry
61
+ * is wired in `analyzeRuns({ analyst })`. */
62
+ failureClusters?: FailureClusterInsight;
63
+ /** Canary leak count + holdout audit status. Populated when canary
64
+ * scenarios are passed in. */
65
+ contamination?: ContaminationInsight;
66
+ /** Correlation between judge composite and a downstream outcome the
67
+ * caller supplies (engagement, revenue, downstream pass rate, etc.).
68
+ * When present, the optional reward model is the model that maps
69
+ * judge scores → predicted outcome. */
70
+ outcomeCorrelation?: OutcomeCorrelationInsight;
71
+ /** Aggregate release-readiness summary. A consumer needing the full
72
+ * substrate `ReleaseConfidenceScorecard` (SLO-axis evaluation,
73
+ * ActionableSideInfo bag) calls `evaluateReleaseConfidence()` directly;
74
+ * this summary captures the analyzeRuns-derived axes. */
75
+ release: ReleaseSummary;
76
+ /** Top-N actionable recommendations, ranked by priority. The packet's
77
+ * human-readable layer; the numeric sections are the evidence. */
78
+ recommendations: Recommendation[];
79
+ }
80
+ /** Distributional summary of a scalar-valued metric. */
81
+ interface ScalarDistribution {
82
+ /** Sample count after dropping non-finite values. */
83
+ n: number;
84
+ mean: number;
85
+ p50: number;
86
+ p95: number;
87
+ stddev: number;
88
+ min: number;
89
+ max: number;
90
+ /** Histogram bins using `agent-eval`'s `gainHistogram` primitive. */
91
+ histogram: GainDistributionBin[];
92
+ }
93
+ interface JudgeInsight {
94
+ /** Number of times this judge scored a run. */
95
+ n: number;
96
+ /** Mean composite over this judge's runs. */
97
+ meanScore: number;
98
+ /** Calibration against a gold reference, when provided. Cohen's κ for
99
+ * binary thresholding + continuous agreement metrics. */
100
+ calibration?: ContinuousAgreement;
101
+ /** Positional bias — when the judge sees options in different orders,
102
+ * do its preferences track the content or the position? */
103
+ positionalBias?: number;
104
+ /** Self-preference — when the judge sees its own model's output vs a
105
+ * competitor, does it over-pick its own? */
106
+ selfPreference?: number;
107
+ /** Verbosity bias — does the judge reward longer outputs regardless of
108
+ * quality? */
109
+ verbosityBias?: number;
110
+ }
111
+ interface InterRaterInsight {
112
+ /** Number of raters whose scores were aggregated. */
113
+ raters: number;
114
+ /** Number of runs every rater scored. */
115
+ jointlyRated: number;
116
+ /** Cohen's κ averaged across rater pairs. */
117
+ kappa: number;
118
+ /** Pairwise κ per rater pair (key = `"raterA::raterB"`). */
119
+ perPair: Record<string, number>;
120
+ /** Run ids where raters disagree the most — the high-value triage list. */
121
+ disagreementCases: Array<{
122
+ runId: string;
123
+ ratings: Array<{
124
+ rater: string;
125
+ score: number;
126
+ }>;
127
+ range: number;
128
+ }>;
129
+ }
130
+ interface LiftInsight {
131
+ baselineMean: number;
132
+ candidateMean: number;
133
+ /** Candidate − baseline. */
134
+ delta: number;
135
+ /** Lower / upper bound of bootstrap CI on the delta. */
136
+ ci95: [number, number];
137
+ /** Paired-t-test p-value. */
138
+ pValue: number;
139
+ /** Number of paired observations. */
140
+ n: number;
141
+ /** Cohen's d for the delta. */
142
+ cohensD: number;
143
+ /** Minimum detectable effect at current n, 80% power. */
144
+ mde: number;
145
+ /** Sample size needed to detect the observed delta at 80% power. */
146
+ requiredN: number;
147
+ }
148
+ interface FailureClusterInsight {
149
+ /** All clusters identified by the registry, ranked by share descending. */
150
+ clusters: Array<{
151
+ id: string;
152
+ name: string;
153
+ /** Fraction of failed runs in this cluster, 0..1. */
154
+ share: number;
155
+ /** Exemplar `runId`s (≤ 5) the consumer can drill into. */
156
+ exemplars: string[];
157
+ /** Short LLM-generated suggested fix when the registry supports it. */
158
+ suggestedFix?: string;
159
+ }>;
160
+ totalFailures: number;
161
+ }
162
+ interface ContaminationInsight {
163
+ /** Canary phrases that leaked into outputs. */
164
+ leaks: number;
165
+ /** Holdout audit verdict — did any holdout-tagged run end up in the
166
+ * search/dev pool, or vice versa? */
167
+ holdoutAuditPassed: boolean;
168
+ details?: Array<{
169
+ runId: string;
170
+ canary: string;
171
+ matched: string;
172
+ }>;
173
+ }
174
+ interface OutcomeCorrelationInsight {
175
+ /** What outcome the consumer is correlating against (e.g.
176
+ * `'engagement_rate'`, `'approval_rate'`, `'downstream_pass'`). */
177
+ metric: string;
178
+ /** Number of (run, outcome) pairs used. */
179
+ n: number;
180
+ /** Pearson correlation between composite score and outcome. */
181
+ pearson: number;
182
+ /** Spearman rank correlation — robust to monotonic non-linearity. */
183
+ spearman: number;
184
+ /** When present, the simple linear reward model fit to the data. */
185
+ rewardModel?: {
186
+ intercept: number;
187
+ slope: number;
188
+ r2: number;
189
+ };
190
+ }
191
+ interface ReleaseSummary {
192
+ /** Overall verdict across axes — fail if any axis fails, else warn if any
193
+ * warns, else pass. */
194
+ status: 'pass' | 'warn' | 'fail';
195
+ axes: Array<{
196
+ name: 'quality-lift' | 'contamination' | 'composite-distribution';
197
+ status: 'pass' | 'warn' | 'fail';
198
+ detail: string;
199
+ }>;
200
+ /** Free-form issues surfaced beyond the standard axes. Empty by default;
201
+ * consumers can post-process to populate. */
202
+ issues: string[];
203
+ }
204
+ interface Recommendation {
205
+ priority: 'critical' | 'high' | 'medium' | 'low';
206
+ kind: 'ship' | 'hold' | 'investigate' | 'fix' | 'recalibrate' | 'expand-corpus';
207
+ title: string;
208
+ detail: string;
209
+ /** Optional pointer back into the report for the evidence. */
210
+ evidencePath?: string;
211
+ }
212
+
213
+ /**
214
+ * # Hosted-tier wire format — the schema that EVERY orchestrator (ours,
215
+ * a partner's self-hosted one, a future open implementation) must accept.
216
+ *
217
+ * **Stability:** every type in this file is committed under semver. New
218
+ * minors only ADD optional fields. Breaking changes mean a major bump
219
+ * (`HostedWireVersion` literal increment).
220
+ *
221
+ * The wire format is two event streams in one transport:
222
+ *
223
+ * 1. **Eval-run events** (`POST /v1/ingest/eval-runs`). Posted when a
224
+ * campaign / improvement-loop completes (or per-generation if
225
+ * streaming). Carries the structured result + per-cell scores +
226
+ * surface diffs the orchestrator stores for the dashboard.
227
+ *
228
+ * 2. **Trace spans** (`POST /v1/ingest/traces`). Standard OTLP-shaped
229
+ * spans with a few additional attributes so the orchestrator can
230
+ * pivot from eval-run → underlying execution. Compatible with any
231
+ * OTel collector.
232
+ *
233
+ * Both endpoints are authenticated with a bearer token + a tenant id
234
+ * header. Tenants isolate everything downstream of ingest; no tenant
235
+ * ever sees another tenant's data.
236
+ */
237
+
238
+ declare const HOSTED_WIRE_VERSION: "2026-05-26.v1";
239
+ type HostedWireVersion = typeof HOSTED_WIRE_VERSION;
240
+ /** Every ingest request carries these. */
241
+ interface HostedIngestHeaders {
242
+ /** Bearer token. The orchestrator validates against the tenant key. */
243
+ authorization: `Bearer ${string}`;
244
+ /** Stable tenant id (the orchestrator-side primary key for the tenant). */
245
+ 'x-tangle-tenant-id': string;
246
+ /** Wire-version pin so the server can reject incompatible payloads. */
247
+ 'x-tangle-wire-version': HostedWireVersion;
248
+ /** Optional idempotency key for retry-safe ingest. */
249
+ 'idempotency-key'?: string;
250
+ }
251
+ /** Lifecycle stages of an eval-run as the substrate reports them. */
252
+ type EvalRunStatus = 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored';
253
+ interface EvalRunCellScore {
254
+ /** Stable scenario id from the consumer's scenario set. */
255
+ scenarioId: string;
256
+ /** Repetition index when reps > 1; 0 for the default. */
257
+ rep: number;
258
+ /** Composite score across all judges + dimensions for this cell. */
259
+ compositeMean: number;
260
+ /** Per-judge → per-dimension scores. The declared type admits no `null` values, so a judge that did not run presumably has no entry — TODO confirm. */
261
+ dimensions: Record<string, Record<string, number>>;
262
+ /** Per-cell error message if the dispatch threw. Left unset on success (typed as an optional `string`, not `null`). */
263
+ errorMessage?: string;
264
+ }
265
+ interface EvalRunGenerationSnapshot {
266
+ /** Generation index. 0 is baseline. */
267
+ index: number;
268
+ /** Candidate surface fingerprint (stable hash) — pivot key into the
269
+ * trace stream to fetch the underlying execution. */
270
+ surfaceHash: string;
271
+ /** The candidate surface itself. May be omitted to avoid PII when the
272
+ * consumer prefers not to ship verbatim prompts. */
273
+ surface?: MutableSurface;
274
+ /** Per-cell scores for this generation. */
275
+ cells: EvalRunCellScore[];
276
+ /** Aggregate composite mean across all cells in this generation. */
277
+ compositeMean: number;
278
+ /** Total $ spent across this generation. */
279
+ costUsd: number;
280
+ /** Wall-clock duration of this generation. */
281
+ durationMs: number;
282
+ }
283
+ /**
284
+ * The top-level eval-run event. One ingest call per logical eval-run, or, when
285
+ * streaming, generations arrive incrementally via repeated calls with the same
286
+ * `runId`. The orchestrator deduplicates by `(runId, generation.index)`.
287
+ */
288
+ interface EvalRunEvent {
289
+ /** Stable run id (the substrate's `runId`). UUID or substrate-generated. */
290
+ runId: string;
291
+ /** Where this run was happening — derived from `RunCampaignOptions.runDir`. */
292
+ runDir: string;
293
+ /** ISO-8601 timestamp the substrate recorded the event. */
294
+ timestamp: string;
295
+ /** Lifecycle stage this event represents. */
296
+ status: EvalRunStatus;
297
+ /** Free-form consumer tags (env, branch, model id, etc.). Searchable. */
298
+ labels: Record<string, string>;
299
+ /** Baseline campaign snapshot. Present when status >= baseline-complete. */
300
+ baseline?: EvalRunGenerationSnapshot;
301
+ /** Per-generation snapshots. Streams in; orchestrator appends. */
302
+ generations: EvalRunGenerationSnapshot[];
303
+ /** Final gate decision. Present when status >= gate-decided. */
304
+ gateDecision?: GateDecision;
305
+ /** Held-out lift = winner-on-holdout - baseline-on-holdout. */
306
+ holdoutLift?: number;
307
+ /** Total $ spent across baseline + every generation. */
308
+ totalCostUsd: number;
309
+ /** Total wall-clock duration. */
310
+ totalDurationMs: number;
311
+ /** Error message if status === 'errored'. */
312
+ errorMessage?: string;
313
+ /** Rigor packet emitted alongside the run — distributional summary,
314
+ * paired-bootstrap lift CI, judge stats, inter-rater agreement,
315
+ * contamination check, failure clusters (when an analyst is wired),
316
+ * outcome correlation (when downstream signal is supplied), and the
317
+ * recommendations the dashboard surfaces verbatim. Additive; older
318
+ * clients that don't know about this field continue to work. */
319
+ insightReport?: InsightReport;
320
+ }
321
+ /**
322
+ * OTel-shape span with a few additional attributes for eval-run pivoting.
323
+ * Compatible with any OTLP collector — `name`, `traceId`, `spanId`,
324
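+ * `startTimeUnixNano`, `endTimeUnixNano`, `attributes` are stock OTel. NOTE(review): the nanosecond timestamps are typed `number`; current epoch nanoseconds exceed `Number.MAX_SAFE_INTEGER`, so confirm the precision loss is acceptable.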
325
+ */
326
+ interface TraceSpanEvent {
327
+ traceId: string;
328
+ spanId: string;
329
+ parentSpanId?: string;
330
+ name: string;
331
+ startTimeUnixNano: number;
332
+ endTimeUnixNano: number;
333
+ attributes: Record<string, string | number | boolean>;
334
+ events?: Array<{
335
+ timeUnixNano: number;
336
+ name: string;
337
+ attributes?: Record<string, string | number | boolean>;
338
+ }>;
339
+ status?: {
340
+ code: 'OK' | 'ERROR' | 'UNSET';
341
+ message?: string;
342
+ };
343
+ /** Pivot back into the eval-run stream. */
344
+ 'tangle.runId'?: string;
345
+ /** Pivot to the specific generation. */
346
+ 'tangle.generation'?: number;
347
+ /** Pivot to the specific cell. */
348
+ 'tangle.cellId'?: string;
349
+ /** Pivot to the specific scenario. */
350
+ 'tangle.scenarioId'?: string;
351
+ }
352
+ interface IngestEvalRunsRequest {
353
+ wireVersion: HostedWireVersion;
354
+ events: EvalRunEvent[];
355
+ }
356
+ interface IngestTracesRequest {
357
+ wireVersion: HostedWireVersion;
358
+ spans: TraceSpanEvent[];
359
+ }
360
+ interface IngestResponse {
361
+ /** Accepted events / spans count. */
362
+ accepted: number;
363
+ /** Rejected events / spans with reasons (validation failures, dup idempotency key, etc.). */
364
+ rejected: Array<{
365
+ index: number;
366
+ reason: string;
367
+ }>;
368
+ }
369
+
370
+ /**
371
+ * # Hosted-tier ingest client.
372
+ *
373
+ * Ships eval-run events + trace spans to any orchestrator (ours, a
374
+ * partner's self-hosted one, or a future open implementation) that
375
+ * speaks the wire format in `./types.ts`.
376
+ *
377
+ * Three modes:
378
+ * - **Ours:** point at `https://orchestrator.tangle.tools/v1`. We
379
+ * handle ingest + storage + dashboard.
380
+ * - **Self-hosted:** point at whatever URL runs the reference receiver
381
+ * from `examples/hosted-ingest-server/`.
382
+ * - **Off (default):** when `hostedTenant` is unset, nothing is sent.
383
+ * Everything stays local.
384
+ */
385
+
386
+ interface HostedTenant {
387
+ /** Orchestrator endpoint base URL (no trailing slash). Required. */
388
+ endpoint: string;
389
+ /** Bearer token issued by the orchestrator. Required. */
390
+ apiKey: string;
391
+ /** Tenant id — the orchestrator's primary key for this consumer. Required. */
392
+ tenantId: string;
393
+ /** Optional `fetch` override (auth wrappers, custom agent, test mocks). */
394
+ fetchImpl?: typeof fetch;
395
+ /** Per-call timeout in ms. Default 30s. */
396
+ timeoutMs?: number;
397
+ /** Retries on 5xx / network errors. Default 2. */
398
+ retries?: number;
399
+ }
400
+ interface HostedClient {
401
+ ingestEvalRun(event: EvalRunEvent, idempotencyKey?: string): Promise<IngestResponse>;
402
+ ingestEvalRuns(events: EvalRunEvent[], idempotencyKey?: string): Promise<IngestResponse>;
403
+ ingestTraces(spans: TraceSpanEvent[], idempotencyKey?: string): Promise<IngestResponse>;
404
+ readonly tenant: HostedTenant;
405
+ readonly wireVersion: HostedWireVersion;
406
+ }
407
+ declare function createHostedClient(tenant: HostedTenant): HostedClient;
408
+
409
+ export { type EvalRunCellScore as E, type FailureClusterInsight as F, type HostedClient as H, type InsightReport as I, type JudgeInsight as J, type LiftInsight as L, type OutcomeCorrelationInsight as O, type Recommendation as R, type ScalarDistribution as S, type TraceSpanEvent as T, type HostedTenant as a, type InterRaterInsight as b, type ReleaseSummary as c, type EvalRunEvent as d, type EvalRunGenerationSnapshot as e, type EvalRunStatus as f, HOSTED_WIRE_VERSION as g, type HostedIngestHeaders as h, type HostedWireVersion as i, type IngestEvalRunsRequest as j, type IngestResponse as k, type IngestTracesRequest as l, createHostedClient as m };