@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +110 -6
  4. package/dist/campaign/index.js +26 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/chunk-MZ2IYGGN.js +592 -0
  11. package/dist/chunk-MZ2IYGGN.js.map +1 -0
  12. package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
  13. package/dist/chunk-NV2PF37Q.js.map +1 -0
  14. package/dist/contract/index.d.ts +11 -9
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +251 -7
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-CChUqexv.d.ts +314 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
  26. package/dist/release-report-CN8hJlhk.d.ts +233 -0
  27. package/dist/reporting.d.ts +4 -3
  28. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  29. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  30. package/dist/statistics-B7yCbi9i.d.ts +253 -0
  31. package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
  32. package/package.json +1 -1
  33. package/dist/chunk-4ODZXQV2.js.map +0 -1
  34. package/dist/chunk-7TPYV2ER.js.map +0 -1
  35. package/dist/chunk-CZRKD2X2.js +0 -1104
  36. package/dist/chunk-CZRKD2X2.js.map +0 -1
  37. package/dist/chunk-E22YUOAL.js +0 -111
  38. package/dist/chunk-E22YUOAL.js.map +0 -1
  39. package/dist/chunk-HKINEDRZ.js.map +0 -1
  40. package/dist/release-report-DGoeObZT.d.ts +0 -484
  41. package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
@@ -1,79 +1,5 @@
1
- import { S as Scenario, f as CampaignResult, k as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, M as MutableSurface, m as GenerationRecord, P as ParetoParent, j as GateDecision } from './types-c2R2kfmv.js';
2
1
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
3
- import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
4
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
5
- import { H as HostedClient, T as TraceSpanEvent } from './index-CzhtwYBT.js';
6
-
7
- /**
8
- * @experimental
9
- *
10
- * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
11
- * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
12
- * code consumers duplicated 4 times. The PR body includes the campaign's
13
- * manifest hash, gate verdict, and scorecard summary so reviewers can see
14
- * exactly what was promoted + why.
15
- *
16
- * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
17
- * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
18
- * deferred to Pass B with the full shadow / canary / rollback stack.
19
- */
20
-
21
- interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
22
- /** Campaign result to attach to the PR. */
23
- result: CampaignResult<TArtifact, TScenario>;
24
- /** Gate verdict explaining the promotion. Substrate refuses to open a PR
25
- * when `gate.decision !== 'ship'` — fails loud. */
26
- gate: GateResult;
27
- /** Promoted surface diff — typically the new system prompt addendum or
28
- * full profile diff. Substrate writes it as the PR body. */
29
- promotedDiff: string;
30
- /** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */
31
- ghOwner: string;
32
- ghRepo: string;
33
- /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
34
- branch?: string;
35
- /** PR title. Default includes manifest hash. */
36
- title?: string;
37
- /** Whether to actually open the PR or just dry-run. Default reads
38
- * `GH_AUTO_PR_TOKEN` env — present = open, absent = dry-run. */
39
- dryRun?: boolean;
40
- /** Test seam — substitute `gh pr create` invocation. */
41
- ghExec?: (args: string[]) => {
42
- stdout: string;
43
- stderr: string;
44
- status: number;
45
- };
46
- }
47
- interface OpenAutoPrResult {
48
- opened: boolean;
49
- prUrl?: string;
50
- dryRun: boolean;
51
- reason: string;
52
- }
53
- declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
54
-
55
- /**
56
- * @experimental
57
- *
58
- * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
59
- * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
60
- * the evolutionary strategy: each generation, mutate the current best surface
61
- * into N candidates, measure, select. No generation memory beyond the current
62
- * surface; the loop body handles ranking + promotion.
63
- *
64
- * The reflective alternative is agent-runtime's `improvementDriver` with a
65
- * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
66
- * trace findings to propose targeted edits rather than blind mutations. Both
67
- * conform to `ImprovementDriver`; the improvement loop is identical regardless
68
- * of which drives it.
69
- */
70
-
71
- interface EvolutionaryDriverOptions<TFindings = unknown> {
72
- mutator: Mutator<TFindings>;
73
- /** External findings fed to the mutator each generation. Default: []. */
74
- findings?: TFindings[];
75
- }
76
- declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
2
+ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-c2R2kfmv.js';
77
3
 
78
4
  /**
79
5
  * @experimental
@@ -167,66 +93,50 @@ declare function countSentenceEdits(baseline: string, candidate: string): number
167
93
  /**
168
94
  * @experimental
169
95
  *
170
- * Compose multiple `Gate` implementations — every gate must pass for the
171
- * composite to ship. Closes the alignment reviewer's "default-only
172
- * heldOutGate + costGate would happily promote a reward-hacked prompt"
173
- * concern by making safety gates first-class composable defaults.
174
- */
175
-
176
- /** Compose gates — all must `ship` for the composite to `ship`. First
177
- * non-ship verdict short-circuits the composite verdict, but ALL gates run
178
- * (so the result records every gate's reason — useful for diagnostics). */
179
- declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
180
-
181
- /**
182
- * @experimental
183
- *
184
- * `defaultProductionGate` — composes the substrate's existing safety
185
- * primitives (red-team / reward-hacking / canary / heldout) into a single
186
- * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
187
- * primitives are off the critical path" blocker.
96
+ * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
97
+ * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
98
+ * code consumers duplicated 4 times. The PR body includes the campaign's
99
+ * manifest hash, gate verdict, and scorecard summary so reviewers can see
100
+ * exactly what was promoted + why.
188
101
  *
189
- * The composition is opinionated — when consumers wire `runImprovementLoop`,
190
- * THIS gate is the default. Consumers can still pass a custom gate to
191
- * override; the recommended pattern is to compose THIS gate with whatever
192
- * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
102
+ * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
103
+ * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
104
+ * deferred to Pass B with the full shadow / canary / rollback stack.
193
105
  */
194
106
 
195
- interface DefaultProductionGateOptions {
196
- /** Required: scenarios held out from training; substrate compares
197
- * candidate-on-holdout vs baseline-on-holdout. */
198
- holdoutScenarios: Scenario[];
199
- /** Minimum mean-composite improvement required to ship. Default 0.5. */
200
- deltaThreshold?: number;
201
- /** Total $ budget for ALL cells in this campaign including baseline + candidate.
202
- * Composite verdict refuses to ship when spend exceeded budget. */
203
- budgetUsd?: number;
204
- /** Red-team cases to probe candidate outputs against. When omitted the
205
- * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
206
- * battery for tighter coverage. */
207
- redTeamBattery?: RedTeamCase[];
208
- /** Run records (oldest-first) needed for the reward-hacking detector.
209
- * Substrate populates from prior production-loop generations. */
210
- recentRuns?: RunRecord[];
211
- /** When true, the gate refuses to ship if the reward-hacking detector
212
- * fires at the `gaming` severity. Default true. */
213
- blockOnRewardHackingGaming?: boolean;
107
+ interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
108
+ /** Campaign result to attach to the PR. */
109
+ result: CampaignResult<TArtifact, TScenario>;
110
+ /** Gate verdict explaining the promotion. Substrate refuses to open a PR
111
+ * when `gate.decision !== 'ship'` — fails loud. */
112
+ gate: GateResult;
113
+ /** Promoted surface diff — typically the new system prompt addendum or
114
+ * full profile diff. Substrate writes it as the PR body. */
115
+ promotedDiff: string;
116
+ /** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */
117
+ ghOwner: string;
118
+ ghRepo: string;
119
+ /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
120
+ branch?: string;
121
+ /** PR title. Default includes manifest hash. */
122
+ title?: string;
123
+ /** Whether to actually open the PR or just dry-run. Default reads
124
+ * `GH_AUTO_PR_TOKEN` env — present = open, absent = dry-run. */
125
+ dryRun?: boolean;
126
+ /** Test seam — substitute `gh pr create` invocation. */
127
+ ghExec?: (args: string[]) => {
128
+ stdout: string;
129
+ stderr: string;
130
+ status: number;
131
+ };
214
132
  }
215
- declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
216
-
217
- /**
218
- * @experimental
219
- *
220
- * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
221
- * `Gate`. Use when you want held-out as one of N composed gates instead of
222
- * the full `defaultProductionGate` stack.
223
- */
224
-
225
- interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
226
- scenarios: TScenario[];
227
- deltaThreshold?: number;
133
+ interface OpenAutoPrResult {
134
+ opened: boolean;
135
+ prUrl?: string;
136
+ dryRun: boolean;
137
+ reason: string;
228
138
  }
229
- declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
139
+ declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
230
140
 
231
141
  /**
232
142
  * @experimental
@@ -303,6 +213,16 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
303
213
  costCeiling?: number;
304
214
  /** Max concurrent cells. Default 2. */
305
215
  maxConcurrency?: number;
216
+ /**
217
+ * Per-cell dispatch deadline in ms. A `dispatch` that neither resolves nor
218
+ * rejects within this window is a hang (a stalled model request, an
219
+ * exhausted runtime resource, a backend that never closes its stream). When
220
+ * set, the cell's `ctx.signal` is aborted and the cell is recorded as a LOUD
221
+ * error (`dispatch exceeded <N>ms`) so the campaign proceeds and the failure
222
+ * is visible — instead of one wedged cell silently hanging the whole run (and
223
+ * every loop/CI job above it) forever. `undefined`/`0` = unbounded (legacy).
224
+ */
225
+ dispatchTimeoutMs?: number;
306
226
  /** Required: where artifacts + traces land. */
307
227
  runDir: string;
308
228
  /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted
@@ -349,21 +269,6 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
349
269
  }
350
270
  declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
351
271
 
352
- /**
353
- * @experimental
354
- *
355
- * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
356
- * gate, no auto-PR. Just: run scenarios through dispatch, score with
357
- * judges, return CampaignResult.
358
- *
359
- * The 80% case for consumers who want a scorecard, not an improvement loop.
360
- */
361
-
362
- interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
363
- runDir: string;
364
- }
365
- declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
366
-
367
272
  /**
368
273
  * @experimental
369
274
  *
@@ -497,187 +402,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
497
402
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
498
403
  declare function defaultRenderDiff(winnerSurface: MutableSurface, baselineSurface: MutableSurface): string;
499
404
 
500
- /**
501
- * @experimental
502
- *
503
- * Loop provenance — the durable, queryable record of WHAT a self-improvement
504
- * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
505
- * an eval-run to the underlying candidate→cell→gate→promote chain.
506
- *
507
- * Two artifacts, one source of truth:
508
- *
509
- * 1. `LoopProvenanceRecord` — a structured JSON record capturing every
510
- * candidate (surfaceHash + label + rationale), its measured composite,
511
- * the gate decision + reasons + delta, the held-out lift, the explicit
512
- * baseline→candidate diff, and BACKEND PROVENANCE (the
513
- * `assertRealBackend` verdict + worker call count + model). This is the
514
- * ingestable audit artifact: the +lift recomputes from it, the "because
515
- * Z" rationale survives in it, and a stub backend is detectable from it.
516
- *
517
- * 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
518
- * `TraceSpanEvent`s, pivoted on the substrate's standard
519
- * `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
520
- * `tangle.generation` attributes (the same pivots `/adapters/otel`
521
- * reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
522
- * not just the `cost.*` spans `runCampaign` already emits per cell.
523
- *
524
- * The record is built from the substrate's own loop result + the per-call
525
- * `RunRecord`s the worker emitted — no new measurement, no recomputation that
526
- * could drift from what the gate actually saw.
527
- */
528
-
529
- /** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
530
- * their worktree+base identity since the content lives in git. Distinct from
531
- * `surfaceHash` (16-char content fingerprint used as a loop identity key);
532
- * this is the byte-identical-verifiable content hash the provenance record +
533
- * `RunRecord.promptHash` carry. */
534
- declare function surfaceContentHash(surface: MutableSurface): string;
535
- interface LoopProvenanceCandidate {
536
- /** Generation index this candidate was proposed in. */
537
- generation: number;
538
- /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
539
- surfaceHash: string;
540
- /** Full sha256 content hash — byte-identical-verifiable. */
541
- contentHash: string;
542
- /** Driver label, when the driver returned a `ProposedCandidate`. */
543
- label?: string;
544
- /** Driver rationale — the "because Z". When the driver returned a bare
545
- * surface (blind mutator) this is absent. */
546
- rationale?: string;
547
- /** Mean composite this candidate scored on the search split. */
548
- composite: number;
549
- /** Whether this candidate was promoted out of its generation. */
550
- promoted: boolean;
551
- }
552
- interface LoopProvenanceBackend {
553
- /** `assertRealBackend`-grade verdict over the worker call records. */
554
- verdict: 'real' | 'mixed' | 'stub';
555
- /** Number of worker LLM calls captured (the audit's "worker call count"). */
556
- workerCallCount: number;
557
- /** Distinct model ids observed across worker calls. */
558
- models: string[];
559
- totalInputTokens: number;
560
- totalOutputTokens: number;
561
- totalCostUsd: number;
562
- }
563
- /**
564
- * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
565
- * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
566
- * the bare hosted event) + backend provenance.
567
- */
568
- interface LoopProvenanceRecord {
569
- schema: 'tangle.loop-provenance.v1';
570
- runId: string;
571
- runDir: string;
572
- timestamp: string;
573
- /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
574
- baselineContentHash: string;
575
- winnerContentHash: string;
576
- /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
577
- winnerLabel?: string;
578
- winnerRationale?: string;
579
- /** The explicit baseline→winner unified diff the gate decided on. */
580
- diff: string;
581
- /** Every candidate across every generation, each carrying its rationale. */
582
- candidates: LoopProvenanceCandidate[];
583
- /** The gate verdict — decision + reasons + contributing gates + delta. */
584
- gate: {
585
- decision: GateDecision;
586
- reasons: string[];
587
- delta?: number;
588
- contributingGates: Array<{
589
- name: string;
590
- passed: boolean;
591
- }>;
592
- };
593
- /** baseline-on-holdout composite mean. */
594
- baselineHoldoutComposite: number;
595
- /** winner-on-holdout composite mean. */
596
- winnerHoldoutComposite: number;
597
- /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
598
- heldOutLift: number;
599
- /** Backend provenance: stub-vs-real verdict + worker call count + models. */
600
- backend: LoopProvenanceBackend;
601
- totalCostUsd: number;
602
- totalDurationMs: number;
603
- }
604
- interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
605
- runId: string;
606
- runDir: string;
607
- timestamp: string;
608
- baselineSurface: MutableSurface;
609
- winnerSurface: MutableSurface;
610
- winnerLabel?: string;
611
- winnerRationale?: string;
612
- diff: string;
613
- /** Per-generation candidate records straight off the loop result. */
614
- generations: Array<{
615
- generationIndex: number;
616
- candidates: Array<{
617
- surfaceHash: string;
618
- composite: number;
619
- label?: string;
620
- rationale?: string;
621
- }>;
622
- promoted: string[];
623
- /** Surfaces measured this generation, keyed positionally to candidates so
624
- * the content hash can be computed from the real surface text. */
625
- surfaces: Array<{
626
- surfaceHash: string;
627
- surface: MutableSurface;
628
- }>;
629
- }>;
630
- gate: GateResult;
631
- baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
632
- winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
633
- /** Worker call records — the source for backend provenance. */
634
- workerRecords: ReadonlyArray<RunRecord>;
635
- totalCostUsd: number;
636
- totalDurationMs: number;
637
- }
638
- /** Build the durable provenance record from a completed loop result. */
639
- declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
640
- /**
641
- * Build the loop's OTLP-ingestable spans from a provenance record. One root
642
- * span per loop (`tangle.runId`), one span per generation, one span per
643
- * candidate (carrying its surfaceHash + label), and one span for the gate
644
- * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
645
- * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
646
- * reads, so the hosted collector reconstructs the full tree.
647
- *
648
- * Times are synthesized monotonically off a single base so the span tree is
649
- * orderable; the substrate does not retain per-candidate wall-clock starts.
650
- */
651
- declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
652
- baseTimeMs?: number;
653
- }): TraceSpanEvent[];
654
- /** Canonical durable paths under the run dir. */
655
- declare function provenanceRecordPath(runDir: string): string;
656
- declare function provenanceSpansPath(runDir: string): string;
657
- interface EmitLoopProvenanceResult {
658
- record: LoopProvenanceRecord;
659
- spans: TraceSpanEvent[];
660
- /** Absolute paths the record + spans were written to, when storage persists. */
661
- recordPath: string;
662
- spansPath: string;
663
- }
664
- interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
665
- /** Storage the record + spans are written through. */
666
- storage: CampaignStorage;
667
- /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
668
- * endpoint so the collector receives the full loop, not just `cost.*`. */
669
- hostedClient?: HostedClient;
670
- }
671
- /**
672
- * Build the provenance record + OTel spans and persist them durably under the
673
- * run dir (and ship spans to a hosted collector when one is wired). Returns
674
- * both artifacts so the caller can assert on / re-derive from them.
675
- *
676
- * Fail-loud: the durable write throws on storage failure (a swallowed write is
677
- * exactly the "emitted but lost" failure this closes). The hosted span ship is
678
- * the one best-effort leg — its failure is logged, not thrown, so an offline
679
- * collector never fails the loop (the durable artifact is the source of truth).
680
- */
681
- declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
682
-
683
- export { loopProvenanceSpans as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, openAutoPr as F, type GepaDriverOptions as G, type HeldOutGateOptions as H, provenanceRecordPath as I, provenanceSpansPath as J, runOptimization as K, type LoopProvenanceRecord as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type EmitLoopProvenanceArgs as m, type EmitLoopProvenanceResult as n, type GepaDriverConstraints as o, type LoopProvenanceBackend as p, type LoopProvenanceCandidate as q, runCampaign as r, type OpenAutoPrResult as s, type RunOptimizationOptions as t, type RunOptimizationResult as u, buildLoopProvenanceRecord as v, countSentenceEdits as w, defaultRenderDiff as x, emitLoopProvenance as y, extractH2Sections as z };
405
+ export { type CampaignStorage as C, type GepaDriverOptions as G, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunImprovementLoopOptions as b, runImprovementLoop as c, type GepaDriverConstraints as d, type OpenAutoPrResult as e, fsCampaignStorage as f, gepaDriver as g, type RunOptimizationOptions as h, inMemoryCampaignStorage as i, type RunOptimizationResult as j, countSentenceEdits as k, defaultRenderDiff as l, extractH2Sections as m, runOptimization as n, openAutoPr as o, runCampaign as r, surfaceHash as s };
@@ -0,0 +1,253 @@
1
+ import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
+ import { J as JudgeScore } from './types-Croy5h7V.js';
3
+
4
+ /**
5
+ * Normalize scores so all dimensions follow "higher = better".
6
+ * Inverted dimensions (hallucination, false_confidence, worst_failure)
7
+ * already use inverted scoring in the prompt (10 = no hallucination),
8
+ * but this function ensures consistency if raw scores leak through.
9
+ */
10
+ declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
11
+ /** Weighted mean — falls back to uniform weights when omitted */
12
+ declare function weightedMean(scores: {
13
+ score: number;
14
+ weight?: number;
15
+ }[]): number;
16
+ /** Bootstrap confidence interval */
17
+ declare function confidenceInterval(scores: number[], confidence?: number, opts?: {
18
+ seed?: number;
19
+ resamples?: number;
20
+ }): {
21
+ mean: number;
22
+ lower: number;
23
+ upper: number;
24
+ };
25
+ /**
26
+ * Inter-rater reliability — simplified Krippendorff's alpha.
27
+ *
28
+ * Each inner array is one judge's scores for all items.
29
+ * All arrays must have the same length (same items scored).
30
+ */
31
+ declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
32
+ /**
33
+ * Mann-Whitney U test for comparing two independent groups.
34
+ * Returns U statistic and approximate p-value (normal approximation).
35
+ */
36
+ declare function mannWhitneyU(a: number[], b: number[]): {
37
+ u: number;
38
+ p: number;
39
+ };
40
+ /** Partial credit: returns 0-1 ratio of current toward target */
41
+ declare function partialCredit(current: number, target: number): number;
42
+ /**
43
+ * Paired t-test — before/after measurements on the SAME items.
44
+ * Pairing removes inter-item variance, giving tighter significance than
45
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
46
+ * scenarios.
47
+ */
48
+ declare function pairedTTest(before: number[], after: number[]): {
49
+ t: number;
50
+ df: number;
51
+ p: number;
52
+ };
53
+ /**
54
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
55
+ * Use when the differences aren't normally distributed.
56
+ */
57
+ declare function wilcoxonSignedRank(before: number[], after: number[]): {
58
+ w: number;
59
+ p: number;
60
+ };
61
+ /**
62
+ * Cohen's d — standardized effect size for two independent groups.
63
+ * Positive d means group b has higher mean than group a.
64
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
65
+ */
66
+ declare function cohensD(a: number[], b: number[]): number;
67
+ type CliffsMagnitude = 'negligible' | 'small' | 'medium' | 'large';
68
+ /**
69
+ * Cliff's delta — a non-parametric effect size for two independent samples.
70
+ * `δ = (#(after > before) − #(after < before)) / (n_before · n_after)`,
71
+ * ranging [-1, 1]. Positive ⇒ `after` tends to exceed `before` (improvement).
72
+ *
73
+ * Distribution-free counterpart to Cohen's d: no normality assumption, robust
74
+ * to the bounded/skewed score distributions judges produce. Pairs with
75
+ * `pairedBootstrap` / `wilcoxonSignedRank` for the non-parametric reporting
76
+ * path. Returns 0 when either sample is empty.
77
+ */
78
+ declare function cliffsDelta(before: number[], after: number[]): number;
79
+ /**
80
+ * Map a Cliff's delta to a qualitative magnitude using the standard
81
+ * Romano et al. thresholds (|δ|): <0.147 negligible, <0.33 small,
82
+ * <0.474 medium, else large.
83
+ */
84
+ declare function interpretCliffs(delta: number): CliffsMagnitude;
85
+ interface WeightedCompositeInput {
86
+ /** Per-dimension scores (typically 0..1). */
87
+ dims: Record<string, number>;
88
+ /** Weight per dimension. Every weighted dimension MUST be present in
89
+ * `dims` — a weight for an absent dimension is a config error and throws,
90
+ * because silently dropping it would renormalise the composite onto a
91
+ * different denominator than intended. */
92
+ weights: Record<string, number>;
93
+ /** Optional pass threshold; when set, the result reports `pass`. */
94
+ threshold?: number;
95
+ }
96
+ interface WeightedCompositeResult {
97
+ composite: number;
98
+ pass?: boolean;
99
+ }
100
+ /**
101
+ * Weighted composite over judge dimensions: `Σ(score_d · w_d) / Σ(w_d)` across
102
+ * the weighted dimensions. The canonical replacement for the per-consumer
103
+ * hand-rolled composite math (tax/legal/creative/gtm each ship a copy).
104
+ *
105
+ * Fail-loud: throws if a weighted dimension is missing from `dims`, if any
106
+ * weight is negative, or if the weights sum to 0 — none of which can produce
107
+ * a meaningful composite.
108
+ */
109
+ declare function weightedComposite(input: WeightedCompositeInput): WeightedCompositeResult;
110
+ interface CorpusScoreRecord {
111
+ /** Stable identifier for the rated item (scenario, span, turn, …). */
112
+ itemId: string;
113
+ /** Identifier for the judge that produced this score. */
114
+ judgeName: string;
115
+ /** Dimension name (matches `JudgeScore.dimension`). */
116
+ dimension: string;
117
+ /** Numeric score; must be finite. */
118
+ score: number;
119
+ }
120
+ interface CorpusAgreementPerDimension extends ContinuousAgreement {
121
+ dimension: string;
122
+ /** Item IDs that contributed to this dimension's matrix (every judge scored them). */
123
+ itemIds: string[];
124
+ /** Judge IDs that contributed to this dimension's matrix. */
125
+ judgeIds: string[];
126
+ }
127
+ interface CorpusAgreementReport {
128
+ /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
129
+ perDimension: CorpusAgreementPerDimension[];
130
+ /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
131
+ overallIcc: number;
132
+ /** Mean weighted κ across dimensions (NaN if none finite). */
133
+ overallWeightedKappa: number;
134
+ /** Dimensions evaluated (sorted). */
135
+ dimensions: string[];
136
+ /** Judges seen across the corpus (sorted). */
137
+ judgeIds: string[];
138
+ }
139
+ interface CorpusAgreementOptions extends ContinuousAgreementOptions {
140
+ /**
141
+ * Restrict the audit to these dimensions. Default = every dimension
142
+ * that appears in the input. A dimension named here but absent from
143
+ * the input throws — silent omission would corrupt the overall metric.
144
+ */
145
+ dimensions?: string[];
146
+ /**
147
+ * Restrict the audit to these judges. Default = every judge that
148
+ * appears in the input. A judge named here but absent from a
149
+ * dimension throws (see "fail loud" below).
150
+ */
151
+ judges?: string[];
152
+ }
153
+ /**
154
+ * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
155
+ *
156
+ * For each dimension, builds the [n_items][n_judges] matrix of scores
157
+ * (keeping only items every judge rated on that dimension), then runs
158
+ * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
159
+ * bootstrap CIs. Reports a pooled mean across dimensions as a single
160
+ * "is this judge panel reliable on this corpus?" number.
161
+ *
162
+ * Fail-loud contract:
163
+ * - Empty input throws.
164
+ * - Fewer than 2 judges or fewer than 2 items per dimension throws.
165
+ * - A judge present in some dimensions but with zero scored items on
166
+ * another dimension throws (would silently shrink the matrix).
167
+ * - Duplicate (itemId, judgeName, dimension) records throw.
168
+ */
169
+ declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
170
+ /**
171
+ * Convenience adapter for `JudgeScore[]` data keyed externally by item.
172
+ *
173
+ * Use when you have per-item arrays of `JudgeScore[]` (e.g. one
174
+ * `ScenarioResult.judgeScores` per scenario) and want corpus-wide
175
+ * agreement without manually flattening. `itemId` must be unique per
176
+ * row of `itemsScores`.
177
+ */
178
+ declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
179
+ itemId: string;
180
+ scores: JudgeScore[];
181
+ }>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
182
+ /**
183
+ * Required N per arm for a two-sample comparison at target effect size,
184
+ * alpha, and power. Normal-approximation formula:
185
+ * n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
186
+ * where d is Cohen's d. Returns Infinity for effect ≤ 0.
187
+ */
188
+ declare function requiredSampleSize(opts: {
189
+ effect: number;
190
+ alpha?: number;
191
+ power?: number;
192
+ twoSided?: boolean;
193
+ }): number;
194
+ /**
195
+ * Minimum detectable paired effect (standardised units) for a target paired
196
+ * sample size: d_min = (z_{1-α/2} + z_{1-β}) / sqrt(n_paired). Multiply by
197
+ * sd(deltas) for score units; treat as a lower bound — Wilcoxon and bootstrap
198
+ * have asymptotic relative efficiency below 1 vs the t-test on heavy tails.
199
+ */
200
+ declare function pairedMde(opts: {
201
+ nPaired: number;
202
+ alpha?: number;
203
+ power?: number;
204
+ twoSided?: boolean;
205
+ }): number;
206
+ /** Bonferroni adjustment: multiply every p-value by the test count, clamp at 1. */
207
+ declare function bonferroni(pValues: number[], alpha?: number): {
208
+ adjusted: number[];
209
+ significant: boolean[];
210
+ };
211
+ /**
212
+ * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
213
+ * significance at the target FDR; handles ties and preserves q monotonicity.
214
+ */
215
+ declare function benjaminiHochberg(pValues: number[], fdr?: number): {
216
+ qValues: number[];
217
+ significant: boolean[];
218
+ };
219
+ interface PairedBootstrapResult {
220
+ /** Number of paired observations. */
221
+ n: number;
222
+ /** Median of paired deltas (after − before). */
223
+ median: number;
224
+ /** Mean of paired deltas. */
225
+ mean: number;
226
+ /** Lower bound of the bootstrap CI on the chosen statistic. */
227
+ low: number;
228
+ /** Upper bound of the bootstrap CI on the chosen statistic. */
229
+ high: number;
230
+ /** Confidence level used (e.g. 0.95). */
231
+ confidence: number;
232
+ /** Number of bootstrap resamples used. */
233
+ resamples: number;
234
+ }
235
+ interface PairedBootstrapOptions {
236
+ /** Confidence level. Default 0.95. */
237
+ confidence?: number;
238
+ /** Bootstrap resample count. Default 2000. */
239
+ resamples?: number;
240
+ /** Statistic to bootstrap. Default 'median'. */
241
+ statistic?: 'median' | 'mean';
242
+ /** Deterministic seed. If omitted, uses Math.random(). */
243
+ seed?: number;
244
+ }
245
+ /**
246
+ * Paired bootstrap on (after − before) deltas. Returns a CI on the chosen
247
+ * statistic (median by default); pairs are resampled with replacement. The
248
+ * lower bound is what the promotion gate checks — `low > threshold` means the
249
+ * gain exceeds the threshold at that confidence level. Throws if `before` and `after` differ in length.
250
+ */
251
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
252
+
253
+ export { type CliffsMagnitude as C, type PairedBootstrapOptions as P, type WeightedCompositeInput as W, type PairedBootstrapResult as a, benjaminiHochberg as b, type CorpusAgreementOptions as c, type CorpusAgreementPerDimension as d, type CorpusAgreementReport as e, type CorpusScoreRecord as f, type WeightedCompositeResult as g, bonferroni as h, cliffsDelta as i, cohensD as j, confidenceInterval as k, corpusInterRaterAgreement as l, corpusInterRaterAgreementFromJudgeScores as m, interRaterReliability as n, interpretCliffs as o, pairedBootstrap as p, mannWhitneyU as q, normalizeScores as r, pairedMde as s, pairedTTest as t, partialCredit as u, requiredSampleSize as v, wilcoxonSignedRank as w, weightedComposite as x, weightedMean as y };