@tangle-network/agent-eval 0.14.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1215,7 +1215,7 @@ interface BudgetSpec {
1215
1215
  calls?: number;
1216
1216
  usd?: number;
1217
1217
  }
1218
- interface RunOutcome {
1218
+ interface RunOutcome$1 {
1219
1219
  score?: number;
1220
1220
  pass?: boolean;
1221
1221
  failureClass?: FailureClass;
@@ -1257,7 +1257,7 @@ interface Run {
1257
1257
  startedAt: number;
1258
1258
  endedAt?: number;
1259
1259
  status: RunStatus;
1260
- outcome?: RunOutcome;
1260
+ outcome?: RunOutcome$1;
1261
1261
  budget?: BudgetSpec;
1262
1262
  /** Free-form labels for downstream grouping. */
1263
1263
  tags?: Record<string, string>;
@@ -1514,7 +1514,7 @@ declare class TraceEmitter {
1514
1514
  constructor(store: TraceStore, options?: TraceEmitterOptions);
1515
1515
  get runId(): string;
1516
1516
  startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
1517
- endRun(outcome?: RunOutcome): Promise<void>;
1517
+ endRun(outcome?: RunOutcome$1): Promise<void>;
1518
1518
  abortRun(reason: string): Promise<void>;
1519
1519
  span<S extends Span = Span>(init: {
1520
1520
  kind: SpanKind;
@@ -5907,6 +5907,35 @@ interface ViteDeployRunnerInput {
5907
5907
  * factory for {@link deployGateLayer}.
5908
5908
  */
5909
5909
  declare function viteDeployRunner(input: ViteDeployRunnerInput): DeployRunner;
5910
+ interface WranglerDeployRunnerInput {
5911
+ workdir: string;
5912
+ exec: (cmd: string, opts?: {
5913
+ cwd?: string;
5914
+ timeoutMs?: number;
5915
+ }) => Promise<{
5916
+ stdout: string;
5917
+ stderr: string;
5918
+ exitCode: number;
5919
+ }>;
5920
+ exists: (relativePath: string) => boolean | Promise<boolean>;
5921
+ /** Build command. Default `npm run build`. */
5922
+ buildCommand?: string;
5923
+ /** Wrangler dry-run command. Default `npx wrangler deploy --dry-run --outdir dist`. */
5924
+ dryRunCommand?: string;
5925
+ /** Per-step cap (ms). Default 120s. */
5926
+ timeoutMs?: number;
5927
+ }
5928
+ /**
5929
+ * Canonical runner for the `fullstack-ts` family on Cloudflare Workers
5930
+ * (Remix / React Router v7 / Hono on Workers). Detects wrangler.toml or
5931
+ * wrangler.jsonc in the workdir, builds, then `wrangler deploy --dry-run`
5932
+ * to catch missing bindings, syntax errors in wrangler config, and
5933
+ * import-time crashes that don't surface in `tsc`.
5934
+ *
5935
+ * No wrangler config = skip with "no wrangler" evidence (not a failure
5936
+ * — the gate caller decides whether to require deploy validation).
5937
+ */
5938
+ declare function wranglerDeployRunner(input: WranglerDeployRunnerInput): DeployRunner;
5910
5939
 
5911
5940
  /**
5912
5941
  * Keyword-coverage judge — baseline complement to the semantic concept
@@ -6220,6 +6249,936 @@ declare function compareReferenceReplay(baseline: ReferenceReplayScore, candidat
6220
6249
  declare function decideReferenceReplayPromotion(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
6221
6250
  declare function defaultReferenceReplayMatcher(reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate): ReferenceMatchResult;
6222
6251
 
6252
+ /**
6253
+ * Paper-grade paired statistics for held-out promotion gates.
6254
+ *
6255
+ * The promotion gate (`HeldOutGate`) needs three things:
6256
+ *
6257
+ * 1. A bootstrap confidence interval on the per-item paired delta
6258
+ * (`pairedBootstrap`). Median delta is the headline number; the
6259
+ * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
6260
+ * 2. A non-parametric significance test on the paired deltas
6261
+ * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
6262
+ * paper-style name).
6263
+ * 3. False-discovery-rate correction across simultaneously-tested
6264
+ * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
6265
+ *
6266
+ * Why a separate file: every existing primitive lives in `statistics.ts`
6267
+ * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
6268
+ * paired-only, paper-grade, and load-bearing for the promotion gate.
6269
+ * Putting it in `statistics.ts` would require editing that file;
6270
+ * the brief forbids that. New file, new exports, existing surface unchanged.
6271
+ */
6272
+ interface PairedBootstrapResult {
6273
+ /** Number of paired observations (unequal-length inputs are rejected, not truncated). */
6274
+ n: number;
6275
+ /** Median of paired deltas (after − before). */
6276
+ median: number;
6277
+ /** Mean of paired deltas. */
6278
+ mean: number;
6279
+ /** Lower bound of the bootstrap CI on the chosen statistic (median delta by default). */
6280
+ low: number;
6281
+ /** Upper bound of the bootstrap CI on the chosen statistic (median delta by default). */
6282
+ high: number;
6283
+ /** Confidence level used (e.g. 0.95). */
6284
+ confidence: number;
6285
+ /** Number of bootstrap resamples used. */
6286
+ resamples: number;
6287
+ }
6288
+ interface PairedBootstrapOptions {
6289
+ /** Confidence level. Default 0.95. */
6290
+ confidence?: number;
6291
+ /** Bootstrap resample count. Default 2000. */
6292
+ resamples?: number;
6293
+ /** Statistic to bootstrap. Default 'median'. */
6294
+ statistic?: 'median' | 'mean';
6295
+ /** Deterministic seed. If omitted, uses Math.random(). */
6296
+ seed?: number;
6297
+ }
6298
+ /**
6299
+ * Paired bootstrap on (after - before) deltas. Returns a CI on the
6300
+ * chosen statistic (median by default). Pairs are resampled with
6301
+ * replacement. The lower bound is what the promotion gate checks: if
6302
+ * `low > pairedDeltaThreshold`, the gain is real at the chosen
6303
+ * confidence level.
6304
+ *
6305
+ * Throws on unequal sample sizes — caller must align pairs upstream.
6306
+ */
6307
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
6308
+ /**
6309
+ * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
6310
+ * paired deltas is the standard non-parametric significance test for
6311
+ * "candidate beats baseline on matched items." Use alongside the
6312
+ * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
6313
+ */
6314
+ declare function pairedWilcoxon(before: number[], after: number[]): {
6315
+ w: number;
6316
+ p: number;
6317
+ };
6318
+ /**
6319
+ * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
6320
+ * across multiple candidate-vs-baseline comparisons run in the same
6321
+ * promotion sweep. Returns BH-adjusted q-values and significance at
6322
+ * the requested FDR (default 0.05).
6323
+ */
6324
+ declare function bhAdjust(pValues: number[], fdr?: number): {
6325
+ qValues: number[];
6326
+ significant: boolean[];
6327
+ };
6328
+
6329
+ /**
6330
+ * Paper-grade RunRecord schema + runtime validator.
6331
+ *
6332
+ * Every run that participates in a promotion gate, paper table, or
6333
+ * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
6334
+ * fields are exactly those the paper "Two Loops, Three Roles" requires
6335
+ * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
6336
+ * holdout split tag and either a `searchScore` or a `holdoutScore`.
6337
+ *
6338
+ * This is intentionally NOT a replacement for the rich `Run` /
6339
+ * `ProposeReviewReport` / `ScenarioResult` types already in the
6340
+ * package. Those are runtime structures with full provenance. A
6341
+ * `RunRecord` is the analysis-time projection — the JSON-friendly
6342
+ * row you'd put in a parquet file or paste into a notebook.
6343
+ *
6344
+ * Validate at the boundary:
6345
+ *
6346
+ * const rec = validateRunRecord(rawJson) // throws on missing
6347
+ * const ok = isRunRecord(rawJson) // boolean check
6348
+ * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }
6349
+ *
6350
+ * The validator runs in pure TS — zod is intentionally NOT a
6351
+ * dependency. Round-trip tested in `tests/run-record.test.ts`.
6352
+ */
6353
+ /** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
6354
+ * combined train+test pool that the optimizer is allowed to read. */
6355
+ type RunSplitTag = 'search' | 'dev' | 'holdout';
6356
+ interface RunTokenUsage {
6357
+ input: number;
6358
+ output: number;
6359
+ cached?: number;
6360
+ }
6361
+ interface RunJudgeMetadata {
6362
+ model: string;
6363
+ promptVersion: string;
6364
+ /** [0,1] confidence the judge declared. Constant judge confidence
6365
+ * across many runs is a fallback signal (see `canary.ts`). */
6366
+ confidence: number;
6367
+ /** True if the judge degraded to a fallback path (rules-only,
6368
+ * prior-call cache, etc.). The canary uses this to alert. */
6369
+ fallback: boolean;
6370
+ }
6371
+ interface RunOutcome {
6372
+ /** Score on the search/optimization split. Optional because a
6373
+ * holdout-only evaluation only fills `holdoutScore`. */
6374
+ searchScore?: number;
6375
+ /** Score on the held-out split. Optional because a search-only run
6376
+ * only fills `searchScore`. At least one must be present. */
6377
+ holdoutScore?: number;
6378
+ /** Bag of any other metric the run produced — judge dimensions,
6379
+ * pass/fail counters, latency stats, etc. Numeric only — keeps
6380
+ * reporters honest. */
6381
+ raw: Record<string, number>;
6382
+ }
6383
+ /**
6384
+ * Mandatory paper-grade fields for a single evaluation run. Optional
6385
+ * fields are extension points; mandatory fields throw if missing.
6386
+ *
6387
+ * Hash discipline:
6388
+ * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
6389
+ * model (after any steering bundle merge).
6390
+ * - `configHash` is the sha256 of the effective run config (model,
6391
+ * temperature, tools, judges, splits). The pair (promptHash,
6392
+ * configHash) uniquely identifies an experimental cell.
6393
+ *
6394
+ * Model snapshot discipline:
6395
+ * - `model` MUST encode a snapshot version. Bare aliases like
6396
+ * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
6397
+ * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
6398
+ */
6399
+ interface RunRecord {
6400
+ /** UUID for the run. */
6401
+ runId: string;
6402
+ /** Logical experiment grouping (a treatment vs a baseline within
6403
+ * the same sweep should share `experimentId`). */
6404
+ experimentId: string;
6405
+ /** Stable identifier for the candidate (variant) being run. The
6406
+ * promotion gate compares two `candidateId`s on matched items. */
6407
+ candidateId: string;
6408
+ /** RNG seed for the run. Always recorded — silent re-seeding is
6409
+ * the most common cause of non-reproducible numbers. */
6410
+ seed: number;
6411
+ /** Model identifier WITH snapshot version. */
6412
+ model: string;
6413
+ /** sha256 of the effective prompt (post-steering). */
6414
+ promptHash: string;
6415
+ /** sha256 of the effective config. */
6416
+ configHash: string;
6417
+ /** Git SHA the harness was run from. */
6418
+ commitSha: string;
6419
+ /** End-to-end wall-clock duration in milliseconds. */
6420
+ wallMs: number;
6421
+ /** Time spent queued before execution started, if known. */
6422
+ queueMs?: number;
6423
+ /** Total USD cost. Mandatory — runs without a cost number are
6424
+ * unbounded by definition and must not be admitted into the gate. */
6425
+ costUsd: number;
6426
+ /** Token usage breakdown. */
6427
+ tokenUsage: RunTokenUsage;
6428
+ /** Judge-side metadata, if a judge was used. */
6429
+ judgeMetadata?: RunJudgeMetadata;
6430
+ /** Per-split scores + raw bag. */
6431
+ outcome: RunOutcome;
6432
+ /** Categorical failure tag, when the run failed and the harness
6433
+ * classified it. Free-form string; standard tags live in
6434
+ * `failure-taxonomy.ts`. */
6435
+ failureMode?: string;
6436
+ /** Which split this run was drawn from. */
6437
+ splitTag: RunSplitTag;
6438
+ }
6439
+ declare class RunRecordValidationError extends Error {
6440
+ readonly path: string;
6441
+ constructor(message: string, path?: string);
6442
+ }
6443
+ /**
6444
+ * Strict validator. Throws `RunRecordValidationError` on the first
6445
+ * missing or wrongly-typed field. Returns the input cast to
6446
+ * `RunRecord` on success — the validator does not coerce.
6447
+ */
6448
+ declare function validateRunRecord(input: unknown): RunRecord;
6449
+ /** Boolean validator — convenience for filtering arrays. */
6450
+ declare function isRunRecord(input: unknown): input is RunRecord;
6451
+ /** Non-throwing validator — returns a discriminated union. */
6452
+ declare function parseRunRecordSafe(input: unknown): {
6453
+ ok: true;
6454
+ value: RunRecord;
6455
+ } | {
6456
+ ok: false;
6457
+ error: RunRecordValidationError;
6458
+ };
6459
+ /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
6460
+ declare function roundTripRunRecord(record: RunRecord): RunRecord;
6461
+
6462
+ /**
6463
+ * HeldOutGate — first-class held-out paired-delta promotion gate.
6464
+ *
6465
+ * Encodes the "honesty override" pattern that lived inline in
6466
+ * `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
6467
+ * The optimizer's best-guess is one thing; what we should actually
6468
+ * ship is another. The gate is the line between them.
6469
+ *
6470
+ * A candidate is promoted iff ALL three pass:
6471
+ *
6472
+ * 1. **Productive runs**: the candidate has at least
6473
+ * `minProductiveRuns` paired observations on items where BOTH
6474
+ * candidate and baseline produced a real (non-silent) score.
6475
+ * 2. **Paired delta**: the lower bound of the bootstrap CI on the
6476
+ * median per-item delta (candidate − baseline) on the HOLDOUT
6477
+ * split is strictly greater than `pairedDeltaThreshold`.
6478
+ * 3. **Overfit gap**: the candidate's (search − holdout) score
6479
+ * gap does not exceed the baseline's (search − holdout) gap
6480
+ * by more than `overfitGapThreshold`.
6481
+ * "Better on search, worse on holdout" is the canonical
6482
+ * overfit pattern; this catches it.
6483
+ *
6484
+ * The decision carries a machine-readable `rejectionCode` plus an
6485
+ * `evidence` block with every number the gate looked at, so the
6486
+ * downstream researcher / paper / dashboard can re-derive the
6487
+ * verdict without re-running.
6488
+ *
6489
+ * See also:
6490
+ * - `src/paired-stats.ts` for `pairedBootstrap` + `pairedWilcoxon`
6491
+ * - `src/run-record.ts` for the input row schema
6492
+ * - `src/reference-replay.ts` for the older, reference-replay-
6493
+ * specific promotion path (still useful for replay-style evals).
6494
+ */
6495
+
6496
+ type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
6497
+ interface HeldOutGateConfig {
6498
+ /** Minimum number of paired (candidate, baseline) holdout observations
6499
+ * required before the gate will even consider promoting. Default 3. */
6500
+ minProductiveRuns?: number;
6501
+ /** The bootstrap-CI lower bound on the median paired holdout delta
6502
+ * must exceed this to promote. Default 0. */
6503
+ pairedDeltaThreshold?: number;
6504
+ /** Maximum allowed worsening of (search − holdout) gap relative to
6505
+ * baseline. Default 0.15 (i.e. candidate may overfit by up to 15
6506
+ * absolute score points more than baseline before rejection). */
6507
+ overfitGapThreshold?: number;
6508
+ /** Stable label of the baseline candidate. Required — paper-grade
6509
+ * evaluation never compares two unlabelled candidates. */
6510
+ baselineKey: string;
6511
+ /** Confidence level for the bootstrap CI. Default 0.95. */
6512
+ confidence?: number;
6513
+ /** Bootstrap resamples. Default 2000. */
6514
+ bootstrapResamples?: number;
6515
+ /** Optional deterministic seed for the bootstrap. Default undefined
6516
+ * (Math.random). */
6517
+ seed?: number;
6518
+ }
6519
+ interface GateEvidence {
6520
+ /** Number of paired (candidate, baseline) holdout observations used. */
6521
+ productiveRuns: number;
6522
+ /** Median of (candidate − baseline) paired holdout deltas. */
6523
+ medianPairedDelta: number;
6524
+ /** Bootstrap CI on the median paired holdout delta. */
6525
+ pairedCI: {
6526
+ low: number;
6527
+ high: number;
6528
+ };
6529
+ /** Wilcoxon signed-rank p-value on the paired holdout deltas. */
6530
+ pairedPValue: number;
6531
+ /** Mean candidate score on the search split (NaN if none). */
6532
+ searchScore: number;
6533
+ /** Mean candidate score on the holdout split (NaN if none). */
6534
+ holdoutScore: number;
6535
+ /** Candidate (search − holdout) gap. */
6536
+ overfitGap: number;
6537
+ /** Baseline (search − holdout) gap. */
6538
+ baselineOverfitGap: number;
6539
+ }
6540
+ interface GateDecision {
6541
+ /** Final promote/no-promote verdict. */
6542
+ promote: boolean;
6543
+ /** The candidate that was evaluated. */
6544
+ candidateId: string;
6545
+ /** The baseline it was compared against. */
6546
+ baselineId: string;
6547
+ /** Every number the gate looked at, for audit + paper export. */
6548
+ evidence: GateEvidence;
6549
+ /** Human-readable reason. */
6550
+ reason: string;
6551
+ /** Machine-readable rejection code, or null on promote. */
6552
+ rejectionCode: HeldOutGateRejectionCode | null;
6553
+ }
6554
+ /**
6555
+ * Held-out paired-delta promotion gate. Construct once with config,
6556
+ * call `evaluate(candidateRuns, baselineRuns)` per (candidate,
6557
+ * baseline) pair. Stateless across calls.
6558
+ */
6559
+ declare class HeldOutGate {
6560
+ private readonly minProductiveRuns;
6561
+ private readonly pairedDeltaThreshold;
6562
+ private readonly overfitGapThreshold;
6563
+ private readonly baselineKey;
6564
+ private readonly confidence;
6565
+ private readonly resamples;
6566
+ private readonly seed?;
6567
+ constructor(config: HeldOutGateConfig);
6568
+ /** Decide whether `candidate` should replace `baseline`. Pairing
6569
+ * is by (experimentId, seed) — identical experiment + seed pairs
6570
+ * the candidate run with the matching baseline run. Pairs without
6571
+ * a holdout score on both sides are dropped. */
6572
+ evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
6573
+ }
6574
+
6575
+ /**
6576
+ * Researcher interface — stable hook for an external autonomous-research
6577
+ * agent to drive the meta-loop.
6578
+ *
6579
+ * Implementations live downstream (typically in a private repo that
6580
+ * runs the actual LLM). This package ships only the contract + a
6581
+ * `NoopResearcher` so consumers can wire the surface without being
6582
+ * forced to implement every method up front.
6583
+ *
6584
+ * The four methods mirror the four stages of the paper "Two Loops,
6585
+ * Three Roles":
6586
+ *
6587
+ * inspectFailures — given the observed runs, what failure modes
6588
+ * are present? (data → diagnosis)
6589
+ * proposeChange — given diagnosed failure modes, what
6590
+ * structural changes should we try?
6591
+ * (diagnosis → plan delta)
6592
+ * applyChange — fold the proposed deltas into a concrete
6593
+ * experiment plan against an existing baseline.
6594
+ * (plan delta → executable plan)
6595
+ * evaluateChange — run the plan, return runs + the gate verdict.
6596
+ * (executable plan → verdict)
6597
+ *
6598
+ * Composition is the discipline: a Researcher implementation MUST
6599
+ * keep these four steps separate and inspectable. Conflating
6600
+ * "diagnose + propose + run" into a single LLM call defeats the
6601
+ * point of the framework — you can't audit which step lied.
6602
+ *
6603
+ * THIS INTERFACE IS STABLE. Breaking changes require a new module
6604
+ * (e.g. `Researcher2`) so existing implementations keep working.
6605
+ */
6606
+
6607
+ /** A diagnosed failure mode with the run-IDs that exhibit it. */
6608
+ interface FailureMode {
6609
+ /** Short machine-readable code. Must be stable across runs of the
6610
+ * same researcher to enable longitudinal tracking. */
6611
+ code: string;
6612
+ /** Human-readable description for the paper / dashboard. */
6613
+ description: string;
6614
+ evidence: {
6615
+ /** Run IDs (from `RunRecord.runId`) where this failure mode was
6616
+ * observed. */
6617
+ runIds: string[];
6618
+ /** Number of run samples that informed the diagnosis. */
6619
+ samples: number;
6620
+ };
6621
+ }
6622
+ /** A single steering change the researcher wants to try. */
6623
+ interface SteeringChange {
6624
+ kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
6625
+ /** Implementation-specific payload. Researcher implementations
6626
+ * define the schema — keep this `unknown` here to avoid coupling
6627
+ * the public interface to any one researcher's internal model. */
6628
+ payload: unknown;
6629
+ /** Why the researcher proposed this change. Goes into the audit
6630
+ * trail next to the failure-mode evidence. */
6631
+ rationale: string;
6632
+ /** Optional self-reported expected delta on the headline metric. */
6633
+ expectedDelta?: number;
6634
+ }
6635
+ /** A single experiment plan, mapped onto the search/holdout splits. */
6636
+ interface ExperimentPlan {
6637
+ baselineCandidateId: string;
6638
+ proposedCandidateId: string;
6639
+ changes: SteeringChange[];
6640
+ /** USD ceiling for the entire experiment. The runner must stop
6641
+ * before exceeding this and report a partial result. */
6642
+ evaluationBudgetUsd: number;
6643
+ /** Item IDs (your dataset keys) for the search vs holdout splits. */
6644
+ splits: {
6645
+ search: string[];
6646
+ holdout: string[];
6647
+ };
6648
+ }
6649
+ /** Result of running a plan: every run, plus the gate verdict. */
6650
+ interface ExperimentResult {
6651
+ plan: ExperimentPlan;
6652
+ runs: RunRecord[];
6653
+ gateDecision: GateDecision;
6654
+ }
6655
+ /**
6656
+ * The researcher loop. Stable, four-step, inspectable.
6657
+ *
6658
+ * ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
6659
+ * │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
6660
+ * └──────────┘ └──────────┘ └────┬─────┘
6661
+ * │
6662
+ * ▼
6663
+ * ┌────────────────┐ applyChange ┌────────┐
6664
+ * │ ExperimentPlan │ ◀────────────│ base │
6665
+ * └────────┬───────┘ └────────┘
6666
+ * │
6667
+ * evaluateChange ▼
6668
+ * ┌────────────────┐
6669
+ * │ ExperimentResult│
6670
+ * └────────────────┘
6671
+ */
6672
+ interface Researcher {
6673
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
6674
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
6675
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
6676
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
6677
+ }
6678
+ /**
6679
+ * No-op researcher — fails loud on every method. Use as a placeholder
6680
+ * in code paths that wire the interface but don't have an implementation
6681
+ * yet. Importantly, this does NOT silently succeed: a no-op researcher
6682
+ * that returned empty arrays would muffle the loop's signal that
6683
+ * nobody implemented the brain.
6684
+ */
6685
+ declare class NoopResearcher implements Researcher {
6686
+ private readonly hint;
6687
+ constructor(hint?: string);
6688
+ inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
6689
+ proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
6690
+ applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
6691
+ evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
6692
+ }
6693
+
6694
+ /**
6695
+ * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
6696
+ * than replacing it.
6697
+ *
6698
+ * Three artefacts:
6699
+ *
6700
+ * - `summaryTable` Markdown table of per-candidate means,
6701
+ * 95% bootstrap CIs, BH-adjusted Wilcoxon
6702
+ * p-values, and Cohen's d versus a
6703
+ * comparator candidate.
6704
+ * - `paretoChart` Abstract spec for a cost vs quality
6705
+ * scatter, with gate decisions overlaid.
6706
+ * Returns numbers + labels — caller
6707
+ * chooses the plotting library.
6708
+ * - `gainHistogram`
6709
+ * Per-item paired holdout deltas as a
6710
+ * histogram spec (bins + counts + median +
6711
+ * CI). Same "data, not images" contract.
6712
+ *
6713
+ * The figure types are PlotSpecs — JSON-friendly, library-agnostic.
6714
+ * They aren't React components and they aren't PNGs; they are
6715
+ * what you'd hand to vega-lite, plotly, matplotlib, or your own
6716
+ * Canvas renderer to draw the actual figure.
6717
+ */
6718
+
6719
+ interface SummaryTableOptions {
6720
+ /** Comparator candidate id. Wilcoxon + Cohen's d are computed
6721
+ * versus this candidate. Required for paired stats columns. */
6722
+ comparator?: string;
6723
+ /** Which split to read scores from. Default 'holdout'. */
6724
+ split?: 'search' | 'holdout';
6725
+ /** Confidence level for the bootstrap CI on the mean. Default 0.95. */
6726
+ confidence?: number;
6727
+ /** FDR for BH adjustment of the comparison p-values. Default 0.05. */
6728
+ fdr?: number;
6729
+ }
6730
+ interface SummaryTableRow {
6731
+ candidateId: string;
6732
+ n: number;
6733
+ mean: number;
6734
+ ciLow: number;
6735
+ ciHigh: number;
6736
+ /** BH-adjusted q-value vs comparator. NaN if no comparator. */
6737
+ qValue: number;
6738
+ /** Cohen's d vs comparator. NaN if no comparator. */
6739
+ cohensD: number;
6740
+ }
6741
+ interface SummaryTable {
6742
+ rows: SummaryTableRow[];
6743
+ comparator: string | null;
6744
+ split: 'search' | 'holdout';
6745
+ /** Pre-rendered markdown — drop into a paper or PR. */
6746
+ markdown: string;
6747
+ }
6748
+ /**
6749
+ * Table 1 helper. Buckets runs by `candidateId`, computes mean +
6750
+ * bootstrap CI on the chosen split, and (when a comparator is given)
6751
+ * BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
6752
+ */
6753
+ declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
6754
+ interface ParetoPoint {
6755
+ candidateId: string;
6756
+ /** Mean USD cost per run on the chosen split. */
6757
+ cost: number;
6758
+ /** Mean score on the chosen split. */
6759
+ quality: number;
6760
+ /** Number of runs that informed this point. */
6761
+ n: number;
6762
+ /** Whether this candidate is on the Pareto frontier — high
6763
+ * quality, low cost, no dominator. */
6764
+ onFrontier: boolean;
6765
+ /** Optional gate verdict for this candidate, if a `GateDecision`
6766
+ * for it was passed in. */
6767
+ gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
6768
+ }
6769
+ interface ParetoFigureSpec {
6770
+ kind: 'pareto-cost-quality';
6771
+ split: 'search' | 'holdout';
6772
+ points: ParetoPoint[];
6773
+ axes: {
6774
+ x: 'costUsd';
6775
+ y: 'score';
6776
+ };
6777
+ }
6778
+ /**
6779
+ * Cost vs quality scatter spec. `gateDecisions` is keyed by
6780
+ * candidate id; if present, every point picks up the gate verdict
6781
+ * for overlay.
6782
+ */
6783
+ declare function paretoChart(runs: RunRecord[], opts?: {
6784
+ split?: 'search' | 'holdout';
6785
+ gateDecisions?: Record<string, GateDecision>;
6786
+ }): ParetoFigureSpec;
6787
+ interface GainDistributionBin {
6788
+ /** Inclusive lower edge. */
6789
+ lo: number;
6790
+ /** Exclusive upper edge (or inclusive if it's the last bin). */
6791
+ hi: number;
6792
+ /** Number of pairs whose delta lands in this bin. */
6793
+ count: number;
6794
+ }
6795
+ interface GainDistributionFigureSpec {
6796
+ kind: 'gain-distribution';
6797
+ candidateId: string;
6798
+ comparator: string;
6799
+ split: 'search' | 'holdout';
6800
+ /** Number of pairs used. */
6801
+ n: number;
6802
+ bins: GainDistributionBin[];
6803
+ median: number;
6804
+ ci: {
6805
+ low: number;
6806
+ high: number;
6807
+ };
6808
+ }
6809
+ interface GainDistributionOptions {
6810
+ /** Number of histogram bins. Default 11 (so the centre is exact at 0). */
6811
+ bins?: number;
6812
+ /** Which split to use. Default 'holdout'. */
6813
+ split?: 'search' | 'holdout';
6814
+ /** Confidence level for the CI. Default 0.95. */
6815
+ confidence?: number;
6816
+ /** Bootstrap resamples. Default 2000. */
6817
+ resamples?: number;
6818
+ /** Deterministic seed. */
6819
+ seed?: number;
6820
+ }
6821
+ /**
6822
+ * Held-out improvement distribution: per-pair delta (candidate −
6823
+ * comparator), histogrammed. Includes the bootstrap CI on the median
6824
+ * delta — same primitive the promotion gate uses.
6825
+ */
6826
+ declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
6827
+
6828
+ /**
6829
+ * Liveness canaries — cheap statistical checks that catch the failure
6830
+ * modes a green test suite never sees.
6831
+ *
6832
+ * Three canary types in this module:
6833
+ *
6834
+ * 1. **Silent judge fallback** — the judge degraded to a fallback
6835
+ * path (rules-only / cached / heuristic) without anyone
6836
+ * noticing. Signature: a string of consecutive runs whose
6837
+ * `judgeMetadata.confidence` equals a known fallback constant
6838
+ * (default 0.30) OR whose `judgeMetadata.fallback` is true.
6839
+ *
6840
+ * 2. **Judge calibration drift** — the judge's confidence
6841
+ * distribution has drifted from a historical window. Two-sample
6842
+ * Kolmogorov-Smirnov test on the recent vs historical confidences,
6843
+ * with the empirical-CDF max-difference statistic.
6844
+ *
6845
+ * 3. **Eval-set distribution shift** — the mix of categories /
6846
+ * buckets in the recent runs differs significantly from the
6847
+ * historical mix. Chi-square test on the binned counts.
6848
+ *
6849
+ * Outputs are alerts. The canary does NOT fail loud the way a test
6850
+ * does — failing tests are reserved for hard correctness violations.
6851
+ * A canary that fires is a *signal* to investigate, not a verdict.
6852
+ *
6853
+ * Why this lives here rather than in `observability.ts`: that module
6854
+ * already exists and is purely a fan-out adapter to
6855
+ * Langfuse/Prometheus. Canaries are statistical detectors, not adapters.
6856
+ */
6857
+
6858
+ type CanaryKind = 'silent_judge_fallback' | 'judge_calibration_drift' | 'distribution_shift';
6859
+ type CanarySeverity = 'info' | 'warn' | 'error';
6860
+ interface CanaryAlert {
6861
+ kind: CanaryKind;
6862
+ severity: CanarySeverity;
6863
+ message: string;
6864
+ /** Numbers that informed the decision — drop straight into a
6865
+ * dashboard / paper figure. */
6866
+ evidence: Record<string, unknown>;
6867
+ }
6868
+ interface CanaryReport {
6869
+ alerts: CanaryAlert[];
6870
+ /** Per-kind summary count. */
6871
+ counts: Record<CanaryKind, number>;
6872
+ }
6873
+ interface CanaryOptions {
6874
+ /**
6875
+ * Silent-fallback detection.
6876
+ * - `constant`: confidence value treated as the fallback signal.
6877
+ * Default 0.30 (matches the soft-fail default in
6878
+ * `propose-review.ts`).
6879
+ * - `consecutiveThreshold`: trip the alert after this many
6880
+ * consecutive runs at `constant` (or `fallback === true`).
6881
+ * Default 3.
6882
+ */
6883
+ silentFallback?: {
6884
+ constant?: number;
6885
+ consecutiveThreshold?: number;
6886
+ /** Floating-point tolerance when comparing against `constant`. */
6887
+ epsilon?: number;
6888
+ };
6889
+ /**
6890
+ * Calibration-drift detection.
6891
+ * - `historyWindow`: number of past runs (oldest-first) treated as
6892
+ * the historical baseline. Default 50.
6893
+ * - `recentWindow`: number of recent runs (newest-first) compared
6894
+ * against history. Default 20.
6895
+ * - `ksAlpha`: alpha for the KS statistic vs critical value.
6896
+ * Default 0.05.
6897
+ * - `minRecent`: minimum recent runs required to even attempt the
6898
+ * check. Default 10.
6899
+ */
6900
+ calibrationDrift?: {
6901
+ historyWindow?: number;
6902
+ recentWindow?: number;
6903
+ ksAlpha?: number;
6904
+ minRecent?: number;
6905
+ };
6906
+ /**
6907
+ * Distribution-shift detection.
6908
+ * - `category`: function that maps a run to a categorical bucket.
6909
+ * Required to enable this canary; if omitted the chi-square check
6910
+ * is skipped entirely.
6911
+ * - `chiSquareAlpha`: alpha. Default 0.05.
6912
+ * - `historyWindow`, `recentWindow`, `minRecent`: same meaning as in `calibrationDrift`.
6913
+ */
6914
+ distributionShift?: {
6915
+ category: (run: RunRecord) => string | null;
6916
+ chiSquareAlpha?: number;
6917
+ historyWindow?: number;
6918
+ recentWindow?: number;
6919
+ minRecent?: number;
6920
+ };
6921
+ }
6922
+ /**
6923
+ * Run all configured canaries against a chronological run list.
6924
+ * Runs MUST be sorted oldest-to-newest by the caller — the order of
6925
+ * the input is used to define "recent" vs "historical" windows.
6926
+ */
6927
+ declare function runCanaries(runs: RunRecord[], opts?: CanaryOptions): CanaryReport;
6928
+
6929
+ /**
6930
+ * Shared types for the reference benchmark wrappers under
6931
+ * `src/benchmarks/`. Each wrapper exports the three functions in
6932
+ * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.
6933
+ */
6934
+
6935
+ interface BenchmarkDatasetItem<TPayload = unknown> {
6936
+ /** Stable dataset-local item id (used for split assignment + paper
6937
+ * references). Unique within a benchmark. */
6938
+ id: string;
6939
+ /** Free-form payload. Each benchmark defines its own shape. */
6940
+ payload: TPayload;
6941
+ }
6942
+ interface BenchmarkEvaluation {
6943
+ /** [0, 1] score for the response on this item. Exact-match
6944
+ * benchmarks use 0/1; partial-credit benchmarks may return
6945
+ * fractional values. */
6946
+ score: number;
6947
+ /** Bag of raw scoring signals (required field; keys are free-form) — e.g. parsed numeric
6948
+ * answer, regex match, judge sub-scores. */
6949
+ raw: Record<string, unknown>;
6950
+ }
6951
+ /** Common signature implemented by every adapter under `src/benchmarks/*`. */
6952
+ interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {
6953
+ /** Load the dataset for the given split. May hit the network on
6954
+ * first call but should be cache-friendly. Adapters that don't
6955
+ * ship the dataset itself MUST throw a clearly-marked error
6956
+ * pointing the caller at the loader script. */
6957
+ loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>;
6958
+ /** Score a single response. Pure with respect to the inputs. */
6959
+ evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>;
6960
+ /** Deterministic split assignment via item id hashing. The
6961
+ * fraction of items in each split is implementation-defined but
6962
+ * MUST be stable across processes and platforms. */
6963
+ assignSplit(itemId: string): RunSplitTag;
6964
+ }
6965
+ /** Split-assignment seed shared across all benchmarks. Bumping this
6966
+ * value reshuffles every split — do NOT do that lightly. */
6967
+ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
6968
+ /**
6969
+ * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a
6970
+ * stable 32-bit hash of `${seed}::${id}`. Default proportions:
6971
+ *
6972
+ * search: 60% (optimization-readable)
6973
+ * dev: 20% (held-out for tuning, leak-on-purpose during dev)
6974
+ * holdout: 20% (paper-grade held-out, gated reads)
6975
+ */
6976
+ declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
6977
+
6978
+ /**
6979
+ * GSM8K wrapper — exact-match grading on the final numeric answer.
6980
+ *
6981
+ * The dataset itself is NOT bundled. `loadDataset` will:
6982
+ * 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
6983
+ * file with `{ id, question, answer }` records — the standard
6984
+ * HF mirror layout converted to JSONL);
6985
+ * 2. otherwise throw a clearly-marked error pointing to the loader.
6986
+ *
6987
+ * `evaluate` parses the final number out of the response (last
6988
+ * occurrence of a signed-decimal-or-integer literal, optionally after
6989
+ * `####`, the GSM8K answer convention) and compares to the ground-
6990
+ * truth value. Floating-point comparisons use a 1e-6 tolerance.
6991
+ */
6992
+
6993
+ interface Gsm8kPayload {
6994
+ question: string;
6995
+ /** Reference answer, post-#### normalization. Always carried as a
6995
+ * numeric string ("72", "1.5"). */
6997
+ answer: string;
6998
+ }
6999
+ type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
7000
+ declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
7001
+ loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
7002
+ evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
7003
+ assignSplit(itemId: string): RunSplitTag;
7004
+ }
7005
+ /**
7006
+ * Parse a GSM8K-style answer. Honors the dataset's `#### N`
7007
+ * convention (the canonical answer comes after `####`); otherwise
7008
+ * returns the LAST signed numeric literal in the string.
7009
+ */
7010
+ declare function parseGsm8kAnswer(text: string): number | null;
7011
+ declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
7012
+ declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
7013
+ declare const assignSplit$2: (itemId: string) => RunSplitTag;
7014
+
7015
+ type index$3_Gsm8kAdapter = Gsm8kAdapter;
7016
+ declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
7017
+ type index$3_Gsm8kItem = Gsm8kItem;
7018
+ type index$3_Gsm8kPayload = Gsm8kPayload;
7019
+ declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
7020
+ declare namespace index$3 {
7021
+ export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
7022
+ }
7023
+
7024
+ /**
7025
+ * SWE-Bench Lite wrapper — 30-instance subset.
7026
+ *
7027
+ * Status: STUB. The actual SWE-Bench harness needs a Docker host and
7028
+ * is too heavy to ship inside this package. We expose the contract
7029
+ * (loadDataset, evaluate, assignSplit) so consumers can plug in their
7030
+ * own grader without touching call sites.
7031
+ *
7032
+ * Wire-up paths in priority order:
7033
+ *
7034
+ * 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
7035
+ * lite instances + per-instance metadata (instance_id,
7036
+ * problem_statement, base_commit, repo, FAIL_TO_PASS,
7037
+ * PASS_TO_PASS).
7038
+ * 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
7039
+ * that reads `{instance_id, patch}` JSON on stdin and writes
7040
+ * `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
7041
+ * JSON on stdout. Implementations can shell out to the
7042
+ * official `swebench` runner here.
7043
+ *
7044
+ * If neither is set, every public method throws a clearly-marked
7045
+ * "not implemented" error. The stub fails LOUD; it never silently
7046
+ * scores zero.
7047
+ */
7048
+
7049
+ interface SweBenchLitePayload {
7050
+ instanceId: string;
7051
+ problemStatement: string;
7052
+ baseCommit: string;
7053
+ repo: string;
7054
+ failToPass: string[];
7055
+ passToPass: string[];
7056
+ }
7057
+ type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
7058
+ declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
7059
+ loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
7060
+ evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
7061
+ assignSplit(itemId: string): RunSplitTag;
7062
+ }
7063
+ declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
7064
+ declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
7065
+ declare const assignSplit$1: (itemId: string) => RunSplitTag;
7066
+
7067
+ type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
7068
+ declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
7069
+ type index$2_SweBenchLiteItem = SweBenchLiteItem;
7070
+ type index$2_SweBenchLitePayload = SweBenchLitePayload;
7071
+ declare namespace index$2 {
7072
+ export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
7073
+ }
7074
+
7075
+ /**
7076
+ * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
7077
+ * deterministic, dependency-free benchmark for any router that maps a
7078
+ * natural-language request to one of a fixed set of route labels.
7079
+ *
7080
+ * Format (see `routing/README.md` for prose):
7081
+ *
7082
+ * {
7083
+ * id: stable per-task ID (matches across processes).
7084
+ * category: one of the four route labels.
7085
+ * prompt: the user-facing request the router must classify.
7086
+ * route: the ground-truth route the router should pick.
7087
+ * synonyms: other strings that count as a correct answer.
7088
+ * hardNegatives: close-but-wrong route labels — used to detect the
7089
+ * "always picks the popular route" failure mode.
7090
+ * }
7091
+ *
7092
+ * The four categories are intentionally cross-domain (file ops,
7093
+ * math, search, conversation) so a router that collapses to one
7094
+ * category is easy to spot.
7095
+ */
7096
+ interface RoutingItem {
7097
+ id: string;
7098
+ category: 'file' | 'math' | 'search' | 'chat';
7099
+ prompt: string;
7100
+ /** Canonical correct route label. */
7101
+ route: string;
7102
+ /** Alternate route labels that also count as correct. */
7103
+ synonyms: string[];
7104
+ /** Wrong-but-tempting route labels (for analysis, not grading). */
7105
+ hardNegatives: string[];
7106
+ }
7107
+ declare const ROUTING_DATASET: RoutingItem[];
7108
+
7109
+ /**
7110
+ * Routing benchmark — synthetic, dependency-free, ships in the
7111
+ * package. 16 cross-category items in `dataset.ts`. See
7112
+ * `routing/README.md` for the format.
7113
+ *
7114
+ * `evaluate` does case-insensitive exact match against the canonical
7115
+ * route plus declared synonyms. The first valid route token in the
7116
+ * response wins; everything else is ignored. Wrong answers also
7117
+ * report whether they hit a hard negative — useful when triaging
7118
+ * "always picks the popular route" failure modes.
7119
+ */
7120
+
7121
+ type RoutingPayload = RoutingItem;
7122
+ type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>;
7123
+ declare class RoutingAdapter implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload> {
7124
+ loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]>;
7125
+ evaluate(item: RoutingDatasetItem, response: string): Promise<BenchmarkEvaluation>;
7126
+ assignSplit(itemId: string): RunSplitTag;
7127
+ }
7128
+ /**
7129
+ * Pull route-shaped tokens out of a model response. Routes look like
7130
+ * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics
7131
+ * are not routes, but `category.action` patterns are robust to most
7132
+ * model wrappers (JSON output, prose explanations, code fences).
7133
+ */
7134
+ declare function extractRouteTokens(response: string): string[];
7135
+ declare const loadDataset: (split: RunSplitTag) => Promise<RoutingDatasetItem[]>;
7136
+ declare const evaluate: (item: RoutingDatasetItem, response: string) => Promise<BenchmarkEvaluation>;
7137
+ declare const assignSplit: (itemId: string) => RunSplitTag;
7138
+
7139
+ declare const index$1_ROUTING_DATASET: typeof ROUTING_DATASET;
7140
+ type index$1_RoutingAdapter = RoutingAdapter;
7141
+ declare const index$1_RoutingAdapter: typeof RoutingAdapter;
7142
+ type index$1_RoutingDatasetItem = RoutingDatasetItem;
7143
+ type index$1_RoutingItem = RoutingItem;
7144
+ type index$1_RoutingPayload = RoutingPayload;
7145
+ declare const index$1_assignSplit: typeof assignSplit;
7146
+ declare const index$1_evaluate: typeof evaluate;
7147
+ declare const index$1_extractRouteTokens: typeof extractRouteTokens;
7148
+ declare const index$1_loadDataset: typeof loadDataset;
7149
+ declare namespace index$1 {
7150
+ export { index$1_ROUTING_DATASET as ROUTING_DATASET, index$1_RoutingAdapter as RoutingAdapter, type index$1_RoutingDatasetItem as RoutingDatasetItem, type index$1_RoutingItem as RoutingItem, type index$1_RoutingPayload as RoutingPayload, index$1_assignSplit as assignSplit, index$1_evaluate as evaluate, index$1_extractRouteTokens as extractRouteTokens, index$1_loadDataset as loadDataset };
7151
+ }
7152
+
7153
+ /**
7154
+ * Reference benchmark wrappers — entry point.
7155
+ *
7156
+ * Three benchmarks ship under `src/benchmarks/`:
7157
+ * - `gsm8k` — exact-match math reasoning (HF mirror,
7158
+ * dataset NOT bundled — see `gsm8k/index.ts`).
7159
+ * - `swebench-lite` — 30-instance SWE-Bench subset (STUB; needs
7160
+ * external grader).
7161
+ * - `routing` — synthetic 16-task router benchmark, ships
7162
+ * in the package.
7163
+ *
7164
+ * Every benchmark exposes the same three exports — `loadDataset`,
7165
+ * `evaluate`, `assignSplit` — and a typed adapter class. Pick the
7166
+ * import path that matches the benchmark.
7167
+ *
7168
+ * Shared types (`BenchmarkAdapter`, `BenchmarkDatasetItem`,
7169
+ * `BenchmarkEvaluation`, `deterministicSplit`, `BENCHMARK_SPLIT_SEED`)
7170
+ * live in `./types`.
7171
+ */
7172
+
7173
+ declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
7174
+ type index_BenchmarkAdapter<_TItem = unknown, TPayload = unknown> = BenchmarkAdapter<_TItem, TPayload>;
7175
+ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayload>;
7176
+ type index_BenchmarkEvaluation = BenchmarkEvaluation;
7177
+ declare const index_deterministicSplit: typeof deterministicSplit;
7178
+ declare namespace index {
7179
+ export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$3 as gsm8k, index$1 as routing, index$2 as swebenchLite };
7180
+ }
7181
+
6223
7182
  interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
6224
7183
  bundleForRun?: (run: ReferenceReplayRun<Input>) => SteeringBundle;
6225
7184
  scoreForCase?: (caseRun: ReferenceReplayCaseRun<Input>, run: ReferenceReplayRun<Input>) => RunScore;
@@ -7120,4 +8079,4 @@ interface ReflectionProposal {
7120
8079
  */
7121
8080
  declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
7122
8081
 
7123
- export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type 
D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type 
HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, OTEL_AGENT_EVAL_SCOPE, type Objective, type 
OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type 
ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type 
StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, 
createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, 
referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
8082
+ export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };