@tangle-network/agent-eval 0.17.1 → 0.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -513,1052 +513,1562 @@ declare function formatDriverReport(results: DriverResult[]): string;
513
513
  declare function printDriverSummary(results: DriverResult[]): void;
514
514
 
515
515
  /**
516
- * Normalize scores so all dimensions follow "higher = better".
517
- * Inverted dimensions (hallucination, false_confidence, worst_failure)
518
- * already use inverted scoring in the prompt (10 = no hallucination),
519
- * but this function ensures consistency if raw scores leak through.
520
- */
521
- declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
522
- /** Weighted mean — falls back to uniform weights when omitted */
523
- declare function weightedMean(scores: {
524
- score: number;
525
- weight?: number;
526
- }[]): number;
527
- /** Bootstrap confidence interval */
528
- declare function confidenceInterval(scores: number[], confidence?: number): {
529
- mean: number;
530
- lower: number;
531
- upper: number;
532
- };
533
- /**
534
- * Inter-rater reliability — simplified Krippendorff's alpha.
516
+ * TraceSchema v1 — the canonical data model for agent-eval.
535
517
  *
536
- * Each inner array is one judge's scores for all items.
537
- * All arrays must have the same length (same items scored).
538
- */
539
- declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
540
- /**
541
- * Mann-Whitney U test for comparing two independent groups.
542
- * Returns U statistic and approximate p-value (normal approximation).
543
- */
544
- declare function mannWhitneyU(a: number[], b: number[]): {
545
- u: number;
546
- p: number;
547
- };
548
- /** Partial credit: returns 0-1 ratio of current toward target */
549
- declare function partialCredit(current: number, target: number): number;
550
- /**
551
- * Paired t-test — before/after measurements on the SAME items.
552
- * Pairing removes inter-item variance, giving tighter significance than
553
- * an unpaired test when comparing prompt v1 vs prompt v2 on identical
554
- * scenarios.
555
- */
556
- declare function pairedTTest(before: number[], after: number[]): {
557
- t: number;
558
- df: number;
559
- p: number;
560
- };
561
- /**
562
- * Wilcoxon signed-rank test — paired non-parametric alternative.
563
- * Use when the differences aren't normally distributed.
518
+ * Every score, every failure class, every pipeline in the framework is
519
+ * a view over this data. Shape it once, live with it.
520
+ *
521
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
522
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
523
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
524
+ * entities that OTEL leaves as free-form attributes.
564
525
  */
565
- declare function wilcoxonSignedRank(before: number[], after: number[]): {
566
- w: number;
567
- p: number;
568
- };
526
+ declare const TRACE_SCHEMA_VERSION = "1.0.0";
527
+ type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
528
+ interface BudgetSpec {
529
+ tokens?: number;
530
+ wallMs?: number;
531
+ calls?: number;
532
+ usd?: number;
533
+ }
534
+ interface RunOutcome$1 {
535
+ score?: number;
536
+ pass?: boolean;
537
+ failureClass?: FailureClass;
538
+ notes?: string;
539
+ }
569
540
  /**
570
- * Cohen's d standardized effect size for two independent groups.
571
- * Positive d means group b has higher mean than group a.
572
- * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
541
+ * Layer — optional classification in a nested build workflow.
542
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
543
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
544
+ * `app-runtime`: a run of the generated agent against a domain scenario.
545
+ * `meta`: any meta-eval (judge replay, correlation analysis).
573
546
  */
574
- declare function cohensD(a: number[], b: number[]): number;
547
+ type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
548
+ interface Run$1 {
549
+ runId: string;
550
+ scenarioId: string;
551
+ variantId?: string;
552
+ datasetVersion?: string;
553
+ /** Git SHA of agent code at run time. */
554
+ codeSha?: string;
555
+ /** Hash of the prompt template + any system prompt. */
556
+ promptSha?: string;
557
+ /** Model id + date + system-prompt hash, concatenated. */
558
+ modelFingerprint?: string;
559
+ seed?: number;
560
+ /** Arbitrary environment markers (shell, docker version, tz). */
561
+ envFingerprint?: Record<string, string>;
562
+ /** Version of the redaction rules applied to this run. */
563
+ redactionVersion?: string;
564
+ /** Parent run in a nested build workflow. A builder run's children are
565
+ * app-build runs; those children are app-runtime runs. */
566
+ parentRunId?: string;
567
+ /** Stable project identifier — groups runs across chats + sessions. */
568
+ projectId?: string;
569
+ /** Chat/conversation identifier within a project. */
570
+ chatId?: string;
571
+ /** Layer classification — hint for aggregation; not enforced. */
572
+ layer?: RunLayer;
573
+ startedAt: number;
574
+ endedAt?: number;
575
+ status: RunStatus;
576
+ outcome?: RunOutcome$1;
577
+ budget?: BudgetSpec;
578
+ /** Free-form labels for downstream grouping. */
579
+ tags?: Record<string, string>;
580
+ }
581
+ type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
582
+ type SpanStatus = 'ok' | 'error';
583
+ interface SpanBase {
584
+ spanId: string;
585
+ parentSpanId?: string;
586
+ runId: string;
587
+ kind: SpanKind;
588
+ name: string;
589
+ startedAt: number;
590
+ endedAt?: number;
591
+ status?: SpanStatus;
592
+ error?: string;
593
+ /** Anything not covered by typed fields. Kept deliberately free-form. */
594
+ attributes?: Record<string, unknown>;
595
+ }
596
+ interface Message {
597
+ role: 'system' | 'user' | 'assistant' | 'tool';
598
+ content: string;
599
+ tokens?: number;
600
+ /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
601
+ images?: Array<{
602
+ artifactId?: string;
603
+ url?: string;
604
+ mime?: string;
605
+ }>;
606
+ }
607
+ interface LlmSpan extends SpanBase {
608
+ kind: 'llm';
609
+ model: string;
610
+ messages: Message[];
611
+ output?: string;
612
+ inputTokens?: number;
613
+ outputTokens?: number;
614
+ cachedTokens?: number;
615
+ reasoningTokens?: number;
616
+ costUsd?: number;
617
+ finishReason?: string;
618
+ }
619
+ interface ToolSpan extends SpanBase {
620
+ kind: 'tool';
621
+ toolName: string;
622
+ args: unknown;
623
+ result?: unknown;
624
+ latencyMs?: number;
625
+ }
626
+ interface RetrievalSpan extends SpanBase {
627
+ kind: 'retrieval';
628
+ query: string;
629
+ hits: Array<{
630
+ docId: string;
631
+ score: number;
632
+ content?: string;
633
+ }>;
634
+ }
635
+ interface JudgeSpan extends SpanBase {
636
+ kind: 'judge';
637
+ judgeId: string;
638
+ /** Span this judgment applies to. */
639
+ targetSpanId: string;
640
+ dimension: string;
641
+ /** Numeric score (free-range; interpretation up to the judge). */
642
+ score: number;
643
+ rationale?: string;
644
+ evidence?: string;
645
+ }
646
+ interface SandboxSpan extends SpanBase {
647
+ kind: 'sandbox';
648
+ image?: string;
649
+ command?: string;
650
+ exitCode?: number;
651
+ testsTotal?: number;
652
+ testsPassed?: number;
653
+ stdoutHash?: string;
654
+ stderrHash?: string;
655
+ /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
656
+ wallMs?: number;
657
+ }
658
+ interface GenericSpan extends SpanBase {
659
+ kind: 'agent' | 'custom';
660
+ }
661
+ type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
662
+ type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
663
+ interface TraceEvent {
664
+ eventId: string;
665
+ runId: string;
666
+ spanId?: string;
667
+ kind: EventKind;
668
+ timestamp: number;
669
+ payload: Record<string, unknown>;
670
+ }
671
+ interface BudgetLedgerEntry {
672
+ runId: string;
673
+ dimension: keyof BudgetSpec;
674
+ limit: number;
675
+ consumed: number;
676
+ remaining: number;
677
+ timestamp: number;
678
+ breached: boolean;
679
+ /** Span that triggered this entry, if any. */
680
+ spanId?: string;
681
+ }
682
+ interface Artifact$1 {
683
+ artifactId: string;
684
+ runId: string;
685
+ spanId?: string;
686
+ contentType: string;
687
+ sizeBytes: number;
688
+ /** sha256 in hex. */
689
+ hash: string;
690
+ /** External storage URL (R2, S3, filesystem path). */
691
+ storageUrl?: string;
692
+ /** Inline content for small blobs — keep under ~64KB. */
693
+ inlineContent?: string;
694
+ }
695
+ type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
696
+ declare const FAILURE_CLASSES: readonly FailureClass[];
697
+ declare function isLlmSpan(s: Span): s is LlmSpan;
698
+ declare function isToolSpan(s: Span): s is ToolSpan;
699
+ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
700
+ declare function isJudgeSpan(s: Span): s is JudgeSpan;
701
+ declare function isSandboxSpan(s: Span): s is SandboxSpan;
575
702
 
576
- /**
577
- * ConvergenceTracker — tracks completion percentage over turns.
578
- *
579
- * Produces convergence curves showing how quickly the agent reaches
580
- * completion criteria.
581
- */
582
- declare class ConvergenceTracker {
583
- private criteria;
584
- private history;
585
- constructor(criteria: CompletionCriterion[]);
586
- /** Evaluate criteria against current state, record result */
587
- record(turn: number, state: DriverState): {
588
- completionPercent: number;
589
- complete: boolean;
590
- criteriaStatus: Record<string, boolean | number>;
703
+ interface RunFilter {
704
+ scenarioId?: string;
705
+ variantId?: string;
706
+ status?: RunStatus;
707
+ since?: number;
708
+ until?: number;
709
+ tag?: {
710
+ key: string;
711
+ value: string;
591
712
  };
592
- /** Get convergence curve */
593
- getCurve(): number[];
594
- /** Get full history with per-criterion status */
595
- getHistory(): {
596
- turn: number;
597
- completionPercent: number;
598
- criteriaStatus: Record<string, boolean | number>;
599
- }[];
600
- /** Find the turn where completion first reached 100% (or null) */
601
- getTurnToCompletion(): number | null;
713
+ parentRunId?: string;
714
+ projectId?: string;
715
+ chatId?: string;
716
+ layer?: RunLayer;
717
+ }
718
+ interface SpanFilter {
719
+ runId?: string;
720
+ parentSpanId?: string;
721
+ kind?: SpanKind;
722
+ name?: string;
723
+ toolName?: string;
724
+ judgeId?: string;
725
+ since?: number;
726
+ until?: number;
727
+ }
728
+ interface EventFilter {
729
+ runId?: string;
730
+ spanId?: string;
731
+ kind?: EventKind;
732
+ since?: number;
733
+ until?: number;
734
+ }
735
+ interface TraceStore {
736
+ appendRun(run: Run$1): Promise<void>;
737
+ updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
738
+ appendSpan(span: Span): Promise<void>;
739
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
740
+ appendEvent(event: TraceEvent): Promise<void>;
741
+ appendArtifact(artifact: Artifact$1): Promise<void>;
742
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
743
+ getRun(runId: string): Promise<Run$1 | undefined>;
744
+ listRuns(filter?: RunFilter): Promise<Run$1[]>;
745
+ spans(filter?: SpanFilter): Promise<Span[]>;
746
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
747
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
748
+ artifacts(runId: string): Promise<Artifact$1[]>;
749
+ }
750
+ declare class InMemoryTraceStore implements TraceStore {
751
+ private runs;
752
+ private allSpans;
753
+ private allEvents;
754
+ private allArtifacts;
755
+ private allBudget;
756
+ appendRun(run: Run$1): Promise<void>;
757
+ updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
758
+ appendSpan(span: Span): Promise<void>;
759
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
760
+ appendEvent(event: TraceEvent): Promise<void>;
761
+ appendArtifact(artifact: Artifact$1): Promise<void>;
762
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
763
+ getRun(runId: string): Promise<Run$1 | undefined>;
764
+ listRuns(filter?: RunFilter): Promise<Run$1[]>;
765
+ spans(filter?: SpanFilter): Promise<Span[]>;
766
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
767
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
768
+ artifacts(runId: string): Promise<Artifact$1[]>;
769
+ }
770
+ interface FileSystemTraceStoreOptions {
771
+ dir: string;
772
+ /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
773
+ maxBytes?: number;
774
+ }
775
+ declare class FileSystemTraceStore implements TraceStore {
776
+ private dir;
777
+ private maxBytes;
778
+ /** Lazy in-memory index for queries — populated on first read. */
779
+ private index?;
780
+ private loaded;
781
+ constructor(options: FileSystemTraceStoreOptions);
782
+ private ensureDir;
783
+ private append;
784
+ private insertInto;
785
+ private load;
786
+ appendRun(run: Run$1): Promise<void>;
787
+ updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
788
+ appendSpan(span: Span): Promise<void>;
789
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
790
+ appendEvent(event: TraceEvent): Promise<void>;
791
+ appendArtifact(artifact: Artifact$1): Promise<void>;
792
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
793
+ getRun(runId: string): Promise<Run$1 | undefined>;
794
+ listRuns(filter?: RunFilter): Promise<Run$1[]>;
795
+ spans(filter?: SpanFilter): Promise<Span[]>;
796
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
797
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
798
+ artifacts(runId: string): Promise<Artifact$1[]>;
602
799
  }
603
800
 
604
801
  /**
605
- * Versioned prompt registry.
606
- *
607
- * Every prompt used in an eval run is registered with an explicit version.
608
- * Reports include the content hash so A/B compares are rigorous: if the
609
- * hash changes between two reports, the prompt actually changed; if it
610
- * matches, the variance is elsewhere.
802
+ * TraceEmitter — hierarchical span builder that auto-parents using an
803
+ * internal stack. One emitter per Run; emitters do NOT share state.
611
804
  *
612
- * Hash is SHA-256(content), truncated to 12 hex chars for readability.
613
- * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
805
+ * Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
806
+ * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
807
+ * have to thread spanIds manually. For async workflows that can't use
808
+ * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
809
+ * explicitly.
614
810
  */
615
- interface PromptHandle {
616
- /** Stable human-readable id, e.g. 'legal.system' */
617
- id: string;
618
- /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
619
- version: string;
620
- /** SHA-256 of content, 12-hex-char prefix */
621
- hash: string;
622
- /** Full prompt body */
623
- content: string;
811
+
812
+ interface SpanHandle<S extends Span = Span> {
813
+ span: S;
814
+ end(patch?: Partial<S>): Promise<void>;
815
+ fail(error: string | Error, patch?: Partial<S>): Promise<void>;
624
816
  }
625
- declare class PromptRegistry {
626
- private readonly entries;
817
+ interface TraceEmitterOptions {
818
+ runId?: string;
819
+ /** Inject a clock for deterministic tests. */
820
+ now?: () => number;
821
+ /** Inject an id generator for deterministic tests. */
822
+ id?: () => string;
823
+ }
824
+ declare class TraceEmitter {
825
+ private store;
826
+ private stack;
827
+ private _runId;
828
+ private now;
829
+ private id;
830
+ constructor(store: TraceStore, options?: TraceEmitterOptions);
831
+ get runId(): string;
832
+ startRun(run: Omit<Run$1, 'runId' | 'startedAt' | 'status'>): Promise<Run$1>;
833
+ endRun(outcome?: RunOutcome$1): Promise<void>;
834
+ abortRun(reason: string): Promise<void>;
835
+ span<S extends Span = Span>(init: {
836
+ kind: SpanKind;
837
+ name: string;
838
+ parentSpanId?: string;
839
+ attributes?: Record<string, unknown>;
840
+ } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
841
+ private handle;
842
+ private pop;
843
+ llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
844
+ tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
845
+ retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
846
+ recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
847
+ sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
848
+ emit(event: {
849
+ kind: EventKind;
850
+ spanId?: string;
851
+ payload?: Record<string, unknown>;
852
+ }): Promise<TraceEvent>;
853
+ recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
854
+ timestamp?: number;
855
+ }): Promise<BudgetLedgerEntry>;
856
+ recordArtifact(artifact: Omit<Artifact$1, 'artifactId' | 'runId'>): Promise<Artifact$1>;
627
857
  /**
628
- * Register a prompt. Re-registering the same id+version with DIFFERENT
629
- * content throws — versions are immutable. Re-registering with the SAME
630
- * content is a no-op (idempotent).
631
- */
632
- register(id: string, version: string, content: string): Promise<PromptHandle>;
633
- /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
634
- get(id: string, version: string): PromptHandle;
635
- /** Return all versions of an id, newest-first (lex-descending on version). */
636
- listVersions(id: string): PromptHandle[];
637
- /** Snapshot the whole registry — useful for including in reports. */
638
- list(): PromptHandle[];
639
- /** Verify a hash against registered content. Returns null if not found. */
640
- verifyHash(id: string, version: string, expectedHash: string): boolean | null;
858
+ * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
859
+ * Returns the fn's return value. Use this for the 95% case.
860
+ */
861
+ within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
641
862
  }
642
- /** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
643
- declare function hashContent(content: string): Promise<string>;
863
+ /** Helper to build an LLM span handle args object from a provider-shaped response. */
864
+ declare function llmSpanFromProvider(args: {
865
+ name?: string;
866
+ model: string;
867
+ messages: Message[];
868
+ output: string;
869
+ usage?: {
870
+ inputTokens?: number;
871
+ outputTokens?: number;
872
+ cachedTokens?: number;
873
+ reasoningTokens?: number;
874
+ };
875
+ costUsd?: number;
876
+ finishReason?: string;
877
+ }): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
644
878
 
645
879
  /**
646
- * Anti-slop quality judge.
880
+ * Policy-based agent control runtime.
647
881
  *
648
- * Deterministic pattern-based quality check — no LLM call. Catches the
649
- * 80% of AI slop that every production agent leaks:
650
- * - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
651
- * - N-gram repetition (same phrase over and over)
652
- * - Hedging overuse ("I could be wrong, but...")
653
- * - Apology padding ("I'm so sorry for the confusion...")
654
- * - Unused opening formulas ("Great question!")
655
- * - Length bounds (too short to be useful, too long to be read)
882
+ * This is the minimal reusable loop behind driver-agent patterns:
656
883
  *
657
- * Produces a JudgeScore in the same shape as LLM judges so it composes into
658
- * `BenchmarkRunner`'s judge array transparently.
884
+ * observe state -> validate -> decide next action -> act -> observe -> ...
885
+ *
886
+ * It deliberately does not model named "topologies". Direct execution,
887
+ * critic/revise, driver intervention, specialist calls, and human escalation
888
+ * are all just actions chosen by the control policy.
659
889
  */
660
890
 
661
- interface AntiSlopConfig {
662
- /** Domain label — appears in the JudgeScore output */
663
- domain?: string;
664
- /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
665
- bannedPhrases?: string[];
666
- /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
667
- bannedOpenings?: RegExp[];
668
- /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
669
- hedgingPatterns?: RegExp[];
670
- /** Regexes matching apology padding. */
671
- apologyPatterns?: RegExp[];
672
- /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
673
- repetitionThreshold?: number;
674
- /** Min output length in chars; below this the turn is deemed too terse. */
675
- minLength?: number;
676
- /** Max output length in chars; above this the turn is deemed too verbose. */
677
- maxLength?: number;
678
- /** How heavily each violation class reduces the score (default 1). */
679
- penaltyWeights?: Partial<Record<SlopCategory, number>>;
891
+ type ControlSeverity = 'info' | 'warning' | 'error' | 'critical';
892
+ type ControlActionFailureMode = 'continue' | 'stop';
893
+ interface ControlEvalResult {
894
+ /** Stable validator or judge id. */
895
+ id: string;
896
+ /** Whether this check passed. */
897
+ passed: boolean;
898
+ /** Optional normalized score. 1 = best, 0 = worst. */
899
+ score?: number;
900
+ /** Objective validators should usually be "error" or "critical" when failed. */
901
+ severity?: ControlSeverity;
902
+ /** Human-readable result. */
903
+ detail?: string;
904
+ /** Small evidence string or pointer. Avoid large payloads. */
905
+ evidence?: string;
906
+ /** True when the result came from deterministic state, not LLM judgment. */
907
+ objective?: boolean;
680
908
  }
681
- type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
682
- /** Create a reusable Judge function from an anti-slop config. */
683
- declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
684
- interface AntiSlopIssue {
685
- category: SlopCategory;
686
- detail: string;
687
- example?: string;
909
+ interface ControlBudget {
910
+ maxSteps: number;
911
+ maxWallMs?: number;
912
+ maxCostUsd?: number;
688
913
  }
689
- interface AntiSlopReport {
690
- /** 0–10 score; 10 is clean, lower values mean more slop. */
691
- score: number;
692
- issues: AntiSlopIssue[];
693
- /** Count of each category for programmatic aggregation. */
694
- counts: Record<SlopCategory, number>;
914
+ interface ControlStopPolicies<TState, TAction> {
915
+ /**
916
+ * Stop after N consecutive steps with no state fingerprint change and
917
+ * less than `minScoreDelta` score movement. Disabled when omitted.
918
+ */
919
+ maxNoProgressSteps?: number;
920
+ /**
921
+ * Stop after the same action fingerprint is selected N consecutive
922
+ * times. Disabled when omitted.
923
+ */
924
+ maxRepeatedActions?: number;
925
+ /** Minimum score movement that counts as progress. Default 0.001. */
926
+ minScoreDelta?: number;
927
+ /** Override the default JSON/string fingerprint for state comparisons. */
928
+ stateFingerprint?: (state: TState) => string;
929
+ /** Override the default JSON/string fingerprint for repeated-action checks. */
930
+ actionFingerprint?: (action: TAction) => string;
931
+ }
932
+ interface ControlContext<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
933
+ intent: string;
934
+ state: TState;
935
+ evals: TEval[];
936
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
937
+ budget: ControlBudget;
938
+ stepIndex: number;
939
+ wallMs: number;
940
+ spentCostUsd: number;
941
+ remainingCostUsd?: number;
942
+ abortSignal: AbortSignal;
943
+ emitter?: TraceEmitter;
695
944
  }
696
- /**
697
- * Pure function — analyze one or more outputs against the config. Exposed
698
- * separately so consumers can build their own reporters on top.
699
- */
700
- declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
701
- penaltyWeights: Record<SlopCategory, number>;
702
- }): AntiSlopReport;
945
+ type ControlDecision<TAction> = {
946
+ type: 'continue';
947
+ action: TAction;
948
+ reason?: string;
949
+ } | {
950
+ type: 'stop';
951
+ reason: string;
952
+ pass?: boolean;
953
+ score?: number;
954
+ };
955
+ interface StopDecision {
956
+ stop: boolean;
957
+ pass: boolean;
958
+ reason: string;
959
+ score?: number;
960
+ failureClass?: FailureClass;
961
+ }
962
+ interface ControlActionOutcome<TActionResult> {
963
+ ok: boolean;
964
+ result?: TActionResult;
965
+ error?: string;
966
+ costUsd?: number;
967
+ durationMs: number;
968
+ }
969
+ interface ControlRuntimeError {
970
+ phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace';
971
+ stepIndex: number;
972
+ message: string;
973
+ }
974
+ interface ControlStep<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
975
+ index: number;
976
+ decision: ControlDecision<TAction>;
977
+ beforeState: TState;
978
+ afterState: TState;
979
+ evalsBefore: TEval[];
980
+ evalsAfter: TEval[];
981
+ actionOutcome?: ControlActionOutcome<TActionResult>;
982
+ startedAt: string;
983
+ endedAt: string;
984
+ }
985
+ interface ControlRunResult<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
986
+ intent: string;
987
+ pass: boolean;
988
+ completed: boolean;
989
+ reason: string;
990
+ score?: number;
991
+ steps: ControlStep<TState, TAction, TActionResult, TEval>[];
992
+ finalState: TState | undefined;
993
+ finalEvals: TEval[];
994
+ wallMs: number;
995
+ spentCostUsd: number;
996
+ runId: string | null;
997
+ failureClass?: FailureClass;
998
+ runtimeErrors: ControlRuntimeError[];
999
+ stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error';
1000
+ }
1001
+ interface ControlRuntimeConfig<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
1002
+ intent: string;
1003
+ budget?: Partial<ControlBudget>;
1004
+ signal?: AbortSignal;
1005
+ /** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */
1006
+ actionFailure?: ControlActionFailureMode;
1007
+ /**
1008
+ * Extract cost from an action result. Used for `maxCostUsd` budget
1009
+ * enforcement and trace budget ledger emission.
1010
+ */
1011
+ getActionCostUsd?: (ctx: {
1012
+ action: TAction;
1013
+ result: TActionResult;
1014
+ state: TState;
1015
+ evals: TEval[];
1016
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
1017
+ }) => number | undefined;
1018
+ /** Read typed task/product state. Prefer structured state over transcript-only context. */
1019
+ observe: (ctx: {
1020
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
1021
+ abortSignal: AbortSignal;
1022
+ }) => Promise<TState> | TState;
1023
+ /** Objective validators first, subjective judges only where objective state is insufficient. */
1024
+ validate: (ctx: {
1025
+ intent: string;
1026
+ state: TState;
1027
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
1028
+ abortSignal: AbortSignal;
1029
+ }) => Promise<TEval[]> | TEval[];
1030
+ /** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */
1031
+ decide: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>;
1032
+ /** Execute the action selected by the policy. */
1033
+ act: (action: TAction, ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<TActionResult> | TActionResult;
1034
+ /** Final stopping policy. Called before decide and after each action. */
1035
+ shouldStop?: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<StopDecision> | StopDecision;
1036
+ /** Optional hook for tracing or live progress updates. */
1037
+ onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void;
1038
+ /** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */
1039
+ stopPolicies?: ControlStopPolicies<TState, TAction>;
1040
+ /** Optional trace sink. Emits one run plus one span per control step. */
1041
+ store?: TraceStore;
1042
+ scenarioId?: string;
1043
+ projectId?: string;
1044
+ variantId?: string;
1045
+ }
1046
+ declare function runAgentControlLoop<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>>;
1047
+ declare function stopOnNoProgress<TState, TAction>(maxNoProgressSteps: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'>): ControlStopPolicies<TState, TAction>;
1048
+ declare function stopOnRepeatedAction<TState, TAction>(maxRepeatedActions: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'>): ControlStopPolicies<TState, TAction>;
1049
+ declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
1050
+ declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
1051
+ declare function allCriticalPassed(evals: ControlEvalResult[]): boolean;
703
1052
 
704
1053
  /**
705
- * Artifact validators.
1054
+ * Dataset — versioned, sliceable, content-hashed scenario collection.
706
1055
  *
707
- * Generic "score a produced artifact" primitive. Tax uses it for PDF form
708
- * correctness, legal for contract clauses, film for script breakdowns, GTM
709
- * for social posts. One interface, many validators; all plug into
710
- * `BenchmarkRunner` the same way.
1056
+ * Scenarios stop being ephemeral arrays and become first-class
1057
+ * artifacts. Every Dataset carries:
1058
+ * - content hash (sha256 over canonicalized scenario array)
1059
+ * - provenance (contributor, createdAt, sourceUrl)
1060
+ * - split labels (train | dev | test | holdout)
1061
+ * - difficulty tiers (easy | medium | hard | extreme)
1062
+ * - tags (free-form, per-scenario)
711
1063
  *
712
- * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
713
- * plus a `ValidationContext` (scenario id, the turns that produced it) and
714
- * returns a `ValidationResult` with pass/fail + 0..1 score + structured
715
- * issues.
1064
+ * `Dataset.slice({ difficulty, split, holdout, seed })` returns a
1065
+ * deterministic, reproducible subset. Holdout slices are locked: you
1066
+ * can read them but `mutate` throws, which prevents "oh I'll just
1067
+ * tweak that one scenario" contamination drift.
716
1068
  */
717
- interface Artifact$1 {
718
- /** Logical kind — validators type-guard on this */
719
- kind: 'file' | 'json' | 'text' | 'binary' | string;
720
- /** Filesystem-style path, optional */
721
- path?: string;
722
- /** String content for text/json/file kinds */
723
- content?: string;
724
- /** Binary content (if kind === 'binary') */
725
- bytes?: Uint8Array;
726
- /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
727
- metadata?: Record<string, unknown>;
1069
+ type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
1070
+ type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
1071
+ interface DatasetScenario {
1072
+ id: string;
1073
+ /** Arbitrary payload; the framework doesn't interpret it. */
1074
+ payload: unknown;
1075
+ split?: DatasetSplit;
1076
+ difficulty?: DatasetDifficulty;
1077
+ /** Canary token that MUST NOT round-trip through a correct agent output. */
1078
+ canary?: string;
1079
+ tags?: Record<string, string>;
728
1080
  }
729
- interface ValidationContext {
730
- scenarioId: string;
731
- turnIndex?: number;
732
- /** Prior artifacts for multi-artifact scenarios */
733
- priorArtifacts?: Artifact$1[];
734
- /** Free-form hints the validator uses for domain-specific checks */
735
- hints?: Record<string, unknown>;
1081
+ interface DatasetProvenance {
1082
+ contributor?: string;
1083
+ createdAt: string;
1084
+ sourceUrl?: string;
1085
+ license?: string;
1086
+ description?: string;
1087
+ /** Monotonic human-readable version (e.g. "2026.04.20"). */
1088
+ version: string;
736
1089
  }
737
- interface ValidationIssue {
738
- severity: 'error' | 'warning' | 'info';
739
- message: string;
740
- /** Optional path into the artifact (e.g. JSON path or byte offset) */
741
- locus?: string;
1090
+ interface DatasetManifest {
1091
+ name: string;
1092
+ provenance: DatasetProvenance;
1093
+ /** sha256 hex over canonicalized scenarios. */
1094
+ contentHash: string;
1095
+ scenarioCount: number;
1096
+ splitCounts: Record<DatasetSplit, number>;
742
1097
  }
743
- interface ValidationResult {
744
- pass: boolean;
745
- /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
746
- score: number;
747
- issues: ValidationIssue[];
748
- /** Diagnostic payload for reporters */
749
- evidence?: Record<string, unknown>;
1098
+ interface SliceOptions {
1099
+ split?: DatasetSplit;
1100
+ difficulty?: DatasetDifficulty;
1101
+ /** Number of scenarios (random sample, seeded). Omit to take all that match. */
1102
+ limit?: number;
1103
+ seed?: number;
1104
+ /** Predicate narrowing. Applied after split/difficulty filters. */
1105
+ filter?: (scenario: DatasetScenario) => boolean;
1106
+ /** If true, include scenarios marked as holdout. Default false. */
1107
+ includeHoldout?: boolean;
750
1108
  }
751
- interface ArtifactValidator {
752
- /** Stable identifier for the validator; appears in reports. */
753
- name: string;
754
- /** Optional description for human-facing reports. */
755
- description?: string;
756
- /** Called once per artifact; validators are expected to be pure + idempotent. */
757
- validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
1109
+ /** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
1110
+ declare class HoldoutLockedError extends Error {
1111
+ constructor(datasetName: string);
758
1112
  }
759
- /**
760
- * Run every validator on the same artifact; aggregate pass as AND, score as
761
- * (weighted) mean, issues concatenated. Weights default to 1 each.
762
- */
763
- declare function composeValidators(validators: ArtifactValidator[], options?: {
764
- name?: string;
765
- weights?: number[];
766
- }): ArtifactValidator;
767
- /** Pass if the artifact body matches a provided regex. */
768
- declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
769
- /** Pass if JSON parses and every required key is present. */
770
- declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
771
- /** Pass if min ≤ byte length ≤ max. */
772
- declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
773
- /** Pass if the artifact contains every required substring (case-insensitive by default). */
774
- declare function containsAll(name: string, required: string[], options?: {
775
- caseSensitive?: boolean;
776
- }): ArtifactValidator;
1113
+ declare class Dataset {
1114
+ readonly name: string;
1115
+ readonly provenance: DatasetProvenance;
1116
+ private scenarios;
1117
+ private locked;
1118
+ constructor(init: {
1119
+ name: string;
1120
+ provenance: DatasetProvenance;
1121
+ scenarios: DatasetScenario[];
1122
+ locked?: boolean;
1123
+ });
1124
+ /** All scenarios. Readonly callers must go through `slice` or `clone`. */
1125
+ all(): readonly DatasetScenario[];
1126
+ get size(): number;
1127
+ /**
1128
+ * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
1129
+ * the same arguments always produce the same slice across machines.
1130
+ */
1131
+ slice(options?: SliceOptions): DatasetScenario[];
1132
+ /**
1133
+ * Assemble the manifest (name + provenance + content hash + counts).
1134
+ * Content hash is deterministic over canonicalized scenarios.
1135
+ */
1136
+ manifest(): Promise<DatasetManifest>;
1137
+ /** Fresh unlocked copy — for post-release forks when mutation is needed. */
1138
+ clone(overrides?: Partial<{
1139
+ name: string;
1140
+ version: string;
1141
+ }>): Dataset;
1142
+ lock(): void;
1143
+ add(scenario: DatasetScenario): void;
1144
+ remove(scenarioId: string): void;
1145
+ /**
1146
+ * Stable JSON-Lines serialization — deterministic byte-for-byte.
1147
+ * Write to disk for contamination-verifiable archives.
1148
+ */
1149
+ toJsonl(): string;
1150
+ static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
1151
+ }
1152
+ declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
777
1153
 
778
1154
  /**
779
- * Workspace inspector — score the persisted state of an agent after a run.
1155
+ * Prompt optimizer — A/B test prompt variants with statistical rigor.
780
1156
  *
781
- * Many evals don't ask "did the response say the right thing" but "did the
782
- * agent put the right rows in the DB / files in the vault / entities on the
783
- * canvas". This is the primitive for that.
1157
+ * Runs N prompt variants against a fixed scenario set, collects per-scenario
1158
+ * scores via the user-provided `scoreVariant` callback, and returns:
1159
+ * - per-variant mean + bootstrap CI
1160
+ * - pairwise significance (Mann-Whitney, non-parametric — works on any
1161
+ * score distribution, not just normal)
1162
+ * - a winner (highest mean, flagged if the lead is not significant)
784
1163
  *
785
- * Implementations read from D1, KV, filesystem, or any store — the interface
786
- * is deliberately small so consumers plug in their own backends.
1164
+ * Deliberately generic — the `scoreVariant` callback does whatever domain
1165
+ * work the consumer needs (invoke the agent, judge the output, whatever),
1166
+ * and returns a number per scenario. This lets the optimizer stay small +
1167
+ * testable.
787
1168
  */
788
- interface WorkspaceSnapshot {
789
- /** Vault files: logical path → content */
790
- files: Record<string, string>;
791
- /** DB rows: table name → array of rows (post-validation) */
792
- rows: Record<string, Array<Record<string, unknown>>>;
793
- /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
794
- kv: Record<string, string>;
795
- /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
796
- blobs?: Record<string, {
797
- size: number;
798
- hash?: string;
799
- mimeType?: string;
1169
+ interface PromptVariant$1 {
1170
+ id: string;
1171
+ prompt: string;
1172
+ metadata?: Record<string, unknown>;
1173
+ }
1174
+ interface OptimizationConfig {
1175
+ variants: PromptVariant$1[];
1176
+ /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
1177
+ trialsPerScenario?: number;
1178
+ /** Significance threshold for pairwise comparison (default 0.05). */
1179
+ significanceLevel?: number;
1180
+ /**
1181
+ * The scoring callback. For each (variant, scenarioId, trialIndex), produce
1182
+ * a score in 0..1 (or any numeric range — the optimizer only cares about
1183
+ * monotonicity).
1184
+ */
1185
+ scoreVariant: (args: {
1186
+ variant: PromptVariant$1;
1187
+ scenarioId: string;
1188
+ trialIndex: number;
1189
+ }) => Promise<number>;
1190
+ /** Scenario ids to run against. */
1191
+ scenarioIds: string[];
1192
+ /** Optional hook — fires after each (variant, scenario) fully scored. */
1193
+ onScenarioComplete?: (info: {
1194
+ variantId: string;
1195
+ scenarioId: string;
1196
+ scores: number[];
1197
+ }) => void;
1198
+ }
1199
+ interface VariantScore {
1200
+ variantId: string;
1201
+ mean: number;
1202
+ ci95: {
1203
+ lower: number;
1204
+ upper: number;
1205
+ };
1206
+ n: number;
1207
+ perScenario: Record<string, {
1208
+ mean: number;
1209
+ n: number;
1210
+ samples: number[];
800
1211
  }>;
801
1212
  }
802
- interface InspectorContext {
803
- /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
804
- scopeId: string;
805
- /** Optional scenario id — allows scenario-specific snapshot shaping */
806
- scenarioId?: string;
1213
+ interface PairwiseComparison {
1214
+ variantA: string;
1215
+ variantB: string;
1216
+ pValue: number;
1217
+ /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
1218
+ qValue: number;
1219
+ /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
1220
+ significant: boolean;
1221
+ meanDelta: number;
807
1222
  }
808
- interface WorkspaceInspector {
809
- name: string;
810
- snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
1223
+ interface OptimizationResult {
1224
+ winner: {
1225
+ variantId: string;
1226
+ /** True when the winner's lead vs every other variant is statistically significant. */
1227
+ significant: boolean;
1228
+ ciLowerBoundExceedsSecondMean: boolean;
1229
+ };
1230
+ scores: VariantScore[];
1231
+ pairwise: PairwiseComparison[];
1232
+ config: {
1233
+ trialsPerScenario: number;
1234
+ significanceLevel: number;
1235
+ variants: string[];
1236
+ scenarios: string[];
1237
+ };
811
1238
  }
812
- declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
813
- readonly name = "in-memory";
814
- private readonly snapshots;
815
- set(scopeId: string, snapshot: WorkspaceSnapshot): void;
816
- snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
1239
+ declare class PromptOptimizer {
1240
+ run(config: OptimizationConfig): Promise<OptimizationResult>;
817
1241
  }
818
- interface WorkspaceAssertion {
819
- name: string;
820
- description?: string;
821
- check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
1242
+
1243
+ interface RunScore {
1244
+ success: number;
1245
+ goalProgress: number;
1246
+ repoGroundedness: number;
1247
+ driftPenalty: number;
1248
+ toolUseQuality: number;
1249
+ patchQuality: number;
1250
+ testReality: number;
1251
+ finalGate: number;
1252
+ reviewerBlockers: number;
1253
+ costUsd: number;
1254
+ wallSeconds: number;
1255
+ notes?: string[];
822
1256
  }
823
- interface WorkspaceAssertionResult {
824
- pass: boolean;
825
- /** 0..1 — partial credit for assertions that admit it */
826
- score: number;
827
- detail?: string;
1257
+ interface RunScoreWeights {
1258
+ success: number;
1259
+ goalProgress: number;
1260
+ repoGroundedness: number;
1261
+ driftPenalty: number;
1262
+ toolUseQuality: number;
1263
+ patchQuality: number;
1264
+ testReality: number;
1265
+ finalGate: number;
1266
+ reviewerBlockers: number;
1267
+ costUsd: number;
1268
+ wallSeconds: number;
828
1269
  }
829
- declare function fileExists(path: string): WorkspaceAssertion;
830
- declare function fileContains(path: string, needle: string): WorkspaceAssertion;
831
- declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
832
- declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
833
- min?: number;
834
- }): WorkspaceAssertion;
835
- /** Run many assertions; return aggregate pass + mean score + per-assertion details. */
836
- declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
837
- pass: boolean;
838
- score: number;
839
- results: Array<{
840
- assertion: string;
841
- result: WorkspaceAssertionResult;
842
- }>;
843
- };
1270
+ declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
1271
+ declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
1272
+ declare function clamp01(value: number): number;
844
1273
 
845
- /**
846
- * Experiment tracker — group runs, diff them, watch scores move over time.
847
- *
848
- * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
849
- * - A run has a config (prompt hash, model, scenario ids, seed)
850
- * - Runs belong to experiments (named groups)
851
- * - The store is pluggable (in-memory for tests, filesystem for local,
852
- * custom for Langfuse/D1)
853
- * - Diffs show score deltas, new/dropped scenarios, and config changes
854
- *
855
- * The output plugs directly into `BenchmarkReport` — runs archive the full
856
- * report, diff operates on the summary.
857
- */
1274
+ interface SteeringRolePrompt {
1275
+ system?: string;
1276
+ append?: string;
1277
+ }
1278
+ interface SteeringBundle {
1279
+ id: string;
1280
+ coderPrompt?: string;
1281
+ continuePrompt?: string;
1282
+ reviewerPrompts?: Record<string, string>;
1283
+ skills?: string[];
1284
+ rolePrompts?: Record<string, SteeringRolePrompt>;
1285
+ metadata?: Record<string, unknown>;
1286
+ }
1287
+ interface SteeringDelta {
1288
+ coderPrompt?: string;
1289
+ continuePrompt?: string;
1290
+ reviewerPrompts?: Record<string, string>;
1291
+ skills?: string[];
1292
+ rolePrompts?: Record<string, SteeringRolePrompt>;
1293
+ metadata?: Record<string, unknown>;
1294
+ }
1295
+ declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
1296
+ declare function renderSteeringText(bundle: SteeringBundle): string;
1297
+
1298
+ interface OptimizationExample {
1299
+ scenarioId: string;
1300
+ metadata?: Record<string, unknown>;
1301
+ }
1302
+ interface SteeringEvaluation {
1303
+ variant: SteeringBundle;
1304
+ example: OptimizationExample;
1305
+ trialIndex: number;
1306
+ }
1307
+ interface SteeringVariantReport {
1308
+ variantId: string;
1309
+ bundle: SteeringBundle;
1310
+ mean: number;
1311
+ ci95: {
1312
+ lower: number;
1313
+ upper: number;
1314
+ };
1315
+ scenarioScores: Record<string, {
1316
+ mean: number;
1317
+ n: number;
1318
+ samples: number[];
1319
+ }>;
1320
+ }
1321
+ interface OptimizationLoopResult {
1322
+ winner: SteeringBundle;
1323
+ significant: boolean;
1324
+ reports: SteeringVariantReport[];
1325
+ pairwise: Array<{
1326
+ variantA: string;
1327
+ variantB: string;
1328
+ pValue: number;
1329
+ qValue: number;
1330
+ significant: boolean;
1331
+ meanDelta: number;
1332
+ }>;
1333
+ }
1334
+ interface OptimizationLoopConfig {
1335
+ variants: SteeringBundle[];
1336
+ examples: OptimizationExample[];
1337
+ evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
1338
+ scoreWeights?: Partial<RunScoreWeights>;
1339
+ trialsPerScenario?: number;
1340
+ }
1341
+ declare class OptimizationLoop {
1342
+ private readonly optimizer;
1343
+ constructor(optimizer?: PromptOptimizer);
1344
+ run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
1345
+ }
858
1346
 
859
- interface RunConfig {
860
- experimentId: string;
861
- name?: string;
862
- model?: string;
863
- promptHash?: string;
864
- promptVersion?: string;
865
- seed?: number;
1347
+ type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
1348
+ type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
1349
+ type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
1350
+ type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
1351
+ interface FeedbackTask {
1352
+ intent: string;
1353
+ context?: unknown;
1354
+ }
1355
+ interface ProposedSideEffect {
1356
+ type: string;
1357
+ risk?: 'low' | 'medium' | 'high';
1358
+ costUsd?: number;
1359
+ externalSideEffect?: boolean;
1360
+ requiresApproval?: boolean;
866
1361
  metadata?: Record<string, unknown>;
867
1362
  }
868
- interface Run$1 {
1363
+ interface FeedbackLabel {
1364
+ id?: string;
1365
+ source: FeedbackLabelSource;
1366
+ kind: FeedbackLabelKind;
1367
+ value: unknown;
1368
+ reason?: string;
1369
+ severity?: FeedbackSeverity;
1370
+ createdAt: string;
1371
+ metadata?: Record<string, unknown>;
1372
+ }
1373
+ interface FeedbackAttempt {
869
1374
  id: string;
870
- experimentId: string;
871
- name?: string;
872
- config: RunConfig;
873
- startedAt: string;
874
- completedAt?: string;
875
- status: 'running' | 'completed' | 'failed';
876
- report?: BenchmarkReport;
877
- error?: string;
1375
+ stepIndex: number;
1376
+ artifactType: FeedbackArtifactType;
1377
+ artifact: unknown;
1378
+ options?: unknown[];
1379
+ proposedAction?: ProposedSideEffect;
1380
+ evals?: ControlEvalResult[];
1381
+ feedback?: FeedbackLabel[];
1382
+ createdAt: string;
1383
+ metadata?: Record<string, unknown>;
878
1384
  }
879
- interface Experiment {
1385
+ interface FeedbackOutcome {
1386
+ success?: boolean;
1387
+ score?: number;
1388
+ metrics?: Record<string, number>;
1389
+ costUsd?: number;
1390
+ detail?: string;
1391
+ observedAt?: string;
1392
+ metadata?: Record<string, unknown>;
1393
+ }
1394
+ interface FeedbackTrajectory {
880
1395
  id: string;
881
- name: string;
1396
+ projectId?: string;
1397
+ scenarioId?: string;
1398
+ task: FeedbackTask;
1399
+ attempts: FeedbackAttempt[];
1400
+ labels: FeedbackLabel[];
1401
+ outcome?: FeedbackOutcome;
1402
+ split?: DatasetSplit;
1403
+ tags?: Record<string, string>;
882
1404
  createdAt: string;
1405
+ updatedAt?: string;
883
1406
  metadata?: Record<string, unknown>;
884
1407
  }
885
- interface ExperimentStore {
886
- saveExperiment(exp: Experiment): Promise<void>;
887
- getExperiment(id: string): Promise<Experiment | null>;
888
- listExperiments(): Promise<Experiment[]>;
889
- saveRun(run: Run$1): Promise<void>;
890
- getRun(id: string): Promise<Run$1 | null>;
891
- listRuns(experimentId: string): Promise<Run$1[]>;
1408
+ interface FeedbackTrajectoryStore {
1409
+ save(trajectory: FeedbackTrajectory): Promise<void>;
1410
+ get(id: string): Promise<FeedbackTrajectory | null>;
1411
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
1412
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
1413
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
892
1414
  }
893
- declare class InMemoryExperimentStore implements ExperimentStore {
894
- private readonly experiments;
895
- private readonly runs;
896
- saveExperiment(exp: Experiment): Promise<void>;
897
- getExperiment(id: string): Promise<Experiment | null>;
898
- listExperiments(): Promise<Experiment[]>;
899
- saveRun(run: Run$1): Promise<void>;
900
- getRun(id: string): Promise<Run$1 | null>;
901
- listRuns(experimentId: string): Promise<Run$1[]>;
1415
+ interface FeedbackTrajectoryFilter {
1416
+ projectId?: string;
1417
+ scenarioId?: string;
1418
+ split?: DatasetSplit;
1419
+ tag?: [string, string];
902
1420
  }
903
- declare class ExperimentTracker {
904
- private readonly store;
905
- constructor(store: ExperimentStore);
906
- startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
907
- startRun(config: RunConfig): Promise<Run$1>;
908
- completeRun(runId: string, report: BenchmarkReport): Promise<void>;
909
- failRun(runId: string, error: string): Promise<void>;
910
- /**
911
- * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
912
- * and config changes that may explain the movement.
913
- */
914
- diff(runIdA: string, runIdB: string): Promise<RunDiff>;
915
- /** Timeline of aggregate scores for an experiment. */
916
- timeline(experimentId: string): Promise<Array<{
917
- runId: string;
918
- startedAt: string;
919
- overall: number | null;
920
- }>>;
1421
+ interface FeedbackSplitPolicy {
1422
+ trainPct?: number;
1423
+ devPct?: number;
1424
+ testPct?: number;
1425
+ holdoutPct?: number;
921
1426
  }
922
- interface RunDiff {
923
- before: {
924
- runId: string;
925
- name?: string;
926
- startedAt: string;
927
- };
928
- after: {
929
- runId: string;
930
- name?: string;
931
- startedAt: string;
932
- };
933
- aggregateDelta: number;
934
- scenarios: Array<{
935
- scenarioId: string;
936
- before: number | null;
937
- after: number | null;
938
- delta: number | null;
939
- status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
940
- }>;
941
- configChanges: Record<string, {
942
- before: unknown;
943
- after: unknown;
944
- }>;
1427
+ interface PreferenceMemoryEntry {
1428
+ instruction: string;
1429
+ rationale: string;
1430
+ weight: number;
1431
+ sourceTrajectoryId: string;
1432
+ sourceLabelId?: string;
1433
+ category?: string;
945
1434
  }
946
-
947
- /**
948
- * FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
949
- *
950
- * Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
951
- * files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
952
- * based rollover. Writes are append-only so the file log doubles as an audit
953
- * trail of every state transition the tracker ever wrote.
954
- *
955
- * Reads lazy-load every NDJSON file in the directory (including rolled-over
956
- * archives), latest-write-wins per `id`. Subsequent writes update the
957
- * in-memory index in place so reads after writes are O(1).
958
- *
959
- * Node-only — imports `node:fs/promises`. Don't import this from a Worker;
960
- * use the in-memory store or the D1 store from `./experiment-tracker-d1`.
961
- */
962
-
963
- interface FileSystemExperimentStoreOptions {
964
- /** Directory the NDJSON files live in. Created on first write. */
965
- dir: string;
966
- /** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
967
- maxBytes?: number;
1435
+ interface FeedbackOptimizerRow extends OptimizationExample {
1436
+ trajectoryId: string;
1437
+ labelKinds: FeedbackLabelKind[];
1438
+ score?: number;
968
1439
  }
969
- declare class FileSystemExperimentStore implements ExperimentStore {
1440
+ interface FeedbackReplayResult {
1441
+ trajectoryId: string;
1442
+ pass: boolean;
1443
+ score?: number;
1444
+ labels: FeedbackLabel[];
1445
+ outcome?: FeedbackOutcome;
1446
+ metadata?: Record<string, unknown>;
1447
+ }
1448
+ interface FeedbackReplayAdapter {
1449
+ replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
1450
+ }
1451
+ declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
1452
+ private readonly trajectories;
1453
+ save(trajectory: FeedbackTrajectory): Promise<void>;
1454
+ get(id: string): Promise<FeedbackTrajectory | null>;
1455
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
1456
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
1457
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
1458
+ }
1459
+ declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
970
1460
  private readonly dir;
971
- private readonly maxBytes;
972
- private index?;
1461
+ private readonly memory;
973
1462
  private loaded;
974
- constructor(options: FileSystemExperimentStoreOptions);
975
- saveExperiment(exp: Experiment): Promise<void>;
976
- getExperiment(id: string): Promise<Experiment | null>;
977
- listExperiments(): Promise<Experiment[]>;
978
- saveRun(run: Run$1): Promise<void>;
979
- getRun(id: string): Promise<Run$1 | null>;
980
- listRuns(experimentId: string): Promise<Run$1[]>;
981
- private ensureDir;
1463
+ constructor(options: {
1464
+ dir: string;
1465
+ });
1466
+ save(trajectory: FeedbackTrajectory): Promise<void>;
1467
+ get(id: string): Promise<FeedbackTrajectory | null>;
1468
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
1469
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
1470
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
982
1471
  private append;
983
1472
  private load;
984
1473
  }
1474
+ declare function createFeedbackTrajectory(input: {
1475
+ id?: string;
1476
+ projectId?: string;
1477
+ scenarioId?: string;
1478
+ task: FeedbackTask;
1479
+ attempts?: FeedbackAttempt[];
1480
+ labels?: FeedbackLabel[];
1481
+ outcome?: FeedbackOutcome;
1482
+ split?: DatasetSplit;
1483
+ tags?: Record<string, string>;
1484
+ createdAt?: string;
1485
+ metadata?: Record<string, unknown>;
1486
+ }): FeedbackTrajectory;
1487
+ declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
1488
+ declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
1489
+ declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
1490
+ declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
1491
+ declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
1492
+ declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
1493
+ declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
1494
+ declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
1495
+ declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
1496
+ maxEntries?: number;
1497
+ }): PreferenceMemoryEntry[];
1498
+ declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
1499
+ declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
1500
+ declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
1501
+ declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
1502
+ projectId?: string;
1503
+ scenarioId?: string;
1504
+ artifactType?: FeedbackArtifactType;
1505
+ artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
1506
+ proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
1507
+ createdAt?: string;
1508
+ }): FeedbackTrajectory;
1509
+
1510
+ interface ActionExecutionPolicy {
1511
+ allowedTypes?: string[];
1512
+ blockedTypes?: string[];
1513
+ alwaysRequireApprovalTypes?: string[];
1514
+ autoApproveTypes?: string[];
1515
+ requireApprovalForExternalSideEffects?: boolean;
1516
+ requireApprovalAboveCostUsd?: number;
1517
+ maxActionCostUsd?: number;
1518
+ remainingBudgetUsd?: number;
1519
+ expectedOutcomeRequired?: boolean;
1520
+ killCriteriaRequired?: boolean;
1521
+ }
1522
+ interface ActionPolicyDecision {
1523
+ allowed: boolean;
1524
+ blocked: boolean;
1525
+ requiresApproval: boolean;
1526
+ reasons: string[];
1527
+ label?: FeedbackLabel;
1528
+ }
1529
+ declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
1530
+ createdAt?: string;
1531
+ }): ActionPolicyDecision;
985
1532
 
986
1533
  /**
987
- * D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
988
- *
989
- * Workers-safe (uses only the `D1Database` binding the runtime injects). Two
990
- * tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
991
- * a Worker route can both write the row at run start and update it at run end
992
- * without losing the original config — the row's lifecycle mirrors the
993
- * `Run.status` field one-to-one.
994
- *
995
- * Why this lives next to `InMemoryExperimentStore`:
996
- * - bad-app, legal-agent, gtm-agent, film-agent all run as Workers
997
- * - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
998
- * - Hand-rolling D1 SQL in every consumer is exactly the duplication this
999
- * module exists to prevent
1534
+ * Normalize scores so all dimensions follow "higher = better".
1535
+ * Inverted dimensions (hallucination, false_confidence, worst_failure)
1536
+ * already use inverted scoring in the prompt (10 = no hallucination),
1537
+ * but this function ensures consistency if raw scores leak through.
1538
+ */
1539
+ declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
1540
+ /** Weighted mean — falls back to uniform weights when omitted */
1541
+ declare function weightedMean(scores: {
1542
+ score: number;
1543
+ weight?: number;
1544
+ }[]): number;
1545
+ /** Bootstrap confidence interval */
1546
+ declare function confidenceInterval(scores: number[], confidence?: number): {
1547
+ mean: number;
1548
+ lower: number;
1549
+ upper: number;
1550
+ };
1551
+ /**
1552
+ * Inter-rater reliability — simplified Krippendorff's alpha.
1000
1553
  *
1001
- * Schema versioning: the `meta` table records `schema_version` so a future
1002
- * column addition can be detected and migrated additively. Today's schema is
1003
- * v1; bump only on breaking shape changes.
1554
+ * Each inner array is one judge's scores for all items.
1555
+ * All arrays must have the same length (same items scored).
1004
1556
  */
1005
-
1557
+ declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
1006
1558
  /**
1007
- * Minimal `D1Database` shape we depend on. Avoids pulling in
1008
- * `@cloudflare/workers-types` as a hard dep — consumers that already have
1009
- * those types installed can pass the binding directly.
1559
+ * Mann-Whitney U test for comparing two independent groups.
1560
+ * Returns U statistic and approximate p-value (normal approximation).
1010
1561
  */
1011
- interface D1Like {
1012
- prepare(query: string): D1PreparedStatementLike;
1013
- batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
1014
- exec(query: string): Promise<unknown>;
1015
- }
1016
- interface D1PreparedStatementLike {
1017
- bind(...values: unknown[]): D1PreparedStatementLike;
1018
- first<T = Record<string, unknown>>(): Promise<T | null>;
1019
- all<T = Record<string, unknown>>(): Promise<{
1020
- results: T[];
1021
- }>;
1022
- run(): Promise<unknown>;
1023
- }
1024
- interface D1ExperimentStoreOptions {
1025
- /** D1 binding from `env`. */
1026
- db: D1Like;
1027
- /**
1028
- * Optional table-name prefix so multiple ExperimentStores can share a DB
1029
- * without colliding (e.g. `tax_eval_experiments` vs `legal_eval_experiments`).
1030
- * Default: `agent_eval_`.
1031
- */
1032
- tablePrefix?: string;
1033
- }
1034
- declare class D1ExperimentStore implements ExperimentStore {
1035
- private readonly db;
1036
- private readonly experimentsTable;
1037
- private readonly runsTable;
1038
- private readonly metaTable;
1039
- private schemaReady;
1040
- constructor(options: D1ExperimentStoreOptions);
1041
- /**
1042
- * Idempotent schema setup. Safe to call before every operation; the second
1043
- * call short-circuits via `schemaReady`. Most consumers will call it once
1044
- * during Worker bootstrap.
1045
- */
1046
- ensureSchema(): Promise<void>;
1047
- saveExperiment(exp: Experiment): Promise<void>;
1048
- getExperiment(id: string): Promise<Experiment | null>;
1049
- listExperiments(): Promise<Experiment[]>;
1050
- saveRun(run: Run$1): Promise<void>;
1051
- getRun(id: string): Promise<Run$1 | null>;
1052
- listRuns(experimentId: string): Promise<Run$1[]>;
1562
+ declare function mannWhitneyU(a: number[], b: number[]): {
1563
+ u: number;
1564
+ p: number;
1565
+ };
1566
+ /** Partial credit: returns 0-1 ratio of current toward target */
1567
+ declare function partialCredit(current: number, target: number): number;
1568
+ /**
1569
+ * Paired t-test — before/after measurements on the SAME items.
1570
+ * Pairing removes inter-item variance, giving tighter significance than
1571
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
1572
+ * scenarios.
1573
+ */
1574
+ declare function pairedTTest(before: number[], after: number[]): {
1575
+ t: number;
1576
+ df: number;
1577
+ p: number;
1578
+ };
1579
+ /**
1580
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
1581
+ * Use when the differences aren't normally distributed.
1582
+ */
1583
+ declare function wilcoxonSignedRank(before: number[], after: number[]): {
1584
+ w: number;
1585
+ p: number;
1586
+ };
1587
+ /**
1588
+ * Cohen's d — standardized effect size for two independent groups.
1589
+ * Positive d means group b has higher mean than group a.
1590
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
1591
+ */
1592
+ declare function cohensD(a: number[], b: number[]): number;
1593
+
1594
+ /**
1595
+ * ConvergenceTracker tracks completion percentage over turns.
1596
+ *
1597
+ * Produces convergence curves showing how quickly the agent reaches
1598
+ * completion criteria.
1599
+ */
1600
+ declare class ConvergenceTracker {
1601
+ private criteria;
1602
+ private history;
1603
+ constructor(criteria: CompletionCriterion[]);
1604
+ /** Evaluate criteria against current state, record result */
1605
+ record(turn: number, state: DriverState): {
1606
+ completionPercent: number;
1607
+ complete: boolean;
1608
+ criteriaStatus: Record<string, boolean | number>;
1609
+ };
1610
+ /** Get convergence curve */
1611
+ getCurve(): number[];
1612
+ /** Get full history with per-criterion status */
1613
+ getHistory(): {
1614
+ turn: number;
1615
+ completionPercent: number;
1616
+ criteriaStatus: Record<string, boolean | number>;
1617
+ }[];
1618
+ /** Find the turn where completion first reached 100% (or null) */
1619
+ getTurnToCompletion(): number | null;
1053
1620
  }
1054
1621
 
1055
1622
  /**
1056
- * Prompt optimizer — A/B test prompt variants with statistical rigor.
1623
+ * Versioned prompt registry.
1057
1624
  *
1058
- * Runs N prompt variants against a fixed scenario set, collects per-scenario
1059
- * scores via the user-provided `scoreVariant` callback, and returns:
1060
- * - per-variant mean + bootstrap CI
1061
- * - pairwise significance (Mann-Whitney, non-parametric — works on any
1062
- * score distribution, not just normal)
1063
- * - a winner (highest mean, flagged if the lead is not significant)
1625
+ * Every prompt used in an eval run is registered with an explicit version.
1626
+ * Reports include the content hash so A/B compares are rigorous: if the
1627
+ * hash changes between two reports, the prompt actually changed; if it
1628
+ * matches, the variance is elsewhere.
1064
1629
  *
1065
- * Deliberately generic — the `scoreVariant` callback does whatever domain
1066
- * work the consumer needs (invoke the agent, judge the output, whatever),
1067
- * and returns a number per scenario. This lets the optimizer stay small +
1068
- * testable.
1630
+ * Hash is SHA-256(content), truncated to 12 hex chars for readability.
1631
+ * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
1069
1632
  */
1070
- interface PromptVariant$1 {
1633
+ interface PromptHandle {
1634
+ /** Stable human-readable id, e.g. 'browser.system' */
1071
1635
  id: string;
1072
- prompt: string;
1073
- metadata?: Record<string, unknown>;
1636
+ /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
1637
+ version: string;
1638
+ /** SHA-256 of content, 12-hex-char prefix */
1639
+ hash: string;
1640
+ /** Full prompt body */
1641
+ content: string;
1074
1642
  }
1075
- interface OptimizationConfig {
1076
- variants: PromptVariant$1[];
1077
- /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
1078
- trialsPerScenario?: number;
1079
- /** Significance threshold for pairwise comparison (default 0.05). */
1080
- significanceLevel?: number;
1643
+ declare class PromptRegistry {
1644
+ private readonly entries;
1081
1645
  /**
1082
- * The scoring callback. For each (variant, scenarioId, trialIndex), produce
1083
- * a score in 0..1 (or any numeric range — the optimizer only cares about
1084
- * monotonicity).
1646
+ * Register a prompt. Re-registering the same id+version with DIFFERENT
1647
+ * content throws — versions are immutable. Re-registering with the SAME
1648
+ * content is a no-op (idempotent).
1085
1649
  */
1086
- scoreVariant: (args: {
1087
- variant: PromptVariant$1;
1088
- scenarioId: string;
1089
- trialIndex: number;
1090
- }) => Promise<number>;
1091
- /** Scenario ids to run against. */
1092
- scenarioIds: string[];
1093
- /** Optional hook — fires after each (variant, scenario) fully scored. */
1094
- onScenarioComplete?: (info: {
1095
- variantId: string;
1096
- scenarioId: string;
1097
- scores: number[];
1098
- }) => void;
1099
- }
1100
- interface VariantScore {
1101
- variantId: string;
1102
- mean: number;
1103
- ci95: {
1104
- lower: number;
1105
- upper: number;
1106
- };
1107
- n: number;
1108
- perScenario: Record<string, {
1109
- mean: number;
1110
- n: number;
1111
- samples: number[];
1112
- }>;
1113
- }
1114
- interface PairwiseComparison {
1115
- variantA: string;
1116
- variantB: string;
1117
- pValue: number;
1118
- /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
1119
- qValue: number;
1120
- /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
1121
- significant: boolean;
1122
- meanDelta: number;
1123
- }
1124
- interface OptimizationResult {
1125
- winner: {
1126
- variantId: string;
1127
- /** True when the winner's lead vs every other variant is statistically significant. */
1128
- significant: boolean;
1129
- ciLowerBoundExceedsSecondMean: boolean;
1130
- };
1131
- scores: VariantScore[];
1132
- pairwise: PairwiseComparison[];
1133
- config: {
1134
- trialsPerScenario: number;
1135
- significanceLevel: number;
1136
- variants: string[];
1137
- scenarios: string[];
1138
- };
1139
- }
1140
- declare class PromptOptimizer {
1141
- run(config: OptimizationConfig): Promise<OptimizationResult>;
1142
- }
1143
-
1144
- interface SteeringRolePrompt {
1145
- system?: string;
1146
- append?: string;
1147
- }
1148
- interface SteeringBundle {
1149
- id: string;
1150
- coderPrompt?: string;
1151
- continuePrompt?: string;
1152
- reviewerPrompts?: Record<string, string>;
1153
- skills?: string[];
1154
- rolePrompts?: Record<string, SteeringRolePrompt>;
1155
- metadata?: Record<string, unknown>;
1156
- }
1157
- interface SteeringDelta {
1158
- coderPrompt?: string;
1159
- continuePrompt?: string;
1160
- reviewerPrompts?: Record<string, string>;
1161
- skills?: string[];
1162
- rolePrompts?: Record<string, SteeringRolePrompt>;
1163
- metadata?: Record<string, unknown>;
1164
- }
1165
- declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
1166
- declare function renderSteeringText(bundle: SteeringBundle): string;
1167
-
1168
- interface RunScore {
1169
- success: number;
1170
- goalProgress: number;
1171
- repoGroundedness: number;
1172
- driftPenalty: number;
1173
- toolUseQuality: number;
1174
- patchQuality: number;
1175
- testReality: number;
1176
- finalGate: number;
1177
- reviewerBlockers: number;
1178
- costUsd: number;
1179
- wallSeconds: number;
1180
- notes?: string[];
1181
- }
1182
- interface RunScoreWeights {
1183
- success: number;
1184
- goalProgress: number;
1185
- repoGroundedness: number;
1186
- driftPenalty: number;
1187
- toolUseQuality: number;
1188
- patchQuality: number;
1189
- testReality: number;
1190
- finalGate: number;
1191
- reviewerBlockers: number;
1192
- costUsd: number;
1193
- wallSeconds: number;
1650
+ register(id: string, version: string, content: string): Promise<PromptHandle>;
1651
+ /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
1652
+ get(id: string, version: string): PromptHandle;
1653
+ /** Return all versions of an id, newest-first (lex-descending on version). */
1654
+ listVersions(id: string): PromptHandle[];
1655
+ /** Snapshot the whole registry — useful for including in reports. */
1656
+ list(): PromptHandle[];
1657
+ /** Verify a hash against registered content. Returns null if not found. */
1658
+ verifyHash(id: string, version: string, expectedHash: string): boolean | null;
1194
1659
  }
1195
- declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
1196
- declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
1197
- declare function clamp01(value: number): number;
1660
+ /** SHA-256(content), truncated to its first 12 hex chars. Stable across runtimes. */
1661
+ declare function hashContent(content: string): Promise<string>;
1198
1662
 
1199
1663
  /**
1200
- * TraceSchema v1 — the canonical data model for agent-eval.
1664
+ * Anti-slop quality judge.
1201
1665
  *
1202
- * Every score, every failure class, every pipeline in the framework is
1203
- * a view over this data. Shape it once, live with it.
1666
+ * Deterministic pattern-based quality check — no LLM call. Catches the
1667
+ * 80% of AI slop that every production agent leaks:
1668
+ * - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
1669
+ * - N-gram repetition (same phrase over and over)
1670
+ * - Hedging overuse ("I could be wrong, but...")
1671
+ * - Apology padding ("I'm so sorry for the confusion...")
1672
+ * - Unused opening formulas ("Great question!")
1673
+ * - Length bounds (too short to be useful, too long to be read)
1204
1674
  *
1205
- * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
1206
- * but extended with agent-specific span kinds (llm, tool, retrieval,
1207
- * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
1208
- * entities that OTEL leaves as free-form attributes.
1675
+ * Produces a JudgeScore in the same shape as LLM judges so it composes into
1676
+ * `BenchmarkRunner`'s judge array transparently.
1209
1677
  */
1210
- declare const TRACE_SCHEMA_VERSION = "1.0.0";
1211
- type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
1212
- interface BudgetSpec {
1213
- tokens?: number;
1214
- wallMs?: number;
1215
- calls?: number;
1216
- usd?: number;
1678
+
1679
+ interface AntiSlopConfig {
1680
+ /** Domain label — appears in the JudgeScore output */
1681
+ domain?: string;
1682
+ /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
1683
+ bannedPhrases?: string[];
1684
+ /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
1685
+ bannedOpenings?: RegExp[];
1686
+ /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
1687
+ hedgingPatterns?: RegExp[];
1688
+ /** Regexes matching apology padding. */
1689
+ apologyPatterns?: RegExp[];
1690
+ /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
1691
+ repetitionThreshold?: number;
1692
+ /** Min output length in chars; below this the turn is deemed too terse. */
1693
+ minLength?: number;
1694
+ /** Max output length in chars; above this the turn is deemed too verbose. */
1695
+ maxLength?: number;
1696
+ /** How heavily each violation class reduces the score (default 1). */
1697
+ penaltyWeights?: Partial<Record<SlopCategory, number>>;
1217
1698
  }
1218
- interface RunOutcome$1 {
1219
- score?: number;
1220
- pass?: boolean;
1221
- failureClass?: FailureClass;
1222
- notes?: string;
1699
+ type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
1700
+ /** Create a reusable Judge function from an anti-slop config. */
1701
+ declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
1702
+ interface AntiSlopIssue {
1703
+ category: SlopCategory;
1704
+ detail: string;
1705
+ example?: string;
1706
+ }
1707
+ interface AntiSlopReport {
1708
+ /** 0–10 score; 10 is clean, lower values mean more slop. */
1709
+ score: number;
1710
+ issues: AntiSlopIssue[];
1711
+ /** Count of each category for programmatic aggregation. */
1712
+ counts: Record<SlopCategory, number>;
1223
1713
  }
1224
1714
  /**
1225
- * Layer — optional classification in a nested build workflow.
1226
- * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
1227
- * `app-build`: sandbox harness that compiled + tested the generated scaffold.
1228
- * `app-runtime`: a run of the generated agent against a domain scenario.
1229
- * `meta`: any meta-eval (judge replay, correlation analysis).
1715
+ * Pure function — analyze one or more outputs against the config. Exposed
1716
+ * separately so consumers can build their own reporters on top.
1230
1717
  */
1231
- type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
1232
- interface Run {
1233
- runId: string;
1234
- scenarioId: string;
1235
- variantId?: string;
1236
- datasetVersion?: string;
1237
- /** Git SHA of agent code at run time. */
1238
- codeSha?: string;
1239
- /** Hash of the prompt template + any system prompt. */
1240
- promptSha?: string;
1241
- /** Model id + date + system-prompt hash, concatenated. */
1242
- modelFingerprint?: string;
1243
- seed?: number;
1244
- /** Arbitrary environment markers (shell, docker version, tz). */
1245
- envFingerprint?: Record<string, string>;
1246
- /** Version of the redaction rules applied to this run. */
1247
- redactionVersion?: string;
1248
- /** Parent run in a nested build workflow. A builder run's children are
1249
- * app-build runs; those children are app-runtime runs. */
1250
- parentRunId?: string;
1251
- /** Stable project identifier — groups runs across chats + sessions. */
1252
- projectId?: string;
1253
- /** Chat/conversation identifier within a project. */
1254
- chatId?: string;
1255
- /** Layer classification hint for aggregation; not enforced. */
1256
- layer?: RunLayer;
1257
- startedAt: number;
1258
- endedAt?: number;
1259
- status: RunStatus;
1260
- outcome?: RunOutcome$1;
1261
- budget?: BudgetSpec;
1262
- /** Free-form labels for downstream grouping. */
1263
- tags?: Record<string, string>;
1718
+ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
1719
+ penaltyWeights: Record<SlopCategory, number>;
1720
+ }): AntiSlopReport;
1721
+
1722
+ /**
1723
+ * Artifact validators.
1724
+ *
1725
+ * Generic "score a produced artifact" primitive. Tax uses it for PDF form
1726
+ * correctness, research for sourced briefs, browser for task assertions, coding
1727
+ * for social posts. One interface, many validators; all plug into
1728
+ * `BenchmarkRunner` the same way.
1729
+ *
1730
+ * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
1731
+ * plus a `ValidationContext` (scenario id, the turns that produced it) and
1732
+ * returns a `ValidationResult` with pass/fail + 0..1 score + structured
1733
+ * issues.
1734
+ */
1735
+ interface Artifact {
1736
+ /** Logical kind — validators type-guard on this */
1737
+ kind: 'file' | 'json' | 'text' | 'binary' | string;
1738
+ /** Filesystem-style path, optional */
1739
+ path?: string;
1740
+ /** String content for text/json/file kinds */
1741
+ content?: string;
1742
+ /** Binary content (if kind === 'binary') */
1743
+ bytes?: Uint8Array;
1744
+ /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
1745
+ metadata?: Record<string, unknown>;
1264
1746
  }
1265
- type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
1266
- type SpanStatus = 'ok' | 'error';
1267
- interface SpanBase {
1268
- spanId: string;
1269
- parentSpanId?: string;
1270
- runId: string;
1271
- kind: SpanKind;
1272
- name: string;
1273
- startedAt: number;
1274
- endedAt?: number;
1275
- status?: SpanStatus;
1276
- error?: string;
1277
- /** Anything not covered by typed fields. Kept deliberately free-form. */
1278
- attributes?: Record<string, unknown>;
1747
+ interface ValidationContext {
1748
+ scenarioId: string;
1749
+ turnIndex?: number;
1750
+ /** Prior artifacts for multi-artifact scenarios */
1751
+ priorArtifacts?: Artifact[];
1752
+ /** Free-form hints the validator uses for domain-specific checks */
1753
+ hints?: Record<string, unknown>;
1279
1754
  }
1280
- interface Message {
1281
- role: 'system' | 'user' | 'assistant' | 'tool';
1282
- content: string;
1283
- tokens?: number;
1284
- /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
1285
- images?: Array<{
1286
- artifactId?: string;
1287
- url?: string;
1288
- mime?: string;
1289
- }>;
1755
+ interface ValidationIssue {
1756
+ severity: 'error' | 'warning' | 'info';
1757
+ message: string;
1758
+ /** Optional path into the artifact (e.g. JSON path or byte offset) */
1759
+ locus?: string;
1290
1760
  }
1291
- interface LlmSpan extends SpanBase {
1292
- kind: 'llm';
1293
- model: string;
1294
- messages: Message[];
1295
- output?: string;
1296
- inputTokens?: number;
1297
- outputTokens?: number;
1298
- cachedTokens?: number;
1299
- reasoningTokens?: number;
1300
- costUsd?: number;
1301
- finishReason?: string;
1761
+ interface ValidationResult {
1762
+ pass: boolean;
1763
+ /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
1764
+ score: number;
1765
+ issues: ValidationIssue[];
1766
+ /** Diagnostic payload for reporters */
1767
+ evidence?: Record<string, unknown>;
1302
1768
  }
1303
- interface ToolSpan extends SpanBase {
1304
- kind: 'tool';
1305
- toolName: string;
1306
- args: unknown;
1307
- result?: unknown;
1308
- latencyMs?: number;
1769
+ interface ArtifactValidator {
1770
+ /** Stable identifier for the validator; appears in reports. */
1771
+ name: string;
1772
+ /** Optional description for human-facing reports. */
1773
+ description?: string;
1774
+ /** Called once per artifact; validators are expected to be pure + idempotent. */
1775
+ validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
1309
1776
  }
1310
- interface RetrievalSpan extends SpanBase {
1311
- kind: 'retrieval';
1312
- query: string;
1313
- hits: Array<{
1314
- docId: string;
1315
- score: number;
1316
- content?: string;
1777
+ /**
1778
+ * Run every validator on the same artifact; aggregate pass as AND, score as
1779
+ * (weighted) mean, issues concatenated. Weights default to 1 each.
1780
+ */
1781
+ declare function composeValidators(validators: ArtifactValidator[], options?: {
1782
+ name?: string;
1783
+ weights?: number[];
1784
+ }): ArtifactValidator;
1785
+ /** Pass if the artifact body matches a provided regex. */
1786
+ declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
1787
+ /** Pass if JSON parses and every required key is present. */
1788
+ declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
1789
+ /** Pass if min ≤ byte length ≤ max. */
1790
+ declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
1791
+ /** Pass if the artifact contains every required substring (case-insensitive by default). */
1792
+ declare function containsAll(name: string, required: string[], options?: {
1793
+ caseSensitive?: boolean;
1794
+ }): ArtifactValidator;
1795
+
1796
+ /**
1797
+ * Workspace inspector — score the persisted state of an agent after a run.
1798
+ *
1799
+ * Many evals don't ask "did the response say the right thing" but "did the
1800
+ * agent put the right rows in the DB / files in the vault / entities on the
1801
+ * canvas". This is the primitive for that.
1802
+ *
1803
+ * Implementations read from D1, KV, filesystem, or any store — the interface
1804
+ * is deliberately small so consumers plug in their own backends.
1805
+ */
1806
+ interface WorkspaceSnapshot {
1807
+ /** Vault files: logical path → content */
1808
+ files: Record<string, string>;
1809
+ /** DB rows: table name → array of rows (post-validation) */
1810
+ rows: Record<string, Array<Record<string, unknown>>>;
1811
+ /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
1812
+ kv: Record<string, string>;
1813
+ /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
1814
+ blobs?: Record<string, {
1815
+ size: number;
1816
+ hash?: string;
1817
+ mimeType?: string;
1317
1818
  }>;
1318
1819
  }
1319
- interface JudgeSpan extends SpanBase {
1320
- kind: 'judge';
1321
- judgeId: string;
1322
- /** Span this judgment applies to. */
1323
- targetSpanId: string;
1324
- dimension: string;
1325
- /** Numeric score (free-range; interpretation up to the judge). */
1326
- score: number;
1327
- rationale?: string;
1328
- evidence?: string;
1329
- }
1330
- interface SandboxSpan extends SpanBase {
1331
- kind: 'sandbox';
1332
- image?: string;
1333
- command?: string;
1334
- exitCode?: number;
1335
- testsTotal?: number;
1336
- testsPassed?: number;
1337
- stdoutHash?: string;
1338
- stderrHash?: string;
1339
- /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
1340
- wallMs?: number;
1341
- }
1342
- interface GenericSpan extends SpanBase {
1343
- kind: 'agent' | 'custom';
1820
+ interface InspectorContext {
1821
+ /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
1822
+ scopeId: string;
1823
+ /** Optional scenario id — allows scenario-specific snapshot shaping */
1824
+ scenarioId?: string;
1344
1825
  }
1345
- type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
1346
- type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
1347
- interface TraceEvent {
1348
- eventId: string;
1349
- runId: string;
1350
- spanId?: string;
1351
- kind: EventKind;
1352
- timestamp: number;
1353
- payload: Record<string, unknown>;
1826
+ interface WorkspaceInspector {
1827
+ name: string;
1828
+ snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
1354
1829
  }
1355
- interface BudgetLedgerEntry {
1356
- runId: string;
1357
- dimension: keyof BudgetSpec;
1358
- limit: number;
1359
- consumed: number;
1360
- remaining: number;
1361
- timestamp: number;
1362
- breached: boolean;
1363
- /** Span that triggered this entry, if any. */
1364
- spanId?: string;
1830
+ declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
1831
+ readonly name = "in-memory";
1832
+ private readonly snapshots;
1833
+ set(scopeId: string, snapshot: WorkspaceSnapshot): void;
1834
+ snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
1365
1835
  }
1366
- interface Artifact {
1367
- artifactId: string;
1368
- runId: string;
1369
- spanId?: string;
1370
- contentType: string;
1371
- sizeBytes: number;
1372
- /** sha256 in hex. */
1373
- hash: string;
1374
- /** External storage URL (R2, S3, filesystem path). */
1375
- storageUrl?: string;
1376
- /** Inline content for small blobs — keep under ~64KB. */
1377
- inlineContent?: string;
1836
+ interface WorkspaceAssertion {
1837
+ name: string;
1838
+ description?: string;
1839
+ check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
1378
1840
  }
1379
- type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
1380
- declare const FAILURE_CLASSES: readonly FailureClass[];
1381
- declare function isLlmSpan(s: Span): s is LlmSpan;
1382
- declare function isToolSpan(s: Span): s is ToolSpan;
1383
- declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
1384
- declare function isJudgeSpan(s: Span): s is JudgeSpan;
1385
- declare function isSandboxSpan(s: Span): s is SandboxSpan;
1841
+ interface WorkspaceAssertionResult {
1842
+ pass: boolean;
1843
+ /** 0..1 partial credit for assertions that admit it */
1844
+ score: number;
1845
+ detail?: string;
1846
+ }
1847
+ declare function fileExists(path: string): WorkspaceAssertion;
1848
+ declare function fileContains(path: string, needle: string): WorkspaceAssertion;
1849
+ declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
1850
+ declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
1851
+ min?: number;
1852
+ }): WorkspaceAssertion;
1853
+ /** Run many assertions; return aggregate pass + mean score + per-assertion details. */
1854
+ declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
1855
+ pass: boolean;
1856
+ score: number;
1857
+ results: Array<{
1858
+ assertion: string;
1859
+ result: WorkspaceAssertionResult;
1860
+ }>;
1861
+ };
1386
1862
 
1387
- interface RunFilter {
1388
- scenarioId?: string;
1389
- variantId?: string;
1390
- status?: RunStatus;
1391
- since?: number;
1392
- until?: number;
1393
- tag?: {
1394
- key: string;
1395
- value: string;
1396
- };
1397
- parentRunId?: string;
1398
- projectId?: string;
1399
- chatId?: string;
1400
- layer?: RunLayer;
1863
+ /**
1864
+ * Experiment tracker — group runs, diff them, watch scores move over time.
1865
+ *
1866
+ * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
1867
+ * - A run has a config (prompt hash, model, scenario ids, seed)
1868
+ * - Runs belong to experiments (named groups)
1869
+ * - The store is pluggable (in-memory for tests, filesystem for local,
1870
+ * custom for Langfuse/D1)
1871
+ * - Diffs show score deltas, new/dropped scenarios, and config changes
1872
+ *
1873
+ * The output plugs directly into `BenchmarkReport` — runs archive the full
1874
+ * report, diff operates on the summary.
1875
+ */
1876
+
1877
+ interface RunConfig {
1878
+ experimentId: string;
1879
+ name?: string;
1880
+ model?: string;
1881
+ promptHash?: string;
1882
+ promptVersion?: string;
1883
+ seed?: number;
1884
+ metadata?: Record<string, unknown>;
1401
1885
  }
1402
- interface SpanFilter {
1403
- runId?: string;
1404
- parentSpanId?: string;
1405
- kind?: SpanKind;
1886
+ interface Run {
1887
+ id: string;
1888
+ experimentId: string;
1406
1889
  name?: string;
1407
- toolName?: string;
1408
- judgeId?: string;
1409
- since?: number;
1410
- until?: number;
1890
+ config: RunConfig;
1891
+ startedAt: string;
1892
+ completedAt?: string;
1893
+ status: 'running' | 'completed' | 'failed';
1894
+ report?: BenchmarkReport;
1895
+ error?: string;
1411
1896
  }
1412
- interface EventFilter {
1413
- runId?: string;
1414
- spanId?: string;
1415
- kind?: EventKind;
1416
- since?: number;
1417
- until?: number;
1897
+ interface Experiment {
1898
+ id: string;
1899
+ name: string;
1900
+ createdAt: string;
1901
+ metadata?: Record<string, unknown>;
1418
1902
  }
1419
- interface TraceStore {
1420
- appendRun(run: Run): Promise<void>;
1421
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1422
- appendSpan(span: Span): Promise<void>;
1423
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1424
- appendEvent(event: TraceEvent): Promise<void>;
1425
- appendArtifact(artifact: Artifact): Promise<void>;
1426
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1427
- getRun(runId: string): Promise<Run | undefined>;
1428
- listRuns(filter?: RunFilter): Promise<Run[]>;
1429
- spans(filter?: SpanFilter): Promise<Span[]>;
1430
- events(filter?: EventFilter): Promise<TraceEvent[]>;
1431
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
1432
- artifacts(runId: string): Promise<Artifact[]>;
1903
+ interface ExperimentStore {
1904
+ saveExperiment(exp: Experiment): Promise<void>;
1905
+ getExperiment(id: string): Promise<Experiment | null>;
1906
+ listExperiments(): Promise<Experiment[]>;
1907
+ saveRun(run: Run): Promise<void>;
1908
+ getRun(id: string): Promise<Run | null>;
1909
+ listRuns(experimentId: string): Promise<Run[]>;
1433
1910
  }
1434
- declare class InMemoryTraceStore implements TraceStore {
1435
- private runs;
1436
- private allSpans;
1437
- private allEvents;
1438
- private allArtifacts;
1439
- private allBudget;
1440
- appendRun(run: Run): Promise<void>;
1441
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1442
- appendSpan(span: Span): Promise<void>;
1443
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1444
- appendEvent(event: TraceEvent): Promise<void>;
1445
- appendArtifact(artifact: Artifact): Promise<void>;
1446
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1447
- getRun(runId: string): Promise<Run | undefined>;
1448
- listRuns(filter?: RunFilter): Promise<Run[]>;
1449
- spans(filter?: SpanFilter): Promise<Span[]>;
1450
- events(filter?: EventFilter): Promise<TraceEvent[]>;
1451
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
1452
- artifacts(runId: string): Promise<Artifact[]>;
1911
+ declare class InMemoryExperimentStore implements ExperimentStore {
1912
+ private readonly experiments;
1913
+ private readonly runs;
1914
+ saveExperiment(exp: Experiment): Promise<void>;
1915
+ getExperiment(id: string): Promise<Experiment | null>;
1916
+ listExperiments(): Promise<Experiment[]>;
1917
+ saveRun(run: Run): Promise<void>;
1918
+ getRun(id: string): Promise<Run | null>;
1919
+ listRuns(experimentId: string): Promise<Run[]>;
1453
1920
  }
1454
- interface FileSystemTraceStoreOptions {
1921
+ declare class ExperimentTracker {
1922
+ private readonly store;
1923
+ constructor(store: ExperimentStore);
1924
+ startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
1925
+ startRun(config: RunConfig): Promise<Run>;
1926
+ completeRun(runId: string, report: BenchmarkReport): Promise<void>;
1927
+ failRun(runId: string, error: string): Promise<void>;
1928
+ /**
1929
+ * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
1930
+ * and config changes that may explain the movement.
1931
+ */
1932
+ diff(runIdA: string, runIdB: string): Promise<RunDiff>;
1933
+ /** Timeline of aggregate scores for an experiment. */
1934
+ timeline(experimentId: string): Promise<Array<{
1935
+ runId: string;
1936
+ startedAt: string;
1937
+ overall: number | null;
1938
+ }>>;
1939
+ }
1940
+ interface RunDiff {
1941
+ before: {
1942
+ runId: string;
1943
+ name?: string;
1944
+ startedAt: string;
1945
+ };
1946
+ after: {
1947
+ runId: string;
1948
+ name?: string;
1949
+ startedAt: string;
1950
+ };
1951
+ aggregateDelta: number;
1952
+ scenarios: Array<{
1953
+ scenarioId: string;
1954
+ before: number | null;
1955
+ after: number | null;
1956
+ delta: number | null;
1957
+ status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
1958
+ }>;
1959
+ configChanges: Record<string, {
1960
+ before: unknown;
1961
+ after: unknown;
1962
+ }>;
1963
+ }
1964
+
1965
+ /**
1966
+ * FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
1967
+ *
1968
+ * Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
1969
+ * files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
1970
+ * based rollover. Writes are append-only so the file log doubles as an audit
1971
+ * trail of every state transition the tracker ever wrote.
1972
+ *
1973
+ * Reads lazy-load every NDJSON file in the directory (including rolled-over
1974
+ * archives), latest-write-wins per `id`. Subsequent writes update the
1975
+ * in-memory index in place so reads after writes are O(1).
1976
+ *
1977
+ * Node-only — imports `node:fs/promises`. Don't import this from a Worker;
1978
+ * use the in-memory store or the D1 store from `./experiment-tracker-d1`.
1979
+ */
1980
+
1981
+ interface FileSystemExperimentStoreOptions {
1982
+ /** Directory the NDJSON files live in. Created on first write. */
1455
1983
  dir: string;
1456
- /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
1984
+ /** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
1457
1985
  maxBytes?: number;
1458
1986
  }
1459
- declare class FileSystemTraceStore implements TraceStore {
1460
- private dir;
1461
- private maxBytes;
1462
- /** Lazy in-memory index for queries — populated on first read. */
1987
+ declare class FileSystemExperimentStore implements ExperimentStore {
1988
+ private readonly dir;
1989
+ private readonly maxBytes;
1463
1990
  private index?;
1464
1991
  private loaded;
1465
- constructor(options: FileSystemTraceStoreOptions);
1992
+ constructor(options: FileSystemExperimentStoreOptions);
1993
+ saveExperiment(exp: Experiment): Promise<void>;
1994
+ getExperiment(id: string): Promise<Experiment | null>;
1995
+ listExperiments(): Promise<Experiment[]>;
1996
+ saveRun(run: Run): Promise<void>;
1997
+ getRun(id: string): Promise<Run | null>;
1998
+ listRuns(experimentId: string): Promise<Run[]>;
1466
1999
  private ensureDir;
1467
2000
  private append;
1468
- private insertInto;
1469
2001
  private load;
1470
- appendRun(run: Run): Promise<void>;
1471
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1472
- appendSpan(span: Span): Promise<void>;
1473
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1474
- appendEvent(event: TraceEvent): Promise<void>;
1475
- appendArtifact(artifact: Artifact): Promise<void>;
1476
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1477
- getRun(runId: string): Promise<Run | undefined>;
1478
- listRuns(filter?: RunFilter): Promise<Run[]>;
1479
- spans(filter?: SpanFilter): Promise<Span[]>;
1480
- events(filter?: EventFilter): Promise<TraceEvent[]>;
1481
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
1482
- artifacts(runId: string): Promise<Artifact[]>;
1483
2002
  }
1484
2003
 
1485
2004
  /**
1486
- * TraceEmitter — hierarchical span builder that auto-parents using an
1487
- * internal stack. One emitter per Run; emitters do NOT share state.
2005
+ * D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
1488
2006
  *
1489
- * Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
1490
- * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
1491
- * have to thread spanIds manually. For async workflows that can't use
1492
- * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
1493
- * explicitly.
2007
+ * Workers-safe (uses only the `D1Database` binding the runtime injects). Two
2008
+ * tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
2009
+ * a Worker route can both write the row at run start and update it at run end
2010
+ * without losing the original config the row's lifecycle mirrors the
2011
+ * `Run.status` field one-to-one.
2012
+ *
2013
+ * Why this lives next to `InMemoryExperimentStore`:
2014
+ * - browser, coding, and computer-use agents can all run as Workers
2015
+ * - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
2016
+ * - Hand-rolling D1 SQL in every consumer is exactly the duplication this
2017
+ * module exists to prevent
2018
+ *
2019
+ * Schema versioning: the `meta` table records `schema_version` so a future
2020
+ * column addition can be detected and migrated additively. Today's schema is
2021
+ * v1; bump only on breaking shape changes.
1494
2022
  */
1495
2023
 
1496
- interface SpanHandle<S extends Span = Span> {
1497
- span: S;
1498
- end(patch?: Partial<S>): Promise<void>;
1499
- fail(error: string | Error, patch?: Partial<S>): Promise<void>;
2024
+ /**
2025
+ * Minimal `D1Database` shape we depend on. Avoids pulling in
2026
+ * `@cloudflare/workers-types` as a hard dep — consumers that already have
2027
+ * those types installed can pass the binding directly.
2028
+ */
2029
+ interface D1Like {
2030
+ prepare(query: string): D1PreparedStatementLike;
2031
+ batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
2032
+ exec(query: string): Promise<unknown>;
1500
2033
  }
1501
- interface TraceEmitterOptions {
1502
- runId?: string;
1503
- /** Inject a clock for deterministic tests. */
1504
- now?: () => number;
1505
- /** Inject an id generator for deterministic tests. */
1506
- id?: () => string;
2034
+ interface D1PreparedStatementLike {
2035
+ bind(...values: unknown[]): D1PreparedStatementLike;
2036
+ first<T = Record<string, unknown>>(): Promise<T | null>;
2037
+ all<T = Record<string, unknown>>(): Promise<{
2038
+ results: T[];
2039
+ }>;
2040
+ run(): Promise<unknown>;
2041
+ }
2042
+ interface D1ExperimentStoreOptions {
2043
+ /** D1 binding from `env`. */
2044
+ db: D1Like;
2045
+ /**
2046
+ * Optional table-name prefix so multiple ExperimentStores can share a DB
2047
+ * without colliding (e.g. `browser_eval_experiments` vs `coding_eval_experiments`).
2048
+ * Default: `agent_eval_`.
2049
+ */
2050
+ tablePrefix?: string;
1507
2051
  }
1508
- declare class TraceEmitter {
1509
- private store;
1510
- private stack;
1511
- private _runId;
1512
- private now;
1513
- private id;
1514
- constructor(store: TraceStore, options?: TraceEmitterOptions);
1515
- get runId(): string;
1516
- startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
1517
- endRun(outcome?: RunOutcome$1): Promise<void>;
1518
- abortRun(reason: string): Promise<void>;
1519
- span<S extends Span = Span>(init: {
1520
- kind: SpanKind;
1521
- name: string;
1522
- parentSpanId?: string;
1523
- attributes?: Record<string, unknown>;
1524
- } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
1525
- private handle;
1526
- private pop;
1527
- llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
1528
- tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
1529
- retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
1530
- recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
1531
- sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
1532
- emit(event: {
1533
- kind: EventKind;
1534
- spanId?: string;
1535
- payload?: Record<string, unknown>;
1536
- }): Promise<TraceEvent>;
1537
- recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
1538
- timestamp?: number;
1539
- }): Promise<BudgetLedgerEntry>;
1540
- recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
2052
+ declare class D1ExperimentStore implements ExperimentStore {
2053
+ private readonly db;
2054
+ private readonly experimentsTable;
2055
+ private readonly runsTable;
2056
+ private readonly metaTable;
2057
+ private schemaReady;
2058
+ constructor(options: D1ExperimentStoreOptions);
1541
2059
  /**
1542
- * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
1543
- * Returns the fn's return value. Use this for the 95% case.
2060
+ * Idempotent schema setup. Safe to call before every operation; the second
2061
+ * call short-circuits via `schemaReady`. Most consumers will call it once
2062
+ * during Worker bootstrap.
1544
2063
  */
1545
- within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
2064
+ ensureSchema(): Promise<void>;
2065
+ saveExperiment(exp: Experiment): Promise<void>;
2066
+ getExperiment(id: string): Promise<Experiment | null>;
2067
+ listExperiments(): Promise<Experiment[]>;
2068
+ saveRun(run: Run): Promise<void>;
2069
+ getRun(id: string): Promise<Run | null>;
2070
+ listRuns(experimentId: string): Promise<Run[]>;
1546
2071
  }
1547
- /** Helper to build an LLM span handle args object from a provider-shaped response. */
1548
- declare function llmSpanFromProvider(args: {
1549
- name?: string;
1550
- model: string;
1551
- messages: Message[];
1552
- output: string;
1553
- usage?: {
1554
- inputTokens?: number;
1555
- outputTokens?: number;
1556
- cachedTokens?: number;
1557
- reasoningTokens?: number;
1558
- };
1559
- costUsd?: number;
1560
- finishReason?: string;
1561
- }): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
1562
2072
 
1563
2073
  /**
1564
2074
  * Typed query helpers over TraceStore.
@@ -1569,7 +2079,7 @@ declare function llmSpanFromProvider(args: {
1569
2079
  * tooling works out of the box.
1570
2080
  */
1571
2081
 
1572
- declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
2082
+ declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run$1[]>;
1573
2083
  declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
1574
2084
  declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
1575
2085
  declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
@@ -1585,7 +2095,7 @@ declare function aggregateLlm(spans: LlmSpan[]): {
1585
2095
  costUsd: number;
1586
2096
  };
1587
2097
  /** Pick the outcome's failure class when present, else derive 'success' from run status. */
1588
- declare function runFailureClass(run: Run): FailureClass;
2098
+ declare function runFailureClass(run: Run$1): FailureClass;
1589
2099
 
1590
2100
  /**
1591
2101
  * Redaction — remove PII / secrets from trace payloads before persist.
@@ -1689,10 +2199,10 @@ interface OtlpExport {
1689
2199
  declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
1690
2200
 
1691
2201
  interface RunTrace {
1692
- run: Run;
2202
+ run: Run$1;
1693
2203
  spans: Span[];
1694
2204
  events: TraceEvent[];
1695
- artifacts: Artifact[];
2205
+ artifacts: Artifact$1[];
1696
2206
  budget: BudgetLedgerEntry[];
1697
2207
  }
1698
2208
  interface RunCriticOptions {
@@ -1725,55 +2235,6 @@ declare function distillPlaybook(entries: PlaybookEntry[], options?: {
1725
2235
  }): Playbook;
1726
2236
  declare function renderPlaybookMarkdown(playbook: Playbook): string;
1727
2237
 
1728
- interface OptimizationExample {
1729
- scenarioId: string;
1730
- metadata?: Record<string, unknown>;
1731
- }
1732
- interface SteeringEvaluation {
1733
- variant: SteeringBundle;
1734
- example: OptimizationExample;
1735
- trialIndex: number;
1736
- }
1737
- interface SteeringVariantReport {
1738
- variantId: string;
1739
- bundle: SteeringBundle;
1740
- mean: number;
1741
- ci95: {
1742
- lower: number;
1743
- upper: number;
1744
- };
1745
- scenarioScores: Record<string, {
1746
- mean: number;
1747
- n: number;
1748
- samples: number[];
1749
- }>;
1750
- }
1751
- interface OptimizationLoopResult {
1752
- winner: SteeringBundle;
1753
- significant: boolean;
1754
- reports: SteeringVariantReport[];
1755
- pairwise: Array<{
1756
- variantA: string;
1757
- variantB: string;
1758
- pValue: number;
1759
- qValue: number;
1760
- significant: boolean;
1761
- meanDelta: number;
1762
- }>;
1763
- }
1764
- interface OptimizationLoopConfig {
1765
- variants: SteeringBundle[];
1766
- examples: OptimizationExample[];
1767
- evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
1768
- scoreWeights?: Partial<RunScoreWeights>;
1769
- trialsPerScenario?: number;
1770
- }
1771
- declare class OptimizationLoop {
1772
- private readonly optimizer;
1773
- constructor(optimizer?: PromptOptimizer);
1774
- run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
1775
- }
1776
-
1777
2238
  type SteeringOptimizerBackend = 'pairwise' | 'ax-gepa';
1778
2239
  interface SteeringOptimizationRow {
1779
2240
  variantId: string;
@@ -2167,7 +2628,7 @@ type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
2167
2628
  /**
2168
2629
  * Dual-agent convergence bench.
2169
2630
  *
2170
- * Pattern lifted from tax-agent + legal-agent: two agents take turns until
2631
+ * Pattern lifted from dual-worker review loops: two agents take turns until
2171
2632
  * they converge on a consensus artifact. One proposes, the other critiques;
2172
2633
  * the proposer revises; repeat until a score threshold is hit or max rounds.
2173
2634
  *
@@ -2400,6 +2861,51 @@ interface LlmReviewerConfig<State, Summary = unknown> {
2400
2861
  }
2401
2862
  declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
2402
2863
 
2864
+ interface ProposeReviewControlState<State, Summary = unknown> {
2865
+ shot: number;
2866
+ state: State;
2867
+ priorReview: Review | null;
2868
+ verification: Verification;
2869
+ traceSummary?: Summary;
2870
+ memory: ReviewMemoryEntry[];
2871
+ completed: boolean;
2872
+ reviewAvailable: boolean;
2873
+ reviewError?: string;
2874
+ }
2875
+ interface ProposeReviewControlAction {
2876
+ type: 'propose-review-shot';
2877
+ shot: number;
2878
+ }
2879
+ interface ProposeReviewControlResult<State, Summary = unknown> {
2880
+ state: State;
2881
+ verification: Verification;
2882
+ traceSummary?: Summary;
2883
+ review: Review | null;
2884
+ reviewAvailable: boolean;
2885
+ reviewError?: string;
2886
+ }
2887
+ interface ProposeReviewControlConfig<State, Summary = unknown> {
2888
+ goal: string;
2889
+ initialState: State;
2890
+ propose: ProposeFn<State, Summary>;
2891
+ verify: VerifyFn<State>;
2892
+ review: ReviewFn<State, Summary>;
2893
+ maxShots?: number;
2894
+ maxWallMs?: number;
2895
+ memory?: ReviewMemoryStore;
2896
+ store?: TraceStore;
2897
+ scenarioId?: string;
2898
+ projectId?: string;
2899
+ variantId?: string;
2900
+ fallbackInstruction?: string;
2901
+ confidenceFloor?: number;
2902
+ confidenceFloorWindow?: number;
2903
+ failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
2904
+ actionFailure?: ControlRuntimeConfig<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>['actionFailure'];
2905
+ }
2906
+ declare function runProposeReviewAsControlLoop<State, Summary = unknown>(config: ProposeReviewControlConfig<State, Summary>): Promise<ControlRunResult<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>>;
2907
+ declare function controlFailureClassFromVerification(verification: Verification): FailureClass | undefined;
2908
+
2403
2909
  /**
2404
2910
  * TestGradedScenario — a scenario whose score comes from a test suite.
2405
2911
  *
@@ -2428,7 +2934,7 @@ interface TestGradedRunOptions {
2428
2934
  variantId?: string;
2429
2935
  driver?: SandboxDriver;
2430
2936
  /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */
2431
- provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
2937
+ provenance?: Pick<Run$1, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
2432
2938
  }
2433
2939
  interface TestGradedRunResult {
2434
2940
  runId: string;
@@ -2481,7 +2987,7 @@ declare class BudgetGuard {
2481
2987
  */
2482
2988
 
2483
2989
  interface FailureContext {
2484
- run: Run;
2990
+ run: Run$1;
2485
2991
  spans: Span[];
2486
2992
  events: TraceEvent[];
2487
2993
  }
@@ -2824,7 +3330,7 @@ interface RegressionSpec {
2824
3330
  metric: string;
2825
3331
  higherIsBetter: boolean;
2826
3332
  /** Extract a scalar from a run. Default extractors handle common metrics. */
2827
- extract?: (run: Run, store: TraceStore) => Promise<number | null>;
3333
+ extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
2828
3334
  }
2829
3335
  interface RegressionOptions extends BaselineOptions {
2830
3336
  baseline: RunFilter;
@@ -2938,7 +3444,7 @@ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): Ora
2938
3444
  /**
2939
3445
  * Cost tracker — token + USD accounting per scenario and per run.
2940
3446
  *
2941
- * Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
3447
+ * Adapted from generic usage-event accounting. Every
2942
3448
  * optimizer needs to know "is the quality gain worth the cost delta?",
2943
3449
  * and every dashboard needs dollars-per-completed-task. MODEL_PRICING
2944
3450
  * from metrics.ts stays authoritative for estimate math; this module
@@ -3149,7 +3655,7 @@ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOpti
3149
3655
  * State continuity scoring — measures how well a resumed/handed-off agent
3150
3656
  * preserves prior work.
3151
3657
  *
3152
- * Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
3658
+ * When session 2 continues
3153
3659
  * session 1's work, the key question is: did it preserve key artifacts,
3154
3660
  * or start over and lose context? Each `ContinuityCheck` inspects one
3155
3661
  * aspect (file preserved, key count grew, status advanced) and yields
@@ -3192,107 +3698,6 @@ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minR
3192
3698
  /** Common check: a status field advanced in an expected order. */
3193
3699
  declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
3194
3700
 
3195
- /**
3196
- * Dataset — versioned, sliceable, content-hashed scenario collection.
3197
- *
3198
- * Scenarios stop being ephemeral arrays and become first-class
3199
- * artifacts. Every Dataset carries:
3200
- * - content hash (sha256 over canonicalized scenario array)
3201
- * - provenance (contributor, createdAt, sourceUrl)
3202
- * - split labels (train | dev | test | holdout)
3203
- * - difficulty tiers (easy | medium | hard | extreme)
3204
- * - tags (free-form, per-scenario)
3205
- *
3206
- * `Dataset.slice({ difficulty, split, holdout, seed })` returns a
3207
- * deterministic, reproducible subset. Holdout slices are locked: you
3208
- * can read them but `mutate` throws, which prevents "oh I'll just
3209
- * tweak that one scenario" contamination drift.
3210
- */
3211
- type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
3212
- type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
3213
- interface DatasetScenario {
3214
- id: string;
3215
- /** Arbitrary payload; the framework doesn't interpret it. */
3216
- payload: unknown;
3217
- split?: DatasetSplit;
3218
- difficulty?: DatasetDifficulty;
3219
- /** Canary token that MUST NOT round-trip through a correct agent output. */
3220
- canary?: string;
3221
- tags?: Record<string, string>;
3222
- }
3223
- interface DatasetProvenance {
3224
- contributor?: string;
3225
- createdAt: string;
3226
- sourceUrl?: string;
3227
- license?: string;
3228
- description?: string;
3229
- /** Monotonic human-readable version (e.g. "2026.04.20"). */
3230
- version: string;
3231
- }
3232
- interface DatasetManifest {
3233
- name: string;
3234
- provenance: DatasetProvenance;
3235
- /** sha256 hex over canonicalized scenarios. */
3236
- contentHash: string;
3237
- scenarioCount: number;
3238
- splitCounts: Record<DatasetSplit, number>;
3239
- }
3240
- interface SliceOptions {
3241
- split?: DatasetSplit;
3242
- difficulty?: DatasetDifficulty;
3243
- /** Number of scenarios (random sample, seeded). Omit to take all that match. */
3244
- limit?: number;
3245
- seed?: number;
3246
- /** Predicate narrowing. Applied after split/difficulty filters. */
3247
- filter?: (scenario: DatasetScenario) => boolean;
3248
- /** If true, include scenarios marked as holdout. Default false. */
3249
- includeHoldout?: boolean;
3250
- }
3251
- /** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
3252
- declare class HoldoutLockedError extends Error {
3253
- constructor(datasetName: string);
3254
- }
3255
- declare class Dataset {
3256
- readonly name: string;
3257
- readonly provenance: DatasetProvenance;
3258
- private scenarios;
3259
- private locked;
3260
- constructor(init: {
3261
- name: string;
3262
- provenance: DatasetProvenance;
3263
- scenarios: DatasetScenario[];
3264
- locked?: boolean;
3265
- });
3266
- /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
3267
- all(): readonly DatasetScenario[];
3268
- get size(): number;
3269
- /**
3270
- * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
3271
- * the same arguments always produce the same slice across machines.
3272
- */
3273
- slice(options?: SliceOptions): DatasetScenario[];
3274
- /**
3275
- * Assemble the manifest (name + provenance + content hash + counts).
3276
- * Content hash is deterministic over canonicalized scenarios.
3277
- */
3278
- manifest(): Promise<DatasetManifest>;
3279
- /** Fresh unlocked copy — for post-release forks when mutation is needed. */
3280
- clone(overrides?: Partial<{
3281
- name: string;
3282
- version: string;
3283
- }>): Dataset;
3284
- lock(): void;
3285
- add(scenario: DatasetScenario): void;
3286
- remove(scenarioId: string): void;
3287
- /**
3288
- * Stable JSON-Lines serialization — deterministic byte-for-byte.
3289
- * Write to disk for contamination-verifiable archives.
3290
- */
3291
- toJsonl(): string;
3292
- static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
3293
- }
3294
- declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
3295
-
3296
3701
  /**
3297
3702
  * ContaminationGuard — ensures held-out scenarios don't leak into
3298
3703
  * training/prompt paths, and flags model memorization.
@@ -3608,7 +4013,7 @@ interface ContractMetric {
3608
4013
  /** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
3609
4014
  maxRegression?: number;
3610
4015
  /** Optional extractor if the metric isn't in the default set. */
3611
- extract?: (run: Run, store: TraceStore) => Promise<number | null>;
4016
+ extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
3612
4017
  }
3613
4018
  interface ThresholdContract {
3614
4019
  name: string;
@@ -3874,10 +4279,10 @@ declare class BuilderSession {
3874
4279
  */
3875
4280
  declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
3876
4281
  projectId: string;
3877
- chatRuns: Run[];
3878
- lastBuilderRun?: Run;
3879
- lastBuildRun?: Run;
3880
- lastAppRuntimeRuns: Run[];
4282
+ chatRuns: Run$1[];
4283
+ lastBuilderRun?: Run$1;
4284
+ lastBuildRun?: Run$1;
4285
+ lastAppRuntimeRuns: Run$1[];
3881
4286
  }>;
3882
4287
 
3883
4288
  /**
@@ -3997,8 +4402,8 @@ interface ChatSummary {
3997
4402
  builderRunId: string;
3998
4403
  startedAt: number;
3999
4404
  endedAt?: number;
4000
- status: Run['status'];
4001
- outcome?: Run['outcome'];
4405
+ status: Run$1['status'];
4406
+ outcome?: Run$1['outcome'];
4002
4407
  /** Counts of spans emitted during the chat. */
4003
4408
  llmTurns?: number;
4004
4409
  toolCalls?: number;
@@ -4006,7 +4411,7 @@ interface ChatSummary {
4006
4411
  appRuntimeRunIds: string[];
4007
4412
  }
4008
4413
  interface ProjectTimelineEntry {
4009
- run: Run;
4414
+ run: Run$1;
4010
4415
  layerBucket: 'chat' | 'build' | 'runtime' | 'other';
4011
4416
  }
4012
4417
  declare class ProjectRegistry {
@@ -4093,7 +4498,7 @@ declare class FileSystemOutcomeStore implements OutcomeStore {
4093
4498
  interface EvalMetricSpec {
4094
4499
  id: string;
4095
4500
  /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
4096
- extract?: (run: Run, store: TraceStore) => Promise<number | null>;
4501
+ extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
4097
4502
  }
4098
4503
  interface OutcomePair {
4099
4504
  evalMetric: string;
@@ -7978,4 +8383,4 @@ interface ReflectionProposal {
7978
8383
  }
7979
8384
  declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
7980
8385
 
7981
- export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
8386
+ export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, 
type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type 
FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type 
InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type 
TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, 
createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, 
partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, 
wranglerDeployRunner };