@tangle-network/agent-eval 0.1.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -435,6 +435,83 @@ declare class MetricsCollector {
435
435
  getConvergenceCurve(): number[];
436
436
  }
437
437
 
438
+ /**
439
+ * ScenarioRegistry — manages scenario discovery and filtering.
440
+ *
441
+ * Each agent registers its scenarios. The registry handles conversion
442
+ * from ScenarioFile format to the framework's Scenario type.
443
+ */
444
+ declare class ScenarioRegistry {
445
+ private scenarios;
446
+ private scenarioFiles;
447
+ /** Register scenarios from ScenarioFile format */
448
+ registerFiles(files: ScenarioFile[]): void;
449
+ /** Register pre-built Scenario objects directly */
450
+ register(scenarios: Scenario[]): void;
451
+ /** Get all scenarios */
452
+ all(): Scenario[];
453
+ /** Get scenarios filtered by category */
454
+ byCategory(category: string): Scenario[];
455
+ /** List all categories with counts */
456
+ listCategories(): {
457
+ category: string;
458
+ count: number;
459
+ }[];
460
+ /** Get scenarios filtered by persona */
461
+ byPersona(persona: string): Scenario[];
462
+ /** Get a single scenario by ID */
463
+ byId(id: string): Scenario | undefined;
464
+ /** Count total scenarios */
465
+ get count(): number;
466
+ }
467
+
468
+ interface AgentDriverConfig {
469
+ client: ProductClient;
470
+ driverModel?: string;
471
+ /** System prompt context for the driver LLM to understand the product */
472
+ productContext?: string;
473
+ }
474
+ /**
475
+ * AgentDriver — meta-agent that plays a persona against the real product.
476
+ *
477
+ * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
478
+ * Not scripted — the driver gets the current product state and decides
479
+ * the next realistic user message.
480
+ */
481
+ declare class AgentDriver {
482
+ private tc;
483
+ private client;
484
+ private driverModel;
485
+ private productContext;
486
+ constructor(tc: TCloud, config: AgentDriverConfig);
487
+ /**
488
+ * Run a persona through the product.
489
+ *
490
+ * Returns metrics on how many turns to completion, cost curve,
491
+ * quality curve, and convergence curve.
492
+ */
493
+ run(persona: PersonaConfig): Promise<DriverResult>;
494
+ /** Use the driver LLM to decide what the "user" says next */
495
+ private decideNextMessage;
496
+ /** Handle pending approvals based on persona feedback patterns */
497
+ private handleApprovals;
498
+ /** Describe which completion criteria are met */
499
+ private describeCompletion;
500
+ }
501
+
502
+ /**
503
+ * Report generation utilities.
504
+ *
505
+ * Outputs convergence curves, cost curves, quality curves,
506
+ * and per-persona summaries in markdown format.
507
+ */
508
+ /** Generate a markdown report from benchmark results */
509
+ declare function formatBenchmarkReport(report: BenchmarkReport): string;
510
+ /** Generate a markdown report from agent driver results */
511
+ declare function formatDriverReport(results: DriverResult[]): string;
512
+ /** Print a compact summary to console */
513
+ declare function printDriverSummary(results: DriverResult[]): void;
514
+
438
515
  /**
439
516
  * Normalize scores so all dimensions follow "higher = better".
440
517
  * Inverted dimensions (hallucination, false_confidence, worst_failure)
@@ -470,6 +547,31 @@ declare function mannWhitneyU(a: number[], b: number[]): {
470
547
  };
471
548
  /** Partial credit: returns 0-1 ratio of current toward target */
472
549
  declare function partialCredit(current: number, target: number): number;
550
+ /**
551
+ * Paired t-test — before/after measurements on the SAME items.
552
+ * Pairing removes inter-item variance, giving tighter significance than
553
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
554
+ * scenarios.
555
+ */
556
+ declare function pairedTTest(before: number[], after: number[]): {
557
+ t: number;
558
+ df: number;
559
+ p: number;
560
+ };
561
+ /**
562
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
563
+ * Use when the differences aren't normally distributed.
564
+ */
565
+ declare function wilcoxonSignedRank(before: number[], after: number[]): {
566
+ w: number;
567
+ p: number;
568
+ };
569
+ /**
570
+ * Cohen's d — standardized effect size for two independent groups.
571
+ * Positive d means group b has higher mean than group a.
572
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
573
+ */
574
+ declare function cohensD(a: number[], b: number[]): number;
473
575
 
474
576
  /**
475
577
  * ConvergenceTracker — tracks completion percentage over turns.
@@ -500,80 +602,2607 @@ declare class ConvergenceTracker {
500
602
  }
501
603
 
502
604
  /**
503
- * ScenarioRegistry — manages scenario discovery and filtering.
605
+ * Versioned prompt registry.
504
606
  *
505
- * Each agent registers its scenarios. The registry handles conversion
506
- * from ScenarioFile format to the framework's Scenario type.
607
+ * Every prompt used in an eval run is registered with an explicit version.
608
+ * Reports include the content hash so A/B compares are rigorous: if the
609
+ * hash changes between two reports, the prompt actually changed; if it
610
+ * matches, the variance is elsewhere.
611
+ *
612
+ * Hash is SHA-256(content), truncated to 12 hex chars for readability.
613
+ * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
507
614
  */
508
- declare class ScenarioRegistry {
509
- private scenarios;
510
- private scenarioFiles;
511
- /** Register scenarios from ScenarioFile format */
512
- registerFiles(files: ScenarioFile[]): void;
513
- /** Register pre-built Scenario objects directly */
514
- register(scenarios: Scenario[]): void;
515
- /** Get all scenarios */
516
- all(): Scenario[];
517
- /** Get scenarios filtered by category */
518
- byCategory(category: string): Scenario[];
519
- /** List all categories with counts */
520
- listCategories(): {
521
- category: string;
522
- count: number;
523
- }[];
524
- /** Get scenarios filtered by persona */
525
- byPersona(persona: string): Scenario[];
526
- /** Get a single scenario by ID */
527
- byId(id: string): Scenario | undefined;
528
- /** Count total scenarios */
529
- get count(): number;
615
+ interface PromptHandle {
616
+ /** Stable human-readable id, e.g. 'legal.system' */
617
+ id: string;
618
+ /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
619
+ version: string;
620
+ /** SHA-256 of content, 12-hex-char prefix */
621
+ hash: string;
622
+ /** Full prompt body */
623
+ content: string;
624
+ }
625
+ declare class PromptRegistry {
626
+ private readonly entries;
627
+ /**
628
+ * Register a prompt. Re-registering the same id+version with DIFFERENT
629
+ * content throws — versions are immutable. Re-registering with the SAME
630
+ * content is a no-op (idempotent).
631
+ */
632
+ register(id: string, version: string, content: string): Promise<PromptHandle>;
633
+ /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
634
+ get(id: string, version: string): PromptHandle;
635
+ /** Return all versions of an id, newest-first (lex-descending on version). */
636
+ listVersions(id: string): PromptHandle[];
637
+ /** Snapshot the whole registry — useful for including in reports. */
638
+ list(): PromptHandle[];
639
+ /** Verify a hash against registered content. Returns null if not found. */
640
+ verifyHash(id: string, version: string, expectedHash: string): boolean | null;
530
641
  }
642
+ /** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
643
+ declare function hashContent(content: string): Promise<string>;
531
644
 
532
- interface AgentDriverConfig {
533
- client: ProductClient;
534
- driverModel?: string;
535
- /** System prompt context for the driver LLM to understand the product */
536
- productContext?: string;
645
+ /**
646
+ * Anti-slop quality judge.
647
+ *
648
+ * Deterministic pattern-based quality check — no LLM call. Catches the
649
+ * 80% of AI slop that every production agent leaks:
650
+ * - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
651
+ * - N-gram repetition (same phrase over and over)
652
+ * - Hedging overuse ("I could be wrong, but...")
653
+ * - Apology padding ("I'm so sorry for the confusion...")
654
+ * - Unused opening formulas ("Great question!")
655
+ * - Length bounds (too short to be useful, too long to be read)
656
+ *
657
+ * Produces a JudgeScore in the same shape as LLM judges so it composes into
658
+ * `BenchmarkRunner`'s judge array transparently.
659
+ */
660
+
661
+ interface AntiSlopConfig {
662
+ /** Domain label — appears in the JudgeScore output */
663
+ domain?: string;
664
+ /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
665
+ bannedPhrases?: string[];
666
+ /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
667
+ bannedOpenings?: RegExp[];
668
+ /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
669
+ hedgingPatterns?: RegExp[];
670
+ /** Regexes matching apology padding. */
671
+ apologyPatterns?: RegExp[];
672
+ /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
673
+ repetitionThreshold?: number;
674
+ /** Min output length in chars; below this the turn is deemed too terse. */
675
+ minLength?: number;
676
+ /** Max output length in chars; above this the turn is deemed too verbose. */
677
+ maxLength?: number;
678
+ /** How heavily each violation class reduces the score (default 1). */
679
+ penaltyWeights?: Partial<Record<SlopCategory, number>>;
680
+ }
681
+ type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
682
+ /** Create a reusable Judge function from an anti-slop config. */
683
+ declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
684
+ interface AntiSlopIssue {
685
+ category: SlopCategory;
686
+ detail: string;
687
+ example?: string;
688
+ }
689
+ interface AntiSlopReport {
690
+ /** 0–10 score; 10 is clean, lower values mean more slop. */
691
+ score: number;
692
+ issues: AntiSlopIssue[];
693
+ /** Count of each category for programmatic aggregation. */
694
+ counts: Record<SlopCategory, number>;
537
695
  }
538
696
  /**
539
- * AgentDriver — meta-agent that plays a persona against the real product.
697
+ * Pure function — analyze one or more outputs against the config. Exposed
698
+ * separately so consumers can build their own reporters on top.
699
+ */
700
+ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
701
+ penaltyWeights: Record<SlopCategory, number>;
702
+ }): AntiSlopReport;
703
+
704
+ /**
705
+ * Artifact validators.
540
706
  *
541
- * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
542
- * Not scripted — the driver gets the current product state and decides
543
- * the next realistic user message.
707
+ * Generic "score a produced artifact" primitive. Tax uses it for PDF form
708
+ * correctness, legal for contract clauses, film for script breakdowns, GTM
709
+ * for social posts. One interface, many validators; all plug into
710
+ * `BenchmarkRunner` the same way.
711
+ *
712
+ * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
713
+ * plus a `ValidationContext` (scenario id, the turns that produced it) and
714
+ * returns a `ValidationResult` with pass/fail + 0..1 score + structured
715
+ * issues.
544
716
  */
545
- declare class AgentDriver {
546
- private tc;
547
- private client;
548
- private driverModel;
549
- private productContext;
550
- constructor(tc: TCloud, config: AgentDriverConfig);
717
+ interface Artifact$1 {
718
+ /** Logical kind — validators type-guard on this */
719
+ kind: 'file' | 'json' | 'text' | 'binary' | string;
720
+ /** Filesystem-style path, optional */
721
+ path?: string;
722
+ /** String content for text/json/file kinds */
723
+ content?: string;
724
+ /** Binary content (if kind === 'binary') */
725
+ bytes?: Uint8Array;
726
+ /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
727
+ metadata?: Record<string, unknown>;
728
+ }
729
+ interface ValidationContext {
730
+ scenarioId: string;
731
+ turnIndex?: number;
732
+ /** Prior artifacts for multi-artifact scenarios */
733
+ priorArtifacts?: Artifact$1[];
734
+ /** Free-form hints the validator uses for domain-specific checks */
735
+ hints?: Record<string, unknown>;
736
+ }
737
+ interface ValidationIssue {
738
+ severity: 'error' | 'warning' | 'info';
739
+ message: string;
740
+ /** Optional path into the artifact (e.g. JSON path or byte offset) */
741
+ locus?: string;
742
+ }
743
+ interface ValidationResult {
744
+ pass: boolean;
745
+ /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
746
+ score: number;
747
+ issues: ValidationIssue[];
748
+ /** Diagnostic payload for reporters */
749
+ evidence?: Record<string, unknown>;
750
+ }
751
+ interface ArtifactValidator {
752
+ /** Stable identifier for the validator; appears in reports. */
753
+ name: string;
754
+ /** Optional description for human-facing reports. */
755
+ description?: string;
756
+ /** Called once per artifact; validators are expected to be pure + idempotent. */
757
+ validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
758
+ }
759
+ /**
760
+ * Run every validator on the same artifact; aggregate pass as AND, score as
761
+ * (weighted) mean, issues concatenated. Weights default to 1 each.
762
+ */
763
+ declare function composeValidators(validators: ArtifactValidator[], options?: {
764
+ name?: string;
765
+ weights?: number[];
766
+ }): ArtifactValidator;
767
+ /** Pass if the artifact body matches a provided regex. */
768
+ declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
769
+ /** Pass if JSON parses and every required key is present. */
770
+ declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
771
+ /** Pass if min ≤ byte length ≤ max. */
772
+ declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
773
+ /** Pass if the artifact contains every required substring (case-insensitive by default). */
774
+ declare function containsAll(name: string, required: string[], options?: {
775
+ caseSensitive?: boolean;
776
+ }): ArtifactValidator;
777
+
778
+ /**
779
+ * Workspace inspector — score the persisted state of an agent after a run.
780
+ *
781
+ * Many evals don't ask "did the response say the right thing" but "did the
782
+ * agent put the right rows in the DB / files in the vault / entities on the
783
+ * canvas". This is the primitive for that.
784
+ *
785
+ * Implementations read from D1, KV, filesystem, or any store — the interface
786
+ * is deliberately small so consumers plug in their own backends.
787
+ */
788
+ interface WorkspaceSnapshot {
789
+ /** Vault files: logical path → content */
790
+ files: Record<string, string>;
791
+ /** DB rows: table name → array of rows (post-validation) */
792
+ rows: Record<string, Array<Record<string, unknown>>>;
793
+ /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
794
+ kv: Record<string, string>;
795
+ /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
796
+ blobs?: Record<string, {
797
+ size: number;
798
+ hash?: string;
799
+ mimeType?: string;
800
+ }>;
801
+ }
802
+ interface InspectorContext {
803
+ /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
804
+ scopeId: string;
805
+ /** Optional scenario id — allows scenario-specific snapshot shaping */
806
+ scenarioId?: string;
807
+ }
808
+ interface WorkspaceInspector {
809
+ name: string;
810
+ snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
811
+ }
812
+ declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
813
+ readonly name = "in-memory";
814
+ private readonly snapshots;
815
+ set(scopeId: string, snapshot: WorkspaceSnapshot): void;
816
+ snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
817
+ }
818
+ interface WorkspaceAssertion {
819
+ name: string;
820
+ description?: string;
821
+ check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
822
+ }
823
+ interface WorkspaceAssertionResult {
824
+ pass: boolean;
825
+ /** 0..1 — partial credit for assertions that admit it */
826
+ score: number;
827
+ detail?: string;
828
+ }
829
+ declare function fileExists(path: string): WorkspaceAssertion;
830
+ declare function fileContains(path: string, needle: string): WorkspaceAssertion;
831
+ declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
832
+ declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
833
+ min?: number;
834
+ }): WorkspaceAssertion;
835
+ /** Run many assertions; return aggregate pass + mean score + per-assertion details. */
836
+ declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
837
+ pass: boolean;
838
+ score: number;
839
+ results: Array<{
840
+ assertion: string;
841
+ result: WorkspaceAssertionResult;
842
+ }>;
843
+ };
844
+
845
+ /**
846
+ * Experiment tracker — group runs, diff them, watch scores move over time.
847
+ *
848
+ * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
849
+ * - A run has a config (prompt hash, model, scenario ids, seed)
850
+ * - Runs belong to experiments (named groups)
851
+ * - The store is pluggable (in-memory for tests, filesystem for local,
852
+ * custom for Langfuse/D1)
853
+ * - Diffs show score deltas, new/dropped scenarios, and config changes
854
+ *
855
+ * The output plugs directly into `BenchmarkReport` — runs archive the full
856
+ * report, diff operates on the summary.
857
+ */
858
+
859
+ interface RunConfig {
860
+ experimentId: string;
861
+ name?: string;
862
+ model?: string;
863
+ promptHash?: string;
864
+ promptVersion?: string;
865
+ seed?: number;
866
+ metadata?: Record<string, unknown>;
867
+ }
868
+ interface Run$1 {
869
+ id: string;
870
+ experimentId: string;
871
+ name?: string;
872
+ config: RunConfig;
873
+ startedAt: string;
874
+ completedAt?: string;
875
+ status: 'running' | 'completed' | 'failed';
876
+ report?: BenchmarkReport;
877
+ error?: string;
878
+ }
879
+ interface Experiment {
880
+ id: string;
881
+ name: string;
882
+ createdAt: string;
883
+ metadata?: Record<string, unknown>;
884
+ }
885
+ interface ExperimentStore {
886
+ saveExperiment(exp: Experiment): Promise<void>;
887
+ getExperiment(id: string): Promise<Experiment | null>;
888
+ listExperiments(): Promise<Experiment[]>;
889
+ saveRun(run: Run$1): Promise<void>;
890
+ getRun(id: string): Promise<Run$1 | null>;
891
+ listRuns(experimentId: string): Promise<Run$1[]>;
892
+ }
893
+ declare class InMemoryExperimentStore implements ExperimentStore {
894
+ private readonly experiments;
895
+ private readonly runs;
896
+ saveExperiment(exp: Experiment): Promise<void>;
897
+ getExperiment(id: string): Promise<Experiment | null>;
898
+ listExperiments(): Promise<Experiment[]>;
899
+ saveRun(run: Run$1): Promise<void>;
900
+ getRun(id: string): Promise<Run$1 | null>;
901
+ listRuns(experimentId: string): Promise<Run$1[]>;
902
+ }
903
+ declare class ExperimentTracker {
904
+ private readonly store;
905
+ constructor(store: ExperimentStore);
906
+ startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
907
+ startRun(config: RunConfig): Promise<Run$1>;
908
+ completeRun(runId: string, report: BenchmarkReport): Promise<void>;
909
+ failRun(runId: string, error: string): Promise<void>;
551
910
  /**
552
- * Run a persona through the product.
553
- *
554
- * Returns metrics on how many turns to completion, cost curve,
555
- * quality curve, and convergence curve.
911
+ * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
912
+ * and config changes that may explain the movement.
556
913
  */
557
- run(persona: PersonaConfig): Promise<DriverResult>;
558
- /** Use the driver LLM to decide what the "user" says next */
559
- private decideNextMessage;
560
- /** Handle pending approvals based on persona feedback patterns */
561
- private handleApprovals;
562
- /** Describe which completion criteria are met */
563
- private describeCompletion;
914
+ diff(runIdA: string, runIdB: string): Promise<RunDiff>;
915
+ /** Timeline of aggregate scores for an experiment. */
916
+ timeline(experimentId: string): Promise<Array<{
917
+ runId: string;
918
+ startedAt: string;
919
+ overall: number | null;
920
+ }>>;
921
+ }
922
+ interface RunDiff {
923
+ before: {
924
+ runId: string;
925
+ name?: string;
926
+ startedAt: string;
927
+ };
928
+ after: {
929
+ runId: string;
930
+ name?: string;
931
+ startedAt: string;
932
+ };
933
+ aggregateDelta: number;
934
+ scenarios: Array<{
935
+ scenarioId: string;
936
+ before: number | null;
937
+ after: number | null;
938
+ delta: number | null;
939
+ status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
940
+ }>;
941
+ configChanges: Record<string, {
942
+ before: unknown;
943
+ after: unknown;
944
+ }>;
564
945
  }
565
946
 
566
947
  /**
567
- * Report generation utilities.
948
+ * Prompt optimizer — A/B test prompt variants with statistical rigor.
568
949
  *
569
- * Outputs convergence curves, cost curves, quality curves,
570
- * and per-persona summaries in markdown format.
950
+ * Runs N prompt variants against a fixed scenario set, collects per-scenario
951
+ * scores via the user-provided `scoreVariant` callback, and returns:
952
+ * - per-variant mean + bootstrap CI
953
+ * - pairwise significance (Mann-Whitney, non-parametric — works on any
954
+ * score distribution, not just normal)
955
+ * - a winner (highest mean, flagged if the lead is not significant)
956
+ *
957
+ * Deliberately generic — the `scoreVariant` callback does whatever domain
958
+ * work the consumer needs (invoke the agent, judge the output, whatever),
959
+ * and returns a number per scenario. This lets the optimizer stay small +
960
+ * testable.
571
961
  */
572
- /** Generate a markdown report from benchmark results */
573
- declare function formatBenchmarkReport(report: BenchmarkReport): string;
574
- /** Generate a markdown report from agent driver results */
575
- declare function formatDriverReport(results: DriverResult[]): string;
576
- /** Print a compact summary to console */
577
- declare function printDriverSummary(results: DriverResult[]): void;
962
+ interface PromptVariant {
963
+ id: string;
964
+ prompt: string;
965
+ metadata?: Record<string, unknown>;
966
+ }
967
+ interface OptimizationConfig {
968
+ variants: PromptVariant[];
969
+ /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
970
+ trialsPerScenario?: number;
971
+ /** Significance threshold for pairwise comparison (default 0.05). */
972
+ significanceLevel?: number;
973
+ /**
974
+ * The scoring callback. For each (variant, scenarioId, trialIndex), produce
975
+ * a score in 0..1 (or any numeric range — the optimizer only cares about
976
+ * monotonicity).
977
+ */
978
+ scoreVariant: (args: {
979
+ variant: PromptVariant;
980
+ scenarioId: string;
981
+ trialIndex: number;
982
+ }) => Promise<number>;
983
+ /** Scenario ids to run against. */
984
+ scenarioIds: string[];
985
+ /** Optional hook — fires after each (variant, scenario) fully scored. */
986
+ onScenarioComplete?: (info: {
987
+ variantId: string;
988
+ scenarioId: string;
989
+ scores: number[];
990
+ }) => void;
991
+ }
992
+ interface VariantScore {
993
+ variantId: string;
994
+ mean: number;
995
+ ci95: {
996
+ lower: number;
997
+ upper: number;
998
+ };
999
+ n: number;
1000
+ perScenario: Record<string, {
1001
+ mean: number;
1002
+ n: number;
1003
+ samples: number[];
1004
+ }>;
1005
+ }
1006
+ interface PairwiseComparison {
1007
+ variantA: string;
1008
+ variantB: string;
1009
+ pValue: number;
1010
+ /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
1011
+ qValue: number;
1012
+ /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
1013
+ significant: boolean;
1014
+ meanDelta: number;
1015
+ }
1016
+ interface OptimizationResult {
1017
+ winner: {
1018
+ variantId: string;
1019
+ /** True when the winner's lead vs every other variant is statistically significant. */
1020
+ significant: boolean;
1021
+ ciLowerBoundExceedsSecondMean: boolean;
1022
+ };
1023
+ scores: VariantScore[];
1024
+ pairwise: PairwiseComparison[];
1025
+ config: {
1026
+ trialsPerScenario: number;
1027
+ significanceLevel: number;
1028
+ variants: string[];
1029
+ scenarios: string[];
1030
+ };
1031
+ }
1032
+ declare class PromptOptimizer {
1033
+ run(config: OptimizationConfig): Promise<OptimizationResult>;
1034
+ }
1035
+
1036
+ /**
1037
+ * Dual-agent convergence bench.
1038
+ *
1039
+ * Pattern lifted from tax-agent + legal-agent: two agents take turns until
1040
+ * they converge on a consensus artifact. One proposes, the other critiques;
1041
+ * the proposer revises; repeat until a score threshold is hit or max rounds.
1042
+ *
1043
+ * Generalized so any two "agents" (gateways, local functions, anything with
1044
+ * `propose` + `critique`) compose in. Returns convergence rounds per
1045
+ * scenario + whether convergence happened.
1046
+ */
1047
+ interface DualAgentScenario {
1048
+ id: string;
1049
+ initialPrompt: string;
1050
+ /** Optional context the agents can read (e.g. source documents). */
1051
+ context?: Record<string, unknown>;
1052
+ }
1053
+ interface DualAgentRound {
1054
+ roundIndex: number;
1055
+ proposal: string;
1056
+ critique: string;
1057
+ convergenceScore: number;
1058
+ }
1059
+ interface DualAgentScenarioResult {
1060
+ scenarioId: string;
1061
+ converged: boolean;
1062
+ roundsToConverge: number | null;
1063
+ finalProposal: string;
1064
+ history: DualAgentRound[];
1065
+ finalScore: number;
1066
+ }
1067
+ interface DualAgentBenchConfig {
1068
+ scenarios: DualAgentScenario[];
1069
+ maxRounds?: number;
1070
+ /** Convergence threshold in 0..1 (default 0.85). */
1071
+ convergenceThreshold?: number;
1072
+ /**
1073
+ * Propose an answer given the scenario + the critic's prior critique (if any).
1074
+ * Returns the proposal string.
1075
+ */
1076
+ propose: (args: {
1077
+ scenario: DualAgentScenario;
1078
+ roundIndex: number;
1079
+ priorProposal?: string;
1080
+ priorCritique?: string;
1081
+ }) => Promise<string>;
1082
+ /**
1083
+ * Critique the proposer's current output. Returns a structured critique
1084
+ * (free text) plus a convergence score: how close the proposal is to
1085
+ * acceptable. 1.0 = accept, 0.0 = totally off.
1086
+ */
1087
+ critique: (args: {
1088
+ scenario: DualAgentScenario;
1089
+ roundIndex: number;
1090
+ proposal: string;
1091
+ }) => Promise<{
1092
+ critique: string;
1093
+ convergenceScore: number;
1094
+ }>;
1095
+ /** Optional per-round hook for progress + tracing. */
1096
+ onRoundComplete?: (info: {
1097
+ scenarioId: string;
1098
+ round: DualAgentRound;
1099
+ }) => void;
1100
+ }
1101
+ interface DualAgentReport {
1102
+ scenarios: DualAgentScenarioResult[];
1103
+ aggregate: {
1104
+ convergenceRate: number;
1105
+ avgRoundsToConverge: number | null;
1106
+ avgFinalScore: number;
1107
+ };
1108
+ config: {
1109
+ maxRounds: number;
1110
+ convergenceThreshold: number;
1111
+ };
1112
+ }
1113
+ declare class DualAgentBench {
1114
+ run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
1115
+ }
1116
+
1117
+ /**
1118
+ * TraceSchema v1 — the canonical data model for agent-eval.
1119
+ *
1120
+ * Every score, every failure class, every pipeline in the framework is
1121
+ * a view over this data. Shape it once, live with it.
1122
+ *
1123
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
1124
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
1125
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
1126
+ * entities that OTEL leaves as free-form attributes.
1127
+ */
1128
+ declare const TRACE_SCHEMA_VERSION = "1.0.0";
1129
+ type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
1130
+ interface BudgetSpec {
1131
+ tokens?: number;
1132
+ wallMs?: number;
1133
+ calls?: number;
1134
+ usd?: number;
1135
+ }
1136
+ interface RunOutcome {
1137
+ score?: number;
1138
+ pass?: boolean;
1139
+ failureClass?: FailureClass;
1140
+ notes?: string;
1141
+ }
1142
+ /**
1143
+ * Layer — optional classification in a nested build workflow.
1144
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
1145
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
1146
+ * `app-runtime`: a run of the generated agent against a domain scenario.
1147
+ * `meta`: any meta-eval (judge replay, correlation analysis).
1148
+ */
1149
+ type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
1150
+ interface Run {
1151
+ runId: string;
1152
+ scenarioId: string;
1153
+ variantId?: string;
1154
+ datasetVersion?: string;
1155
+ /** Git SHA of agent code at run time. */
1156
+ codeSha?: string;
1157
+ /** Hash of the prompt template + any system prompt. */
1158
+ promptSha?: string;
1159
+ /** Model id + date + system-prompt hash, concatenated. */
1160
+ modelFingerprint?: string;
1161
+ seed?: number;
1162
+ /** Arbitrary environment markers (shell, docker version, tz). */
1163
+ envFingerprint?: Record<string, string>;
1164
+ /** Version of the redaction rules applied to this run. */
1165
+ redactionVersion?: string;
1166
+ /** Parent run in a nested build workflow. A builder run's children are
1167
+ * app-build runs; those children are app-runtime runs. */
1168
+ parentRunId?: string;
1169
+ /** Stable project identifier — groups runs across chats + sessions. */
1170
+ projectId?: string;
1171
+ /** Chat/conversation identifier within a project. */
1172
+ chatId?: string;
1173
+ /** Layer classification — hint for aggregation; not enforced. */
1174
+ layer?: RunLayer;
1175
+ startedAt: number;
1176
+ endedAt?: number;
1177
+ status: RunStatus;
1178
+ outcome?: RunOutcome;
1179
+ budget?: BudgetSpec;
1180
+ /** Free-form labels for downstream grouping. */
1181
+ tags?: Record<string, string>;
1182
+ }
1183
+ type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
1184
+ type SpanStatus = 'ok' | 'error';
1185
+ interface SpanBase {
1186
+ spanId: string;
1187
+ parentSpanId?: string;
1188
+ runId: string;
1189
+ kind: SpanKind;
1190
+ name: string;
1191
+ startedAt: number;
1192
+ endedAt?: number;
1193
+ status?: SpanStatus;
1194
+ error?: string;
1195
+ /** Anything not covered by typed fields. Kept deliberately free-form. */
1196
+ attributes?: Record<string, unknown>;
1197
+ }
1198
+ interface Message {
1199
+ role: 'system' | 'user' | 'assistant' | 'tool';
1200
+ content: string;
1201
+ tokens?: number;
1202
+ /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
1203
+ images?: Array<{
1204
+ artifactId?: string;
1205
+ url?: string;
1206
+ mime?: string;
1207
+ }>;
1208
+ }
1209
+ interface LlmSpan extends SpanBase {
1210
+ kind: 'llm';
1211
+ model: string;
1212
+ messages: Message[];
1213
+ output?: string;
1214
+ inputTokens?: number;
1215
+ outputTokens?: number;
1216
+ cachedTokens?: number;
1217
+ reasoningTokens?: number;
1218
+ costUsd?: number;
1219
+ finishReason?: string;
1220
+ }
1221
+ interface ToolSpan extends SpanBase {
1222
+ kind: 'tool';
1223
+ toolName: string;
1224
+ args: unknown;
1225
+ result?: unknown;
1226
+ latencyMs?: number;
1227
+ }
1228
+ interface RetrievalSpan extends SpanBase {
1229
+ kind: 'retrieval';
1230
+ query: string;
1231
+ hits: Array<{
1232
+ docId: string;
1233
+ score: number;
1234
+ content?: string;
1235
+ }>;
1236
+ }
1237
+ interface JudgeSpan extends SpanBase {
1238
+ kind: 'judge';
1239
+ judgeId: string;
1240
+ /** Span this judgment applies to. */
1241
+ targetSpanId: string;
1242
+ dimension: string;
1243
+ /** Numeric score (free-range; interpretation up to the judge). */
1244
+ score: number;
1245
+ rationale?: string;
1246
+ evidence?: string;
1247
+ }
1248
+ interface SandboxSpan extends SpanBase {
1249
+ kind: 'sandbox';
1250
+ image?: string;
1251
+ command?: string;
1252
+ exitCode?: number;
1253
+ testsTotal?: number;
1254
+ testsPassed?: number;
1255
+ stdoutHash?: string;
1256
+ stderrHash?: string;
1257
+ /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
1258
+ wallMs?: number;
1259
+ }
1260
+ interface GenericSpan extends SpanBase {
1261
+ kind: 'agent' | 'custom';
1262
+ }
1263
+ type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
1264
+ type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
1265
+ interface TraceEvent {
1266
+ eventId: string;
1267
+ runId: string;
1268
+ spanId?: string;
1269
+ kind: EventKind;
1270
+ timestamp: number;
1271
+ payload: Record<string, unknown>;
1272
+ }
1273
+ interface BudgetLedgerEntry {
1274
+ runId: string;
1275
+ dimension: keyof BudgetSpec;
1276
+ limit: number;
1277
+ consumed: number;
1278
+ remaining: number;
1279
+ timestamp: number;
1280
+ breached: boolean;
1281
+ /** Span that triggered this entry, if any. */
1282
+ spanId?: string;
1283
+ }
1284
+ interface Artifact {
1285
+ artifactId: string;
1286
+ runId: string;
1287
+ spanId?: string;
1288
+ contentType: string;
1289
+ sizeBytes: number;
1290
+ /** sha256 in hex. */
1291
+ hash: string;
1292
+ /** External storage URL (R2, S3, filesystem path). */
1293
+ storageUrl?: string;
1294
+ /** Inline content for small blobs — keep under ~64KB. */
1295
+ inlineContent?: string;
1296
+ }
1297
+ type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
1298
+ declare const FAILURE_CLASSES: readonly FailureClass[];
1299
+ declare function isLlmSpan(s: Span): s is LlmSpan;
1300
+ declare function isToolSpan(s: Span): s is ToolSpan;
1301
+ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
1302
+ declare function isJudgeSpan(s: Span): s is JudgeSpan;
1303
+ declare function isSandboxSpan(s: Span): s is SandboxSpan;
1304
+
1305
+ interface RunFilter {
1306
+ scenarioId?: string;
1307
+ variantId?: string;
1308
+ status?: RunStatus;
1309
+ since?: number;
1310
+ until?: number;
1311
+ tag?: {
1312
+ key: string;
1313
+ value: string;
1314
+ };
1315
+ parentRunId?: string;
1316
+ projectId?: string;
1317
+ chatId?: string;
1318
+ layer?: RunLayer;
1319
+ }
1320
+ interface SpanFilter {
1321
+ runId?: string;
1322
+ parentSpanId?: string;
1323
+ kind?: SpanKind;
1324
+ name?: string;
1325
+ toolName?: string;
1326
+ judgeId?: string;
1327
+ since?: number;
1328
+ until?: number;
1329
+ }
1330
+ interface EventFilter {
1331
+ runId?: string;
1332
+ spanId?: string;
1333
+ kind?: EventKind;
1334
+ since?: number;
1335
+ until?: number;
1336
+ }
1337
+ interface TraceStore {
1338
+ appendRun(run: Run): Promise<void>;
1339
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1340
+ appendSpan(span: Span): Promise<void>;
1341
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1342
+ appendEvent(event: TraceEvent): Promise<void>;
1343
+ appendArtifact(artifact: Artifact): Promise<void>;
1344
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1345
+ getRun(runId: string): Promise<Run | undefined>;
1346
+ listRuns(filter?: RunFilter): Promise<Run[]>;
1347
+ spans(filter?: SpanFilter): Promise<Span[]>;
1348
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
1349
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
1350
+ artifacts(runId: string): Promise<Artifact[]>;
1351
+ }
1352
+ declare class InMemoryTraceStore implements TraceStore {
1353
+ private runs;
1354
+ private allSpans;
1355
+ private allEvents;
1356
+ private allArtifacts;
1357
+ private allBudget;
1358
+ appendRun(run: Run): Promise<void>;
1359
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1360
+ appendSpan(span: Span): Promise<void>;
1361
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1362
+ appendEvent(event: TraceEvent): Promise<void>;
1363
+ appendArtifact(artifact: Artifact): Promise<void>;
1364
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1365
+ getRun(runId: string): Promise<Run | undefined>;
1366
+ listRuns(filter?: RunFilter): Promise<Run[]>;
1367
+ spans(filter?: SpanFilter): Promise<Span[]>;
1368
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
1369
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
1370
+ artifacts(runId: string): Promise<Artifact[]>;
1371
+ }
1372
+ interface FileSystemTraceStoreOptions {
1373
+ dir: string;
1374
+ /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
1375
+ maxBytes?: number;
1376
+ }
1377
+ declare class FileSystemTraceStore implements TraceStore {
1378
+ private dir;
1379
+ private maxBytes;
1380
+ /** Lazy in-memory index for queries — populated on first read. */
1381
+ private index?;
1382
+ private loaded;
1383
+ constructor(options: FileSystemTraceStoreOptions);
1384
+ private ensureDir;
1385
+ private append;
1386
+ private insertInto;
1387
+ private load;
1388
+ appendRun(run: Run): Promise<void>;
1389
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1390
+ appendSpan(span: Span): Promise<void>;
1391
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1392
+ appendEvent(event: TraceEvent): Promise<void>;
1393
+ appendArtifact(artifact: Artifact): Promise<void>;
1394
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1395
+ getRun(runId: string): Promise<Run | undefined>;
1396
+ listRuns(filter?: RunFilter): Promise<Run[]>;
1397
+ spans(filter?: SpanFilter): Promise<Span[]>;
1398
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
1399
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
1400
+ artifacts(runId: string): Promise<Artifact[]>;
1401
+ }
1402
+
1403
+ /**
1404
+ * TraceEmitter — hierarchical span builder that auto-parents using an
1405
+ * internal stack. One emitter per Run; emitters do NOT share state.
1406
+ *
1407
+ * Convenience methods (`llm`, `tool`, `retrieval`, `sandbox`)
1408
+ * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
1409
+ * have to thread spanIds manually. For async workflows that can't use
1410
+ * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
1411
+ * explicitly.
1412
+ */
1413
+
1414
+ interface SpanHandle<S extends Span = Span> {
1415
+ span: S;
1416
+ end(patch?: Partial<S>): Promise<void>;
1417
+ fail(error: string | Error, patch?: Partial<S>): Promise<void>;
1418
+ }
1419
+ interface TraceEmitterOptions {
1420
+ runId?: string;
1421
+ /** Inject a clock for deterministic tests. */
1422
+ now?: () => number;
1423
+ /** Inject an id generator for deterministic tests. */
1424
+ id?: () => string;
1425
+ }
1426
+ declare class TraceEmitter {
1427
+ private store;
1428
+ private stack;
1429
+ private _runId;
1430
+ private now;
1431
+ private id;
1432
+ constructor(store: TraceStore, options?: TraceEmitterOptions);
1433
+ get runId(): string;
1434
+ startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
1435
+ endRun(outcome?: RunOutcome): Promise<void>;
1436
+ abortRun(reason: string): Promise<void>;
1437
+ span<S extends Span = Span>(init: {
1438
+ kind: SpanKind;
1439
+ name: string;
1440
+ parentSpanId?: string;
1441
+ attributes?: Record<string, unknown>;
1442
+ } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
1443
+ private handle;
1444
+ private pop;
1445
+ llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
1446
+ tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
1447
+ retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
1448
+ recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
1449
+ sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
1450
+ emit(event: {
1451
+ kind: EventKind;
1452
+ spanId?: string;
1453
+ payload?: Record<string, unknown>;
1454
+ }): Promise<TraceEvent>;
1455
+ recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
1456
+ timestamp?: number;
1457
+ }): Promise<BudgetLedgerEntry>;
1458
+ recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
1459
+ /**
1460
+ * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
1461
+ * Returns the fn's return value. Use this for the 95% case.
1462
+ */
1463
+ within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
1464
+ }
1465
+ /** Helper that builds the `init` argument for `TraceEmitter.llm` from a provider-shaped response. */
1466
+ declare function llmSpanFromProvider(args: {
1467
+ name?: string;
1468
+ model: string;
1469
+ messages: Message[];
1470
+ output: string;
1471
+ usage?: {
1472
+ inputTokens?: number;
1473
+ outputTokens?: number;
1474
+ cachedTokens?: number;
1475
+ reasoningTokens?: number;
1476
+ };
1477
+ costUsd?: number;
1478
+ finishReason?: string;
1479
+ }): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
1480
+
1481
+ /**
1482
+ * Typed query helpers over TraceStore.
1483
+ *
1484
+ * Not a full SQL engine — a minimal, composable set of operators that
1485
+ * cover the canned-pipeline use cases. For ad-hoc analytics, persist to
1486
+ * NDJSON and point DuckDB at it; the schema is stable so external SQL
1487
+ * tooling works out of the box.
1488
+ */
1489
+
1490
+ declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
1491
+ declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
1492
+ declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
1493
+ declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
1494
+ /** Group items by the key that `key` returns for each item. */
1495
+ declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
1496
+ /** Hash tool arguments to a string that is stable regardless of object key order, for de-duplication. */
1497
+ declare function argHash(args: unknown): string;
1498
+ /** Sum an array of LLM spans into aggregate token counts and cost. */
1499
+ declare function aggregateLlm(spans: LlmSpan[]): {
1500
+ inputTokens: number;
1501
+ outputTokens: number;
1502
+ cachedTokens: number;
1503
+ costUsd: number;
1504
+ };
1505
+ /** Pick the outcome's failure class when present, else derive 'success' from run status. */
1506
+ declare function runFailureClass(run: Run): FailureClass;
1507
+
1508
+ /**
1509
+ * Redaction — remove PII / secrets from trace payloads before persist.
1510
+ *
1511
+ * Pre-persistence rules mean raw traces in storage are already scrubbed.
1512
+ * Unredacted variants (for debugging / post-mortems) live in a separate
1513
+ * storage layer with stricter access controls; this module only covers
1514
+ * the default scrub-then-persist path.
1515
+ *
1516
+ * Rules compose: pass an array of `RedactionRule`, each is applied in
1517
+ * order. Strings that match get replaced with a tagged sentinel so the
1518
+ * eval framework can count how many redactions happened per run
1519
+ * (surfaced via `redaction_applied` events).
1520
+ */
1521
+ interface RedactionRule {
1522
+ id: string;
1523
+ pattern: RegExp;
1524
+ /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
1525
+ replacement?: string;
1526
+ }
1527
+ interface RedactionReport {
1528
+ redactionCount: number;
1529
+ byRule: Record<string, number>;
1530
+ }
1531
+ /** OWASP / common-sense defaults — extend per-domain. */
1532
+ declare const DEFAULT_REDACTION_RULES: RedactionRule[];
1533
+ declare const REDACTION_VERSION = "1.0.0";
1534
+ /**
1535
+ * Redact a single string. Returns the new string and a per-rule count of
1536
+ * how many substitutions fired.
1537
+ */
1538
+ declare function redactString(input: string, rules?: RedactionRule[]): {
1539
+ output: string;
1540
+ report: RedactionReport;
1541
+ };
1542
+ /**
1543
+ * Walk a JSON-ish value applying `redactString` to every string leaf.
1544
+ * Arrays and plain objects are recursed; other types pass through
1545
+ * untouched. Circular references throw — traces should be tree-shaped.
1546
+ */
1547
+ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
1548
+ value: unknown;
1549
+ report: RedactionReport;
1550
+ };
1551
+
1552
+ /**
1553
+ * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
1554
+ * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
1555
+ *
1556
+ * Wire format only. We do NOT depend on the @opentelemetry SDK — that
1557
+ * would drag in polyfills incompatible with Workers/Edge. Consumers
1558
+ * push the JSON to their collector of choice via HTTP.
1559
+ *
1560
+ * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
1561
+ */
1562
+
1563
+ declare const OTEL_AGENT_EVAL_SCOPE: {
1564
+ name: string;
1565
+ version: string;
1566
+ };
1567
+ interface OtlpSpan {
1568
+ traceId: string;
1569
+ spanId: string;
1570
+ parentSpanId?: string;
1571
+ name: string;
1572
+ kind: number;
1573
+ startTimeUnixNano: string;
1574
+ endTimeUnixNano: string;
1575
+ attributes: Array<{
1576
+ key: string;
1577
+ value: {
1578
+ stringValue?: string;
1579
+ intValue?: string;
1580
+ doubleValue?: number;
1581
+ boolValue?: boolean;
1582
+ };
1583
+ }>;
1584
+ events?: Array<{
1585
+ timeUnixNano: string;
1586
+ name: string;
1587
+ attributes?: OtlpSpan['attributes'];
1588
+ }>;
1589
+ status?: {
1590
+ code: number;
1591
+ message?: string;
1592
+ };
1593
+ }
1594
+ interface OtlpResourceSpans {
1595
+ resource: {
1596
+ attributes: OtlpSpan['attributes'];
1597
+ };
1598
+ scopeSpans: Array<{
1599
+ scope: typeof OTEL_AGENT_EVAL_SCOPE;
1600
+ spans: OtlpSpan[];
1601
+ }>;
1602
+ }
1603
+ interface OtlpExport {
1604
+ resourceSpans: OtlpResourceSpans[];
1605
+ }
1606
+ /** Export a single run's spans + events in OTLP/JSON. */
1607
+ declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
1608
+
1609
+ /**
1610
+ * SandboxHarness — executes a scenario in an isolated environment and
1611
+ * emits a rich SandboxSpan into the trace.
1612
+ *
1613
+ * Two built-in drivers:
1614
+ * - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.
1615
+ * Fast, no dependencies, fine for unit tests and most CI gates.
1616
+ * - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;
1617
+ * shells out to `docker run`. Stronger isolation, slower startup.
1618
+ *
1619
+ * Consumers implement `SandboxDriver` for custom backends (Firecracker,
1620
+ * Cloudflare sandbox product, etc.). The harness doesn't care which.
1621
+ */
1622
+
1623
+ interface HarnessConfig {
1624
+ /** Setup command (e.g. "pnpm install"). Non-zero exit fails the run. */
1625
+ setupCommand?: string;
1626
+ /** Run command (e.g. "pnpm build"). */
1627
+ runCommand?: string;
1628
+ /** Test command (e.g. "pnpm test --run"). Drives the test count + pass count. */
1629
+ testCommand?: string;
1630
+ /** Absolute cwd for the subprocess driver. Ignored by docker driver. */
1631
+ cwd?: string;
1632
+ /** Max wall-clock per phase in ms. Default 10 minutes. */
1633
+ timeoutMs?: number;
1634
+ /** Image for the docker driver. */
1635
+ image?: string;
1636
+ /** Extra env vars (validated; shell-escaped). */
1637
+ env?: Record<string, string>;
1638
+ /** Parser for the test output — maps stdout/stderr/exit code → pass count. */
1639
+ testParser?: TestOutputParser;
1640
+ }
1641
+ interface TestOutputParser {
1642
+ id: string;
1643
+ parse(stdout: string, stderr: string, exitCode: number): {
1644
+ testsTotal: number;
1645
+ testsPassed: number;
1646
+ } | undefined;
1647
+ }
1648
+ interface SandboxResult {
1649
+ phase: 'setup' | 'run' | 'test';
1650
+ exitCode: number;
1651
+ stdout: string;
1652
+ stderr: string;
1653
+ wallMs: number;
1654
+ testsTotal?: number;
1655
+ testsPassed?: number;
1656
+ }
1657
+ interface SandboxDriver {
1658
+ id: string;
1659
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1660
+ }
1661
+ /** Vitest default summary line: "Tests X passed | Y failed". */
1662
+ declare const vitestTestParser: TestOutputParser;
1663
+ /** Pytest default: "collected N items" + " X passed, Y failed". */
1664
+ declare const pytestTestParser: TestOutputParser;
1665
+ /** Jest: "Tests: X passed, Y total" (and optional failed). */
1666
+ declare const jestTestParser: TestOutputParser;
1667
+ /** Composite parser — tries a list of parsers in order. */
1668
+ declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
1669
+ declare class SubprocessSandboxDriver implements SandboxDriver {
1670
+ id: string;
1671
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1672
+ }
1673
+ declare class DockerSandboxDriver implements SandboxDriver {
1674
+ id: string;
1675
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1676
+ }
1677
+ interface SandboxHarnessResult {
1678
+ passed: boolean;
1679
+ setup?: SandboxResult;
1680
+ run?: SandboxResult;
1681
+ test?: SandboxResult;
1682
+ totalWallMs: number;
1683
+ /** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */
1684
+ score: number;
1685
+ }
1686
+ declare class SandboxHarness {
1687
+ private driver;
1688
+ constructor(driver?: SandboxDriver);
1689
+ run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
1690
+ }
1691
+
1692
+ /**
1693
+ * TestGradedScenario — a scenario whose score comes from a test suite.
1694
+ *
1695
+ * This is the SWE-bench pattern generalized. The scenario ships:
1696
+ * - fixture data (setup instructions)
1697
+ * - a test command the harness runs
1698
+ * - optional assertion overrides
1699
+ *
1700
+ * The runner emits a run, delegates to SandboxHarness, records the
1701
+ * outcome, and returns a structured verdict. Consumers bind their own
1702
+ * agent execution to this contract.
1703
+ */
1704
+
1705
+ interface TestGradedScenario {
1706
+ id: string;
1707
+ description?: string;
1708
+ harness: HarnessConfig;
1709
+ /** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
1710
+ passThreshold?: number;
1711
+ /** Provenance for dataset tracking. */
1712
+ datasetVersion?: string;
1713
+ /** Free-form tags (difficulty, category, etc.). */
1714
+ tags?: Record<string, string>;
1715
+ }
1716
+ interface TestGradedRunOptions {
1717
+ variantId?: string;
1718
+ driver?: SandboxDriver;
1719
+ /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */
1720
+ provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
1721
+ }
1722
+ interface TestGradedRunResult {
1723
+ runId: string;
1724
+ scenario: TestGradedScenario;
1725
+ harness: SandboxHarnessResult;
1726
+ pass: boolean;
1727
+ score: number;
1728
+ failureClass?: FailureClass;
1729
+ }
1730
+ declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
1731
+
1732
+ /**
1733
+ * BudgetGuard — enforces token / wall-clock / call / $ caps, records
1734
+ * a ledger entry on every decrement, emits `budget_breach` + throws
1735
+ * `BudgetBreachError` when a cap is hit.
1736
+ *
1737
+ * Wraps a TraceEmitter. The emitter persists ledger entries + breach
1738
+ * events so the classifier, pipelines, and reports can all read
1739
+ * budget state from the trace corpus — no separate accounting.
1740
+ */
1741
+
1742
+ declare class BudgetBreachError extends Error {
1743
+ dimension: keyof BudgetSpec;
1744
+ limit: number;
1745
+ attempted: number;
1746
+ constructor(dimension: keyof BudgetSpec, limit: number, attempted: number);
1747
+ }
1748
+ declare class BudgetGuard {
1749
+ private consumed;
1750
+ private emitter;
1751
+ private budget;
1752
+ private startedAt;
1753
+ constructor(emitter: TraceEmitter, budget: BudgetSpec, now?: () => number);
1754
+ /** Record consumption. Throws `BudgetBreachError` if any dimension exceeds its cap. */
1755
+ charge(delta: Partial<Record<keyof BudgetSpec, number>>, spanId?: string): Promise<void>;
1756
+ /** Convenience: advance wall-clock budget based on elapsed wall time. */
1757
+ tickWall(nowMs: number, spanId?: string): Promise<void>;
1758
+ get state(): Record<keyof BudgetSpec, number>;
1759
+ }
1760
+
1761
+ /**
1762
+ * Failure taxonomy — canonical classes + a default classifier.
1763
+ *
1764
+ * Every failed run should end up in a named class. The classifier here
1765
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
1766
+ * the consumer for novel cases and trained into the rule base over time.
1767
+ *
1768
+ * Consumers call `classifyFailure({ run, spans, events })` and persist the
1769
+ * returned class as `Run.outcome.failureClass`.
1770
+ */
1771
+
1772
+ interface FailureContext {
1773
+ run: Run;
1774
+ spans: Span[];
1775
+ events: TraceEvent[];
1776
+ }
1777
+ interface FailureClassification {
1778
+ failureClass: FailureClass;
1779
+ reason: string;
1780
+ triggerSpanId?: string;
1781
+ triggerEventId?: string;
1782
+ }
1783
+ /** Ordered rules — first match wins. */
1784
+ interface FailureRule {
1785
+ id: string;
1786
+ match: (ctx: FailureContext) => {
1787
+ failureClass: FailureClass;
1788
+ reason: string;
1789
+ triggerSpanId?: string;
1790
+ triggerEventId?: string;
1791
+ } | null;
1792
+ }
1793
+ declare const DEFAULT_RULES: FailureRule[];
1794
+ /** Classify the failure mode of a run using an ordered rule list. */
1795
+ declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
1796
+
1797
+ /**
1798
+ * Trajectory — ordered, structured view over a run's spans.
1799
+ *
1800
+ * A pure function `buildTrajectory(store, runId) → Trajectory` returns
1801
+ * a topologically ordered list of `TrajectoryStep` with parent-child
1802
+ * grouping collapsed into a single line-of-agent-work. Separate
1803
+ * analyzers (stuck-loop detection, waste ratio) live in
1804
+ * `pipelines/` and consume the trajectory.
1805
+ */
1806
+
1807
+ interface TrajectoryStep {
1808
+ index: number;
1809
+ span: Span;
1810
+ /** Depth in the span tree from the root. 0 = top-level. */
1811
+ depth: number;
1812
+ /** Events attached to this span. */
1813
+ events: TraceEvent[];
1814
+ }
1815
+ interface Trajectory {
1816
+ runId: string;
1817
+ steps: TrajectoryStep[];
1818
+ llmTurns: number;
1819
+ toolCalls: number;
1820
+ judgeVerdicts: number;
1821
+ retrievals: number;
1822
+ totalDurationMs: number;
1823
+ }
1824
+ declare function buildTrajectory(store: TraceStore, runId: string): Promise<Trajectory>;
1825
+
1826
+ /**
1827
+ * Tool-use metrics — derived purely from trace data.
1828
+ *
1829
+ * No scoring assumptions: consumers supply optional ground-truth tool
1830
+ * selections per turn + optional "information used downstream" signals.
1831
+ * Without those, we still compute descriptive metrics (error rate,
1832
+ * retry rate, duplicate-call rate) that are useful on their own.
1833
+ */
1834
+
1835
+ interface ToolUseMetrics {
1836
+ runId: string;
1837
+ totalCalls: number;
1838
+ byTool: Record<string, ToolStats>;
1839
+ errorRate: number;
1840
+ /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
1841
+ duplicateRate: number;
1842
+ /** Ratio of error calls followed by ≥1 retry on the same tool. */
1843
+ retryRate: number;
1844
+ /** Optional: of the calls the agent made, the fraction the evaluator marked as a "correct selection". */
1845
+ selectionAccuracy?: number;
1846
+ }
1847
+ interface ToolStats {
1848
+ calls: number;
1849
+ errors: number;
1850
+ avgLatencyMs: number;
1851
+ duplicates: number;
1852
+ }
1853
+ interface ToolUseOptions {
1854
+ /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
1855
+ selectionLabels?: Record<string, boolean>;
1856
+ }
1857
+ declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
1858
+
1859
+ /**
1860
+ * StuckLoopView — detects when an agent calls the same tool with the
1861
+ * same (or structurally similar) arguments ≥ N times in a short window.
1862
+ *
1863
+ * Rationale: agents that loop are the number-one production failure
1864
+ * mode on long-horizon flows. The view returns (runId, toolName,
1865
+ * argHash, occurrences, windowMs) for each detected loop plus a
1866
+ * fraction of runs affected.
1867
+ */
1868
+
1869
+ interface StuckLoopFinding {
1870
+ runId: string;
1871
+ toolName: string;
1872
+ argHash: string;
1873
+ occurrences: number;
1874
+ spanIds: string[];
1875
+ /** Milliseconds between first and last call in the loop. */
1876
+ windowMs: number;
1877
+ }
1878
+ interface StuckLoopReport {
1879
+ findings: StuckLoopFinding[];
1880
+ affectedRunRatio: number;
1881
+ totalRuns: number;
1882
+ }
1883
+ interface StuckLoopOptions {
1884
+ /** Minimum call count to flag a loop (default 3). */
1885
+ minOccurrences?: number;
1886
+ /** Filter to a specific runId; omit to scan the entire corpus. */
1887
+ runId?: string;
1888
+ }
1889
+ declare function stuckLoopView(store: TraceStore, options?: StuckLoopOptions): Promise<StuckLoopReport>;
1890
+
1891
+ /**
1892
+ * ToolWasteView — fraction of tool calls whose results weren't used
1893
+ * downstream. Without a "used" signal we fall back to structural
1894
+ * proxies: error calls, duplicate calls, and tool calls followed by
1895
+ * zero subsequent LLM spans are all considered waste.
1896
+ *
1897
+ * Consumers can pass a `usageOracle` that inspects a tool span and
1898
+ * returns true iff the tool's result appears in a later LLM message,
1899
+ * artifact, or state mutation — that's the canonical definition; the
1900
+ * default heuristic is a reasonable fallback.
1901
+ */
1902
+
1903
+ interface ToolWasteFinding {
1904
+ runId: string;
1905
+ wastedCalls: number;
1906
+ totalCalls: number;
1907
+ wasteRate: number;
1908
+ }
1909
+ interface ToolWasteReport {
1910
+ byRun: ToolWasteFinding[];
1911
+ overallWasteRate: number;
1912
+ }
1913
+ interface ToolWasteOptions {
1914
+ runId?: string;
1915
+ usageOracle?: (tool: ToolSpan, later: {
1916
+ llm: Awaited<ReturnType<typeof llmSpans>>;
1917
+ }) => boolean;
1918
+ }
1919
+ declare function toolWasteView(store: TraceStore, options?: ToolWasteOptions): Promise<ToolWasteReport>;
1920
+
1921
+ /**
1922
+ * BudgetBreachView — aggregates breach events across the corpus.
1923
+ *
1924
+ * Answers: which dimensions get hit most often? Which scenarios are
1925
+ * underbudgeted? Which variants trigger the most breaches?
1926
+ */
1927
+
1928
+ interface BudgetBreachFinding {
1929
+ runId: string;
1930
+ scenarioId: string;
1931
+ variantId?: string;
1932
+ dimension: keyof BudgetSpec;
1933
+ limit: number;
1934
+ consumed: number;
1935
+ excessRatio: number;
1936
+ timestamp: number;
1937
+ }
1938
+ interface BudgetBreachReport {
1939
+ findings: BudgetBreachFinding[];
1940
+ byDimension: Record<string, number>;
1941
+ byScenario: Record<string, number>;
1942
+ byVariant: Record<string, number>;
1943
+ totalRuns: number;
1944
+ breachedRunRatio: number;
1945
+ }
1946
+ declare function budgetBreachView(store: TraceStore, options?: {
1947
+ scenarioId?: string;
1948
+ variantId?: string;
1949
+ }): Promise<BudgetBreachReport>;
1950
+
1951
+ /**
1952
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
1953
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
1954
+ *
1955
+ * Each cluster includes: N runs, scenarios affected, representative
1956
+ * error message, a proposed mitigation hint (rule → action table).
1957
+ */
1958
+
1959
+ interface FailureCluster {
1960
+ failureClass: FailureClass;
1961
+ /** Tool name when the trigger was a tool span, else undefined. */
1962
+ toolName?: string;
1963
+ /** First 16 chars of argHash — clusters similar args. */
1964
+ argPrefix?: string;
1965
+ runCount: number;
1966
+ scenarioIds: string[];
1967
+ exampleError?: string;
1968
+ exampleRunId: string;
1969
+ }
1970
+ interface FailureClusterReport {
1971
+ clusters: FailureCluster[];
1972
+ totalFailures: number;
1973
+ totalRuns: number;
1974
+ }
1975
+ declare function failureClusterView(store: TraceStore, options?: {
1976
+ rules?: FailureRule[];
1977
+ minClusterSize?: number;
1978
+ }): Promise<FailureClusterReport>;
1979
+
1980
+ /**
1981
+ * JudgeAgreementView — pairwise agreement between judges across the
1982
+ * corpus, grouped by dimension.
1983
+ *
1984
+ * Output drives two workflows:
1985
+ * - Judge robustness audit: "does Claude agree with GPT at κ ≥ 0.6?"
1986
+ * - Calibration tracking: κ vs golden human labels over time (by
1987
+ * providing a `humanGoldenJudgeId`).
1988
+ */
1989
+
1990
+ interface JudgePair {
1991
+ judgeA: string;
1992
+ judgeB: string;
1993
+ dimension: string;
1994
+ /** Number of (targetSpanId, dimension) tuples both judges scored. */
1995
+ commonItems: number;
1996
+ pearson: number;
1997
+ krippendorff: number;
1998
+ }
1999
+ interface JudgeAgreementReport {
2000
+ pairs: JudgePair[];
2001
+ dimensions: string[];
2002
+ judgeIds: string[];
2003
+ }
2004
+ declare function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport>;
2005
+
2006
+ /**
2007
+ * FirstDivergenceView — aligns two trajectories by step index, reports
2008
+ * the first step where they differ.
2009
+ *
2010
+ * "Differ" is configurable — default is (kind, toolName if tool, model
2011
+ * if llm). Use this view to attribute "why is variant B better?" to a
2012
+ * specific step rather than an aggregate mean delta.
2013
+ */
2014
+
2015
+ interface DivergenceReport {
2016
+ runA: string;
2017
+ runB: string;
2018
+ firstDivergenceIndex: number | null;
2019
+ aStep?: TrajectoryStep;
2020
+ bStep?: TrajectoryStep;
2021
+ reason?: string;
2022
+ /** Common prefix length (steps that matched). */
2023
+ commonPrefixLen: number;
2024
+ }
2025
+ interface DivergenceOptions {
2026
+ /** Returns true if two steps are considered equal. Default: kind + tool/model match. */
2027
+ stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
2028
+ }
2029
+ declare function firstDivergenceView(store: TraceStore, runA: string, runB: string, options?: DivergenceOptions): Promise<DivergenceReport>;
2030
+
2031
+ /**
2032
+ * Baseline regression detection.
2033
+ *
2034
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
2035
+ * to: "is this run measurably worse than baseline?" — with enough
2036
+ * statistical rigor to distinguish noise from drift.
2037
+ *
2038
+ * Uses:
2039
+ * - Welch's t-test (unequal variance) for per-metric mean comparison
2040
+ * - Cohen's d for effect size magnitude
2041
+ * - IQR for stability flag (unstable samples can't be trusted for comparisons)
2042
+ *
2043
+ * Returns a structured verdict: improved | regressed | stable | unstable.
2044
+ */
2045
+ interface MetricSamples {
2046
+ /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
2047
+ metric: string;
2048
+ /** Whether higher values are better. */
2049
+ higherIsBetter: boolean;
2050
+ baseline: number[];
2051
+ candidate: number[];
2052
+ }
2053
+ interface MetricVerdict {
2054
+ metric: string;
2055
+ baselineMean: number;
2056
+ candidateMean: number;
2057
+ delta: number;
2058
+ cohensD: number;
2059
+ welchT: number;
2060
+ welchDf: number;
2061
+ welchP: number;
2062
+ stable: boolean;
2063
+ /** IQR of the combined samples — used as a rough stability indicator. */
2064
+ iqr: number;
2065
+ verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
2066
+ }
2067
+ interface BaselineReport {
2068
+ metrics: MetricVerdict[];
2069
+ /** True if any critical metric regressed. */
2070
+ hasRegression: boolean;
2071
+ /** True if any metric is unstable (too noisy to judge). */
2072
+ hasUnstable: boolean;
2073
+ }
2074
+ interface BaselineOptions {
2075
+ /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
2076
+ effectThreshold?: number;
2077
+ /** p-value threshold for statistical significance (default 0.05). */
2078
+ alpha?: number;
2079
+ /** IQR/|mean| ratio above which samples are flagged unstable (default 0.30). */
2080
+ unstableCvThreshold?: number;
2081
+ }
2082
+ /**
2083
+ * Compare candidate samples against baseline per metric. Verdict logic:
2084
+ * - unstable: IQR/|mean| > threshold on either set — not enough signal
2085
+ * - improved: meaningful effect in the "better" direction AND p < alpha
2086
+ * - regressed: meaningful effect in the "worse" direction AND p < alpha
2087
+ * - stable: otherwise (no significant change)
2088
+ */
2089
+ declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
2090
+ /** Inter-quartile range; 0 when the sample has no spread. */
2091
+ declare function iqr(xs: number[]): number;
2092
+ /**
2093
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
2094
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
2095
+ * when df is large.
2096
+ */
2097
+ declare function welchsTTest(a: number[], b: number[]): {
2098
+ t: number;
2099
+ df: number;
2100
+ p: number;
2101
+ };
2102
+
2103
+ /**
2104
+ * RegressionView — compares a candidate slice to a baseline slice on a
2105
+ * named metric. Delegates the statistics (Welch's t-test, Cohen's d,
2106
+ * IQR stability) to `baseline.ts`.
2107
+ *
2108
+ * This is the entry point for CI regression gates: "given runs tagged
2109
+ * release=A and release=B, did any metric regress?"
2110
+ */
2111
+
2112
+ interface RegressionSpec {
2113
+ metric: string;
2114
+ higherIsBetter: boolean;
2115
+ /** Extract a scalar from a run. Default extractors handle common metrics. */
2116
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
2117
+ }
2118
+ interface RegressionOptions extends BaselineOptions {
2119
+ baseline: RunFilter;
2120
+ candidate: RunFilter;
2121
+ }
2122
+ declare function regressionView(store: TraceStore, metrics: RegressionSpec[], options: RegressionOptions): Promise<BaselineReport>;
2123
+
2124
+ /**
2125
+ * SLO gates — quantified pass/fail primitives beyond score thresholds.
2126
+ *
2127
+ * Lifted from ADC's sandbox eval suite. Each SLO defines a metric, a
2128
+ * threshold, and a severity (critical | warning). Critical breaches fail
2129
+ * the eval; warnings are reported but don't gate CI. Margin is the
2130
+ * ratio of actual to threshold for histogramming "how close are we?"
2131
+ *
2132
+ * Consumers assemble their own SLO arrays; DEFAULT_AGENT_SLOS covers
2133
+ * the generic agent flow (provision, first token, pass rate, cost).
2134
+ */
2135
+ type SloSeverity = 'critical' | 'warning';
2136
+ type SloComparator = 'lte' | 'gte';
2137
+ interface Slo {
2138
+ /** Stable identifier — must be unique within an SLO set. */
2139
+ id: string;
2140
+ /** Human description, shown in reports. */
2141
+ description: string;
2142
+ /** Metric key looked up in the candidate record. */
2143
+ metric: string;
2144
+ /** Whether the metric should stay below (lte) or above (gte) threshold. */
2145
+ comparator: SloComparator;
2146
+ /** Threshold value. */
2147
+ threshold: number;
2148
+ severity: SloSeverity;
2149
+ }
2150
+ interface SloCheckResult {
2151
+ slo: Slo;
2152
+ actual: number | undefined;
2153
+ passed: boolean;
2154
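+ /** Headroom ratio: >1 means safe margin; <1 means breach; 0 when actual is missing. Documented as actual/threshold for lte, threshold/actual for gte — NOTE(review): those ratios exceed 1 on a breach, so the formula is probably threshold/actual for lte and actual/threshold for gte; confirm against the implementation. */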
2155
+ margin: number;
2156
+ detail: string;
2157
+ }
2158
+ interface SloReport {
2159
+ results: SloCheckResult[];
2160
+ passedCritical: boolean;
2161
+ criticalBreaches: SloCheckResult[];
2162
+ warnings: SloCheckResult[];
2163
+ }
2164
+ /**
2165
+ * Evaluate an SLO set against a candidate metrics object. Missing metrics
2166
+ * count as breaches — if you declared it, you must measure it.
2167
+ */
2168
+ declare function checkSlos(metrics: Record<string, number>, slos: Slo[]): SloReport;
2169
+ /** Reference SLO set for agent-style evals. Tune per-product by cloning + overriding. */
2170
+ declare const DEFAULT_AGENT_SLOS: Slo[];
2171
+
2172
+ /**
2173
+ * Declarative oracles — ground-truth assertions without an LLM.
2174
+ *
2175
+ * Lifted from browser-agent-driver's _oracle.mjs. When you know the
2176
+ * expected outcome exactly (a URL, a text fragment, a JSON shape), you
2177
+ * don't need an LLM judge — you need a regex. These oracles are
2178
+ * composable pass/fail checks over an observation bundle.
2179
+ *
2180
+ * Each oracle returns { pass, detail, evidence? } and has a short
2181
+ * `id` for reporting. `evaluateOracles` runs a batch and aggregates.
2182
+ */
2183
+ interface OracleObservation {
2184
+ /** Final observable text output from the agent (response, page snapshot, stdout). */
2185
+ text?: string;
2186
+ /** Final URL — for browser-style scenarios. */
2187
+ url?: string;
2188
+ /** Any structured JSON the agent produced. */
2189
+ json?: unknown;
2190
+ /** Free-form context used by custom oracles. */
2191
+ context?: Record<string, unknown>;
2192
+ }
2193
+ interface OracleResult {
2194
+ id: string;
2195
+ pass: boolean;
2196
+ detail: string;
2197
+ evidence?: string;
2198
+ }
2199
+ interface Oracle {
2200
+ id: string;
2201
+ check(obs: OracleObservation): OracleResult;
2202
+ }
2203
+ declare function textInSnapshot(needle: string, opts?: {
2204
+ caseSensitive?: boolean;
2205
+ }): Oracle;
2206
+ declare function urlContains(fragment: string): Oracle;
2207
+ declare function jsonShape(expected: Record<string, unknown>): Oracle;
2208
+ declare function regexMatches(pattern: RegExp): Oracle;
2209
+ /**
2210
+ * Anti-bot detector — distinguishes genuine failures from blocked navigation
2211
+ * (cloudflare, recaptcha, etc). Returns an Oracle that PASSES when no block
2212
+ * marker is present; on block, detail names the blocker so runners can tag
2213
+ * results as "blocked" rather than "failed". Lifted from browser-agent-driver.
2214
+ */
2215
+ declare function notBlocked(): Oracle;
2216
+ interface OracleReport {
2217
+ results: OracleResult[];
2218
+ pass: boolean;
2219
+ passCount: number;
2220
+ failCount: number;
2221
+ /** 0-1 ratio of oracles passed. */
2222
+ score: number;
2223
+ }
2224
+ /** Run all oracles against one observation and aggregate. */
2225
+ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): OracleReport;
2226
+
2227
+ /**
2228
+ * Cost tracker — token + USD accounting per scenario and per run.
2229
+ *
2230
+ * Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
2231
+ * optimizer needs to know "is the quality gain worth the cost delta?",
2232
+ * and every dashboard needs dollars-per-completed-task. MODEL_PRICING
2233
+ * from metrics.ts stays authoritative for estimate math; this module
2234
+ * adds the aggregation + per-scenario roll-up that was duplicated
2235
+ * across 4 verticals.
2236
+ */
2237
+ interface TokenSpec {
2238
+ inputTokens: number;
2239
+ outputTokens: number;
2240
+ cachedTokens?: number;
2241
+ reasoningTokens?: number;
2242
+ }
2243
+ interface CostEntry extends TokenSpec {
2244
+ scenarioId: string;
2245
+ model: string;
2246
+ /** Override estimate with an observed cost (e.g. from provider response). */
2247
+ actualCostUsd?: number;
2248
+ timestamp: number;
2249
+ /** Free-form tags (variant id, round #, etc.). */
2250
+ tags?: Record<string, string>;
2251
+ }
2252
+ interface ScenarioCost {
2253
+ scenarioId: string;
2254
+ entries: CostEntry[];
2255
+ totalInputTokens: number;
2256
+ totalOutputTokens: number;
2257
+ totalCachedTokens: number;
2258
+ totalCostUsd: number;
2259
+ /** Pass flag — set by consumer via markOutcome; used for cost-per-completed-task. */
2260
+ completed?: boolean;
2261
+ }
2262
+ declare class CostTracker {
2263
+ private byScenario;
2264
+ record(entry: Omit<CostEntry, 'timestamp'> & {
2265
+ timestamp?: number;
2266
+ }): CostEntry;
2267
+ markOutcome(scenarioId: string, completed: boolean): void;
2268
+ get(scenarioId: string): ScenarioCost | undefined;
2269
+ list(): ScenarioCost[];
2270
+ summary(): CostSummary;
2271
+ }
2272
+ interface CostSummary {
2273
+ scenarioCount: number;
2274
+ completedCount: number;
2275
+ totalInputTokens: number;
2276
+ totalOutputTokens: number;
2277
+ totalCostUsd: number;
2278
+ avgCostPerScenarioUsd: number;
2279
+ /** Total USD / completed scenarios — null when nothing completed. */
2280
+ costPerCompletedTaskUsd: number | null;
2281
+ }
2282
+
2283
+ /**
2284
+ * Pareto frontier — multi-objective optimization over candidate runs.
2285
+ *
2286
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
2287
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
2288
+ * ttfb), you rarely have a single "winner" — you have a set of
2289
+ * non-dominated candidates. This module exposes:
2290
+ *
2291
+ * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
2292
+ * - `dominates`: does A dominate B across all objectives?
2293
+ *
2294
+ * Each objective is declared with a direction: 'maximize' (higher=better)
2295
+ * or 'minimize' (lower=better). Candidates are any object; pass an
2296
+ * `objective(candidate)` accessor.
2297
+ */
2298
+ type Direction = 'maximize' | 'minimize';
2299
+ interface Objective<T> {
2300
+ /** Stable label used in reports. */
2301
+ name: string;
2302
+ direction: Direction;
2303
+ value: (candidate: T) => number;
2304
+ }
2305
+ interface ParetoResult<T> {
2306
+ frontier: T[];
2307
+ dominated: T[];
2308
+ /** Index map: entry i pairs `dominator` (frontier[i]) with the candidates it dominates (`dominated`). */
2309
+ dominanceMap: Array<{
2310
+ dominator: T;
2311
+ dominated: T[];
2312
+ }>;
2313
+ }
2314
+ /** Does candidate A Pareto-dominate B — no worse on every objective and strictly better on at least one? */
2315
+ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
2316
+ /**
2317
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
2318
+ * objective are excluded (can't rank them). A candidate enters the frontier
2319
+ * iff no other candidate dominates it.
2320
+ */
2321
+ declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
2322
+
2323
+ /**
2324
+ * Series convergence — detects whether a sequence of scalar measurements
2325
+ * is stabilizing, drifting, or noisy.
2326
+ *
2327
+ * Lifted from ADC convergence.ts. The per-turn `ConvergenceTracker` is
2328
+ * about progress *within* a single run; this module is about drift
2329
+ * *across* runs (e.g. "are my nightly eval scores stabilizing?").
2330
+ *
2331
+ * Three signals:
2332
+ * - stabilized: last K values have low variance (< epsilon) — done
2333
+ * - drifting: recent trend is monotonic and beyond noise — regressing or improving
2334
+ * - noisy: neither — keep iterating, but flag as untrustworthy for gating
2335
+ */
2336
+ interface SeriesConvergenceOptions {
2337
+ /** Window size for "recent" analysis (default 5). */
2338
+ window?: number;
2339
+ /** Coefficient-of-variation threshold below which the window is stabilized (default 0.05 = 5%). */
2340
+ stableCv?: number;
2341
+ /** Minimum monotone run length to call drift (default 3). */
2342
+ driftRun?: number;
2343
+ }
2344
+ interface SeriesConvergenceResult {
2345
+ state: 'stabilized' | 'drifting-up' | 'drifting-down' | 'noisy' | 'insufficient-data';
2346
+ windowMean: number;
2347
+ windowCv: number;
2348
+ /** Longest monotonic run at the tail of the series (positive for up, negative for down). */
2349
+ tailRun: number;
2350
+ /** True when n ≥ window AND windowCv ≤ stableCv. */
2351
+ stable: boolean;
2352
+ }
2353
+ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOptions): SeriesConvergenceResult;
2354
+
2355
+ /**
2356
+ * State continuity scoring — measures how well a resumed/handed-off agent
2357
+ * preserves prior work.
2358
+ *
2359
+ * Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
2360
+ * session 1's work, the key question is: did it preserve key artifacts,
2361
+ * or start over and lose context? Each `ContinuityCheck` inspects one
2362
+ * aspect (file preserved, key count grew, status advanced) and yields
2363
+ * 0-1 credit; the aggregate is the simple mean.
2364
+ *
2365
+ * Generic over any "snapshot" shape — pass your own checks.
2366
+ */
2367
+ interface ContinuitySnapshotPair<T> {
2368
+ before: T;
2369
+ after: T;
2370
+ }
2371
+ interface ContinuityCheck<T> {
2372
+ /** Stable identifier; shown in the report. */
2373
+ id: string;
2374
+ /** Description of what this check measures. */
2375
+ description: string;
2376
+ /** Returns 0..1 credit for this dimension (1 = fully preserved/improved). */
2377
+ score: (pair: ContinuitySnapshotPair<T>) => number;
2378
+ }
2379
+ interface ContinuityCheckResult {
2380
+ id: string;
2381
+ description: string;
2382
+ score: number;
2383
+ pass: boolean;
2384
+ }
2385
+ interface ContinuityReport {
2386
+ results: ContinuityCheckResult[];
2387
+ /** Mean of per-check scores, in 0..1. */
2388
+ overallScore: number;
2389
+ /** True iff ALL checks scored ≥ passThreshold. */
2390
+ pass: boolean;
2391
+ }
2392
+ declare function scoreContinuity<T>(pair: ContinuitySnapshotPair<T>, checks: ContinuityCheck<T>[], options?: {
2393
+ passThreshold?: number;
2394
+ }): ContinuityReport;
2395
+ /** Common check: a required key in a record exists and equals the prior value. */
2396
+ declare function keyPreserved<T extends Record<string, unknown>>(key: keyof T & string): ContinuityCheck<T>;
2397
+ /** Common check: a collection (array) grew or stayed the same size. */
2398
+ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minRatio?: number): ContinuityCheck<T>;
2399
+ /** Common check: a status field advanced in an expected order. */
2400
+ declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
2401
+
2402
+ /**
2403
+ * Dataset — versioned, sliceable, content-hashed scenario collection.
2404
+ *
2405
+ * Scenarios stop being ephemeral arrays and become first-class
2406
+ * artifacts. Every Dataset carries:
2407
+ * - content hash (sha256 over canonicalized scenario array)
2408
+ * - provenance (contributor, createdAt, sourceUrl)
2409
+ * - split labels (train | dev | test | holdout)
2410
+ * - difficulty tiers (easy | medium | hard | extreme)
2411
+ * - tags (free-form, per-scenario)
2412
+ *
2413
+ * `Dataset.slice({ difficulty, split, includeHoldout, seed })` returns a
2414
+ * deterministic, reproducible subset. Holdout slices are locked: you
2415
+ * can read them but `mutate` throws, which prevents "oh I'll just
2416
+ * tweak that one scenario" contamination drift.
2417
+ */
2418
+ type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
2419
+ type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
2420
+ interface DatasetScenario {
2421
+ id: string;
2422
+ /** Arbitrary payload; the framework doesn't interpret it. */
2423
+ payload: unknown;
2424
+ split?: DatasetSplit;
2425
+ difficulty?: DatasetDifficulty;
2426
+ /** Canary token that MUST NOT round-trip through a correct agent output. */
2427
+ canary?: string;
2428
+ tags?: Record<string, string>;
2429
+ }
2430
+ interface DatasetProvenance {
2431
+ contributor?: string;
2432
+ createdAt: string;
2433
+ sourceUrl?: string;
2434
+ license?: string;
2435
+ description?: string;
2436
+ /** Monotonic human-readable version (e.g. "2026.04.20"). */
2437
+ version: string;
2438
+ }
2439
+ interface DatasetManifest {
2440
+ name: string;
2441
+ provenance: DatasetProvenance;
2442
+ /** sha256 hex over canonicalized scenarios. */
2443
+ contentHash: string;
2444
+ scenarioCount: number;
2445
+ splitCounts: Record<DatasetSplit, number>;
2446
+ }
2447
+ interface SliceOptions {
2448
+ split?: DatasetSplit;
2449
+ difficulty?: DatasetDifficulty;
2450
+ /** Number of scenarios (random sample, seeded). Omit to take all that match. */
2451
+ limit?: number;
2452
+ seed?: number;
2453
+ /** Predicate narrowing. Applied after split/difficulty filters. */
2454
+ filter?: (scenario: DatasetScenario) => boolean;
2455
+ /** If true, include scenarios marked as holdout. Default false. */
2456
+ includeHoldout?: boolean;
2457
+ }
2458
+ /** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
2459
+ declare class HoldoutLockedError extends Error {
2460
+ constructor(datasetName: string);
2461
+ }
2462
+ declare class Dataset {
2463
+ readonly name: string;
2464
+ readonly provenance: DatasetProvenance;
2465
+ private scenarios;
2466
+ private locked;
2467
+ constructor(init: {
2468
+ name: string;
2469
+ provenance: DatasetProvenance;
2470
+ scenarios: DatasetScenario[];
2471
+ locked?: boolean;
2472
+ });
2473
+ /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
2474
+ all(): readonly DatasetScenario[];
2475
+ get size(): number;
2476
+ /**
2477
+ * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
2478
+ * the same arguments always produce the same slice across machines.
2479
+ */
2480
+ slice(options?: SliceOptions): DatasetScenario[];
2481
+ /**
2482
+ * Assemble the manifest (name + provenance + content hash + counts).
2483
+ * Content hash is deterministic over canonicalized scenarios.
2484
+ */
2485
+ manifest(): Promise<DatasetManifest>;
2486
+ /** Fresh unlocked copy — for post-release forks when mutation is needed. */
2487
+ clone(overrides?: Partial<{
2488
+ name: string;
2489
+ version: string;
2490
+ }>): Dataset;
2491
+ lock(): void;
2492
+ add(scenario: DatasetScenario): void;
2493
+ remove(scenarioId: string): void;
2494
+ /**
2495
+ * Stable JSON-Lines serialization — deterministic byte-for-byte.
2496
+ * Write to disk for contamination-verifiable archives.
2497
+ */
2498
+ toJsonl(): string;
2499
+ static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
2500
+ }
2501
+ declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
2502
+
2503
+ /**
2504
+ * ContaminationGuard — ensures held-out scenarios don't leak into
2505
+ * training/prompt paths, and flags model memorization.
2506
+ *
2507
+ * Three probes:
2508
+ * 1. `checkCanaries(output, scenarios)` — if a scenario carries a
2509
+ * canary token, it MUST NOT appear in the agent's output.
2510
+ * Canaries are strings that are statistically impossible to
2511
+ * reconstruct from the scenario description alone — so if they
2512
+ * echo back, the model memorized them.
2513
+ * 2. `canaryLeakView(store)` — cross-corpus view of every run whose
2514
+ * output contained a canary, with the offending scenario + run.
2515
+ * 3. `HoldoutAuditor` — wraps holdout scenarios and throws a structured error
2516
+ * on any code path that reads holdout scenarios but doesn't flag
2517
+ * `purpose: 'evaluation'`. Keeps engineers honest.
2518
+ */
2519
+
2520
+ interface CanaryLeak {
2521
+ scenarioId: string;
2522
+ canary: string;
2523
+ runId?: string;
2524
+ evidence: string;
2525
+ }
2526
+ declare function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[];
2527
+ /**
2528
+ * Scan the LLM-output history in a corpus; returns every case where a
2529
+ * canary from a known scenario appeared in agent output. Pass the full
2530
+ * set of scenarios whose canaries you care about (typically the whole
2531
+ * held-out slice).
2532
+ */
2533
+ declare function canaryLeakView(store: TraceStore, scenarios: DatasetScenario[]): Promise<CanaryLeak[]>;
2534
+ declare class HoldoutAuditor {
2535
+ private scenarios;
2536
+ private accessLog;
2537
+ constructor(scenarios: DatasetScenario[]);
2538
+ /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */
2539
+ get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario;
2540
+ getAccessLog(): ReadonlyArray<{
2541
+ scenarioId: string;
2542
+ purpose: string;
2543
+ at: number;
2544
+ }>;
2545
+ }
2546
+
2547
+ /**
2548
+ * Red-team battery — adversarial scenario corpus with per-category
2549
+ * scorers.
2550
+ *
2551
+ * Every scenario is a `DatasetScenario` whose `payload` is a `RedTeamPayload`
2552
+ * that declares: category, expected refusal, attack surface. Scorers
2553
+ * inspect the agent's output + trajectory and return a PASS/FAIL per
2554
+ * category. Aggregated via `redTeamReport`.
2555
+ *
2556
+ * The built-in corpus is intentionally conservative — 40 cases
2557
+ * covering the most-seen attack patterns. Extend via `extendCorpus`.
2558
+ */
2559
+
2560
+ type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
2561
+ interface RedTeamPayload {
2562
+ category: RedTeamCategory;
2563
+ /** The hostile input the agent sees. */
2564
+ input: string;
2565
+ /**
2566
+ * What a safe agent should do: "refuse" means explicit refusal,
2567
+ * "ignore" means process benignly without executing the attack.
2568
+ */
2569
+ expected: 'refuse' | 'ignore';
2570
+ /** Substring(s) that MUST NOT appear in output. */
2571
+ forbiddenStrings?: string[];
2572
+ /** Tool names the agent MUST NOT call for this scenario. */
2573
+ forbiddenTools?: string[];
2574
+ }
2575
+ interface RedTeamCase extends DatasetScenario {
2576
+ payload: RedTeamPayload;
2577
+ }
2578
+ interface RedTeamFinding {
2579
+ scenarioId: string;
2580
+ category: RedTeamCategory;
2581
+ passed: boolean;
2582
+ reason: string;
2583
+ evidence?: string;
2584
+ }
2585
+ interface RedTeamReport {
2586
+ findings: RedTeamFinding[];
2587
+ passRateByCategory: Record<RedTeamCategory, number>;
2588
+ overallPassRate: number;
2589
+ }
2590
+ /** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
2591
+ declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
2592
+ declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
2593
+ /**
2594
+ * Score a single agent response against the red-team case.
2595
+ * `toolCalls` should be the names of tools the agent invoked during the run.
2596
+ */
2597
+ declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
2598
+ /** Aggregate red-team findings into per-category pass rates. */
2599
+ declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
2600
+ /**
2601
+ * Extract the tool-call names from a corpus run — convenience for the
2602
+ * common pipeline (run the scenario → score the run).
2603
+ */
2604
+ declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
2605
+
2606
+ /**
2607
+ * Power analysis + multiple-comparison correction.
2608
+ *
2609
+ * Two jobs:
2610
+ * 1. Before running: `requiredSampleSize({ effect, alpha, power })`
2611
+ * returns the N per arm needed to detect a given effect size.
2612
+ * 2. After running: `benjaminiHochberg(pValues, fdr)` and
2613
+ * `bonferroni(pValues, alpha)` correct for multiple pairwise tests
2614
+ * so PromptOptimizer's "significant" flag is statistically honest.
2615
+ *
2616
+ * Fixes the correctness bug in 0.2's PromptOptimizer which applied
2617
+ * alpha directly across n*(n-1)/2 pairwise tests without correction —
2618
+ * dramatically inflating false-positive rate when variants ≥ 3.
2619
+ */
2620
+ /**
2621
+ * Required N per arm for a two-sample comparison at target effect size,
2622
+ * alpha, and power. Uses the normal-approximation formula:
2623
+ *
2624
+ * n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
2625
+ *
2626
+ * where d is Cohen's d. Returns Infinity for effect ≤ 0.
2627
+ */
2628
+ declare function requiredSampleSize(opts: {
2629
+ effect: number;
2630
+ alpha?: number;
2631
+ power?: number;
2632
+ twoSided?: boolean;
2633
+ }): number;
2634
+ /** Bonferroni adjustment: multiply every p-value by the number of tests, clamp at 1. */
2635
+ declare function bonferroni(pValues: number[], alpha?: number): {
2636
+ adjusted: number[];
2637
+ significant: boolean[];
2638
+ };
2639
+ /**
2640
+ * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
2641
+ * significance at the target FDR. Properly handles ties and preserves
2642
+ * monotonicity of q-values.
2643
+ */
2644
+ declare function benjaminiHochberg(pValues: number[], fdr?: number): {
2645
+ qValues: number[];
2646
+ significant: boolean[];
2647
+ };
2648
+
2649
+ /**
2650
+ * Behavior DSL — pytest-style assertions over a run's trajectory.
2651
+ *
2652
+ * Shape:
2653
+ * expectAgent(store, runId).toCall('search').withArgs({ q: /.+/ })
2653
+ * expectAgent(store, runId).toRefuse()
2654
+ * expectAgent(store, runId).toOutputMatch(/confirmed/i)
2655
+ * expectAgent(store, runId).toRespectBudget('tokens')
2656
+ * expectAgent(store, runId).toCompleteWithin({ wallMs: 30_000 })
2658
+ *
2659
+ * Each matcher returns an `Expectation` with `.check() → Promise<MatcherResult>`
2660
+ * so the DSL is composable with suite runners — you can collect all
2661
+ * expectations into a report instead of throwing on first failure.
2662
+ */
2663
+
2664
+ interface MatcherResult {
2665
+ ok: boolean;
2666
+ detail: string;
2667
+ evidence?: string;
2668
+ }
2669
+ interface Expectation {
2670
+ /** Human-facing label; used in reports. */
2671
+ label: string;
2672
+ check(): Promise<MatcherResult>;
2673
+ }
2674
+ declare class BehaviorAssertion {
2675
+ private store;
2676
+ private runId;
2677
+ constructor(store: TraceStore, runId: string);
2678
+ toCall(toolName: string): CallExpectation;
2679
+ toRefuse(markers?: RegExp[]): Expectation;
2680
+ toOutputMatch(pattern: RegExp): Expectation;
2681
+ toRespectBudget(dimension: keyof BudgetLedgerEntry['dimension'] | 'tokens' | 'wallMs' | 'calls' | 'usd'): Expectation;
2682
+ toCompleteWithin(limits: {
2683
+ wallMs?: number;
2684
+ toolCalls?: number;
2685
+ llmTurns?: number;
2686
+ }): Expectation;
2687
+ toNeverCall(toolName: string): Expectation;
2688
+ }
2689
+ declare class CallExpectation implements Expectation {
2690
+ private store;
2691
+ private runId;
2692
+ private toolName;
2693
+ private argMatchers;
2694
+ private minCount;
2695
+ private maxCount;
2696
+ constructor(store: TraceStore, runId: string, toolName: string);
2697
+ get label(): string;
2698
+ withArgs(shape: Record<string, unknown | RegExp>): this;
2699
+ times(n: number): this;
2700
+ atLeast(n: number): this;
2701
+ atMost(n: number): this;
2702
+ check(): Promise<MatcherResult>;
2703
+ }
2704
+ declare function expectAgent(store: TraceStore, runId: string): BehaviorAssertion;
2705
+ /** Runs every expectation, collects results. Never throws. */
2706
+ declare function runExpectations(expectations: Expectation[]): Promise<{
2707
+ results: Array<{
2708
+ label: string;
2709
+ result: MatcherResult;
2710
+ }>;
2711
+ pass: boolean;
2712
+ passCount: number;
2713
+ failCount: number;
2714
+ }>;
2715
+
2716
+ /**
2717
+ * Judge calibration — measure judge quality against human gold + bias.
2718
+ *
2719
+ * Workflow:
2720
+ * 1. Build a golden set: {itemId, humanScore}[].
2721
+ * 2. Run candidate judges; each produces {itemId, score}.
2722
+ * 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
2723
+ * 4. Run bias probes (positional, verbosity, self-preference) to
2724
+ * detect systematic score inflation.
2725
+ *
2726
+ * Returns actionable diagnostics, not a single number. Consumers then
2727
+ * decide whether to trust the judge, retrain it, or add a tie-breaker.
2728
+ */
2729
+ interface GoldenItem {
2730
+ itemId: string;
2731
+ humanScore: number;
2732
+ /** Optional group used for per-group bias audits (e.g. model-of-output family). */
2733
+ group?: string;
2734
+ }
2735
+ interface CandidateScore {
2736
+ itemId: string;
2737
+ score: number;
2738
+ /** Optional — enables positional-bias analysis (did order matter?). */
2739
+ positionOfAInput?: 'first' | 'second';
2740
+ }
2741
+ interface CalibrationResult {
2742
+ n: number;
2743
+ pearson: number;
2744
+ /** Cohen's κ with quadratic weights over integer-rounded scores. */
2745
+ kappa: number;
2746
+ /** Mean absolute error vs human. */
2747
+ mae: number;
2748
+ /** Worst-5 miscalibrations (largest |judge - human|). */
2749
+ worstItems: Array<{
2750
+ itemId: string;
2751
+ judge: number;
2752
+ human: number;
2753
+ delta: number;
2754
+ }>;
2755
+ }
2756
+ declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
2757
+ interface PositionalBiasResult {
2758
+ /**
2759
+ * Score delta (first-position - second-position) averaged across items
2760
+ * presented in both positions. Non-zero = positional bias.
2761
+ */
2762
+ avgDelta: number;
2763
+ n: number;
2764
+ }
2765
+ /**
2766
+ * Feed the same items to the judge twice with A/B swapped and pass all
2767
+ * results here. Items that don't appear in both positions are ignored.
2768
+ */
2769
+ declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
2770
+ interface VerbosityBiasResult {
2771
+ /** Pearson correlation between output length and score. Strong positive = verbosity bias. */
2772
+ pearson: number;
2773
+ n: number;
2774
+ }
2775
+ declare function verbosityBias(samples: Array<{
2776
+ outputLen: number;
2777
+ score: number;
2778
+ }>): VerbosityBiasResult;
2779
+ interface SelfPreferenceResult {
2780
+ /** Mean judge score when judge's family matches output's family. */
2781
+ inFamilyMean: number;
2782
+ outOfFamilyMean: number;
2783
+ deltaMean: number;
2784
+ n: number;
2785
+ }
2786
+ /**
2787
+ * Pass the same scenarios scored with judge-model X grading outputs from
2788
+ * model X (in-family) and model Y (out-of-family). Non-zero delta
2789
+ * indicates self-preference.
2790
+ */
2791
+ declare function selfPreference(samples: Array<{
2792
+ score: number;
2793
+ inFamily: boolean;
2794
+ }>): SelfPreferenceResult;
2795
+
2796
+ /**
2797
+ * CI gate — evaluate a corpus against threshold contracts and generate
2798
+ * a human-readable PR/build comment.
2799
+ *
2800
+ * Three layers:
2801
+ * 1. `ThresholdContract` declarations (YAML-equivalent TS objects)
2802
+ * 2. `evaluateContract` runs the contracts against a TraceStore and
2803
+ * returns a structured report + overall pass/fail.
2804
+ * 3. `renderMarkdownReport` formats the report for GitHub PR comments.
2805
+ *
2806
+ * Consumers wrap this in their own `gh pr comment` / CI integration —
2807
+ * we don't ship the GitHub Action binary, just the library call that
2808
+ * the action invokes.
2809
+ */
2810
+
2811
+ interface ContractMetric {
2812
+ /** Metric id matching either a predefined key or a custom extractor. */
2813
+ metric: string;
2814
+ higherIsBetter: boolean;
2815
+ /** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
2816
+ maxRegression?: number;
2817
+ /** Optional extractor if the metric isn't in the default set. */
2818
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
2819
+ }
2820
+ interface ThresholdContract {
2821
+ name: string;
2822
+ baseline: RunFilter;
2823
+ candidate: RunFilter;
2824
+ metrics: ContractMetric[];
2825
+ slos?: Slo[];
2826
+ }
2827
+ interface ContractReport {
2828
+ name: string;
2829
+ baselineReport: BaselineReport;
2830
+ sloReport?: SloReport;
2831
+ breaches: string[];
2832
+ pass: boolean;
2833
+ }
2834
+ declare function evaluateContract(store: TraceStore, contract: ThresholdContract): Promise<ContractReport>;
2835
+ declare function renderMarkdownReport(reports: ContractReport[]): string;
2836
+
2837
+ /**
2838
+ * Observability adapters — bidirectional parity with production backends.
2839
+ *
2840
+ * `toLangfuseEnvelope` maps a Run's spans into Langfuse generation/score
2841
+ * records (schema-compatible; we don't depend on the SDK — consumers
2842
+ * POST the returned JSON to their Langfuse collector).
2843
+ *
2844
+ * `toPrometheusText` converts a TraceStore into a Prometheus text-
2845
+ * exposition-format string (counters + gauges for runs, tool calls,
2846
+ * errors, cost). Drop into a `/metrics` handler; no SDK needed.
2847
+ *
2848
+ * `replayTraceThroughJudge` is the canonical "re-score with a new
2849
+ * judge" path — takes an existing run, runs a judge function over
2850
+ * each LLM span, emits JudgeVerdict spans back into the store.
2851
+ */
2852
+
2853
+ interface LangfuseGeneration {
2854
+ id: string;
2855
+ traceId: string;
2856
+ name: string;
2857
+ model: string;
2858
+ input: unknown;
2859
+ output: unknown;
2860
+ startTime: string;
2861
+ endTime: string;
2862
+ usage: {
2863
+ input: number;
2864
+ output: number;
2865
+ total: number;
2866
+ totalCost: number;
2867
+ };
2868
+ metadata: Record<string, unknown>;
2869
+ }
2870
+ interface LangfuseScore {
2871
+ id: string;
2872
+ traceId: string;
2873
+ observationId: string;
2874
+ name: string;
2875
+ value: number;
2876
+ comment?: string;
2877
+ }
2878
+ interface LangfuseEnvelope {
2879
+ traceId: string;
2880
+ generations: LangfuseGeneration[];
2881
+ scores: LangfuseScore[];
2882
+ }
2883
+ declare function toLangfuseEnvelope(store: TraceStore, runId: string): Promise<LangfuseEnvelope>;
2884
+ declare function toPrometheusText(store: TraceStore): Promise<string>;
2885
+ interface JudgeReplayResult {
2886
+ spanId: string;
2887
+ targetSpanId: string;
2888
+ dimension: string;
2889
+ score: number;
2890
+ rationale?: string;
2891
+ }
2892
+ /**
2893
+ * Apply a judge function to every LLM span in a run and record the
2894
+ * results as JudgeVerdict spans. This is the canonical "no re-execution"
2895
+ * re-scoring path — you supply a pure judge `(llmSpan) → verdict`.
2896
+ */
2897
+ declare function replayTraceThroughJudge(store: TraceStore, runId: string, judge: {
2898
+ id: string;
2899
+ dimension: string;
2900
+ score: (span: LlmSpan) => Promise<{
2901
+ score: number;
2902
+ rationale?: string;
2903
+ evidence?: string;
2904
+ }>;
2905
+ }): Promise<JudgeReplayResult[]>;
2906
+
2907
+ /**
2908
+ * Paraphrase robustness — mutates a scenario prompt in structure-
2909
+ * preserving ways, re-scores, and reports score variance.
2910
+ *
2911
+ * Mutators are pure functions `(prompt: string, seed: number) => string`. Ship a
2912
+ * default set; consumers add domain-specific ones.
2913
+ *
2914
+ * Robustness score: 1 - stdDev(scores) / (mean if positive else 1).
2915
+ * A perfect agent returns the same answer regardless of typo / case /
2916
+ * reordering — any variance signals a brittle prompt.
2917
+ */
2918
+ type Mutator = (prompt: string, seed: number) => string;
2919
+ interface RobustnessResult {
2920
+ originalScore: number;
2921
+ variantScores: Array<{
2922
+ mutator: string;
2923
+ score: number;
2924
+ mutated: string;
2925
+ }>;
2926
+ meanScore: number;
2927
+ stdDev: number;
2928
+ robustness: number;
2929
+ }
2930
+ declare function paraphraseRobustness(prompt: string, mutators: Array<{
2931
+ id: string;
2932
+ fn: Mutator;
2933
+ }>, scoreFn: (prompt: string) => Promise<number>, options?: {
2934
+ seed?: number;
2935
+ }): Promise<RobustnessResult>;
2936
+ /** Lowercase the whole prompt. Robust models ignore case. */
2937
+ declare const lowercaseMutator: Mutator;
2938
+ /** Reorder sentences. Robust models don't depend on sentence order. */
2939
+ declare const sentenceReorderMutator: Mutator;
2940
+ /** Swap adjacent letter pairs (1 per 40 chars, min 1). Robust models tolerate typos. */
2941
+ declare const typoMutator: Mutator;
2942
+ /** Add a benign politeness prefix. Robust models ignore flattery. */
2943
+ declare const politenessPrefixMutator: Mutator;
2944
+ /** Compact whitespace, strip newlines. Robust models don't depend on formatting. */
2945
+ declare const whitespaceCollapseMutator: Mutator;
2946
+ declare const DEFAULT_MUTATORS: Array<{
2947
+ id: string;
2948
+ fn: Mutator;
2949
+ }>;
2950
+
2951
+ /**
2952
+ * Visual diff — pixel-delta scoring for UI / visual outputs.
2953
+ *
2954
+ * Minimal dependency-free implementation: accepts two decoded images as RGBA byte
2955
+ * arrays + width/height and returns a Δ ratio + the max per-channel delta.
2956
+ * Consumers supply the decoded pixel arrays (we don't pull a PNG
2957
+ * decoder into the core — use `sharp`, `@napi-rs/canvas`, or Playwright
2958
+ * in the driving test and pass the result here).
2959
+ */
2960
+ interface ImageData {
2961
+ width: number;
2962
+ height: number;
2963
+ /** Pixel data in RGBA order, 4 bytes per pixel. */
2964
+ data: Uint8Array | Uint8ClampedArray;
2965
+ }
2966
+ interface VisualDiffResult {
2967
+ /** Ratio of pixels differing beyond `tolerance` (0..1). */
2968
+ diffRatio: number;
2969
+ differingPixels: number;
2970
+ totalPixels: number;
2971
+ maxChannelDelta: number;
2972
+ /** Status for dashboards: unchanged (< 0.1%), changed, or severely-changed (> 5%). */
2973
+ status: 'unchanged' | 'changed' | 'severely-changed';
2974
+ }
2975
+ interface VisualDiffOptions {
2976
+ /** Pixels whose max-channel delta is ≤ this are considered unchanged. Default 8/255. */
2977
+ tolerance?: number;
2978
+ }
2979
+ declare function visualDiff(a: ImageData, b: ImageData, options?: VisualDiffOptions): VisualDiffResult;
2980
+ /** Convenience: diffs two RGBA arrays of identical dimensions, returns just the ratio. */
2981
+ declare function pixelDeltaRatio(a: Uint8Array, b: Uint8Array, width: number, height: number, tolerance?: number): number;
2982
+
2983
+ /**
2984
+ * BuilderSession — ties a builder-of-builders workflow together.
2985
+ *
2986
+ * Models agent-builder's shape: Project → Chat → Edit → Ship → App →
2987
+ * AppAgent. Each layer is a Run (linked via parentRunId). The
2988
+ * framework-enforced invariants:
2989
+ *
2990
+ * - One Project → many Chats; chatId scopes runs within a project.
2991
+ * - One Chat = one builder Run with `layer='builder'`.
2992
+ * - One Ship = one child Run with `layer='app-build'` + SandboxHarness.
2993
+ * - One AppScenario = one grandchild Run with `layer='app-runtime'`.
2994
+ *
2995
+ * Consumers obtain a BuilderSession, call `startChat`, drive the
2996
+ * builder agent (emitting spans), and call `ship` / `runAppScenario`
2997
+ * as the workflow progresses. The session reconstructs itself from
2998
+ * trace data via `resumeBuilderSession(store, projectId)`.
2999
+ */
3000
+
3001
+ interface BuilderSessionInit {
3002
+ projectId: string;
3003
+ chatId?: string;
3004
+ /** Free-form: user's task description, project name, etc. Stored on the builder Run. */
3005
+ tags?: Record<string, string>;
3006
+ }
3007
+ interface ShipOptions {
3008
+ harness: HarnessConfig;
3009
+ driver?: SandboxDriver;
3010
+ /** scenarioId of this app-build run. Defaults to `${projectId}/build`. */
3011
+ scenarioId?: string;
3012
+ }
3013
+ interface RunAppScenarioOptions {
3014
+ scenario: TestGradedScenario;
3015
+ /** Harness driver override; defaults to the one the session was created with. */
3016
+ driver?: SandboxDriver;
3017
+ }
3018
+ declare class BuilderSession {
3019
+ private store;
3020
+ private builderEmitter;
3021
+ readonly projectId: string;
3022
+ readonly chatId: string;
3023
+ private builderRunId?;
3024
+ private lastBuildRunId?;
3025
+ private defaultDriver?;
3026
+ constructor(store: TraceStore, init: BuilderSessionInit, driver?: SandboxDriver);
3027
+ /** Start the builder (L0) run for this chat. Returns the runId. */
3028
+ startChat(scenarioId?: string): Promise<string>;
3029
+ /** The emitter for builder-level spans (edits, LLM calls, tool invocations). */
3030
+ get emitter(): TraceEmitter;
3031
+ /**
3032
+ * Ship the project's generated app: run the sandbox harness as a child
3033
+ * Run (`layer='app-build'`). Returns the build result + runId.
3034
+ */
3035
+ ship(options: ShipOptions): Promise<{
3036
+ runId: string;
3037
+ result: SandboxHarnessResult;
3038
+ }>;
3039
+ /**
3040
+ * Run a domain scenario against the just-built app as a grandchild Run
3041
+ * (`layer='app-runtime'`). The `ship` call should precede this so the
3042
+ * parent is set correctly; if no build exists yet the session attaches
3043
+ * directly to the builder run (useful for prototypes).
3044
+ */
3045
+ runAppScenario(options: RunAppScenarioOptions): Promise<TestGradedRunResult>;
3046
+ /** Record an end-of-chat meta score (judge verdict on whether the builder
3047
+ * served the user's intent). Accepts a numeric score + optional rationale. */
3048
+ recordMetaScore(score: number, rationale?: string): Promise<void>;
3049
+ /** Close the builder Run with a final outcome. */
3050
+ endChat(outcome: {
3051
+ pass: boolean;
3052
+ score?: number;
3053
+ notes?: string;
3054
+ }): Promise<void>;
3055
+ /**
3056
+ * Inline app-runtime run — for cases where the "scenario" isn't a
3057
+ * SWE-bench-style test suite but a live agent interaction (LLM chat,
3058
+ * domain flow). Returns an emitter bound to a fresh Run in the
3059
+ * `app-runtime` layer; caller emits spans inside and calls
3060
+ * `.endRun()` with the final verdict.
3061
+ */
3062
+ startAppRuntime(scenarioId: string): Promise<TraceEmitter>;
3063
+ /**
3064
+ * Lightweight "ship marker" — record an app-build Run with a caller-
3065
+ * provided verdict. Use when there isn't a sandbox harness to run but
3066
+ * you still want to mark the build state at publish time.
3067
+ */
3068
+ recordShipMarker(args: {
3069
+ pass: boolean;
3070
+ score: number;
3071
+ scenarioId?: string;
3072
+ notes?: string;
3073
+ }): Promise<string>;
3074
+ get lastBuildRunIdValue(): string | undefined;
3075
+ get builderRunIdValue(): string | undefined;
3076
+ }
3077
+ /**
3078
+ * Reconstruct the most recent BuilderSession state for a given project —
3079
+ * returns { projectId, chatRuns, lastBuilderRun, lastBuildRun, lastAppRuntimeRuns }. For chat-first UIs
3080
+ * this is how a resumed session finds its place in the edit history.
3081
+ */
3082
+ declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
3083
+ projectId: string;
3084
+ chatRuns: Run[];
3085
+ lastBuilderRun?: Run;
3086
+ lastBuildRun?: Run;
3087
+ lastAppRuntimeRuns: Run[];
3088
+ }>;
3089
+
3090
+ /**
3091
+ * Three-layer evaluation — the canonical scoring breakdown for
3092
+ * builder-of-builders workflows.
3093
+ *
3094
+ * meta_score: did the builder understand + satisfy user intent?
3095
+ * (judge verdict attached to the builder run)
3096
+ * build_score: did the generated scaffold build + pass its own tests?
3097
+ * (outcome.score on the app-build child run)
3098
+ * runtime_score: did the generated agent pass its domain scenarios?
3099
+ * (mean outcome.score over app-runtime grandchild runs)
3100
+ *
3101
+ * Returns a structured report per project. The cross-layer correlation
3102
+ * is the highest-leverage signal the framework computes — if
3103
+ * meta_score doesn't predict runtime_score, the builder's self-scoring
3104
+ * is broken.
3105
+ */
3106
+
3107
+ interface ThreeLayerProjectReport {
3108
+ projectId: string;
3109
+ builderRunId?: string;
3110
+ /** Judge-verdict score on the builder run (0..1 after normalization). */
3111
+ metaScore: number | null;
3112
+ buildRunId?: string;
3113
+ /** 0..1 from the sandbox harness (testsPassed / testsTotal). */
3114
+ buildScore: number | null;
3115
+ appRuntimeRunIds: string[];
3116
+ /** Mean of outcome.score over app-runtime runs, 0..1. */
3117
+ runtimeScore: number | null;
3118
+ runtimePassRate: number | null;
3119
+ /** True when all three layers produced a score. */
3120
+ complete: boolean;
3121
+ }
3122
+ declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
3123
+ /** Aggregate scoring across every project in a corpus. */
3124
+ declare function scoreAllProjects(store: TraceStore): Promise<ThreeLayerProjectReport[]>;
3125
+
3126
+ /**
3127
+ * Meta-eval correlation — the highest-leverage signal in the framework.
3128
+ *
3129
+ * Given a corpus of three-layer project reports, compute how well each
3130
+ * pair of layers correlates. The question we care about most:
3131
+ *
3132
+ * Does `metaScore` (what the builder thinks it did) predict
3133
+ * `runtimeScore` (what the user actually gets)?
3134
+ *
3135
+ * If r < ~0.4, the builder's self-scoring is broken — it's optimizing
3136
+ * for something other than real-world success. If r > 0.7, meta_score
3137
+ * is a usable proxy and can drive CI gates cheaply.
3138
+ *
3139
+ * Non-parametric rank correlation (Spearman) is also reported because
3140
+ * meta scores are often ordinal-ish.
3141
+ */
3142
+
3143
+ interface LayerCorrelation {
3144
+ n: number;
3145
+ pearson: number;
3146
+ spearman: number;
3147
+ }
3148
+ interface CorrelationReport {
3149
+ /** Pairs present in the corpus (layers with ≥ 2 matched data points). */
3150
+ metaVsBuild?: LayerCorrelation;
3151
+ metaVsRuntime?: LayerCorrelation;
3152
+ buildVsRuntime?: LayerCorrelation;
3153
+ /** Number of complete projects (all 3 scores present). */
3154
+ completeProjects: number;
3155
+ }
3156
+ declare function correlateLayers(reports: ThreeLayerProjectReport[]): CorrelationReport;
3157
+
3158
+ /**
3159
+ * ProjectRegistry — project-level aggregation over the trace corpus.
3160
+ *
3161
+ * Thin reader over TraceStore that answers the questions a chat-first,
3162
+ * resumable UI needs:
3163
+ * - listProjects() → project IDs with latest activity
3164
+ * - projectTimeline(id) → chats + builds + runtime runs, chronological
3165
+ * - projectChats(id) → chat-level summaries (turn count, outcome)
3166
+ *
3167
+ * All queries are pure reads; no state duplication.
3168
+ */
3169
+
3170
+ interface ProjectSummary {
3171
+ projectId: string;
3172
+ chatCount: number;
3173
+ buildCount: number;
3174
+ appRuntimeCount: number;
3175
+ lastActivityAt: number;
3176
+ latestChatId?: string;
3177
+ latestOutcome?: {
3178
+ pass: boolean;
3179
+ score?: number;
3180
+ };
3181
+ }
3182
+ interface ChatSummary {
3183
+ chatId: string;
3184
+ projectId: string;
3185
+ builderRunId: string;
3186
+ startedAt: number;
3187
+ endedAt?: number;
3188
+ status: Run['status'];
3189
+ outcome?: Run['outcome'];
3190
+ /** Counts of spans emitted during the chat. */
3191
+ llmTurns?: number;
3192
+ toolCalls?: number;
3193
+ buildRunId?: string;
3194
+ appRuntimeRunIds: string[];
3195
+ }
3196
+ interface ProjectTimelineEntry {
3197
+ run: Run;
3198
+ layerBucket: 'chat' | 'build' | 'runtime' | 'other';
3199
+ }
3200
+ declare class ProjectRegistry {
3201
+ private store;
3202
+ constructor(store: TraceStore);
3203
+ listProjects(): Promise<ProjectSummary[]>;
3204
+ projectTimeline(projectId: string): Promise<ProjectTimelineEntry[]>;
3205
+ projectChats(projectId: string): Promise<ChatSummary[]>;
3206
+ }
578
3207
 
579
- export { AgentDriver, type AgentDriverConfig, type ArtifactCheck, type ArtifactResult, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, type EvalResult, type ExecutorConfig, type FeedbackPattern, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, MODEL_PRICING, MetricsCollector, type PersonaConfig, ProductClient, type ProductClientConfig, type RouteMap, type RubricDimension, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type TestResult, TokenCounter, type Turn, type TurnMetrics, type TurnResult, adversarialJudge, codeExecutionJudge, coherenceJudge, confidenceInterval, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, formatBenchmarkReport, formatDriverReport, interRaterReliability, mannWhitneyU, normalizeScores, partialCredit, printDriverSummary, runE2EWorkflow, weightedMean };
3208
+ export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScore, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CostEntry, type CostSummary, CostTracker, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type EventFilter, type EventKind, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, FAILURE_CLASSES, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type ImageData, InMemoryExperimentStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InspectorContext, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, TRACE_SCHEMA_VERSION, type 
TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, benjaminiHochberg, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, canaryLeakView, checkCanaries, checkSlos, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, dominates, estimateCost, estimateTokens, evaluateContract, evaluateOracles, executeScenario, expectAgent, exportRunAsOtlp, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, lowercaseMutator, mannWhitneyU, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, pytestTestParser, redTeamDataset, 
redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdownReport, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runE2EWorkflow, runExpectations, runFailureClass, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, statusAdvanced, stuckLoopView, textInSnapshot, toLangfuseEnvelope, toPrometheusText, toolNamesForRun, toolSpans, toolWasteView, typoMutator, urlContains, verbosityBias, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };