@tangle-network/agent-eval 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -470,6 +470,31 @@ declare function mannWhitneyU(a: number[], b: number[]): {
470
470
  };
471
471
  /** Partial credit: returns 0-1 ratio of current toward target */
472
472
  declare function partialCredit(current: number, target: number): number;
473
+ /**
474
+ * Paired t-test — before/after measurements on the SAME items.
475
+ * Pairing removes inter-item variance, giving tighter significance than
476
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
477
+ * scenarios.
478
+ */
479
+ declare function pairedTTest(before: number[], after: number[]): {
480
+ t: number;
481
+ df: number;
482
+ p: number;
483
+ };
484
+ /**
485
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
486
+ * Use when the differences aren't normally distributed.
487
+ */
488
+ declare function wilcoxonSignedRank(before: number[], after: number[]): {
489
+ w: number;
490
+ p: number;
491
+ };
492
+ /**
493
+ * Cohen's d — standardized effect size for two independent groups.
494
+ * Positive d means group b has higher mean than group a.
495
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
496
+ */
497
+ declare function cohensD(a: number[], b: number[]): number;
473
498
 
474
499
  /**
475
500
  * ConvergenceTracker — tracks completion percentage over turns.
@@ -576,4 +601,587 @@ declare function formatDriverReport(results: DriverResult[]): string;
576
601
  /** Print a compact summary to console */
577
602
  declare function printDriverSummary(results: DriverResult[]): void;
578
603
 
579
- export { AgentDriver, type AgentDriverConfig, type ArtifactCheck, type ArtifactResult, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, type EvalResult, type ExecutorConfig, type FeedbackPattern, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, MODEL_PRICING, MetricsCollector, type PersonaConfig, ProductClient, type ProductClientConfig, type RouteMap, type RubricDimension, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type TestResult, TokenCounter, type Turn, type TurnMetrics, type TurnResult, adversarialJudge, codeExecutionJudge, coherenceJudge, confidenceInterval, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, formatBenchmarkReport, formatDriverReport, interRaterReliability, mannWhitneyU, normalizeScores, partialCredit, printDriverSummary, runE2EWorkflow, weightedMean };
604
+ /**
605
+ * Versioned prompt registry.
606
+ *
607
+ * Every prompt used in an eval run is registered with an explicit version.
608
+ * Reports include the content hash so A/B compares are rigorous: if the
609
+ * hash changes between two reports, the prompt actually changed; if it
610
+ * matches, the variance is elsewhere.
611
+ *
612
+ * Hash is SHA-256(content), truncated to 12 hex chars for readability.
613
+ * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
614
+ */
615
+ interface PromptHandle {
616
+ /** Stable human-readable id, e.g. 'legal.system' */
617
+ id: string;
618
+ /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
619
+ version: string;
620
+ /** SHA-256 of content, 12-hex-char prefix */
621
+ hash: string;
622
+ /** Full prompt body */
623
+ content: string;
624
+ }
625
+ declare class PromptRegistry {
626
+ private readonly entries;
627
+ /**
628
+ * Register a prompt. Re-registering the same id+version with DIFFERENT
629
+ * content throws — versions are immutable. Re-registering with the SAME
630
+ * content is a no-op (idempotent).
631
+ */
632
+ register(id: string, version: string, content: string): Promise<PromptHandle>;
633
+ /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
634
+ get(id: string, version: string): PromptHandle;
635
+ /** Return all versions of an id, newest-first (lex-descending on version). */
636
+ listVersions(id: string): PromptHandle[];
637
+ /** Snapshot the whole registry — useful for including in reports. */
638
+ list(): PromptHandle[];
639
+ /** Verify a hash against registered content. Returns null if not found. */
640
+ verifyHash(id: string, version: string, expectedHash: string): boolean | null;
641
+ }
642
+ /** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
643
+ declare function hashContent(content: string): Promise<string>;
644
+
645
+ /**
646
+ * LLM trace store — one record per model call.
647
+ *
648
+ * Sink for the full eval data-plane: what got sent, what came back, what it
649
+ * cost, how long it took. Replayable, queryable, diff-able.
650
+ *
651
+ * Two built-in stores:
652
+ * - `MemoryTraceStore` — fast, ephemeral, useful in tests and short runs
653
+ * - `FileSystemTraceStore` — NDJSON files per-run, grepable, committable
654
+ *
655
+ * Consumers plug in custom stores for Langfuse / OTEL / D1 / Postgres.
656
+ */
657
+ interface LlmTrace {
658
+ id: string;
659
+ runId: string;
660
+ scenarioId?: string;
661
+ turnIndex?: number;
662
+ role: 'driver' | 'judge' | 'product' | 'optimizer' | string;
663
+ model: string;
664
+ prompt: string;
665
+ output: string;
666
+ inputTokens?: number;
667
+ outputTokens?: number;
668
+ costUsd?: number;
669
+ durationMs?: number;
670
+ timestamp: string;
671
+ metadata?: Record<string, unknown>;
672
+ }
673
+ interface TraceQuery {
674
+ runId?: string;
675
+ scenarioId?: string;
676
+ role?: string;
677
+ model?: string;
678
+ sinceMs?: number;
679
+ limit?: number;
680
+ }
681
+ interface TraceStore {
682
+ record(trace: LlmTrace): Promise<void>;
683
+ query(query: TraceQuery): Promise<LlmTrace[]>;
684
+ count(query?: TraceQuery): Promise<number>;
685
+ }
686
+ declare class MemoryTraceStore implements TraceStore {
687
+ private traces;
688
+ record(trace: LlmTrace): Promise<void>;
689
+ query(query: TraceQuery): Promise<LlmTrace[]>;
690
+ count(query?: TraceQuery): Promise<number>;
691
+ /** Clear the store — test helper. */
692
+ reset(): void;
693
+ private filter;
694
+ }
695
+ interface FileSystemTraceStoreOptions {
696
+ dir: string;
697
+ /** Max file size before rolling to a new segment (default 32 MB). */
698
+ rolloverBytes?: number;
699
+ /** Function to write the file — defaults to node:fs/promises.appendFile */
700
+ append?: (path: string, data: string) => Promise<void>;
701
+ read?: (path: string) => Promise<string>;
702
+ list?: (dir: string) => Promise<string[]>;
703
+ stat?: (path: string) => Promise<{
704
+ size: number;
705
+ }>;
706
+ mkdir?: (dir: string) => Promise<void>;
707
+ }
708
+ declare class FileSystemTraceStore implements TraceStore {
709
+ private readonly opts;
710
+ constructor(opts: FileSystemTraceStoreOptions);
711
+ record(trace: LlmTrace): Promise<void>;
712
+ query(query: TraceQuery): Promise<LlmTrace[]>;
713
+ count(query?: TraceQuery): Promise<number>;
714
+ private segments;
715
+ private currentSegment;
716
+ }
717
+
718
+ /**
719
+ * Anti-slop quality judge.
720
+ *
721
+ * Deterministic pattern-based quality check — no LLM call. Catches the
722
+ * 80% of AI slop that every production agent leaks:
723
+ * - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
724
+ * - N-gram repetition (same phrase over and over)
725
+ * - Hedging overuse ("I could be wrong, but...")
726
+ * - Apology padding ("I'm so sorry for the confusion...")
727
+ * - Banned opening formulas ("Great question!")
728
+ * - Length bounds (too short to be useful, too long to be read)
729
+ *
730
+ * Produces a JudgeScore in the same shape as LLM judges so it composes into
731
+ * `BenchmarkRunner`'s judge array transparently.
732
+ */
733
+
734
+ interface AntiSlopConfig {
735
+ /** Domain label — appears in the JudgeScore output */
736
+ domain?: string;
737
+ /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
738
+ bannedPhrases?: string[];
739
+ /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
740
+ bannedOpenings?: RegExp[];
741
+ /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
742
+ hedgingPatterns?: RegExp[];
743
+ /** Regexes matching apology padding. */
744
+ apologyPatterns?: RegExp[];
745
+ /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
746
+ repetitionThreshold?: number;
747
+ /** Min output length in chars; below this the turn is deemed too terse. */
748
+ minLength?: number;
749
+ /** Max output length in chars; above this the turn is deemed too verbose. */
750
+ maxLength?: number;
751
+ /** How heavily each violation class reduces the score (default 1). */
752
+ penaltyWeights?: Partial<Record<SlopCategory, number>>;
753
+ }
754
+ type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
755
+ /** Create a reusable Judge function from an anti-slop config. */
756
+ declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
757
+ interface AntiSlopIssue {
758
+ category: SlopCategory;
759
+ detail: string;
760
+ example?: string;
761
+ }
762
+ interface AntiSlopReport {
763
+ /** 0–10 score; 10 is clean, lower values mean more slop. */
764
+ score: number;
765
+ issues: AntiSlopIssue[];
766
+ /** Count of each category for programmatic aggregation. */
767
+ counts: Record<SlopCategory, number>;
768
+ }
769
+ /**
770
+ * Pure function — analyze one or more outputs against the config. Exposed
771
+ * separately so consumers can build their own reporters on top.
772
+ */
773
+ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
774
+ penaltyWeights: Record<SlopCategory, number>;
775
+ }): AntiSlopReport;
776
+
777
+ /**
778
+ * Artifact validators.
779
+ *
780
+ * Generic "score a produced artifact" primitive. Tax uses it for PDF form
781
+ * correctness, legal for contract clauses, film for script breakdowns, GTM
782
+ * for social posts. One interface, many validators; all plug into
783
+ * `BenchmarkRunner` the same way.
784
+ *
785
+ * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
786
+ * plus a `ValidationContext` (scenario id, the turns that produced it) and
787
+ * returns a `ValidationResult` with pass/fail + 0..1 score + structured
788
+ * issues.
789
+ */
790
+ interface Artifact {
791
+ /** Logical kind — validators type-guard on this */
792
+ kind: 'file' | 'json' | 'text' | 'binary' | string;
793
+ /** Filesystem-style path, optional */
794
+ path?: string;
795
+ /** String content for text/json/file kinds */
796
+ content?: string;
797
+ /** Binary content (if kind === 'binary') */
798
+ bytes?: Uint8Array;
799
+ /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
800
+ metadata?: Record<string, unknown>;
801
+ }
802
+ interface ValidationContext {
803
+ scenarioId: string;
804
+ turnIndex?: number;
805
+ /** Prior artifacts for multi-artifact scenarios */
806
+ priorArtifacts?: Artifact[];
807
+ /** Free-form hints the validator uses for domain-specific checks */
808
+ hints?: Record<string, unknown>;
809
+ }
810
+ interface ValidationIssue {
811
+ severity: 'error' | 'warning' | 'info';
812
+ message: string;
813
+ /** Optional path into the artifact (e.g. JSON path or byte offset) */
814
+ locus?: string;
815
+ }
816
+ interface ValidationResult {
817
+ pass: boolean;
818
+ /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
819
+ score: number;
820
+ issues: ValidationIssue[];
821
+ /** Diagnostic payload for reporters */
822
+ evidence?: Record<string, unknown>;
823
+ }
824
+ interface ArtifactValidator {
825
+ /** Stable identifier for the validator; appears in reports. */
826
+ name: string;
827
+ /** Optional description for human-facing reports. */
828
+ description?: string;
829
+ /** Called once per artifact; validators are expected to be pure + idempotent. */
830
+ validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
831
+ }
832
+ /**
833
+ * Run every validator on the same artifact; aggregate pass as AND, score as
834
+ * (weighted) mean, issues concatenated. Weights default to 1 each.
835
+ */
836
+ declare function composeValidators(validators: ArtifactValidator[], options?: {
837
+ name?: string;
838
+ weights?: number[];
839
+ }): ArtifactValidator;
840
+ /** Pass if the artifact body matches a provided regex. */
841
+ declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
842
+ /** Pass if JSON parses and every required key is present. */
843
+ declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
844
+ /** Pass if min ≤ byte length ≤ max. */
845
+ declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
846
+ /** Pass if the artifact contains every required substring (case-insensitive by default). */
847
+ declare function containsAll(name: string, required: string[], options?: {
848
+ caseSensitive?: boolean;
849
+ }): ArtifactValidator;
850
+
851
+ /**
852
+ * Workspace inspector — score the persisted state of an agent after a run.
853
+ *
854
+ * Many evals don't ask "did the response say the right thing" but "did the
855
+ * agent put the right rows in the DB / files in the vault / entities on the
856
+ * canvas". This is the primitive for that.
857
+ *
858
+ * Implementations read from D1, KV, filesystem, or any store — the interface
859
+ * is deliberately small so consumers plug in their own backends.
860
+ */
861
+ interface WorkspaceSnapshot {
862
+ /** Vault files: logical path → content */
863
+ files: Record<string, string>;
864
+ /** DB rows: table name → array of rows (post-validation) */
865
+ rows: Record<string, Array<Record<string, unknown>>>;
866
+ /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
867
+ kv: Record<string, string>;
868
+ /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
869
+ blobs?: Record<string, {
870
+ size: number;
871
+ hash?: string;
872
+ mimeType?: string;
873
+ }>;
874
+ }
875
+ interface InspectorContext {
876
+ /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
877
+ scopeId: string;
878
+ /** Optional scenario id — allows scenario-specific snapshot shaping */
879
+ scenarioId?: string;
880
+ }
881
+ interface WorkspaceInspector {
882
+ name: string;
883
+ snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
884
+ }
885
+ declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
886
+ readonly name = "in-memory";
887
+ private readonly snapshots;
888
+ set(scopeId: string, snapshot: WorkspaceSnapshot): void;
889
+ snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
890
+ }
891
+ interface WorkspaceAssertion {
892
+ name: string;
893
+ description?: string;
894
+ check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
895
+ }
896
+ interface WorkspaceAssertionResult {
897
+ pass: boolean;
898
+ /** 0..1 — partial credit for assertions that admit it */
899
+ score: number;
900
+ detail?: string;
901
+ }
902
+ declare function fileExists(path: string): WorkspaceAssertion;
903
+ declare function fileContains(path: string, needle: string): WorkspaceAssertion;
904
+ declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
905
+ declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
906
+ min?: number;
907
+ }): WorkspaceAssertion;
908
+ /** Run many assertions; return aggregate pass + mean score + per-assertion details. */
909
+ declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
910
+ pass: boolean;
911
+ score: number;
912
+ results: Array<{
913
+ assertion: string;
914
+ result: WorkspaceAssertionResult;
915
+ }>;
916
+ };
917
+
918
+ /**
919
+ * Experiment tracker — group runs, diff them, watch scores move over time.
920
+ *
921
+ * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
922
+ * - A run has a config (prompt hash, model, scenario ids, seed)
923
+ * - Runs belong to experiments (named groups)
924
+ * - The store is pluggable (in-memory for tests, filesystem for local,
925
+ * custom for Langfuse/D1)
926
+ * - Diffs show score deltas, new/dropped scenarios, and config changes
927
+ *
928
+ * The output plugs directly into `BenchmarkReport` — runs archive the full
929
+ * report, diff operates on the summary.
930
+ */
931
+
932
+ interface RunConfig {
933
+ experimentId: string;
934
+ name?: string;
935
+ model?: string;
936
+ promptHash?: string;
937
+ promptVersion?: string;
938
+ seed?: number;
939
+ metadata?: Record<string, unknown>;
940
+ }
941
+ interface Run {
942
+ id: string;
943
+ experimentId: string;
944
+ name?: string;
945
+ config: RunConfig;
946
+ startedAt: string;
947
+ completedAt?: string;
948
+ status: 'running' | 'completed' | 'failed';
949
+ report?: BenchmarkReport;
950
+ error?: string;
951
+ }
952
+ interface Experiment {
953
+ id: string;
954
+ name: string;
955
+ createdAt: string;
956
+ metadata?: Record<string, unknown>;
957
+ }
958
+ interface ExperimentStore {
959
+ saveExperiment(exp: Experiment): Promise<void>;
960
+ getExperiment(id: string): Promise<Experiment | null>;
961
+ listExperiments(): Promise<Experiment[]>;
962
+ saveRun(run: Run): Promise<void>;
963
+ getRun(id: string): Promise<Run | null>;
964
+ listRuns(experimentId: string): Promise<Run[]>;
965
+ }
966
+ declare class InMemoryExperimentStore implements ExperimentStore {
967
+ private readonly experiments;
968
+ private readonly runs;
969
+ saveExperiment(exp: Experiment): Promise<void>;
970
+ getExperiment(id: string): Promise<Experiment | null>;
971
+ listExperiments(): Promise<Experiment[]>;
972
+ saveRun(run: Run): Promise<void>;
973
+ getRun(id: string): Promise<Run | null>;
974
+ listRuns(experimentId: string): Promise<Run[]>;
975
+ }
976
+ declare class ExperimentTracker {
977
+ private readonly store;
978
+ constructor(store: ExperimentStore);
979
+ startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
980
+ startRun(config: RunConfig): Promise<Run>;
981
+ completeRun(runId: string, report: BenchmarkReport): Promise<void>;
982
+ failRun(runId: string, error: string): Promise<void>;
983
+ /**
984
+ * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
985
+ * and config changes that may explain the movement.
986
+ */
987
+ diff(runIdA: string, runIdB: string): Promise<RunDiff>;
988
+ /** Timeline of aggregate scores for an experiment. */
989
+ timeline(experimentId: string): Promise<Array<{
990
+ runId: string;
991
+ startedAt: string;
992
+ overall: number | null;
993
+ }>>;
994
+ }
995
+ interface RunDiff {
996
+ before: {
997
+ runId: string;
998
+ name?: string;
999
+ startedAt: string;
1000
+ };
1001
+ after: {
1002
+ runId: string;
1003
+ name?: string;
1004
+ startedAt: string;
1005
+ };
1006
+ aggregateDelta: number;
1007
+ scenarios: Array<{
1008
+ scenarioId: string;
1009
+ before: number | null;
1010
+ after: number | null;
1011
+ delta: number | null;
1012
+ status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
1013
+ }>;
1014
+ configChanges: Record<string, {
1015
+ before: unknown;
1016
+ after: unknown;
1017
+ }>;
1018
+ }
1019
+
1020
+ /**
1021
+ * Prompt optimizer — A/B test prompt variants with statistical rigor.
1022
+ *
1023
+ * Runs N prompt variants against a fixed scenario set, collects per-scenario
1024
+ * scores via the user-provided `scoreVariant` callback, and returns:
1025
+ * - per-variant mean + bootstrap CI
1026
+ * - pairwise significance (Mann-Whitney, non-parametric — works on any
1027
+ * score distribution, not just normal)
1028
+ * - a winner (highest mean, flagged if the lead is not significant)
1029
+ *
1030
+ * Deliberately generic — the `scoreVariant` callback does whatever domain
1031
+ * work the consumer needs (invoke the agent, judge the output, whatever),
1032
+ * and returns a number per scenario. This lets the optimizer stay small +
1033
+ * testable.
1034
+ */
1035
+ interface PromptVariant {
1036
+ id: string;
1037
+ prompt: string;
1038
+ metadata?: Record<string, unknown>;
1039
+ }
1040
+ interface OptimizationConfig {
1041
+ variants: PromptVariant[];
1042
+ /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
1043
+ trialsPerScenario?: number;
1044
+ /** Significance threshold for pairwise comparison (default 0.05). */
1045
+ significanceLevel?: number;
1046
+ /**
1047
+ * The scoring callback. For each (variant, scenarioId, trialIndex), produce
1048
+ * a score in 0..1 (or any numeric range — the optimizer only cares about
1049
+ * monotonicity).
1050
+ */
1051
+ scoreVariant: (args: {
1052
+ variant: PromptVariant;
1053
+ scenarioId: string;
1054
+ trialIndex: number;
1055
+ }) => Promise<number>;
1056
+ /** Scenario ids to run against. */
1057
+ scenarioIds: string[];
1058
+ /** Optional hook — fires after each (variant, scenario) fully scored. */
1059
+ onScenarioComplete?: (info: {
1060
+ variantId: string;
1061
+ scenarioId: string;
1062
+ scores: number[];
1063
+ }) => void;
1064
+ }
1065
+ interface VariantScore {
1066
+ variantId: string;
1067
+ mean: number;
1068
+ ci95: {
1069
+ lower: number;
1070
+ upper: number;
1071
+ };
1072
+ n: number;
1073
+ perScenario: Record<string, {
1074
+ mean: number;
1075
+ n: number;
1076
+ samples: number[];
1077
+ }>;
1078
+ }
1079
+ interface PairwiseComparison {
1080
+ variantA: string;
1081
+ variantB: string;
1082
+ pValue: number;
1083
+ significant: boolean;
1084
+ meanDelta: number;
1085
+ }
1086
+ interface OptimizationResult {
1087
+ winner: {
1088
+ variantId: string;
1089
+ /** True when the winner's lead vs every other variant is statistically significant. */
1090
+ significant: boolean;
1091
+ ciLowerBoundExceedsSecondMean: boolean;
1092
+ };
1093
+ scores: VariantScore[];
1094
+ pairwise: PairwiseComparison[];
1095
+ config: {
1096
+ trialsPerScenario: number;
1097
+ significanceLevel: number;
1098
+ variants: string[];
1099
+ scenarios: string[];
1100
+ };
1101
+ }
1102
+ declare class PromptOptimizer {
1103
+ run(config: OptimizationConfig): Promise<OptimizationResult>;
1104
+ }
1105
+
1106
+ /**
1107
+ * Dual-agent convergence bench.
1108
+ *
1109
+ * Pattern lifted from tax-agent + legal-agent: two agents take turns until
1110
+ * they converge on a consensus artifact. One proposes, the other critiques;
1111
+ * the proposer revises; repeat until a score threshold is hit or max rounds.
1112
+ *
1113
+ * Generalized so any two "agents" (gateways, local functions, anything with
1114
+ * `propose` + `critique`) compose in. Returns convergence rounds per
1115
+ * scenario + whether convergence happened.
1116
+ */
1117
+ interface DualAgentScenario {
1118
+ id: string;
1119
+ initialPrompt: string;
1120
+ /** Optional context the agents can read (e.g. source documents). */
1121
+ context?: Record<string, unknown>;
1122
+ }
1123
+ interface DualAgentRound {
1124
+ roundIndex: number;
1125
+ proposal: string;
1126
+ critique: string;
1127
+ convergenceScore: number;
1128
+ }
1129
+ interface DualAgentScenarioResult {
1130
+ scenarioId: string;
1131
+ converged: boolean;
1132
+ roundsToConverge: number | null;
1133
+ finalProposal: string;
1134
+ history: DualAgentRound[];
1135
+ finalScore: number;
1136
+ }
1137
+ interface DualAgentBenchConfig {
1138
+ scenarios: DualAgentScenario[];
1139
+ maxRounds?: number;
1140
+ /** Convergence threshold in 0..1 (default 0.85). */
1141
+ convergenceThreshold?: number;
1142
+ /**
1143
+ * Propose an answer given the scenario + the critic's prior critique (if any).
1144
+ * Returns the proposal string.
1145
+ */
1146
+ propose: (args: {
1147
+ scenario: DualAgentScenario;
1148
+ roundIndex: number;
1149
+ priorProposal?: string;
1150
+ priorCritique?: string;
1151
+ }) => Promise<string>;
1152
+ /**
1153
+ * Critique the proposer's current output. Returns a structured critique
1154
+ * (free text) plus a convergence score: how close the proposal is to
1155
+ * acceptable. 1.0 = accept, 0.0 = totally off.
1156
+ */
1157
+ critique: (args: {
1158
+ scenario: DualAgentScenario;
1159
+ roundIndex: number;
1160
+ proposal: string;
1161
+ }) => Promise<{
1162
+ critique: string;
1163
+ convergenceScore: number;
1164
+ }>;
1165
+ /** Optional per-round hook for progress + tracing. */
1166
+ onRoundComplete?: (info: {
1167
+ scenarioId: string;
1168
+ round: DualAgentRound;
1169
+ }) => void;
1170
+ }
1171
+ interface DualAgentReport {
1172
+ scenarios: DualAgentScenarioResult[];
1173
+ aggregate: {
1174
+ convergenceRate: number;
1175
+ avgRoundsToConverge: number | null;
1176
+ avgFinalScore: number;
1177
+ };
1178
+ config: {
1179
+ maxRounds: number;
1180
+ convergenceThreshold: number;
1181
+ };
1182
+ }
1183
+ declare class DualAgentBench {
1184
+ run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
1185
+ }
1186
+
1187
+ export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type ArtifactResult, type ArtifactValidator, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type ExecutorConfig, type Experiment, type ExperimentStore, ExperimentTracker, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InspectorContext, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, type LlmTrace, MODEL_PRICING, MemoryTraceStore, MetricsCollector, type OptimizationConfig, type OptimizationResult, type PairwiseComparison, type PersonaConfig, ProductClient, type ProductClientConfig, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type RouteMap, type RubricDimension, type Run, type RunConfig, type RunDiff, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SlopCategory, type TestResult, TokenCounter, type TraceQuery, type TraceStore, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, analyzeAntiSlop, byteLengthRange, codeExecutionJudge, cohensD, coherenceJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, fileContains, fileExists, formatBenchmarkReport, formatDriverReport, hashContent, interRaterReliability, jsonHasKeys, mannWhitneyU, normalizeScores, pairedTTest, partialCredit, printDriverSummary, regexMatch, rowCount, rowWhere, runAssertions, runE2EWorkflow, weightedMean, wilcoxonSignedRank };