@tangle-network/agent-eval 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +609 -1
- package/dist/index.js +1011 -1
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.d.ts
CHANGED
|
@@ -470,6 +470,31 @@ declare function mannWhitneyU(a: number[], b: number[]): {
|
|
|
470
470
|
};
|
|
471
471
|
/** Partial credit: returns 0-1 ratio of current toward target */
|
|
472
472
|
declare function partialCredit(current: number, target: number): number;
|
|
473
|
+
/**
|
|
474
|
+
* Paired t-test — before/after measurements on the SAME items.
|
|
475
|
+
* Pairing removes inter-item variance, giving tighter significance than
|
|
476
|
+
* an unpaired test when comparing prompt v1 vs prompt v2 on identical
|
|
477
|
+
* scenarios.
|
|
478
|
+
*/
|
|
479
|
+
declare function pairedTTest(before: number[], after: number[]): {
|
|
480
|
+
t: number;
|
|
481
|
+
df: number;
|
|
482
|
+
p: number;
|
|
483
|
+
};
|
|
484
|
+
/**
|
|
485
|
+
* Wilcoxon signed-rank test — paired non-parametric alternative.
|
|
486
|
+
* Use when the differences aren't normally distributed.
|
|
487
|
+
*/
|
|
488
|
+
declare function wilcoxonSignedRank(before: number[], after: number[]): {
|
|
489
|
+
w: number;
|
|
490
|
+
p: number;
|
|
491
|
+
};
|
|
492
|
+
/**
|
|
493
|
+
* Cohen's d — standardized effect size for two independent groups.
|
|
494
|
+
* Positive d means group b has higher mean than group a.
|
|
495
|
+
* Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
|
|
496
|
+
*/
|
|
497
|
+
declare function cohensD(a: number[], b: number[]): number;
|
|
473
498
|
|
|
474
499
|
/**
|
|
475
500
|
* ConvergenceTracker — tracks completion percentage over turns.
|
|
@@ -576,4 +601,587 @@ declare function formatDriverReport(results: DriverResult[]): string;
|
|
|
576
601
|
/** Print a compact summary to console */
|
|
577
602
|
declare function printDriverSummary(results: DriverResult[]): void;
|
|
578
603
|
|
|
579
|
-
|
|
604
|
+
/**
|
|
605
|
+
* Versioned prompt registry.
|
|
606
|
+
*
|
|
607
|
+
* Every prompt used in an eval run is registered with an explicit version.
|
|
608
|
+
* Reports include the content hash so A/B compares are rigorous: if the
|
|
609
|
+
* hash changes between two reports, the prompt actually changed; if it
|
|
610
|
+
* matches, the variance is elsewhere.
|
|
611
|
+
*
|
|
612
|
+
* Hash is SHA-256(content), truncated to 12 hex chars for readability.
|
|
613
|
+
* Uses the Web Crypto API (works in Workers, Node 22+, browsers).
|
|
614
|
+
*/
|
|
615
|
+
interface PromptHandle {
|
|
616
|
+
/** Stable human-readable id, e.g. 'legal.system' */
|
|
617
|
+
id: string;
|
|
618
|
+
/** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
|
|
619
|
+
version: string;
|
|
620
|
+
/** SHA-256 of content, 12-hex-char prefix */
|
|
621
|
+
hash: string;
|
|
622
|
+
/** Full prompt body */
|
|
623
|
+
content: string;
|
|
624
|
+
}
|
|
625
|
+
declare class PromptRegistry {
|
|
626
|
+
private readonly entries;
|
|
627
|
+
/**
|
|
628
|
+
* Register a prompt. Re-registering the same id+version with DIFFERENT
|
|
629
|
+
* content throws — versions are immutable. Re-registering with the SAME
|
|
630
|
+
* content is a no-op (idempotent).
|
|
631
|
+
*/
|
|
632
|
+
register(id: string, version: string, content: string): Promise<PromptHandle>;
|
|
633
|
+
/** Look up a registered prompt. Throws if unknown — no implicit defaults. */
|
|
634
|
+
get(id: string, version: string): PromptHandle;
|
|
635
|
+
/** Return all versions of an id, newest-first (lex-descending on version). */
|
|
636
|
+
listVersions(id: string): PromptHandle[];
|
|
637
|
+
/** Snapshot the whole registry — useful for including in reports. */
|
|
638
|
+
list(): PromptHandle[];
|
|
639
|
+
/** Verify a hash against registered content. Returns null if not found. */
|
|
640
|
+
verifyHash(id: string, version: string, expectedHash: string): boolean | null;
|
|
641
|
+
}
|
|
642
|
+
/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
|
|
643
|
+
declare function hashContent(content: string): Promise<string>;
|
|
644
|
+
|
|
645
|
+
/**
|
|
646
|
+
* LLM trace store — one record per model call.
|
|
647
|
+
*
|
|
648
|
+
* Sink for the full eval data-plane: what got sent, what came back, what it
|
|
649
|
+
* cost, how long it took. Replayable, queryable, diff-able.
|
|
650
|
+
*
|
|
651
|
+
* Two built-in stores:
|
|
652
|
+
* - `MemoryTraceStore` — fast, ephemeral, useful in tests and short runs
|
|
653
|
+
* - `FileSystemTraceStore` — NDJSON files per-run, grepable, committable
|
|
654
|
+
*
|
|
655
|
+
* Consumers plug in custom stores for Langfuse / OTEL / D1 / Postgres.
|
|
656
|
+
*/
|
|
657
|
+
interface LlmTrace {
|
|
658
|
+
id: string;
|
|
659
|
+
runId: string;
|
|
660
|
+
scenarioId?: string;
|
|
661
|
+
turnIndex?: number;
|
|
662
|
+
role: 'driver' | 'judge' | 'product' | 'optimizer' | string;
|
|
663
|
+
model: string;
|
|
664
|
+
prompt: string;
|
|
665
|
+
output: string;
|
|
666
|
+
inputTokens?: number;
|
|
667
|
+
outputTokens?: number;
|
|
668
|
+
costUsd?: number;
|
|
669
|
+
durationMs?: number;
|
|
670
|
+
timestamp: string;
|
|
671
|
+
metadata?: Record<string, unknown>;
|
|
672
|
+
}
|
|
673
|
+
interface TraceQuery {
|
|
674
|
+
runId?: string;
|
|
675
|
+
scenarioId?: string;
|
|
676
|
+
role?: string;
|
|
677
|
+
model?: string;
|
|
678
|
+
sinceMs?: number;
|
|
679
|
+
limit?: number;
|
|
680
|
+
}
|
|
681
|
+
interface TraceStore {
|
|
682
|
+
record(trace: LlmTrace): Promise<void>;
|
|
683
|
+
query(query: TraceQuery): Promise<LlmTrace[]>;
|
|
684
|
+
count(query?: TraceQuery): Promise<number>;
|
|
685
|
+
}
|
|
686
|
+
declare class MemoryTraceStore implements TraceStore {
|
|
687
|
+
private traces;
|
|
688
|
+
record(trace: LlmTrace): Promise<void>;
|
|
689
|
+
query(query: TraceQuery): Promise<LlmTrace[]>;
|
|
690
|
+
count(query?: TraceQuery): Promise<number>;
|
|
691
|
+
/** Clear the store — test helper. */
|
|
692
|
+
reset(): void;
|
|
693
|
+
private filter;
|
|
694
|
+
}
|
|
695
|
+
interface FileSystemTraceStoreOptions {
|
|
696
|
+
dir: string;
|
|
697
|
+
/** Max file size before rolling to a new segment (default 32 MB). */
|
|
698
|
+
rolloverBytes?: number;
|
|
699
|
+
/** Function to write the file — defaults to node:fs/promises.appendFile */
|
|
700
|
+
append?: (path: string, data: string) => Promise<void>;
|
|
701
|
+
read?: (path: string) => Promise<string>;
|
|
702
|
+
list?: (dir: string) => Promise<string[]>;
|
|
703
|
+
stat?: (path: string) => Promise<{
|
|
704
|
+
size: number;
|
|
705
|
+
}>;
|
|
706
|
+
mkdir?: (dir: string) => Promise<void>;
|
|
707
|
+
}
|
|
708
|
+
declare class FileSystemTraceStore implements TraceStore {
|
|
709
|
+
private readonly opts;
|
|
710
|
+
constructor(opts: FileSystemTraceStoreOptions);
|
|
711
|
+
record(trace: LlmTrace): Promise<void>;
|
|
712
|
+
query(query: TraceQuery): Promise<LlmTrace[]>;
|
|
713
|
+
count(query?: TraceQuery): Promise<number>;
|
|
714
|
+
private segments;
|
|
715
|
+
private currentSegment;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
/**
|
|
719
|
+
* Anti-slop quality judge.
|
|
720
|
+
*
|
|
721
|
+
* Deterministic pattern-based quality check — no LLM call. Catches the
|
|
722
|
+
* 80% of AI slop that every production agent leaks:
|
|
723
|
+
* - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
|
|
724
|
+
* - N-gram repetition (same phrase over and over)
|
|
725
|
+
* - Hedging overuse ("I could be wrong, but...")
|
|
726
|
+
* - Apology padding ("I'm so sorry for the confusion...")
|
|
727
|
+
* - Banned opening formulas ("Great question!")
|
|
728
|
+
* - Length bounds (too short to be useful, too long to be read)
|
|
729
|
+
*
|
|
730
|
+
* Produces a JudgeScore in the same shape as LLM judges so it composes into
|
|
731
|
+
* `BenchmarkRunner`'s judge array transparently.
|
|
732
|
+
*/
|
|
733
|
+
|
|
734
|
+
interface AntiSlopConfig {
|
|
735
|
+
/** Domain label — appears in the JudgeScore output */
|
|
736
|
+
domain?: string;
|
|
737
|
+
/** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
|
|
738
|
+
bannedPhrases?: string[];
|
|
739
|
+
/** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
|
|
740
|
+
bannedOpenings?: RegExp[];
|
|
741
|
+
/** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
|
|
742
|
+
hedgingPatterns?: RegExp[];
|
|
743
|
+
/** Regexes matching apology padding. */
|
|
744
|
+
apologyPatterns?: RegExp[];
|
|
745
|
+
/** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
|
|
746
|
+
repetitionThreshold?: number;
|
|
747
|
+
/** Min output length in chars; below this the turn is deemed too terse. */
|
|
748
|
+
minLength?: number;
|
|
749
|
+
/** Max output length in chars; above this the turn is deemed too verbose. */
|
|
750
|
+
maxLength?: number;
|
|
751
|
+
/** How heavily each violation class reduces the score (default 1). */
|
|
752
|
+
penaltyWeights?: Partial<Record<SlopCategory, number>>;
|
|
753
|
+
}
|
|
754
|
+
type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
|
|
755
|
+
/** Create a reusable Judge function from an anti-slop config. */
|
|
756
|
+
declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
|
|
757
|
+
interface AntiSlopIssue {
|
|
758
|
+
category: SlopCategory;
|
|
759
|
+
detail: string;
|
|
760
|
+
example?: string;
|
|
761
|
+
}
|
|
762
|
+
interface AntiSlopReport {
|
|
763
|
+
/** 0–10 score; 10 is clean, lower values mean more slop. */
|
|
764
|
+
score: number;
|
|
765
|
+
issues: AntiSlopIssue[];
|
|
766
|
+
/** Count of each category for programmatic aggregation. */
|
|
767
|
+
counts: Record<SlopCategory, number>;
|
|
768
|
+
}
|
|
769
|
+
/**
|
|
770
|
+
* Pure function — analyze one or more outputs against the config. Exposed
|
|
771
|
+
* separately so consumers can build their own reporters on top.
|
|
772
|
+
*/
|
|
773
|
+
declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
|
|
774
|
+
penaltyWeights: Record<SlopCategory, number>;
|
|
775
|
+
}): AntiSlopReport;
|
|
776
|
+
|
|
777
|
+
/**
|
|
778
|
+
* Artifact validators.
|
|
779
|
+
*
|
|
780
|
+
* Generic "score a produced artifact" primitive. Tax uses it for PDF form
|
|
781
|
+
* correctness, legal for contract clauses, film for script breakdowns, GTM
|
|
782
|
+
* for social posts. One interface, many validators; all plug into
|
|
783
|
+
* `BenchmarkRunner` the same way.
|
|
784
|
+
*
|
|
785
|
+
* A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
|
|
786
|
+
* plus a `ValidationContext` (scenario id, the turns that produced it) and
|
|
787
|
+
* returns a `ValidationResult` with pass/fail + 0..1 score + structured
|
|
788
|
+
* issues.
|
|
789
|
+
*/
|
|
790
|
+
interface Artifact {
|
|
791
|
+
/** Logical kind — validators type-guard on this */
|
|
792
|
+
kind: 'file' | 'json' | 'text' | 'binary' | string;
|
|
793
|
+
/** Filesystem-style path, optional */
|
|
794
|
+
path?: string;
|
|
795
|
+
/** String content for text/json/file kinds */
|
|
796
|
+
content?: string;
|
|
797
|
+
/** Binary content (if kind === 'binary') */
|
|
798
|
+
bytes?: Uint8Array;
|
|
799
|
+
/** Caller-supplied metadata (mimeType, sha256, size, etc.) */
|
|
800
|
+
metadata?: Record<string, unknown>;
|
|
801
|
+
}
|
|
802
|
+
interface ValidationContext {
|
|
803
|
+
scenarioId: string;
|
|
804
|
+
turnIndex?: number;
|
|
805
|
+
/** Prior artifacts for multi-artifact scenarios */
|
|
806
|
+
priorArtifacts?: Artifact[];
|
|
807
|
+
/** Free-form hints the validator uses for domain-specific checks */
|
|
808
|
+
hints?: Record<string, unknown>;
|
|
809
|
+
}
|
|
810
|
+
interface ValidationIssue {
|
|
811
|
+
severity: 'error' | 'warning' | 'info';
|
|
812
|
+
message: string;
|
|
813
|
+
/** Optional path into the artifact (e.g. JSON path or byte offset) */
|
|
814
|
+
locus?: string;
|
|
815
|
+
}
|
|
816
|
+
interface ValidationResult {
|
|
817
|
+
pass: boolean;
|
|
818
|
+
/** 0–1 normalized score. Validators should be monotonic in pass-ness: a passing artifact should never score below a failing one. */
|
|
819
|
+
score: number;
|
|
820
|
+
issues: ValidationIssue[];
|
|
821
|
+
/** Diagnostic payload for reporters */
|
|
822
|
+
evidence?: Record<string, unknown>;
|
|
823
|
+
}
|
|
824
|
+
interface ArtifactValidator {
|
|
825
|
+
/** Stable identifier for the validator; appears in reports. */
|
|
826
|
+
name: string;
|
|
827
|
+
/** Optional description for human-facing reports. */
|
|
828
|
+
description?: string;
|
|
829
|
+
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
830
|
+
validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
|
|
831
|
+
}
|
|
832
|
+
/**
|
|
833
|
+
* Run every validator on the same artifact; aggregate pass as AND, score as
|
|
834
|
+
* (weighted) mean, issues concatenated. Weights default to 1 each.
|
|
835
|
+
*/
|
|
836
|
+
declare function composeValidators(validators: ArtifactValidator[], options?: {
|
|
837
|
+
name?: string;
|
|
838
|
+
weights?: number[];
|
|
839
|
+
}): ArtifactValidator;
|
|
840
|
+
/** Pass if the artifact body matches a provided regex. */
|
|
841
|
+
declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
|
|
842
|
+
/** Pass if JSON parses and every required key is present. */
|
|
843
|
+
declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
|
|
844
|
+
/** Pass if min ≤ byte length ≤ max. */
|
|
845
|
+
declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
|
|
846
|
+
/** Pass if the artifact contains every required substring (case-insensitive by default). */
|
|
847
|
+
declare function containsAll(name: string, required: string[], options?: {
|
|
848
|
+
caseSensitive?: boolean;
|
|
849
|
+
}): ArtifactValidator;
|
|
850
|
+
|
|
851
|
+
/**
|
|
852
|
+
* Workspace inspector — score the persisted state of an agent after a run.
|
|
853
|
+
*
|
|
854
|
+
* Many evals don't ask "did the response say the right thing" but "did the
|
|
855
|
+
* agent put the right rows in the DB / files in the vault / entities on the
|
|
856
|
+
* canvas". This is the primitive for that.
|
|
857
|
+
*
|
|
858
|
+
* Implementations read from D1, KV, filesystem, or any store — the interface
|
|
859
|
+
* is deliberately small so consumers plug in their own backends.
|
|
860
|
+
*/
|
|
861
|
+
interface WorkspaceSnapshot {
|
|
862
|
+
/** Vault files: logical path → content */
|
|
863
|
+
files: Record<string, string>;
|
|
864
|
+
/** DB rows: table name → array of rows (post-validation) */
|
|
865
|
+
rows: Record<string, Array<Record<string, unknown>>>;
|
|
866
|
+
/** KV entries: key → value (scoped to whatever prefix the inspector chose) */
|
|
867
|
+
kv: Record<string, string>;
|
|
868
|
+
/** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
|
|
869
|
+
blobs?: Record<string, {
|
|
870
|
+
size: number;
|
|
871
|
+
hash?: string;
|
|
872
|
+
mimeType?: string;
|
|
873
|
+
}>;
|
|
874
|
+
}
|
|
875
|
+
interface InspectorContext {
|
|
876
|
+
/** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
|
|
877
|
+
scopeId: string;
|
|
878
|
+
/** Optional scenario id — allows scenario-specific snapshot shaping */
|
|
879
|
+
scenarioId?: string;
|
|
880
|
+
}
|
|
881
|
+
interface WorkspaceInspector {
|
|
882
|
+
name: string;
|
|
883
|
+
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
884
|
+
}
|
|
885
|
+
declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
|
|
886
|
+
readonly name = "in-memory";
|
|
887
|
+
private readonly snapshots;
|
|
888
|
+
set(scopeId: string, snapshot: WorkspaceSnapshot): void;
|
|
889
|
+
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
890
|
+
}
|
|
891
|
+
interface WorkspaceAssertion {
|
|
892
|
+
name: string;
|
|
893
|
+
description?: string;
|
|
894
|
+
check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
|
|
895
|
+
}
|
|
896
|
+
interface WorkspaceAssertionResult {
|
|
897
|
+
pass: boolean;
|
|
898
|
+
/** 0..1 — partial credit for assertions that admit it */
|
|
899
|
+
score: number;
|
|
900
|
+
detail?: string;
|
|
901
|
+
}
|
|
902
|
+
declare function fileExists(path: string): WorkspaceAssertion;
|
|
903
|
+
declare function fileContains(path: string, needle: string): WorkspaceAssertion;
|
|
904
|
+
declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
|
|
905
|
+
declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
|
|
906
|
+
min?: number;
|
|
907
|
+
}): WorkspaceAssertion;
|
|
908
|
+
/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
|
|
909
|
+
declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
|
|
910
|
+
pass: boolean;
|
|
911
|
+
score: number;
|
|
912
|
+
results: Array<{
|
|
913
|
+
assertion: string;
|
|
914
|
+
result: WorkspaceAssertionResult;
|
|
915
|
+
}>;
|
|
916
|
+
};
|
|
917
|
+
|
|
918
|
+
/**
|
|
919
|
+
* Experiment tracker — group runs, diff them, watch scores move over time.
|
|
920
|
+
*
|
|
921
|
+
* Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
|
|
922
|
+
* - A run has a config (prompt hash, model, scenario ids, seed)
|
|
923
|
+
* - Runs belong to experiments (named groups)
|
|
924
|
+
* - The store is pluggable (in-memory for tests, filesystem for local,
|
|
925
|
+
* custom for Langfuse/D1)
|
|
926
|
+
* - Diffs show score deltas, new/dropped scenarios, and config changes
|
|
927
|
+
*
|
|
928
|
+
* The output plugs directly into `BenchmarkReport` — runs archive the full
|
|
929
|
+
* report, diff operates on the summary.
|
|
930
|
+
*/
|
|
931
|
+
|
|
932
|
+
interface RunConfig {
|
|
933
|
+
experimentId: string;
|
|
934
|
+
name?: string;
|
|
935
|
+
model?: string;
|
|
936
|
+
promptHash?: string;
|
|
937
|
+
promptVersion?: string;
|
|
938
|
+
seed?: number;
|
|
939
|
+
metadata?: Record<string, unknown>;
|
|
940
|
+
}
|
|
941
|
+
interface Run {
|
|
942
|
+
id: string;
|
|
943
|
+
experimentId: string;
|
|
944
|
+
name?: string;
|
|
945
|
+
config: RunConfig;
|
|
946
|
+
startedAt: string;
|
|
947
|
+
completedAt?: string;
|
|
948
|
+
status: 'running' | 'completed' | 'failed';
|
|
949
|
+
report?: BenchmarkReport;
|
|
950
|
+
error?: string;
|
|
951
|
+
}
|
|
952
|
+
interface Experiment {
|
|
953
|
+
id: string;
|
|
954
|
+
name: string;
|
|
955
|
+
createdAt: string;
|
|
956
|
+
metadata?: Record<string, unknown>;
|
|
957
|
+
}
|
|
958
|
+
interface ExperimentStore {
|
|
959
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
960
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
961
|
+
listExperiments(): Promise<Experiment[]>;
|
|
962
|
+
saveRun(run: Run): Promise<void>;
|
|
963
|
+
getRun(id: string): Promise<Run | null>;
|
|
964
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
965
|
+
}
|
|
966
|
+
declare class InMemoryExperimentStore implements ExperimentStore {
|
|
967
|
+
private readonly experiments;
|
|
968
|
+
private readonly runs;
|
|
969
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
970
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
971
|
+
listExperiments(): Promise<Experiment[]>;
|
|
972
|
+
saveRun(run: Run): Promise<void>;
|
|
973
|
+
getRun(id: string): Promise<Run | null>;
|
|
974
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
975
|
+
}
|
|
976
|
+
declare class ExperimentTracker {
|
|
977
|
+
private readonly store;
|
|
978
|
+
constructor(store: ExperimentStore);
|
|
979
|
+
startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
|
|
980
|
+
startRun(config: RunConfig): Promise<Run>;
|
|
981
|
+
completeRun(runId: string, report: BenchmarkReport): Promise<void>;
|
|
982
|
+
failRun(runId: string, error: string): Promise<void>;
|
|
983
|
+
/**
|
|
984
|
+
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
985
|
+
* and config changes that may explain the movement.
|
|
986
|
+
*/
|
|
987
|
+
diff(runIdA: string, runIdB: string): Promise<RunDiff>;
|
|
988
|
+
/** Timeline of aggregate scores for an experiment. */
|
|
989
|
+
timeline(experimentId: string): Promise<Array<{
|
|
990
|
+
runId: string;
|
|
991
|
+
startedAt: string;
|
|
992
|
+
overall: number | null;
|
|
993
|
+
}>>;
|
|
994
|
+
}
|
|
995
|
+
interface RunDiff {
|
|
996
|
+
before: {
|
|
997
|
+
runId: string;
|
|
998
|
+
name?: string;
|
|
999
|
+
startedAt: string;
|
|
1000
|
+
};
|
|
1001
|
+
after: {
|
|
1002
|
+
runId: string;
|
|
1003
|
+
name?: string;
|
|
1004
|
+
startedAt: string;
|
|
1005
|
+
};
|
|
1006
|
+
aggregateDelta: number;
|
|
1007
|
+
scenarios: Array<{
|
|
1008
|
+
scenarioId: string;
|
|
1009
|
+
before: number | null;
|
|
1010
|
+
after: number | null;
|
|
1011
|
+
delta: number | null;
|
|
1012
|
+
status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
|
|
1013
|
+
}>;
|
|
1014
|
+
configChanges: Record<string, {
|
|
1015
|
+
before: unknown;
|
|
1016
|
+
after: unknown;
|
|
1017
|
+
}>;
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
/**
|
|
1021
|
+
* Prompt optimizer — A/B test prompt variants with statistical rigor.
|
|
1022
|
+
*
|
|
1023
|
+
* Runs N prompt variants against a fixed scenario set, collects per-scenario
|
|
1024
|
+
* scores via the user-provided `scoreVariant` callback, and returns:
|
|
1025
|
+
* - per-variant mean + bootstrap CI
|
|
1026
|
+
* - pairwise significance (Mann-Whitney, non-parametric — works on any
|
|
1027
|
+
* score distribution, not just normal)
|
|
1028
|
+
* - a winner (highest mean, flagged if the lead is not significant)
|
|
1029
|
+
*
|
|
1030
|
+
* Deliberately generic — the `scoreVariant` callback does whatever domain
|
|
1031
|
+
* work the consumer needs (invoke the agent, judge the output, whatever),
|
|
1032
|
+
* and returns a number per scenario. This lets the optimizer stay small +
|
|
1033
|
+
* testable.
|
|
1034
|
+
*/
|
|
1035
|
+
interface PromptVariant {
|
|
1036
|
+
id: string;
|
|
1037
|
+
prompt: string;
|
|
1038
|
+
metadata?: Record<string, unknown>;
|
|
1039
|
+
}
|
|
1040
|
+
interface OptimizationConfig {
|
|
1041
|
+
variants: PromptVariant[];
|
|
1042
|
+
/** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
|
|
1043
|
+
trialsPerScenario?: number;
|
|
1044
|
+
/** Significance threshold for pairwise comparison (default 0.05). */
|
|
1045
|
+
significanceLevel?: number;
|
|
1046
|
+
/**
|
|
1047
|
+
* The scoring callback. For each (variant, scenarioId, trialIndex), produce
|
|
1048
|
+
* a score in 0..1 (or any numeric range — the optimizer only cares about
|
|
1049
|
+
* monotonicity: a higher score must mean a better result).
|
|
1050
|
+
*/
|
|
1051
|
+
scoreVariant: (args: {
|
|
1052
|
+
variant: PromptVariant;
|
|
1053
|
+
scenarioId: string;
|
|
1054
|
+
trialIndex: number;
|
|
1055
|
+
}) => Promise<number>;
|
|
1056
|
+
/** Scenario ids to run against. */
|
|
1057
|
+
scenarioIds: string[];
|
|
1058
|
+
/** Optional hook — fires after each (variant, scenario) fully scored. */
|
|
1059
|
+
onScenarioComplete?: (info: {
|
|
1060
|
+
variantId: string;
|
|
1061
|
+
scenarioId: string;
|
|
1062
|
+
scores: number[];
|
|
1063
|
+
}) => void;
|
|
1064
|
+
}
|
|
1065
|
+
interface VariantScore {
|
|
1066
|
+
variantId: string;
|
|
1067
|
+
mean: number;
|
|
1068
|
+
ci95: {
|
|
1069
|
+
lower: number;
|
|
1070
|
+
upper: number;
|
|
1071
|
+
};
|
|
1072
|
+
n: number;
|
|
1073
|
+
perScenario: Record<string, {
|
|
1074
|
+
mean: number;
|
|
1075
|
+
n: number;
|
|
1076
|
+
samples: number[];
|
|
1077
|
+
}>;
|
|
1078
|
+
}
|
|
1079
|
+
interface PairwiseComparison {
|
|
1080
|
+
variantA: string;
|
|
1081
|
+
variantB: string;
|
|
1082
|
+
pValue: number;
|
|
1083
|
+
significant: boolean;
|
|
1084
|
+
meanDelta: number;
|
|
1085
|
+
}
|
|
1086
|
+
interface OptimizationResult {
|
|
1087
|
+
winner: {
|
|
1088
|
+
variantId: string;
|
|
1089
|
+
/** True when the winner's lead vs every other variant is statistically significant. */
|
|
1090
|
+
significant: boolean;
|
|
1091
|
+
ciLowerBoundExceedsSecondMean: boolean;
|
|
1092
|
+
};
|
|
1093
|
+
scores: VariantScore[];
|
|
1094
|
+
pairwise: PairwiseComparison[];
|
|
1095
|
+
config: {
|
|
1096
|
+
trialsPerScenario: number;
|
|
1097
|
+
significanceLevel: number;
|
|
1098
|
+
variants: string[];
|
|
1099
|
+
scenarios: string[];
|
|
1100
|
+
};
|
|
1101
|
+
}
|
|
1102
|
+
declare class PromptOptimizer {
|
|
1103
|
+
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
1104
|
+
}
|
|
1105
|
+
|
|
1106
|
+
/**
|
|
1107
|
+
* Dual-agent convergence bench.
|
|
1108
|
+
*
|
|
1109
|
+
* Pattern lifted from tax-agent + legal-agent: two agents take turns until
|
|
1110
|
+
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
1111
|
+
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
1112
|
+
*
|
|
1113
|
+
* Generalized so any two "agents" (gateways, local functions, anything with
|
|
1114
|
+
* `propose` + `critique`) compose in. Returns convergence rounds per
|
|
1115
|
+
* scenario + whether convergence happened.
|
|
1116
|
+
*/
|
|
1117
|
+
interface DualAgentScenario {
|
|
1118
|
+
id: string;
|
|
1119
|
+
initialPrompt: string;
|
|
1120
|
+
/** Optional context the agents can read (e.g. source documents). */
|
|
1121
|
+
context?: Record<string, unknown>;
|
|
1122
|
+
}
|
|
1123
|
+
interface DualAgentRound {
|
|
1124
|
+
roundIndex: number;
|
|
1125
|
+
proposal: string;
|
|
1126
|
+
critique: string;
|
|
1127
|
+
convergenceScore: number;
|
|
1128
|
+
}
|
|
1129
|
+
interface DualAgentScenarioResult {
|
|
1130
|
+
scenarioId: string;
|
|
1131
|
+
converged: boolean;
|
|
1132
|
+
roundsToConverge: number | null;
|
|
1133
|
+
finalProposal: string;
|
|
1134
|
+
history: DualAgentRound[];
|
|
1135
|
+
finalScore: number;
|
|
1136
|
+
}
|
|
1137
|
+
interface DualAgentBenchConfig {
|
|
1138
|
+
scenarios: DualAgentScenario[];
|
|
1139
|
+
maxRounds?: number;
|
|
1140
|
+
/** Convergence threshold in 0..1 (default 0.85). */
|
|
1141
|
+
convergenceThreshold?: number;
|
|
1142
|
+
/**
|
|
1143
|
+
* Propose an answer given the scenario + the critic's prior critique (if any).
|
|
1144
|
+
* Returns the proposal string.
|
|
1145
|
+
*/
|
|
1146
|
+
propose: (args: {
|
|
1147
|
+
scenario: DualAgentScenario;
|
|
1148
|
+
roundIndex: number;
|
|
1149
|
+
priorProposal?: string;
|
|
1150
|
+
priorCritique?: string;
|
|
1151
|
+
}) => Promise<string>;
|
|
1152
|
+
/**
|
|
1153
|
+
* Critique the proposer's current output. Returns a structured critique
|
|
1154
|
+
* (free text) plus a convergence score: how close the proposal is to
|
|
1155
|
+
* acceptable. 1.0 = accept, 0.0 = totally off.
|
|
1156
|
+
*/
|
|
1157
|
+
critique: (args: {
|
|
1158
|
+
scenario: DualAgentScenario;
|
|
1159
|
+
roundIndex: number;
|
|
1160
|
+
proposal: string;
|
|
1161
|
+
}) => Promise<{
|
|
1162
|
+
critique: string;
|
|
1163
|
+
convergenceScore: number;
|
|
1164
|
+
}>;
|
|
1165
|
+
/** Optional per-round hook for progress + tracing. */
|
|
1166
|
+
onRoundComplete?: (info: {
|
|
1167
|
+
scenarioId: string;
|
|
1168
|
+
round: DualAgentRound;
|
|
1169
|
+
}) => void;
|
|
1170
|
+
}
|
|
1171
|
+
interface DualAgentReport {
|
|
1172
|
+
scenarios: DualAgentScenarioResult[];
|
|
1173
|
+
aggregate: {
|
|
1174
|
+
convergenceRate: number;
|
|
1175
|
+
avgRoundsToConverge: number | null;
|
|
1176
|
+
avgFinalScore: number;
|
|
1177
|
+
};
|
|
1178
|
+
config: {
|
|
1179
|
+
maxRounds: number;
|
|
1180
|
+
convergenceThreshold: number;
|
|
1181
|
+
};
|
|
1182
|
+
}
|
|
1183
|
+
declare class DualAgentBench {
|
|
1184
|
+
run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
|
|
1185
|
+
}
|
|
1186
|
+
|
|
1187
|
+
export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type ArtifactResult, type ArtifactValidator, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type ExecutorConfig, type Experiment, type ExperimentStore, ExperimentTracker, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InspectorContext, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, type LlmTrace, MODEL_PRICING, MemoryTraceStore, MetricsCollector, type OptimizationConfig, type OptimizationResult, type PairwiseComparison, type PersonaConfig, ProductClient, type ProductClientConfig, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type RouteMap, type RubricDimension, type Run, type RunConfig, type RunDiff, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SlopCategory, type TestResult, TokenCounter, type TraceQuery, type TraceStore, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, analyzeAntiSlop, byteLengthRange, codeExecutionJudge, cohensD, coherenceJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, fileContains, fileExists, formatBenchmarkReport, formatDriverReport, hashContent, interRaterReliability, jsonHasKeys, 
mannWhitneyU, normalizeScores, pairedTTest, partialCredit, printDriverSummary, regexMatch, rowCount, rowWhere, runAssertions, runE2EWorkflow, weightedMean, wilcoxonSignedRank };
|