@tangle-network/agent-eval 0.17.0 → 0.17.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -0
- package/dist/index.d.ts +1453 -1088
- package/dist/index.js +1477 -231
- package/dist/index.js.map +1 -1
- package/package.json +12 -10
package/dist/index.d.ts
CHANGED
|
@@ -513,1052 +513,1526 @@ declare function formatDriverReport(results: DriverResult[]): string;
|
|
|
513
513
|
declare function printDriverSummary(results: DriverResult[]): void;
|
|
514
514
|
|
|
515
515
|
/**
|
|
516
|
-
*
|
|
517
|
-
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
518
|
-
* already use inverted scoring in the prompt (10 = no hallucination),
|
|
519
|
-
* but this function ensures consistency if raw scores leak through.
|
|
520
|
-
*/
|
|
521
|
-
declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
|
|
522
|
-
/** Weighted mean — falls back to uniform weights when omitted */
|
|
523
|
-
declare function weightedMean(scores: {
|
|
524
|
-
score: number;
|
|
525
|
-
weight?: number;
|
|
526
|
-
}[]): number;
|
|
527
|
-
/** Bootstrap confidence interval */
|
|
528
|
-
declare function confidenceInterval(scores: number[], confidence?: number): {
|
|
529
|
-
mean: number;
|
|
530
|
-
lower: number;
|
|
531
|
-
upper: number;
|
|
532
|
-
};
|
|
533
|
-
/**
|
|
534
|
-
* Inter-rater reliability — simplified Krippendorff's alpha.
|
|
516
|
+
* TraceSchema v1 — the canonical data model for agent-eval.
|
|
535
517
|
*
|
|
536
|
-
*
|
|
537
|
-
*
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
*
|
|
542
|
-
*
|
|
543
|
-
*/
|
|
544
|
-
declare function mannWhitneyU(a: number[], b: number[]): {
|
|
545
|
-
u: number;
|
|
546
|
-
p: number;
|
|
547
|
-
};
|
|
548
|
-
/** Partial credit: returns 0-1 ratio of current toward target */
|
|
549
|
-
declare function partialCredit(current: number, target: number): number;
|
|
550
|
-
/**
|
|
551
|
-
* Paired t-test — before/after measurements on the SAME items.
|
|
552
|
-
* Pairing removes inter-item variance, giving tighter significance than
|
|
553
|
-
* an unpaired test when comparing prompt v1 vs prompt v2 on identical
|
|
554
|
-
* scenarios.
|
|
555
|
-
*/
|
|
556
|
-
declare function pairedTTest(before: number[], after: number[]): {
|
|
557
|
-
t: number;
|
|
558
|
-
df: number;
|
|
559
|
-
p: number;
|
|
560
|
-
};
|
|
561
|
-
/**
|
|
562
|
-
* Wilcoxon signed-rank test — paired non-parametric alternative.
|
|
563
|
-
* Use when the differences aren't normally distributed.
|
|
518
|
+
* Every score, every failure class, every pipeline in the framework is
|
|
519
|
+
* a view over this data. Shape it once, live with it.
|
|
520
|
+
*
|
|
521
|
+
* Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
|
|
522
|
+
* but extended with agent-specific span kinds (llm, tool, retrieval,
|
|
523
|
+
* judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
|
|
524
|
+
* entities that OTEL leaves as free-form attributes.
|
|
564
525
|
*/
|
|
565
|
-
declare
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
526
|
+
declare const TRACE_SCHEMA_VERSION = "1.0.0";
|
|
527
|
+
type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
|
|
528
|
+
interface BudgetSpec {
|
|
529
|
+
tokens?: number;
|
|
530
|
+
wallMs?: number;
|
|
531
|
+
calls?: number;
|
|
532
|
+
usd?: number;
|
|
533
|
+
}
|
|
534
|
+
interface RunOutcome$1 {
|
|
535
|
+
score?: number;
|
|
536
|
+
pass?: boolean;
|
|
537
|
+
failureClass?: FailureClass;
|
|
538
|
+
notes?: string;
|
|
539
|
+
}
|
|
569
540
|
/**
|
|
570
|
-
*
|
|
571
|
-
*
|
|
572
|
-
*
|
|
541
|
+
* Layer — optional classification in a nested build workflow.
|
|
542
|
+
* `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
|
|
543
|
+
* `app-build`: sandbox harness that compiled + tested the generated scaffold.
|
|
544
|
+
* `app-runtime`: a run of the generated agent against a domain scenario.
|
|
545
|
+
* `meta`: any meta-eval (judge replay, correlation analysis).
|
|
573
546
|
*/
|
|
574
|
-
|
|
547
|
+
type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
|
|
548
|
+
interface Run$1 {
|
|
549
|
+
runId: string;
|
|
550
|
+
scenarioId: string;
|
|
551
|
+
variantId?: string;
|
|
552
|
+
datasetVersion?: string;
|
|
553
|
+
/** Git SHA of agent code at run time. */
|
|
554
|
+
codeSha?: string;
|
|
555
|
+
/** Hash of the prompt template + any system prompt. */
|
|
556
|
+
promptSha?: string;
|
|
557
|
+
/** Model id + date + system-prompt hash, concatenated. */
|
|
558
|
+
modelFingerprint?: string;
|
|
559
|
+
seed?: number;
|
|
560
|
+
/** Arbitrary environment markers (shell, docker version, tz). */
|
|
561
|
+
envFingerprint?: Record<string, string>;
|
|
562
|
+
/** Version of the redaction rules applied to this run. */
|
|
563
|
+
redactionVersion?: string;
|
|
564
|
+
/** Parent run in a nested build workflow. A builder run's children are
|
|
565
|
+
* app-build runs; those children are app-runtime runs. */
|
|
566
|
+
parentRunId?: string;
|
|
567
|
+
/** Stable project identifier — groups runs across chats + sessions. */
|
|
568
|
+
projectId?: string;
|
|
569
|
+
/** Chat/conversation identifier within a project. */
|
|
570
|
+
chatId?: string;
|
|
571
|
+
/** Layer classification — hint for aggregation; not enforced. */
|
|
572
|
+
layer?: RunLayer;
|
|
573
|
+
startedAt: number;
|
|
574
|
+
endedAt?: number;
|
|
575
|
+
status: RunStatus;
|
|
576
|
+
outcome?: RunOutcome$1;
|
|
577
|
+
budget?: BudgetSpec;
|
|
578
|
+
/** Free-form labels for downstream grouping. */
|
|
579
|
+
tags?: Record<string, string>;
|
|
580
|
+
}
|
|
581
|
+
type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
|
|
582
|
+
type SpanStatus = 'ok' | 'error';
|
|
583
|
+
interface SpanBase {
|
|
584
|
+
spanId: string;
|
|
585
|
+
parentSpanId?: string;
|
|
586
|
+
runId: string;
|
|
587
|
+
kind: SpanKind;
|
|
588
|
+
name: string;
|
|
589
|
+
startedAt: number;
|
|
590
|
+
endedAt?: number;
|
|
591
|
+
status?: SpanStatus;
|
|
592
|
+
error?: string;
|
|
593
|
+
/** Anything not covered by typed fields. Kept deliberately free-form. */
|
|
594
|
+
attributes?: Record<string, unknown>;
|
|
595
|
+
}
|
|
596
|
+
interface Message {
|
|
597
|
+
role: 'system' | 'user' | 'assistant' | 'tool';
|
|
598
|
+
content: string;
|
|
599
|
+
tokens?: number;
|
|
600
|
+
/** Multi-modal content descriptors; blobs themselves live in Artifacts. */
|
|
601
|
+
images?: Array<{
|
|
602
|
+
artifactId?: string;
|
|
603
|
+
url?: string;
|
|
604
|
+
mime?: string;
|
|
605
|
+
}>;
|
|
606
|
+
}
|
|
607
|
+
interface LlmSpan extends SpanBase {
|
|
608
|
+
kind: 'llm';
|
|
609
|
+
model: string;
|
|
610
|
+
messages: Message[];
|
|
611
|
+
output?: string;
|
|
612
|
+
inputTokens?: number;
|
|
613
|
+
outputTokens?: number;
|
|
614
|
+
cachedTokens?: number;
|
|
615
|
+
reasoningTokens?: number;
|
|
616
|
+
costUsd?: number;
|
|
617
|
+
finishReason?: string;
|
|
618
|
+
}
|
|
619
|
+
interface ToolSpan extends SpanBase {
|
|
620
|
+
kind: 'tool';
|
|
621
|
+
toolName: string;
|
|
622
|
+
args: unknown;
|
|
623
|
+
result?: unknown;
|
|
624
|
+
latencyMs?: number;
|
|
625
|
+
}
|
|
626
|
+
interface RetrievalSpan extends SpanBase {
|
|
627
|
+
kind: 'retrieval';
|
|
628
|
+
query: string;
|
|
629
|
+
hits: Array<{
|
|
630
|
+
docId: string;
|
|
631
|
+
score: number;
|
|
632
|
+
content?: string;
|
|
633
|
+
}>;
|
|
634
|
+
}
|
|
635
|
+
interface JudgeSpan extends SpanBase {
|
|
636
|
+
kind: 'judge';
|
|
637
|
+
judgeId: string;
|
|
638
|
+
/** Span this judgment applies to. */
|
|
639
|
+
targetSpanId: string;
|
|
640
|
+
dimension: string;
|
|
641
|
+
/** Numeric score (free-range; interpretation up to the judge). */
|
|
642
|
+
score: number;
|
|
643
|
+
rationale?: string;
|
|
644
|
+
evidence?: string;
|
|
645
|
+
}
|
|
646
|
+
interface SandboxSpan extends SpanBase {
|
|
647
|
+
kind: 'sandbox';
|
|
648
|
+
image?: string;
|
|
649
|
+
command?: string;
|
|
650
|
+
exitCode?: number;
|
|
651
|
+
testsTotal?: number;
|
|
652
|
+
testsPassed?: number;
|
|
653
|
+
stdoutHash?: string;
|
|
654
|
+
stderrHash?: string;
|
|
655
|
+
/** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
|
|
656
|
+
wallMs?: number;
|
|
657
|
+
}
|
|
658
|
+
interface GenericSpan extends SpanBase {
|
|
659
|
+
kind: 'agent' | 'custom';
|
|
660
|
+
}
|
|
661
|
+
type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
|
|
662
|
+
type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
|
|
663
|
+
interface TraceEvent {
|
|
664
|
+
eventId: string;
|
|
665
|
+
runId: string;
|
|
666
|
+
spanId?: string;
|
|
667
|
+
kind: EventKind;
|
|
668
|
+
timestamp: number;
|
|
669
|
+
payload: Record<string, unknown>;
|
|
670
|
+
}
|
|
671
|
+
interface BudgetLedgerEntry {
|
|
672
|
+
runId: string;
|
|
673
|
+
dimension: keyof BudgetSpec;
|
|
674
|
+
limit: number;
|
|
675
|
+
consumed: number;
|
|
676
|
+
remaining: number;
|
|
677
|
+
timestamp: number;
|
|
678
|
+
breached: boolean;
|
|
679
|
+
/** Span that triggered this entry, if any. */
|
|
680
|
+
spanId?: string;
|
|
681
|
+
}
|
|
682
|
+
interface Artifact$1 {
|
|
683
|
+
artifactId: string;
|
|
684
|
+
runId: string;
|
|
685
|
+
spanId?: string;
|
|
686
|
+
contentType: string;
|
|
687
|
+
sizeBytes: number;
|
|
688
|
+
/** sha256 in hex. */
|
|
689
|
+
hash: string;
|
|
690
|
+
/** External storage URL (R2, S3, filesystem path). */
|
|
691
|
+
storageUrl?: string;
|
|
692
|
+
/** Inline content for small blobs — keep under ~64KB. */
|
|
693
|
+
inlineContent?: string;
|
|
694
|
+
}
|
|
695
|
+
type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
|
|
696
|
+
declare const FAILURE_CLASSES: readonly FailureClass[];
|
|
697
|
+
declare function isLlmSpan(s: Span): s is LlmSpan;
|
|
698
|
+
declare function isToolSpan(s: Span): s is ToolSpan;
|
|
699
|
+
declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
|
|
700
|
+
declare function isJudgeSpan(s: Span): s is JudgeSpan;
|
|
701
|
+
declare function isSandboxSpan(s: Span): s is SandboxSpan;
|
|
575
702
|
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
constructor(criteria: CompletionCriterion[]);
|
|
586
|
-
/** Evaluate criteria against current state, record result */
|
|
587
|
-
record(turn: number, state: DriverState): {
|
|
588
|
-
completionPercent: number;
|
|
589
|
-
complete: boolean;
|
|
590
|
-
criteriaStatus: Record<string, boolean | number>;
|
|
703
|
+
interface RunFilter {
|
|
704
|
+
scenarioId?: string;
|
|
705
|
+
variantId?: string;
|
|
706
|
+
status?: RunStatus;
|
|
707
|
+
since?: number;
|
|
708
|
+
until?: number;
|
|
709
|
+
tag?: {
|
|
710
|
+
key: string;
|
|
711
|
+
value: string;
|
|
591
712
|
};
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
713
|
+
parentRunId?: string;
|
|
714
|
+
projectId?: string;
|
|
715
|
+
chatId?: string;
|
|
716
|
+
layer?: RunLayer;
|
|
717
|
+
}
|
|
718
|
+
interface SpanFilter {
|
|
719
|
+
runId?: string;
|
|
720
|
+
parentSpanId?: string;
|
|
721
|
+
kind?: SpanKind;
|
|
722
|
+
name?: string;
|
|
723
|
+
toolName?: string;
|
|
724
|
+
judgeId?: string;
|
|
725
|
+
since?: number;
|
|
726
|
+
until?: number;
|
|
727
|
+
}
|
|
728
|
+
interface EventFilter {
|
|
729
|
+
runId?: string;
|
|
730
|
+
spanId?: string;
|
|
731
|
+
kind?: EventKind;
|
|
732
|
+
since?: number;
|
|
733
|
+
until?: number;
|
|
734
|
+
}
|
|
735
|
+
interface TraceStore {
|
|
736
|
+
appendRun(run: Run$1): Promise<void>;
|
|
737
|
+
updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
|
|
738
|
+
appendSpan(span: Span): Promise<void>;
|
|
739
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
740
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
741
|
+
appendArtifact(artifact: Artifact$1): Promise<void>;
|
|
742
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
743
|
+
getRun(runId: string): Promise<Run$1 | undefined>;
|
|
744
|
+
listRuns(filter?: RunFilter): Promise<Run$1[]>;
|
|
745
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
746
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
747
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
748
|
+
artifacts(runId: string): Promise<Artifact$1[]>;
|
|
749
|
+
}
|
|
750
|
+
declare class InMemoryTraceStore implements TraceStore {
|
|
751
|
+
private runs;
|
|
752
|
+
private allSpans;
|
|
753
|
+
private allEvents;
|
|
754
|
+
private allArtifacts;
|
|
755
|
+
private allBudget;
|
|
756
|
+
appendRun(run: Run$1): Promise<void>;
|
|
757
|
+
updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
|
|
758
|
+
appendSpan(span: Span): Promise<void>;
|
|
759
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
760
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
761
|
+
appendArtifact(artifact: Artifact$1): Promise<void>;
|
|
762
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
763
|
+
getRun(runId: string): Promise<Run$1 | undefined>;
|
|
764
|
+
listRuns(filter?: RunFilter): Promise<Run$1[]>;
|
|
765
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
766
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
767
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
768
|
+
artifacts(runId: string): Promise<Artifact$1[]>;
|
|
769
|
+
}
|
|
770
|
+
interface FileSystemTraceStoreOptions {
|
|
771
|
+
dir: string;
|
|
772
|
+
/** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
|
|
773
|
+
maxBytes?: number;
|
|
774
|
+
}
|
|
775
|
+
declare class FileSystemTraceStore implements TraceStore {
|
|
776
|
+
private dir;
|
|
777
|
+
private maxBytes;
|
|
778
|
+
/** Lazy in-memory index for queries — populated on first read. */
|
|
779
|
+
private index?;
|
|
780
|
+
private loaded;
|
|
781
|
+
constructor(options: FileSystemTraceStoreOptions);
|
|
782
|
+
private ensureDir;
|
|
783
|
+
private append;
|
|
784
|
+
private insertInto;
|
|
785
|
+
private load;
|
|
786
|
+
appendRun(run: Run$1): Promise<void>;
|
|
787
|
+
updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
|
|
788
|
+
appendSpan(span: Span): Promise<void>;
|
|
789
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
790
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
791
|
+
appendArtifact(artifact: Artifact$1): Promise<void>;
|
|
792
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
793
|
+
getRun(runId: string): Promise<Run$1 | undefined>;
|
|
794
|
+
listRuns(filter?: RunFilter): Promise<Run$1[]>;
|
|
795
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
796
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
797
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
798
|
+
artifacts(runId: string): Promise<Artifact$1[]>;
|
|
602
799
|
}
|
|
603
800
|
|
|
604
801
|
/**
|
|
605
|
-
*
|
|
606
|
-
*
|
|
607
|
-
* Every prompt used in an eval run is registered with an explicit version.
|
|
608
|
-
* Reports include the content hash so A/B compares are rigorous: if the
|
|
609
|
-
* hash changes between two reports, the prompt actually changed; if it
|
|
610
|
-
* matches, the variance is elsewhere.
|
|
802
|
+
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
803
|
+
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
611
804
|
*
|
|
612
|
-
*
|
|
613
|
-
*
|
|
805
|
+
* Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
|
|
806
|
+
* return a `SpanHandle` with `.end()` / `.fail()` so callers don't
|
|
807
|
+
* have to thread spanIds manually. For async workflows that can't use
|
|
808
|
+
* the stack (e.g. fan-out parallel calls), pass `parentSpanId`
|
|
809
|
+
* explicitly.
|
|
614
810
|
*/
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
/** SHA-256 of content, 12-hex-char prefix */
|
|
621
|
-
hash: string;
|
|
622
|
-
/** Full prompt body */
|
|
623
|
-
content: string;
|
|
811
|
+
|
|
812
|
+
interface SpanHandle<S extends Span = Span> {
|
|
813
|
+
span: S;
|
|
814
|
+
end(patch?: Partial<S>): Promise<void>;
|
|
815
|
+
fail(error: string | Error, patch?: Partial<S>): Promise<void>;
|
|
624
816
|
}
|
|
625
|
-
|
|
626
|
-
|
|
817
|
+
interface TraceEmitterOptions {
|
|
818
|
+
runId?: string;
|
|
819
|
+
/** Inject a clock for deterministic tests. */
|
|
820
|
+
now?: () => number;
|
|
821
|
+
/** Inject an id generator for deterministic tests. */
|
|
822
|
+
id?: () => string;
|
|
823
|
+
}
|
|
824
|
+
declare class TraceEmitter {
|
|
825
|
+
private store;
|
|
826
|
+
private stack;
|
|
827
|
+
private _runId;
|
|
828
|
+
private now;
|
|
829
|
+
private id;
|
|
830
|
+
constructor(store: TraceStore, options?: TraceEmitterOptions);
|
|
831
|
+
get runId(): string;
|
|
832
|
+
startRun(run: Omit<Run$1, 'runId' | 'startedAt' | 'status'>): Promise<Run$1>;
|
|
833
|
+
endRun(outcome?: RunOutcome$1): Promise<void>;
|
|
834
|
+
abortRun(reason: string): Promise<void>;
|
|
835
|
+
span<S extends Span = Span>(init: {
|
|
836
|
+
kind: SpanKind;
|
|
837
|
+
name: string;
|
|
838
|
+
parentSpanId?: string;
|
|
839
|
+
attributes?: Record<string, unknown>;
|
|
840
|
+
} & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
|
|
841
|
+
private handle;
|
|
842
|
+
private pop;
|
|
843
|
+
llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
|
|
844
|
+
tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
|
|
845
|
+
retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
|
|
846
|
+
recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
|
|
847
|
+
sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
|
|
848
|
+
emit(event: {
|
|
849
|
+
kind: EventKind;
|
|
850
|
+
spanId?: string;
|
|
851
|
+
payload?: Record<string, unknown>;
|
|
852
|
+
}): Promise<TraceEvent>;
|
|
853
|
+
recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
|
|
854
|
+
timestamp?: number;
|
|
855
|
+
}): Promise<BudgetLedgerEntry>;
|
|
856
|
+
recordArtifact(artifact: Omit<Artifact$1, 'artifactId' | 'runId'>): Promise<Artifact$1>;
|
|
627
857
|
/**
|
|
628
|
-
*
|
|
629
|
-
*
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
register(id: string, version: string, content: string): Promise<PromptHandle>;
|
|
633
|
-
/** Look up a registered prompt. Throws if unknown — no implicit defaults. */
|
|
634
|
-
get(id: string, version: string): PromptHandle;
|
|
635
|
-
/** Return all versions of an id, newest-first (lex-descending on version). */
|
|
636
|
-
listVersions(id: string): PromptHandle[];
|
|
637
|
-
/** Snapshot the whole registry — useful for including in reports. */
|
|
638
|
-
list(): PromptHandle[];
|
|
639
|
-
/** Verify a hash against registered content. Returns null if not found. */
|
|
640
|
-
verifyHash(id: string, version: string, expectedHash: string): boolean | null;
|
|
858
|
+
* Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
|
|
859
|
+
* Returns the fn's return value. Use this for the 95% case.
|
|
860
|
+
*/
|
|
861
|
+
within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
|
|
641
862
|
}
|
|
642
|
-
/**
|
|
643
|
-
declare function
|
|
863
|
+
/** Helper to build an LLM span handle args object from a provider-shaped response. */
|
|
864
|
+
declare function llmSpanFromProvider(args: {
|
|
865
|
+
name?: string;
|
|
866
|
+
model: string;
|
|
867
|
+
messages: Message[];
|
|
868
|
+
output: string;
|
|
869
|
+
usage?: {
|
|
870
|
+
inputTokens?: number;
|
|
871
|
+
outputTokens?: number;
|
|
872
|
+
cachedTokens?: number;
|
|
873
|
+
reasoningTokens?: number;
|
|
874
|
+
};
|
|
875
|
+
costUsd?: number;
|
|
876
|
+
finishReason?: string;
|
|
877
|
+
}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
644
878
|
|
|
645
879
|
/**
|
|
646
|
-
*
|
|
880
|
+
* Policy-based agent control runtime.
|
|
647
881
|
*
|
|
648
|
-
*
|
|
649
|
-
* 80% of AI slop that every production agent leaks:
|
|
650
|
-
* - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
|
|
651
|
-
* - N-gram repetition (same phrase over and over)
|
|
652
|
-
* - Hedging overuse ("I could be wrong, but...")
|
|
653
|
-
* - Apology padding ("I'm so sorry for the confusion...")
|
|
654
|
-
* - Unused opening formulas ("Great question!")
|
|
655
|
-
* - Length bounds (too short to be useful, too long to be read)
|
|
882
|
+
* This is the minimal reusable loop behind driver-agent patterns:
|
|
656
883
|
*
|
|
657
|
-
*
|
|
658
|
-
*
|
|
884
|
+
* observe state -> validate -> decide next action -> act -> observe -> ...
|
|
885
|
+
*
|
|
886
|
+
* It deliberately does not model named "topologies". Direct execution,
|
|
887
|
+
* critic/revise, driver intervention, specialist calls, and human escalation
|
|
888
|
+
* are all just actions chosen by the control policy.
|
|
659
889
|
*/
|
|
660
890
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
/**
|
|
665
|
-
|
|
666
|
-
/**
|
|
667
|
-
|
|
668
|
-
/**
|
|
669
|
-
|
|
670
|
-
/**
|
|
671
|
-
|
|
672
|
-
/**
|
|
673
|
-
|
|
674
|
-
/**
|
|
675
|
-
|
|
676
|
-
/**
|
|
677
|
-
|
|
678
|
-
/** How heavily each violation class reduces the score (default 1). */
|
|
679
|
-
penaltyWeights?: Partial<Record<SlopCategory, number>>;
|
|
891
|
+
type ControlSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
892
|
+
type ControlActionFailureMode = 'continue' | 'stop';
|
|
893
|
+
interface ControlEvalResult {
|
|
894
|
+
/** Stable validator or judge id. */
|
|
895
|
+
id: string;
|
|
896
|
+
/** Whether this check passed. */
|
|
897
|
+
passed: boolean;
|
|
898
|
+
/** Optional normalized score. 1 = best, 0 = worst. */
|
|
899
|
+
score?: number;
|
|
900
|
+
/** Objective validators should usually be "error" or "critical" when failed. */
|
|
901
|
+
severity?: ControlSeverity;
|
|
902
|
+
/** Human-readable result. */
|
|
903
|
+
detail?: string;
|
|
904
|
+
/** Small evidence string or pointer. Avoid large payloads. */
|
|
905
|
+
evidence?: string;
|
|
906
|
+
/** True when the result came from deterministic state, not LLM judgment. */
|
|
907
|
+
objective?: boolean;
|
|
680
908
|
}
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
category: SlopCategory;
|
|
686
|
-
detail: string;
|
|
687
|
-
example?: string;
|
|
909
|
+
interface ControlBudget {
|
|
910
|
+
maxSteps: number;
|
|
911
|
+
maxWallMs?: number;
|
|
912
|
+
maxCostUsd?: number;
|
|
688
913
|
}
|
|
689
|
-
interface
|
|
690
|
-
/**
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
914
|
+
interface ControlStopPolicies<TState, TAction> {
|
|
915
|
+
/**
|
|
916
|
+
* Stop after N consecutive steps with no state fingerprint change and
|
|
917
|
+
* less than `minScoreDelta` score movement. Disabled when omitted.
|
|
918
|
+
*/
|
|
919
|
+
maxNoProgressSteps?: number;
|
|
920
|
+
/**
|
|
921
|
+
* Stop after the same action fingerprint is selected N consecutive
|
|
922
|
+
* times. Disabled when omitted.
|
|
923
|
+
*/
|
|
924
|
+
maxRepeatedActions?: number;
|
|
925
|
+
/** Minimum score movement that counts as progress. Default 0.001. */
|
|
926
|
+
minScoreDelta?: number;
|
|
927
|
+
/** Override the default JSON/string fingerprint for state comparisons. */
|
|
928
|
+
stateFingerprint?: (state: TState) => string;
|
|
929
|
+
/** Override the default JSON/string fingerprint for repeated-action checks. */
|
|
930
|
+
actionFingerprint?: (action: TAction) => string;
|
|
931
|
+
}
|
|
932
|
+
interface ControlContext<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
933
|
+
intent: string;
|
|
934
|
+
state: TState;
|
|
935
|
+
evals: TEval[];
|
|
936
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
937
|
+
budget: ControlBudget;
|
|
938
|
+
stepIndex: number;
|
|
939
|
+
wallMs: number;
|
|
940
|
+
spentCostUsd: number;
|
|
941
|
+
remainingCostUsd?: number;
|
|
942
|
+
abortSignal: AbortSignal;
|
|
943
|
+
emitter?: TraceEmitter;
|
|
695
944
|
}
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
945
|
+
type ControlDecision<TAction> = {
|
|
946
|
+
type: 'continue';
|
|
947
|
+
action: TAction;
|
|
948
|
+
reason?: string;
|
|
949
|
+
} | {
|
|
950
|
+
type: 'stop';
|
|
951
|
+
reason: string;
|
|
952
|
+
pass?: boolean;
|
|
953
|
+
score?: number;
|
|
954
|
+
};
|
|
955
|
+
interface StopDecision {
|
|
956
|
+
stop: boolean;
|
|
957
|
+
pass: boolean;
|
|
958
|
+
reason: string;
|
|
959
|
+
score?: number;
|
|
960
|
+
failureClass?: FailureClass;
|
|
961
|
+
}
|
|
962
|
+
interface ControlActionOutcome<TActionResult> {
|
|
963
|
+
ok: boolean;
|
|
964
|
+
result?: TActionResult;
|
|
965
|
+
error?: string;
|
|
966
|
+
costUsd?: number;
|
|
967
|
+
durationMs: number;
|
|
968
|
+
}
|
|
969
|
+
interface ControlRuntimeError {
|
|
970
|
+
phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace';
|
|
971
|
+
stepIndex: number;
|
|
972
|
+
message: string;
|
|
973
|
+
}
|
|
974
|
+
interface ControlStep<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
975
|
+
index: number;
|
|
976
|
+
decision: ControlDecision<TAction>;
|
|
977
|
+
beforeState: TState;
|
|
978
|
+
afterState: TState;
|
|
979
|
+
evalsBefore: TEval[];
|
|
980
|
+
evalsAfter: TEval[];
|
|
981
|
+
actionOutcome?: ControlActionOutcome<TActionResult>;
|
|
982
|
+
startedAt: string;
|
|
983
|
+
endedAt: string;
|
|
984
|
+
}
|
|
985
|
+
interface ControlRunResult<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
986
|
+
intent: string;
|
|
987
|
+
pass: boolean;
|
|
988
|
+
completed: boolean;
|
|
989
|
+
reason: string;
|
|
990
|
+
score?: number;
|
|
991
|
+
steps: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
992
|
+
finalState: TState | undefined;
|
|
993
|
+
finalEvals: TEval[];
|
|
994
|
+
wallMs: number;
|
|
995
|
+
spentCostUsd: number;
|
|
996
|
+
runId: string | null;
|
|
997
|
+
failureClass?: FailureClass;
|
|
998
|
+
runtimeErrors: ControlRuntimeError[];
|
|
999
|
+
stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error';
|
|
1000
|
+
}
|
|
1001
|
+
interface ControlRuntimeConfig<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
1002
|
+
intent: string;
|
|
1003
|
+
budget?: Partial<ControlBudget>;
|
|
1004
|
+
signal?: AbortSignal;
|
|
1005
|
+
/** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */
|
|
1006
|
+
actionFailure?: ControlActionFailureMode;
|
|
1007
|
+
/**
|
|
1008
|
+
* Extract cost from an action result. Used for `maxCostUsd` budget
|
|
1009
|
+
* enforcement and trace budget ledger emission.
|
|
1010
|
+
*/
|
|
1011
|
+
getActionCostUsd?: (ctx: {
|
|
1012
|
+
action: TAction;
|
|
1013
|
+
result: TActionResult;
|
|
1014
|
+
state: TState;
|
|
1015
|
+
evals: TEval[];
|
|
1016
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
1017
|
+
}) => number | undefined;
|
|
1018
|
+
/** Read typed task/product state. Prefer structured state over transcript-only context. */
|
|
1019
|
+
observe: (ctx: {
|
|
1020
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
1021
|
+
abortSignal: AbortSignal;
|
|
1022
|
+
}) => Promise<TState> | TState;
|
|
1023
|
+
/** Objective validators first, subjective judges only where objective state is insufficient. */
|
|
1024
|
+
validate: (ctx: {
|
|
1025
|
+
intent: string;
|
|
1026
|
+
state: TState;
|
|
1027
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
1028
|
+
abortSignal: AbortSignal;
|
|
1029
|
+
}) => Promise<TEval[]> | TEval[];
|
|
1030
|
+
/** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */
|
|
1031
|
+
decide: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>;
|
|
1032
|
+
/** Execute the action selected by the policy. */
|
|
1033
|
+
act: (action: TAction, ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<TActionResult> | TActionResult;
|
|
1034
|
+
/** Final stopping policy. Called before decide and after each action. */
|
|
1035
|
+
shouldStop?: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<StopDecision> | StopDecision;
|
|
1036
|
+
/** Optional hook for tracing or live progress updates. */
|
|
1037
|
+
onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void;
|
|
1038
|
+
/** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */
|
|
1039
|
+
stopPolicies?: ControlStopPolicies<TState, TAction>;
|
|
1040
|
+
/** Optional trace sink. Emits one run plus one span per control step. */
|
|
1041
|
+
store?: TraceStore;
|
|
1042
|
+
scenarioId?: string;
|
|
1043
|
+
projectId?: string;
|
|
1044
|
+
variantId?: string;
|
|
1045
|
+
}
|
|
1046
|
+
declare function runAgentControlLoop<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>>;
|
|
1047
|
+
declare function stopOnNoProgress<TState, TAction>(maxNoProgressSteps: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'>): ControlStopPolicies<TState, TAction>;
|
|
1048
|
+
declare function stopOnRepeatedAction<TState, TAction>(maxRepeatedActions: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'>): ControlStopPolicies<TState, TAction>;
|
|
1049
|
+
declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
|
|
1050
|
+
declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
|
|
1051
|
+
declare function allCriticalPassed(evals: ControlEvalResult[]): boolean;
|
|
703
1052
|
|
|
704
1053
|
/**
|
|
705
|
-
*
|
|
1054
|
+
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
706
1055
|
*
|
|
707
|
-
*
|
|
708
|
-
*
|
|
709
|
-
*
|
|
710
|
-
*
|
|
1056
|
+
* Scenarios stop being ephemeral arrays and become first-class
|
|
1057
|
+
* artifacts. Every Dataset carries:
|
|
1058
|
+
* - content hash (sha256 over canonicalized scenario array)
|
|
1059
|
+
* - provenance (contributor, createdAt, sourceUrl)
|
|
1060
|
+
* - split labels (train | dev | test | holdout)
|
|
1061
|
+
* - difficulty tiers (easy | medium | hard | extreme)
|
|
1062
|
+
* - tags (free-form, per-scenario)
|
|
711
1063
|
*
|
|
712
|
-
*
|
|
713
|
-
*
|
|
714
|
-
*
|
|
715
|
-
*
|
|
1064
|
+
* `Dataset.slice({ difficulty, split, holdout, seed })` returns a
|
|
1065
|
+
* deterministic, reproducible subset. Holdout slices are locked: you
|
|
1066
|
+
* can read them but `mutate` throws, which prevents "oh I'll just
|
|
1067
|
+
* tweak that one scenario" contamination drift.
|
|
716
1068
|
*/
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
1069
|
+
type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
|
|
1070
|
+
type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
|
|
1071
|
+
interface DatasetScenario {
|
|
1072
|
+
id: string;
|
|
1073
|
+
/** Arbitrary payload; the framework doesn't interpret it. */
|
|
1074
|
+
payload: unknown;
|
|
1075
|
+
split?: DatasetSplit;
|
|
1076
|
+
difficulty?: DatasetDifficulty;
|
|
1077
|
+
/** Canary token that MUST NOT round-trip through a correct agent output. */
|
|
1078
|
+
canary?: string;
|
|
1079
|
+
tags?: Record<string, string>;
|
|
728
1080
|
}
|
|
729
|
-
interface
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
1081
|
+
interface DatasetProvenance {
|
|
1082
|
+
contributor?: string;
|
|
1083
|
+
createdAt: string;
|
|
1084
|
+
sourceUrl?: string;
|
|
1085
|
+
license?: string;
|
|
1086
|
+
description?: string;
|
|
1087
|
+
/** Monotonic human-readable version (e.g. "2026.04.20"). */
|
|
1088
|
+
version: string;
|
|
736
1089
|
}
|
|
737
|
-
interface
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
/**
|
|
741
|
-
|
|
1090
|
+
interface DatasetManifest {
|
|
1091
|
+
name: string;
|
|
1092
|
+
provenance: DatasetProvenance;
|
|
1093
|
+
/** sha256 hex over canonicalized scenarios. */
|
|
1094
|
+
contentHash: string;
|
|
1095
|
+
scenarioCount: number;
|
|
1096
|
+
splitCounts: Record<DatasetSplit, number>;
|
|
742
1097
|
}
|
|
743
|
-
interface
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
1098
|
+
interface SliceOptions {
|
|
1099
|
+
split?: DatasetSplit;
|
|
1100
|
+
difficulty?: DatasetDifficulty;
|
|
1101
|
+
/** Number of scenarios (random sample, seeded). Omit to take all that match. */
|
|
1102
|
+
limit?: number;
|
|
1103
|
+
seed?: number;
|
|
1104
|
+
/** Predicate narrowing. Applied after split/difficulty filters. */
|
|
1105
|
+
filter?: (scenario: DatasetScenario) => boolean;
|
|
1106
|
+
/** If true, include scenarios marked as holdout. Default false. */
|
|
1107
|
+
includeHoldout?: boolean;
|
|
750
1108
|
}
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
/** Optional description for human-facing reports. */
|
|
755
|
-
description?: string;
|
|
756
|
-
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
757
|
-
validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
|
|
1109
|
+
/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
|
|
1110
|
+
declare class HoldoutLockedError extends Error {
|
|
1111
|
+
constructor(datasetName: string);
|
|
758
1112
|
}
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
/**
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
1113
|
+
declare class Dataset {
|
|
1114
|
+
readonly name: string;
|
|
1115
|
+
readonly provenance: DatasetProvenance;
|
|
1116
|
+
private scenarios;
|
|
1117
|
+
private locked;
|
|
1118
|
+
constructor(init: {
|
|
1119
|
+
name: string;
|
|
1120
|
+
provenance: DatasetProvenance;
|
|
1121
|
+
scenarios: DatasetScenario[];
|
|
1122
|
+
locked?: boolean;
|
|
1123
|
+
});
|
|
1124
|
+
/** All scenarios. Readonly — callers must go through `slice` or `clone`. */
|
|
1125
|
+
all(): readonly DatasetScenario[];
|
|
1126
|
+
get size(): number;
|
|
1127
|
+
/**
|
|
1128
|
+
* Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
|
|
1129
|
+
* the same arguments always produce the same slice across machines.
|
|
1130
|
+
*/
|
|
1131
|
+
slice(options?: SliceOptions): DatasetScenario[];
|
|
1132
|
+
/**
|
|
1133
|
+
* Assemble the manifest (name + provenance + content hash + counts).
|
|
1134
|
+
* Content hash is deterministic over canonicalized scenarios.
|
|
1135
|
+
*/
|
|
1136
|
+
manifest(): Promise<DatasetManifest>;
|
|
1137
|
+
/** Fresh unlocked copy — for post-release forks when mutation is needed. */
|
|
1138
|
+
clone(overrides?: Partial<{
|
|
1139
|
+
name: string;
|
|
1140
|
+
version: string;
|
|
1141
|
+
}>): Dataset;
|
|
1142
|
+
lock(): void;
|
|
1143
|
+
add(scenario: DatasetScenario): void;
|
|
1144
|
+
remove(scenarioId: string): void;
|
|
1145
|
+
/**
|
|
1146
|
+
* Stable JSON-Lines serialization — deterministic byte-for-byte.
|
|
1147
|
+
* Write to disk for contamination-verifiable archives.
|
|
1148
|
+
*/
|
|
1149
|
+
toJsonl(): string;
|
|
1150
|
+
static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
|
|
1151
|
+
}
|
|
1152
|
+
declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
|
|
777
1153
|
|
|
778
1154
|
/**
|
|
779
|
-
*
|
|
1155
|
+
* Prompt optimizer — A/B test prompt variants with statistical rigor.
|
|
780
1156
|
*
|
|
781
|
-
*
|
|
782
|
-
*
|
|
783
|
-
*
|
|
1157
|
+
* Runs N prompt variants against a fixed scenario set, collects per-scenario
|
|
1158
|
+
* scores via the user-provided `scoreVariant` callback, and returns:
|
|
1159
|
+
* - per-variant mean + bootstrap CI
|
|
1160
|
+
* - pairwise significance (Mann-Whitney, non-parametric — works on any
|
|
1161
|
+
* score distribution, not just normal)
|
|
1162
|
+
* - a winner (highest mean, flagged if the lead is not significant)
|
|
784
1163
|
*
|
|
785
|
-
*
|
|
786
|
-
*
|
|
1164
|
+
* Deliberately generic — the `scoreVariant` callback does whatever domain
|
|
1165
|
+
* work the consumer needs (invoke the agent, judge the output, whatever),
|
|
1166
|
+
* and returns a number per scenario. This lets the optimizer stay small +
|
|
1167
|
+
* testable.
|
|
787
1168
|
*/
|
|
788
|
-
interface
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
/**
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
1169
|
+
interface PromptVariant$1 {
|
|
1170
|
+
id: string;
|
|
1171
|
+
prompt: string;
|
|
1172
|
+
metadata?: Record<string, unknown>;
|
|
1173
|
+
}
|
|
1174
|
+
interface OptimizationConfig {
|
|
1175
|
+
variants: PromptVariant$1[];
|
|
1176
|
+
/** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
|
|
1177
|
+
trialsPerScenario?: number;
|
|
1178
|
+
/** Significance threshold for pairwise comparison (default 0.05). */
|
|
1179
|
+
significanceLevel?: number;
|
|
1180
|
+
/**
|
|
1181
|
+
* The scoring callback. For each (variant, scenarioId, trialIndex), produce
|
|
1182
|
+
* a score in 0..1 (or any numeric range — the optimizer only cares about
|
|
1183
|
+
* monotonicity).
|
|
1184
|
+
*/
|
|
1185
|
+
scoreVariant: (args: {
|
|
1186
|
+
variant: PromptVariant$1;
|
|
1187
|
+
scenarioId: string;
|
|
1188
|
+
trialIndex: number;
|
|
1189
|
+
}) => Promise<number>;
|
|
1190
|
+
/** Scenario ids to run against. */
|
|
1191
|
+
scenarioIds: string[];
|
|
1192
|
+
/** Optional hook — fires after each (variant, scenario) fully scored. */
|
|
1193
|
+
onScenarioComplete?: (info: {
|
|
1194
|
+
variantId: string;
|
|
1195
|
+
scenarioId: string;
|
|
1196
|
+
scores: number[];
|
|
1197
|
+
}) => void;
|
|
1198
|
+
}
|
|
1199
|
+
interface VariantScore {
|
|
1200
|
+
variantId: string;
|
|
1201
|
+
mean: number;
|
|
1202
|
+
ci95: {
|
|
1203
|
+
lower: number;
|
|
1204
|
+
upper: number;
|
|
1205
|
+
};
|
|
1206
|
+
n: number;
|
|
1207
|
+
perScenario: Record<string, {
|
|
1208
|
+
mean: number;
|
|
1209
|
+
n: number;
|
|
1210
|
+
samples: number[];
|
|
800
1211
|
}>;
|
|
801
1212
|
}
|
|
802
|
-
interface
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
1213
|
+
interface PairwiseComparison {
|
|
1214
|
+
variantA: string;
|
|
1215
|
+
variantB: string;
|
|
1216
|
+
pValue: number;
|
|
1217
|
+
/** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
|
|
1218
|
+
qValue: number;
|
|
1219
|
+
/** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
|
|
1220
|
+
significant: boolean;
|
|
1221
|
+
meanDelta: number;
|
|
807
1222
|
}
|
|
808
|
-
interface
|
|
809
|
-
|
|
810
|
-
|
|
1223
|
+
interface OptimizationResult {
|
|
1224
|
+
winner: {
|
|
1225
|
+
variantId: string;
|
|
1226
|
+
/** True when the winner's lead vs every other variant is statistically significant. */
|
|
1227
|
+
significant: boolean;
|
|
1228
|
+
ciLowerBoundExceedsSecondMean: boolean;
|
|
1229
|
+
};
|
|
1230
|
+
scores: VariantScore[];
|
|
1231
|
+
pairwise: PairwiseComparison[];
|
|
1232
|
+
config: {
|
|
1233
|
+
trialsPerScenario: number;
|
|
1234
|
+
significanceLevel: number;
|
|
1235
|
+
variants: string[];
|
|
1236
|
+
scenarios: string[];
|
|
1237
|
+
};
|
|
811
1238
|
}
|
|
812
|
-
declare class
|
|
813
|
-
|
|
814
|
-
private readonly snapshots;
|
|
815
|
-
set(scopeId: string, snapshot: WorkspaceSnapshot): void;
|
|
816
|
-
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
1239
|
+
declare class PromptOptimizer {
|
|
1240
|
+
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
817
1241
|
}
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
1242
|
+
|
|
1243
|
+
interface RunScore {
|
|
1244
|
+
success: number;
|
|
1245
|
+
goalProgress: number;
|
|
1246
|
+
repoGroundedness: number;
|
|
1247
|
+
driftPenalty: number;
|
|
1248
|
+
toolUseQuality: number;
|
|
1249
|
+
patchQuality: number;
|
|
1250
|
+
testReality: number;
|
|
1251
|
+
finalGate: number;
|
|
1252
|
+
reviewerBlockers: number;
|
|
1253
|
+
costUsd: number;
|
|
1254
|
+
wallSeconds: number;
|
|
1255
|
+
notes?: string[];
|
|
822
1256
|
}
|
|
823
|
-
interface
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
1257
|
+
interface RunScoreWeights {
|
|
1258
|
+
success: number;
|
|
1259
|
+
goalProgress: number;
|
|
1260
|
+
repoGroundedness: number;
|
|
1261
|
+
driftPenalty: number;
|
|
1262
|
+
toolUseQuality: number;
|
|
1263
|
+
patchQuality: number;
|
|
1264
|
+
testReality: number;
|
|
1265
|
+
finalGate: number;
|
|
1266
|
+
reviewerBlockers: number;
|
|
1267
|
+
costUsd: number;
|
|
1268
|
+
wallSeconds: number;
|
|
828
1269
|
}
|
|
829
|
-
declare
|
|
830
|
-
declare function
|
|
831
|
-
declare function
|
|
832
|
-
declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
|
|
833
|
-
min?: number;
|
|
834
|
-
}): WorkspaceAssertion;
|
|
835
|
-
/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
|
|
836
|
-
declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
|
|
837
|
-
pass: boolean;
|
|
838
|
-
score: number;
|
|
839
|
-
results: Array<{
|
|
840
|
-
assertion: string;
|
|
841
|
-
result: WorkspaceAssertionResult;
|
|
842
|
-
}>;
|
|
843
|
-
};
|
|
1270
|
+
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
1271
|
+
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
1272
|
+
declare function clamp01(value: number): number;
|
|
844
1273
|
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
1274
|
+
interface SteeringRolePrompt {
|
|
1275
|
+
system?: string;
|
|
1276
|
+
append?: string;
|
|
1277
|
+
}
|
|
1278
|
+
interface SteeringBundle {
|
|
1279
|
+
id: string;
|
|
1280
|
+
coderPrompt?: string;
|
|
1281
|
+
continuePrompt?: string;
|
|
1282
|
+
reviewerPrompts?: Record<string, string>;
|
|
1283
|
+
skills?: string[];
|
|
1284
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1285
|
+
metadata?: Record<string, unknown>;
|
|
1286
|
+
}
|
|
1287
|
+
interface SteeringDelta {
|
|
1288
|
+
coderPrompt?: string;
|
|
1289
|
+
continuePrompt?: string;
|
|
1290
|
+
reviewerPrompts?: Record<string, string>;
|
|
1291
|
+
skills?: string[];
|
|
1292
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1293
|
+
metadata?: Record<string, unknown>;
|
|
1294
|
+
}
|
|
1295
|
+
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
1296
|
+
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
1297
|
+
|
|
1298
|
+
interface OptimizationExample {
|
|
1299
|
+
scenarioId: string;
|
|
1300
|
+
metadata?: Record<string, unknown>;
|
|
1301
|
+
}
|
|
1302
|
+
interface SteeringEvaluation {
|
|
1303
|
+
variant: SteeringBundle;
|
|
1304
|
+
example: OptimizationExample;
|
|
1305
|
+
trialIndex: number;
|
|
1306
|
+
}
|
|
1307
|
+
interface SteeringVariantReport {
|
|
1308
|
+
variantId: string;
|
|
1309
|
+
bundle: SteeringBundle;
|
|
1310
|
+
mean: number;
|
|
1311
|
+
ci95: {
|
|
1312
|
+
lower: number;
|
|
1313
|
+
upper: number;
|
|
1314
|
+
};
|
|
1315
|
+
scenarioScores: Record<string, {
|
|
1316
|
+
mean: number;
|
|
1317
|
+
n: number;
|
|
1318
|
+
samples: number[];
|
|
1319
|
+
}>;
|
|
1320
|
+
}
|
|
1321
|
+
interface OptimizationLoopResult {
|
|
1322
|
+
winner: SteeringBundle;
|
|
1323
|
+
significant: boolean;
|
|
1324
|
+
reports: SteeringVariantReport[];
|
|
1325
|
+
pairwise: Array<{
|
|
1326
|
+
variantA: string;
|
|
1327
|
+
variantB: string;
|
|
1328
|
+
pValue: number;
|
|
1329
|
+
qValue: number;
|
|
1330
|
+
significant: boolean;
|
|
1331
|
+
meanDelta: number;
|
|
1332
|
+
}>;
|
|
1333
|
+
}
|
|
1334
|
+
interface OptimizationLoopConfig {
|
|
1335
|
+
variants: SteeringBundle[];
|
|
1336
|
+
examples: OptimizationExample[];
|
|
1337
|
+
evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
|
|
1338
|
+
scoreWeights?: Partial<RunScoreWeights>;
|
|
1339
|
+
trialsPerScenario?: number;
|
|
1340
|
+
}
|
|
1341
|
+
declare class OptimizationLoop {
|
|
1342
|
+
private readonly optimizer;
|
|
1343
|
+
constructor(optimizer?: PromptOptimizer);
|
|
1344
|
+
run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
|
|
1345
|
+
}
|
|
858
1346
|
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
1347
|
+
type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
|
|
1348
|
+
type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
|
|
1349
|
+
type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
|
|
1350
|
+
type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
1351
|
+
interface FeedbackTask {
|
|
1352
|
+
intent: string;
|
|
1353
|
+
context?: unknown;
|
|
1354
|
+
}
|
|
1355
|
+
interface ProposedSideEffect {
|
|
1356
|
+
type: string;
|
|
1357
|
+
risk?: 'low' | 'medium' | 'high';
|
|
1358
|
+
costUsd?: number;
|
|
1359
|
+
externalSideEffect?: boolean;
|
|
1360
|
+
requiresApproval?: boolean;
|
|
866
1361
|
metadata?: Record<string, unknown>;
|
|
867
1362
|
}
|
|
868
|
-
interface
|
|
1363
|
+
interface FeedbackLabel {
|
|
1364
|
+
id?: string;
|
|
1365
|
+
source: FeedbackLabelSource;
|
|
1366
|
+
kind: FeedbackLabelKind;
|
|
1367
|
+
value: unknown;
|
|
1368
|
+
reason?: string;
|
|
1369
|
+
severity?: FeedbackSeverity;
|
|
1370
|
+
createdAt: string;
|
|
1371
|
+
metadata?: Record<string, unknown>;
|
|
1372
|
+
}
|
|
1373
|
+
interface FeedbackAttempt {
|
|
869
1374
|
id: string;
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
1375
|
+
stepIndex: number;
|
|
1376
|
+
artifactType: FeedbackArtifactType;
|
|
1377
|
+
artifact: unknown;
|
|
1378
|
+
options?: unknown[];
|
|
1379
|
+
proposedAction?: ProposedSideEffect;
|
|
1380
|
+
evals?: ControlEvalResult[];
|
|
1381
|
+
feedback?: FeedbackLabel[];
|
|
1382
|
+
createdAt: string;
|
|
1383
|
+
metadata?: Record<string, unknown>;
|
|
878
1384
|
}
|
|
879
|
-
interface
|
|
1385
|
+
interface FeedbackOutcome {
|
|
1386
|
+
success?: boolean;
|
|
1387
|
+
score?: number;
|
|
1388
|
+
metrics?: Record<string, number>;
|
|
1389
|
+
costUsd?: number;
|
|
1390
|
+
detail?: string;
|
|
1391
|
+
observedAt?: string;
|
|
1392
|
+
metadata?: Record<string, unknown>;
|
|
1393
|
+
}
|
|
1394
|
+
interface FeedbackTrajectory {
|
|
880
1395
|
id: string;
|
|
881
|
-
|
|
1396
|
+
projectId?: string;
|
|
1397
|
+
scenarioId?: string;
|
|
1398
|
+
task: FeedbackTask;
|
|
1399
|
+
attempts: FeedbackAttempt[];
|
|
1400
|
+
labels: FeedbackLabel[];
|
|
1401
|
+
outcome?: FeedbackOutcome;
|
|
1402
|
+
split?: DatasetSplit;
|
|
1403
|
+
tags?: Record<string, string>;
|
|
882
1404
|
createdAt: string;
|
|
1405
|
+
updatedAt?: string;
|
|
883
1406
|
metadata?: Record<string, unknown>;
|
|
884
1407
|
}
|
|
885
|
-
interface
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
1408
|
+
interface FeedbackTrajectoryStore {
|
|
1409
|
+
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
1410
|
+
get(id: string): Promise<FeedbackTrajectory | null>;
|
|
1411
|
+
list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
|
|
1412
|
+
appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
|
|
1413
|
+
appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
|
|
892
1414
|
}
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
listExperiments(): Promise<Experiment[]>;
|
|
899
|
-
saveRun(run: Run$1): Promise<void>;
|
|
900
|
-
getRun(id: string): Promise<Run$1 | null>;
|
|
901
|
-
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
1415
|
+
interface FeedbackTrajectoryFilter {
|
|
1416
|
+
projectId?: string;
|
|
1417
|
+
scenarioId?: string;
|
|
1418
|
+
split?: DatasetSplit;
|
|
1419
|
+
tag?: [string, string];
|
|
902
1420
|
}
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
completeRun(runId: string, report: BenchmarkReport): Promise<void>;
|
|
909
|
-
failRun(runId: string, error: string): Promise<void>;
|
|
910
|
-
/**
|
|
911
|
-
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
912
|
-
* and config changes that may explain the movement.
|
|
913
|
-
*/
|
|
914
|
-
diff(runIdA: string, runIdB: string): Promise<RunDiff>;
|
|
915
|
-
/** Timeline of aggregate scores for an experiment. */
|
|
916
|
-
timeline(experimentId: string): Promise<Array<{
|
|
917
|
-
runId: string;
|
|
918
|
-
startedAt: string;
|
|
919
|
-
overall: number | null;
|
|
920
|
-
}>>;
|
|
1421
|
+
interface FeedbackSplitPolicy {
|
|
1422
|
+
trainPct?: number;
|
|
1423
|
+
devPct?: number;
|
|
1424
|
+
testPct?: number;
|
|
1425
|
+
holdoutPct?: number;
|
|
921
1426
|
}
|
|
922
|
-
interface
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
runId: string;
|
|
930
|
-
name?: string;
|
|
931
|
-
startedAt: string;
|
|
932
|
-
};
|
|
933
|
-
aggregateDelta: number;
|
|
934
|
-
scenarios: Array<{
|
|
935
|
-
scenarioId: string;
|
|
936
|
-
before: number | null;
|
|
937
|
-
after: number | null;
|
|
938
|
-
delta: number | null;
|
|
939
|
-
status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
|
|
940
|
-
}>;
|
|
941
|
-
configChanges: Record<string, {
|
|
942
|
-
before: unknown;
|
|
943
|
-
after: unknown;
|
|
944
|
-
}>;
|
|
1427
|
+
interface PreferenceMemoryEntry {
|
|
1428
|
+
instruction: string;
|
|
1429
|
+
rationale: string;
|
|
1430
|
+
weight: number;
|
|
1431
|
+
sourceTrajectoryId: string;
|
|
1432
|
+
sourceLabelId?: string;
|
|
1433
|
+
category?: string;
|
|
945
1434
|
}
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
* Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
|
|
951
|
-
* files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
|
|
952
|
-
* based rollover. Writes are append-only so the file log doubles as an audit
|
|
953
|
-
* trail of every state transition the tracker ever wrote.
|
|
954
|
-
*
|
|
955
|
-
* Reads lazy-load every NDJSON file in the directory (including rolled-over
|
|
956
|
-
* archives), latest-write-wins per `id`. Subsequent writes update the
|
|
957
|
-
* in-memory index in place so reads after writes are O(1).
|
|
958
|
-
*
|
|
959
|
-
* Node-only — imports `node:fs/promises`. Don't import this from a Worker;
|
|
960
|
-
* use the in-memory store or the D1 store from `./experiment-tracker-d1`.
|
|
961
|
-
*/
|
|
962
|
-
|
|
963
|
-
interface FileSystemExperimentStoreOptions {
|
|
964
|
-
/** Directory the NDJSON files live in. Created on first write. */
|
|
965
|
-
dir: string;
|
|
966
|
-
/** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
|
|
967
|
-
maxBytes?: number;
|
|
1435
|
+
interface FeedbackOptimizerRow extends OptimizationExample {
|
|
1436
|
+
trajectoryId: string;
|
|
1437
|
+
labelKinds: FeedbackLabelKind[];
|
|
1438
|
+
score?: number;
|
|
968
1439
|
}
|
|
969
|
-
declare class
|
|
1440
|
+
declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
|
|
1441
|
+
private readonly trajectories;
|
|
1442
|
+
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
1443
|
+
get(id: string): Promise<FeedbackTrajectory | null>;
|
|
1444
|
+
list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
|
|
1445
|
+
appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
|
|
1446
|
+
appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
|
|
1447
|
+
}
|
|
1448
|
+
declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
|
|
970
1449
|
private readonly dir;
|
|
971
|
-
private readonly
|
|
972
|
-
private index?;
|
|
1450
|
+
private readonly memory;
|
|
973
1451
|
private loaded;
|
|
974
|
-
constructor(options:
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
1452
|
+
constructor(options: {
|
|
1453
|
+
dir: string;
|
|
1454
|
+
});
|
|
1455
|
+
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
1456
|
+
get(id: string): Promise<FeedbackTrajectory | null>;
|
|
1457
|
+
list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
|
|
1458
|
+
appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
|
|
1459
|
+
appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
|
|
982
1460
|
private append;
|
|
983
1461
|
private load;
|
|
984
1462
|
}
|
|
1463
|
+
declare function createFeedbackTrajectory(input: {
|
|
1464
|
+
id?: string;
|
|
1465
|
+
projectId?: string;
|
|
1466
|
+
scenarioId?: string;
|
|
1467
|
+
task: FeedbackTask;
|
|
1468
|
+
attempts?: FeedbackAttempt[];
|
|
1469
|
+
labels?: FeedbackLabel[];
|
|
1470
|
+
outcome?: FeedbackOutcome;
|
|
1471
|
+
split?: DatasetSplit;
|
|
1472
|
+
tags?: Record<string, string>;
|
|
1473
|
+
createdAt?: string;
|
|
1474
|
+
metadata?: Record<string, unknown>;
|
|
1475
|
+
}): FeedbackTrajectory;
|
|
1476
|
+
declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
|
|
1477
|
+
declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
|
|
1478
|
+
declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
|
|
1479
|
+
declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
|
|
1480
|
+
declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
|
|
1481
|
+
declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
|
|
1482
|
+
declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
|
|
1483
|
+
maxEntries?: number;
|
|
1484
|
+
}): PreferenceMemoryEntry[];
|
|
1485
|
+
declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
|
|
1486
|
+
declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
|
|
1487
|
+
declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
|
|
1488
|
+
declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
|
|
1489
|
+
projectId?: string;
|
|
1490
|
+
scenarioId?: string;
|
|
1491
|
+
artifactType?: FeedbackArtifactType;
|
|
1492
|
+
artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
|
|
1493
|
+
proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
|
|
1494
|
+
createdAt?: string;
|
|
1495
|
+
}): FeedbackTrajectory;
|
|
985
1496
|
|
|
986
1497
|
/**
|
|
987
|
-
*
|
|
988
|
-
*
|
|
989
|
-
*
|
|
990
|
-
*
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1498
|
+
* Normalize scores so all dimensions follow "higher = better".
|
|
1499
|
+
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
1500
|
+
* already use inverted scoring in the prompt (10 = no hallucination),
|
|
1501
|
+
* but this function ensures consistency if raw scores leak through.
|
|
1502
|
+
*/
|
|
1503
|
+
declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
|
|
1504
|
+
/** Weighted mean — falls back to uniform weights when omitted */
|
|
1505
|
+
declare function weightedMean(scores: {
|
|
1506
|
+
score: number;
|
|
1507
|
+
weight?: number;
|
|
1508
|
+
}[]): number;
|
|
1509
|
+
/** Bootstrap confidence interval */
|
|
1510
|
+
declare function confidenceInterval(scores: number[], confidence?: number): {
|
|
1511
|
+
mean: number;
|
|
1512
|
+
lower: number;
|
|
1513
|
+
upper: number;
|
|
1514
|
+
};
|
|
1515
|
+
/**
|
|
1516
|
+
* Inter-rater reliability — simplified Krippendorff's alpha.
|
|
1000
1517
|
*
|
|
1001
|
-
*
|
|
1002
|
-
*
|
|
1003
|
-
* v1; bump only on breaking shape changes.
|
|
1518
|
+
* Each inner array is one judge's scores for all items.
|
|
1519
|
+
* All arrays must have the same length (same items scored).
|
|
1004
1520
|
*/
|
|
1005
|
-
|
|
1521
|
+
declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
|
|
1006
1522
|
/**
|
|
1007
|
-
*
|
|
1008
|
-
*
|
|
1009
|
-
* those types installed can pass the binding directly.
|
|
1523
|
+
* Mann-Whitney U test for comparing two independent groups.
|
|
1524
|
+
* Returns U statistic and approximate p-value (normal approximation).
|
|
1010
1525
|
*/
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1526
|
+
declare function mannWhitneyU(a: number[], b: number[]): {
|
|
1527
|
+
u: number;
|
|
1528
|
+
p: number;
|
|
1529
|
+
};
|
|
1530
|
+
/** Partial credit: returns 0-1 ratio of current toward target */
|
|
1531
|
+
declare function partialCredit(current: number, target: number): number;
|
|
1532
|
+
/**
|
|
1533
|
+
* Paired t-test — before/after measurements on the SAME items.
|
|
1534
|
+
* Pairing removes inter-item variance, giving tighter significance than
|
|
1535
|
+
* an unpaired test when comparing prompt v1 vs prompt v2 on identical
|
|
1536
|
+
* scenarios.
|
|
1537
|
+
*/
|
|
1538
|
+
declare function pairedTTest(before: number[], after: number[]): {
|
|
1539
|
+
t: number;
|
|
1540
|
+
df: number;
|
|
1541
|
+
p: number;
|
|
1542
|
+
};
|
|
1543
|
+
/**
|
|
1544
|
+
* Wilcoxon signed-rank test — paired non-parametric alternative.
|
|
1545
|
+
* Use when the differences aren't normally distributed.
|
|
1546
|
+
*/
|
|
1547
|
+
declare function wilcoxonSignedRank(before: number[], after: number[]): {
|
|
1548
|
+
w: number;
|
|
1549
|
+
p: number;
|
|
1550
|
+
};
|
|
1551
|
+
/**
|
|
1552
|
+
* Cohen's d — standardized effect size for two independent groups.
|
|
1553
|
+
* Positive d means group b has higher mean than group a.
|
|
1554
|
+
* Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
|
|
1555
|
+
*/
|
|
1556
|
+
declare function cohensD(a: number[], b: number[]): number;
|
|
1557
|
+
|
|
1558
|
+
/**
|
|
1559
|
+
* ConvergenceTracker — tracks completion percentage over turns.
|
|
1560
|
+
*
|
|
1561
|
+
* Produces convergence curves showing how quickly the agent reaches
|
|
1562
|
+
* completion criteria.
|
|
1563
|
+
*/
|
|
1564
|
+
declare class ConvergenceTracker {
|
|
1565
|
+
private criteria;
|
|
1566
|
+
private history;
|
|
1567
|
+
constructor(criteria: CompletionCriterion[]);
|
|
1568
|
+
/** Evaluate criteria against current state, record result */
|
|
1569
|
+
record(turn: number, state: DriverState): {
|
|
1570
|
+
completionPercent: number;
|
|
1571
|
+
complete: boolean;
|
|
1572
|
+
criteriaStatus: Record<string, boolean | number>;
|
|
1573
|
+
};
|
|
1574
|
+
/** Get convergence curve */
|
|
1575
|
+
getCurve(): number[];
|
|
1576
|
+
/** Get full history with per-criterion status */
|
|
1577
|
+
getHistory(): {
|
|
1578
|
+
turn: number;
|
|
1579
|
+
completionPercent: number;
|
|
1580
|
+
criteriaStatus: Record<string, boolean | number>;
|
|
1581
|
+
}[];
|
|
1582
|
+
/** Find the turn where completion first reached 100% (or null) */
|
|
1583
|
+
getTurnToCompletion(): number | null;
|
|
1053
1584
|
}
|
|
1054
1585
|
|
|
1055
1586
|
/**
|
|
1056
|
-
*
|
|
1587
|
+
* Versioned prompt registry.
|
|
1057
1588
|
*
|
|
1058
|
-
*
|
|
1059
|
-
*
|
|
1060
|
-
*
|
|
1061
|
-
*
|
|
1062
|
-
* score distribution, not just normal)
|
|
1063
|
-
* - a winner (highest mean, flagged if the lead is not significant)
|
|
1589
|
+
* Every prompt used in an eval run is registered with an explicit version.
|
|
1590
|
+
* Reports include the content hash so A/B compares are rigorous: if the
|
|
1591
|
+
* hash changes between two reports, the prompt actually changed; if it
|
|
1592
|
+
* matches, the variance is elsewhere.
|
|
1064
1593
|
*
|
|
1065
|
-
*
|
|
1066
|
-
*
|
|
1067
|
-
* and returns a number per scenario. This lets the optimizer stay small +
|
|
1068
|
-
* testable.
|
|
1594
|
+
* Hash is SHA-256(content), truncated to 12 hex chars for readability.
|
|
1595
|
+
* Uses the Web Crypto API (works in Workers, Node 22+, browsers).
|
|
1069
1596
|
*/
|
|
1070
|
-
interface
|
|
1597
|
+
interface PromptHandle {
|
|
1598
|
+
/** Stable human-readable id, e.g. 'legal.system' */
|
|
1071
1599
|
id: string;
|
|
1072
|
-
|
|
1073
|
-
|
|
1600
|
+
/** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
|
|
1601
|
+
version: string;
|
|
1602
|
+
/** SHA-256 of content, 12-hex-char prefix */
|
|
1603
|
+
hash: string;
|
|
1604
|
+
/** Full prompt body */
|
|
1605
|
+
content: string;
|
|
1074
1606
|
}
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
/** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
|
|
1078
|
-
trialsPerScenario?: number;
|
|
1079
|
-
/** Significance threshold for pairwise comparison (default 0.05). */
|
|
1080
|
-
significanceLevel?: number;
|
|
1607
|
+
declare class PromptRegistry {
|
|
1608
|
+
private readonly entries;
|
|
1081
1609
|
/**
|
|
1082
|
-
*
|
|
1083
|
-
*
|
|
1084
|
-
*
|
|
1610
|
+
* Register a prompt. Re-registering the same id+version with DIFFERENT
|
|
1611
|
+
* content throws — versions are immutable. Re-registering with the SAME
|
|
1612
|
+
* content is a no-op (idempotent).
|
|
1085
1613
|
*/
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
/**
|
|
1092
|
-
|
|
1093
|
-
/**
|
|
1094
|
-
|
|
1095
|
-
variantId: string;
|
|
1096
|
-
scenarioId: string;
|
|
1097
|
-
scores: number[];
|
|
1098
|
-
}) => void;
|
|
1099
|
-
}
|
|
1100
|
-
interface VariantScore {
|
|
1101
|
-
variantId: string;
|
|
1102
|
-
mean: number;
|
|
1103
|
-
ci95: {
|
|
1104
|
-
lower: number;
|
|
1105
|
-
upper: number;
|
|
1106
|
-
};
|
|
1107
|
-
n: number;
|
|
1108
|
-
perScenario: Record<string, {
|
|
1109
|
-
mean: number;
|
|
1110
|
-
n: number;
|
|
1111
|
-
samples: number[];
|
|
1112
|
-
}>;
|
|
1113
|
-
}
|
|
1114
|
-
interface PairwiseComparison {
|
|
1115
|
-
variantA: string;
|
|
1116
|
-
variantB: string;
|
|
1117
|
-
pValue: number;
|
|
1118
|
-
/** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
|
|
1119
|
-
qValue: number;
|
|
1120
|
-
/** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
|
|
1121
|
-
significant: boolean;
|
|
1122
|
-
meanDelta: number;
|
|
1123
|
-
}
|
|
1124
|
-
interface OptimizationResult {
|
|
1125
|
-
winner: {
|
|
1126
|
-
variantId: string;
|
|
1127
|
-
/** True when the winner's lead vs every other variant is statistically significant. */
|
|
1128
|
-
significant: boolean;
|
|
1129
|
-
ciLowerBoundExceedsSecondMean: boolean;
|
|
1130
|
-
};
|
|
1131
|
-
scores: VariantScore[];
|
|
1132
|
-
pairwise: PairwiseComparison[];
|
|
1133
|
-
config: {
|
|
1134
|
-
trialsPerScenario: number;
|
|
1135
|
-
significanceLevel: number;
|
|
1136
|
-
variants: string[];
|
|
1137
|
-
scenarios: string[];
|
|
1138
|
-
};
|
|
1139
|
-
}
|
|
1140
|
-
declare class PromptOptimizer {
|
|
1141
|
-
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
1142
|
-
}
|
|
1143
|
-
|
|
1144
|
-
interface SteeringRolePrompt {
|
|
1145
|
-
system?: string;
|
|
1146
|
-
append?: string;
|
|
1147
|
-
}
|
|
1148
|
-
interface SteeringBundle {
|
|
1149
|
-
id: string;
|
|
1150
|
-
coderPrompt?: string;
|
|
1151
|
-
continuePrompt?: string;
|
|
1152
|
-
reviewerPrompts?: Record<string, string>;
|
|
1153
|
-
skills?: string[];
|
|
1154
|
-
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1155
|
-
metadata?: Record<string, unknown>;
|
|
1156
|
-
}
|
|
1157
|
-
interface SteeringDelta {
|
|
1158
|
-
coderPrompt?: string;
|
|
1159
|
-
continuePrompt?: string;
|
|
1160
|
-
reviewerPrompts?: Record<string, string>;
|
|
1161
|
-
skills?: string[];
|
|
1162
|
-
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1163
|
-
metadata?: Record<string, unknown>;
|
|
1164
|
-
}
|
|
1165
|
-
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
1166
|
-
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
1167
|
-
|
|
1168
|
-
interface RunScore {
|
|
1169
|
-
success: number;
|
|
1170
|
-
goalProgress: number;
|
|
1171
|
-
repoGroundedness: number;
|
|
1172
|
-
driftPenalty: number;
|
|
1173
|
-
toolUseQuality: number;
|
|
1174
|
-
patchQuality: number;
|
|
1175
|
-
testReality: number;
|
|
1176
|
-
finalGate: number;
|
|
1177
|
-
reviewerBlockers: number;
|
|
1178
|
-
costUsd: number;
|
|
1179
|
-
wallSeconds: number;
|
|
1180
|
-
notes?: string[];
|
|
1181
|
-
}
|
|
1182
|
-
interface RunScoreWeights {
|
|
1183
|
-
success: number;
|
|
1184
|
-
goalProgress: number;
|
|
1185
|
-
repoGroundedness: number;
|
|
1186
|
-
driftPenalty: number;
|
|
1187
|
-
toolUseQuality: number;
|
|
1188
|
-
patchQuality: number;
|
|
1189
|
-
testReality: number;
|
|
1190
|
-
finalGate: number;
|
|
1191
|
-
reviewerBlockers: number;
|
|
1192
|
-
costUsd: number;
|
|
1193
|
-
wallSeconds: number;
|
|
1614
|
+
register(id: string, version: string, content: string): Promise<PromptHandle>;
|
|
1615
|
+
/** Look up a registered prompt. Throws if unknown — no implicit defaults. */
|
|
1616
|
+
get(id: string, version: string): PromptHandle;
|
|
1617
|
+
/** Return all versions of an id, newest-first (lex-descending on version). */
|
|
1618
|
+
listVersions(id: string): PromptHandle[];
|
|
1619
|
+
/** Snapshot the whole registry — useful for including in reports. */
|
|
1620
|
+
list(): PromptHandle[];
|
|
1621
|
+
/** Verify a hash against registered content. Returns null if not found. */
|
|
1622
|
+
verifyHash(id: string, version: string, expectedHash: string): boolean | null;
|
|
1194
1623
|
}
|
|
1195
|
-
|
|
1196
|
-
declare function
|
|
1197
|
-
declare function clamp01(value: number): number;
|
|
1624
|
+
/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
|
|
1625
|
+
declare function hashContent(content: string): Promise<string>;
|
|
1198
1626
|
|
|
1199
1627
|
/**
|
|
1200
|
-
*
|
|
1628
|
+
* Anti-slop quality judge.
|
|
1201
1629
|
*
|
|
1202
|
-
*
|
|
1203
|
-
*
|
|
1630
|
+
* Deterministic pattern-based quality check — no LLM call. Catches the
|
|
1631
|
+
* 80% of AI slop that every production agent leaks:
|
|
1632
|
+
* - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
|
|
1633
|
+
* - N-gram repetition (same phrase over and over)
|
|
1634
|
+
* - Hedging overuse ("I could be wrong, but...")
|
|
1635
|
+
* - Apology padding ("I'm so sorry for the confusion...")
|
|
1636
|
+
* - Unused opening formulas ("Great question!")
|
|
1637
|
+
* - Length bounds (too short to be useful, too long to be read)
|
|
1204
1638
|
*
|
|
1205
|
-
*
|
|
1206
|
-
*
|
|
1207
|
-
* judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
|
|
1208
|
-
* entities that OTEL leaves as free-form attributes.
|
|
1639
|
+
* Produces a JudgeScore in the same shape as LLM judges so it composes into
|
|
1640
|
+
* `BenchmarkRunner`'s judge array transparently.
|
|
1209
1641
|
*/
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1642
|
+
|
|
1643
|
+
interface AntiSlopConfig {
|
|
1644
|
+
/** Domain label — appears in the JudgeScore output */
|
|
1645
|
+
domain?: string;
|
|
1646
|
+
/** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
|
|
1647
|
+
bannedPhrases?: string[];
|
|
1648
|
+
/** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
|
|
1649
|
+
bannedOpenings?: RegExp[];
|
|
1650
|
+
/** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
|
|
1651
|
+
hedgingPatterns?: RegExp[];
|
|
1652
|
+
/** Regexes matching apology padding. */
|
|
1653
|
+
apologyPatterns?: RegExp[];
|
|
1654
|
+
/** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
|
|
1655
|
+
repetitionThreshold?: number;
|
|
1656
|
+
/** Min output length in chars; below this the turn is deemed too terse. */
|
|
1657
|
+
minLength?: number;
|
|
1658
|
+
/** Max output length in chars; above this the turn is deemed too verbose. */
|
|
1659
|
+
maxLength?: number;
|
|
1660
|
+
/** How heavily each violation class reduces the score (default 1). */
|
|
1661
|
+
penaltyWeights?: Partial<Record<SlopCategory, number>>;
|
|
1217
1662
|
}
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1663
|
+
type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
|
|
1664
|
+
/** Create a reusable Judge function from an anti-slop config. */
|
|
1665
|
+
declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
|
|
1666
|
+
interface AntiSlopIssue {
|
|
1667
|
+
category: SlopCategory;
|
|
1668
|
+
detail: string;
|
|
1669
|
+
example?: string;
|
|
1670
|
+
}
|
|
1671
|
+
interface AntiSlopReport {
|
|
1672
|
+
/** 0–10 score; 10 is clean, lower values mean more slop. */
|
|
1673
|
+
score: number;
|
|
1674
|
+
issues: AntiSlopIssue[];
|
|
1675
|
+
/** Count of each category for programmatic aggregation. */
|
|
1676
|
+
counts: Record<SlopCategory, number>;
|
|
1223
1677
|
}
|
|
1224
1678
|
/**
|
|
1225
|
-
*
|
|
1226
|
-
*
|
|
1227
|
-
* `app-build`: sandbox harness that compiled + tested the generated scaffold.
|
|
1228
|
-
* `app-runtime`: a run of the generated agent against a domain scenario.
|
|
1229
|
-
* `meta`: any meta-eval (judge replay, correlation analysis).
|
|
1679
|
+
* Pure function — analyze one or more outputs against the config. Exposed
|
|
1680
|
+
* separately so consumers can build their own reporters on top.
|
|
1230
1681
|
*/
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
/**
|
|
1252
|
-
|
|
1253
|
-
/**
|
|
1254
|
-
|
|
1255
|
-
/**
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
status: RunStatus;
|
|
1260
|
-
outcome?: RunOutcome$1;
|
|
1261
|
-
budget?: BudgetSpec;
|
|
1262
|
-
/** Free-form labels for downstream grouping. */
|
|
1263
|
-
tags?: Record<string, string>;
|
|
1682
|
+
declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
|
|
1683
|
+
penaltyWeights: Record<SlopCategory, number>;
|
|
1684
|
+
}): AntiSlopReport;
|
|
1685
|
+
|
|
1686
|
+
/**
|
|
1687
|
+
* Artifact validators.
|
|
1688
|
+
*
|
|
1689
|
+
* Generic "score a produced artifact" primitive. Tax uses it for PDF form
|
|
1690
|
+
* correctness, legal for contract clauses, film for script breakdowns, GTM
|
|
1691
|
+
* for social posts. One interface, many validators; all plug into
|
|
1692
|
+
* `BenchmarkRunner` the same way.
|
|
1693
|
+
*
|
|
1694
|
+
* A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
|
|
1695
|
+
* plus a `ValidationContext` (scenario id, the turns that produced it) and
|
|
1696
|
+
* returns a `ValidationResult` with pass/fail + 0..1 score + structured
|
|
1697
|
+
* issues.
|
|
1698
|
+
*/
|
|
1699
|
+
interface Artifact {
|
|
1700
|
+
/** Logical kind — validators type-guard on this */
|
|
1701
|
+
kind: 'file' | 'json' | 'text' | 'binary' | string;
|
|
1702
|
+
/** Filesystem-style path, optional */
|
|
1703
|
+
path?: string;
|
|
1704
|
+
/** String content for text/json/file kinds */
|
|
1705
|
+
content?: string;
|
|
1706
|
+
/** Binary content (if kind === 'binary') */
|
|
1707
|
+
bytes?: Uint8Array;
|
|
1708
|
+
/** Caller-supplied metadata (mimeType, sha256, size, etc.) */
|
|
1709
|
+
metadata?: Record<string, unknown>;
|
|
1264
1710
|
}
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
name: string;
|
|
1273
|
-
startedAt: number;
|
|
1274
|
-
endedAt?: number;
|
|
1275
|
-
status?: SpanStatus;
|
|
1276
|
-
error?: string;
|
|
1277
|
-
/** Anything not covered by typed fields. Kept deliberately free-form. */
|
|
1278
|
-
attributes?: Record<string, unknown>;
|
|
1711
|
+
interface ValidationContext {
|
|
1712
|
+
scenarioId: string;
|
|
1713
|
+
turnIndex?: number;
|
|
1714
|
+
/** Prior artifacts for multi-artifact scenarios */
|
|
1715
|
+
priorArtifacts?: Artifact[];
|
|
1716
|
+
/** Free-form hints the validator uses for domain-specific checks */
|
|
1717
|
+
hints?: Record<string, unknown>;
|
|
1279
1718
|
}
|
|
1280
|
-
interface
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
images?: Array<{
|
|
1286
|
-
artifactId?: string;
|
|
1287
|
-
url?: string;
|
|
1288
|
-
mime?: string;
|
|
1289
|
-
}>;
|
|
1719
|
+
interface ValidationIssue {
|
|
1720
|
+
severity: 'error' | 'warning' | 'info';
|
|
1721
|
+
message: string;
|
|
1722
|
+
/** Optional path into the artifact (e.g. JSON path or byte offset) */
|
|
1723
|
+
locus?: string;
|
|
1290
1724
|
}
|
|
1291
|
-
interface
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
cachedTokens?: number;
|
|
1299
|
-
reasoningTokens?: number;
|
|
1300
|
-
costUsd?: number;
|
|
1301
|
-
finishReason?: string;
|
|
1725
|
+
interface ValidationResult {
|
|
1726
|
+
pass: boolean;
|
|
1727
|
+
/** 0–1 normalized score. Validators should be monotonic in pass-ness. */
|
|
1728
|
+
score: number;
|
|
1729
|
+
issues: ValidationIssue[];
|
|
1730
|
+
/** Diagnostic payload for reporters */
|
|
1731
|
+
evidence?: Record<string, unknown>;
|
|
1302
1732
|
}
|
|
1303
|
-
interface
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1733
|
+
interface ArtifactValidator {
|
|
1734
|
+
/** Stable identifier for the validator; appears in reports. */
|
|
1735
|
+
name: string;
|
|
1736
|
+
/** Optional description for human-facing reports. */
|
|
1737
|
+
description?: string;
|
|
1738
|
+
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
1739
|
+
validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
|
|
1309
1740
|
}
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1741
|
+
/**
|
|
1742
|
+
* Run every validator on the same artifact; aggregate pass as AND, score as
|
|
1743
|
+
* (weighted) mean, issues concatenated. Weights default to 1 each.
|
|
1744
|
+
*/
|
|
1745
|
+
declare function composeValidators(validators: ArtifactValidator[], options?: {
|
|
1746
|
+
name?: string;
|
|
1747
|
+
weights?: number[];
|
|
1748
|
+
}): ArtifactValidator;
|
|
1749
|
+
/** Pass if the artifact body matches a provided regex. */
|
|
1750
|
+
declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
|
|
1751
|
+
/** Pass if JSON parses and every required key is present. */
|
|
1752
|
+
declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
|
|
1753
|
+
/** Pass if min ≤ byte length ≤ max. */
|
|
1754
|
+
declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
|
|
1755
|
+
/** Pass if the artifact contains every required substring (case-insensitive by default). */
|
|
1756
|
+
declare function containsAll(name: string, required: string[], options?: {
|
|
1757
|
+
caseSensitive?: boolean;
|
|
1758
|
+
}): ArtifactValidator;
|
|
1759
|
+
|
|
1760
|
+
/**
|
|
1761
|
+
* Workspace inspector — score the persisted state of an agent after a run.
|
|
1762
|
+
*
|
|
1763
|
+
* Many evals don't ask "did the response say the right thing" but "did the
|
|
1764
|
+
* agent put the right rows in the DB / files in the vault / entities on the
|
|
1765
|
+
* canvas". This is the primitive for that.
|
|
1766
|
+
*
|
|
1767
|
+
* Implementations read from D1, KV, filesystem, or any store — the interface
|
|
1768
|
+
* is deliberately small so consumers plug in their own backends.
|
|
1769
|
+
*/
|
|
1770
|
+
interface WorkspaceSnapshot {
|
|
1771
|
+
/** Vault files: logical path → content */
|
|
1772
|
+
files: Record<string, string>;
|
|
1773
|
+
/** DB rows: table name → array of rows (post-validation) */
|
|
1774
|
+
rows: Record<string, Array<Record<string, unknown>>>;
|
|
1775
|
+
/** KV entries: key → value (scoped to whatever prefix the inspector chose) */
|
|
1776
|
+
kv: Record<string, string>;
|
|
1777
|
+
/** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
|
|
1778
|
+
blobs?: Record<string, {
|
|
1779
|
+
size: number;
|
|
1780
|
+
hash?: string;
|
|
1781
|
+
mimeType?: string;
|
|
1317
1782
|
}>;
|
|
1318
1783
|
}
|
|
1319
|
-
interface
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
/**
|
|
1323
|
-
|
|
1324
|
-
dimension: string;
|
|
1325
|
-
/** Numeric score (free-range; interpretation up to the judge). */
|
|
1326
|
-
score: number;
|
|
1327
|
-
rationale?: string;
|
|
1328
|
-
evidence?: string;
|
|
1329
|
-
}
|
|
1330
|
-
interface SandboxSpan extends SpanBase {
|
|
1331
|
-
kind: 'sandbox';
|
|
1332
|
-
image?: string;
|
|
1333
|
-
command?: string;
|
|
1334
|
-
exitCode?: number;
|
|
1335
|
-
testsTotal?: number;
|
|
1336
|
-
testsPassed?: number;
|
|
1337
|
-
stdoutHash?: string;
|
|
1338
|
-
stderrHash?: string;
|
|
1339
|
-
/** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
|
|
1340
|
-
wallMs?: number;
|
|
1784
|
+
interface InspectorContext {
|
|
1785
|
+
/** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
|
|
1786
|
+
scopeId: string;
|
|
1787
|
+
/** Optional scenario id — allows scenario-specific snapshot shaping */
|
|
1788
|
+
scenarioId?: string;
|
|
1341
1789
|
}
|
|
1342
|
-
interface
|
|
1343
|
-
|
|
1790
|
+
interface WorkspaceInspector {
|
|
1791
|
+
name: string;
|
|
1792
|
+
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
1344
1793
|
}
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
spanId?: string;
|
|
1351
|
-
kind: EventKind;
|
|
1352
|
-
timestamp: number;
|
|
1353
|
-
payload: Record<string, unknown>;
|
|
1794
|
+
declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
|
|
1795
|
+
readonly name = "in-memory";
|
|
1796
|
+
private readonly snapshots;
|
|
1797
|
+
set(scopeId: string, snapshot: WorkspaceSnapshot): void;
|
|
1798
|
+
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
1354
1799
|
}
|
|
1355
|
-
interface
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
consumed: number;
|
|
1360
|
-
remaining: number;
|
|
1361
|
-
timestamp: number;
|
|
1362
|
-
breached: boolean;
|
|
1363
|
-
/** Span that triggered this entry, if any. */
|
|
1364
|
-
spanId?: string;
|
|
1800
|
+
interface WorkspaceAssertion {
|
|
1801
|
+
name: string;
|
|
1802
|
+
description?: string;
|
|
1803
|
+
check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
|
|
1365
1804
|
}
|
|
1366
|
-
interface
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
sizeBytes: number;
|
|
1372
|
-
/** sha256 in hex. */
|
|
1373
|
-
hash: string;
|
|
1374
|
-
/** External storage URL (R2, S3, filesystem path). */
|
|
1375
|
-
storageUrl?: string;
|
|
1376
|
-
/** Inline content for small blobs — keep under ~64KB. */
|
|
1377
|
-
inlineContent?: string;
|
|
1805
|
+
interface WorkspaceAssertionResult {
|
|
1806
|
+
pass: boolean;
|
|
1807
|
+
/** 0..1 — partial credit for assertions that admit it */
|
|
1808
|
+
score: number;
|
|
1809
|
+
detail?: string;
|
|
1378
1810
|
}
|
|
1379
|
-
|
|
1380
|
-
declare
|
|
1381
|
-
declare function
|
|
1382
|
-
declare function
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1811
|
+
declare function fileExists(path: string): WorkspaceAssertion;
|
|
1812
|
+
declare function fileContains(path: string, needle: string): WorkspaceAssertion;
|
|
1813
|
+
declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
|
|
1814
|
+
declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
|
|
1815
|
+
min?: number;
|
|
1816
|
+
}): WorkspaceAssertion;
|
|
1817
|
+
/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
|
|
1818
|
+
declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
|
|
1819
|
+
pass: boolean;
|
|
1820
|
+
score: number;
|
|
1821
|
+
results: Array<{
|
|
1822
|
+
assertion: string;
|
|
1823
|
+
result: WorkspaceAssertionResult;
|
|
1824
|
+
}>;
|
|
1825
|
+
};
|
|
1386
1826
|
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1827
|
+
/**
|
|
1828
|
+
* Experiment tracker — group runs, diff them, watch scores move over time.
|
|
1829
|
+
*
|
|
1830
|
+
* Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
|
|
1831
|
+
* - A run has a config (prompt hash, model, scenario ids, seed)
|
|
1832
|
+
* - Runs belong to experiments (named groups)
|
|
1833
|
+
* - The store is pluggable (in-memory for tests, filesystem for local,
|
|
1834
|
+
* custom for Langfuse/D1)
|
|
1835
|
+
* - Diffs show score deltas, new/dropped scenarios, and config changes
|
|
1836
|
+
*
|
|
1837
|
+
* The output plugs directly into `BenchmarkReport` — runs archive the full
|
|
1838
|
+
* report, diff operates on the summary.
|
|
1839
|
+
*/
|
|
1840
|
+
|
|
1841
|
+
interface RunConfig {
|
|
1842
|
+
experimentId: string;
|
|
1843
|
+
name?: string;
|
|
1844
|
+
model?: string;
|
|
1845
|
+
promptHash?: string;
|
|
1846
|
+
promptVersion?: string;
|
|
1847
|
+
seed?: number;
|
|
1848
|
+
metadata?: Record<string, unknown>;
|
|
1401
1849
|
}
|
|
1402
|
-
interface
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
kind?: SpanKind;
|
|
1850
|
+
interface Run {
|
|
1851
|
+
id: string;
|
|
1852
|
+
experimentId: string;
|
|
1406
1853
|
name?: string;
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1854
|
+
config: RunConfig;
|
|
1855
|
+
startedAt: string;
|
|
1856
|
+
completedAt?: string;
|
|
1857
|
+
status: 'running' | 'completed' | 'failed';
|
|
1858
|
+
report?: BenchmarkReport;
|
|
1859
|
+
error?: string;
|
|
1411
1860
|
}
|
|
1412
|
-
interface
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
until?: number;
|
|
1861
|
+
interface Experiment {
|
|
1862
|
+
id: string;
|
|
1863
|
+
name: string;
|
|
1864
|
+
createdAt: string;
|
|
1865
|
+
metadata?: Record<string, unknown>;
|
|
1418
1866
|
}
|
|
1419
|
-
interface
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1427
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
1428
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1429
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1430
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1431
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1432
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
1867
|
+
interface ExperimentStore {
|
|
1868
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1869
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1870
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1871
|
+
saveRun(run: Run): Promise<void>;
|
|
1872
|
+
getRun(id: string): Promise<Run | null>;
|
|
1873
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1433
1874
|
}
|
|
1434
|
-
declare class
|
|
1435
|
-
private
|
|
1436
|
-
private
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1444
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
1445
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1446
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1447
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
1448
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1449
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1450
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1451
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1452
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
1875
|
+
declare class InMemoryExperimentStore implements ExperimentStore {
|
|
1876
|
+
private readonly experiments;
|
|
1877
|
+
private readonly runs;
|
|
1878
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1879
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1880
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1881
|
+
saveRun(run: Run): Promise<void>;
|
|
1882
|
+
getRun(id: string): Promise<Run | null>;
|
|
1883
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1453
1884
|
}
|
|
1454
|
-
|
|
1885
|
+
declare class ExperimentTracker {
|
|
1886
|
+
private readonly store;
|
|
1887
|
+
constructor(store: ExperimentStore);
|
|
1888
|
+
startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
|
|
1889
|
+
startRun(config: RunConfig): Promise<Run>;
|
|
1890
|
+
completeRun(runId: string, report: BenchmarkReport): Promise<void>;
|
|
1891
|
+
failRun(runId: string, error: string): Promise<void>;
|
|
1892
|
+
/**
|
|
1893
|
+
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
1894
|
+
* and config changes that may explain the movement.
|
|
1895
|
+
*/
|
|
1896
|
+
diff(runIdA: string, runIdB: string): Promise<RunDiff>;
|
|
1897
|
+
/** Timeline of aggregate scores for an experiment. */
|
|
1898
|
+
timeline(experimentId: string): Promise<Array<{
|
|
1899
|
+
runId: string;
|
|
1900
|
+
startedAt: string;
|
|
1901
|
+
overall: number | null;
|
|
1902
|
+
}>>;
|
|
1903
|
+
}
|
|
1904
|
+
interface RunDiff {
|
|
1905
|
+
before: {
|
|
1906
|
+
runId: string;
|
|
1907
|
+
name?: string;
|
|
1908
|
+
startedAt: string;
|
|
1909
|
+
};
|
|
1910
|
+
after: {
|
|
1911
|
+
runId: string;
|
|
1912
|
+
name?: string;
|
|
1913
|
+
startedAt: string;
|
|
1914
|
+
};
|
|
1915
|
+
aggregateDelta: number;
|
|
1916
|
+
scenarios: Array<{
|
|
1917
|
+
scenarioId: string;
|
|
1918
|
+
before: number | null;
|
|
1919
|
+
after: number | null;
|
|
1920
|
+
delta: number | null;
|
|
1921
|
+
status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
|
|
1922
|
+
}>;
|
|
1923
|
+
configChanges: Record<string, {
|
|
1924
|
+
before: unknown;
|
|
1925
|
+
after: unknown;
|
|
1926
|
+
}>;
|
|
1927
|
+
}
|
|
1928
|
+
|
|
1929
|
+
/**
|
|
1930
|
+
* FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
|
|
1931
|
+
*
|
|
1932
|
+
* Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
|
|
1933
|
+
* files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
|
|
1934
|
+
* based rollover. Writes are append-only so the file log doubles as an audit
|
|
1935
|
+
* trail of every state transition the tracker ever wrote.
|
|
1936
|
+
*
|
|
1937
|
+
* Reads lazy-load every NDJSON file in the directory (including rolled-over
|
|
1938
|
+
* archives), latest-write-wins per `id`. Subsequent writes update the
|
|
1939
|
+
* in-memory index in place so reads after writes are O(1).
|
|
1940
|
+
*
|
|
1941
|
+
* Node-only — imports `node:fs/promises`. Don't import this from a Worker;
|
|
1942
|
+
* use the in-memory store or the D1 store from `./experiment-tracker-d1`.
|
|
1943
|
+
*/
|
|
1944
|
+
|
|
1945
|
+
interface FileSystemExperimentStoreOptions {
|
|
1946
|
+
/** Directory the NDJSON files live in. Created on first write. */
|
|
1455
1947
|
dir: string;
|
|
1456
|
-
/**
|
|
1948
|
+
/** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
|
|
1457
1949
|
maxBytes?: number;
|
|
1458
1950
|
}
|
|
1459
|
-
declare class
|
|
1460
|
-
private dir;
|
|
1461
|
-
private maxBytes;
|
|
1462
|
-
/** Lazy in-memory index for queries — populated on first read. */
|
|
1951
|
+
declare class FileSystemExperimentStore implements ExperimentStore {
|
|
1952
|
+
private readonly dir;
|
|
1953
|
+
private readonly maxBytes;
|
|
1463
1954
|
private index?;
|
|
1464
1955
|
private loaded;
|
|
1465
|
-
constructor(options:
|
|
1956
|
+
constructor(options: FileSystemExperimentStoreOptions);
|
|
1957
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1958
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1959
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1960
|
+
saveRun(run: Run): Promise<void>;
|
|
1961
|
+
getRun(id: string): Promise<Run | null>;
|
|
1962
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1466
1963
|
private ensureDir;
|
|
1467
1964
|
private append;
|
|
1468
|
-
private insertInto;
|
|
1469
1965
|
private load;
|
|
1470
|
-
appendRun(run: Run): Promise<void>;
|
|
1471
|
-
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
1472
|
-
appendSpan(span: Span): Promise<void>;
|
|
1473
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1474
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
1475
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1476
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1477
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
1478
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1479
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1480
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1481
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1482
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
1483
1966
|
}
|
|
1484
1967
|
|
|
1485
1968
|
/**
|
|
1486
|
-
*
|
|
1487
|
-
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
1969
|
+
* D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
|
|
1488
1970
|
*
|
|
1489
|
-
*
|
|
1490
|
-
*
|
|
1491
|
-
*
|
|
1492
|
-
* the
|
|
1493
|
-
*
|
|
1971
|
+
* Workers-safe (uses only the `D1Database` binding the runtime injects). Two
|
|
1972
|
+
* tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
|
|
1973
|
+
* a Worker route can both write the row at run start and update it at run end
|
|
1974
|
+
* without losing the original config — the row's lifecycle mirrors the
|
|
1975
|
+
* `Run.status` field one-to-one.
|
|
1976
|
+
*
|
|
1977
|
+
* Why this lives next to `InMemoryExperimentStore`:
|
|
1978
|
+
* - bad-app, legal-agent, gtm-agent, film-agent all run as Workers
|
|
1979
|
+
* - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
|
|
1980
|
+
* - Hand-rolling D1 SQL in every consumer is exactly the duplication this
|
|
1981
|
+
* module exists to prevent
|
|
1982
|
+
*
|
|
1983
|
+
* Schema versioning: the `meta` table records `schema_version` so a future
|
|
1984
|
+
* column addition can be detected and migrated additively. Today's schema is
|
|
1985
|
+
* v1; bump only on breaking shape changes.
|
|
1494
1986
|
*/
|
|
1495
1987
|
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1988
|
+
/**
|
|
1989
|
+
* Minimal `D1Database` shape we depend on. Avoids pulling in
|
|
1990
|
+
* `@cloudflare/workers-types` as a hard dep — consumers that already have
|
|
1991
|
+
* those types installed can pass the binding directly.
|
|
1992
|
+
*/
|
|
1993
|
+
interface D1Like {
|
|
1994
|
+
prepare(query: string): D1PreparedStatementLike;
|
|
1995
|
+
batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
|
|
1996
|
+
exec(query: string): Promise<unknown>;
|
|
1500
1997
|
}
|
|
1501
|
-
interface
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1998
|
+
interface D1PreparedStatementLike {
|
|
1999
|
+
bind(...values: unknown[]): D1PreparedStatementLike;
|
|
2000
|
+
first<T = Record<string, unknown>>(): Promise<T | null>;
|
|
2001
|
+
all<T = Record<string, unknown>>(): Promise<{
|
|
2002
|
+
results: T[];
|
|
2003
|
+
}>;
|
|
2004
|
+
run(): Promise<unknown>;
|
|
2005
|
+
}
|
|
2006
|
+
interface D1ExperimentStoreOptions {
|
|
2007
|
+
/** D1 binding from `env`. */
|
|
2008
|
+
db: D1Like;
|
|
2009
|
+
/**
|
|
2010
|
+
* Optional table-name prefix so multiple ExperimentStores can share a DB
|
|
2011
|
+
* without colliding (e.g. `tax_eval_experiments` vs `legal_eval_experiments`).
|
|
2012
|
+
* Default: `agent_eval_`.
|
|
2013
|
+
*/
|
|
2014
|
+
tablePrefix?: string;
|
|
1507
2015
|
}
|
|
1508
|
-
declare class
|
|
1509
|
-
private
|
|
1510
|
-
private
|
|
1511
|
-
private
|
|
1512
|
-
private
|
|
1513
|
-
private
|
|
1514
|
-
constructor(
|
|
1515
|
-
get runId(): string;
|
|
1516
|
-
startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
|
|
1517
|
-
endRun(outcome?: RunOutcome$1): Promise<void>;
|
|
1518
|
-
abortRun(reason: string): Promise<void>;
|
|
1519
|
-
span<S extends Span = Span>(init: {
|
|
1520
|
-
kind: SpanKind;
|
|
1521
|
-
name: string;
|
|
1522
|
-
parentSpanId?: string;
|
|
1523
|
-
attributes?: Record<string, unknown>;
|
|
1524
|
-
} & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
|
|
1525
|
-
private handle;
|
|
1526
|
-
private pop;
|
|
1527
|
-
llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
|
|
1528
|
-
tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
|
|
1529
|
-
retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
|
|
1530
|
-
recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
|
|
1531
|
-
sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
|
|
1532
|
-
emit(event: {
|
|
1533
|
-
kind: EventKind;
|
|
1534
|
-
spanId?: string;
|
|
1535
|
-
payload?: Record<string, unknown>;
|
|
1536
|
-
}): Promise<TraceEvent>;
|
|
1537
|
-
recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
|
|
1538
|
-
timestamp?: number;
|
|
1539
|
-
}): Promise<BudgetLedgerEntry>;
|
|
1540
|
-
recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
|
|
2016
|
+
declare class D1ExperimentStore implements ExperimentStore {
|
|
2017
|
+
private readonly db;
|
|
2018
|
+
private readonly experimentsTable;
|
|
2019
|
+
private readonly runsTable;
|
|
2020
|
+
private readonly metaTable;
|
|
2021
|
+
private schemaReady;
|
|
2022
|
+
constructor(options: D1ExperimentStoreOptions);
|
|
1541
2023
|
/**
|
|
1542
|
-
*
|
|
1543
|
-
*
|
|
2024
|
+
* Idempotent schema setup. Safe to call before every operation; the second
|
|
2025
|
+
* call short-circuits via `schemaReady`. Most consumers will call it once
|
|
2026
|
+
* during Worker bootstrap.
|
|
1544
2027
|
*/
|
|
1545
|
-
|
|
2028
|
+
ensureSchema(): Promise<void>;
|
|
2029
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
2030
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
2031
|
+
listExperiments(): Promise<Experiment[]>;
|
|
2032
|
+
saveRun(run: Run): Promise<void>;
|
|
2033
|
+
getRun(id: string): Promise<Run | null>;
|
|
2034
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1546
2035
|
}
|
|
1547
|
-
/** Helper to build an LLM span handle args object from a provider-shaped response. */
|
|
1548
|
-
declare function llmSpanFromProvider(args: {
|
|
1549
|
-
name?: string;
|
|
1550
|
-
model: string;
|
|
1551
|
-
messages: Message[];
|
|
1552
|
-
output: string;
|
|
1553
|
-
usage?: {
|
|
1554
|
-
inputTokens?: number;
|
|
1555
|
-
outputTokens?: number;
|
|
1556
|
-
cachedTokens?: number;
|
|
1557
|
-
reasoningTokens?: number;
|
|
1558
|
-
};
|
|
1559
|
-
costUsd?: number;
|
|
1560
|
-
finishReason?: string;
|
|
1561
|
-
}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
1562
2036
|
|
|
1563
2037
|
/**
|
|
1564
2038
|
* Typed query helpers over TraceStore.
|
|
@@ -1569,7 +2043,7 @@ declare function llmSpanFromProvider(args: {
|
|
|
1569
2043
|
* tooling works out of the box.
|
|
1570
2044
|
*/
|
|
1571
2045
|
|
|
1572
|
-
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
|
|
2046
|
+
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run$1[]>;
|
|
1573
2047
|
declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
|
|
1574
2048
|
declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
|
|
1575
2049
|
declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
|
|
@@ -1585,7 +2059,7 @@ declare function aggregateLlm(spans: LlmSpan[]): {
|
|
|
1585
2059
|
costUsd: number;
|
|
1586
2060
|
};
|
|
1587
2061
|
/** Pick the outcome's failure class when present, else derive 'success' from run status. */
|
|
1588
|
-
declare function runFailureClass(run: Run): FailureClass;
|
|
2062
|
+
declare function runFailureClass(run: Run$1): FailureClass;
|
|
1589
2063
|
|
|
1590
2064
|
/**
|
|
1591
2065
|
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
@@ -1689,10 +2163,10 @@ interface OtlpExport {
|
|
|
1689
2163
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
1690
2164
|
|
|
1691
2165
|
interface RunTrace {
|
|
1692
|
-
run: Run;
|
|
2166
|
+
run: Run$1;
|
|
1693
2167
|
spans: Span[];
|
|
1694
2168
|
events: TraceEvent[];
|
|
1695
|
-
artifacts: Artifact[];
|
|
2169
|
+
artifacts: Artifact$1[];
|
|
1696
2170
|
budget: BudgetLedgerEntry[];
|
|
1697
2171
|
}
|
|
1698
2172
|
interface RunCriticOptions {
|
|
@@ -1725,55 +2199,6 @@ declare function distillPlaybook(entries: PlaybookEntry[], options?: {
|
|
|
1725
2199
|
}): Playbook;
|
|
1726
2200
|
declare function renderPlaybookMarkdown(playbook: Playbook): string;
|
|
1727
2201
|
|
|
1728
|
-
interface OptimizationExample {
|
|
1729
|
-
scenarioId: string;
|
|
1730
|
-
metadata?: Record<string, unknown>;
|
|
1731
|
-
}
|
|
1732
|
-
interface SteeringEvaluation {
|
|
1733
|
-
variant: SteeringBundle;
|
|
1734
|
-
example: OptimizationExample;
|
|
1735
|
-
trialIndex: number;
|
|
1736
|
-
}
|
|
1737
|
-
interface SteeringVariantReport {
|
|
1738
|
-
variantId: string;
|
|
1739
|
-
bundle: SteeringBundle;
|
|
1740
|
-
mean: number;
|
|
1741
|
-
ci95: {
|
|
1742
|
-
lower: number;
|
|
1743
|
-
upper: number;
|
|
1744
|
-
};
|
|
1745
|
-
scenarioScores: Record<string, {
|
|
1746
|
-
mean: number;
|
|
1747
|
-
n: number;
|
|
1748
|
-
samples: number[];
|
|
1749
|
-
}>;
|
|
1750
|
-
}
|
|
1751
|
-
interface OptimizationLoopResult {
|
|
1752
|
-
winner: SteeringBundle;
|
|
1753
|
-
significant: boolean;
|
|
1754
|
-
reports: SteeringVariantReport[];
|
|
1755
|
-
pairwise: Array<{
|
|
1756
|
-
variantA: string;
|
|
1757
|
-
variantB: string;
|
|
1758
|
-
pValue: number;
|
|
1759
|
-
qValue: number;
|
|
1760
|
-
significant: boolean;
|
|
1761
|
-
meanDelta: number;
|
|
1762
|
-
}>;
|
|
1763
|
-
}
|
|
1764
|
-
interface OptimizationLoopConfig {
|
|
1765
|
-
variants: SteeringBundle[];
|
|
1766
|
-
examples: OptimizationExample[];
|
|
1767
|
-
evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
|
|
1768
|
-
scoreWeights?: Partial<RunScoreWeights>;
|
|
1769
|
-
trialsPerScenario?: number;
|
|
1770
|
-
}
|
|
1771
|
-
declare class OptimizationLoop {
|
|
1772
|
-
private readonly optimizer;
|
|
1773
|
-
constructor(optimizer?: PromptOptimizer);
|
|
1774
|
-
run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
|
|
1775
|
-
}
|
|
1776
|
-
|
|
1777
2202
|
type SteeringOptimizerBackend = 'pairwise' | 'ax-gepa';
|
|
1778
2203
|
interface SteeringOptimizationRow {
|
|
1779
2204
|
variantId: string;
|
|
@@ -2400,6 +2825,51 @@ interface LlmReviewerConfig<State, Summary = unknown> {
|
|
|
2400
2825
|
}
|
|
2401
2826
|
declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
|
|
2402
2827
|
|
|
2828
|
+
interface ProposeReviewControlState<State, Summary = unknown> {
|
|
2829
|
+
shot: number;
|
|
2830
|
+
state: State;
|
|
2831
|
+
priorReview: Review | null;
|
|
2832
|
+
verification: Verification;
|
|
2833
|
+
traceSummary?: Summary;
|
|
2834
|
+
memory: ReviewMemoryEntry[];
|
|
2835
|
+
completed: boolean;
|
|
2836
|
+
reviewAvailable: boolean;
|
|
2837
|
+
reviewError?: string;
|
|
2838
|
+
}
|
|
2839
|
+
interface ProposeReviewControlAction {
|
|
2840
|
+
type: 'propose-review-shot';
|
|
2841
|
+
shot: number;
|
|
2842
|
+
}
|
|
2843
|
+
interface ProposeReviewControlResult<State, Summary = unknown> {
|
|
2844
|
+
state: State;
|
|
2845
|
+
verification: Verification;
|
|
2846
|
+
traceSummary?: Summary;
|
|
2847
|
+
review: Review | null;
|
|
2848
|
+
reviewAvailable: boolean;
|
|
2849
|
+
reviewError?: string;
|
|
2850
|
+
}
|
|
2851
|
+
interface ProposeReviewControlConfig<State, Summary = unknown> {
|
|
2852
|
+
goal: string;
|
|
2853
|
+
initialState: State;
|
|
2854
|
+
propose: ProposeFn<State, Summary>;
|
|
2855
|
+
verify: VerifyFn<State>;
|
|
2856
|
+
review: ReviewFn<State, Summary>;
|
|
2857
|
+
maxShots?: number;
|
|
2858
|
+
maxWallMs?: number;
|
|
2859
|
+
memory?: ReviewMemoryStore;
|
|
2860
|
+
store?: TraceStore;
|
|
2861
|
+
scenarioId?: string;
|
|
2862
|
+
projectId?: string;
|
|
2863
|
+
variantId?: string;
|
|
2864
|
+
fallbackInstruction?: string;
|
|
2865
|
+
confidenceFloor?: number;
|
|
2866
|
+
confidenceFloorWindow?: number;
|
|
2867
|
+
failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
|
|
2868
|
+
actionFailure?: ControlRuntimeConfig<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>['actionFailure'];
|
|
2869
|
+
}
|
|
2870
|
+
declare function runProposeReviewAsControlLoop<State, Summary = unknown>(config: ProposeReviewControlConfig<State, Summary>): Promise<ControlRunResult<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>>;
|
|
2871
|
+
declare function controlFailureClassFromVerification(verification: Verification): FailureClass | undefined;
|
|
2872
|
+
|
|
2403
2873
|
/**
|
|
2404
2874
|
* TestGradedScenario — a scenario whose score comes from a test suite.
|
|
2405
2875
|
*
|
|
@@ -2428,7 +2898,7 @@ interface TestGradedRunOptions {
|
|
|
2428
2898
|
variantId?: string;
|
|
2429
2899
|
driver?: SandboxDriver;
|
|
2430
2900
|
/** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */
|
|
2431
|
-
provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
2901
|
+
provenance?: Pick<Run$1, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
2432
2902
|
}
|
|
2433
2903
|
interface TestGradedRunResult {
|
|
2434
2904
|
runId: string;
|
|
@@ -2481,7 +2951,7 @@ declare class BudgetGuard {
|
|
|
2481
2951
|
*/
|
|
2482
2952
|
|
|
2483
2953
|
interface FailureContext {
|
|
2484
|
-
run: Run;
|
|
2954
|
+
run: Run$1;
|
|
2485
2955
|
spans: Span[];
|
|
2486
2956
|
events: TraceEvent[];
|
|
2487
2957
|
}
|
|
@@ -2824,7 +3294,7 @@ interface RegressionSpec {
|
|
|
2824
3294
|
metric: string;
|
|
2825
3295
|
higherIsBetter: boolean;
|
|
2826
3296
|
/** Extract a scalar from a run. Default extractors handle common metrics. */
|
|
2827
|
-
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
3297
|
+
extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
|
|
2828
3298
|
}
|
|
2829
3299
|
interface RegressionOptions extends BaselineOptions {
|
|
2830
3300
|
baseline: RunFilter;
|
|
@@ -3192,107 +3662,6 @@ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minR
|
|
|
3192
3662
|
/** Common check: a status field advanced in an expected order. */
|
|
3193
3663
|
declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
|
|
3194
3664
|
|
|
3195
|
-
/**
|
|
3196
|
-
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
3197
|
-
*
|
|
3198
|
-
* Scenarios stop being ephemeral arrays and become first-class
|
|
3199
|
-
* artifacts. Every Dataset carries:
|
|
3200
|
-
* - content hash (sha256 over canonicalized scenario array)
|
|
3201
|
-
* - provenance (contributor, createdAt, sourceUrl)
|
|
3202
|
-
* - split labels (train | dev | test | holdout)
|
|
3203
|
-
* - difficulty tiers (easy | medium | hard | extreme)
|
|
3204
|
-
* - tags (free-form, per-scenario)
|
|
3205
|
-
*
|
|
3206
|
-
* `Dataset.slice({ difficulty, split, holdout, seed })` returns a
|
|
3207
|
-
* deterministic, reproducible subset. Holdout slices are locked: you
|
|
3208
|
-
* can read them but `mutate` throws, which prevents "oh I'll just
|
|
3209
|
-
* tweak that one scenario" contamination drift.
|
|
3210
|
-
*/
|
|
3211
|
-
type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
|
|
3212
|
-
type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
|
|
3213
|
-
interface DatasetScenario {
|
|
3214
|
-
id: string;
|
|
3215
|
-
/** Arbitrary payload; the framework doesn't interpret it. */
|
|
3216
|
-
payload: unknown;
|
|
3217
|
-
split?: DatasetSplit;
|
|
3218
|
-
difficulty?: DatasetDifficulty;
|
|
3219
|
-
/** Canary token that MUST NOT round-trip through a correct agent output. */
|
|
3220
|
-
canary?: string;
|
|
3221
|
-
tags?: Record<string, string>;
|
|
3222
|
-
}
|
|
3223
|
-
interface DatasetProvenance {
|
|
3224
|
-
contributor?: string;
|
|
3225
|
-
createdAt: string;
|
|
3226
|
-
sourceUrl?: string;
|
|
3227
|
-
license?: string;
|
|
3228
|
-
description?: string;
|
|
3229
|
-
/** Monotonic human-readable version (e.g. "2026.04.20"). */
|
|
3230
|
-
version: string;
|
|
3231
|
-
}
|
|
3232
|
-
interface DatasetManifest {
|
|
3233
|
-
name: string;
|
|
3234
|
-
provenance: DatasetProvenance;
|
|
3235
|
-
/** sha256 hex over canonicalized scenarios. */
|
|
3236
|
-
contentHash: string;
|
|
3237
|
-
scenarioCount: number;
|
|
3238
|
-
splitCounts: Record<DatasetSplit, number>;
|
|
3239
|
-
}
|
|
3240
|
-
interface SliceOptions {
|
|
3241
|
-
split?: DatasetSplit;
|
|
3242
|
-
difficulty?: DatasetDifficulty;
|
|
3243
|
-
/** Number of scenarios (random sample, seeded). Omit to take all that match. */
|
|
3244
|
-
limit?: number;
|
|
3245
|
-
seed?: number;
|
|
3246
|
-
/** Predicate narrowing. Applied after split/difficulty filters. */
|
|
3247
|
-
filter?: (scenario: DatasetScenario) => boolean;
|
|
3248
|
-
/** If true, include scenarios marked as holdout. Default false. */
|
|
3249
|
-
includeHoldout?: boolean;
|
|
3250
|
-
}
|
|
3251
|
-
/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
|
|
3252
|
-
declare class HoldoutLockedError extends Error {
|
|
3253
|
-
constructor(datasetName: string);
|
|
3254
|
-
}
|
|
3255
|
-
declare class Dataset {
|
|
3256
|
-
readonly name: string;
|
|
3257
|
-
readonly provenance: DatasetProvenance;
|
|
3258
|
-
private scenarios;
|
|
3259
|
-
private locked;
|
|
3260
|
-
constructor(init: {
|
|
3261
|
-
name: string;
|
|
3262
|
-
provenance: DatasetProvenance;
|
|
3263
|
-
scenarios: DatasetScenario[];
|
|
3264
|
-
locked?: boolean;
|
|
3265
|
-
});
|
|
3266
|
-
/** All scenarios. Readonly — callers must go through `slice` or `clone`. */
|
|
3267
|
-
all(): readonly DatasetScenario[];
|
|
3268
|
-
get size(): number;
|
|
3269
|
-
/**
|
|
3270
|
-
* Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
|
|
3271
|
-
* the same arguments always produce the same slice across machines.
|
|
3272
|
-
*/
|
|
3273
|
-
slice(options?: SliceOptions): DatasetScenario[];
|
|
3274
|
-
/**
|
|
3275
|
-
* Assemble the manifest (name + provenance + content hash + counts).
|
|
3276
|
-
* Content hash is deterministic over canonicalized scenarios.
|
|
3277
|
-
*/
|
|
3278
|
-
manifest(): Promise<DatasetManifest>;
|
|
3279
|
-
/** Fresh unlocked copy — for post-release forks when mutation is needed. */
|
|
3280
|
-
clone(overrides?: Partial<{
|
|
3281
|
-
name: string;
|
|
3282
|
-
version: string;
|
|
3283
|
-
}>): Dataset;
|
|
3284
|
-
lock(): void;
|
|
3285
|
-
add(scenario: DatasetScenario): void;
|
|
3286
|
-
remove(scenarioId: string): void;
|
|
3287
|
-
/**
|
|
3288
|
-
* Stable JSON-Lines serialization — deterministic byte-for-byte.
|
|
3289
|
-
* Write to disk for contamination-verifiable archives.
|
|
3290
|
-
*/
|
|
3291
|
-
toJsonl(): string;
|
|
3292
|
-
static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
|
|
3293
|
-
}
|
|
3294
|
-
declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
|
|
3295
|
-
|
|
3296
3665
|
/**
|
|
3297
3666
|
* ContaminationGuard — ensures held-out scenarios don't leak into
|
|
3298
3667
|
* training/prompt paths, and flags model memorization.
|
|
@@ -3608,7 +3977,7 @@ interface ContractMetric {
|
|
|
3608
3977
|
/** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
|
|
3609
3978
|
maxRegression?: number;
|
|
3610
3979
|
/** Optional extractor if the metric isn't in the default set. */
|
|
3611
|
-
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
3980
|
+
extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
|
|
3612
3981
|
}
|
|
3613
3982
|
interface ThresholdContract {
|
|
3614
3983
|
name: string;
|
|
@@ -3874,10 +4243,10 @@ declare class BuilderSession {
|
|
|
3874
4243
|
*/
|
|
3875
4244
|
declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
|
|
3876
4245
|
projectId: string;
|
|
3877
|
-
chatRuns: Run[];
|
|
3878
|
-
lastBuilderRun?: Run;
|
|
3879
|
-
lastBuildRun?: Run;
|
|
3880
|
-
lastAppRuntimeRuns: Run[];
|
|
4246
|
+
chatRuns: Run$1[];
|
|
4247
|
+
lastBuilderRun?: Run$1;
|
|
4248
|
+
lastBuildRun?: Run$1;
|
|
4249
|
+
lastAppRuntimeRuns: Run$1[];
|
|
3881
4250
|
}>;
|
|
3882
4251
|
|
|
3883
4252
|
/**
|
|
@@ -3997,8 +4366,8 @@ interface ChatSummary {
|
|
|
3997
4366
|
builderRunId: string;
|
|
3998
4367
|
startedAt: number;
|
|
3999
4368
|
endedAt?: number;
|
|
4000
|
-
status: Run['status'];
|
|
4001
|
-
outcome?: Run['outcome'];
|
|
4369
|
+
status: Run$1['status'];
|
|
4370
|
+
outcome?: Run$1['outcome'];
|
|
4002
4371
|
/** Counts of spans emitted during the chat. */
|
|
4003
4372
|
llmTurns?: number;
|
|
4004
4373
|
toolCalls?: number;
|
|
@@ -4006,7 +4375,7 @@ interface ChatSummary {
|
|
|
4006
4375
|
appRuntimeRunIds: string[];
|
|
4007
4376
|
}
|
|
4008
4377
|
interface ProjectTimelineEntry {
|
|
4009
|
-
run: Run;
|
|
4378
|
+
run: Run$1;
|
|
4010
4379
|
layerBucket: 'chat' | 'build' | 'runtime' | 'other';
|
|
4011
4380
|
}
|
|
4012
4381
|
declare class ProjectRegistry {
|
|
@@ -4093,7 +4462,7 @@ declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
|
4093
4462
|
interface EvalMetricSpec {
|
|
4094
4463
|
id: string;
|
|
4095
4464
|
/** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
|
|
4096
|
-
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
4465
|
+
extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
|
|
4097
4466
|
}
|
|
4098
4467
|
interface OutcomePair {
|
|
4099
4468
|
evalMetric: string;
|
|
@@ -7976,10 +8345,6 @@ interface ReflectionProposal {
|
|
|
7976
8345
|
rationale: string;
|
|
7977
8346
|
payload: unknown;
|
|
7978
8347
|
}
|
|
7979
|
-
/**
|
|
7980
|
-
* Parse the model's JSON response back into proposals. Tolerates markdown
|
|
7981
|
-
* fences and surrounding prose. Returns at most `maxProposals`.
|
|
7982
|
-
*/
|
|
7983
8348
|
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
7984
8349
|
|
|
7985
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|
|
8350
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
|