@tangle-network/agent-eval 0.17.1 → 0.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -1
- package/dist/index.d.ts +1493 -1088
- package/dist/index.js +1487 -187
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -513,1052 +513,1562 @@ declare function formatDriverReport(results: DriverResult[]): string;
|
|
|
513
513
|
declare function printDriverSummary(results: DriverResult[]): void;
|
|
514
514
|
|
|
515
515
|
/**
|
|
516
|
-
*
|
|
517
|
-
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
518
|
-
* already use inverted scoring in the prompt (10 = no hallucination),
|
|
519
|
-
* but this function ensures consistency if raw scores leak through.
|
|
520
|
-
*/
|
|
521
|
-
declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
|
|
522
|
-
/** Weighted mean — falls back to uniform weights when omitted */
|
|
523
|
-
declare function weightedMean(scores: {
|
|
524
|
-
score: number;
|
|
525
|
-
weight?: number;
|
|
526
|
-
}[]): number;
|
|
527
|
-
/** Bootstrap confidence interval */
|
|
528
|
-
declare function confidenceInterval(scores: number[], confidence?: number): {
|
|
529
|
-
mean: number;
|
|
530
|
-
lower: number;
|
|
531
|
-
upper: number;
|
|
532
|
-
};
|
|
533
|
-
/**
|
|
534
|
-
* Inter-rater reliability — simplified Krippendorff's alpha.
|
|
516
|
+
* TraceSchema v1 — the canonical data model for agent-eval.
|
|
535
517
|
*
|
|
536
|
-
*
|
|
537
|
-
*
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
*
|
|
542
|
-
*
|
|
543
|
-
*/
|
|
544
|
-
declare function mannWhitneyU(a: number[], b: number[]): {
|
|
545
|
-
u: number;
|
|
546
|
-
p: number;
|
|
547
|
-
};
|
|
548
|
-
/** Partial credit: returns 0-1 ratio of current toward target */
|
|
549
|
-
declare function partialCredit(current: number, target: number): number;
|
|
550
|
-
/**
|
|
551
|
-
* Paired t-test — before/after measurements on the SAME items.
|
|
552
|
-
* Pairing removes inter-item variance, giving tighter significance than
|
|
553
|
-
* an unpaired test when comparing prompt v1 vs prompt v2 on identical
|
|
554
|
-
* scenarios.
|
|
555
|
-
*/
|
|
556
|
-
declare function pairedTTest(before: number[], after: number[]): {
|
|
557
|
-
t: number;
|
|
558
|
-
df: number;
|
|
559
|
-
p: number;
|
|
560
|
-
};
|
|
561
|
-
/**
|
|
562
|
-
* Wilcoxon signed-rank test — paired non-parametric alternative.
|
|
563
|
-
* Use when the differences aren't normally distributed.
|
|
518
|
+
* Every score, every failure class, every pipeline in the framework is
|
|
519
|
+
* a view over this data. Shape it once, live with it.
|
|
520
|
+
*
|
|
521
|
+
* Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
|
|
522
|
+
* but extended with agent-specific span kinds (llm, tool, retrieval,
|
|
523
|
+
* judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
|
|
524
|
+
* entities that OTEL leaves as free-form attributes.
|
|
564
525
|
*/
|
|
565
|
-
declare
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
526
|
+
declare const TRACE_SCHEMA_VERSION = "1.0.0";
|
|
527
|
+
type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
|
|
528
|
+
interface BudgetSpec {
|
|
529
|
+
tokens?: number;
|
|
530
|
+
wallMs?: number;
|
|
531
|
+
calls?: number;
|
|
532
|
+
usd?: number;
|
|
533
|
+
}
|
|
534
|
+
interface RunOutcome$1 {
|
|
535
|
+
score?: number;
|
|
536
|
+
pass?: boolean;
|
|
537
|
+
failureClass?: FailureClass;
|
|
538
|
+
notes?: string;
|
|
539
|
+
}
|
|
569
540
|
/**
|
|
570
|
-
*
|
|
571
|
-
*
|
|
572
|
-
*
|
|
541
|
+
* Layer — optional classification in a nested build workflow.
|
|
542
|
+
* `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
|
|
543
|
+
* `app-build`: sandbox harness that compiled + tested the generated scaffold.
|
|
544
|
+
* `app-runtime`: a run of the generated agent against a domain scenario.
|
|
545
|
+
* `meta`: any meta-eval (judge replay, correlation analysis).
|
|
573
546
|
*/
|
|
574
|
-
|
|
547
|
+
type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
|
|
548
|
+
interface Run$1 {
|
|
549
|
+
runId: string;
|
|
550
|
+
scenarioId: string;
|
|
551
|
+
variantId?: string;
|
|
552
|
+
datasetVersion?: string;
|
|
553
|
+
/** Git SHA of agent code at run time. */
|
|
554
|
+
codeSha?: string;
|
|
555
|
+
/** Hash of the prompt template + any system prompt. */
|
|
556
|
+
promptSha?: string;
|
|
557
|
+
/** Model id + date + system-prompt hash, concatenated. */
|
|
558
|
+
modelFingerprint?: string;
|
|
559
|
+
seed?: number;
|
|
560
|
+
/** Arbitrary environment markers (shell, docker version, tz). */
|
|
561
|
+
envFingerprint?: Record<string, string>;
|
|
562
|
+
/** Version of the redaction rules applied to this run. */
|
|
563
|
+
redactionVersion?: string;
|
|
564
|
+
/** Parent run in a nested build workflow. A builder run's children are
|
|
565
|
+
* app-build runs; those children are app-runtime runs. */
|
|
566
|
+
parentRunId?: string;
|
|
567
|
+
/** Stable project identifier — groups runs across chats + sessions. */
|
|
568
|
+
projectId?: string;
|
|
569
|
+
/** Chat/conversation identifier within a project. */
|
|
570
|
+
chatId?: string;
|
|
571
|
+
/** Layer classification — hint for aggregation; not enforced. */
|
|
572
|
+
layer?: RunLayer;
|
|
573
|
+
startedAt: number;
|
|
574
|
+
endedAt?: number;
|
|
575
|
+
status: RunStatus;
|
|
576
|
+
outcome?: RunOutcome$1;
|
|
577
|
+
budget?: BudgetSpec;
|
|
578
|
+
/** Free-form labels for downstream grouping. */
|
|
579
|
+
tags?: Record<string, string>;
|
|
580
|
+
}
|
|
581
|
+
type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
|
|
582
|
+
type SpanStatus = 'ok' | 'error';
|
|
583
|
+
interface SpanBase {
|
|
584
|
+
spanId: string;
|
|
585
|
+
parentSpanId?: string;
|
|
586
|
+
runId: string;
|
|
587
|
+
kind: SpanKind;
|
|
588
|
+
name: string;
|
|
589
|
+
startedAt: number;
|
|
590
|
+
endedAt?: number;
|
|
591
|
+
status?: SpanStatus;
|
|
592
|
+
error?: string;
|
|
593
|
+
/** Anything not covered by typed fields. Kept deliberately free-form. */
|
|
594
|
+
attributes?: Record<string, unknown>;
|
|
595
|
+
}
|
|
596
|
+
interface Message {
|
|
597
|
+
role: 'system' | 'user' | 'assistant' | 'tool';
|
|
598
|
+
content: string;
|
|
599
|
+
tokens?: number;
|
|
600
|
+
/** Multi-modal content descriptors; blobs themselves live in Artifacts. */
|
|
601
|
+
images?: Array<{
|
|
602
|
+
artifactId?: string;
|
|
603
|
+
url?: string;
|
|
604
|
+
mime?: string;
|
|
605
|
+
}>;
|
|
606
|
+
}
|
|
607
|
+
interface LlmSpan extends SpanBase {
|
|
608
|
+
kind: 'llm';
|
|
609
|
+
model: string;
|
|
610
|
+
messages: Message[];
|
|
611
|
+
output?: string;
|
|
612
|
+
inputTokens?: number;
|
|
613
|
+
outputTokens?: number;
|
|
614
|
+
cachedTokens?: number;
|
|
615
|
+
reasoningTokens?: number;
|
|
616
|
+
costUsd?: number;
|
|
617
|
+
finishReason?: string;
|
|
618
|
+
}
|
|
619
|
+
interface ToolSpan extends SpanBase {
|
|
620
|
+
kind: 'tool';
|
|
621
|
+
toolName: string;
|
|
622
|
+
args: unknown;
|
|
623
|
+
result?: unknown;
|
|
624
|
+
latencyMs?: number;
|
|
625
|
+
}
|
|
626
|
+
interface RetrievalSpan extends SpanBase {
|
|
627
|
+
kind: 'retrieval';
|
|
628
|
+
query: string;
|
|
629
|
+
hits: Array<{
|
|
630
|
+
docId: string;
|
|
631
|
+
score: number;
|
|
632
|
+
content?: string;
|
|
633
|
+
}>;
|
|
634
|
+
}
|
|
635
|
+
interface JudgeSpan extends SpanBase {
|
|
636
|
+
kind: 'judge';
|
|
637
|
+
judgeId: string;
|
|
638
|
+
/** Span this judgment applies to. */
|
|
639
|
+
targetSpanId: string;
|
|
640
|
+
dimension: string;
|
|
641
|
+
/** Numeric score (free-range; interpretation up to the judge). */
|
|
642
|
+
score: number;
|
|
643
|
+
rationale?: string;
|
|
644
|
+
evidence?: string;
|
|
645
|
+
}
|
|
646
|
+
interface SandboxSpan extends SpanBase {
|
|
647
|
+
kind: 'sandbox';
|
|
648
|
+
image?: string;
|
|
649
|
+
command?: string;
|
|
650
|
+
exitCode?: number;
|
|
651
|
+
testsTotal?: number;
|
|
652
|
+
testsPassed?: number;
|
|
653
|
+
stdoutHash?: string;
|
|
654
|
+
stderrHash?: string;
|
|
655
|
+
/** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
|
|
656
|
+
wallMs?: number;
|
|
657
|
+
}
|
|
658
|
+
interface GenericSpan extends SpanBase {
|
|
659
|
+
kind: 'agent' | 'custom';
|
|
660
|
+
}
|
|
661
|
+
type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
|
|
662
|
+
type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
|
|
663
|
+
interface TraceEvent {
|
|
664
|
+
eventId: string;
|
|
665
|
+
runId: string;
|
|
666
|
+
spanId?: string;
|
|
667
|
+
kind: EventKind;
|
|
668
|
+
timestamp: number;
|
|
669
|
+
payload: Record<string, unknown>;
|
|
670
|
+
}
|
|
671
|
+
interface BudgetLedgerEntry {
|
|
672
|
+
runId: string;
|
|
673
|
+
dimension: keyof BudgetSpec;
|
|
674
|
+
limit: number;
|
|
675
|
+
consumed: number;
|
|
676
|
+
remaining: number;
|
|
677
|
+
timestamp: number;
|
|
678
|
+
breached: boolean;
|
|
679
|
+
/** Span that triggered this entry, if any. */
|
|
680
|
+
spanId?: string;
|
|
681
|
+
}
|
|
682
|
+
interface Artifact$1 {
|
|
683
|
+
artifactId: string;
|
|
684
|
+
runId: string;
|
|
685
|
+
spanId?: string;
|
|
686
|
+
contentType: string;
|
|
687
|
+
sizeBytes: number;
|
|
688
|
+
/** sha256 in hex. */
|
|
689
|
+
hash: string;
|
|
690
|
+
/** External storage URL (R2, S3, filesystem path). */
|
|
691
|
+
storageUrl?: string;
|
|
692
|
+
/** Inline content for small blobs — keep under ~64KB. */
|
|
693
|
+
inlineContent?: string;
|
|
694
|
+
}
|
|
695
|
+
type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
|
|
696
|
+
declare const FAILURE_CLASSES: readonly FailureClass[];
|
|
697
|
+
declare function isLlmSpan(s: Span): s is LlmSpan;
|
|
698
|
+
declare function isToolSpan(s: Span): s is ToolSpan;
|
|
699
|
+
declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
|
|
700
|
+
declare function isJudgeSpan(s: Span): s is JudgeSpan;
|
|
701
|
+
declare function isSandboxSpan(s: Span): s is SandboxSpan;
|
|
575
702
|
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
constructor(criteria: CompletionCriterion[]);
|
|
586
|
-
/** Evaluate criteria against current state, record result */
|
|
587
|
-
record(turn: number, state: DriverState): {
|
|
588
|
-
completionPercent: number;
|
|
589
|
-
complete: boolean;
|
|
590
|
-
criteriaStatus: Record<string, boolean | number>;
|
|
703
|
+
interface RunFilter {
|
|
704
|
+
scenarioId?: string;
|
|
705
|
+
variantId?: string;
|
|
706
|
+
status?: RunStatus;
|
|
707
|
+
since?: number;
|
|
708
|
+
until?: number;
|
|
709
|
+
tag?: {
|
|
710
|
+
key: string;
|
|
711
|
+
value: string;
|
|
591
712
|
};
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
713
|
+
parentRunId?: string;
|
|
714
|
+
projectId?: string;
|
|
715
|
+
chatId?: string;
|
|
716
|
+
layer?: RunLayer;
|
|
717
|
+
}
|
|
718
|
+
interface SpanFilter {
|
|
719
|
+
runId?: string;
|
|
720
|
+
parentSpanId?: string;
|
|
721
|
+
kind?: SpanKind;
|
|
722
|
+
name?: string;
|
|
723
|
+
toolName?: string;
|
|
724
|
+
judgeId?: string;
|
|
725
|
+
since?: number;
|
|
726
|
+
until?: number;
|
|
727
|
+
}
|
|
728
|
+
interface EventFilter {
|
|
729
|
+
runId?: string;
|
|
730
|
+
spanId?: string;
|
|
731
|
+
kind?: EventKind;
|
|
732
|
+
since?: number;
|
|
733
|
+
until?: number;
|
|
734
|
+
}
|
|
735
|
+
interface TraceStore {
|
|
736
|
+
appendRun(run: Run$1): Promise<void>;
|
|
737
|
+
updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
|
|
738
|
+
appendSpan(span: Span): Promise<void>;
|
|
739
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
740
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
741
|
+
appendArtifact(artifact: Artifact$1): Promise<void>;
|
|
742
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
743
|
+
getRun(runId: string): Promise<Run$1 | undefined>;
|
|
744
|
+
listRuns(filter?: RunFilter): Promise<Run$1[]>;
|
|
745
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
746
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
747
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
748
|
+
artifacts(runId: string): Promise<Artifact$1[]>;
|
|
749
|
+
}
|
|
750
|
+
declare class InMemoryTraceStore implements TraceStore {
|
|
751
|
+
private runs;
|
|
752
|
+
private allSpans;
|
|
753
|
+
private allEvents;
|
|
754
|
+
private allArtifacts;
|
|
755
|
+
private allBudget;
|
|
756
|
+
appendRun(run: Run$1): Promise<void>;
|
|
757
|
+
updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
|
|
758
|
+
appendSpan(span: Span): Promise<void>;
|
|
759
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
760
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
761
|
+
appendArtifact(artifact: Artifact$1): Promise<void>;
|
|
762
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
763
|
+
getRun(runId: string): Promise<Run$1 | undefined>;
|
|
764
|
+
listRuns(filter?: RunFilter): Promise<Run$1[]>;
|
|
765
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
766
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
767
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
768
|
+
artifacts(runId: string): Promise<Artifact$1[]>;
|
|
769
|
+
}
|
|
770
|
+
interface FileSystemTraceStoreOptions {
|
|
771
|
+
dir: string;
|
|
772
|
+
/** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
|
|
773
|
+
maxBytes?: number;
|
|
774
|
+
}
|
|
775
|
+
declare class FileSystemTraceStore implements TraceStore {
|
|
776
|
+
private dir;
|
|
777
|
+
private maxBytes;
|
|
778
|
+
/** Lazy in-memory index for queries — populated on first read. */
|
|
779
|
+
private index?;
|
|
780
|
+
private loaded;
|
|
781
|
+
constructor(options: FileSystemTraceStoreOptions);
|
|
782
|
+
private ensureDir;
|
|
783
|
+
private append;
|
|
784
|
+
private insertInto;
|
|
785
|
+
private load;
|
|
786
|
+
appendRun(run: Run$1): Promise<void>;
|
|
787
|
+
updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
|
|
788
|
+
appendSpan(span: Span): Promise<void>;
|
|
789
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
790
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
791
|
+
appendArtifact(artifact: Artifact$1): Promise<void>;
|
|
792
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
793
|
+
getRun(runId: string): Promise<Run$1 | undefined>;
|
|
794
|
+
listRuns(filter?: RunFilter): Promise<Run$1[]>;
|
|
795
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
796
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
797
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
798
|
+
artifacts(runId: string): Promise<Artifact$1[]>;
|
|
602
799
|
}
|
|
603
800
|
|
|
604
801
|
/**
|
|
605
|
-
*
|
|
606
|
-
*
|
|
607
|
-
* Every prompt used in an eval run is registered with an explicit version.
|
|
608
|
-
* Reports include the content hash so A/B compares are rigorous: if the
|
|
609
|
-
* hash changes between two reports, the prompt actually changed; if it
|
|
610
|
-
* matches, the variance is elsewhere.
|
|
802
|
+
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
803
|
+
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
611
804
|
*
|
|
612
|
-
*
|
|
613
|
-
*
|
|
805
|
+
* Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
|
|
806
|
+
* return a `SpanHandle` with `.end()` / `.fail()` so callers don't
|
|
807
|
+
* have to thread spanIds manually. For async workflows that can't use
|
|
808
|
+
* the stack (e.g. fan-out parallel calls), pass `parentSpanId`
|
|
809
|
+
* explicitly.
|
|
614
810
|
*/
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
/** SHA-256 of content, 12-hex-char prefix */
|
|
621
|
-
hash: string;
|
|
622
|
-
/** Full prompt body */
|
|
623
|
-
content: string;
|
|
811
|
+
|
|
812
|
+
interface SpanHandle<S extends Span = Span> {
|
|
813
|
+
span: S;
|
|
814
|
+
end(patch?: Partial<S>): Promise<void>;
|
|
815
|
+
fail(error: string | Error, patch?: Partial<S>): Promise<void>;
|
|
624
816
|
}
|
|
625
|
-
|
|
626
|
-
|
|
817
|
+
interface TraceEmitterOptions {
|
|
818
|
+
runId?: string;
|
|
819
|
+
/** Inject a clock for deterministic tests. */
|
|
820
|
+
now?: () => number;
|
|
821
|
+
/** Inject an id generator for deterministic tests. */
|
|
822
|
+
id?: () => string;
|
|
823
|
+
}
|
|
824
|
+
declare class TraceEmitter {
|
|
825
|
+
private store;
|
|
826
|
+
private stack;
|
|
827
|
+
private _runId;
|
|
828
|
+
private now;
|
|
829
|
+
private id;
|
|
830
|
+
constructor(store: TraceStore, options?: TraceEmitterOptions);
|
|
831
|
+
get runId(): string;
|
|
832
|
+
startRun(run: Omit<Run$1, 'runId' | 'startedAt' | 'status'>): Promise<Run$1>;
|
|
833
|
+
endRun(outcome?: RunOutcome$1): Promise<void>;
|
|
834
|
+
abortRun(reason: string): Promise<void>;
|
|
835
|
+
span<S extends Span = Span>(init: {
|
|
836
|
+
kind: SpanKind;
|
|
837
|
+
name: string;
|
|
838
|
+
parentSpanId?: string;
|
|
839
|
+
attributes?: Record<string, unknown>;
|
|
840
|
+
} & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
|
|
841
|
+
private handle;
|
|
842
|
+
private pop;
|
|
843
|
+
llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
|
|
844
|
+
tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
|
|
845
|
+
retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
|
|
846
|
+
recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
|
|
847
|
+
sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
|
|
848
|
+
emit(event: {
|
|
849
|
+
kind: EventKind;
|
|
850
|
+
spanId?: string;
|
|
851
|
+
payload?: Record<string, unknown>;
|
|
852
|
+
}): Promise<TraceEvent>;
|
|
853
|
+
recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
|
|
854
|
+
timestamp?: number;
|
|
855
|
+
}): Promise<BudgetLedgerEntry>;
|
|
856
|
+
recordArtifact(artifact: Omit<Artifact$1, 'artifactId' | 'runId'>): Promise<Artifact$1>;
|
|
627
857
|
/**
|
|
628
|
-
*
|
|
629
|
-
*
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
register(id: string, version: string, content: string): Promise<PromptHandle>;
|
|
633
|
-
/** Look up a registered prompt. Throws if unknown — no implicit defaults. */
|
|
634
|
-
get(id: string, version: string): PromptHandle;
|
|
635
|
-
/** Return all versions of an id, newest-first (lex-descending on version). */
|
|
636
|
-
listVersions(id: string): PromptHandle[];
|
|
637
|
-
/** Snapshot the whole registry — useful for including in reports. */
|
|
638
|
-
list(): PromptHandle[];
|
|
639
|
-
/** Verify a hash against registered content. Returns null if not found. */
|
|
640
|
-
verifyHash(id: string, version: string, expectedHash: string): boolean | null;
|
|
858
|
+
* Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
|
|
859
|
+
* Returns the fn's return value. Use this for the 95% case.
|
|
860
|
+
*/
|
|
861
|
+
within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
|
|
641
862
|
}
|
|
642
|
-
/**
|
|
643
|
-
declare function
|
|
863
|
+
/** Helper to build an LLM span handle args object from a provider-shaped response. */
|
|
864
|
+
declare function llmSpanFromProvider(args: {
|
|
865
|
+
name?: string;
|
|
866
|
+
model: string;
|
|
867
|
+
messages: Message[];
|
|
868
|
+
output: string;
|
|
869
|
+
usage?: {
|
|
870
|
+
inputTokens?: number;
|
|
871
|
+
outputTokens?: number;
|
|
872
|
+
cachedTokens?: number;
|
|
873
|
+
reasoningTokens?: number;
|
|
874
|
+
};
|
|
875
|
+
costUsd?: number;
|
|
876
|
+
finishReason?: string;
|
|
877
|
+
}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
644
878
|
|
|
645
879
|
/**
|
|
646
|
-
*
|
|
880
|
+
* Policy-based agent control runtime.
|
|
647
881
|
*
|
|
648
|
-
*
|
|
649
|
-
* 80% of AI slop that every production agent leaks:
|
|
650
|
-
* - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
|
|
651
|
-
* - N-gram repetition (same phrase over and over)
|
|
652
|
-
* - Hedging overuse ("I could be wrong, but...")
|
|
653
|
-
* - Apology padding ("I'm so sorry for the confusion...")
|
|
654
|
-
* - Unused opening formulas ("Great question!")
|
|
655
|
-
* - Length bounds (too short to be useful, too long to be read)
|
|
882
|
+
* This is the minimal reusable loop behind driver-agent patterns:
|
|
656
883
|
*
|
|
657
|
-
*
|
|
658
|
-
*
|
|
884
|
+
* observe state -> validate -> decide next action -> act -> observe -> ...
|
|
885
|
+
*
|
|
886
|
+
* It deliberately does not model named "topologies". Direct execution,
|
|
887
|
+
* critic/revise, driver intervention, specialist calls, and human escalation
|
|
888
|
+
* are all just actions chosen by the control policy.
|
|
659
889
|
*/
|
|
660
890
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
/**
|
|
665
|
-
|
|
666
|
-
/**
|
|
667
|
-
|
|
668
|
-
/**
|
|
669
|
-
|
|
670
|
-
/**
|
|
671
|
-
|
|
672
|
-
/**
|
|
673
|
-
|
|
674
|
-
/**
|
|
675
|
-
|
|
676
|
-
/**
|
|
677
|
-
|
|
678
|
-
/** How heavily each violation class reduces the score (default 1). */
|
|
679
|
-
penaltyWeights?: Partial<Record<SlopCategory, number>>;
|
|
891
|
+
type ControlSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
892
|
+
type ControlActionFailureMode = 'continue' | 'stop';
|
|
893
|
+
interface ControlEvalResult {
|
|
894
|
+
/** Stable validator or judge id. */
|
|
895
|
+
id: string;
|
|
896
|
+
/** Whether this check passed. */
|
|
897
|
+
passed: boolean;
|
|
898
|
+
/** Optional normalized score. 1 = best, 0 = worst. */
|
|
899
|
+
score?: number;
|
|
900
|
+
/** Objective validators should usually be "error" or "critical" when failed. */
|
|
901
|
+
severity?: ControlSeverity;
|
|
902
|
+
/** Human-readable result. */
|
|
903
|
+
detail?: string;
|
|
904
|
+
/** Small evidence string or pointer. Avoid large payloads. */
|
|
905
|
+
evidence?: string;
|
|
906
|
+
/** True when the result came from deterministic state, not LLM judgment. */
|
|
907
|
+
objective?: boolean;
|
|
680
908
|
}
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
category: SlopCategory;
|
|
686
|
-
detail: string;
|
|
687
|
-
example?: string;
|
|
909
|
+
interface ControlBudget {
|
|
910
|
+
maxSteps: number;
|
|
911
|
+
maxWallMs?: number;
|
|
912
|
+
maxCostUsd?: number;
|
|
688
913
|
}
|
|
689
|
-
interface
|
|
690
|
-
/**
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
914
|
+
interface ControlStopPolicies<TState, TAction> {
|
|
915
|
+
/**
|
|
916
|
+
* Stop after N consecutive steps with no state fingerprint change and
|
|
917
|
+
* less than `minScoreDelta` score movement. Disabled when omitted.
|
|
918
|
+
*/
|
|
919
|
+
maxNoProgressSteps?: number;
|
|
920
|
+
/**
|
|
921
|
+
* Stop after the same action fingerprint is selected N consecutive
|
|
922
|
+
* times. Disabled when omitted.
|
|
923
|
+
*/
|
|
924
|
+
maxRepeatedActions?: number;
|
|
925
|
+
/** Minimum score movement that counts as progress. Default 0.001. */
|
|
926
|
+
minScoreDelta?: number;
|
|
927
|
+
/** Override the default JSON/string fingerprint for state comparisons. */
|
|
928
|
+
stateFingerprint?: (state: TState) => string;
|
|
929
|
+
/** Override the default JSON/string fingerprint for repeated-action checks. */
|
|
930
|
+
actionFingerprint?: (action: TAction) => string;
|
|
931
|
+
}
|
|
932
|
+
interface ControlContext<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
933
|
+
intent: string;
|
|
934
|
+
state: TState;
|
|
935
|
+
evals: TEval[];
|
|
936
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
937
|
+
budget: ControlBudget;
|
|
938
|
+
stepIndex: number;
|
|
939
|
+
wallMs: number;
|
|
940
|
+
spentCostUsd: number;
|
|
941
|
+
remainingCostUsd?: number;
|
|
942
|
+
abortSignal: AbortSignal;
|
|
943
|
+
emitter?: TraceEmitter;
|
|
695
944
|
}
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
945
|
+
type ControlDecision<TAction> = {
|
|
946
|
+
type: 'continue';
|
|
947
|
+
action: TAction;
|
|
948
|
+
reason?: string;
|
|
949
|
+
} | {
|
|
950
|
+
type: 'stop';
|
|
951
|
+
reason: string;
|
|
952
|
+
pass?: boolean;
|
|
953
|
+
score?: number;
|
|
954
|
+
};
|
|
955
|
+
interface StopDecision {
|
|
956
|
+
stop: boolean;
|
|
957
|
+
pass: boolean;
|
|
958
|
+
reason: string;
|
|
959
|
+
score?: number;
|
|
960
|
+
failureClass?: FailureClass;
|
|
961
|
+
}
|
|
962
|
+
interface ControlActionOutcome<TActionResult> {
|
|
963
|
+
ok: boolean;
|
|
964
|
+
result?: TActionResult;
|
|
965
|
+
error?: string;
|
|
966
|
+
costUsd?: number;
|
|
967
|
+
durationMs: number;
|
|
968
|
+
}
|
|
969
|
+
interface ControlRuntimeError {
|
|
970
|
+
phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace';
|
|
971
|
+
stepIndex: number;
|
|
972
|
+
message: string;
|
|
973
|
+
}
|
|
974
|
+
interface ControlStep<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
975
|
+
index: number;
|
|
976
|
+
decision: ControlDecision<TAction>;
|
|
977
|
+
beforeState: TState;
|
|
978
|
+
afterState: TState;
|
|
979
|
+
evalsBefore: TEval[];
|
|
980
|
+
evalsAfter: TEval[];
|
|
981
|
+
actionOutcome?: ControlActionOutcome<TActionResult>;
|
|
982
|
+
startedAt: string;
|
|
983
|
+
endedAt: string;
|
|
984
|
+
}
|
|
985
|
+
interface ControlRunResult<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
986
|
+
intent: string;
|
|
987
|
+
pass: boolean;
|
|
988
|
+
completed: boolean;
|
|
989
|
+
reason: string;
|
|
990
|
+
score?: number;
|
|
991
|
+
steps: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
992
|
+
finalState: TState | undefined;
|
|
993
|
+
finalEvals: TEval[];
|
|
994
|
+
wallMs: number;
|
|
995
|
+
spentCostUsd: number;
|
|
996
|
+
runId: string | null;
|
|
997
|
+
failureClass?: FailureClass;
|
|
998
|
+
runtimeErrors: ControlRuntimeError[];
|
|
999
|
+
stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error';
|
|
1000
|
+
}
|
|
1001
|
+
interface ControlRuntimeConfig<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
|
|
1002
|
+
intent: string;
|
|
1003
|
+
budget?: Partial<ControlBudget>;
|
|
1004
|
+
signal?: AbortSignal;
|
|
1005
|
+
/** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */
|
|
1006
|
+
actionFailure?: ControlActionFailureMode;
|
|
1007
|
+
/**
|
|
1008
|
+
* Extract cost from an action result. Used for `maxCostUsd` budget
|
|
1009
|
+
* enforcement and trace budget ledger emission.
|
|
1010
|
+
*/
|
|
1011
|
+
getActionCostUsd?: (ctx: {
|
|
1012
|
+
action: TAction;
|
|
1013
|
+
result: TActionResult;
|
|
1014
|
+
state: TState;
|
|
1015
|
+
evals: TEval[];
|
|
1016
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
1017
|
+
}) => number | undefined;
|
|
1018
|
+
/** Read typed task/product state. Prefer structured state over transcript-only context. */
|
|
1019
|
+
observe: (ctx: {
|
|
1020
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
1021
|
+
abortSignal: AbortSignal;
|
|
1022
|
+
}) => Promise<TState> | TState;
|
|
1023
|
+
/** Objective validators first, subjective judges only where objective state is insufficient. */
|
|
1024
|
+
validate: (ctx: {
|
|
1025
|
+
intent: string;
|
|
1026
|
+
state: TState;
|
|
1027
|
+
history: ControlStep<TState, TAction, TActionResult, TEval>[];
|
|
1028
|
+
abortSignal: AbortSignal;
|
|
1029
|
+
}) => Promise<TEval[]> | TEval[];
|
|
1030
|
+
/** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */
|
|
1031
|
+
decide: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>;
|
|
1032
|
+
/** Execute the action selected by the policy. */
|
|
1033
|
+
act: (action: TAction, ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<TActionResult> | TActionResult;
|
|
1034
|
+
/** Final stopping policy. Called before decide and after each action. */
|
|
1035
|
+
shouldStop?: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<StopDecision> | StopDecision;
|
|
1036
|
+
/** Optional hook for tracing or live progress updates. */
|
|
1037
|
+
onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void;
|
|
1038
|
+
/** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */
|
|
1039
|
+
stopPolicies?: ControlStopPolicies<TState, TAction>;
|
|
1040
|
+
/** Optional trace sink. Emits one run plus one span per control step. */
|
|
1041
|
+
store?: TraceStore;
|
|
1042
|
+
scenarioId?: string;
|
|
1043
|
+
projectId?: string;
|
|
1044
|
+
variantId?: string;
|
|
1045
|
+
}
|
|
1046
|
+
declare function runAgentControlLoop<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>>;
|
|
1047
|
+
declare function stopOnNoProgress<TState, TAction>(maxNoProgressSteps: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'>): ControlStopPolicies<TState, TAction>;
|
|
1048
|
+
declare function stopOnRepeatedAction<TState, TAction>(maxRepeatedActions: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'>): ControlStopPolicies<TState, TAction>;
|
|
1049
|
+
declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
|
|
1050
|
+
declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
|
|
1051
|
+
declare function allCriticalPassed(evals: ControlEvalResult[]): boolean;
|
|
703
1052
|
|
|
704
1053
|
/**
|
|
705
|
-
*
|
|
1054
|
+
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
706
1055
|
*
|
|
707
|
-
*
|
|
708
|
-
*
|
|
709
|
-
*
|
|
710
|
-
*
|
|
1056
|
+
* Scenarios stop being ephemeral arrays and become first-class
|
|
1057
|
+
* artifacts. Every Dataset carries:
|
|
1058
|
+
* - content hash (sha256 over canonicalized scenario array)
|
|
1059
|
+
* - provenance (contributor, createdAt, sourceUrl)
|
|
1060
|
+
* - split labels (train | dev | test | holdout)
|
|
1061
|
+
* - difficulty tiers (easy | medium | hard | extreme)
|
|
1062
|
+
* - tags (free-form, per-scenario)
|
|
711
1063
|
*
|
|
712
|
-
*
|
|
713
|
-
*
|
|
714
|
-
*
|
|
715
|
-
*
|
|
1064
|
+
* `Dataset.slice({ difficulty, split, holdout, seed })` returns a
|
|
1065
|
+
* deterministic, reproducible subset. Holdout slices are locked: you
|
|
1066
|
+
* can read them but `mutate` throws, which prevents "oh I'll just
|
|
1067
|
+
* tweak that one scenario" contamination drift.
|
|
716
1068
|
*/
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
1069
|
+
type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
|
|
1070
|
+
type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
|
|
1071
|
+
interface DatasetScenario {
|
|
1072
|
+
id: string;
|
|
1073
|
+
/** Arbitrary payload; the framework doesn't interpret it. */
|
|
1074
|
+
payload: unknown;
|
|
1075
|
+
split?: DatasetSplit;
|
|
1076
|
+
difficulty?: DatasetDifficulty;
|
|
1077
|
+
/** Canary token that MUST NOT round-trip through a correct agent output. */
|
|
1078
|
+
canary?: string;
|
|
1079
|
+
tags?: Record<string, string>;
|
|
728
1080
|
}
|
|
729
|
-
interface
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
1081
|
+
interface DatasetProvenance {
|
|
1082
|
+
contributor?: string;
|
|
1083
|
+
createdAt: string;
|
|
1084
|
+
sourceUrl?: string;
|
|
1085
|
+
license?: string;
|
|
1086
|
+
description?: string;
|
|
1087
|
+
/** Monotonic human-readable version (e.g. "2026.04.20"). */
|
|
1088
|
+
version: string;
|
|
736
1089
|
}
|
|
737
|
-
interface
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
/**
|
|
741
|
-
|
|
1090
|
+
interface DatasetManifest {
|
|
1091
|
+
name: string;
|
|
1092
|
+
provenance: DatasetProvenance;
|
|
1093
|
+
/** sha256 hex over canonicalized scenarios. */
|
|
1094
|
+
contentHash: string;
|
|
1095
|
+
scenarioCount: number;
|
|
1096
|
+
splitCounts: Record<DatasetSplit, number>;
|
|
742
1097
|
}
|
|
743
|
-
interface
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
1098
|
+
interface SliceOptions {
|
|
1099
|
+
split?: DatasetSplit;
|
|
1100
|
+
difficulty?: DatasetDifficulty;
|
|
1101
|
+
/** Number of scenarios (random sample, seeded). Omit to take all that match. */
|
|
1102
|
+
limit?: number;
|
|
1103
|
+
seed?: number;
|
|
1104
|
+
/** Predicate narrowing. Applied after split/difficulty filters. */
|
|
1105
|
+
filter?: (scenario: DatasetScenario) => boolean;
|
|
1106
|
+
/** If true, include scenarios marked as holdout. Default false. */
|
|
1107
|
+
includeHoldout?: boolean;
|
|
750
1108
|
}
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
/** Optional description for human-facing reports. */
|
|
755
|
-
description?: string;
|
|
756
|
-
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
757
|
-
validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
|
|
1109
|
+
/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
|
|
1110
|
+
declare class HoldoutLockedError extends Error {
|
|
1111
|
+
constructor(datasetName: string);
|
|
758
1112
|
}
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
/**
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
1113
|
+
declare class Dataset {
|
|
1114
|
+
readonly name: string;
|
|
1115
|
+
readonly provenance: DatasetProvenance;
|
|
1116
|
+
private scenarios;
|
|
1117
|
+
private locked;
|
|
1118
|
+
constructor(init: {
|
|
1119
|
+
name: string;
|
|
1120
|
+
provenance: DatasetProvenance;
|
|
1121
|
+
scenarios: DatasetScenario[];
|
|
1122
|
+
locked?: boolean;
|
|
1123
|
+
});
|
|
1124
|
+
/** All scenarios. Readonly — callers must go through `slice` or `clone`. */
|
|
1125
|
+
all(): readonly DatasetScenario[];
|
|
1126
|
+
get size(): number;
|
|
1127
|
+
/**
|
|
1128
|
+
* Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
|
|
1129
|
+
* the same arguments always produce the same slice across machines.
|
|
1130
|
+
*/
|
|
1131
|
+
slice(options?: SliceOptions): DatasetScenario[];
|
|
1132
|
+
/**
|
|
1133
|
+
* Assemble the manifest (name + provenance + content hash + counts).
|
|
1134
|
+
* Content hash is deterministic over canonicalized scenarios.
|
|
1135
|
+
*/
|
|
1136
|
+
manifest(): Promise<DatasetManifest>;
|
|
1137
|
+
/** Fresh unlocked copy — for post-release forks when mutation is needed. */
|
|
1138
|
+
clone(overrides?: Partial<{
|
|
1139
|
+
name: string;
|
|
1140
|
+
version: string;
|
|
1141
|
+
}>): Dataset;
|
|
1142
|
+
lock(): void;
|
|
1143
|
+
add(scenario: DatasetScenario): void;
|
|
1144
|
+
remove(scenarioId: string): void;
|
|
1145
|
+
/**
|
|
1146
|
+
* Stable JSON-Lines serialization — deterministic byte-for-byte.
|
|
1147
|
+
* Write to disk for contamination-verifiable archives.
|
|
1148
|
+
*/
|
|
1149
|
+
toJsonl(): string;
|
|
1150
|
+
static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
|
|
1151
|
+
}
|
|
1152
|
+
declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
|
|
777
1153
|
|
|
778
1154
|
/**
|
|
779
|
-
*
|
|
1155
|
+
* Prompt optimizer — A/B test prompt variants with statistical rigor.
|
|
780
1156
|
*
|
|
781
|
-
*
|
|
782
|
-
*
|
|
783
|
-
*
|
|
1157
|
+
* Runs N prompt variants against a fixed scenario set, collects per-scenario
|
|
1158
|
+
* scores via the user-provided `scoreVariant` callback, and returns:
|
|
1159
|
+
* - per-variant mean + bootstrap CI
|
|
1160
|
+
* - pairwise significance (Mann-Whitney, non-parametric — works on any
|
|
1161
|
+
* score distribution, not just normal)
|
|
1162
|
+
* - a winner (highest mean, flagged if the lead is not significant)
|
|
784
1163
|
*
|
|
785
|
-
*
|
|
786
|
-
*
|
|
1164
|
+
* Deliberately generic — the `scoreVariant` callback does whatever domain
|
|
1165
|
+
* work the consumer needs (invoke the agent, judge the output, whatever),
|
|
1166
|
+
* and returns a number per scenario. This lets the optimizer stay small +
|
|
1167
|
+
* testable.
|
|
787
1168
|
*/
|
|
788
|
-
interface
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
/**
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
1169
|
+
interface PromptVariant$1 {
|
|
1170
|
+
id: string;
|
|
1171
|
+
prompt: string;
|
|
1172
|
+
metadata?: Record<string, unknown>;
|
|
1173
|
+
}
|
|
1174
|
+
interface OptimizationConfig {
|
|
1175
|
+
variants: PromptVariant$1[];
|
|
1176
|
+
/** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
|
|
1177
|
+
trialsPerScenario?: number;
|
|
1178
|
+
/** Significance threshold for pairwise comparison (default 0.05). */
|
|
1179
|
+
significanceLevel?: number;
|
|
1180
|
+
/**
|
|
1181
|
+
* The scoring callback. For each (variant, scenarioId, trialIndex), produce
|
|
1182
|
+
* a score in 0..1 (or any numeric range — the optimizer only cares about
|
|
1183
|
+
* monotonicity).
|
|
1184
|
+
*/
|
|
1185
|
+
scoreVariant: (args: {
|
|
1186
|
+
variant: PromptVariant$1;
|
|
1187
|
+
scenarioId: string;
|
|
1188
|
+
trialIndex: number;
|
|
1189
|
+
}) => Promise<number>;
|
|
1190
|
+
/** Scenario ids to run against. */
|
|
1191
|
+
scenarioIds: string[];
|
|
1192
|
+
/** Optional hook — fires after each (variant, scenario) fully scored. */
|
|
1193
|
+
onScenarioComplete?: (info: {
|
|
1194
|
+
variantId: string;
|
|
1195
|
+
scenarioId: string;
|
|
1196
|
+
scores: number[];
|
|
1197
|
+
}) => void;
|
|
1198
|
+
}
|
|
1199
|
+
interface VariantScore {
|
|
1200
|
+
variantId: string;
|
|
1201
|
+
mean: number;
|
|
1202
|
+
ci95: {
|
|
1203
|
+
lower: number;
|
|
1204
|
+
upper: number;
|
|
1205
|
+
};
|
|
1206
|
+
n: number;
|
|
1207
|
+
perScenario: Record<string, {
|
|
1208
|
+
mean: number;
|
|
1209
|
+
n: number;
|
|
1210
|
+
samples: number[];
|
|
800
1211
|
}>;
|
|
801
1212
|
}
|
|
802
|
-
interface
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
1213
|
+
interface PairwiseComparison {
|
|
1214
|
+
variantA: string;
|
|
1215
|
+
variantB: string;
|
|
1216
|
+
pValue: number;
|
|
1217
|
+
/** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
|
|
1218
|
+
qValue: number;
|
|
1219
|
+
/** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
|
|
1220
|
+
significant: boolean;
|
|
1221
|
+
meanDelta: number;
|
|
807
1222
|
}
|
|
808
|
-
interface
|
|
809
|
-
|
|
810
|
-
|
|
1223
|
+
interface OptimizationResult {
|
|
1224
|
+
winner: {
|
|
1225
|
+
variantId: string;
|
|
1226
|
+
/** True when the winner's lead vs every other variant is statistically significant. */
|
|
1227
|
+
significant: boolean;
|
|
1228
|
+
ciLowerBoundExceedsSecondMean: boolean;
|
|
1229
|
+
};
|
|
1230
|
+
scores: VariantScore[];
|
|
1231
|
+
pairwise: PairwiseComparison[];
|
|
1232
|
+
config: {
|
|
1233
|
+
trialsPerScenario: number;
|
|
1234
|
+
significanceLevel: number;
|
|
1235
|
+
variants: string[];
|
|
1236
|
+
scenarios: string[];
|
|
1237
|
+
};
|
|
811
1238
|
}
|
|
812
|
-
declare class
|
|
813
|
-
|
|
814
|
-
private readonly snapshots;
|
|
815
|
-
set(scopeId: string, snapshot: WorkspaceSnapshot): void;
|
|
816
|
-
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
1239
|
+
declare class PromptOptimizer {
|
|
1240
|
+
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
817
1241
|
}
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
1242
|
+
|
|
1243
|
+
interface RunScore {
|
|
1244
|
+
success: number;
|
|
1245
|
+
goalProgress: number;
|
|
1246
|
+
repoGroundedness: number;
|
|
1247
|
+
driftPenalty: number;
|
|
1248
|
+
toolUseQuality: number;
|
|
1249
|
+
patchQuality: number;
|
|
1250
|
+
testReality: number;
|
|
1251
|
+
finalGate: number;
|
|
1252
|
+
reviewerBlockers: number;
|
|
1253
|
+
costUsd: number;
|
|
1254
|
+
wallSeconds: number;
|
|
1255
|
+
notes?: string[];
|
|
822
1256
|
}
|
|
823
|
-
interface
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
1257
|
+
interface RunScoreWeights {
|
|
1258
|
+
success: number;
|
|
1259
|
+
goalProgress: number;
|
|
1260
|
+
repoGroundedness: number;
|
|
1261
|
+
driftPenalty: number;
|
|
1262
|
+
toolUseQuality: number;
|
|
1263
|
+
patchQuality: number;
|
|
1264
|
+
testReality: number;
|
|
1265
|
+
finalGate: number;
|
|
1266
|
+
reviewerBlockers: number;
|
|
1267
|
+
costUsd: number;
|
|
1268
|
+
wallSeconds: number;
|
|
828
1269
|
}
|
|
829
|
-
declare
|
|
830
|
-
declare function
|
|
831
|
-
declare function
|
|
832
|
-
declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
|
|
833
|
-
min?: number;
|
|
834
|
-
}): WorkspaceAssertion;
|
|
835
|
-
/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
|
|
836
|
-
declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
|
|
837
|
-
pass: boolean;
|
|
838
|
-
score: number;
|
|
839
|
-
results: Array<{
|
|
840
|
-
assertion: string;
|
|
841
|
-
result: WorkspaceAssertionResult;
|
|
842
|
-
}>;
|
|
843
|
-
};
|
|
1270
|
+
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
1271
|
+
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
1272
|
+
declare function clamp01(value: number): number;
|
|
844
1273
|
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
1274
|
+
interface SteeringRolePrompt {
|
|
1275
|
+
system?: string;
|
|
1276
|
+
append?: string;
|
|
1277
|
+
}
|
|
1278
|
+
interface SteeringBundle {
|
|
1279
|
+
id: string;
|
|
1280
|
+
coderPrompt?: string;
|
|
1281
|
+
continuePrompt?: string;
|
|
1282
|
+
reviewerPrompts?: Record<string, string>;
|
|
1283
|
+
skills?: string[];
|
|
1284
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1285
|
+
metadata?: Record<string, unknown>;
|
|
1286
|
+
}
|
|
1287
|
+
interface SteeringDelta {
|
|
1288
|
+
coderPrompt?: string;
|
|
1289
|
+
continuePrompt?: string;
|
|
1290
|
+
reviewerPrompts?: Record<string, string>;
|
|
1291
|
+
skills?: string[];
|
|
1292
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1293
|
+
metadata?: Record<string, unknown>;
|
|
1294
|
+
}
|
|
1295
|
+
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
1296
|
+
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
1297
|
+
|
|
1298
|
+
interface OptimizationExample {
|
|
1299
|
+
scenarioId: string;
|
|
1300
|
+
metadata?: Record<string, unknown>;
|
|
1301
|
+
}
|
|
1302
|
+
interface SteeringEvaluation {
|
|
1303
|
+
variant: SteeringBundle;
|
|
1304
|
+
example: OptimizationExample;
|
|
1305
|
+
trialIndex: number;
|
|
1306
|
+
}
|
|
1307
|
+
interface SteeringVariantReport {
|
|
1308
|
+
variantId: string;
|
|
1309
|
+
bundle: SteeringBundle;
|
|
1310
|
+
mean: number;
|
|
1311
|
+
ci95: {
|
|
1312
|
+
lower: number;
|
|
1313
|
+
upper: number;
|
|
1314
|
+
};
|
|
1315
|
+
scenarioScores: Record<string, {
|
|
1316
|
+
mean: number;
|
|
1317
|
+
n: number;
|
|
1318
|
+
samples: number[];
|
|
1319
|
+
}>;
|
|
1320
|
+
}
|
|
1321
|
+
interface OptimizationLoopResult {
|
|
1322
|
+
winner: SteeringBundle;
|
|
1323
|
+
significant: boolean;
|
|
1324
|
+
reports: SteeringVariantReport[];
|
|
1325
|
+
pairwise: Array<{
|
|
1326
|
+
variantA: string;
|
|
1327
|
+
variantB: string;
|
|
1328
|
+
pValue: number;
|
|
1329
|
+
qValue: number;
|
|
1330
|
+
significant: boolean;
|
|
1331
|
+
meanDelta: number;
|
|
1332
|
+
}>;
|
|
1333
|
+
}
|
|
1334
|
+
interface OptimizationLoopConfig {
|
|
1335
|
+
variants: SteeringBundle[];
|
|
1336
|
+
examples: OptimizationExample[];
|
|
1337
|
+
evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
|
|
1338
|
+
scoreWeights?: Partial<RunScoreWeights>;
|
|
1339
|
+
trialsPerScenario?: number;
|
|
1340
|
+
}
|
|
1341
|
+
declare class OptimizationLoop {
|
|
1342
|
+
private readonly optimizer;
|
|
1343
|
+
constructor(optimizer?: PromptOptimizer);
|
|
1344
|
+
run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
|
|
1345
|
+
}
|
|
858
1346
|
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
1347
|
+
type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
|
|
1348
|
+
type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
|
|
1349
|
+
type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
|
|
1350
|
+
type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
1351
|
+
interface FeedbackTask {
|
|
1352
|
+
intent: string;
|
|
1353
|
+
context?: unknown;
|
|
1354
|
+
}
|
|
1355
|
+
interface ProposedSideEffect {
|
|
1356
|
+
type: string;
|
|
1357
|
+
risk?: 'low' | 'medium' | 'high';
|
|
1358
|
+
costUsd?: number;
|
|
1359
|
+
externalSideEffect?: boolean;
|
|
1360
|
+
requiresApproval?: boolean;
|
|
866
1361
|
metadata?: Record<string, unknown>;
|
|
867
1362
|
}
|
|
868
|
-
interface
|
|
1363
|
+
interface FeedbackLabel {
|
|
1364
|
+
id?: string;
|
|
1365
|
+
source: FeedbackLabelSource;
|
|
1366
|
+
kind: FeedbackLabelKind;
|
|
1367
|
+
value: unknown;
|
|
1368
|
+
reason?: string;
|
|
1369
|
+
severity?: FeedbackSeverity;
|
|
1370
|
+
createdAt: string;
|
|
1371
|
+
metadata?: Record<string, unknown>;
|
|
1372
|
+
}
|
|
1373
|
+
interface FeedbackAttempt {
|
|
869
1374
|
id: string;
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
1375
|
+
stepIndex: number;
|
|
1376
|
+
artifactType: FeedbackArtifactType;
|
|
1377
|
+
artifact: unknown;
|
|
1378
|
+
options?: unknown[];
|
|
1379
|
+
proposedAction?: ProposedSideEffect;
|
|
1380
|
+
evals?: ControlEvalResult[];
|
|
1381
|
+
feedback?: FeedbackLabel[];
|
|
1382
|
+
createdAt: string;
|
|
1383
|
+
metadata?: Record<string, unknown>;
|
|
878
1384
|
}
|
|
879
|
-
interface
|
|
1385
|
+
interface FeedbackOutcome {
|
|
1386
|
+
success?: boolean;
|
|
1387
|
+
score?: number;
|
|
1388
|
+
metrics?: Record<string, number>;
|
|
1389
|
+
costUsd?: number;
|
|
1390
|
+
detail?: string;
|
|
1391
|
+
observedAt?: string;
|
|
1392
|
+
metadata?: Record<string, unknown>;
|
|
1393
|
+
}
|
|
1394
|
+
interface FeedbackTrajectory {
|
|
880
1395
|
id: string;
|
|
881
|
-
|
|
1396
|
+
projectId?: string;
|
|
1397
|
+
scenarioId?: string;
|
|
1398
|
+
task: FeedbackTask;
|
|
1399
|
+
attempts: FeedbackAttempt[];
|
|
1400
|
+
labels: FeedbackLabel[];
|
|
1401
|
+
outcome?: FeedbackOutcome;
|
|
1402
|
+
split?: DatasetSplit;
|
|
1403
|
+
tags?: Record<string, string>;
|
|
882
1404
|
createdAt: string;
|
|
1405
|
+
updatedAt?: string;
|
|
883
1406
|
metadata?: Record<string, unknown>;
|
|
884
1407
|
}
|
|
885
|
-
interface
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
1408
|
+
interface FeedbackTrajectoryStore {
|
|
1409
|
+
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
1410
|
+
get(id: string): Promise<FeedbackTrajectory | null>;
|
|
1411
|
+
list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
|
|
1412
|
+
appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
|
|
1413
|
+
appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
|
|
892
1414
|
}
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
listExperiments(): Promise<Experiment[]>;
|
|
899
|
-
saveRun(run: Run$1): Promise<void>;
|
|
900
|
-
getRun(id: string): Promise<Run$1 | null>;
|
|
901
|
-
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
1415
|
+
interface FeedbackTrajectoryFilter {
|
|
1416
|
+
projectId?: string;
|
|
1417
|
+
scenarioId?: string;
|
|
1418
|
+
split?: DatasetSplit;
|
|
1419
|
+
tag?: [string, string];
|
|
902
1420
|
}
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
completeRun(runId: string, report: BenchmarkReport): Promise<void>;
|
|
909
|
-
failRun(runId: string, error: string): Promise<void>;
|
|
910
|
-
/**
|
|
911
|
-
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
912
|
-
* and config changes that may explain the movement.
|
|
913
|
-
*/
|
|
914
|
-
diff(runIdA: string, runIdB: string): Promise<RunDiff>;
|
|
915
|
-
/** Timeline of aggregate scores for an experiment. */
|
|
916
|
-
timeline(experimentId: string): Promise<Array<{
|
|
917
|
-
runId: string;
|
|
918
|
-
startedAt: string;
|
|
919
|
-
overall: number | null;
|
|
920
|
-
}>>;
|
|
1421
|
+
interface FeedbackSplitPolicy {
|
|
1422
|
+
trainPct?: number;
|
|
1423
|
+
devPct?: number;
|
|
1424
|
+
testPct?: number;
|
|
1425
|
+
holdoutPct?: number;
|
|
921
1426
|
}
|
|
922
|
-
interface
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
runId: string;
|
|
930
|
-
name?: string;
|
|
931
|
-
startedAt: string;
|
|
932
|
-
};
|
|
933
|
-
aggregateDelta: number;
|
|
934
|
-
scenarios: Array<{
|
|
935
|
-
scenarioId: string;
|
|
936
|
-
before: number | null;
|
|
937
|
-
after: number | null;
|
|
938
|
-
delta: number | null;
|
|
939
|
-
status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
|
|
940
|
-
}>;
|
|
941
|
-
configChanges: Record<string, {
|
|
942
|
-
before: unknown;
|
|
943
|
-
after: unknown;
|
|
944
|
-
}>;
|
|
1427
|
+
interface PreferenceMemoryEntry {
|
|
1428
|
+
instruction: string;
|
|
1429
|
+
rationale: string;
|
|
1430
|
+
weight: number;
|
|
1431
|
+
sourceTrajectoryId: string;
|
|
1432
|
+
sourceLabelId?: string;
|
|
1433
|
+
category?: string;
|
|
945
1434
|
}
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
* Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
|
|
951
|
-
* files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
|
|
952
|
-
* based rollover. Writes are append-only so the file log doubles as an audit
|
|
953
|
-
* trail of every state transition the tracker ever wrote.
|
|
954
|
-
*
|
|
955
|
-
* Reads lazy-load every NDJSON file in the directory (including rolled-over
|
|
956
|
-
* archives), latest-write-wins per `id`. Subsequent writes update the
|
|
957
|
-
* in-memory index in place so reads after writes are O(1).
|
|
958
|
-
*
|
|
959
|
-
* Node-only — imports `node:fs/promises`. Don't import this from a Worker;
|
|
960
|
-
* use the in-memory store or the D1 store from `./experiment-tracker-d1`.
|
|
961
|
-
*/
|
|
962
|
-
|
|
963
|
-
interface FileSystemExperimentStoreOptions {
|
|
964
|
-
/** Directory the NDJSON files live in. Created on first write. */
|
|
965
|
-
dir: string;
|
|
966
|
-
/** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
|
|
967
|
-
maxBytes?: number;
|
|
1435
|
+
interface FeedbackOptimizerRow extends OptimizationExample {
|
|
1436
|
+
trajectoryId: string;
|
|
1437
|
+
labelKinds: FeedbackLabelKind[];
|
|
1438
|
+
score?: number;
|
|
968
1439
|
}
|
|
969
|
-
|
|
1440
|
+
interface FeedbackReplayResult {
|
|
1441
|
+
trajectoryId: string;
|
|
1442
|
+
pass: boolean;
|
|
1443
|
+
score?: number;
|
|
1444
|
+
labels: FeedbackLabel[];
|
|
1445
|
+
outcome?: FeedbackOutcome;
|
|
1446
|
+
metadata?: Record<string, unknown>;
|
|
1447
|
+
}
|
|
1448
|
+
interface FeedbackReplayAdapter {
|
|
1449
|
+
replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
|
|
1450
|
+
}
|
|
1451
|
+
declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
|
|
1452
|
+
private readonly trajectories;
|
|
1453
|
+
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
1454
|
+
get(id: string): Promise<FeedbackTrajectory | null>;
|
|
1455
|
+
list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
|
|
1456
|
+
appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
|
|
1457
|
+
appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
|
|
1458
|
+
}
|
|
1459
|
+
declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
|
|
970
1460
|
private readonly dir;
|
|
971
|
-
private readonly
|
|
972
|
-
private index?;
|
|
1461
|
+
private readonly memory;
|
|
973
1462
|
private loaded;
|
|
974
|
-
constructor(options:
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
1463
|
+
constructor(options: {
|
|
1464
|
+
dir: string;
|
|
1465
|
+
});
|
|
1466
|
+
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
1467
|
+
get(id: string): Promise<FeedbackTrajectory | null>;
|
|
1468
|
+
list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
|
|
1469
|
+
appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
|
|
1470
|
+
appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
|
|
982
1471
|
private append;
|
|
983
1472
|
private load;
|
|
984
1473
|
}
|
|
1474
|
+
declare function createFeedbackTrajectory(input: {
|
|
1475
|
+
id?: string;
|
|
1476
|
+
projectId?: string;
|
|
1477
|
+
scenarioId?: string;
|
|
1478
|
+
task: FeedbackTask;
|
|
1479
|
+
attempts?: FeedbackAttempt[];
|
|
1480
|
+
labels?: FeedbackLabel[];
|
|
1481
|
+
outcome?: FeedbackOutcome;
|
|
1482
|
+
split?: DatasetSplit;
|
|
1483
|
+
tags?: Record<string, string>;
|
|
1484
|
+
createdAt?: string;
|
|
1485
|
+
metadata?: Record<string, unknown>;
|
|
1486
|
+
}): FeedbackTrajectory;
|
|
1487
|
+
declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
|
|
1488
|
+
declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
|
|
1489
|
+
declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
|
|
1490
|
+
declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
|
|
1491
|
+
declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
|
|
1492
|
+
declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
|
|
1493
|
+
declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
|
|
1494
|
+
declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
|
|
1495
|
+
declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
|
|
1496
|
+
maxEntries?: number;
|
|
1497
|
+
}): PreferenceMemoryEntry[];
|
|
1498
|
+
declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
|
|
1499
|
+
declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
|
|
1500
|
+
declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
|
|
1501
|
+
declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
|
|
1502
|
+
projectId?: string;
|
|
1503
|
+
scenarioId?: string;
|
|
1504
|
+
artifactType?: FeedbackArtifactType;
|
|
1505
|
+
artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
|
|
1506
|
+
proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
|
|
1507
|
+
createdAt?: string;
|
|
1508
|
+
}): FeedbackTrajectory;
|
|
1509
|
+
|
|
1510
|
+
interface ActionExecutionPolicy {
|
|
1511
|
+
allowedTypes?: string[];
|
|
1512
|
+
blockedTypes?: string[];
|
|
1513
|
+
alwaysRequireApprovalTypes?: string[];
|
|
1514
|
+
autoApproveTypes?: string[];
|
|
1515
|
+
requireApprovalForExternalSideEffects?: boolean;
|
|
1516
|
+
requireApprovalAboveCostUsd?: number;
|
|
1517
|
+
maxActionCostUsd?: number;
|
|
1518
|
+
remainingBudgetUsd?: number;
|
|
1519
|
+
expectedOutcomeRequired?: boolean;
|
|
1520
|
+
killCriteriaRequired?: boolean;
|
|
1521
|
+
}
|
|
1522
|
+
interface ActionPolicyDecision {
|
|
1523
|
+
allowed: boolean;
|
|
1524
|
+
blocked: boolean;
|
|
1525
|
+
requiresApproval: boolean;
|
|
1526
|
+
reasons: string[];
|
|
1527
|
+
label?: FeedbackLabel;
|
|
1528
|
+
}
|
|
1529
|
+
declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
|
|
1530
|
+
createdAt?: string;
|
|
1531
|
+
}): ActionPolicyDecision;
|
|
985
1532
|
|
|
986
1533
|
/**
|
|
987
|
-
*
|
|
988
|
-
*
|
|
989
|
-
*
|
|
990
|
-
*
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1534
|
+
* Normalize scores so all dimensions follow "higher = better".
|
|
1535
|
+
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
1536
|
+
* already use inverted scoring in the prompt (10 = no hallucination),
|
|
1537
|
+
* but this function ensures consistency if raw scores leak through.
|
|
1538
|
+
*/
|
|
1539
|
+
declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
|
|
1540
|
+
/** Weighted mean — falls back to uniform weights when omitted */
|
|
1541
|
+
declare function weightedMean(scores: {
|
|
1542
|
+
score: number;
|
|
1543
|
+
weight?: number;
|
|
1544
|
+
}[]): number;
|
|
1545
|
+
/** Bootstrap confidence interval */
|
|
1546
|
+
declare function confidenceInterval(scores: number[], confidence?: number): {
|
|
1547
|
+
mean: number;
|
|
1548
|
+
lower: number;
|
|
1549
|
+
upper: number;
|
|
1550
|
+
};
|
|
1551
|
+
/**
|
|
1552
|
+
* Inter-rater reliability — simplified Krippendorff's alpha.
|
|
1000
1553
|
*
|
|
1001
|
-
*
|
|
1002
|
-
*
|
|
1003
|
-
* v1; bump only on breaking shape changes.
|
|
1554
|
+
* Each inner array is one judge's scores for all items.
|
|
1555
|
+
* All arrays must have the same length (same items scored).
|
|
1004
1556
|
*/
|
|
1005
|
-
|
|
1557
|
+
declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
|
|
1006
1558
|
/**
|
|
1007
|
-
*
|
|
1008
|
-
*
|
|
1009
|
-
* those types installed can pass the binding directly.
|
|
1559
|
+
* Mann-Whitney U test for comparing two independent groups.
|
|
1560
|
+
* Returns U statistic and approximate p-value (normal approximation).
|
|
1010
1561
|
*/
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1562
|
+
declare function mannWhitneyU(a: number[], b: number[]): {
|
|
1563
|
+
u: number;
|
|
1564
|
+
p: number;
|
|
1565
|
+
};
|
|
1566
|
+
/** Partial credit: returns 0-1 ratio of current toward target */
|
|
1567
|
+
declare function partialCredit(current: number, target: number): number;
|
|
1568
|
+
/**
|
|
1569
|
+
* Paired t-test — before/after measurements on the SAME items.
|
|
1570
|
+
* Pairing removes inter-item variance, giving tighter significance than
|
|
1571
|
+
* an unpaired test when comparing prompt v1 vs prompt v2 on identical
|
|
1572
|
+
* scenarios.
|
|
1573
|
+
*/
|
|
1574
|
+
declare function pairedTTest(before: number[], after: number[]): {
|
|
1575
|
+
t: number;
|
|
1576
|
+
df: number;
|
|
1577
|
+
p: number;
|
|
1578
|
+
};
|
|
1579
|
+
/**
|
|
1580
|
+
* Wilcoxon signed-rank test — paired non-parametric alternative.
|
|
1581
|
+
* Use when the differences aren't normally distributed.
|
|
1582
|
+
*/
|
|
1583
|
+
declare function wilcoxonSignedRank(before: number[], after: number[]): {
|
|
1584
|
+
w: number;
|
|
1585
|
+
p: number;
|
|
1586
|
+
};
|
|
1587
|
+
/**
|
|
1588
|
+
* Cohen's d — standardized effect size for two independent groups.
|
|
1589
|
+
* Positive d means group b has higher mean than group a.
|
|
1590
|
+
* Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
|
|
1591
|
+
*/
|
|
1592
|
+
declare function cohensD(a: number[], b: number[]): number;
|
|
1593
|
+
|
|
1594
|
+
/**
|
|
1595
|
+
* ConvergenceTracker — tracks completion percentage over turns.
|
|
1596
|
+
*
|
|
1597
|
+
* Produces convergence curves showing how quickly the agent reaches
|
|
1598
|
+
* completion criteria.
|
|
1599
|
+
*/
|
|
1600
|
+
declare class ConvergenceTracker {
|
|
1601
|
+
private criteria;
|
|
1602
|
+
private history;
|
|
1603
|
+
constructor(criteria: CompletionCriterion[]);
|
|
1604
|
+
/** Evaluate criteria against current state, record result */
|
|
1605
|
+
record(turn: number, state: DriverState): {
|
|
1606
|
+
completionPercent: number;
|
|
1607
|
+
complete: boolean;
|
|
1608
|
+
criteriaStatus: Record<string, boolean | number>;
|
|
1609
|
+
};
|
|
1610
|
+
/** Get convergence curve */
|
|
1611
|
+
getCurve(): number[];
|
|
1612
|
+
/** Get full history with per-criterion status */
|
|
1613
|
+
getHistory(): {
|
|
1614
|
+
turn: number;
|
|
1615
|
+
completionPercent: number;
|
|
1616
|
+
criteriaStatus: Record<string, boolean | number>;
|
|
1617
|
+
}[];
|
|
1618
|
+
/** Find the turn where completion first reached 100% (or null) */
|
|
1619
|
+
getTurnToCompletion(): number | null;
|
|
1053
1620
|
}
|
|
1054
1621
|
|
|
1055
1622
|
/**
|
|
1056
|
-
*
|
|
1623
|
+
* Versioned prompt registry.
|
|
1057
1624
|
*
|
|
1058
|
-
*
|
|
1059
|
-
*
|
|
1060
|
-
*
|
|
1061
|
-
*
|
|
1062
|
-
* score distribution, not just normal)
|
|
1063
|
-
* - a winner (highest mean, flagged if the lead is not significant)
|
|
1625
|
+
* Every prompt used in an eval run is registered with an explicit version.
|
|
1626
|
+
* Reports include the content hash so A/B compares are rigorous: if the
|
|
1627
|
+
* hash changes between two reports, the prompt actually changed; if it
|
|
1628
|
+
* matches, the variance is elsewhere.
|
|
1064
1629
|
*
|
|
1065
|
-
*
|
|
1066
|
-
*
|
|
1067
|
-
* and returns a number per scenario. This lets the optimizer stay small +
|
|
1068
|
-
* testable.
|
|
1630
|
+
* Hash is SHA-256(content), truncated to 12 hex chars for readability.
|
|
1631
|
+
* Uses the Web Crypto API (works in Workers, Node 22+, browsers).
|
|
1069
1632
|
*/
|
|
1070
|
-
interface
|
|
1633
|
+
interface PromptHandle {
|
|
1634
|
+
/** Stable human-readable id, e.g. 'browser.system' */
|
|
1071
1635
|
id: string;
|
|
1072
|
-
|
|
1073
|
-
|
|
1636
|
+
/** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
|
|
1637
|
+
version: string;
|
|
1638
|
+
/** SHA-256 of content, 12-hex-char prefix */
|
|
1639
|
+
hash: string;
|
|
1640
|
+
/** Full prompt body */
|
|
1641
|
+
content: string;
|
|
1074
1642
|
}
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
/** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
|
|
1078
|
-
trialsPerScenario?: number;
|
|
1079
|
-
/** Significance threshold for pairwise comparison (default 0.05). */
|
|
1080
|
-
significanceLevel?: number;
|
|
1643
|
+
declare class PromptRegistry {
|
|
1644
|
+
private readonly entries;
|
|
1081
1645
|
/**
|
|
1082
|
-
*
|
|
1083
|
-
*
|
|
1084
|
-
*
|
|
1646
|
+
* Register a prompt. Re-registering the same id+version with DIFFERENT
|
|
1647
|
+
* content throws — versions are immutable. Re-registering with the SAME
|
|
1648
|
+
* content is a no-op (idempotent).
|
|
1085
1649
|
*/
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
/**
|
|
1092
|
-
|
|
1093
|
-
/**
|
|
1094
|
-
|
|
1095
|
-
variantId: string;
|
|
1096
|
-
scenarioId: string;
|
|
1097
|
-
scores: number[];
|
|
1098
|
-
}) => void;
|
|
1099
|
-
}
|
|
1100
|
-
interface VariantScore {
|
|
1101
|
-
variantId: string;
|
|
1102
|
-
mean: number;
|
|
1103
|
-
ci95: {
|
|
1104
|
-
lower: number;
|
|
1105
|
-
upper: number;
|
|
1106
|
-
};
|
|
1107
|
-
n: number;
|
|
1108
|
-
perScenario: Record<string, {
|
|
1109
|
-
mean: number;
|
|
1110
|
-
n: number;
|
|
1111
|
-
samples: number[];
|
|
1112
|
-
}>;
|
|
1113
|
-
}
|
|
1114
|
-
interface PairwiseComparison {
|
|
1115
|
-
variantA: string;
|
|
1116
|
-
variantB: string;
|
|
1117
|
-
pValue: number;
|
|
1118
|
-
/** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
|
|
1119
|
-
qValue: number;
|
|
1120
|
-
/** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
|
|
1121
|
-
significant: boolean;
|
|
1122
|
-
meanDelta: number;
|
|
1123
|
-
}
|
|
1124
|
-
interface OptimizationResult {
|
|
1125
|
-
winner: {
|
|
1126
|
-
variantId: string;
|
|
1127
|
-
/** True when the winner's lead vs every other variant is statistically significant. */
|
|
1128
|
-
significant: boolean;
|
|
1129
|
-
ciLowerBoundExceedsSecondMean: boolean;
|
|
1130
|
-
};
|
|
1131
|
-
scores: VariantScore[];
|
|
1132
|
-
pairwise: PairwiseComparison[];
|
|
1133
|
-
config: {
|
|
1134
|
-
trialsPerScenario: number;
|
|
1135
|
-
significanceLevel: number;
|
|
1136
|
-
variants: string[];
|
|
1137
|
-
scenarios: string[];
|
|
1138
|
-
};
|
|
1139
|
-
}
|
|
1140
|
-
declare class PromptOptimizer {
|
|
1141
|
-
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
1142
|
-
}
|
|
1143
|
-
|
|
1144
|
-
interface SteeringRolePrompt {
|
|
1145
|
-
system?: string;
|
|
1146
|
-
append?: string;
|
|
1147
|
-
}
|
|
1148
|
-
interface SteeringBundle {
|
|
1149
|
-
id: string;
|
|
1150
|
-
coderPrompt?: string;
|
|
1151
|
-
continuePrompt?: string;
|
|
1152
|
-
reviewerPrompts?: Record<string, string>;
|
|
1153
|
-
skills?: string[];
|
|
1154
|
-
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1155
|
-
metadata?: Record<string, unknown>;
|
|
1156
|
-
}
|
|
1157
|
-
interface SteeringDelta {
|
|
1158
|
-
coderPrompt?: string;
|
|
1159
|
-
continuePrompt?: string;
|
|
1160
|
-
reviewerPrompts?: Record<string, string>;
|
|
1161
|
-
skills?: string[];
|
|
1162
|
-
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1163
|
-
metadata?: Record<string, unknown>;
|
|
1164
|
-
}
|
|
1165
|
-
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
1166
|
-
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
1167
|
-
|
|
1168
|
-
interface RunScore {
|
|
1169
|
-
success: number;
|
|
1170
|
-
goalProgress: number;
|
|
1171
|
-
repoGroundedness: number;
|
|
1172
|
-
driftPenalty: number;
|
|
1173
|
-
toolUseQuality: number;
|
|
1174
|
-
patchQuality: number;
|
|
1175
|
-
testReality: number;
|
|
1176
|
-
finalGate: number;
|
|
1177
|
-
reviewerBlockers: number;
|
|
1178
|
-
costUsd: number;
|
|
1179
|
-
wallSeconds: number;
|
|
1180
|
-
notes?: string[];
|
|
1181
|
-
}
|
|
1182
|
-
interface RunScoreWeights {
|
|
1183
|
-
success: number;
|
|
1184
|
-
goalProgress: number;
|
|
1185
|
-
repoGroundedness: number;
|
|
1186
|
-
driftPenalty: number;
|
|
1187
|
-
toolUseQuality: number;
|
|
1188
|
-
patchQuality: number;
|
|
1189
|
-
testReality: number;
|
|
1190
|
-
finalGate: number;
|
|
1191
|
-
reviewerBlockers: number;
|
|
1192
|
-
costUsd: number;
|
|
1193
|
-
wallSeconds: number;
|
|
1650
|
+
register(id: string, version: string, content: string): Promise<PromptHandle>;
|
|
1651
|
+
/** Look up a registered prompt. Throws if unknown — no implicit defaults. */
|
|
1652
|
+
get(id: string, version: string): PromptHandle;
|
|
1653
|
+
/** Return all versions of an id, newest-first (lex-descending on version). */
|
|
1654
|
+
listVersions(id: string): PromptHandle[];
|
|
1655
|
+
/** Snapshot the whole registry — useful for including in reports. */
|
|
1656
|
+
list(): PromptHandle[];
|
|
1657
|
+
/** Verify a hash against registered content. Returns null if not found. */
|
|
1658
|
+
verifyHash(id: string, version: string, expectedHash: string): boolean | null;
|
|
1194
1659
|
}
|
|
1195
|
-
|
|
1196
|
-
declare function
|
|
1197
|
-
declare function clamp01(value: number): number;
|
|
1660
|
+
/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
|
|
1661
|
+
declare function hashContent(content: string): Promise<string>;
|
|
1198
1662
|
|
|
1199
1663
|
/**
|
|
1200
|
-
*
|
|
1664
|
+
* Anti-slop quality judge.
|
|
1201
1665
|
*
|
|
1202
|
-
*
|
|
1203
|
-
*
|
|
1666
|
+
* Deterministic pattern-based quality check — no LLM call. Catches the
|
|
1667
|
+
* 80% of AI slop that every production agent leaks:
|
|
1668
|
+
* - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
|
|
1669
|
+
* - N-gram repetition (same phrase over and over)
|
|
1670
|
+
* - Hedging overuse ("I could be wrong, but...")
|
|
1671
|
+
* - Apology padding ("I'm so sorry for the confusion...")
|
|
1672
|
+
* - Unused opening formulas ("Great question!")
|
|
1673
|
+
* - Length bounds (too short to be useful, too long to be read)
|
|
1204
1674
|
*
|
|
1205
|
-
*
|
|
1206
|
-
*
|
|
1207
|
-
* judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
|
|
1208
|
-
* entities that OTEL leaves as free-form attributes.
|
|
1675
|
+
* Produces a JudgeScore in the same shape as LLM judges so it composes into
|
|
1676
|
+
* `BenchmarkRunner`'s judge array transparently.
|
|
1209
1677
|
*/
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1678
|
+
|
|
1679
|
+
interface AntiSlopConfig {
|
|
1680
|
+
/** Domain label — appears in the JudgeScore output */
|
|
1681
|
+
domain?: string;
|
|
1682
|
+
/** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
|
|
1683
|
+
bannedPhrases?: string[];
|
|
1684
|
+
/** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
|
|
1685
|
+
bannedOpenings?: RegExp[];
|
|
1686
|
+
/** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
|
|
1687
|
+
hedgingPatterns?: RegExp[];
|
|
1688
|
+
/** Regexes matching apology padding. */
|
|
1689
|
+
apologyPatterns?: RegExp[];
|
|
1690
|
+
/** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
|
|
1691
|
+
repetitionThreshold?: number;
|
|
1692
|
+
/** Min output length in chars; below this the turn is deemed too terse. */
|
|
1693
|
+
minLength?: number;
|
|
1694
|
+
/** Max output length in chars; above this the turn is deemed too verbose. */
|
|
1695
|
+
maxLength?: number;
|
|
1696
|
+
/** How heavily each violation class reduces the score (default 1). */
|
|
1697
|
+
penaltyWeights?: Partial<Record<SlopCategory, number>>;
|
|
1217
1698
|
}
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1699
|
+
type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
|
|
1700
|
+
/** Create a reusable Judge function from an anti-slop config. */
|
|
1701
|
+
declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
|
|
1702
|
+
interface AntiSlopIssue {
|
|
1703
|
+
category: SlopCategory;
|
|
1704
|
+
detail: string;
|
|
1705
|
+
example?: string;
|
|
1706
|
+
}
|
|
1707
|
+
interface AntiSlopReport {
|
|
1708
|
+
/** 0–10 score; 10 is clean, lower values mean more slop. */
|
|
1709
|
+
score: number;
|
|
1710
|
+
issues: AntiSlopIssue[];
|
|
1711
|
+
/** Count of each category for programmatic aggregation. */
|
|
1712
|
+
counts: Record<SlopCategory, number>;
|
|
1223
1713
|
}
|
|
1224
1714
|
/**
|
|
1225
|
-
*
|
|
1226
|
-
*
|
|
1227
|
-
* `app-build`: sandbox harness that compiled + tested the generated scaffold.
|
|
1228
|
-
* `app-runtime`: a run of the generated agent against a domain scenario.
|
|
1229
|
-
* `meta`: any meta-eval (judge replay, correlation analysis).
|
|
1715
|
+
* Pure function — analyze one or more outputs against the config. Exposed
|
|
1716
|
+
* separately so consumers can build their own reporters on top.
|
|
1230
1717
|
*/
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
/**
|
|
1252
|
-
|
|
1253
|
-
/**
|
|
1254
|
-
|
|
1255
|
-
/**
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
status: RunStatus;
|
|
1260
|
-
outcome?: RunOutcome$1;
|
|
1261
|
-
budget?: BudgetSpec;
|
|
1262
|
-
/** Free-form labels for downstream grouping. */
|
|
1263
|
-
tags?: Record<string, string>;
|
|
1718
|
+
declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
|
|
1719
|
+
penaltyWeights: Record<SlopCategory, number>;
|
|
1720
|
+
}): AntiSlopReport;
|
|
1721
|
+
|
|
1722
|
+
/**
|
|
1723
|
+
* Artifact validators.
|
|
1724
|
+
*
|
|
1725
|
+
* Generic "score a produced artifact" primitive. Tax uses it for PDF form
|
|
1726
|
+
* correctness, research for sourced briefs, browser for task assertions, coding
|
|
1727
|
+
* for social posts. One interface, many validators; all plug into
|
|
1728
|
+
* `BenchmarkRunner` the same way.
|
|
1729
|
+
*
|
|
1730
|
+
* A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
|
|
1731
|
+
* plus a `ValidationContext` (scenario id, the turns that produced it) and
|
|
1732
|
+
* returns a `ValidationResult` with pass/fail + 0..1 score + structured
|
|
1733
|
+
* issues.
|
|
1734
|
+
*/
|
|
1735
|
+
interface Artifact {
|
|
1736
|
+
/** Logical kind — validators type-guard on this */
|
|
1737
|
+
kind: 'file' | 'json' | 'text' | 'binary' | string;
|
|
1738
|
+
/** Filesystem-style path, optional */
|
|
1739
|
+
path?: string;
|
|
1740
|
+
/** String content for text/json/file kinds */
|
|
1741
|
+
content?: string;
|
|
1742
|
+
/** Binary content (if kind === 'binary') */
|
|
1743
|
+
bytes?: Uint8Array;
|
|
1744
|
+
/** Caller-supplied metadata (mimeType, sha256, size, etc.) */
|
|
1745
|
+
metadata?: Record<string, unknown>;
|
|
1264
1746
|
}
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
name: string;
|
|
1273
|
-
startedAt: number;
|
|
1274
|
-
endedAt?: number;
|
|
1275
|
-
status?: SpanStatus;
|
|
1276
|
-
error?: string;
|
|
1277
|
-
/** Anything not covered by typed fields. Kept deliberately free-form. */
|
|
1278
|
-
attributes?: Record<string, unknown>;
|
|
1747
|
+
interface ValidationContext {
|
|
1748
|
+
scenarioId: string;
|
|
1749
|
+
turnIndex?: number;
|
|
1750
|
+
/** Prior artifacts for multi-artifact scenarios */
|
|
1751
|
+
priorArtifacts?: Artifact[];
|
|
1752
|
+
/** Free-form hints the validator uses for domain-specific checks */
|
|
1753
|
+
hints?: Record<string, unknown>;
|
|
1279
1754
|
}
|
|
1280
|
-
interface
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
images?: Array<{
|
|
1286
|
-
artifactId?: string;
|
|
1287
|
-
url?: string;
|
|
1288
|
-
mime?: string;
|
|
1289
|
-
}>;
|
|
1755
|
+
interface ValidationIssue {
|
|
1756
|
+
severity: 'error' | 'warning' | 'info';
|
|
1757
|
+
message: string;
|
|
1758
|
+
/** Optional path into the artifact (e.g. JSON path or byte offset) */
|
|
1759
|
+
locus?: string;
|
|
1290
1760
|
}
|
|
1291
|
-
interface
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
cachedTokens?: number;
|
|
1299
|
-
reasoningTokens?: number;
|
|
1300
|
-
costUsd?: number;
|
|
1301
|
-
finishReason?: string;
|
|
1761
|
+
interface ValidationResult {
|
|
1762
|
+
pass: boolean;
|
|
1763
|
+
/** 0–1 normalized score. Validators should be monotonic in pass-ness. */
|
|
1764
|
+
score: number;
|
|
1765
|
+
issues: ValidationIssue[];
|
|
1766
|
+
/** Diagnostic payload for reporters */
|
|
1767
|
+
evidence?: Record<string, unknown>;
|
|
1302
1768
|
}
|
|
1303
|
-
interface
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1769
|
+
interface ArtifactValidator {
|
|
1770
|
+
/** Stable identifier for the validator; appears in reports. */
|
|
1771
|
+
name: string;
|
|
1772
|
+
/** Optional description for human-facing reports. */
|
|
1773
|
+
description?: string;
|
|
1774
|
+
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
1775
|
+
validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
|
|
1309
1776
|
}
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1777
|
+
/**
|
|
1778
|
+
* Run every validator on the same artifact; aggregate pass as AND, score as
|
|
1779
|
+
* (weighted) mean, issues concatenated. Weights default to 1 each.
|
|
1780
|
+
*/
|
|
1781
|
+
declare function composeValidators(validators: ArtifactValidator[], options?: {
|
|
1782
|
+
name?: string;
|
|
1783
|
+
weights?: number[];
|
|
1784
|
+
}): ArtifactValidator;
|
|
1785
|
+
/** Pass if the artifact body matches a provided regex. */
|
|
1786
|
+
declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
|
|
1787
|
+
/** Pass if JSON parses and every required key is present. */
|
|
1788
|
+
declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
|
|
1789
|
+
/** Pass if min ≤ byte length ≤ max. */
|
|
1790
|
+
declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
|
|
1791
|
+
/** Pass if the artifact contains every required substring (case-insensitive by default). */
|
|
1792
|
+
declare function containsAll(name: string, required: string[], options?: {
|
|
1793
|
+
caseSensitive?: boolean;
|
|
1794
|
+
}): ArtifactValidator;
|
|
1795
|
+
|
|
1796
|
+
/**
|
|
1797
|
+
* Workspace inspector — score the persisted state of an agent after a run.
|
|
1798
|
+
*
|
|
1799
|
+
* Many evals don't ask "did the response say the right thing" but "did the
|
|
1800
|
+
* agent put the right rows in the DB / files in the vault / entities on the
|
|
1801
|
+
* canvas". This is the primitive for that.
|
|
1802
|
+
*
|
|
1803
|
+
* Implementations read from D1, KV, filesystem, or any store — the interface
|
|
1804
|
+
* is deliberately small so consumers plug in their own backends.
|
|
1805
|
+
*/
|
|
1806
|
+
interface WorkspaceSnapshot {
|
|
1807
|
+
/** Vault files: logical path → content */
|
|
1808
|
+
files: Record<string, string>;
|
|
1809
|
+
/** DB rows: table name → array of rows (post-validation) */
|
|
1810
|
+
rows: Record<string, Array<Record<string, unknown>>>;
|
|
1811
|
+
/** KV entries: key → value (scoped to whatever prefix the inspector chose) */
|
|
1812
|
+
kv: Record<string, string>;
|
|
1813
|
+
/** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
|
|
1814
|
+
blobs?: Record<string, {
|
|
1815
|
+
size: number;
|
|
1816
|
+
hash?: string;
|
|
1817
|
+
mimeType?: string;
|
|
1317
1818
|
}>;
|
|
1318
1819
|
}
|
|
1319
|
-
interface
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
/**
|
|
1323
|
-
|
|
1324
|
-
dimension: string;
|
|
1325
|
-
/** Numeric score (free-range; interpretation up to the judge). */
|
|
1326
|
-
score: number;
|
|
1327
|
-
rationale?: string;
|
|
1328
|
-
evidence?: string;
|
|
1329
|
-
}
|
|
1330
|
-
interface SandboxSpan extends SpanBase {
|
|
1331
|
-
kind: 'sandbox';
|
|
1332
|
-
image?: string;
|
|
1333
|
-
command?: string;
|
|
1334
|
-
exitCode?: number;
|
|
1335
|
-
testsTotal?: number;
|
|
1336
|
-
testsPassed?: number;
|
|
1337
|
-
stdoutHash?: string;
|
|
1338
|
-
stderrHash?: string;
|
|
1339
|
-
/** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
|
|
1340
|
-
wallMs?: number;
|
|
1341
|
-
}
|
|
1342
|
-
interface GenericSpan extends SpanBase {
|
|
1343
|
-
kind: 'agent' | 'custom';
|
|
1820
|
+
interface InspectorContext {
|
|
1821
|
+
/** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
|
|
1822
|
+
scopeId: string;
|
|
1823
|
+
/** Optional scenario id — allows scenario-specific snapshot shaping */
|
|
1824
|
+
scenarioId?: string;
|
|
1344
1825
|
}
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
eventId: string;
|
|
1349
|
-
runId: string;
|
|
1350
|
-
spanId?: string;
|
|
1351
|
-
kind: EventKind;
|
|
1352
|
-
timestamp: number;
|
|
1353
|
-
payload: Record<string, unknown>;
|
|
1826
|
+
interface WorkspaceInspector {
|
|
1827
|
+
name: string;
|
|
1828
|
+
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
1354
1829
|
}
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
remaining: number;
|
|
1361
|
-
timestamp: number;
|
|
1362
|
-
breached: boolean;
|
|
1363
|
-
/** Span that triggered this entry, if any. */
|
|
1364
|
-
spanId?: string;
|
|
1830
|
+
declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
|
|
1831
|
+
readonly name = "in-memory";
|
|
1832
|
+
private readonly snapshots;
|
|
1833
|
+
set(scopeId: string, snapshot: WorkspaceSnapshot): void;
|
|
1834
|
+
snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
|
|
1365
1835
|
}
|
|
1366
|
-
interface
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
contentType: string;
|
|
1371
|
-
sizeBytes: number;
|
|
1372
|
-
/** sha256 in hex. */
|
|
1373
|
-
hash: string;
|
|
1374
|
-
/** External storage URL (R2, S3, filesystem path). */
|
|
1375
|
-
storageUrl?: string;
|
|
1376
|
-
/** Inline content for small blobs — keep under ~64KB. */
|
|
1377
|
-
inlineContent?: string;
|
|
1836
|
+
interface WorkspaceAssertion {
|
|
1837
|
+
name: string;
|
|
1838
|
+
description?: string;
|
|
1839
|
+
check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
|
|
1378
1840
|
}
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
declare function
|
|
1841
|
+
interface WorkspaceAssertionResult {
|
|
1842
|
+
pass: boolean;
|
|
1843
|
+
/** 0..1 — partial credit for assertions that admit it */
|
|
1844
|
+
score: number;
|
|
1845
|
+
detail?: string;
|
|
1846
|
+
}
|
|
1847
|
+
declare function fileExists(path: string): WorkspaceAssertion;
|
|
1848
|
+
declare function fileContains(path: string, needle: string): WorkspaceAssertion;
|
|
1849
|
+
declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
|
|
1850
|
+
declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
|
|
1851
|
+
min?: number;
|
|
1852
|
+
}): WorkspaceAssertion;
|
|
1853
|
+
/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
|
|
1854
|
+
declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
|
|
1855
|
+
pass: boolean;
|
|
1856
|
+
score: number;
|
|
1857
|
+
results: Array<{
|
|
1858
|
+
assertion: string;
|
|
1859
|
+
result: WorkspaceAssertionResult;
|
|
1860
|
+
}>;
|
|
1861
|
+
};
|
|
1386
1862
|
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1863
|
+
/**
|
|
1864
|
+
* Experiment tracker — group runs, diff them, watch scores move over time.
|
|
1865
|
+
*
|
|
1866
|
+
* Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
|
|
1867
|
+
* - A run has a config (prompt hash, model, scenario ids, seed)
|
|
1868
|
+
* - Runs belong to experiments (named groups)
|
|
1869
|
+
* - The store is pluggable (in-memory for tests, filesystem for local,
|
|
1870
|
+
* custom for Langfuse/D1)
|
|
1871
|
+
* - Diffs show score deltas, new/dropped scenarios, and config changes
|
|
1872
|
+
*
|
|
1873
|
+
* The output plugs directly into `BenchmarkReport` — runs archive the full
|
|
1874
|
+
* report, diff operates on the summary.
|
|
1875
|
+
*/
|
|
1876
|
+
|
|
1877
|
+
interface RunConfig {
|
|
1878
|
+
experimentId: string;
|
|
1879
|
+
name?: string;
|
|
1880
|
+
model?: string;
|
|
1881
|
+
promptHash?: string;
|
|
1882
|
+
promptVersion?: string;
|
|
1883
|
+
seed?: number;
|
|
1884
|
+
metadata?: Record<string, unknown>;
|
|
1401
1885
|
}
|
|
1402
|
-
interface
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
kind?: SpanKind;
|
|
1886
|
+
interface Run {
|
|
1887
|
+
id: string;
|
|
1888
|
+
experimentId: string;
|
|
1406
1889
|
name?: string;
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1890
|
+
config: RunConfig;
|
|
1891
|
+
startedAt: string;
|
|
1892
|
+
completedAt?: string;
|
|
1893
|
+
status: 'running' | 'completed' | 'failed';
|
|
1894
|
+
report?: BenchmarkReport;
|
|
1895
|
+
error?: string;
|
|
1411
1896
|
}
|
|
1412
|
-
interface
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
until?: number;
|
|
1897
|
+
interface Experiment {
|
|
1898
|
+
id: string;
|
|
1899
|
+
name: string;
|
|
1900
|
+
createdAt: string;
|
|
1901
|
+
metadata?: Record<string, unknown>;
|
|
1418
1902
|
}
|
|
1419
|
-
interface
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1427
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
1428
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1429
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1430
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1431
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1432
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
1903
|
+
interface ExperimentStore {
|
|
1904
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1905
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1906
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1907
|
+
saveRun(run: Run): Promise<void>;
|
|
1908
|
+
getRun(id: string): Promise<Run | null>;
|
|
1909
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1433
1910
|
}
|
|
1434
|
-
declare class
|
|
1435
|
-
private
|
|
1436
|
-
private
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1444
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
1445
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1446
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1447
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
1448
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1449
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1450
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1451
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1452
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
1911
|
+
declare class InMemoryExperimentStore implements ExperimentStore {
|
|
1912
|
+
private readonly experiments;
|
|
1913
|
+
private readonly runs;
|
|
1914
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1915
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1916
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1917
|
+
saveRun(run: Run): Promise<void>;
|
|
1918
|
+
getRun(id: string): Promise<Run | null>;
|
|
1919
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1453
1920
|
}
|
|
1454
|
-
|
|
1921
|
+
declare class ExperimentTracker {
|
|
1922
|
+
private readonly store;
|
|
1923
|
+
constructor(store: ExperimentStore);
|
|
1924
|
+
startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
|
|
1925
|
+
startRun(config: RunConfig): Promise<Run>;
|
|
1926
|
+
completeRun(runId: string, report: BenchmarkReport): Promise<void>;
|
|
1927
|
+
failRun(runId: string, error: string): Promise<void>;
|
|
1928
|
+
/**
|
|
1929
|
+
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
1930
|
+
* and config changes that may explain the movement.
|
|
1931
|
+
*/
|
|
1932
|
+
diff(runIdA: string, runIdB: string): Promise<RunDiff>;
|
|
1933
|
+
/** Timeline of aggregate scores for an experiment. */
|
|
1934
|
+
timeline(experimentId: string): Promise<Array<{
|
|
1935
|
+
runId: string;
|
|
1936
|
+
startedAt: string;
|
|
1937
|
+
overall: number | null;
|
|
1938
|
+
}>>;
|
|
1939
|
+
}
|
|
1940
|
+
interface RunDiff {
|
|
1941
|
+
before: {
|
|
1942
|
+
runId: string;
|
|
1943
|
+
name?: string;
|
|
1944
|
+
startedAt: string;
|
|
1945
|
+
};
|
|
1946
|
+
after: {
|
|
1947
|
+
runId: string;
|
|
1948
|
+
name?: string;
|
|
1949
|
+
startedAt: string;
|
|
1950
|
+
};
|
|
1951
|
+
aggregateDelta: number;
|
|
1952
|
+
scenarios: Array<{
|
|
1953
|
+
scenarioId: string;
|
|
1954
|
+
before: number | null;
|
|
1955
|
+
after: number | null;
|
|
1956
|
+
delta: number | null;
|
|
1957
|
+
status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
|
|
1958
|
+
}>;
|
|
1959
|
+
configChanges: Record<string, {
|
|
1960
|
+
before: unknown;
|
|
1961
|
+
after: unknown;
|
|
1962
|
+
}>;
|
|
1963
|
+
}
|
|
1964
|
+
|
|
1965
|
+
/**
|
|
1966
|
+
* FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
|
|
1967
|
+
*
|
|
1968
|
+
* Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
|
|
1969
|
+
* files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
|
|
1970
|
+
* based rollover. Writes are append-only so the file log doubles as an audit
|
|
1971
|
+
* trail of every state transition the tracker ever wrote.
|
|
1972
|
+
*
|
|
1973
|
+
* Reads lazy-load every NDJSON file in the directory (including rolled-over
|
|
1974
|
+
* archives), latest-write-wins per `id`. Subsequent writes update the
|
|
1975
|
+
* in-memory index in place so reads after writes are O(1).
|
|
1976
|
+
*
|
|
1977
|
+
* Node-only — imports `node:fs/promises`. Don't import this from a Worker;
|
|
1978
|
+
* use the in-memory store or the D1 store from `./experiment-tracker-d1`.
|
|
1979
|
+
*/
|
|
1980
|
+
|
|
1981
|
+
interface FileSystemExperimentStoreOptions {
|
|
1982
|
+
/** Directory the NDJSON files live in. Created on first write. */
|
|
1455
1983
|
dir: string;
|
|
1456
|
-
/**
|
|
1984
|
+
/** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
|
|
1457
1985
|
maxBytes?: number;
|
|
1458
1986
|
}
|
|
1459
|
-
declare class
|
|
1460
|
-
private dir;
|
|
1461
|
-
private maxBytes;
|
|
1462
|
-
/** Lazy in-memory index for queries — populated on first read. */
|
|
1987
|
+
declare class FileSystemExperimentStore implements ExperimentStore {
|
|
1988
|
+
private readonly dir;
|
|
1989
|
+
private readonly maxBytes;
|
|
1463
1990
|
private index?;
|
|
1464
1991
|
private loaded;
|
|
1465
|
-
constructor(options:
|
|
1992
|
+
constructor(options: FileSystemExperimentStoreOptions);
|
|
1993
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1994
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1995
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1996
|
+
saveRun(run: Run): Promise<void>;
|
|
1997
|
+
getRun(id: string): Promise<Run | null>;
|
|
1998
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1466
1999
|
private ensureDir;
|
|
1467
2000
|
private append;
|
|
1468
|
-
private insertInto;
|
|
1469
2001
|
private load;
|
|
1470
|
-
appendRun(run: Run): Promise<void>;
|
|
1471
|
-
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
1472
|
-
appendSpan(span: Span): Promise<void>;
|
|
1473
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1474
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
1475
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1476
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1477
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
1478
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1479
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1480
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1481
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1482
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
1483
2002
|
}
|
|
1484
2003
|
|
|
1485
2004
|
/**
|
|
1486
|
-
*
|
|
1487
|
-
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
2005
|
+
* D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
|
|
1488
2006
|
*
|
|
1489
|
-
*
|
|
1490
|
-
*
|
|
1491
|
-
*
|
|
1492
|
-
* the
|
|
1493
|
-
*
|
|
2007
|
+
* Workers-safe (uses only the `D1Database` binding the runtime injects). Two
|
|
2008
|
+
* tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
|
|
2009
|
+
* a Worker route can both write the row at run start and update it at run end
|
|
2010
|
+
* without losing the original config — the row's lifecycle mirrors the
|
|
2011
|
+
* `Run.status` field one-to-one.
|
|
2012
|
+
*
|
|
2013
|
+
* Why this lives next to `InMemoryExperimentStore`:
|
|
2014
|
+
* - browser, coding, and computer-use agents can all run as Workers
|
|
2015
|
+
* - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
|
|
2016
|
+
* - Hand-rolling D1 SQL in every consumer is exactly the duplication this
|
|
2017
|
+
* module exists to prevent
|
|
2018
|
+
*
|
|
2019
|
+
* Schema versioning: the `meta` table records `schema_version` so a future
|
|
2020
|
+
* column addition can be detected and migrated additively. Today's schema is
|
|
2021
|
+
* v1; bump only on breaking shape changes.
|
|
1494
2022
|
*/
|
|
1495
2023
|
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
2024
|
+
/**
|
|
2025
|
+
* Minimal `D1Database` shape we depend on. Avoids pulling in
|
|
2026
|
+
* `@cloudflare/workers-types` as a hard dep — consumers that already have
|
|
2027
|
+
* those types installed can pass the binding directly.
|
|
2028
|
+
*/
|
|
2029
|
+
interface D1Like {
|
|
2030
|
+
prepare(query: string): D1PreparedStatementLike;
|
|
2031
|
+
batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
|
|
2032
|
+
exec(query: string): Promise<unknown>;
|
|
1500
2033
|
}
|
|
1501
|
-
interface
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
2034
|
+
interface D1PreparedStatementLike {
|
|
2035
|
+
bind(...values: unknown[]): D1PreparedStatementLike;
|
|
2036
|
+
first<T = Record<string, unknown>>(): Promise<T | null>;
|
|
2037
|
+
all<T = Record<string, unknown>>(): Promise<{
|
|
2038
|
+
results: T[];
|
|
2039
|
+
}>;
|
|
2040
|
+
run(): Promise<unknown>;
|
|
2041
|
+
}
|
|
2042
|
+
interface D1ExperimentStoreOptions {
|
|
2043
|
+
/** D1 binding from `env`. */
|
|
2044
|
+
db: D1Like;
|
|
2045
|
+
/**
|
|
2046
|
+
* Optional table-name prefix so multiple ExperimentStores can share a DB
|
|
2047
|
+
* without colliding (e.g. `browser_eval_experiments` vs `coding_eval_experiments`).
|
|
2048
|
+
* Default: `agent_eval_`.
|
|
2049
|
+
*/
|
|
2050
|
+
tablePrefix?: string;
|
|
1507
2051
|
}
|
|
1508
|
-
declare class
|
|
1509
|
-
private
|
|
1510
|
-
private
|
|
1511
|
-
private
|
|
1512
|
-
private
|
|
1513
|
-
private
|
|
1514
|
-
constructor(
|
|
1515
|
-
get runId(): string;
|
|
1516
|
-
startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
|
|
1517
|
-
endRun(outcome?: RunOutcome$1): Promise<void>;
|
|
1518
|
-
abortRun(reason: string): Promise<void>;
|
|
1519
|
-
span<S extends Span = Span>(init: {
|
|
1520
|
-
kind: SpanKind;
|
|
1521
|
-
name: string;
|
|
1522
|
-
parentSpanId?: string;
|
|
1523
|
-
attributes?: Record<string, unknown>;
|
|
1524
|
-
} & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
|
|
1525
|
-
private handle;
|
|
1526
|
-
private pop;
|
|
1527
|
-
llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
|
|
1528
|
-
tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
|
|
1529
|
-
retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
|
|
1530
|
-
recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
|
|
1531
|
-
sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
|
|
1532
|
-
emit(event: {
|
|
1533
|
-
kind: EventKind;
|
|
1534
|
-
spanId?: string;
|
|
1535
|
-
payload?: Record<string, unknown>;
|
|
1536
|
-
}): Promise<TraceEvent>;
|
|
1537
|
-
recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
|
|
1538
|
-
timestamp?: number;
|
|
1539
|
-
}): Promise<BudgetLedgerEntry>;
|
|
1540
|
-
recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
|
|
2052
|
+
declare class D1ExperimentStore implements ExperimentStore {
|
|
2053
|
+
private readonly db;
|
|
2054
|
+
private readonly experimentsTable;
|
|
2055
|
+
private readonly runsTable;
|
|
2056
|
+
private readonly metaTable;
|
|
2057
|
+
private schemaReady;
|
|
2058
|
+
constructor(options: D1ExperimentStoreOptions);
|
|
1541
2059
|
/**
|
|
1542
|
-
*
|
|
1543
|
-
*
|
|
2060
|
+
* Idempotent schema setup. Safe to call before every operation; the second
|
|
2061
|
+
* call short-circuits via `schemaReady`. Most consumers will call it once
|
|
2062
|
+
* during Worker bootstrap.
|
|
1544
2063
|
*/
|
|
1545
|
-
|
|
2064
|
+
ensureSchema(): Promise<void>;
|
|
2065
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
2066
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
2067
|
+
listExperiments(): Promise<Experiment[]>;
|
|
2068
|
+
saveRun(run: Run): Promise<void>;
|
|
2069
|
+
getRun(id: string): Promise<Run | null>;
|
|
2070
|
+
listRuns(experimentId: string): Promise<Run[]>;
|
|
1546
2071
|
}
|
|
1547
|
-
/** Helper to build an LLM span handle args object from a provider-shaped response. */
|
|
1548
|
-
declare function llmSpanFromProvider(args: {
|
|
1549
|
-
name?: string;
|
|
1550
|
-
model: string;
|
|
1551
|
-
messages: Message[];
|
|
1552
|
-
output: string;
|
|
1553
|
-
usage?: {
|
|
1554
|
-
inputTokens?: number;
|
|
1555
|
-
outputTokens?: number;
|
|
1556
|
-
cachedTokens?: number;
|
|
1557
|
-
reasoningTokens?: number;
|
|
1558
|
-
};
|
|
1559
|
-
costUsd?: number;
|
|
1560
|
-
finishReason?: string;
|
|
1561
|
-
}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
1562
2072
|
|
|
1563
2073
|
/**
|
|
1564
2074
|
* Typed query helpers over TraceStore.
|
|
@@ -1569,7 +2079,7 @@ declare function llmSpanFromProvider(args: {
|
|
|
1569
2079
|
* tooling works out of the box.
|
|
1570
2080
|
*/
|
|
1571
2081
|
|
|
1572
|
-
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
|
|
2082
|
+
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run$1[]>;
|
|
1573
2083
|
declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
|
|
1574
2084
|
declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
|
|
1575
2085
|
declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
|
|
@@ -1585,7 +2095,7 @@ declare function aggregateLlm(spans: LlmSpan[]): {
|
|
|
1585
2095
|
costUsd: number;
|
|
1586
2096
|
};
|
|
1587
2097
|
/** Pick the outcome's failure class when present, else derive 'success' from run status. */
|
|
1588
|
-
declare function runFailureClass(run: Run): FailureClass;
|
|
2098
|
+
declare function runFailureClass(run: Run$1): FailureClass;
|
|
1589
2099
|
|
|
1590
2100
|
/**
|
|
1591
2101
|
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
@@ -1689,10 +2199,10 @@ interface OtlpExport {
|
|
|
1689
2199
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
1690
2200
|
|
|
1691
2201
|
interface RunTrace {
|
|
1692
|
-
run: Run;
|
|
2202
|
+
run: Run$1;
|
|
1693
2203
|
spans: Span[];
|
|
1694
2204
|
events: TraceEvent[];
|
|
1695
|
-
artifacts: Artifact[];
|
|
2205
|
+
artifacts: Artifact$1[];
|
|
1696
2206
|
budget: BudgetLedgerEntry[];
|
|
1697
2207
|
}
|
|
1698
2208
|
interface RunCriticOptions {
|
|
@@ -1725,55 +2235,6 @@ declare function distillPlaybook(entries: PlaybookEntry[], options?: {
|
|
|
1725
2235
|
}): Playbook;
|
|
1726
2236
|
declare function renderPlaybookMarkdown(playbook: Playbook): string;
|
|
1727
2237
|
|
|
1728
|
-
interface OptimizationExample {
|
|
1729
|
-
scenarioId: string;
|
|
1730
|
-
metadata?: Record<string, unknown>;
|
|
1731
|
-
}
|
|
1732
|
-
interface SteeringEvaluation {
|
|
1733
|
-
variant: SteeringBundle;
|
|
1734
|
-
example: OptimizationExample;
|
|
1735
|
-
trialIndex: number;
|
|
1736
|
-
}
|
|
1737
|
-
interface SteeringVariantReport {
|
|
1738
|
-
variantId: string;
|
|
1739
|
-
bundle: SteeringBundle;
|
|
1740
|
-
mean: number;
|
|
1741
|
-
ci95: {
|
|
1742
|
-
lower: number;
|
|
1743
|
-
upper: number;
|
|
1744
|
-
};
|
|
1745
|
-
scenarioScores: Record<string, {
|
|
1746
|
-
mean: number;
|
|
1747
|
-
n: number;
|
|
1748
|
-
samples: number[];
|
|
1749
|
-
}>;
|
|
1750
|
-
}
|
|
1751
|
-
interface OptimizationLoopResult {
|
|
1752
|
-
winner: SteeringBundle;
|
|
1753
|
-
significant: boolean;
|
|
1754
|
-
reports: SteeringVariantReport[];
|
|
1755
|
-
pairwise: Array<{
|
|
1756
|
-
variantA: string;
|
|
1757
|
-
variantB: string;
|
|
1758
|
-
pValue: number;
|
|
1759
|
-
qValue: number;
|
|
1760
|
-
significant: boolean;
|
|
1761
|
-
meanDelta: number;
|
|
1762
|
-
}>;
|
|
1763
|
-
}
|
|
1764
|
-
interface OptimizationLoopConfig {
|
|
1765
|
-
variants: SteeringBundle[];
|
|
1766
|
-
examples: OptimizationExample[];
|
|
1767
|
-
evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
|
|
1768
|
-
scoreWeights?: Partial<RunScoreWeights>;
|
|
1769
|
-
trialsPerScenario?: number;
|
|
1770
|
-
}
|
|
1771
|
-
declare class OptimizationLoop {
|
|
1772
|
-
private readonly optimizer;
|
|
1773
|
-
constructor(optimizer?: PromptOptimizer);
|
|
1774
|
-
run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
|
|
1775
|
-
}
|
|
1776
|
-
|
|
1777
2238
|
type SteeringOptimizerBackend = 'pairwise' | 'ax-gepa';
|
|
1778
2239
|
interface SteeringOptimizationRow {
|
|
1779
2240
|
variantId: string;
|
|
@@ -2167,7 +2628,7 @@ type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
|
|
|
2167
2628
|
/**
|
|
2168
2629
|
* Dual-agent convergence bench.
|
|
2169
2630
|
*
|
|
2170
|
-
* Pattern lifted from
|
|
2631
|
+
* Pattern lifted from dual-worker review loops: two agents take turns until
|
|
2171
2632
|
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
2172
2633
|
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
2173
2634
|
*
|
|
@@ -2400,6 +2861,51 @@ interface LlmReviewerConfig<State, Summary = unknown> {
|
|
|
2400
2861
|
}
|
|
2401
2862
|
declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
|
|
2402
2863
|
|
|
2864
|
+
interface ProposeReviewControlState<State, Summary = unknown> {
|
|
2865
|
+
shot: number;
|
|
2866
|
+
state: State;
|
|
2867
|
+
priorReview: Review | null;
|
|
2868
|
+
verification: Verification;
|
|
2869
|
+
traceSummary?: Summary;
|
|
2870
|
+
memory: ReviewMemoryEntry[];
|
|
2871
|
+
completed: boolean;
|
|
2872
|
+
reviewAvailable: boolean;
|
|
2873
|
+
reviewError?: string;
|
|
2874
|
+
}
|
|
2875
|
+
interface ProposeReviewControlAction {
|
|
2876
|
+
type: 'propose-review-shot';
|
|
2877
|
+
shot: number;
|
|
2878
|
+
}
|
|
2879
|
+
interface ProposeReviewControlResult<State, Summary = unknown> {
|
|
2880
|
+
state: State;
|
|
2881
|
+
verification: Verification;
|
|
2882
|
+
traceSummary?: Summary;
|
|
2883
|
+
review: Review | null;
|
|
2884
|
+
reviewAvailable: boolean;
|
|
2885
|
+
reviewError?: string;
|
|
2886
|
+
}
|
|
2887
|
+
interface ProposeReviewControlConfig<State, Summary = unknown> {
|
|
2888
|
+
goal: string;
|
|
2889
|
+
initialState: State;
|
|
2890
|
+
propose: ProposeFn<State, Summary>;
|
|
2891
|
+
verify: VerifyFn<State>;
|
|
2892
|
+
review: ReviewFn<State, Summary>;
|
|
2893
|
+
maxShots?: number;
|
|
2894
|
+
maxWallMs?: number;
|
|
2895
|
+
memory?: ReviewMemoryStore;
|
|
2896
|
+
store?: TraceStore;
|
|
2897
|
+
scenarioId?: string;
|
|
2898
|
+
projectId?: string;
|
|
2899
|
+
variantId?: string;
|
|
2900
|
+
fallbackInstruction?: string;
|
|
2901
|
+
confidenceFloor?: number;
|
|
2902
|
+
confidenceFloorWindow?: number;
|
|
2903
|
+
failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
|
|
2904
|
+
actionFailure?: ControlRuntimeConfig<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>['actionFailure'];
|
|
2905
|
+
}
|
|
2906
|
+
declare function runProposeReviewAsControlLoop<State, Summary = unknown>(config: ProposeReviewControlConfig<State, Summary>): Promise<ControlRunResult<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>>;
|
|
2907
|
+
declare function controlFailureClassFromVerification(verification: Verification): FailureClass | undefined;
|
|
2908
|
+
|
|
2403
2909
|
/**
|
|
2404
2910
|
* TestGradedScenario — a scenario whose score comes from a test suite.
|
|
2405
2911
|
*
|
|
@@ -2428,7 +2934,7 @@ interface TestGradedRunOptions {
|
|
|
2428
2934
|
variantId?: string;
|
|
2429
2935
|
driver?: SandboxDriver;
|
|
2430
2936
|
/** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */
|
|
2431
|
-
provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
2937
|
+
provenance?: Pick<Run$1, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
2432
2938
|
}
|
|
2433
2939
|
interface TestGradedRunResult {
|
|
2434
2940
|
runId: string;
|
|
@@ -2481,7 +2987,7 @@ declare class BudgetGuard {
|
|
|
2481
2987
|
*/
|
|
2482
2988
|
|
|
2483
2989
|
interface FailureContext {
|
|
2484
|
-
run: Run;
|
|
2990
|
+
run: Run$1;
|
|
2485
2991
|
spans: Span[];
|
|
2486
2992
|
events: TraceEvent[];
|
|
2487
2993
|
}
|
|
@@ -2824,7 +3330,7 @@ interface RegressionSpec {
|
|
|
2824
3330
|
metric: string;
|
|
2825
3331
|
higherIsBetter: boolean;
|
|
2826
3332
|
/** Extract a scalar from a run. Default extractors handle common metrics. */
|
|
2827
|
-
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
3333
|
+
extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
|
|
2828
3334
|
}
|
|
2829
3335
|
interface RegressionOptions extends BaselineOptions {
|
|
2830
3336
|
baseline: RunFilter;
|
|
@@ -2938,7 +3444,7 @@ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): Ora
|
|
|
2938
3444
|
/**
|
|
2939
3445
|
* Cost tracker — token + USD accounting per scenario and per run.
|
|
2940
3446
|
*
|
|
2941
|
-
*
|
|
3447
|
+
* Adapted from generic usage-event accounting. Every
|
|
2942
3448
|
* optimizer needs to know "is the quality gain worth the cost delta?",
|
|
2943
3449
|
* and every dashboard needs dollars-per-completed-task. MODEL_PRICING
|
|
2944
3450
|
* from metrics.ts stays authoritative for estimate math; this module
|
|
@@ -3149,7 +3655,7 @@ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOpti
|
|
|
3149
3655
|
* State continuity scoring — measures how well a resumed/handed-off agent
|
|
3150
3656
|
* preserves prior work.
|
|
3151
3657
|
*
|
|
3152
|
-
*
|
|
3658
|
+
* When session 2 continues
|
|
3153
3659
|
* session 1's work, the key question is: did it preserve key artifacts,
|
|
3154
3660
|
* or start over and lose context? Each `ContinuityCheck` inspects one
|
|
3155
3661
|
* aspect (file preserved, key count grew, status advanced) and yields
|
|
@@ -3192,107 +3698,6 @@ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minR
|
|
|
3192
3698
|
/** Common check: a status field advanced in an expected order. */
|
|
3193
3699
|
declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
|
|
3194
3700
|
|
|
3195
|
-
/**
|
|
3196
|
-
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
3197
|
-
*
|
|
3198
|
-
* Scenarios stop being ephemeral arrays and become first-class
|
|
3199
|
-
* artifacts. Every Dataset carries:
|
|
3200
|
-
* - content hash (sha256 over canonicalized scenario array)
|
|
3201
|
-
* - provenance (contributor, createdAt, sourceUrl)
|
|
3202
|
-
* - split labels (train | dev | test | holdout)
|
|
3203
|
-
* - difficulty tiers (easy | medium | hard | extreme)
|
|
3204
|
-
* - tags (free-form, per-scenario)
|
|
3205
|
-
*
|
|
3206
|
-
* `Dataset.slice({ difficulty, split, holdout, seed })` returns a
|
|
3207
|
-
* deterministic, reproducible subset. Holdout slices are locked: you
|
|
3208
|
-
* can read them but `mutate` throws, which prevents "oh I'll just
|
|
3209
|
-
* tweak that one scenario" contamination drift.
|
|
3210
|
-
*/
|
|
3211
|
-
type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
|
|
3212
|
-
type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
|
|
3213
|
-
interface DatasetScenario {
|
|
3214
|
-
id: string;
|
|
3215
|
-
/** Arbitrary payload; the framework doesn't interpret it. */
|
|
3216
|
-
payload: unknown;
|
|
3217
|
-
split?: DatasetSplit;
|
|
3218
|
-
difficulty?: DatasetDifficulty;
|
|
3219
|
-
/** Canary token that MUST NOT round-trip through a correct agent output. */
|
|
3220
|
-
canary?: string;
|
|
3221
|
-
tags?: Record<string, string>;
|
|
3222
|
-
}
|
|
3223
|
-
interface DatasetProvenance {
|
|
3224
|
-
contributor?: string;
|
|
3225
|
-
createdAt: string;
|
|
3226
|
-
sourceUrl?: string;
|
|
3227
|
-
license?: string;
|
|
3228
|
-
description?: string;
|
|
3229
|
-
/** Monotonic human-readable version (e.g. "2026.04.20"). */
|
|
3230
|
-
version: string;
|
|
3231
|
-
}
|
|
3232
|
-
interface DatasetManifest {
|
|
3233
|
-
name: string;
|
|
3234
|
-
provenance: DatasetProvenance;
|
|
3235
|
-
/** sha256 hex over canonicalized scenarios. */
|
|
3236
|
-
contentHash: string;
|
|
3237
|
-
scenarioCount: number;
|
|
3238
|
-
splitCounts: Record<DatasetSplit, number>;
|
|
3239
|
-
}
|
|
3240
|
-
interface SliceOptions {
|
|
3241
|
-
split?: DatasetSplit;
|
|
3242
|
-
difficulty?: DatasetDifficulty;
|
|
3243
|
-
/** Number of scenarios (random sample, seeded). Omit to take all that match. */
|
|
3244
|
-
limit?: number;
|
|
3245
|
-
seed?: number;
|
|
3246
|
-
/** Predicate narrowing. Applied after split/difficulty filters. */
|
|
3247
|
-
filter?: (scenario: DatasetScenario) => boolean;
|
|
3248
|
-
/** If true, include scenarios marked as holdout. Default false. */
|
|
3249
|
-
includeHoldout?: boolean;
|
|
3250
|
-
}
|
|
3251
|
-
/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
|
|
3252
|
-
declare class HoldoutLockedError extends Error {
|
|
3253
|
-
constructor(datasetName: string);
|
|
3254
|
-
}
|
|
3255
|
-
declare class Dataset {
|
|
3256
|
-
readonly name: string;
|
|
3257
|
-
readonly provenance: DatasetProvenance;
|
|
3258
|
-
private scenarios;
|
|
3259
|
-
private locked;
|
|
3260
|
-
constructor(init: {
|
|
3261
|
-
name: string;
|
|
3262
|
-
provenance: DatasetProvenance;
|
|
3263
|
-
scenarios: DatasetScenario[];
|
|
3264
|
-
locked?: boolean;
|
|
3265
|
-
});
|
|
3266
|
-
/** All scenarios. Readonly — callers must go through `slice` or `clone`. */
|
|
3267
|
-
all(): readonly DatasetScenario[];
|
|
3268
|
-
get size(): number;
|
|
3269
|
-
/**
|
|
3270
|
-
* Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
|
|
3271
|
-
* the same arguments always produce the same slice across machines.
|
|
3272
|
-
*/
|
|
3273
|
-
slice(options?: SliceOptions): DatasetScenario[];
|
|
3274
|
-
/**
|
|
3275
|
-
* Assemble the manifest (name + provenance + content hash + counts).
|
|
3276
|
-
* Content hash is deterministic over canonicalized scenarios.
|
|
3277
|
-
*/
|
|
3278
|
-
manifest(): Promise<DatasetManifest>;
|
|
3279
|
-
/** Fresh unlocked copy — for post-release forks when mutation is needed. */
|
|
3280
|
-
clone(overrides?: Partial<{
|
|
3281
|
-
name: string;
|
|
3282
|
-
version: string;
|
|
3283
|
-
}>): Dataset;
|
|
3284
|
-
lock(): void;
|
|
3285
|
-
add(scenario: DatasetScenario): void;
|
|
3286
|
-
remove(scenarioId: string): void;
|
|
3287
|
-
/**
|
|
3288
|
-
* Stable JSON-Lines serialization — deterministic byte-for-byte.
|
|
3289
|
-
* Write to disk for contamination-verifiable archives.
|
|
3290
|
-
*/
|
|
3291
|
-
toJsonl(): string;
|
|
3292
|
-
static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
|
|
3293
|
-
}
|
|
3294
|
-
declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
|
|
3295
|
-
|
|
3296
3701
|
/**
|
|
3297
3702
|
* ContaminationGuard — ensures held-out scenarios don't leak into
|
|
3298
3703
|
* training/prompt paths, and flags model memorization.
|
|
@@ -3608,7 +4013,7 @@ interface ContractMetric {
|
|
|
3608
4013
|
/** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
|
|
3609
4014
|
maxRegression?: number;
|
|
3610
4015
|
/** Optional extractor if the metric isn't in the default set. */
|
|
3611
|
-
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
4016
|
+
extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
|
|
3612
4017
|
}
|
|
3613
4018
|
interface ThresholdContract {
|
|
3614
4019
|
name: string;
|
|
@@ -3874,10 +4279,10 @@ declare class BuilderSession {
|
|
|
3874
4279
|
*/
|
|
3875
4280
|
declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
|
|
3876
4281
|
projectId: string;
|
|
3877
|
-
chatRuns: Run[];
|
|
3878
|
-
lastBuilderRun?: Run;
|
|
3879
|
-
lastBuildRun?: Run;
|
|
3880
|
-
lastAppRuntimeRuns: Run[];
|
|
4282
|
+
chatRuns: Run$1[];
|
|
4283
|
+
lastBuilderRun?: Run$1;
|
|
4284
|
+
lastBuildRun?: Run$1;
|
|
4285
|
+
lastAppRuntimeRuns: Run$1[];
|
|
3881
4286
|
}>;
|
|
3882
4287
|
|
|
3883
4288
|
/**
|
|
@@ -3997,8 +4402,8 @@ interface ChatSummary {
|
|
|
3997
4402
|
builderRunId: string;
|
|
3998
4403
|
startedAt: number;
|
|
3999
4404
|
endedAt?: number;
|
|
4000
|
-
status: Run['status'];
|
|
4001
|
-
outcome?: Run['outcome'];
|
|
4405
|
+
status: Run$1['status'];
|
|
4406
|
+
outcome?: Run$1['outcome'];
|
|
4002
4407
|
/** Counts of spans emitted during the chat. */
|
|
4003
4408
|
llmTurns?: number;
|
|
4004
4409
|
toolCalls?: number;
|
|
@@ -4006,7 +4411,7 @@ interface ChatSummary {
|
|
|
4006
4411
|
appRuntimeRunIds: string[];
|
|
4007
4412
|
}
|
|
4008
4413
|
interface ProjectTimelineEntry {
|
|
4009
|
-
run: Run;
|
|
4414
|
+
run: Run$1;
|
|
4010
4415
|
layerBucket: 'chat' | 'build' | 'runtime' | 'other';
|
|
4011
4416
|
}
|
|
4012
4417
|
declare class ProjectRegistry {
|
|
@@ -4093,7 +4498,7 @@ declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
|
4093
4498
|
interface EvalMetricSpec {
|
|
4094
4499
|
id: string;
|
|
4095
4500
|
/** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
|
|
4096
|
-
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
4501
|
+
extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
|
|
4097
4502
|
}
|
|
4098
4503
|
interface OutcomePair {
|
|
4099
4504
|
evalMetric: string;
|
|
@@ -7978,4 +8383,4 @@ interface ReflectionProposal {
|
|
|
7978
8383
|
}
|
|
7979
8384
|
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
7980
8385
|
|
|
7981
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|
|
8386
|
+
/**
 * Aggregate export statement: a single `export { ... }` list naming every
 * binding this module exposes.
 *
 * - Specifiers written as `type X` carry the inline `type` modifier; all other
 *   specifiers are exported without it.
 * - Some specifiers expose a local binding under a different public name via
 *   `as`. Renamed specifiers include: `Artifact$1 as Artifact`,
 *   `Artifact as ArtifactCheckArtifact`, `DEFAULT_RULES as DEFAULT_FAILURE_RULES`,
 *   `PromptVariant as EvolvableVariant`, `Run as ExperimentRun`,
 *   `TrialResult as PromptTrialResult`, `PromptVariant$1 as PromptVariant`,
 *   `Run$1 as Run`, `deterministicSplit as benchmarkDeterministicSplit`,
 *   `index as benchmarks`, and `precision as goldenPrecision`.
 *
 * NOTE(review): the `$1` suffixes and the single-statement layout look like
 * bundler output — presumably this list is generated, so changes belong in the
 * source modules rather than here. Confirm against the build setup.
 */
export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, 
type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type 
FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type 
InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type 
TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, 
createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, 
partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, 
wranglerDeployRunner };
|