npm - @tangle-network/agent-eval - Versions diffs - 0.17.1 → 0.17.3 - Mend

@tangle-network/agent-eval 0.17.1 → 0.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -513,1052 +513,1562 @@ declare function formatDriverReport(results: DriverResult[]): string;
 declare function printDriverSummary(results: DriverResult[]): void;
 /**
- * Normalize scores so all dimensions follow "higher = better".
- * Inverted dimensions (hallucination, false_confidence, worst_failure)
- * already use inverted scoring in the prompt (10 = no hallucination),
- * but this function ensures consistency if raw scores leak through.
- */
-declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
-/** Weighted mean — falls back to uniform weights when omitted */
-declare function weightedMean(scores: {
-    score: number;
-    weight?: number;
-}[]): number;
-/** Bootstrap confidence interval */
-declare function confidenceInterval(scores: number[], confidence?: number): {
-    mean: number;
-    lower: number;
-    upper: number;
-};
-/**
- * Inter-rater reliability — simplified Krippendorff's alpha.
+ * TraceSchema v1 — the canonical data model for agent-eval.
  *
- * Each inner array is one judge's scores for all items.
- * All arrays must have the same length (same items scored).
- */
-declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
-/**
- * Mann-Whitney U test for comparing two independent groups.
- * Returns U statistic and approximate p-value (normal approximation).
- */
-declare function mannWhitneyU(a: number[], b: number[]): {
-    u: number;
-    p: number;
-};
-/** Partial credit: returns 0-1 ratio of current toward target */
-declare function partialCredit(current: number, target: number): number;
-/**
- * Paired t-test — before/after measurements on the SAME items.
- * Pairing removes inter-item variance, giving tighter significance than
- * an unpaired test when comparing prompt v1 vs prompt v2 on identical
- * scenarios.
- */
-declare function pairedTTest(before: number[], after: number[]): {
-    t: number;
-    df: number;
-    p: number;
-};
-/**
- * Wilcoxon signed-rank test — paired non-parametric alternative.
- * Use when the differences aren't normally distributed.
+ * Every score, every failure class, every pipeline in the framework is
+ * a view over this data. Shape it once, live with it.
+ *
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
+ * entities that OTEL leaves as free-form attributes.
  */
-declare function wilcoxonSignedRank(before: number[], after: number[]): {
-    w: number;
-    p: number;
-};
+declare const TRACE_SCHEMA_VERSION = "1.0.0";
+type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
+interface BudgetSpec {
+    tokens?: number;
+    wallMs?: number;
+    calls?: number;
+    usd?: number;
+}
+interface RunOutcome$1 {
+    score?: number;
+    pass?: boolean;
+    failureClass?: FailureClass;
+    notes?: string;
+}
 /**
- * Cohen's d — standardized effect size for two independent groups.
- * Positive d means group b has higher mean than group a.
- * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
+ * Layer — optional classification in a nested build workflow.
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
+ * `app-runtime`: a run of the generated agent against a domain scenario.
+ * `meta`: any meta-eval (judge replay, correlation analysis).
  */
-declare function cohensD(a: number[], b: number[]): number;
+type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
+interface Run$1 {
+    runId: string;
+    scenarioId: string;
+    variantId?: string;
+    datasetVersion?: string;
+    /** Git SHA of agent code at run time. */
+    codeSha?: string;
+    /** Hash of the prompt template + any system prompt. */
+    promptSha?: string;
+    /** Model id + date + system-prompt hash, concatenated. */
+    modelFingerprint?: string;
+    seed?: number;
+    /** Arbitrary environment markers (shell, docker version, tz). */
+    envFingerprint?: Record<string, string>;
+    /** Version of the redaction rules applied to this run. */
+    redactionVersion?: string;
+    /** Parent run in a nested build workflow. A builder run's children are
+     *  app-build runs; those children are app-runtime runs. */
+    parentRunId?: string;
+    /** Stable project identifier — groups runs across chats + sessions. */
+    projectId?: string;
+    /** Chat/conversation identifier within a project. */
+    chatId?: string;
+    /** Layer classification — hint for aggregation; not enforced. */
+    layer?: RunLayer;
+    startedAt: number;
+    endedAt?: number;
+    status: RunStatus;
+    outcome?: RunOutcome$1;
+    budget?: BudgetSpec;
+    /** Free-form labels for downstream grouping. */
+    tags?: Record<string, string>;
+}
+type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
+type SpanStatus = 'ok' | 'error';
+interface SpanBase {
+    spanId: string;
+    parentSpanId?: string;
+    runId: string;
+    kind: SpanKind;
+    name: string;
+    startedAt: number;
+    endedAt?: number;
+    status?: SpanStatus;
+    error?: string;
+    /** Anything not covered by typed fields. Kept deliberately free-form. */
+    attributes?: Record<string, unknown>;
+}
+interface Message {
+    role: 'system' | 'user' | 'assistant' | 'tool';
+    content: string;
+    tokens?: number;
+    /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
+    images?: Array<{
+        artifactId?: string;
+        url?: string;
+        mime?: string;
+    }>;
+}
+interface LlmSpan extends SpanBase {
+    kind: 'llm';
+    model: string;
+    messages: Message[];
+    output?: string;
+    inputTokens?: number;
+    outputTokens?: number;
+    cachedTokens?: number;
+    reasoningTokens?: number;
+    costUsd?: number;
+    finishReason?: string;
+}
+interface ToolSpan extends SpanBase {
+    kind: 'tool';
+    toolName: string;
+    args: unknown;
+    result?: unknown;
+    latencyMs?: number;
+}
+interface RetrievalSpan extends SpanBase {
+    kind: 'retrieval';
+    query: string;
+    hits: Array<{
+        docId: string;
+        score: number;
+        content?: string;
+    }>;
+}
+interface JudgeSpan extends SpanBase {
+    kind: 'judge';
+    judgeId: string;
+    /** Span this judgment applies to. */
+    targetSpanId: string;
+    dimension: string;
+    /** Numeric score (free-range; interpretation up to the judge). */
+    score: number;
+    rationale?: string;
+    evidence?: string;
+}
+interface SandboxSpan extends SpanBase {
+    kind: 'sandbox';
+    image?: string;
+    command?: string;
+    exitCode?: number;
+    testsTotal?: number;
+    testsPassed?: number;
+    stdoutHash?: string;
+    stderrHash?: string;
+    /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
+    wallMs?: number;
+}
+interface GenericSpan extends SpanBase {
+    kind: 'agent' | 'custom';
+}
+type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
+type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
+interface TraceEvent {
+    eventId: string;
+    runId: string;
+    spanId?: string;
+    kind: EventKind;
+    timestamp: number;
+    payload: Record<string, unknown>;
+}
+interface BudgetLedgerEntry {
+    runId: string;
+    dimension: keyof BudgetSpec;
+    limit: number;
+    consumed: number;
+    remaining: number;
+    timestamp: number;
+    breached: boolean;
+    /** Span that triggered this entry, if any. */
+    spanId?: string;
+}
+interface Artifact$1 {
+    artifactId: string;
+    runId: string;
+    spanId?: string;
+    contentType: string;
+    sizeBytes: number;
+    /** sha256 in hex. */
+    hash: string;
+    /** External storage URL (R2, S3, filesystem path). */
+    storageUrl?: string;
+    /** Inline content for small blobs — keep under ~64KB. */
+    inlineContent?: string;
+}
+type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
+declare const FAILURE_CLASSES: readonly FailureClass[];
+declare function isLlmSpan(s: Span): s is LlmSpan;
+declare function isToolSpan(s: Span): s is ToolSpan;
+declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
+declare function isJudgeSpan(s: Span): s is JudgeSpan;
+declare function isSandboxSpan(s: Span): s is SandboxSpan;
-/**
- * ConvergenceTracker — tracks completion percentage over turns.
- *
- * Produces convergence curves showing how quickly the agent reaches
- * completion criteria.
- */
-declare class ConvergenceTracker {
-    private criteria;
-    private history;
-    constructor(criteria: CompletionCriterion[]);
-    /** Evaluate criteria against current state, record result */
-    record(turn: number, state: DriverState): {
-        completionPercent: number;
-        complete: boolean;
-        criteriaStatus: Record<string, boolean | number>;
+interface RunFilter {
+    scenarioId?: string;
+    variantId?: string;
+    status?: RunStatus;
+    since?: number;
+    until?: number;
+    tag?: {
+        key: string;
+        value: string;
     };
-    /** Get convergence curve */
-    getCurve(): number[];
-    /** Get full history with per-criterion status */
-    getHistory(): {
-        turn: number;
-        completionPercent: number;
-        criteriaStatus: Record<string, boolean | number>;
-    }[];
-    /** Find the turn where completion first reached 100% (or null) */
-    getTurnToCompletion(): number | null;
+    parentRunId?: string;
+    projectId?: string;
+    chatId?: string;
+    layer?: RunLayer;
+}
+interface SpanFilter {
+    runId?: string;
+    parentSpanId?: string;
+    kind?: SpanKind;
+    name?: string;
+    toolName?: string;
+    judgeId?: string;
+    since?: number;
+    until?: number;
+}
+interface EventFilter {
+    runId?: string;
+    spanId?: string;
+    kind?: EventKind;
+    since?: number;
+    until?: number;
+}
+interface TraceStore {
+    appendRun(run: Run$1): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact$1): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run$1 | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run$1[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact$1[]>;
+}
+declare class InMemoryTraceStore implements TraceStore {
+    private runs;
+    private allSpans;
+    private allEvents;
+    private allArtifacts;
+    private allBudget;
+    appendRun(run: Run$1): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact$1): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run$1 | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run$1[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact$1[]>;
+}
+interface FileSystemTraceStoreOptions {
+    dir: string;
+    /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
+    maxBytes?: number;
+}
+declare class FileSystemTraceStore implements TraceStore {
+    private dir;
+    private maxBytes;
+    /** Lazy in-memory index for queries — populated on first read. */
+    private index?;
+    private loaded;
+    constructor(options: FileSystemTraceStoreOptions);
+    private ensureDir;
+    private append;
+    private insertInto;
+    private load;
+    appendRun(run: Run$1): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact$1): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run$1 | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run$1[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact$1[]>;
 }
 /**
- * Versioned prompt registry.
- *
- * Every prompt used in an eval run is registered with an explicit version.
- * Reports include the content hash so A/B compares are rigorous: if the
- * hash changes between two reports, the prompt actually changed; if it
- * matches, the variance is elsewhere.
+ * TraceEmitter — hierarchical span builder that auto-parents using an
+ * internal stack. One emitter per Run; emitters do NOT share state.
  *
- * Hash is SHA-256(content), truncated to 12 hex chars for readability.
- * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
+ * Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
+ * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
+ * have to thread spanIds manually. For async workflows that can't use
+ * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
+ * explicitly.
  */
-interface PromptHandle {
-    /** Stable human-readable id, e.g. 'legal.system' */
-    id: string;
-    /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
-    version: string;
-    /** SHA-256 of content, 12-hex-char prefix */
-    hash: string;
-    /** Full prompt body */
-    content: string;
+interface SpanHandle<S extends Span = Span> {
+    span: S;
+    end(patch?: Partial<S>): Promise<void>;
+    fail(error: string | Error, patch?: Partial<S>): Promise<void>;
 }
-declare class PromptRegistry {
-    private readonly entries;
+interface TraceEmitterOptions {
+    runId?: string;
+    /** Inject a clock for deterministic tests. */
+    now?: () => number;
+    /** Inject an id generator for deterministic tests. */
+    id?: () => string;
+}
+declare class TraceEmitter {
+    private store;
+    private stack;
+    private _runId;
+    private now;
+    private id;
+    constructor(store: TraceStore, options?: TraceEmitterOptions);
+    get runId(): string;
+    startRun(run: Omit<Run$1, 'runId' | 'startedAt' | 'status'>): Promise<Run$1>;
+    endRun(outcome?: RunOutcome$1): Promise<void>;
+    abortRun(reason: string): Promise<void>;
+    span<S extends Span = Span>(init: {
+        kind: SpanKind;
+        name: string;
+        parentSpanId?: string;
+        attributes?: Record<string, unknown>;
+    } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
+    private handle;
+    private pop;
+    llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
+    tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
+    retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
+    recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
+    sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
+    emit(event: {
+        kind: EventKind;
+        spanId?: string;
+        payload?: Record<string, unknown>;
+    }): Promise<TraceEvent>;
+    recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
+        timestamp?: number;
+    }): Promise<BudgetLedgerEntry>;
+    recordArtifact(artifact: Omit<Artifact$1, 'artifactId' | 'runId'>): Promise<Artifact$1>;
     /**
-     * Register a prompt. Re-registering the same id+version with DIFFERENT
-     * content throws — versions are immutable. Re-registering with the SAME
-     * content is a no-op (idempotent).
-     */
-    register(id: string, version: string, content: string): Promise<PromptHandle>;
-    /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
-    get(id: string, version: string): PromptHandle;
-    /** Return all versions of an id, newest-first (lex-descending on version). */
-    listVersions(id: string): PromptHandle[];
-    /** Snapshot the whole registry — useful for including in reports. */
-    list(): PromptHandle[];
-    /** Verify a hash against registered content. Returns null if not found. */
-    verifyHash(id: string, version: string, expectedHash: string): boolean | null;
+     * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
+     * Returns the fn's return value. Use this for the 95% case.
+     */
+    within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
 }
-/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
-declare function hashContent(content: string): Promise<string>;
+/** Helper to build an LLM span handle args object from a provider-shaped response. */
+declare function llmSpanFromProvider(args: {
+    name?: string;
+    model: string;
+    messages: Message[];
+    output: string;
+    usage?: {
+        inputTokens?: number;
+        outputTokens?: number;
+        cachedTokens?: number;
+        reasoningTokens?: number;
+    };
+    costUsd?: number;
+    finishReason?: string;
+}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
 /**
- * Anti-slop quality judge.
+ * Policy-based agent control runtime.
  *
- * Deterministic pattern-based quality check — no LLM call. Catches the
- * 80% of AI slop that every production agent leaks:
- *   - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
- *   - N-gram repetition (same phrase over and over)
- *   - Hedging overuse ("I could be wrong, but...")
- *   - Apology padding ("I'm so sorry for the confusion...")
- *   - Unused opening formulas ("Great question!")
- *   - Length bounds (too short to be useful, too long to be read)
+ * This is the minimal reusable loop behind driver-agent patterns:
  *
- * Produces a JudgeScore in the same shape as LLM judges so it composes into
- * `BenchmarkRunner`'s judge array transparently.
+ *   observe state -> validate -> decide next action -> act -> observe -> ...
+ *
+ * It deliberately does not model named "topologies". Direct execution,
+ * critic/revise, driver intervention, specialist calls, and human escalation
+ * are all just actions chosen by the control policy.
  */
-interface AntiSlopConfig {
-    /** Domain label — appears in the JudgeScore output */
-    domain?: string;
-    /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
-    bannedPhrases?: string[];
-    /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
-    bannedOpenings?: RegExp[];
-    /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
-    hedgingPatterns?: RegExp[];
-    /** Regexes matching apology padding. */
-    apologyPatterns?: RegExp[];
-    /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
-    repetitionThreshold?: number;
-    /** Min output length in chars; below this the turn is deemed too terse. */
-    minLength?: number;
-    /** Max output length in chars; above this the turn is deemed too verbose. */
-    maxLength?: number;
-    /** How heavily each violation class reduces the score (default 1). */
-    penaltyWeights?: Partial<Record<SlopCategory, number>>;
+type ControlSeverity = 'info' | 'warning' | 'error' | 'critical';
+type ControlActionFailureMode = 'continue' | 'stop';
+interface ControlEvalResult {
+    /** Stable validator or judge id. */
+    id: string;
+    /** Whether this check passed. */
+    passed: boolean;
+    /** Optional normalized score. 1 = best, 0 = worst. */
+    score?: number;
+    /** Objective validators should usually be "error" or "critical" when failed. */
+    severity?: ControlSeverity;
+    /** Human-readable result. */
+    detail?: string;
+    /** Small evidence string or pointer. Avoid large payloads. */
+    evidence?: string;
+    /** True when the result came from deterministic state, not LLM judgment. */
+    objective?: boolean;
 }
-type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
-/** Create a reusable Judge function from an anti-slop config. */
-declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
-interface AntiSlopIssue {
-    category: SlopCategory;
-    detail: string;
-    example?: string;
+interface ControlBudget {
+    maxSteps: number;
+    maxWallMs?: number;
+    maxCostUsd?: number;
 }
-interface AntiSlopReport {
-    /** 0–10 score; 10 is clean, lower values mean more slop. */
-    score: number;
-    issues: AntiSlopIssue[];
-    /** Count of each category for programmatic aggregation. */
-    counts: Record<SlopCategory, number>;
+interface ControlStopPolicies<TState, TAction> {
+    /**
+     * Stop after N consecutive steps with no state fingerprint change and
+     * less than `minScoreDelta` score movement. Disabled when omitted.
+     */
+    maxNoProgressSteps?: number;
+    /**
+     * Stop after the same action fingerprint is selected N consecutive
+     * times. Disabled when omitted.
+     */
+    maxRepeatedActions?: number;
+    /** Minimum score movement that counts as progress. Default 0.001. */
+    minScoreDelta?: number;
+    /** Override the default JSON/string fingerprint for state comparisons. */
+    stateFingerprint?: (state: TState) => string;
+    /** Override the default JSON/string fingerprint for repeated-action checks. */
+    actionFingerprint?: (action: TAction) => string;
+}
+interface ControlContext<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
+    intent: string;
+    state: TState;
+    evals: TEval[];
+    history: ControlStep<TState, TAction, TActionResult, TEval>[];
+    budget: ControlBudget;
+    stepIndex: number;
+    wallMs: number;
+    spentCostUsd: number;
+    remainingCostUsd?: number;
+    abortSignal: AbortSignal;
+    emitter?: TraceEmitter;
 }
-/**
- * Pure function — analyze one or more outputs against the config. Exposed
- * separately so consumers can build their own reporters on top.
- */
-declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
-    penaltyWeights: Record<SlopCategory, number>;
-}): AntiSlopReport;
+type ControlDecision<TAction> = {
+    type: 'continue';
+    action: TAction;
+    reason?: string;
+} | {
+    type: 'stop';
+    reason: string;
+    pass?: boolean;
+    score?: number;
+};
+interface StopDecision {
+    stop: boolean;
+    pass: boolean;
+    reason: string;
+    score?: number;
+    failureClass?: FailureClass;
+}
+interface ControlActionOutcome<TActionResult> {
+    ok: boolean;
+    result?: TActionResult;
+    error?: string;
+    costUsd?: number;
+    durationMs: number;
+}
+interface ControlRuntimeError {
+    phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace';
+    stepIndex: number;
+    message: string;
+}
+interface ControlStep<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
+    index: number;
+    decision: ControlDecision<TAction>;
+    beforeState: TState;
+    afterState: TState;
+    evalsBefore: TEval[];
+    evalsAfter: TEval[];
+    actionOutcome?: ControlActionOutcome<TActionResult>;
+    startedAt: string;
+    endedAt: string;
+}
+interface ControlRunResult<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
+    intent: string;
+    pass: boolean;
+    completed: boolean;
+    reason: string;
+    score?: number;
+    steps: ControlStep<TState, TAction, TActionResult, TEval>[];
+    finalState: TState | undefined;
+    finalEvals: TEval[];
+    wallMs: number;
+    spentCostUsd: number;
+    runId: string | null;
+    failureClass?: FailureClass;
+    runtimeErrors: ControlRuntimeError[];
+    stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error';
+}
+interface ControlRuntimeConfig<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
+    intent: string;
+    budget?: Partial<ControlBudget>;
+    signal?: AbortSignal;
+    /** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */
+    actionFailure?: ControlActionFailureMode;
+    /**
+     * Extract cost from an action result. Used for `maxCostUsd` budget
+     * enforcement and trace budget ledger emission.
+     */
+    getActionCostUsd?: (ctx: {
+        action: TAction;
+        result: TActionResult;
+        state: TState;
+        evals: TEval[];
+        history: ControlStep<TState, TAction, TActionResult, TEval>[];
+    }) => number | undefined;
+    /** Read typed task/product state. Prefer structured state over transcript-only context. */
+    observe: (ctx: {
+        history: ControlStep<TState, TAction, TActionResult, TEval>[];
+        abortSignal: AbortSignal;
+    }) => Promise<TState> | TState;
+    /** Objective validators first, subjective judges only where objective state is insufficient. */
+    validate: (ctx: {
+        intent: string;
+        state: TState;
+        history: ControlStep<TState, TAction, TActionResult, TEval>[];
+        abortSignal: AbortSignal;
+    }) => Promise<TEval[]> | TEval[];
+    /** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */
+    decide: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>;
+    /** Execute the action selected by the policy. */
+    act: (action: TAction, ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<TActionResult> | TActionResult;
+    /** Final stopping policy. Called before decide and after each action. */
+    shouldStop?: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<StopDecision> | StopDecision;
+    /** Optional hook for tracing or live progress updates. */
+    onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void;
+    /** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */
+    stopPolicies?: ControlStopPolicies<TState, TAction>;
+    /** Optional trace sink. Emits one run plus one span per control step. */
+    store?: TraceStore;
+    scenarioId?: string;
+    projectId?: string;
+    variantId?: string;
+}
+declare function runAgentControlLoop<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>>;
+declare function stopOnNoProgress<TState, TAction>(maxNoProgressSteps: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'>): ControlStopPolicies<TState, TAction>;
+declare function stopOnRepeatedAction<TState, TAction>(maxRepeatedActions: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'>): ControlStopPolicies<TState, TAction>;
+declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
+declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
+declare function allCriticalPassed(evals: ControlEvalResult[]): boolean;
 /**
- * Artifact validators.
+ * Dataset — versioned, sliceable, content-hashed scenario collection.
  *
- * Generic "score a produced artifact" primitive. Tax uses it for PDF form
- * correctness, legal for contract clauses, film for script breakdowns, GTM
- * for social posts. One interface, many validators; all plug into
- * `BenchmarkRunner` the same way.
+ * Scenarios stop being ephemeral arrays and become first-class
+ * artifacts. Every Dataset carries:
+ *   - content hash (sha256 over canonicalized scenario array)
+ *   - provenance (contributor, createdAt, sourceUrl)
+ *   - split labels (train | dev | test | holdout)
+ *   - difficulty tiers (easy | medium | hard | extreme)
+ *   - tags (free-form, per-scenario)
  *
- * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
- * plus a `ValidationContext` (scenario id, the turns that produced it) and
- * returns a `ValidationResult` with pass/fail + 0..1 score + structured
- * issues.
+ * `Dataset.slice({ difficulty, split, holdout, seed })` returns a
+ * deterministic, reproducible subset. Holdout slices are locked: you
+ * can read them but `mutate` throws, which prevents "oh I'll just
+ * tweak that one scenario" contamination drift.
  */
-interface Artifact$1 {
-    /** Logical kind — validators type-guard on this */
-    kind: 'file' | 'json' | 'text' | 'binary' | string;
-    /** Filesystem-style path, optional */
-    path?: string;
-    /** String content for text/json/file kinds */
-    content?: string;
-    /** Binary content (if kind === 'binary') */
-    bytes?: Uint8Array;
-    /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
-    metadata?: Record<string, unknown>;
+type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
+type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
+interface DatasetScenario {
+    id: string;
+    /** Arbitrary payload; the framework doesn't interpret it. */
+    payload: unknown;
+    split?: DatasetSplit;
+    difficulty?: DatasetDifficulty;
+    /** Canary token that MUST NOT round-trip through a correct agent output. */
+    canary?: string;
+    tags?: Record<string, string>;
 }
-interface ValidationContext {
-    scenarioId: string;
-    turnIndex?: number;
-    /** Prior artifacts for multi-artifact scenarios */
-    priorArtifacts?: Artifact$1[];
-    /** Free-form hints the validator uses for domain-specific checks */
-    hints?: Record<string, unknown>;
+interface DatasetProvenance {
+    contributor?: string;
+    createdAt: string;
+    sourceUrl?: string;
+    license?: string;
+    description?: string;
+    /** Monotonic human-readable version (e.g. "2026.04.20"). */
+    version: string;
 }
-interface ValidationIssue {
-    severity: 'error' | 'warning' | 'info';
-    message: string;
-    /** Optional path into the artifact (e.g. JSON path or byte offset) */
-    locus?: string;
+interface DatasetManifest {
+    name: string;
+    provenance: DatasetProvenance;
+    /** sha256 hex over canonicalized scenarios. */
+    contentHash: string;
+    scenarioCount: number;
+    splitCounts: Record<DatasetSplit, number>;
 }
-interface ValidationResult {
-    pass: boolean;
-    /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
-    score: number;
-    issues: ValidationIssue[];
-    /** Diagnostic payload for reporters */
-    evidence?: Record<string, unknown>;
+interface SliceOptions {
+    split?: DatasetSplit;
+    difficulty?: DatasetDifficulty;
+    /** Number of scenarios (random sample, seeded). Omit to take all that match. */
+    limit?: number;
+    seed?: number;
+    /** Predicate narrowing. Applied after split/difficulty filters. */
+    filter?: (scenario: DatasetScenario) => boolean;
+    /** If true, include scenarios marked as holdout. Default false. */
+    includeHoldout?: boolean;
 }
-interface ArtifactValidator {
-    /** Stable identifier for the validator; appears in reports. */
-    name: string;
-    /** Optional description for human-facing reports. */
-    description?: string;
-    /** Called once per artifact; validators are expected to be pure + idempotent. */
-    validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
+/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
+declare class HoldoutLockedError extends Error {
+    constructor(datasetName: string);
 }
-/**
- * Run every validator on the same artifact; aggregate pass as AND, score as
- * (weighted) mean, issues concatenated. Weights default to 1 each.
- */
-declare function composeValidators(validators: ArtifactValidator[], options?: {
-    name?: string;
-    weights?: number[];
-}): ArtifactValidator;
-/** Pass if the artifact body matches a provided regex. */
-declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
-/** Pass if JSON parses and every required key is present. */
-declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
-/** Pass if min ≤ byte length ≤ max. */
-declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
-/** Pass if the artifact contains every required substring (case-insensitive by default). */
-declare function containsAll(name: string, required: string[], options?: {
-    caseSensitive?: boolean;
-}): ArtifactValidator;
+declare class Dataset {
+    readonly name: string;
+    readonly provenance: DatasetProvenance;
+    private scenarios;
+    private locked;
+    constructor(init: {
+        name: string;
+        provenance: DatasetProvenance;
+        scenarios: DatasetScenario[];
+        locked?: boolean;
+    });
+    /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
+    all(): readonly DatasetScenario[];
+    get size(): number;
+    /**
+     * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
+     * the same arguments always produce the same slice across machines.
+     */
+    slice(options?: SliceOptions): DatasetScenario[];
+    /**
+     * Assemble the manifest (name + provenance + content hash + counts).
+     * Content hash is deterministic over canonicalized scenarios.
+     */
+    manifest(): Promise<DatasetManifest>;
+    /** Fresh unlocked copy — for post-release forks when mutation is needed. */
+    clone(overrides?: Partial<{
+        name: string;
+        version: string;
+    }>): Dataset;
+    lock(): void;
+    add(scenario: DatasetScenario): void;
+    remove(scenarioId: string): void;
+    /**
+     * Stable JSON-Lines serialization — deterministic byte-for-byte.
+     * Write to disk for contamination-verifiable archives.
+     */
+    toJsonl(): string;
+    static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
+}
+declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
 /**
- * Workspace inspector — score the persisted state of an agent after a run.
+ * Prompt optimizer — A/B test prompt variants with statistical rigor.
  *
- * Many evals don't ask "did the response say the right thing" but "did the
- * agent put the right rows in the DB / files in the vault / entities on the
- * canvas". This is the primitive for that.
+ * Runs N prompt variants against a fixed scenario set, collects per-scenario
+ * scores via the user-provided `scoreVariant` callback, and returns:
+ *   - per-variant mean + bootstrap CI
+ *   - pairwise significance (Mann-Whitney, non-parametric — works on any
+ *     score distribution, not just normal)
+ *   - a winner (highest mean, flagged if the lead is not significant)
  *
- * Implementations read from D1, KV, filesystem, or any store — the interface
- * is deliberately small so consumers plug in their own backends.
+ * Deliberately generic — the `scoreVariant` callback does whatever domain
+ * work the consumer needs (invoke the agent, judge the output, whatever),
+ * and returns a number per scenario. This lets the optimizer stay small +
+ * testable.
  */
-interface WorkspaceSnapshot {
-    /** Vault files: logical path → content */
-    files: Record<string, string>;
-    /** DB rows: table name → array of rows (post-validation) */
-    rows: Record<string, Array<Record<string, unknown>>>;
-    /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
-    kv: Record<string, string>;
-    /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
-    blobs?: Record<string, {
-        size: number;
-        hash?: string;
-        mimeType?: string;
+interface PromptVariant$1 {
+    id: string;
+    prompt: string;
+    metadata?: Record<string, unknown>;
+}
+interface OptimizationConfig {
+    variants: PromptVariant$1[];
+    /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
+    trialsPerScenario?: number;
+    /** Significance threshold for pairwise comparison (default 0.05). */
+    significanceLevel?: number;
+    /**
+     * The scoring callback. For each (variant, scenarioId, trialIndex), produce
+     * a score in 0..1 (or any numeric range — the optimizer only cares about
+     * monotonicity).
+     */
+    scoreVariant: (args: {
+        variant: PromptVariant$1;
+        scenarioId: string;
+        trialIndex: number;
+    }) => Promise<number>;
+    /** Scenario ids to run against. */
+    scenarioIds: string[];
+    /** Optional hook — fires after each (variant, scenario) fully scored. */
+    onScenarioComplete?: (info: {
+        variantId: string;
+        scenarioId: string;
+        scores: number[];
+    }) => void;
+}
+interface VariantScore {
+    variantId: string;
+    mean: number;
+    ci95: {
+        lower: number;
+        upper: number;
+    };
+    n: number;
+    perScenario: Record<string, {
+        mean: number;
+        n: number;
+        samples: number[];
     }>;
 }
-interface InspectorContext {
-    /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
-    scopeId: string;
-    /** Optional scenario id — allows scenario-specific snapshot shaping */
-    scenarioId?: string;
+interface PairwiseComparison {
+    variantA: string;
+    variantB: string;
+    pValue: number;
+    /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
+    qValue: number;
+    /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
+    significant: boolean;
+    meanDelta: number;
 }
-interface WorkspaceInspector {
-    name: string;
-    snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
+interface OptimizationResult {
+    winner: {
+        variantId: string;
+        /** True when the winner's lead vs every other variant is statistically significant. */
+        significant: boolean;
+        ciLowerBoundExceedsSecondMean: boolean;
+    };
+    scores: VariantScore[];
+    pairwise: PairwiseComparison[];
+    config: {
+        trialsPerScenario: number;
+        significanceLevel: number;
+        variants: string[];
+        scenarios: string[];
+    };
 }
-declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
-    readonly name = "in-memory";
-    private readonly snapshots;
-    set(scopeId: string, snapshot: WorkspaceSnapshot): void;
-    snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
+declare class PromptOptimizer {
+    run(config: OptimizationConfig): Promise<OptimizationResult>;
 }
-interface WorkspaceAssertion {
-    name: string;
-    description?: string;
-    check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
+interface RunScore {
+    success: number;
+    goalProgress: number;
+    repoGroundedness: number;
+    driftPenalty: number;
+    toolUseQuality: number;
+    patchQuality: number;
+    testReality: number;
+    finalGate: number;
+    reviewerBlockers: number;
+    costUsd: number;
+    wallSeconds: number;
+    notes?: string[];
 }
-interface WorkspaceAssertionResult {
-    pass: boolean;
-    /** 0..1 — partial credit for assertions that admit it */
-    score: number;
-    detail?: string;
+interface RunScoreWeights {
+    success: number;
+    goalProgress: number;
+    repoGroundedness: number;
+    driftPenalty: number;
+    toolUseQuality: number;
+    patchQuality: number;
+    testReality: number;
+    finalGate: number;
+    reviewerBlockers: number;
+    costUsd: number;
+    wallSeconds: number;
 }
-declare function fileExists(path: string): WorkspaceAssertion;
-declare function fileContains(path: string, needle: string): WorkspaceAssertion;
-declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
-declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
-    min?: number;
-}): WorkspaceAssertion;
-/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
-declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
-    pass: boolean;
-    score: number;
-    results: Array<{
-        assertion: string;
-        result: WorkspaceAssertionResult;
-    }>;
-};
+declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
+declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
+declare function clamp01(value: number): number;
-/**
- * Experiment tracker — group runs, diff them, watch scores move over time.
- *
- * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
- *   - A run has a config (prompt hash, model, scenario ids, seed)
- *   - Runs belong to experiments (named groups)
- *   - The store is pluggable (in-memory for tests, filesystem for local,
- *     custom for Langfuse/D1)
- *   - Diffs show score deltas, new/dropped scenarios, and config changes
- *
- * The output plugs directly into `BenchmarkReport` — runs archive the full
- * report, diff operates on the summary.
- */
+interface SteeringRolePrompt {
+    system?: string;
+    append?: string;
+}
+interface SteeringBundle {
+    id: string;
+    coderPrompt?: string;
+    continuePrompt?: string;
+    reviewerPrompts?: Record<string, string>;
+    skills?: string[];
+    rolePrompts?: Record<string, SteeringRolePrompt>;
+    metadata?: Record<string, unknown>;
+}
+interface SteeringDelta {
+    coderPrompt?: string;
+    continuePrompt?: string;
+    reviewerPrompts?: Record<string, string>;
+    skills?: string[];
+    rolePrompts?: Record<string, SteeringRolePrompt>;
+    metadata?: Record<string, unknown>;
+}
+declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
+declare function renderSteeringText(bundle: SteeringBundle): string;
+interface OptimizationExample {
+    scenarioId: string;
+    metadata?: Record<string, unknown>;
+}
+interface SteeringEvaluation {
+    variant: SteeringBundle;
+    example: OptimizationExample;
+    trialIndex: number;
+}
+interface SteeringVariantReport {
+    variantId: string;
+    bundle: SteeringBundle;
+    mean: number;
+    ci95: {
+        lower: number;
+        upper: number;
+    };
+    scenarioScores: Record<string, {
+        mean: number;
+        n: number;
+        samples: number[];
+    }>;
+}
+interface OptimizationLoopResult {
+    winner: SteeringBundle;
+    significant: boolean;
+    reports: SteeringVariantReport[];
+    pairwise: Array<{
+        variantA: string;
+        variantB: string;
+        pValue: number;
+        qValue: number;
+        significant: boolean;
+        meanDelta: number;
+    }>;
+}
+interface OptimizationLoopConfig {
+    variants: SteeringBundle[];
+    examples: OptimizationExample[];
+    evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
+    scoreWeights?: Partial<RunScoreWeights>;
+    trialsPerScenario?: number;
+}
+declare class OptimizationLoop {
+    private readonly optimizer;
+    constructor(optimizer?: PromptOptimizer);
+    run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
+}
-interface RunConfig {
-    experimentId: string;
-    name?: string;
-    model?: string;
-    promptHash?: string;
-    promptVersion?: string;
-    seed?: number;
+type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
+type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
+type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
+type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
+interface FeedbackTask {
+    intent: string;
+    context?: unknown;
+}
+interface ProposedSideEffect {
+    type: string;
+    risk?: 'low' | 'medium' | 'high';
+    costUsd?: number;
+    externalSideEffect?: boolean;
+    requiresApproval?: boolean;
     metadata?: Record<string, unknown>;
 }
-interface Run$1 {
+interface FeedbackLabel {
+    id?: string;
+    source: FeedbackLabelSource;
+    kind: FeedbackLabelKind;
+    value: unknown;
+    reason?: string;
+    severity?: FeedbackSeverity;
+    createdAt: string;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackAttempt {
     id: string;
-    experimentId: string;
-    name?: string;
-    config: RunConfig;
-    startedAt: string;
-    completedAt?: string;
-    status: 'running' | 'completed' | 'failed';
-    report?: BenchmarkReport;
-    error?: string;
+    stepIndex: number;
+    artifactType: FeedbackArtifactType;
+    artifact: unknown;
+    options?: unknown[];
+    proposedAction?: ProposedSideEffect;
+    evals?: ControlEvalResult[];
+    feedback?: FeedbackLabel[];
+    createdAt: string;
+    metadata?: Record<string, unknown>;
 }
-interface Experiment {
+interface FeedbackOutcome {
+    success?: boolean;
+    score?: number;
+    metrics?: Record<string, number>;
+    costUsd?: number;
+    detail?: string;
+    observedAt?: string;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackTrajectory {
     id: string;
-    name: string;
+    projectId?: string;
+    scenarioId?: string;
+    task: FeedbackTask;
+    attempts: FeedbackAttempt[];
+    labels: FeedbackLabel[];
+    outcome?: FeedbackOutcome;
+    split?: DatasetSplit;
+    tags?: Record<string, string>;
     createdAt: string;
+    updatedAt?: string;
     metadata?: Record<string, unknown>;
 }
-interface ExperimentStore {
-    saveExperiment(exp: Experiment): Promise<void>;
-    getExperiment(id: string): Promise<Experiment | null>;
-    listExperiments(): Promise<Experiment[]>;
-    saveRun(run: Run$1): Promise<void>;
-    getRun(id: string): Promise<Run$1 | null>;
-    listRuns(experimentId: string): Promise<Run$1[]>;
+interface FeedbackTrajectoryStore {
+    save(trajectory: FeedbackTrajectory): Promise<void>;
+    get(id: string): Promise<FeedbackTrajectory | null>;
+    list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
+    appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
+    appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
 }
-declare class InMemoryExperimentStore implements ExperimentStore {
-    private readonly experiments;
-    private readonly runs;
-    saveExperiment(exp: Experiment): Promise<void>;
-    getExperiment(id: string): Promise<Experiment | null>;
-    listExperiments(): Promise<Experiment[]>;
-    saveRun(run: Run$1): Promise<void>;
-    getRun(id: string): Promise<Run$1 | null>;
-    listRuns(experimentId: string): Promise<Run$1[]>;
+interface FeedbackTrajectoryFilter {
+    projectId?: string;
+    scenarioId?: string;
+    split?: DatasetSplit;
+    tag?: [string, string];
 }
-declare class ExperimentTracker {
-    private readonly store;
-    constructor(store: ExperimentStore);
-    startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
-    startRun(config: RunConfig): Promise<Run$1>;
-    completeRun(runId: string, report: BenchmarkReport): Promise<void>;
-    failRun(runId: string, error: string): Promise<void>;
-    /**
-     * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
-     * and config changes that may explain the movement.
-     */
-    diff(runIdA: string, runIdB: string): Promise<RunDiff>;
-    /** Timeline of aggregate scores for an experiment. */
-    timeline(experimentId: string): Promise<Array<{
-        runId: string;
-        startedAt: string;
-        overall: number | null;
-    }>>;
+interface FeedbackSplitPolicy {
+    trainPct?: number;
+    devPct?: number;
+    testPct?: number;
+    holdoutPct?: number;
 }
-interface RunDiff {
-    before: {
-        runId: string;
-        name?: string;
-        startedAt: string;
-    };
-    after: {
-        runId: string;
-        name?: string;
-        startedAt: string;
-    };
-    aggregateDelta: number;
-    scenarios: Array<{
-        scenarioId: string;
-        before: number | null;
-        after: number | null;
-        delta: number | null;
-        status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
-    }>;
-    configChanges: Record<string, {
-        before: unknown;
-        after: unknown;
-    }>;
+interface PreferenceMemoryEntry {
+    instruction: string;
+    rationale: string;
+    weight: number;
+    sourceTrajectoryId: string;
+    sourceLabelId?: string;
+    category?: string;
 }
-/**
- * FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
- *
- * Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
- * files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
- * based rollover. Writes are append-only so the file log doubles as an audit
- * trail of every state transition the tracker ever wrote.
- *
- * Reads lazy-load every NDJSON file in the directory (including rolled-over
- * archives), latest-write-wins per `id`. Subsequent writes update the
- * in-memory index in place so reads after writes are O(1).
- *
- * Node-only — imports `node:fs/promises`. Don't import this from a Worker;
- * use the in-memory store or the D1 store from `./experiment-tracker-d1`.
- */
-interface FileSystemExperimentStoreOptions {
-    /** Directory the NDJSON files live in. Created on first write. */
-    dir: string;
-    /** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
-    maxBytes?: number;
+interface FeedbackOptimizerRow extends OptimizationExample {
+    trajectoryId: string;
+    labelKinds: FeedbackLabelKind[];
+    score?: number;
 }
-declare class FileSystemExperimentStore implements ExperimentStore {
+interface FeedbackReplayResult {
+    trajectoryId: string;
+    pass: boolean;
+    score?: number;
+    labels: FeedbackLabel[];
+    outcome?: FeedbackOutcome;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackReplayAdapter {
+    replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
+}
+declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
+    private readonly trajectories;
+    save(trajectory: FeedbackTrajectory): Promise<void>;
+    get(id: string): Promise<FeedbackTrajectory | null>;
+    list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
+    appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
+    appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
+}
+declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
     private readonly dir;
-    private readonly maxBytes;
-    private index?;
+    private readonly memory;
     private loaded;
-    constructor(options: FileSystemExperimentStoreOptions);
-    saveExperiment(exp: Experiment): Promise<void>;
-    getExperiment(id: string): Promise<Experiment | null>;
-    listExperiments(): Promise<Experiment[]>;
-    saveRun(run: Run$1): Promise<void>;
-    getRun(id: string): Promise<Run$1 | null>;
-    listRuns(experimentId: string): Promise<Run$1[]>;
-    private ensureDir;
+    constructor(options: {
+        dir: string;
+    });
+    save(trajectory: FeedbackTrajectory): Promise<void>;
+    get(id: string): Promise<FeedbackTrajectory | null>;
+    list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
+    appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
+    appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
     private append;
     private load;
 }
+declare function createFeedbackTrajectory(input: {
+    id?: string;
+    projectId?: string;
+    scenarioId?: string;
+    task: FeedbackTask;
+    attempts?: FeedbackAttempt[];
+    labels?: FeedbackLabel[];
+    outcome?: FeedbackOutcome;
+    split?: DatasetSplit;
+    tags?: Record<string, string>;
+    createdAt?: string;
+    metadata?: Record<string, unknown>;
+}): FeedbackTrajectory;
+declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
+declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
+declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
+declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
+declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
+declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
+declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
+declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
+declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
+    maxEntries?: number;
+}): PreferenceMemoryEntry[];
+declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
+declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
+declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
+declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
+    projectId?: string;
+    scenarioId?: string;
+    artifactType?: FeedbackArtifactType;
+    artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
+    proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
+    createdAt?: string;
+}): FeedbackTrajectory;
+interface ActionExecutionPolicy {
+    allowedTypes?: string[];
+    blockedTypes?: string[];
+    alwaysRequireApprovalTypes?: string[];
+    autoApproveTypes?: string[];
+    requireApprovalForExternalSideEffects?: boolean;
+    requireApprovalAboveCostUsd?: number;
+    maxActionCostUsd?: number;
+    remainingBudgetUsd?: number;
+    expectedOutcomeRequired?: boolean;
+    killCriteriaRequired?: boolean;
+}
+interface ActionPolicyDecision {
+    allowed: boolean;
+    blocked: boolean;
+    requiresApproval: boolean;
+    reasons: string[];
+    label?: FeedbackLabel;
+}
+declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
+    createdAt?: string;
+}): ActionPolicyDecision;
 /**
- * D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
- *
- * Workers-safe (uses only the `D1Database` binding the runtime injects). Two
- * tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
- * a Worker route can both write the row at run start and update it at run end
- * without losing the original config — the row's lifecycle mirrors the
- * `Run.status` field one-to-one.
- *
- * Why this lives next to `InMemoryExperimentStore`:
- *   - bad-app, legal-agent, gtm-agent, film-agent all run as Workers
- *   - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
- *   - Hand-rolling D1 SQL in every consumer is exactly the duplication this
- *     module exists to prevent
+ * Normalize scores so all dimensions follow "higher = better".
+ * Inverted dimensions (hallucination, false_confidence, worst_failure)
+ * already use inverted scoring in the prompt (10 = no hallucination),
+ * but this function ensures consistency if raw scores leak through.
+ */
+declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
+/** Weighted mean — falls back to uniform weights when omitted */
+declare function weightedMean(scores: {
+    score: number;
+    weight?: number;
+}[]): number;
+/** Bootstrap confidence interval */
+declare function confidenceInterval(scores: number[], confidence?: number): {
+    mean: number;
+    lower: number;
+    upper: number;
+};
+/**
+ * Inter-rater reliability — simplified Krippendorff's alpha.
  *
- * Schema versioning: the `meta` table records `schema_version` so a future
- * column addition can be detected and migrated additively. Today's schema is
- * v1; bump only on breaking shape changes.
+ * Each inner array is one judge's scores for all items.
+ * All arrays must have the same length (same items scored).
  */
+declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
 /**
- * Minimal `D1Database` shape we depend on. Avoids pulling in
- * `@cloudflare/workers-types` as a hard dep — consumers that already have
- * those types installed can pass the binding directly.
+ * Mann-Whitney U test for comparing two independent groups.
+ * Returns U statistic and approximate p-value (normal approximation).
  */
-interface D1Like {
-    prepare(query: string): D1PreparedStatementLike;
-    batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
-    exec(query: string): Promise<unknown>;
-}
-interface D1PreparedStatementLike {
-    bind(...values: unknown[]): D1PreparedStatementLike;
-    first<T = Record<string, unknown>>(): Promise<T | null>;
-    all<T = Record<string, unknown>>(): Promise<{
-        results: T[];
-    }>;
-    run(): Promise<unknown>;
-}
-interface D1ExperimentStoreOptions {
-    /** D1 binding from `env`. */
-    db: D1Like;
-    /**
-     * Optional table-name prefix so multiple ExperimentStores can share a DB
-     * without colliding (e.g. `tax_eval_experiments` vs `legal_eval_experiments`).
-     * Default: `agent_eval_`.
-     */
-    tablePrefix?: string;
-}
-declare class D1ExperimentStore implements ExperimentStore {
-    private readonly db;
-    private readonly experimentsTable;
-    private readonly runsTable;
-    private readonly metaTable;
-    private schemaReady;
-    constructor(options: D1ExperimentStoreOptions);
-    /**
-     * Idempotent schema setup. Safe to call before every operation; the second
-     * call short-circuits via `schemaReady`. Most consumers will call it once
-     * during Worker bootstrap.
-     */
-    ensureSchema(): Promise<void>;
-    saveExperiment(exp: Experiment): Promise<void>;
-    getExperiment(id: string): Promise<Experiment | null>;
-    listExperiments(): Promise<Experiment[]>;
-    saveRun(run: Run$1): Promise<void>;
-    getRun(id: string): Promise<Run$1 | null>;
-    listRuns(experimentId: string): Promise<Run$1[]>;
+declare function mannWhitneyU(a: number[], b: number[]): {
+    u: number;
+    p: number;
+};
+/** Partial credit: returns 0-1 ratio of current toward target */
+declare function partialCredit(current: number, target: number): number;
+/**
+ * Paired t-test — before/after measurements on the SAME items.
+ * Pairing removes inter-item variance, giving tighter significance than
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
+ * scenarios.
+ */
+declare function pairedTTest(before: number[], after: number[]): {
+    t: number;
+    df: number;
+    p: number;
+};
+/**
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
+ * Use when the differences aren't normally distributed.
+ */
+declare function wilcoxonSignedRank(before: number[], after: number[]): {
+    w: number;
+    p: number;
+};
+/**
+ * Cohen's d — standardized effect size for two independent groups.
+ * Positive d means group b has higher mean than group a.
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
+ */
+declare function cohensD(a: number[], b: number[]): number;
+/**
+ * ConvergenceTracker — tracks completion percentage over turns.
+ *
+ * Produces convergence curves showing how quickly the agent reaches
+ * completion criteria.
+ */
+declare class ConvergenceTracker {
+    private criteria;
+    private history;
+    constructor(criteria: CompletionCriterion[]);
+    /** Evaluate criteria against current state, record result */
+    record(turn: number, state: DriverState): {
+        completionPercent: number;
+        complete: boolean;
+        criteriaStatus: Record<string, boolean | number>;
+    };
+    /** Get convergence curve */
+    getCurve(): number[];
+    /** Get full history with per-criterion status */
+    getHistory(): {
+        turn: number;
+        completionPercent: number;
+        criteriaStatus: Record<string, boolean | number>;
+    }[];
+    /** Find the turn where completion first reached 100% (null if it never did) */
+    getTurnToCompletion(): number | null;
 }
 /**
- * Prompt optimizer — A/B test prompt variants with statistical rigor.
+ * Versioned prompt registry.
  *
- * Runs N prompt variants against a fixed scenario set, collects per-scenario
- * scores via the user-provided `scoreVariant` callback, and returns:
- *   - per-variant mean + bootstrap CI
- *   - pairwise significance (Mann-Whitney, non-parametric — works on any
- *     score distribution, not just normal)
- *   - a winner (highest mean, flagged if the lead is not significant)
+ * Every prompt used in an eval run is registered with an explicit version.
+ * Reports include the content hash so A/B compares are rigorous: if the
+ * hash changes between two reports, the prompt actually changed; if it
+ * matches, the variance is elsewhere.
  *
- * Deliberately generic — the `scoreVariant` callback does whatever domain
- * work the consumer needs (invoke the agent, judge the output, whatever),
- * and returns a number per scenario. This lets the optimizer stay small +
- * testable.
+ * Hash is SHA-256(content), truncated to 12 hex chars for readability.
+ * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
  */
-interface PromptVariant$1 {
+interface PromptHandle {
+    /** Stable human-readable id, e.g. 'browser.system' */
     id: string;
-    prompt: string;
-    metadata?: Record<string, unknown>;
+    /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
+    version: string;
+    /** SHA-256 of content, 12-hex-char prefix */
+    hash: string;
+    /** Full prompt body */
+    content: string;
 }
-interface OptimizationConfig {
-    variants: PromptVariant$1[];
-    /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
-    trialsPerScenario?: number;
-    /** Significance threshold for pairwise comparison (default 0.05). */
-    significanceLevel?: number;
+declare class PromptRegistry {
+    private readonly entries;
     /**
-     * The scoring callback. For each (variant, scenarioId, trialIndex), produce
-     * a score in 0..1 (or any numeric range — the optimizer only cares about
-     * monotonicity).
+     * Register a prompt. Re-registering the same id+version with DIFFERENT
+     * content throws — versions are immutable. Re-registering with the SAME
+     * content is a no-op (idempotent).
      */
-    scoreVariant: (args: {
-        variant: PromptVariant$1;
-        scenarioId: string;
-        trialIndex: number;
-    }) => Promise<number>;
-    /** Scenario ids to run against. */
-    scenarioIds: string[];
-    /** Optional hook — fires after each (variant, scenario) fully scored. */
-    onScenarioComplete?: (info: {
-        variantId: string;
-        scenarioId: string;
-        scores: number[];
-    }) => void;
-}
-interface VariantScore {
-    variantId: string;
-    mean: number;
-    ci95: {
-        lower: number;
-        upper: number;
-    };
-    n: number;
-    perScenario: Record<string, {
-        mean: number;
-        n: number;
-        samples: number[];
-    }>;
-}
-interface PairwiseComparison {
-    variantA: string;
-    variantB: string;
-    pValue: number;
-    /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
-    qValue: number;
-    /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
-    significant: boolean;
-    meanDelta: number;
-}
-interface OptimizationResult {
-    winner: {
-        variantId: string;
-        /** True when the winner's lead vs every other variant is statistically significant. */
-        significant: boolean;
-        ciLowerBoundExceedsSecondMean: boolean;
-    };
-    scores: VariantScore[];
-    pairwise: PairwiseComparison[];
-    config: {
-        trialsPerScenario: number;
-        significanceLevel: number;
-        variants: string[];
-        scenarios: string[];
-    };
-}
-declare class PromptOptimizer {
-    run(config: OptimizationConfig): Promise<OptimizationResult>;
-}
-interface SteeringRolePrompt {
-    system?: string;
-    append?: string;
-}
-interface SteeringBundle {
-    id: string;
-    coderPrompt?: string;
-    continuePrompt?: string;
-    reviewerPrompts?: Record<string, string>;
-    skills?: string[];
-    rolePrompts?: Record<string, SteeringRolePrompt>;
-    metadata?: Record<string, unknown>;
-}
-interface SteeringDelta {
-    coderPrompt?: string;
-    continuePrompt?: string;
-    reviewerPrompts?: Record<string, string>;
-    skills?: string[];
-    rolePrompts?: Record<string, SteeringRolePrompt>;
-    metadata?: Record<string, unknown>;
-}
-declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
-declare function renderSteeringText(bundle: SteeringBundle): string;
-interface RunScore {
-    success: number;
-    goalProgress: number;
-    repoGroundedness: number;
-    driftPenalty: number;
-    toolUseQuality: number;
-    patchQuality: number;
-    testReality: number;
-    finalGate: number;
-    reviewerBlockers: number;
-    costUsd: number;
-    wallSeconds: number;
-    notes?: string[];
-}
-interface RunScoreWeights {
-    success: number;
-    goalProgress: number;
-    repoGroundedness: number;
-    driftPenalty: number;
-    toolUseQuality: number;
-    patchQuality: number;
-    testReality: number;
-    finalGate: number;
-    reviewerBlockers: number;
-    costUsd: number;
-    wallSeconds: number;
+    register(id: string, version: string, content: string): Promise<PromptHandle>;
+    /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
+    get(id: string, version: string): PromptHandle;
+    /** Return all versions of an id, newest-first (lex-descending on version, so 'v9' sorts ahead of 'v10'). */
+    listVersions(id: string): PromptHandle[];
+    /** Snapshot the whole registry — useful for including in reports. */
+    list(): PromptHandle[];
+    /** Verify a hash against registered content. Returns null if not found. */
+    verifyHash(id: string, version: string, expectedHash: string): boolean | null;
 }
-declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
-declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
-declare function clamp01(value: number): number;
+/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
+declare function hashContent(content: string): Promise<string>;
 /**
- * TraceSchema v1 — the canonical data model for agent-eval.
+ * Anti-slop quality judge.
  *
- * Every score, every failure class, every pipeline in the framework is
- * a view over this data. Shape it once, live with it.
+ * Deterministic pattern-based quality check — no LLM call. Catches the
+ * 80% of AI slop that every production agent leaks:
+ *   - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
+ *   - N-gram repetition (same phrase over and over)
+ *   - Hedging overuse ("I could be wrong, but...")
+ *   - Apology padding ("I'm so sorry for the confusion...")
+ *   - Overused opening formulas ("Great question!")
+ *   - Length bounds (too short to be useful, too long to be read)
  *
- * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
- * but extended with agent-specific span kinds (llm, tool, retrieval,
- * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
- * entities that OTEL leaves as free-form attributes.
+ * Produces a JudgeScore in the same shape as LLM judges so it composes into
+ * `BenchmarkRunner`'s judge array transparently.
  */
-declare const TRACE_SCHEMA_VERSION = "1.0.0";
-type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
-interface BudgetSpec {
-    tokens?: number;
-    wallMs?: number;
-    calls?: number;
-    usd?: number;
+interface AntiSlopConfig {
+    /** Domain label — appears in the JudgeScore output */
+    domain?: string;
+    /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
+    bannedPhrases?: string[];
+    /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
+    bannedOpenings?: RegExp[];
+    /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
+    hedgingPatterns?: RegExp[];
+    /** Regexes matching apology padding. */
+    apologyPatterns?: RegExp[];
+    /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
+    repetitionThreshold?: number;
+    /** Min output length in chars; below this the turn is deemed too terse. */
+    minLength?: number;
+    /** Max output length in chars; above this the turn is deemed too verbose. */
+    maxLength?: number;
+    /** How heavily each violation class reduces the score (default 1). */
+    penaltyWeights?: Partial<Record<SlopCategory, number>>;
 }
-interface RunOutcome$1 {
-    score?: number;
-    pass?: boolean;
-    failureClass?: FailureClass;
-    notes?: string;
+type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
+/** Create a reusable Judge function from an anti-slop config. */
+declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
+interface AntiSlopIssue {
+    category: SlopCategory;
+    detail: string;
+    example?: string;
+}
+interface AntiSlopReport {
+    /** 0–10 score; 10 is clean, lower values mean more slop. */
+    score: number;
+    issues: AntiSlopIssue[];
+    /** Count of each category for programmatic aggregation. */
+    counts: Record<SlopCategory, number>;
 }
 /**
- * Layer — optional classification in a nested build workflow.
- * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
- * `app-build`: sandbox harness that compiled + tested the generated scaffold.
- * `app-runtime`: a run of the generated agent against a domain scenario.
- * `meta`: any meta-eval (judge replay, correlation analysis).
+ * Pure function — analyze one or more outputs against the config. Exposed
+ * separately so consumers can build their own reporters on top.
  */
-type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
-interface Run {
-    runId: string;
-    scenarioId: string;
-    variantId?: string;
-    datasetVersion?: string;
-    /** Git SHA of agent code at run time. */
-    codeSha?: string;
-    /** Hash of the prompt template + any system prompt. */
-    promptSha?: string;
-    /** Model id + date + system-prompt hash, concatenated. */
-    modelFingerprint?: string;
-    seed?: number;
-    /** Arbitrary environment markers (shell, docker version, tz). */
-    envFingerprint?: Record<string, string>;
-    /** Version of the redaction rules applied to this run. */
-    redactionVersion?: string;
-    /** Parent run in a nested build workflow. A builder run's children are
-     *  app-build runs; those children are app-runtime runs. */
-    parentRunId?: string;
-    /** Stable project identifier — groups runs across chats + sessions. */
-    projectId?: string;
-    /** Chat/conversation identifier within a project. */
-    chatId?: string;
-    /** Layer classification — hint for aggregation; not enforced. */
-    layer?: RunLayer;
-    startedAt: number;
-    endedAt?: number;
-    status: RunStatus;
-    outcome?: RunOutcome$1;
-    budget?: BudgetSpec;
-    /** Free-form labels for downstream grouping. */
-    tags?: Record<string, string>;
+declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
+    penaltyWeights: Record<SlopCategory, number>;
+}): AntiSlopReport;
+/**
+ * Artifact validators.
+ *
+ * Generic "score a produced artifact" primitive. Tax uses it for PDF form
+ * correctness, research for sourced briefs, browser for task assertions, coding
+ * for social posts. One interface, many validators; all plug into
+ * `BenchmarkRunner` the same way.
+ *
+ * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
+ * plus a `ValidationContext` (scenario id, the turns that produced it) and
+ * returns a `ValidationResult` with pass/fail + 0..1 score + structured
+ * issues.
+ */
+interface Artifact {
+    /**
+     * Logical kind — validators type-guard on this. The trailing `| string`
+     * means any string is accepted by the type checker; the listed literals
+     * are the well-known kinds, not an exhaustive set.
+     */
+    kind: 'file' | 'json' | 'text' | 'binary' | string;
+    /** Filesystem-style path, optional */
+    path?: string;
+    /** String content for text/json/file kinds */
+    content?: string;
+    /** Binary content (if kind === 'binary') */
+    bytes?: Uint8Array;
+    /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
+    metadata?: Record<string, unknown>;
 }
-type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
-type SpanStatus = 'ok' | 'error';
-interface SpanBase {
-    spanId: string;
-    parentSpanId?: string;
-    runId: string;
-    kind: SpanKind;
-    name: string;
-    startedAt: number;
-    endedAt?: number;
-    status?: SpanStatus;
-    error?: string;
-    /** Anything not covered by typed fields. Kept deliberately free-form. */
-    attributes?: Record<string, unknown>;
+interface ValidationContext {
+    scenarioId: string;
+    turnIndex?: number;
+    /** Prior artifacts for multi-artifact scenarios */
+    priorArtifacts?: Artifact[];
+    /** Free-form hints the validator uses for domain-specific checks */
+    hints?: Record<string, unknown>;
 }
-interface Message {
-    role: 'system' | 'user' | 'assistant' | 'tool';
-    content: string;
-    tokens?: number;
-    /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
-    images?: Array<{
-        artifactId?: string;
-        url?: string;
-        mime?: string;
-    }>;
+interface ValidationIssue {
+    severity: 'error' | 'warning' | 'info';
+    message: string;
+    /** Optional path into the artifact (e.g. JSON path or byte offset) */
+    locus?: string;
 }
-interface LlmSpan extends SpanBase {
-    kind: 'llm';
-    model: string;
-    messages: Message[];
-    output?: string;
-    inputTokens?: number;
-    outputTokens?: number;
-    cachedTokens?: number;
-    reasoningTokens?: number;
-    costUsd?: number;
-    finishReason?: string;
+interface ValidationResult {
+    pass: boolean;
+    /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
+    score: number;
+    issues: ValidationIssue[];
+    /** Diagnostic payload for reporters */
+    evidence?: Record<string, unknown>;
 }
-interface ToolSpan extends SpanBase {
-    kind: 'tool';
-    toolName: string;
-    args: unknown;
-    result?: unknown;
-    latencyMs?: number;
+interface ArtifactValidator {
+    /** Stable identifier for the validator; appears in reports. */
+    name: string;
+    /** Optional description for human-facing reports. */
+    description?: string;
+    /** Called once per artifact; validators are expected to be pure + idempotent. */
+    validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
 }
-interface RetrievalSpan extends SpanBase {
-    kind: 'retrieval';
-    query: string;
-    hits: Array<{
-        docId: string;
-        score: number;
-        content?: string;
+/**
+ * Run every validator on the same artifact; aggregate pass as AND, score as
+ * (weighted) mean, issues concatenated. Weights default to 1 each.
+ */
+declare function composeValidators(validators: ArtifactValidator[], options?: {
+    name?: string;
+    weights?: number[];
+}): ArtifactValidator;
+/** Pass if the artifact body matches a provided regex. */
+declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
+/** Pass if JSON parses and every required key is present. */
+declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
+/** Pass if min ≤ byte length ≤ max. */
+declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
+/** Pass if the artifact contains every required substring (case-insensitive by default). */
+declare function containsAll(name: string, required: string[], options?: {
+    caseSensitive?: boolean;
+}): ArtifactValidator;
+/**
+ * Workspace inspector — score the persisted state of an agent after a run.
+ *
+ * Many evals don't ask "did the response say the right thing" but "did the
+ * agent put the right rows in the DB / files in the vault / entities on the
+ * canvas". This is the primitive for that.
+ *
+ * Implementations read from D1, KV, filesystem, or any store — the interface
+ * is deliberately small so consumers plug in their own backends.
+ */
+interface WorkspaceSnapshot {
+    /** Vault files: logical path → content */
+    files: Record<string, string>;
+    /** DB rows: table name → array of rows (post-validation) */
+    rows: Record<string, Array<Record<string, unknown>>>;
+    /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
+    kv: Record<string, string>;
+    /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
+    blobs?: Record<string, {
+        size: number;
+        hash?: string;
+        mimeType?: string;
     }>;
 }
-interface JudgeSpan extends SpanBase {
-    kind: 'judge';
-    judgeId: string;
-    /** Span this judgment applies to. */
-    targetSpanId: string;
-    dimension: string;
-    /** Numeric score (free-range; interpretation up to the judge). */
-    score: number;
-    rationale?: string;
-    evidence?: string;
-}
-interface SandboxSpan extends SpanBase {
-    kind: 'sandbox';
-    image?: string;
-    command?: string;
-    exitCode?: number;
-    testsTotal?: number;
-    testsPassed?: number;
-    stdoutHash?: string;
-    stderrHash?: string;
-    /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
-    wallMs?: number;
-}
-interface GenericSpan extends SpanBase {
-    kind: 'agent' | 'custom';
+interface InspectorContext {
+    /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
+    scopeId: string;
+    /** Optional scenario id — allows scenario-specific snapshot shaping */
+    scenarioId?: string;
 }
-type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
-type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
-interface TraceEvent {
-    eventId: string;
-    runId: string;
-    spanId?: string;
-    kind: EventKind;
-    timestamp: number;
-    payload: Record<string, unknown>;
+interface WorkspaceInspector {
+    name: string;
+    snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
 }
-interface BudgetLedgerEntry {
-    runId: string;
-    dimension: keyof BudgetSpec;
-    limit: number;
-    consumed: number;
-    remaining: number;
-    timestamp: number;
-    breached: boolean;
-    /** Span that triggered this entry, if any. */
-    spanId?: string;
+declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
+    readonly name = "in-memory";
+    private readonly snapshots;
+    set(scopeId: string, snapshot: WorkspaceSnapshot): void;
+    snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
 }
-interface Artifact {
-    artifactId: string;
-    runId: string;
-    spanId?: string;
-    contentType: string;
-    sizeBytes: number;
-    /** sha256 in hex. */
-    hash: string;
-    /** External storage URL (R2, S3, filesystem path). */
-    storageUrl?: string;
-    /** Inline content for small blobs — keep under ~64KB. */
-    inlineContent?: string;
+interface WorkspaceAssertion {
+    name: string;
+    description?: string;
+    check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
 }
-type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
-declare const FAILURE_CLASSES: readonly FailureClass[];
-declare function isLlmSpan(s: Span): s is LlmSpan;
-declare function isToolSpan(s: Span): s is ToolSpan;
-declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
-declare function isJudgeSpan(s: Span): s is JudgeSpan;
-declare function isSandboxSpan(s: Span): s is SandboxSpan;
+interface WorkspaceAssertionResult {
+    pass: boolean;
+    /** 0..1 — partial credit for assertions that admit it */
+    score: number;
+    detail?: string;
+}
+declare function fileExists(path: string): WorkspaceAssertion;
+declare function fileContains(path: string, needle: string): WorkspaceAssertion;
+declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
+declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
+    min?: number;
+}): WorkspaceAssertion;
+/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
+declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
+    pass: boolean;
+    score: number;
+    results: Array<{
+        assertion: string;
+        result: WorkspaceAssertionResult;
+    }>;
+};
-interface RunFilter {
-    scenarioId?: string;
-    variantId?: string;
-    status?: RunStatus;
-    since?: number;
-    until?: number;
-    tag?: {
-        key: string;
-        value: string;
-    };
-    parentRunId?: string;
-    projectId?: string;
-    chatId?: string;
-    layer?: RunLayer;
+/**
+ * Experiment tracker — group runs, diff them, watch scores move over time.
+ *
+ * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
+ *   - A run has a config (prompt hash, model, scenario ids, seed)
+ *   - Runs belong to experiments (named groups)
+ *   - The store is pluggable (in-memory for tests, filesystem for local,
+ *     custom for Langfuse/D1)
+ *   - Diffs show score deltas, new/dropped scenarios, and config changes
+ *
+ * The output plugs directly into `BenchmarkReport` — runs archive the full
+ * report, diff operates on the summary.
+ */
+interface RunConfig {
+    experimentId: string;
+    name?: string;
+    model?: string;
+    promptHash?: string;
+    promptVersion?: string;
+    seed?: number;
+    metadata?: Record<string, unknown>;
 }
-interface SpanFilter {
-    runId?: string;
-    parentSpanId?: string;
-    kind?: SpanKind;
+interface Run {
+    id: string;
+    experimentId: string;
     name?: string;
-    toolName?: string;
-    judgeId?: string;
-    since?: number;
-    until?: number;
+    config: RunConfig;
+    startedAt: string;
+    completedAt?: string;
+    status: 'running' | 'completed' | 'failed';
+    report?: BenchmarkReport;
+    error?: string;
 }
-interface EventFilter {
-    runId?: string;
-    spanId?: string;
-    kind?: EventKind;
-    since?: number;
-    until?: number;
+interface Experiment {
+    id: string;
+    name: string;
+    createdAt: string;
+    metadata?: Record<string, unknown>;
 }
-interface TraceStore {
-    appendRun(run: Run): Promise<void>;
-    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
-    appendSpan(span: Span): Promise<void>;
-    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
-    appendEvent(event: TraceEvent): Promise<void>;
-    appendArtifact(artifact: Artifact): Promise<void>;
-    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
-    getRun(runId: string): Promise<Run | undefined>;
-    listRuns(filter?: RunFilter): Promise<Run[]>;
-    spans(filter?: SpanFilter): Promise<Span[]>;
-    events(filter?: EventFilter): Promise<TraceEvent[]>;
-    budget(runId: string): Promise<BudgetLedgerEntry[]>;
-    artifacts(runId: string): Promise<Artifact[]>;
+interface ExperimentStore {
+    saveExperiment(exp: Experiment): Promise<void>;
+    getExperiment(id: string): Promise<Experiment | null>;
+    listExperiments(): Promise<Experiment[]>;
+    saveRun(run: Run): Promise<void>;
+    getRun(id: string): Promise<Run | null>;
+    listRuns(experimentId: string): Promise<Run[]>;
 }
-declare class InMemoryTraceStore implements TraceStore {
-    private runs;
-    private allSpans;
-    private allEvents;
-    private allArtifacts;
-    private allBudget;
-    appendRun(run: Run): Promise<void>;
-    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
-    appendSpan(span: Span): Promise<void>;
-    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
-    appendEvent(event: TraceEvent): Promise<void>;
-    appendArtifact(artifact: Artifact): Promise<void>;
-    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
-    getRun(runId: string): Promise<Run | undefined>;
-    listRuns(filter?: RunFilter): Promise<Run[]>;
-    spans(filter?: SpanFilter): Promise<Span[]>;
-    events(filter?: EventFilter): Promise<TraceEvent[]>;
-    budget(runId: string): Promise<BudgetLedgerEntry[]>;
-    artifacts(runId: string): Promise<Artifact[]>;
+declare class InMemoryExperimentStore implements ExperimentStore {
+    private readonly experiments;
+    private readonly runs;
+    saveExperiment(exp: Experiment): Promise<void>;
+    getExperiment(id: string): Promise<Experiment | null>;
+    listExperiments(): Promise<Experiment[]>;
+    saveRun(run: Run): Promise<void>;
+    getRun(id: string): Promise<Run | null>;
+    listRuns(experimentId: string): Promise<Run[]>;
 }
-interface FileSystemTraceStoreOptions {
+declare class ExperimentTracker {
+    private readonly store;
+    constructor(store: ExperimentStore);
+    startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
+    startRun(config: RunConfig): Promise<Run>;
+    completeRun(runId: string, report: BenchmarkReport): Promise<void>;
+    failRun(runId: string, error: string): Promise<void>;
+    /**
+     * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
+     * and config changes that may explain the movement.
+     *
+     * @param runIdA - Id of the first run to compare.
+     * @param runIdB - Id of the second run to compare.
+     * @returns A `RunDiff` describing both runs and the differences.
+     *   NOTE(review): presumably `runIdA` maps to `RunDiff.before` and
+     *   `runIdB` to `RunDiff.after` — confirm against the implementation.
+     */
+    diff(runIdA: string, runIdB: string): Promise<RunDiff>;
+    /**
+     * Timeline of aggregate scores for an experiment.
+     *
+     * @param experimentId - Experiment whose runs are listed.
+     * @returns One entry per run with its id, start timestamp, and overall
+     *   score. `overall` is nullable.
+     *   NOTE(review): presumably null when a run has no report yet, and
+     *   presumably ordered by `startedAt` — confirm both.
+     */
+    timeline(experimentId: string): Promise<Array<{
+        runId: string;
+        startedAt: string;
+        overall: number | null;
+    }>>;
+}
+interface RunDiff {
+    before: {
+        runId: string;
+        name?: string;
+        startedAt: string;
+    };
+    after: {
+        runId: string;
+        name?: string;
+        startedAt: string;
+    };
+    aggregateDelta: number;
+    scenarios: Array<{
+        scenarioId: string;
+        before: number | null;
+        after: number | null;
+        delta: number | null;
+        status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
+    }>;
+    configChanges: Record<string, {
+        before: unknown;
+        after: unknown;
+    }>;
+}
+/**
+ * FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
+ *
+ * Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
+ * files (`experiments.ndjson` + `runs.ndjson`) under one directory, with
+ * size-based rollover. Writes are append-only so the file log doubles as an audit
+ * trail of every state transition the tracker ever wrote.
+ *
+ * Reads lazy-load every NDJSON file in the directory (including rolled-over
+ * archives), latest-write-wins per `id`. Subsequent writes update the
+ * in-memory index in place so reads after writes are O(1).
+ *
+ * Node-only — imports `node:fs/promises`. Don't import this from a Worker;
+ * use the in-memory store or the D1 store from `./experiment-tracker-d1`.
+ */
+interface FileSystemExperimentStoreOptions {
+    /** Directory the NDJSON files live in. Created on first write. */
     dir: string;
-    /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
+    /** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
     maxBytes?: number;
 }
-declare class FileSystemTraceStore implements TraceStore {
-    private dir;
-    private maxBytes;
-    /** Lazy in-memory index for queries — populated on first read. */
+declare class FileSystemExperimentStore implements ExperimentStore {
+    private readonly dir;
+    private readonly maxBytes;
     private index?;
     private loaded;
-    constructor(options: FileSystemTraceStoreOptions);
+    constructor(options: FileSystemExperimentStoreOptions);
+    saveExperiment(exp: Experiment): Promise<void>;
+    getExperiment(id: string): Promise<Experiment | null>;
+    listExperiments(): Promise<Experiment[]>;
+    saveRun(run: Run): Promise<void>;
+    getRun(id: string): Promise<Run | null>;
+    listRuns(experimentId: string): Promise<Run[]>;
     private ensureDir;
     private append;
-    private insertInto;
     private load;
-    appendRun(run: Run): Promise<void>;
-    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
-    appendSpan(span: Span): Promise<void>;
-    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
-    appendEvent(event: TraceEvent): Promise<void>;
-    appendArtifact(artifact: Artifact): Promise<void>;
-    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
-    getRun(runId: string): Promise<Run | undefined>;
-    listRuns(filter?: RunFilter): Promise<Run[]>;
-    spans(filter?: SpanFilter): Promise<Span[]>;
-    events(filter?: EventFilter): Promise<TraceEvent[]>;
-    budget(runId: string): Promise<BudgetLedgerEntry[]>;
-    artifacts(runId: string): Promise<Artifact[]>;
 }
 /**
- * TraceEmitter — hierarchical span builder that auto-parents using an
- * internal stack. One emitter per Run; emitters do NOT share state.
+ * D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
  *
- * Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
- * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
- * have to thread spanIds manually. For async workflows that can't use
- * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
- * explicitly.
+ * Workers-safe (uses only the `D1Database` binding the runtime injects). Two
+ * tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
+ * a Worker route can both write the row at run start and update it at run end
+ * without losing the original config — the row's lifecycle mirrors the
+ * `Run.status` field one-to-one.
+ *
+ * Why this lives next to `InMemoryExperimentStore`:
+ *   - browser, coding, and computer-use agents can all run as Workers
+ *   - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
+ *   - Hand-rolling D1 SQL in every consumer is exactly the duplication this
+ *     module exists to prevent
+ *
+ * Schema versioning: the `meta` table records `schema_version` so a future
+ * column addition can be detected and migrated additively. Today's schema is
+ * v1; bump only on breaking shape changes.
  */
-interface SpanHandle<S extends Span = Span> {
-    span: S;
-    end(patch?: Partial<S>): Promise<void>;
-    fail(error: string | Error, patch?: Partial<S>): Promise<void>;
+/**
+ * Minimal `D1Database` shape we depend on. Avoids pulling in
+ * `@cloudflare/workers-types` as a hard dep — consumers that already have
+ * those types installed can pass the binding directly.
+ */
+interface D1Like {
+    prepare(query: string): D1PreparedStatementLike;
+    batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
+    exec(query: string): Promise<unknown>;
 }
-interface TraceEmitterOptions {
-    runId?: string;
-    /** Inject a clock for deterministic tests. */
-    now?: () => number;
-    /** Inject an id generator for deterministic tests. */
-    id?: () => string;
+interface D1PreparedStatementLike {
+    bind(...values: unknown[]): D1PreparedStatementLike;
+    first<T = Record<string, unknown>>(): Promise<T | null>;
+    all<T = Record<string, unknown>>(): Promise<{
+        results: T[];
+    }>;
+    run(): Promise<unknown>;
+}
+interface D1ExperimentStoreOptions {
+    /** D1 binding from `env`. */
+    db: D1Like;
+    /**
+     * Optional table-name prefix so multiple ExperimentStores can share a DB
+     * without colliding (e.g. `browser_eval_experiments` vs `coding_eval_experiments`).
+     * Default: `agent_eval_`.
+     */
+    tablePrefix?: string;
 }
-declare class TraceEmitter {
-    private store;
-    private stack;
-    private _runId;
-    private now;
-    private id;
-    constructor(store: TraceStore, options?: TraceEmitterOptions);
-    get runId(): string;
-    startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
-    endRun(outcome?: RunOutcome$1): Promise<void>;
-    abortRun(reason: string): Promise<void>;
-    span<S extends Span = Span>(init: {
-        kind: SpanKind;
-        name: string;
-        parentSpanId?: string;
-        attributes?: Record<string, unknown>;
-    } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
-    private handle;
-    private pop;
-    llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
-    tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
-    retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
-    recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
-    sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
-    emit(event: {
-        kind: EventKind;
-        spanId?: string;
-        payload?: Record<string, unknown>;
-    }): Promise<TraceEvent>;
-    recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
-        timestamp?: number;
-    }): Promise<BudgetLedgerEntry>;
-    recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
+declare class D1ExperimentStore implements ExperimentStore {
+    private readonly db;
+    private readonly experimentsTable;
+    private readonly runsTable;
+    private readonly metaTable;
+    private schemaReady;
+    constructor(options: D1ExperimentStoreOptions);
     /**
-     * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
-     * Returns the fn's return value. Use this for the 95% case.
+     * Idempotent schema setup. Safe to call before every operation; the second
+     * call short-circuits via `schemaReady`. Most consumers will call it once
+     * during Worker bootstrap.
      */
-    within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
+    ensureSchema(): Promise<void>;
+    saveExperiment(exp: Experiment): Promise<void>;
+    getExperiment(id: string): Promise<Experiment | null>;
+    listExperiments(): Promise<Experiment[]>;
+    saveRun(run: Run): Promise<void>;
+    getRun(id: string): Promise<Run | null>;
+    listRuns(experimentId: string): Promise<Run[]>;
 }
-/** Helper to build an LLM span handle args object from a provider-shaped response. */
-declare function llmSpanFromProvider(args: {
-    name?: string;
-    model: string;
-    messages: Message[];
-    output: string;
-    usage?: {
-        inputTokens?: number;
-        outputTokens?: number;
-        cachedTokens?: number;
-        reasoningTokens?: number;
-    };
-    costUsd?: number;
-    finishReason?: string;
-}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
 /**
  * Typed query helpers over TraceStore.
@@ -1569,7 +2079,7 @@ declare function llmSpanFromProvider(args: {
  * tooling works out of the box.
  */
-declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
+declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run$1[]>;
 declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
 declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
 declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
@@ -1585,7 +2095,7 @@ declare function aggregateLlm(spans: LlmSpan[]): {
     costUsd: number;
 };
 /** Pick the outcome's failure class when present, else derive 'success' from run status. */
-declare function runFailureClass(run: Run): FailureClass;
+declare function runFailureClass(run: Run$1): FailureClass;
 /**
  * Redaction — remove PII / secrets from trace payloads before persist.
@@ -1689,10 +2199,10 @@ interface OtlpExport {
 declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
 interface RunTrace {
-    run: Run;
+    run: Run$1;
     spans: Span[];
     events: TraceEvent[];
-    artifacts: Artifact[];
+    artifacts: Artifact$1[];
     budget: BudgetLedgerEntry[];
 }
 interface RunCriticOptions {
@@ -1725,55 +2235,6 @@ declare function distillPlaybook(entries: PlaybookEntry[], options?: {
 }): Playbook;
 declare function renderPlaybookMarkdown(playbook: Playbook): string;
-interface OptimizationExample {
-    scenarioId: string;
-    metadata?: Record<string, unknown>;
-}
-interface SteeringEvaluation {
-    variant: SteeringBundle;
-    example: OptimizationExample;
-    trialIndex: number;
-}
-interface SteeringVariantReport {
-    variantId: string;
-    bundle: SteeringBundle;
-    mean: number;
-    ci95: {
-        lower: number;
-        upper: number;
-    };
-    scenarioScores: Record<string, {
-        mean: number;
-        n: number;
-        samples: number[];
-    }>;
-}
-interface OptimizationLoopResult {
-    winner: SteeringBundle;
-    significant: boolean;
-    reports: SteeringVariantReport[];
-    pairwise: Array<{
-        variantA: string;
-        variantB: string;
-        pValue: number;
-        qValue: number;
-        significant: boolean;
-        meanDelta: number;
-    }>;
-}
-interface OptimizationLoopConfig {
-    variants: SteeringBundle[];
-    examples: OptimizationExample[];
-    evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
-    scoreWeights?: Partial<RunScoreWeights>;
-    trialsPerScenario?: number;
-}
-declare class OptimizationLoop {
-    private readonly optimizer;
-    constructor(optimizer?: PromptOptimizer);
-    run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
-}
 type SteeringOptimizerBackend = 'pairwise' | 'ax-gepa';
 interface SteeringOptimizationRow {
     variantId: string;
@@ -2167,7 +2628,7 @@ type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
 /**
  * Dual-agent convergence bench.
  *
- * Pattern lifted from tax-agent + legal-agent: two agents take turns until
+ * Pattern lifted from dual-worker review loops: two agents take turns until
  * they converge on a consensus artifact. One proposes, the other critiques;
  * the proposer revises; repeat until a score threshold is hit or max rounds.
  *
@@ -2400,6 +2861,51 @@ interface LlmReviewerConfig<State, Summary = unknown> {
 }
 declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
+interface ProposeReviewControlState<State, Summary = unknown> {
+    shot: number;
+    state: State;
+    priorReview: Review | null;
+    verification: Verification;
+    traceSummary?: Summary;
+    memory: ReviewMemoryEntry[];
+    completed: boolean;
+    reviewAvailable: boolean;
+    reviewError?: string;
+}
+interface ProposeReviewControlAction {
+    type: 'propose-review-shot';
+    shot: number;
+}
+interface ProposeReviewControlResult<State, Summary = unknown> {
+    state: State;
+    verification: Verification;
+    traceSummary?: Summary;
+    review: Review | null;
+    reviewAvailable: boolean;
+    reviewError?: string;
+}
+interface ProposeReviewControlConfig<State, Summary = unknown> {
+    goal: string;
+    initialState: State;
+    propose: ProposeFn<State, Summary>;
+    verify: VerifyFn<State>;
+    review: ReviewFn<State, Summary>;
+    maxShots?: number;
+    maxWallMs?: number;
+    memory?: ReviewMemoryStore;
+    store?: TraceStore;
+    scenarioId?: string;
+    projectId?: string;
+    variantId?: string;
+    fallbackInstruction?: string;
+    confidenceFloor?: number;
+    confidenceFloorWindow?: number;
+    failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
+    actionFailure?: ControlRuntimeConfig<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>['actionFailure'];
+}
+declare function runProposeReviewAsControlLoop<State, Summary = unknown>(config: ProposeReviewControlConfig<State, Summary>): Promise<ControlRunResult<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>>;
+declare function controlFailureClassFromVerification(verification: Verification): FailureClass | undefined;
 /**
  * TestGradedScenario — a scenario whose score comes from a test suite.
  *
@@ -2428,7 +2934,7 @@ interface TestGradedRunOptions {
     variantId?: string;
     driver?: SandboxDriver;
     /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */
-    provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
+    provenance?: Pick<Run$1, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
 }
 interface TestGradedRunResult {
     runId: string;
@@ -2481,7 +2987,7 @@ declare class BudgetGuard {
  */
 interface FailureContext {
-    run: Run;
+    run: Run$1;
     spans: Span[];
     events: TraceEvent[];
 }
@@ -2824,7 +3330,7 @@ interface RegressionSpec {
     metric: string;
     higherIsBetter: boolean;
     /** Extract a scalar from a run. Default extractors handle common metrics. */
-    extract?: (run: Run, store: TraceStore) => Promise<number | null>;
+    extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
 }
 interface RegressionOptions extends BaselineOptions {
     baseline: RunFilter;
@@ -2938,7 +3444,7 @@ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): Ora
 /**
  * Cost tracker — token + USD accounting per scenario and per run.
  *
- * Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
+ * Adapted from generic usage-event accounting. Every
  * optimizer needs to know "is the quality gain worth the cost delta?",
  * and every dashboard needs dollars-per-completed-task. MODEL_PRICING
  * from metrics.ts stays authoritative for estimate math; this module
@@ -3149,7 +3655,7 @@ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOpti
  * State continuity scoring — measures how well a resumed/handed-off agent
  * preserves prior work.
  *
- * Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
+ * When session 2 continues
  * session 1's work, the key question is: did it preserve key artifacts,
  * or start over and lose context? Each `ContinuityCheck` inspects one
  * aspect (file preserved, key count grew, status advanced) and yields
@@ -3192,107 +3698,6 @@ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minR
 /** Common check: a status field advanced in an expected order. */
 declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
-/**
- * Dataset — versioned, sliceable, content-hashed scenario collection.
- *
- * Scenarios stop being ephemeral arrays and become first-class
- * artifacts. Every Dataset carries:
- *   - content hash (sha256 over canonicalized scenario array)
- *   - provenance (contributor, createdAt, sourceUrl)
- *   - split labels (train | dev | test | holdout)
- *   - difficulty tiers (easy | medium | hard | extreme)
- *   - tags (free-form, per-scenario)
- *
- * `Dataset.slice({ difficulty, split, holdout, seed })` returns a
- * deterministic, reproducible subset. Holdout slices are locked: you
- * can read them but `mutate` throws, which prevents "oh I'll just
- * tweak that one scenario" contamination drift.
- */
-type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
-type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
-interface DatasetScenario {
-    id: string;
-    /** Arbitrary payload; the framework doesn't interpret it. */
-    payload: unknown;
-    split?: DatasetSplit;
-    difficulty?: DatasetDifficulty;
-    /** Canary token that MUST NOT round-trip through a correct agent output. */
-    canary?: string;
-    tags?: Record<string, string>;
-}
-interface DatasetProvenance {
-    contributor?: string;
-    createdAt: string;
-    sourceUrl?: string;
-    license?: string;
-    description?: string;
-    /** Monotonic human-readable version (e.g. "2026.04.20"). */
-    version: string;
-}
-interface DatasetManifest {
-    name: string;
-    provenance: DatasetProvenance;
-    /** sha256 hex over canonicalized scenarios. */
-    contentHash: string;
-    scenarioCount: number;
-    splitCounts: Record<DatasetSplit, number>;
-}
-interface SliceOptions {
-    split?: DatasetSplit;
-    difficulty?: DatasetDifficulty;
-    /** Number of scenarios (random sample, seeded). Omit to take all that match. */
-    limit?: number;
-    seed?: number;
-    /** Predicate narrowing. Applied after split/difficulty filters. */
-    filter?: (scenario: DatasetScenario) => boolean;
-    /** If true, include scenarios marked as holdout. Default false. */
-    includeHoldout?: boolean;
-}
-/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
-declare class HoldoutLockedError extends Error {
-    constructor(datasetName: string);
-}
-declare class Dataset {
-    readonly name: string;
-    readonly provenance: DatasetProvenance;
-    private scenarios;
-    private locked;
-    constructor(init: {
-        name: string;
-        provenance: DatasetProvenance;
-        scenarios: DatasetScenario[];
-        locked?: boolean;
-    });
-    /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
-    all(): readonly DatasetScenario[];
-    get size(): number;
-    /**
-     * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
-     * the same arguments always produce the same slice across machines.
-     */
-    slice(options?: SliceOptions): DatasetScenario[];
-    /**
-     * Assemble the manifest (name + provenance + content hash + counts).
-     * Content hash is deterministic over canonicalized scenarios.
-     */
-    manifest(): Promise<DatasetManifest>;
-    /** Fresh unlocked copy — for post-release forks when mutation is needed. */
-    clone(overrides?: Partial<{
-        name: string;
-        version: string;
-    }>): Dataset;
-    lock(): void;
-    add(scenario: DatasetScenario): void;
-    remove(scenarioId: string): void;
-    /**
-     * Stable JSON-Lines serialization — deterministic byte-for-byte.
-     * Write to disk for contamination-verifiable archives.
-     */
-    toJsonl(): string;
-    static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
-}
-declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
 /**
  * ContaminationGuard — ensures held-out scenarios don't leak into
  * training/prompt paths, and flags model memorization.
@@ -3608,7 +4013,7 @@ interface ContractMetric {
     /** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
     maxRegression?: number;
     /** Optional extractor if the metric isn't in the default set. */
-    extract?: (run: Run, store: TraceStore) => Promise<number | null>;
+    extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
 }
 interface ThresholdContract {
     name: string;
@@ -3874,10 +4279,10 @@ declare class BuilderSession {
  */
 declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
     projectId: string;
-    chatRuns: Run[];
-    lastBuilderRun?: Run;
-    lastBuildRun?: Run;
-    lastAppRuntimeRuns: Run[];
+    chatRuns: Run$1[];
+    lastBuilderRun?: Run$1;
+    lastBuildRun?: Run$1;
+    lastAppRuntimeRuns: Run$1[];
 }>;
 /**
@@ -3997,8 +4402,8 @@ interface ChatSummary {
     builderRunId: string;
     startedAt: number;
     endedAt?: number;
-    status: Run['status'];
-    outcome?: Run['outcome'];
+    status: Run$1['status'];
+    outcome?: Run$1['outcome'];
     /** Counts of spans emitted during the chat. */
     llmTurns?: number;
     toolCalls?: number;
@@ -4006,7 +4411,7 @@ interface ChatSummary {
     appRuntimeRunIds: string[];
 }
 interface ProjectTimelineEntry {
-    run: Run;
+    run: Run$1;
     layerBucket: 'chat' | 'build' | 'runtime' | 'other';
 }
 declare class ProjectRegistry {
@@ -4093,7 +4498,7 @@ declare class FileSystemOutcomeStore implements OutcomeStore {
 interface EvalMetricSpec {
     id: string;
     /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
-    extract?: (run: Run, store: TraceStore) => Promise<number | null>;
+    extract?: (run: Run$1, store: TraceStore) => Promise<number | null>;
 }
 interface OutcomePair {
     evalMetric: string;
@@ -7978,4 +8383,4 @@ interface ReflectionProposal {
 }
 declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
-export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
+export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, 
type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type 
FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type 
InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type 
TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, 
createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, 
partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, 
wranglerDeployRunner };