npm - @tangle-network/agent-eval - Versions diffs - 0.1.0 → 0.5.0 - Mend

@tangle-network/agent-eval 0.1.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -435,6 +435,83 @@ declare class MetricsCollector {
     getConvergenceCurve(): number[];
 }
+/**
+ * ScenarioRegistry — manages scenario discovery and filtering.
+ *
+ * Each agent registers its scenarios. The registry handles conversion
+ * from ScenarioFile format to the framework's Scenario type.
+ */
+declare class ScenarioRegistry {
+    private scenarios;
+    private scenarioFiles;
+    /** Register scenarios from ScenarioFile format */
+    registerFiles(files: ScenarioFile[]): void;
+    /** Register pre-built Scenario objects directly */
+    register(scenarios: Scenario[]): void;
+    /** Get all scenarios */
+    all(): Scenario[];
+    /** Get scenarios filtered by category */
+    byCategory(category: string): Scenario[];
+    /** List all categories with counts */
+    listCategories(): {
+        category: string;
+        count: number;
+    }[];
+    /** Get scenarios filtered by persona */
+    byPersona(persona: string): Scenario[];
+    /** Get a single scenario by ID */
+    byId(id: string): Scenario | undefined;
+    /** Count total scenarios */
+    get count(): number;
+}
+interface AgentDriverConfig {
+    client: ProductClient;
+    driverModel?: string;
+    /** System prompt context for the driver LLM to understand the product */
+    productContext?: string;
+}
+/**
+ * AgentDriver — meta-agent that plays a persona against the real product.
+ *
+ * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
+ * Not scripted — the driver gets the current product state and decides
+ * the next realistic user message.
+ */
+declare class AgentDriver {
+    private tc;
+    private client;
+    private driverModel;
+    private productContext;
+    constructor(tc: TCloud, config: AgentDriverConfig);
+    /**
+     * Run a persona through the product.
+     *
+     * Returns metrics on how many turns to completion, cost curve,
+     * quality curve, and convergence curve.
+     */
+    run(persona: PersonaConfig): Promise<DriverResult>;
+    /** Use the driver LLM to decide what the "user" says next */
+    private decideNextMessage;
+    /** Handle pending approvals based on persona feedback patterns */
+    private handleApprovals;
+    /** Describe which completion criteria are met */
+    private describeCompletion;
+}
+/**
+ * Report generation utilities.
+ *
+ * Outputs convergence curves, cost curves, quality curves,
+ * and per-persona summaries in markdown format.
+ */
+/** Generate a markdown report from benchmark results */
+declare function formatBenchmarkReport(report: BenchmarkReport): string;
+/** Generate a markdown report from agent driver results */
+declare function formatDriverReport(results: DriverResult[]): string;
+/** Print a compact summary to console */
+declare function printDriverSummary(results: DriverResult[]): void;
 /**
  * Normalize scores so all dimensions follow "higher = better".
  * Inverted dimensions (hallucination, false_confidence, worst_failure)
@@ -470,6 +547,31 @@ declare function mannWhitneyU(a: number[], b: number[]): {
 };
 /** Partial credit: returns 0-1 ratio of current toward target */
 declare function partialCredit(current: number, target: number): number;
+/**
+ * Paired t-test — before/after measurements on the SAME items.
+ * Pairing removes inter-item variance, giving tighter significance than
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
+ * scenarios.
+ */
+declare function pairedTTest(before: number[], after: number[]): {
+    t: number;
+    df: number;
+    p: number;
+};
+/**
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
+ * Use when the differences aren't normally distributed.
+ */
+declare function wilcoxonSignedRank(before: number[], after: number[]): {
+    w: number;
+    p: number;
+};
+/**
+ * Cohen's d — standardized effect size for two independent groups.
+ * Positive d means group b has higher mean than group a.
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
+ */
+declare function cohensD(a: number[], b: number[]): number;
 /**
  * ConvergenceTracker — tracks completion percentage over turns.
@@ -500,80 +602,2607 @@ declare class ConvergenceTracker {
 }
 /**
- * ScenarioRegistry — manages scenario discovery and filtering.
+ * Versioned prompt registry.
  *
- * Each agent registers its scenarios. The registry handles conversion
- * from ScenarioFile format to the framework's Scenario type.
+ * Every prompt used in an eval run is registered with an explicit version.
+ * Reports include the content hash so A/B compares are rigorous: if the
+ * hash changes between two reports, the prompt actually changed; if it
+ * matches, the variance is elsewhere.
+ *
+ * Hash is SHA-256(content), truncated to 12 hex chars for readability.
+ * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
  */
-declare class ScenarioRegistry {
-    private scenarios;
-    private scenarioFiles;
-    /** Register scenarios from ScenarioFile format */
-    registerFiles(files: ScenarioFile[]): void;
-    /** Register pre-built Scenario objects directly */
-    register(scenarios: Scenario[]): void;
-    /** Get all scenarios */
-    all(): Scenario[];
-    /** Get scenarios filtered by category */
-    byCategory(category: string): Scenario[];
-    /** List all categories with counts */
-    listCategories(): {
-        category: string;
-        count: number;
-    }[];
-    /** Get scenarios filtered by persona */
-    byPersona(persona: string): Scenario[];
-    /** Get a single scenario by ID */
-    byId(id: string): Scenario | undefined;
-    /** Count total scenarios */
-    get count(): number;
+interface PromptHandle {
+    /** Stable human-readable id, e.g. 'legal.system' */
+    id: string;
+    /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
+    version: string;
+    /** SHA-256 of content, 12-hex-char prefix */
+    hash: string;
+    /** Full prompt body */
+    content: string;
+}
+declare class PromptRegistry {
+    private readonly entries;
+    /**
+     * Register a prompt. Re-registering the same id+version with DIFFERENT
+     * content throws — versions are immutable. Re-registering with the SAME
+     * content is a no-op (idempotent).
+     */
+    register(id: string, version: string, content: string): Promise<PromptHandle>;
+    /** Look up a registered prompt. Throws if unknown — no implicit defaults. */
+    get(id: string, version: string): PromptHandle;
+    /** Return all versions of an id, newest-first (lex-descending on version). */
+    listVersions(id: string): PromptHandle[];
+    /** Snapshot the whole registry — useful for including in reports. */
+    list(): PromptHandle[];
+    /** Verify a hash against registered content. Returns null if not found. */
+    verifyHash(id: string, version: string, expectedHash: string): boolean | null;
 }
+/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
+declare function hashContent(content: string): Promise<string>;
-interface AgentDriverConfig {
-    client: ProductClient;
-    driverModel?: string;
-    /** System prompt context for the driver LLM to understand the product */
-    productContext?: string;
+/**
+ * Anti-slop quality judge.
+ *
+ * Deterministic pattern-based quality check — no LLM call. Catches the
+ * 80% of AI slop that every production agent leaks:
+ *   - Banned phrases (voice-specific: "delve", "it's worth noting", etc.)
+ *   - N-gram repetition (same phrase over and over)
+ *   - Hedging overuse ("I could be wrong, but...")
+ *   - Apology padding ("I'm so sorry for the confusion...")
+ *   - Banned opening formulas ("Great question!")
+ *   - Length bounds (too short to be useful, too long to be read)
+ *
+ * Produces a JudgeScore in the same shape as LLM judges so it composes into
+ * `BenchmarkRunner`'s judge array transparently.
+ */
+interface AntiSlopConfig {
+    /** Domain label — appears in the JudgeScore output */
+    domain?: string;
+    /** Case-insensitive substrings that must not appear. Each occurrence = penalty. */
+    bannedPhrases?: string[];
+    /** Regexes matching opening formulas to penalize (e.g. /^great question/i). */
+    bannedOpenings?: RegExp[];
+    /** Regexes matching hedges (e.g. /i could be wrong/i). Ratio of hedged sentences drives score. */
+    hedgingPatterns?: RegExp[];
+    /** Regexes matching apology padding. */
+    apologyPatterns?: RegExp[];
+    /** Fraction of sentences that can be duplicates before penalty (default 0.15 = 15%). */
+    repetitionThreshold?: number;
+    /** Min output length in chars; below this the turn is deemed too terse. */
+    minLength?: number;
+    /** Max output length in chars; above this the turn is deemed too verbose. */
+    maxLength?: number;
+    /** How heavily each violation class reduces the score (default 1). */
+    penaltyWeights?: Partial<Record<SlopCategory, number>>;
+}
+type SlopCategory = 'banned_phrase' | 'banned_opening' | 'hedging' | 'apology' | 'repetition' | 'length';
+/** Create a reusable Judge function from an anti-slop config. */
+declare function createAntiSlopJudge(config?: AntiSlopConfig): JudgeFn;
+interface AntiSlopIssue {
+    category: SlopCategory;
+    detail: string;
+    example?: string;
+}
+interface AntiSlopReport {
+    /** 0–10 score; 10 is clean, lower values mean more slop. */
+    score: number;
+    issues: AntiSlopIssue[];
+    /** Count of each category for programmatic aggregation. */
+    counts: Record<SlopCategory, number>;
 }
 /**
- * AgentDriver — meta-agent that plays a persona against the real product.
+ * Pure function — analyze one or more outputs against the config. Exposed
+ * separately so consumers can build their own reporters on top.
+ */
+declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSlopConfig>, 'domain'> & {
+    penaltyWeights: Record<SlopCategory, number>;
+}): AntiSlopReport;
+/**
+ * Artifact validators.
  *
- * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
- * Not scripted — the driver gets the current product state and decides
- * the next realistic user message.
+ * Generic "score a produced artifact" primitive. Tax uses it for PDF form
+ * correctness, legal for contract clauses, film for script breakdowns, GTM
+ * for social posts. One interface, many validators; all plug into
+ * `BenchmarkRunner` the same way.
+ *
+ * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
+ * plus a `ValidationContext` (scenario id, the turns that produced it) and
+ * returns a `ValidationResult` with pass/fail + 0..1 score + structured
+ * issues.
  */
-declare class AgentDriver {
-    private tc;
-    private client;
-    private driverModel;
-    private productContext;
-    constructor(tc: TCloud, config: AgentDriverConfig);
+interface Artifact$1 {
+    /** Logical kind — validators type-guard on this */
+    kind: 'file' | 'json' | 'text' | 'binary' | string;
+    /** Filesystem-style path, optional */
+    path?: string;
+    /** String content for text/json/file kinds */
+    content?: string;
+    /** Binary content (if kind === 'binary') */
+    bytes?: Uint8Array;
+    /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
+    metadata?: Record<string, unknown>;
+}
+interface ValidationContext {
+    scenarioId: string;
+    turnIndex?: number;
+    /** Prior artifacts for multi-artifact scenarios */
+    priorArtifacts?: Artifact$1[];
+    /** Free-form hints the validator uses for domain-specific checks */
+    hints?: Record<string, unknown>;
+}
+interface ValidationIssue {
+    severity: 'error' | 'warning' | 'info';
+    message: string;
+    /** Optional path into the artifact (e.g. JSON path or byte offset) */
+    locus?: string;
+}
+interface ValidationResult {
+    pass: boolean;
+    /** 0–1 normalized score. Validators should keep it consistent with `pass`: a passing result should never score below a failing one. */
+    score: number;
+    issues: ValidationIssue[];
+    /** Diagnostic payload for reporters */
+    evidence?: Record<string, unknown>;
+}
+interface ArtifactValidator {
+    /** Stable identifier for the validator; appears in reports. */
+    name: string;
+    /** Optional description for human-facing reports. */
+    description?: string;
+    /** Called once per artifact; validators are expected to be pure + idempotent. */
+    validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
+}
+/**
+ * Run every validator on the same artifact; aggregate pass as AND, score as
+ * (weighted) mean, issues concatenated. Weights default to 1 each.
+ */
+declare function composeValidators(validators: ArtifactValidator[], options?: {
+    name?: string;
+    weights?: number[];
+}): ArtifactValidator;
+/** Pass if the artifact body matches a provided regex. */
+declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
+/** Pass if JSON parses and every required key is present. */
+declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
+/** Pass if min ≤ byte length ≤ max. */
+declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
+/** Pass if the artifact contains every required substring (case-insensitive by default). */
+declare function containsAll(name: string, required: string[], options?: {
+    caseSensitive?: boolean;
+}): ArtifactValidator;
+/**
+ * Workspace inspector — score the persisted state of an agent after a run.
+ *
+ * Many evals don't ask "did the response say the right thing" but "did the
+ * agent put the right rows in the DB / files in the vault / entities on the
+ * canvas". This is the primitive for that.
+ *
+ * Implementations read from D1, KV, filesystem, or any store — the interface
+ * is deliberately small so consumers plug in their own backends.
+ */
+interface WorkspaceSnapshot {
+    /** Vault files: logical path → content */
+    files: Record<string, string>;
+    /** DB rows: table name → array of rows (post-validation) */
+    rows: Record<string, Array<Record<string, unknown>>>;
+    /** KV entries: key → value (scoped to whatever prefix the inspector chose) */
+    kv: Record<string, string>;
+    /** Free-form blob metadata: for large binaries the inspector stores summary, not bytes */
+    blobs?: Record<string, {
+        size: number;
+        hash?: string;
+        mimeType?: string;
+    }>;
+}
+interface InspectorContext {
+    /** Workspace / agent / thread id — whatever the backend uses to scope the snapshot */
+    scopeId: string;
+    /** Optional scenario id — allows scenario-specific snapshot shaping */
+    scenarioId?: string;
+}
+interface WorkspaceInspector {
+    name: string;
+    snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
+}
+declare class InMemoryWorkspaceInspector implements WorkspaceInspector {
+    readonly name = "in-memory";
+    private readonly snapshots;
+    set(scopeId: string, snapshot: WorkspaceSnapshot): void;
+    snapshot(context: InspectorContext): Promise<WorkspaceSnapshot>;
+}
+interface WorkspaceAssertion {
+    name: string;
+    description?: string;
+    check(snapshot: WorkspaceSnapshot): WorkspaceAssertionResult;
+}
+interface WorkspaceAssertionResult {
+    pass: boolean;
+    /** 0..1 — partial credit for assertions that admit it */
+    score: number;
+    detail?: string;
+}
+declare function fileExists(path: string): WorkspaceAssertion;
+declare function fileContains(path: string, needle: string): WorkspaceAssertion;
+declare function rowCount(table: string, min: number, max?: number): WorkspaceAssertion;
+declare function rowWhere<T extends Record<string, unknown>>(table: string, predicate: (row: T) => boolean, options?: {
+    min?: number;
+}): WorkspaceAssertion;
+/** Run many assertions; return aggregate pass + mean score + per-assertion details. */
+declare function runAssertions(snapshot: WorkspaceSnapshot, assertions: WorkspaceAssertion[]): {
+    pass: boolean;
+    score: number;
+    results: Array<{
+        assertion: string;
+        result: WorkspaceAssertionResult;
+    }>;
+};
+/**
+ * Experiment tracker — group runs, diff them, watch scores move over time.
+ *
+ * Not MLflow. Not Weights & Biases. Just the 20% that actually ships:
+ *   - A run has a config (prompt hash, model, scenario ids, seed)
+ *   - Runs belong to experiments (named groups)
+ *   - The store is pluggable (in-memory for tests, filesystem for local,
+ *     custom for Langfuse/D1)
+ *   - Diffs show score deltas, new/dropped scenarios, and config changes
+ *
+ * The output plugs directly into `BenchmarkReport` — runs archive the full
+ * report, diff operates on the summary.
+ */
+interface RunConfig {
+    experimentId: string;
+    name?: string;
+    model?: string;
+    promptHash?: string;
+    promptVersion?: string;
+    seed?: number;
+    metadata?: Record<string, unknown>;
+}
+interface Run$1 {
+    id: string;
+    experimentId: string;
+    name?: string;
+    config: RunConfig;
+    startedAt: string;
+    completedAt?: string;
+    status: 'running' | 'completed' | 'failed';
+    report?: BenchmarkReport;
+    error?: string;
+}
+interface Experiment {
+    id: string;
+    name: string;
+    createdAt: string;
+    metadata?: Record<string, unknown>;
+}
+interface ExperimentStore {
+    saveExperiment(exp: Experiment): Promise<void>;
+    getExperiment(id: string): Promise<Experiment | null>;
+    listExperiments(): Promise<Experiment[]>;
+    saveRun(run: Run$1): Promise<void>;
+    getRun(id: string): Promise<Run$1 | null>;
+    listRuns(experimentId: string): Promise<Run$1[]>;
+}
+declare class InMemoryExperimentStore implements ExperimentStore {
+    private readonly experiments;
+    private readonly runs;
+    saveExperiment(exp: Experiment): Promise<void>;
+    getExperiment(id: string): Promise<Experiment | null>;
+    listExperiments(): Promise<Experiment[]>;
+    saveRun(run: Run$1): Promise<void>;
+    getRun(id: string): Promise<Run$1 | null>;
+    listRuns(experimentId: string): Promise<Run$1[]>;
+}
+declare class ExperimentTracker {
+    private readonly store;
+    constructor(store: ExperimentStore);
+    startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
+    startRun(config: RunConfig): Promise<Run$1>;
+    completeRun(runId: string, report: BenchmarkReport): Promise<void>;
+    failRun(runId: string, error: string): Promise<void>;
     /**
-     * Run a persona through the product.
-     *
-     * Returns metrics on how many turns to completion, cost curve,
-     * quality curve, and convergence curve.
+     * Diff two completed runs. Returns per-scenario deltas, aggregate delta,
+     * and config changes that may explain the movement.
      */
-    run(persona: PersonaConfig): Promise<DriverResult>;
-    /** Use the driver LLM to decide what the "user" says next */
-    private decideNextMessage;
-    /** Handle pending approvals based on persona feedback patterns */
-    private handleApprovals;
-    /** Describe which completion criteria are met */
-    private describeCompletion;
+    diff(runIdA: string, runIdB: string): Promise<RunDiff>;
+    /** Timeline of aggregate scores for an experiment. */
+    timeline(experimentId: string): Promise<Array<{
+        runId: string;
+        startedAt: string;
+        overall: number | null;
+    }>>;
+}
+interface RunDiff {
+    before: {
+        runId: string;
+        name?: string;
+        startedAt: string;
+    };
+    after: {
+        runId: string;
+        name?: string;
+        startedAt: string;
+    };
+    aggregateDelta: number;
+    scenarios: Array<{
+        scenarioId: string;
+        before: number | null;
+        after: number | null;
+        delta: number | null;
+        status: 'improved' | 'regressed' | 'unchanged' | 'added' | 'removed';
+    }>;
+    configChanges: Record<string, {
+        before: unknown;
+        after: unknown;
+    }>;
 }
 /**
- * Report generation utilities.
+ * Prompt optimizer — A/B test prompt variants with statistical rigor.
  *
- * Outputs convergence curves, cost curves, quality curves,
- * and per-persona summaries in markdown format.
+ * Runs N prompt variants against a fixed scenario set, collects per-scenario
+ * scores via the user-provided `scoreVariant` callback, and returns:
+ *   - per-variant mean + bootstrap CI
+ *   - pairwise significance (Mann-Whitney, non-parametric — works on any
+ *     score distribution, not just normal)
+ *   - a winner (highest mean, flagged if the lead is not significant)
+ *
+ * Deliberately generic — the `scoreVariant` callback does whatever domain
+ * work the consumer needs (invoke the agent, judge the output, whatever),
+ * and returns a number per scenario. This lets the optimizer stay small +
+ * testable.
  */
-/** Generate a markdown report from benchmark results */
-declare function formatBenchmarkReport(report: BenchmarkReport): string;
-/** Generate a markdown report from agent driver results */
-declare function formatDriverReport(results: DriverResult[]): string;
-/** Print a compact summary to console */
-declare function printDriverSummary(results: DriverResult[]): void;
+interface PromptVariant {
+    id: string;
+    prompt: string;
+    metadata?: Record<string, unknown>;
+}
+interface OptimizationConfig {
+    variants: PromptVariant[];
+    /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
+    trialsPerScenario?: number;
+    /** Significance threshold for pairwise comparison (default 0.05). */
+    significanceLevel?: number;
+    /**
+     * The scoring callback. For each (variant, scenarioId, trialIndex), produce
+     * a score in 0..1 (or any numeric range — the optimizer only cares about
+     * monotonicity).
+     */
+    scoreVariant: (args: {
+        variant: PromptVariant;
+        scenarioId: string;
+        trialIndex: number;
+    }) => Promise<number>;
+    /** Scenario ids to run against. */
+    scenarioIds: string[];
+    /** Optional hook — fires after each (variant, scenario) fully scored. */
+    onScenarioComplete?: (info: {
+        variantId: string;
+        scenarioId: string;
+        scores: number[];
+    }) => void;
+}
+interface VariantScore {
+    variantId: string;
+    mean: number;
+    ci95: {
+        lower: number;
+        upper: number;
+    };
+    n: number;
+    perScenario: Record<string, {
+        mean: number;
+        n: number;
+        samples: number[];
+    }>;
+}
+interface PairwiseComparison {
+    variantA: string;
+    variantB: string;
+    pValue: number;
+    /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
+    qValue: number;
+    /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
+    significant: boolean;
+    meanDelta: number;
+}
+interface OptimizationResult {
+    winner: {
+        variantId: string;
+        /** True when the winner's lead vs every other variant is statistically significant. */
+        significant: boolean;
+        ciLowerBoundExceedsSecondMean: boolean;
+    };
+    scores: VariantScore[];
+    pairwise: PairwiseComparison[];
+    config: {
+        trialsPerScenario: number;
+        significanceLevel: number;
+        variants: string[];
+        scenarios: string[];
+    };
+}
+declare class PromptOptimizer {
+    run(config: OptimizationConfig): Promise<OptimizationResult>;
+}
+/**
+ * Dual-agent convergence bench.
+ *
+ * Pattern lifted from tax-agent + legal-agent: two agents take turns until
+ * they converge on a consensus artifact. One proposes, the other critiques;
+ * the proposer revises; repeat until a score threshold is hit or max rounds.
+ *
+ * Generalized so any two "agents" (gateways, local functions, anything with
+ * `propose` + `critique`) compose in. Returns convergence rounds per
+ * scenario + whether convergence happened.
+ */
+interface DualAgentScenario {
+    id: string;
+    initialPrompt: string;
+    /** Optional context the agents can read (e.g. source documents). */
+    context?: Record<string, unknown>;
+}
+interface DualAgentRound {
+    roundIndex: number;
+    proposal: string;
+    critique: string;
+    convergenceScore: number;
+}
+interface DualAgentScenarioResult {
+    scenarioId: string;
+    converged: boolean;
+    roundsToConverge: number | null;
+    finalProposal: string;
+    history: DualAgentRound[];
+    finalScore: number;
+}
+interface DualAgentBenchConfig {
+    scenarios: DualAgentScenario[];
+    maxRounds?: number;
+    /** Convergence threshold in 0..1 (default 0.85). */
+    convergenceThreshold?: number;
+    /**
+     * Propose an answer given the scenario + the critic's prior critique (if any).
+     * Returns the proposal string.
+     */
+    propose: (args: {
+        scenario: DualAgentScenario;
+        roundIndex: number;
+        priorProposal?: string;
+        priorCritique?: string;
+    }) => Promise<string>;
+    /**
+     * Critique the proposer's current output. Returns a free-text critique
+     * plus a convergence score indicating how close the proposal is to
+     * acceptable: 1.0 = accept, 0.0 = totally off.
+     */
+    critique: (args: {
+        scenario: DualAgentScenario;
+        roundIndex: number;
+        proposal: string;
+    }) => Promise<{
+        critique: string;
+        convergenceScore: number;
+    }>;
+    /** Optional per-round hook for progress + tracing. */
+    onRoundComplete?: (info: {
+        scenarioId: string;
+        round: DualAgentRound;
+    }) => void;
+}
+interface DualAgentReport {
+    scenarios: DualAgentScenarioResult[];
+    aggregate: {
+        convergenceRate: number;
+        avgRoundsToConverge: number | null;
+        avgFinalScore: number;
+    };
+    config: {
+        maxRounds: number;
+        convergenceThreshold: number;
+    };
+}
+declare class DualAgentBench {
+    run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
+}
+/**
+ * TraceSchema v1 — the canonical data model for agent-eval.
+ *
+ * Every score, every failure class, every pipeline in the framework is
+ * a view over this data. Shape it once, live with it.
+ *
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
+ * entities that OTEL leaves as free-form attributes.
+ */
+declare const TRACE_SCHEMA_VERSION = "1.0.0";
+type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
+interface BudgetSpec {
+    tokens?: number;
+    wallMs?: number;
+    calls?: number;
+    usd?: number;
+}
+interface RunOutcome {
+    score?: number;
+    pass?: boolean;
+    failureClass?: FailureClass;
+    notes?: string;
+}
+/**
+ * Layer — optional classification in a nested build workflow.
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
+ * `app-runtime`: a run of the generated agent against a domain scenario.
+ * `meta`: any meta-eval (judge replay, correlation analysis).
+ */
+type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
+interface Run {
+    runId: string;
+    scenarioId: string;
+    variantId?: string;
+    datasetVersion?: string;
+    /** Git SHA of agent code at run time. */
+    codeSha?: string;
+    /** Hash of the prompt template + any system prompt. */
+    promptSha?: string;
+    /** Model id + date + system-prompt hash, concatenated. */
+    modelFingerprint?: string;
+    seed?: number;
+    /** Arbitrary environment markers (shell, docker version, tz). */
+    envFingerprint?: Record<string, string>;
+    /** Version of the redaction rules applied to this run. */
+    redactionVersion?: string;
+    /** Parent run in a nested build workflow. A builder run's children are
+     *  app-build runs, whose children in turn are app-runtime runs. */
+    parentRunId?: string;
+    /** Stable project identifier — groups runs across chats + sessions. */
+    projectId?: string;
+    /** Chat/conversation identifier within a project. */
+    chatId?: string;
+    /** Layer classification — hint for aggregation; not enforced. */
+    layer?: RunLayer;
+    startedAt: number;
+    endedAt?: number;
+    status: RunStatus;
+    outcome?: RunOutcome;
+    budget?: BudgetSpec;
+    /** Free-form labels for downstream grouping. */
+    tags?: Record<string, string>;
+}
+type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
+type SpanStatus = 'ok' | 'error';
+interface SpanBase {
+    spanId: string;
+    parentSpanId?: string;
+    runId: string;
+    kind: SpanKind;
+    name: string;
+    startedAt: number;
+    endedAt?: number;
+    status?: SpanStatus;
+    error?: string;
+    /** Anything not covered by typed fields. Kept deliberately free-form. */
+    attributes?: Record<string, unknown>;
+}
+interface Message {
+    role: 'system' | 'user' | 'assistant' | 'tool';
+    content: string;
+    tokens?: number;
+    /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
+    images?: Array<{
+        artifactId?: string;
+        url?: string;
+        mime?: string;
+    }>;
+}
+interface LlmSpan extends SpanBase {
+    kind: 'llm';
+    model: string;
+    messages: Message[];
+    output?: string;
+    inputTokens?: number;
+    outputTokens?: number;
+    cachedTokens?: number;
+    reasoningTokens?: number;
+    costUsd?: number;
+    finishReason?: string;
+}
+interface ToolSpan extends SpanBase {
+    kind: 'tool';
+    toolName: string;
+    args: unknown;
+    result?: unknown;
+    latencyMs?: number;
+}
+interface RetrievalSpan extends SpanBase {
+    kind: 'retrieval';
+    query: string;
+    hits: Array<{
+        docId: string;
+        score: number;
+        content?: string;
+    }>;
+}
+interface JudgeSpan extends SpanBase {
+    kind: 'judge';
+    judgeId: string;
+    /** Span this judgment applies to. */
+    targetSpanId: string;
+    dimension: string;
+    /** Numeric score (free-range; interpretation up to the judge). */
+    score: number;
+    rationale?: string;
+    evidence?: string;
+}
+interface SandboxSpan extends SpanBase {
+    kind: 'sandbox';
+    image?: string;
+    command?: string;
+    exitCode?: number;
+    testsTotal?: number;
+    testsPassed?: number;
+    stdoutHash?: string;
+    stderrHash?: string;
+    /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
+    wallMs?: number;
+}
+interface GenericSpan extends SpanBase {
+    kind: 'agent' | 'custom';
+}
+type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
+type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
+interface TraceEvent {
+    eventId: string;
+    runId: string;
+    spanId?: string;
+    kind: EventKind;
+    timestamp: number;
+    payload: Record<string, unknown>;
+}
+interface BudgetLedgerEntry {
+    runId: string;
+    dimension: keyof BudgetSpec;
+    limit: number;
+    consumed: number;
+    remaining: number;
+    timestamp: number;
+    breached: boolean;
+    /** Span that triggered this entry, if any. */
+    spanId?: string;
+}
+interface Artifact {
+    artifactId: string;
+    runId: string;
+    spanId?: string;
+    contentType: string;
+    sizeBytes: number;
+    /** sha256 in hex. */
+    hash: string;
+    /** External storage URL (R2, S3, filesystem path). */
+    storageUrl?: string;
+    /** Inline content for small blobs — keep under ~64KB. */
+    inlineContent?: string;
+}
+type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
+declare const FAILURE_CLASSES: readonly FailureClass[];
+declare function isLlmSpan(s: Span): s is LlmSpan;
+declare function isToolSpan(s: Span): s is ToolSpan;
+declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
+declare function isJudgeSpan(s: Span): s is JudgeSpan;
+declare function isSandboxSpan(s: Span): s is SandboxSpan;
+interface RunFilter {
+    scenarioId?: string;
+    variantId?: string;
+    status?: RunStatus;
+    since?: number;
+    until?: number;
+    tag?: {
+        key: string;
+        value: string;
+    };
+    parentRunId?: string;
+    projectId?: string;
+    chatId?: string;
+    layer?: RunLayer;
+}
+interface SpanFilter {
+    runId?: string;
+    parentSpanId?: string;
+    kind?: SpanKind;
+    name?: string;
+    toolName?: string;
+    judgeId?: string;
+    since?: number;
+    until?: number;
+}
+interface EventFilter {
+    runId?: string;
+    spanId?: string;
+    kind?: EventKind;
+    since?: number;
+    until?: number;
+}
+interface TraceStore {
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+declare class InMemoryTraceStore implements TraceStore {
+    private runs;
+    private allSpans;
+    private allEvents;
+    private allArtifacts;
+    private allBudget;
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+interface FileSystemTraceStoreOptions {
+    dir: string;
+    /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
+    maxBytes?: number;
+}
+declare class FileSystemTraceStore implements TraceStore {
+    private dir;
+    private maxBytes;
+    /** Lazy in-memory index for queries — populated on first read. */
+    private index?;
+    private loaded;
+    constructor(options: FileSystemTraceStoreOptions);
+    private ensureDir;
+    private append;
+    private insertInto;
+    private load;
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+/**
+ * TraceEmitter — hierarchical span builder that auto-parents using an
+ * internal stack. One emitter per Run; emitters do NOT share state.
+ *
+ * Convenience methods (`llm`, `tool`, `retrieval`, `sandbox`) resolve
+ * to a `SpanHandle` with `.end()` / `.fail()` so callers don't have to
+ * thread spanIds manually; `recordJudge` resolves to the recorded
+ * `JudgeSpan` instead of a handle. For async workflows that can't use
+ * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
+ * explicitly.
+ */
+interface SpanHandle<S extends Span = Span> {
+    span: S;
+    end(patch?: Partial<S>): Promise<void>;
+    fail(error: string | Error, patch?: Partial<S>): Promise<void>;
+}
+interface TraceEmitterOptions {
+    runId?: string;
+    /** Inject a clock for deterministic tests. */
+    now?: () => number;
+    /** Inject an id generator for deterministic tests. */
+    id?: () => string;
+}
+declare class TraceEmitter {
+    private store;
+    private stack;
+    private _runId;
+    private now;
+    private id;
+    constructor(store: TraceStore, options?: TraceEmitterOptions);
+    get runId(): string;
+    startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
+    endRun(outcome?: RunOutcome): Promise<void>;
+    abortRun(reason: string): Promise<void>;
+    span<S extends Span = Span>(init: {
+        kind: SpanKind;
+        name: string;
+        parentSpanId?: string;
+        attributes?: Record<string, unknown>;
+    } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
+    private handle;
+    private pop;
+    llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
+    tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
+    retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
+    recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
+    sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
+    emit(event: {
+        kind: EventKind;
+        spanId?: string;
+        payload?: Record<string, unknown>;
+    }): Promise<TraceEvent>;
+    recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
+        timestamp?: number;
+    }): Promise<BudgetLedgerEntry>;
+    recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
+    /**
+     * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
+     * Returns the fn's return value. Use this for the 95% case.
+     */
+    within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
+}
+/** Helper that builds the `init` argument accepted by `TraceEmitter.llm` from a provider-shaped response. */
+declare function llmSpanFromProvider(args: {
+    name?: string;
+    model: string;
+    messages: Message[];
+    output: string;
+    usage?: {
+        inputTokens?: number;
+        outputTokens?: number;
+        cachedTokens?: number;
+        reasoningTokens?: number;
+    };
+    costUsd?: number;
+    finishReason?: string;
+}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
+/**
+ * Typed query helpers over TraceStore.
+ *
+ * Not a full SQL engine — a minimal, composable set of operators that
+ * cover the canned-pipeline use cases. For ad-hoc analytics, persist to
+ * NDJSON and point DuckDB at it; the schema is stable so external SQL
+ * tooling works out of the box.
+ */
+declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
+declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
+declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
+declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
+/** Group spans by any key selector. */
+declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
+/** Hash tool arguments to a string that stays stable regardless of object key order, for de-duplication. */
+declare function argHash(args: unknown): string;
+/** Sum an LLM-span array into aggregate token + cost. */
+declare function aggregateLlm(spans: LlmSpan[]): {
+    inputTokens: number;
+    outputTokens: number;
+    cachedTokens: number;
+    costUsd: number;
+};
+/** Pick the outcome's failure class when present, else derive 'success' from run status. */
+declare function runFailureClass(run: Run): FailureClass;
+/**
+ * Redaction — remove PII / secrets from trace payloads before persist.
+ *
+ * Pre-persistence rules mean raw traces in storage are already scrubbed.
+ * Unredacted variants (for debugging / post-mortems) live in a separate
+ * storage layer with stricter access controls; this module only covers
+ * the default scrub-then-persist path.
+ *
+ * Rules compose: pass an array of `RedactionRule`, each is applied in
+ * order. Strings that match get replaced with a tagged sentinel so the
+ * eval framework can count how many redactions happened per run
+ * (surfaced via `redaction_applied` events).
+ */
+interface RedactionRule {
+    id: string;
+    pattern: RegExp;
+    /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
+    replacement?: string;
+}
+interface RedactionReport {
+    redactionCount: number;
+    byRule: Record<string, number>;
+}
+/** OWASP / common-sense defaults — extend per-domain. */
+declare const DEFAULT_REDACTION_RULES: RedactionRule[];
+declare const REDACTION_VERSION = "1.0.0";
+/**
+ * Redact a single string. Returns the new string and a per-rule count of
+ * how many substitutions fired.
+ */
+declare function redactString(input: string, rules?: RedactionRule[]): {
+    output: string;
+    report: RedactionReport;
+};
+/**
+ * Walk a JSON-ish value applying `redactString` to every string leaf.
+ * Arrays and plain objects are recursed; other types pass through
+ * untouched. Circular references throw — traces should be tree-shaped.
+ */
+declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
+    value: unknown;
+    report: RedactionReport;
+};
+/**
+ * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
+ * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
+ *
+ * Wire format only. We do NOT depend on the @opentelemetry SDK — that
+ * would drag in polyfills incompatible with Workers/Edge. Consumers
+ * push the JSON to their collector of choice via HTTP.
+ *
+ * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
+ */
+declare const OTEL_AGENT_EVAL_SCOPE: {
+    name: string;
+    version: string;
+};
+interface OtlpSpan {
+    traceId: string;
+    spanId: string;
+    parentSpanId?: string;
+    name: string;
+    kind: number;
+    startTimeUnixNano: string;
+    endTimeUnixNano: string;
+    attributes: Array<{
+        key: string;
+        value: {
+            stringValue?: string;
+            intValue?: string;
+            doubleValue?: number;
+            boolValue?: boolean;
+        };
+    }>;
+    events?: Array<{
+        timeUnixNano: string;
+        name: string;
+        attributes?: OtlpSpan['attributes'];
+    }>;
+    status?: {
+        code: number;
+        message?: string;
+    };
+}
+interface OtlpResourceSpans {
+    resource: {
+        attributes: OtlpSpan['attributes'];
+    };
+    scopeSpans: Array<{
+        scope: typeof OTEL_AGENT_EVAL_SCOPE;
+        spans: OtlpSpan[];
+    }>;
+}
+interface OtlpExport {
+    resourceSpans: OtlpResourceSpans[];
+}
+/** Export a single run's spans + events in OTLP/JSON. */
+declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
+/**
+ * SandboxHarness — executes a scenario in an isolated environment and
+ * emits a rich SandboxSpan into the trace.
+ *
+ * Two built-in drivers:
+ *   - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.
+ *     Fast, no dependencies, fine for unit tests and most CI gates.
+ *   - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;
+ *     shells out to `docker run`. Stronger isolation, slower startup.
+ *
+ * Consumers implement `SandboxDriver` for custom backends (Firecracker,
+ * Cloudflare sandbox product, etc.). The harness doesn't care which.
+ */
+interface HarnessConfig {
+    /** Setup command (e.g. "pnpm install"). Non-zero exit fails the run. */
+    setupCommand?: string;
+    /** Run command (e.g. "pnpm build"). */
+    runCommand?: string;
+    /** Test command (e.g. "pnpm test --run"). Drives the test count + pass count. */
+    testCommand?: string;
+    /** Absolute cwd for the subprocess driver. Ignored by docker driver. */
+    cwd?: string;
+    /** Max wall-clock per phase in ms. Default 10 minutes. */
+    timeoutMs?: number;
+    /** Image for the docker driver. */
+    image?: string;
+    /** Extra env vars (validated; shell-escaped). */
+    env?: Record<string, string>;
+    /** Parser for the test output — maps stdout/stderr/exit code → pass count. */
+    testParser?: TestOutputParser;
+}
+interface TestOutputParser {
+    id: string;
+    parse(stdout: string, stderr: string, exitCode: number): {
+        testsTotal: number;
+        testsPassed: number;
+    } | undefined;
+}
+interface SandboxResult {
+    phase: 'setup' | 'run' | 'test';
+    exitCode: number;
+    stdout: string;
+    stderr: string;
+    wallMs: number;
+    testsTotal?: number;
+    testsPassed?: number;
+}
+interface SandboxDriver {
+    id: string;
+    exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
+}
+/** Vitest default summary line: "Tests  X passed | Y failed". */
+declare const vitestTestParser: TestOutputParser;
+/** Pytest default: "collected N items" + " X passed, Y failed". */
+declare const pytestTestParser: TestOutputParser;
+/** Jest: "Tests: X passed, Y total" (and optional failed). */
+declare const jestTestParser: TestOutputParser;
+/** Composite parser — tries a list of parsers in order. */
+declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
+declare class SubprocessSandboxDriver implements SandboxDriver {
+    id: string;
+    exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
+}
+declare class DockerSandboxDriver implements SandboxDriver {
+    id: string;
+    exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
+}
+interface SandboxHarnessResult {
+    passed: boolean;
+    setup?: SandboxResult;
+    run?: SandboxResult;
+    test?: SandboxResult;
+    totalWallMs: number;
+    /** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */
+    score: number;
+}
+declare class SandboxHarness {
+    private driver;
+    constructor(driver?: SandboxDriver);
+    run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
+}
+/**
+ * TestGradedScenario — a scenario whose score comes from a test suite.
+ *
+ * This is the SWE-bench pattern generalized. The scenario ships:
+ *   - fixture data (setup instructions)
+ *   - a test command the harness runs
+ *   - optional assertion overrides
+ *
+ * The runner emits a run, delegates to SandboxHarness, records the
+ * outcome, and returns a structured verdict. Consumers bind their own
+ * agent execution to this contract.
+ */
+interface TestGradedScenario {
+    id: string;
+    description?: string;
+    harness: HarnessConfig;
+    /** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
+    passThreshold?: number;
+    /** Provenance for dataset tracking. */
+    datasetVersion?: string;
+    /** Free-form tags (difficulty, category, etc.). */
+    tags?: Record<string, string>;
+}
+interface TestGradedRunOptions {
+    variantId?: string;
+    driver?: SandboxDriver;
+    /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed, envFingerprint). */
+    provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
+}
+interface TestGradedRunResult {
+    runId: string;
+    scenario: TestGradedScenario;
+    harness: SandboxHarnessResult;
+    pass: boolean;
+    score: number;
+    failureClass?: FailureClass;
+}
+declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
+/**
+ * BudgetGuard — enforces token / wall-clock / call / $ caps, records
+ * a ledger entry on every decrement, emits `budget_breach` + throws
+ * `BudgetBreachError` when a cap is hit.
+ *
+ * Wraps a TraceEmitter. The emitter persists ledger entries + breach
+ * events so the classifier, pipelines, and reports can all read
+ * budget state from the trace corpus — no separate accounting.
+ */
+declare class BudgetBreachError extends Error {
+    dimension: keyof BudgetSpec;
+    limit: number;
+    attempted: number;
+    constructor(dimension: keyof BudgetSpec, limit: number, attempted: number);
+}
+declare class BudgetGuard {
+    private consumed;
+    private emitter;
+    private budget;
+    private startedAt;
+    constructor(emitter: TraceEmitter, budget: BudgetSpec, now?: () => number);
+    /** Record consumption. Throws `BudgetBreachError` if any dimension exceeds its cap. */
+    charge(delta: Partial<Record<keyof BudgetSpec, number>>, spanId?: string): Promise<void>;
+    /** Convenience: advance wall-clock budget based on elapsed wall time. */
+    tickWall(nowMs: number, spanId?: string): Promise<void>;
+    get state(): Record<keyof BudgetSpec, number>;
+}
+/**
+ * Failure taxonomy — canonical classes + a default classifier.
+ *
+ * Every failed run should end up in a named class. The classifier here
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
+ * the consumer for novel cases and trained into the rule base over time.
+ *
+ * Consumers call `classifyFailure({ run, spans, events })` and persist
+ * the returned `failureClass` as `Run.outcome.failureClass`.
+ */
+interface FailureContext {
+    run: Run;
+    spans: Span[];
+    events: TraceEvent[];
+}
+interface FailureClassification {
+    failureClass: FailureClass;
+    reason: string;
+    triggerSpanId?: string;
+    triggerEventId?: string;
+}
+/** Ordered rules — first match wins. */
+interface FailureRule {
+    id: string;
+    match: (ctx: FailureContext) => {
+        failureClass: FailureClass;
+        reason: string;
+        triggerSpanId?: string;
+        triggerEventId?: string;
+    } | null;
+}
+declare const DEFAULT_RULES: FailureRule[];
+/** Classify the failure mode of a run using an ordered rule list. */
+declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
+/**
+ * Trajectory — ordered, structured view over a run's spans.
+ *
+ * A pure function `buildTrajectory(store, runId) → Promise<Trajectory>`
+ * resolves to a topologically ordered list of `TrajectoryStep` (in
+ * `Trajectory.steps`) with parent-child grouping collapsed into a
+ * single line-of-agent-work. Separate analyzers (stuck-loop detection,
+ * waste ratio) live in `pipelines/` and consume the trajectory.
+ */
+interface TrajectoryStep {
+    index: number;
+    span: Span;
+    /** Depth in the span tree from the root. 0 = top-level. */
+    depth: number;
+    /** Events attached to this span. */
+    events: TraceEvent[];
+}
+interface Trajectory {
+    runId: string;
+    steps: TrajectoryStep[];
+    llmTurns: number;
+    toolCalls: number;
+    judgeVerdicts: number;
+    retrievals: number;
+    totalDurationMs: number;
+}
+declare function buildTrajectory(store: TraceStore, runId: string): Promise<Trajectory>;
+/**
+ * Tool-use metrics — derived purely from trace data.
+ *
+ * No scoring assumptions: consumers supply optional ground-truth tool
+ * selections per turn + optional "information used downstream" signals.
+ * Without those, we still compute descriptive metrics (error rate,
+ * retry rate, duplicate-call rate) that are useful on their own.
+ */
+interface ToolUseMetrics {
+    runId: string;
+    totalCalls: number;
+    byTool: Record<string, ToolStats>;
+    errorRate: number;
+    /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
+    duplicateRate: number;
+    /** Ratio of error calls followed by ≥1 retry on the same tool. */
+    retryRate: number;
+    /** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
+    selectionAccuracy?: number;
+}
+interface ToolStats {
+    calls: number;
+    errors: number;
+    avgLatencyMs: number;
+    duplicates: number;
+}
+interface ToolUseOptions {
+    /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
+    selectionLabels?: Record<string, boolean>;
+}
+declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
+/**
+ * StuckLoopView — detects when an agent calls the same tool with the
+ * same (or structurally similar) arguments ≥ N times in a short window.
+ *
+ * Rationale: agents that loop are the number-one production failure
+ * mode on long-horizon flows. The view returns (runId, toolName,
+ * argHash, occurrences, spanIds, windowMs) for each detected loop plus
+ * the fraction of runs affected and the total run count.
+ */
+interface StuckLoopFinding {
+    runId: string;
+    toolName: string;
+    argHash: string;
+    occurrences: number;
+    spanIds: string[];
+    /** Milliseconds between first and last call in the loop. */
+    windowMs: number;
+}
+interface StuckLoopReport {
+    findings: StuckLoopFinding[];
+    affectedRunRatio: number;
+    totalRuns: number;
+}
+interface StuckLoopOptions {
+    /** Minimum call count to flag a loop (default 3). */
+    minOccurrences?: number;
+    /** Filter to a specific runId; omit to scan the entire corpus. */
+    runId?: string;
+}
+declare function stuckLoopView(store: TraceStore, options?: StuckLoopOptions): Promise<StuckLoopReport>;
+/**
+ * ToolWasteView — fraction of tool calls whose results weren't used
+ * downstream. Without a "used" signal we fall back to structural
+ * proxies: error calls, duplicate calls, and tool calls followed by
+ * zero subsequent LLM spans are all considered waste.
+ *
+ * Consumers can pass a `usageOracle` that inspects a tool span and
+ * returns true iff the tool's result appears in a later LLM message,
+ * artifact, or state mutation — that's the canonical definition; the
+ * default heuristic is a reasonable fallback.
+ */
+interface ToolWasteFinding {
+    runId: string;
+    wastedCalls: number;
+    totalCalls: number;
+    wasteRate: number;
+}
+interface ToolWasteReport {
+    byRun: ToolWasteFinding[];
+    overallWasteRate: number;
+}
+interface ToolWasteOptions {
+    runId?: string;
+    usageOracle?: (tool: ToolSpan, later: {
+        llm: Awaited<ReturnType<typeof llmSpans>>;
+    }) => boolean;
+}
+declare function toolWasteView(store: TraceStore, options?: ToolWasteOptions): Promise<ToolWasteReport>;
+/**
+ * BudgetBreachView — aggregates breach events across the corpus.
+ *
+ * Answers: which dimensions get hit most often? Which scenarios are
+ * underbudgeted? Which variants trigger the most breaches?
+ */
+interface BudgetBreachFinding {
+    runId: string;
+    scenarioId: string;
+    variantId?: string;
+    dimension: keyof BudgetSpec;
+    limit: number;
+    consumed: number;
+    excessRatio: number;
+    timestamp: number;
+}
+interface BudgetBreachReport {
+    findings: BudgetBreachFinding[];
+    byDimension: Record<string, number>;
+    byScenario: Record<string, number>;
+    byVariant: Record<string, number>;
+    totalRuns: number;
+    breachedRunRatio: number;
+}
+declare function budgetBreachView(store: TraceStore, options?: {
+    scenarioId?: string;
+    variantId?: string;
+}): Promise<BudgetBreachReport>;
+/**
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
+ *
+ * Each cluster includes: N runs, scenarios affected, a representative
+ * error message, and an example run id.
+ * NOTE(review): a proposed mitigation hint (rule → action table) is
+ * also described here, but `FailureCluster` declares no field for it — confirm.
+ */
+interface FailureCluster {
+    failureClass: FailureClass;
+    /** Tool name when the trigger was a tool span, else undefined. */
+    toolName?: string;
+    /** First 16 chars of argHash — clusters similar args. */
+    argPrefix?: string;
+    runCount: number;
+    scenarioIds: string[];
+    exampleError?: string;
+    exampleRunId: string;
+}
+interface FailureClusterReport {
+    clusters: FailureCluster[];
+    totalFailures: number;
+    totalRuns: number;
+}
+declare function failureClusterView(store: TraceStore, options?: {
+    rules?: FailureRule[];
+    minClusterSize?: number;
+}): Promise<FailureClusterReport>;
+/**
+ * JudgeAgreementView — pairwise agreement between judges across the
+ * corpus, grouped by dimension.
+ *
+ * Output drives two workflows:
+ *   - Judge robustness audit: "does Claude agree with GPT at κ ≥ 0.6?"
+ *   - Calibration tracking: κ vs golden human labels over time (by
+ *     providing a `humanGoldenJudgeId`).
+ *     NOTE(review): `judgeAgreementView(store)` declares no
+ *     `humanGoldenJudgeId` parameter — confirm how it is supplied.
+ */
+interface JudgePair {
+    judgeA: string;
+    judgeB: string;
+    dimension: string;
+    /** Number of (targetSpanId, dimension) tuples both judges scored. */
+    commonItems: number;
+    pearson: number;
+    krippendorff: number;
+}
+interface JudgeAgreementReport {
+    pairs: JudgePair[];
+    dimensions: string[];
+    judgeIds: string[];
+}
+declare function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport>;
+/**
+ * FirstDivergenceView — aligns two trajectories by step index, reports
+ * the first step where they differ.
+ *
+ * "Differ" is configurable — default is (kind, toolName if tool, model
+ * if llm). Use this view to attribute "why is variant B better?" to a
+ * specific step rather than an aggregate mean delta.
+ */
+interface DivergenceReport {
+    runA: string;
+    runB: string;
+    firstDivergenceIndex: number | null;
+    aStep?: TrajectoryStep;
+    bStep?: TrajectoryStep;
+    reason?: string;
+    /** Common prefix length (steps that matched). */
+    commonPrefixLen: number;
+}
+interface DivergenceOptions {
+    /** Returns true if two steps are considered equal. Default: kind + tool/model match. */
+    stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
+}
+declare function firstDivergenceView(store: TraceStore, runA: string, runB: string, options?: DivergenceOptions): Promise<DivergenceReport>;
+/**
+ * Baseline regression detection.
+ *
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
+ * to: "is this run measurably worse than baseline?" — with enough
+ * statistical rigor to distinguish noise from drift.
+ *
+ * Uses:
+ *   - Welch's t-test (unequal variance) for per-metric mean comparison
+ *   - Cohen's d for effect size magnitude
+ *   - IQR for stability flag (unstable samples can't be trusted for comparisons)
+ *
+ * Returns a structured verdict: improved | regressed | stable | unstable.
+ */
+interface MetricSamples {
+    /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
+    metric: string;
+    /** Whether higher values are better. */
+    higherIsBetter: boolean;
+    baseline: number[];
+    candidate: number[];
+}
+interface MetricVerdict {
+    metric: string;
+    baselineMean: number;
+    candidateMean: number;
+    delta: number;
+    cohensD: number;
+    welchT: number;
+    welchDf: number;
+    welchP: number;
+    stable: boolean;
+    /** IQR of the combined samples — used as a rough stability indicator. */
+    iqr: number;
+    verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
+}
+interface BaselineReport {
+    metrics: MetricVerdict[];
+    /** True if any critical metric regressed. */
+    hasRegression: boolean;
+    /** True if any metric is unstable (too noisy to judge). */
+    hasUnstable: boolean;
+}
+interface BaselineOptions {
+    /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
+    effectThreshold?: number;
+    /** p-value threshold for statistical significance (default 0.05). */
+    alpha?: number;
+    /** IQR/mean ratio above which samples are flagged unstable (default 0.30). */
+    unstableCvThreshold?: number;
+}
+/**
+ * Compare candidate samples against baseline per metric. Verdict logic:
+ *   - unstable: IQR/|mean| > threshold on either set — not enough signal
+ *   - improved: meaningful effect in the "better" direction AND p < alpha
+ *   - regressed: meaningful effect in the "worse" direction AND p < alpha
+ *   - stable: otherwise (no significant change)
+ */
+declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
+/** Inter-quartile range; 0 when the sample has no spread. */
+declare function iqr(xs: number[]): number;
+/**
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
+ * when df is large.
+ */
+declare function welchsTTest(a: number[], b: number[]): {
+    t: number;
+    df: number;
+    p: number;
+};
+/**
+ * RegressionView — compares a candidate slice to a baseline slice on a
+ * named metric. Delegates the statistics (Welch's t-test, Cohen's d,
+ * IQR stability) to `baseline.ts`.
+ *
+ * This is the entry point for CI regression gates: "given runs tagged
+ * release=A and release=B, did any metric regress?"
+ */
+interface RegressionSpec {
+    metric: string;
+    higherIsBetter: boolean;
+    /** Extract a scalar from a run. Default extractors handle common metrics. */
+    extract?: (run: Run, store: TraceStore) => Promise<number | null>;
+}
+interface RegressionOptions extends BaselineOptions {
+    baseline: RunFilter;
+    candidate: RunFilter;
+}
+declare function regressionView(store: TraceStore, metrics: RegressionSpec[], options: RegressionOptions): Promise<BaselineReport>;
+/**
+ * SLO gates — quantified pass/fail primitives beyond score thresholds.
+ *
+ * Lifted from ADC's sandbox eval suite. Each SLO defines a metric, a
+ * threshold, and a severity (critical | warning). Critical breaches fail
+ * the eval; warnings are reported but don't gate CI. Margin is the
+ * ratio of actual to threshold for histogramming "how close are we?"
+ *
+ * Consumers assemble their own SLO arrays; DEFAULT_AGENT_SLOS covers
+ * the generic agent flow (provision, first token, pass rate, cost).
+ */
+type SloSeverity = 'critical' | 'warning';
+type SloComparator = 'lte' | 'gte';
+interface Slo {
+    /** Stable identifier — must be unique within an SLO set. */
+    id: string;
+    /** Human description, shown in reports. */
+    description: string;
+    /** Metric key looked up in the candidate record. */
+    metric: string;
+    /** Whether the metric should stay below (lte) or above (gte) threshold. */
+    comparator: SloComparator;
+    /** Threshold value. */
+    threshold: number;
+    severity: SloSeverity;
+}
+interface SloCheckResult {
+    slo: Slo;
+    actual: number | undefined;
+    passed: boolean;
+    /** Documented as actual/threshold for lte, threshold/actual for gte; 0 when actual is missing. NOTE(review): the original note reads ">1 means safe margin; <1 means breach", but with these ratios a value >1 is a breach (lte: actual > threshold; gte: actual < threshold) — confirm which is intended. */
+    margin: number;
+    detail: string;
+}
+interface SloReport {
+    results: SloCheckResult[];
+    passedCritical: boolean;
+    criticalBreaches: SloCheckResult[];
+    warnings: SloCheckResult[];
+}
+/**
+ * Evaluate an SLO set against a candidate metrics object. Missing metrics
+ * count as breaches — if you declared it, you must measure it.
+ */
+declare function checkSlos(metrics: Record<string, number>, slos: Slo[]): SloReport;
+/** Reference SLO set for agent-style evals. Tune per-product by cloning + overriding. */
+declare const DEFAULT_AGENT_SLOS: Slo[];
+/**
+ * Declarative oracles — ground-truth assertions without an LLM.
+ *
+ * Lifted from browser-agent-driver's _oracle.mjs. When you know the
+ * expected outcome exactly (a URL, a text fragment, a JSON shape), you
+ * don't need an LLM judge — you need a regex. These oracles are
+ * composable pass/fail checks over an observation bundle.
+ *
+ * Each oracle returns { pass, detail, evidence? } and has a short
+ * `id` for reporting. `evaluateOracles` runs a batch and aggregates.
+ */
+interface OracleObservation {
+    /** Final observable text output from the agent (response, page snapshot, stdout). */
+    text?: string;
+    /** Final URL — for browser-style scenarios. */
+    url?: string;
+    /** Any structured JSON the agent produced. */
+    json?: unknown;
+    /** Free-form context used by custom oracles. */
+    context?: Record<string, unknown>;
+}
+interface OracleResult {
+    id: string;
+    pass: boolean;
+    detail: string;
+    evidence?: string;
+}
+interface Oracle {
+    id: string;
+    check(obs: OracleObservation): OracleResult;
+}
+declare function textInSnapshot(needle: string, opts?: {
+    caseSensitive?: boolean;
+}): Oracle;
+declare function urlContains(fragment: string): Oracle;
+declare function jsonShape(expected: Record<string, unknown>): Oracle;
+declare function regexMatches(pattern: RegExp): Oracle;
+/**
+ * Anti-bot detector — distinguishes genuine failures from blocked navigation
+ * (cloudflare, recaptcha, etc). Returns an Oracle that PASSES when no block
+ * marker is present; on block, detail names the blocker so runners can tag
+ * results as "blocked" rather than "failed". Lifted from browser-agent-driver.
+ */
+declare function notBlocked(): Oracle;
+interface OracleReport {
+    results: OracleResult[];
+    pass: boolean;
+    passCount: number;
+    failCount: number;
+    /** 0-1 ratio of oracles passed. */
+    score: number;
+}
+/** Run all oracles against one observation and aggregate. */
+declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): OracleReport;
+/**
+ * Cost tracker — token + USD accounting per scenario and per run.
+ *
+ * Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
+ * optimizer needs to know "is the quality gain worth the cost delta?",
+ * and every dashboard needs dollars-per-completed-task. MODEL_PRICING
+ * from metrics.ts stays authoritative for estimate math; this module
+ * adds the aggregation + per-scenario roll-up that was duplicated
+ * across 4 verticals.
+ */
+interface TokenSpec {
+    inputTokens: number;
+    outputTokens: number;
+    cachedTokens?: number;
+    reasoningTokens?: number;
+}
+interface CostEntry extends TokenSpec {
+    scenarioId: string;
+    model: string;
+    /** Override estimate with an observed cost (e.g. from provider response). */
+    actualCostUsd?: number;
+    timestamp: number;
+    /** Free-form tags (variant id, round #, etc.). */
+    tags?: Record<string, string>;
+}
+interface ScenarioCost {
+    scenarioId: string;
+    entries: CostEntry[];
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalCachedTokens: number;
+    totalCostUsd: number;
+    /** Pass flag — set by consumer via markOutcome; used for cost-per-completed-task. */
+    completed?: boolean;
+}
+declare class CostTracker {
+    private byScenario;
+    record(entry: Omit<CostEntry, 'timestamp'> & {
+        timestamp?: number;
+    }): CostEntry;
+    markOutcome(scenarioId: string, completed: boolean): void;
+    get(scenarioId: string): ScenarioCost | undefined;
+    list(): ScenarioCost[];
+    summary(): CostSummary;
+}
+interface CostSummary {
+    scenarioCount: number;
+    completedCount: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalCostUsd: number;
+    avgCostPerScenarioUsd: number;
+    /** Total USD / completed scenarios — null when nothing completed. */
+    costPerCompletedTaskUsd: number | null;
+}
+/**
+ * Pareto frontier — multi-objective optimization over candidate runs.
+ *
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
+ * ttfb), you rarely have a single "winner" — you have a set of
+ * non-dominated candidates. This module exposes:
+ *
+ *   - `paretoFrontier`: filter a set of candidates to the non-dominated ones
+ *   - `dominates`: does A dominate B across all objectives?
+ *
+ * Each objective is declared with a direction: 'maximize' (higher=better)
+ * or 'minimize' (lower=better). Candidates are any object; pass an
+ * `objective(candidate)` accessor.
+ */
+type Direction = 'maximize' | 'minimize';
+interface Objective<T> {
+    /** Stable label used in reports. */
+    name: string;
+    direction: Direction;
+    value: (candidate: T) => number;
+}
+interface ParetoResult<T> {
+    frontier: T[];
+    dominated: T[];
+    /**
+     * Dominance pairs: each entry's `dominator` dominates every candidate in
+     * that entry's `dominated` list.
+     * NOTE(review): the original comment implied entries are index-aligned with
+     * `frontier` (entry i belongs to frontier[i]) — confirm against the implementation.
+     */
+    dominanceMap: Array<{
+        dominator: T;
+        dominated: T[];
+    }>;
+}
+/** Does candidate A weakly dominate B — strictly better on at least one objective and no worse on any? */
+declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
+/**
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
+ * objective are excluded (can't rank them). A candidate enters the frontier
+ * iff no other candidate dominates it.
+ */
+declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
+/**
+ * Series convergence — detects whether a sequence of scalar measurements
+ * is stabilizing, drifting, or noisy.
+ *
+ * Lifted from ADC convergence.ts. The per-turn `ConvergenceTracker` is
+ * about progress *within* a single run; this module is about drift
+ * *across* runs (e.g. "are my nightly eval scores stabilizing?").
+ *
+ * Three signals:
+ *   - stabilized: last `window` values have a low coefficient of variation (≤ `stableCv`) — done
+ *   - drifting:   recent trend is monotonic and beyond noise — regressing or improving
+ *   - noisy:      neither — keep iterating, but flag as untrustworthy for gating
+ */
+interface SeriesConvergenceOptions {
+    /** Window size for "recent" analysis (default 5). */
+    window?: number;
+    /** Coefficient-of-variation threshold below which the window is stabilized (default 0.05 = 5%). */
+    stableCv?: number;
+    /** Minimum monotone run length to call drift (default 3). */
+    driftRun?: number;
+}
+interface SeriesConvergenceResult {
+    state: 'stabilized' | 'drifting-up' | 'drifting-down' | 'noisy' | 'insufficient-data';
+    windowMean: number;
+    windowCv: number;
+    /** Longest monotonic run at the tail of the series (positive for up, negative for down). */
+    tailRun: number;
+    /** True when n ≥ window AND windowCv ≤ stableCv. */
+    stable: boolean;
+}
+declare function analyzeSeries(values: number[], options?: SeriesConvergenceOptions): SeriesConvergenceResult;
+/**
+ * State continuity scoring — measures how well a resumed/handed-off agent
+ * preserves prior work.
+ *
+ * Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
+ * session 1's work, the key question is: did it preserve key artifacts,
+ * or start over and lose context? Each `ContinuityCheck` inspects one
+ * aspect (file preserved, key count grew, status advanced) and yields
+ * 0-1 credit; the aggregate is the simple mean.
+ *
+ * Generic over any "snapshot" shape — pass your own checks.
+ */
+interface ContinuitySnapshotPair<T> {
+    before: T;
+    after: T;
+}
+interface ContinuityCheck<T> {
+    /** Stable identifier; shown in the report. */
+    id: string;
+    /** Description of what this check measures. */
+    description: string;
+    /** Returns 0..1 credit for this dimension (1 = fully preserved/improved). */
+    score: (pair: ContinuitySnapshotPair<T>) => number;
+}
+interface ContinuityCheckResult {
+    id: string;
+    description: string;
+    score: number;
+    pass: boolean;
+}
+interface ContinuityReport {
+    results: ContinuityCheckResult[];
+    /** Mean of per-check scores, in 0..1. */
+    overallScore: number;
+    /** True iff ALL checks scored ≥ passThreshold. */
+    pass: boolean;
+}
+declare function scoreContinuity<T>(pair: ContinuitySnapshotPair<T>, checks: ContinuityCheck<T>[], options?: {
+    passThreshold?: number;
+}): ContinuityReport;
+/** Common check: a required key in a record exists and equals the prior value. */
+declare function keyPreserved<T extends Record<string, unknown>>(key: keyof T & string): ContinuityCheck<T>;
+/**
+ * Common check: a collection (array) grew or stayed the same size.
+ * NOTE(review): `minRatio` presumably relaxes this to a minimum after/before
+ * size ratio — confirm against the implementation, including its default.
+ */
+declare function collectionPreserved<T, K extends keyof T & string>(key: K, minRatio?: number): ContinuityCheck<T>;
+/** Common check: a status field advanced in an expected order. */
+declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
+/**
+ * Dataset — versioned, sliceable, content-hashed scenario collection.
+ *
+ * Scenarios stop being ephemeral arrays and become first-class
+ * artifacts. Every Dataset carries:
+ *   - content hash (sha256 over canonicalized scenario array)
+ *   - provenance (contributor, createdAt, sourceUrl)
+ *   - split labels (train | dev | test | holdout)
+ *   - difficulty tiers (easy | medium | hard | extreme)
+ *   - tags (free-form, per-scenario)
+ *
+ * `Dataset.slice({ difficulty, split, includeHoldout, seed })` returns a
+ * deterministic, reproducible subset. Holdout slices are locked: you
+ * can read them but mutating them throws, which prevents "oh I'll just
+ * tweak that one scenario" contamination drift.
+ */
+type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
+type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
+interface DatasetScenario {
+    id: string;
+    /** Arbitrary payload; the framework doesn't interpret it. */
+    payload: unknown;
+    split?: DatasetSplit;
+    difficulty?: DatasetDifficulty;
+    /** Canary token that MUST NOT round-trip through a correct agent output. */
+    canary?: string;
+    tags?: Record<string, string>;
+}
+interface DatasetProvenance {
+    contributor?: string;
+    createdAt: string;
+    sourceUrl?: string;
+    license?: string;
+    description?: string;
+    /** Monotonic human-readable version (e.g. "2026.04.20"). */
+    version: string;
+}
+interface DatasetManifest {
+    name: string;
+    provenance: DatasetProvenance;
+    /** sha256 hex over canonicalized scenarios. */
+    contentHash: string;
+    scenarioCount: number;
+    splitCounts: Record<DatasetSplit, number>;
+}
+interface SliceOptions {
+    split?: DatasetSplit;
+    difficulty?: DatasetDifficulty;
+    /** Number of scenarios (random sample, seeded). Omit to take all that match. */
+    limit?: number;
+    seed?: number;
+    /** Predicate narrowing. Applied after split/difficulty filters. */
+    filter?: (scenario: DatasetScenario) => boolean;
+    /** If true, include scenarios marked as holdout. Default false. */
+    includeHoldout?: boolean;
+}
+/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
+declare class HoldoutLockedError extends Error {
+    constructor(datasetName: string);
+}
+declare class Dataset {
+    readonly name: string;
+    readonly provenance: DatasetProvenance;
+    private scenarios;
+    private locked;
+    constructor(init: {
+        name: string;
+        provenance: DatasetProvenance;
+        scenarios: DatasetScenario[];
+        locked?: boolean;
+    });
+    /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
+    all(): readonly DatasetScenario[];
+    get size(): number;
+    /**
+     * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
+     * the same arguments always produce the same slice across machines.
+     */
+    slice(options?: SliceOptions): DatasetScenario[];
+    /**
+     * Assemble the manifest (name + provenance + content hash + counts).
+     * Content hash is deterministic over canonicalized scenarios.
+     */
+    manifest(): Promise<DatasetManifest>;
+    /** Fresh unlocked copy — for post-release forks when mutation is needed. */
+    clone(overrides?: Partial<{
+        name: string;
+        version: string;
+    }>): Dataset;
+    lock(): void;
+    add(scenario: DatasetScenario): void;
+    remove(scenarioId: string): void;
+    /**
+     * Stable JSON-Lines serialization — deterministic byte-for-byte.
+     * Write to disk for contamination-verifiable archives.
+     */
+    toJsonl(): string;
+    static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
+}
+declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
+/**
+ * ContaminationGuard — ensures held-out scenarios don't leak into
+ * training/prompt paths, and flags model memorization.
+ *
+ * Three probes:
+ *   1. `checkCanaries(output, scenarios)` — if a scenario carries a
+ *      canary token, it MUST NOT appear in the agent's output.
+ *      Canaries are strings that are statistically impossible to
+ *      reconstruct from the scenario description alone — so if they
+ *      echo back, the model memorized them.
+ *   2. `canaryLeakView(store)` — cross-corpus view of every run whose
+ *      output contained a canary, with the offending scenario + run.
+ *   3. `HoldoutAuditor` — wraps a Dataset and emits a structured error
+ *      on any code path that reads holdout scenarios but doesn't flag
+ *      `purpose: 'evaluation'`. Keeps engineers honest.
+ */
+interface CanaryLeak {
+    scenarioId: string;
+    canary: string;
+    runId?: string;
+    evidence: string;
+}
+declare function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[];
+/**
+ * Scan the LLM-output history in a corpus; returns every case where a
+ * canary from a known scenario appeared in agent output. Pass the full
+ * set of scenarios whose canaries you care about (typically the whole
+ * held-out slice).
+ */
+declare function canaryLeakView(store: TraceStore, scenarios: DatasetScenario[]): Promise<CanaryLeak[]>;
+declare class HoldoutAuditor {
+    private scenarios;
+    private accessLog;
+    constructor(scenarios: DatasetScenario[]);
+    /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */
+    get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario;
+    getAccessLog(): ReadonlyArray<{
+        scenarioId: string;
+        purpose: string;
+        at: number;
+    }>;
+}
+/**
+ * Red-team battery — adversarial scenario corpus with per-category
+ * scorers.
+ *
+ * Every scenario is a `RedTeamCase`: a `DatasetScenario` whose `payload`
+ * is a `RedTeamPayload` declaring the category, the hostile input, the
+ * expected behavior (refuse | ignore), and any forbidden strings/tools.
+ * Scorers inspect the agent's output + tool calls and return a PASS/FAIL
+ * finding per case. Aggregated per category via `redTeamReport`.
+ *
+ * The built-in corpus is intentionally conservative — 40 cases
+ * covering the most-seen attack patterns. Extend via `extendCorpus`.
+ */
+type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
+interface RedTeamPayload {
+    category: RedTeamCategory;
+    /** The hostile input the agent sees. */
+    input: string;
+    /**
+     * What a safe agent should do: "refuse" means explicit refusal,
+     * "ignore" means process benignly without executing the attack.
+     */
+    expected: 'refuse' | 'ignore';
+    /** Substring(s) that MUST NOT appear in output. */
+    forbiddenStrings?: string[];
+    /** Tool names the agent MUST NOT call for this scenario. */
+    forbiddenTools?: string[];
+}
+interface RedTeamCase extends DatasetScenario {
+    payload: RedTeamPayload;
+}
+interface RedTeamFinding {
+    scenarioId: string;
+    category: RedTeamCategory;
+    passed: boolean;
+    reason: string;
+    evidence?: string;
+}
+interface RedTeamReport {
+    findings: RedTeamFinding[];
+    passRateByCategory: Record<RedTeamCategory, number>;
+    overallPassRate: number;
+}
+/** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
+declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
+declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
+/**
+ * Score a single agent response against the red-team case.
+ * `toolCalls` should be the names of tools the agent invoked during the run.
+ */
+declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
+/** Aggregate red-team findings into per-category pass rates. */
+declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
+/**
+ * Extract the tool-call names from a corpus run — convenience for the
+ * common pipeline (run the scenario → score the run).
+ */
+declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
+/**
+ * Power analysis + multiple-comparison correction.
+ *
+ * Two jobs:
+ *   1. Before running: `requiredSampleSize({ effect, alpha, power })`
+ *      returns the N per arm needed to detect a given effect size.
+ *   2. After running: `benjaminiHochberg(pValues, fdr)` and
+ *      `bonferroni(pValues, alpha)` correct for multiple pairwise tests
+ *      so PromptOptimizer's "significant" flag is statistically honest.
+ *
+ * Fixes the correctness bug in 0.2's PromptOptimizer which applied
+ * alpha directly across n*(n-1)/2 pairwise tests without correction —
+ * dramatically inflating false-positive rate when variants ≥ 3.
+ */
+/**
+ * Required N per arm for a two-sample comparison at target effect size,
+ * alpha, and power. Uses the normal-approximation formula:
+ *
+ *   n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
+ *
+ * where d is Cohen's d. Returns Infinity for effect ≤ 0.
+ */
+declare function requiredSampleSize(opts: {
+    effect: number;
+    alpha?: number;
+    power?: number;
+    twoSided?: boolean;
+}): number;
+/** Bonferroni adjustment: multiply every p-value by the number of tests, clamp at 1. */
+declare function bonferroni(pValues: number[], alpha?: number): {
+    adjusted: number[];
+    significant: boolean[];
+};
+/**
+ * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
+ * significance at the target FDR. Properly handles ties and preserves
+ * monotonicity of q-values.
+ */
+declare function benjaminiHochberg(pValues: number[], fdr?: number): {
+    qValues: number[];
+    significant: boolean[];
+};
+/**
+ * Behavior DSL — pytest-style assertions over a run's trajectory.
+ *
+ * Shape:
+ *   expectAgent(store, runId).toCall('search').withArgs({ q: /.+/ })
+ *   expectAgent(store, runId).toRefuse()
+ *   expectAgent(store, runId).toOutputMatch(/confirmed/i)
+ *   expectAgent(store, runId).toRespectBudget('tokens')
+ *   expectAgent(store, runId).toCompleteWithin({ wallMs: 30_000 })
+ *
+ * Each matcher returns an `Expectation` with `.check() → MatcherResult`
+ * so the DSL is composable with suite runners — you can collect all
+ * expectations into a report instead of throwing on first failure.
+ */
+interface MatcherResult {
+    ok: boolean;
+    detail: string;
+    evidence?: string;
+}
+interface Expectation {
+    /** Human-facing label; used in reports. */
+    label: string;
+    check(): Promise<MatcherResult>;
+}
+/**
+ * Entry point of the behavior DSL for a single run: each matcher returns an
+ * `Expectation` (or `CallExpectation`) that is evaluated later via `.check()`.
+ */
+declare class BehaviorAssertion {
+    private store;
+    private runId;
+    constructor(store: TraceStore, runId: string);
+    /** Start a tool-call expectation for `toolName`; refine it with the `CallExpectation` builder methods. */
+    toCall(toolName: string): CallExpectation;
+    toRefuse(markers?: RegExp[]): Expectation;
+    toOutputMatch(pattern: RegExp): Expectation;
+    /**
+     * NOTE(review): `keyof BudgetLedgerEntry['dimension']` yields the property
+     * keys of the `dimension` type, not its values. If `dimension` is a
+     * string-literal union, `BudgetLedgerEntry['dimension']` was probably
+     * intended — confirm against the source declaration.
+     */
+    toRespectBudget(dimension: keyof BudgetLedgerEntry['dimension'] | 'tokens' | 'wallMs' | 'calls' | 'usd'): Expectation;
+    toCompleteWithin(limits: {
+        wallMs?: number;
+        toolCalls?: number;
+        llmTurns?: number;
+    }): Expectation;
+    toNeverCall(toolName: string): Expectation;
+}
+declare class CallExpectation implements Expectation {
+    private store;
+    private runId;
+    private toolName;
+    private argMatchers;
+    private minCount;
+    private maxCount;
+    constructor(store: TraceStore, runId: string, toolName: string);
+    get label(): string;
+    withArgs(shape: Record<string, unknown | RegExp>): this;
+    times(n: number): this;
+    atLeast(n: number): this;
+    atMost(n: number): this;
+    check(): Promise<MatcherResult>;
+}
+declare function expectAgent(store: TraceStore, runId: string): BehaviorAssertion;
+/** Runs every expectation, collects results. Never throws. */
+declare function runExpectations(expectations: Expectation[]): Promise<{
+    results: Array<{
+        label: string;
+        result: MatcherResult;
+    }>;
+    pass: boolean;
+    passCount: number;
+    failCount: number;
+}>;
+/**
+ * Judge calibration — measure judge quality against human gold + bias.
+ *
+ * Workflow:
+ *   1. Build a golden set: {itemId, humanScore}[].
+ *   2. Run candidate judges; each produces {itemId, score}.
+ *   3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
+ *   4. Run bias probes (positional, verbosity, self-preference) to
+ *      detect systematic score inflation.
+ *
+ * Returns actionable diagnostics, not a single number. Consumers then
+ * decide whether to trust the judge, retrain it, or add a tie-breaker.
+ */
+interface GoldenItem {
+    itemId: string;
+    humanScore: number;
+    /** Optional group used for per-group bias audits (e.g. model-of-output family). */
+    group?: string;
+}
+interface CandidateScore {
+    itemId: string;
+    score: number;
+    /** Optional — enables positional-bias analysis (did order matter?). */
+    positionOfAInput?: 'first' | 'second';
+}
+interface CalibrationResult {
+    n: number;
+    pearson: number;
+    /** Cohen's κ with quadratic weights over integer-rounded scores. */
+    kappa: number;
+    /** Mean absolute error vs human. */
+    mae: number;
+    /** Worst-5 miscalibrations (largest |judge - human|). */
+    worstItems: Array<{
+        itemId: string;
+        judge: number;
+        human: number;
+        delta: number;
+    }>;
+}
+declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
+interface PositionalBiasResult {
+    /**
+     * Score delta (first-position - second-position) averaged across items
+     * presented in both positions. Non-zero = positional bias.
+     */
+    avgDelta: number;
+    n: number;
+}
+/**
+ * Feed the same items to the judge twice with A/B swapped and pass all
+ * results here. Items that don't appear in both positions are ignored.
+ */
+declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
+interface VerbosityBiasResult {
+    /** Pearson correlation between output length and score. Strong positive = verbosity bias. */
+    pearson: number;
+    n: number;
+}
+declare function verbosityBias(samples: Array<{
+    outputLen: number;
+    score: number;
+}>): VerbosityBiasResult;
+interface SelfPreferenceResult {
+    /** Mean judge score when judge's family matches output's family. */
+    inFamilyMean: number;
+    outOfFamilyMean: number;
+    deltaMean: number;
+    n: number;
+}
+/**
+ * Pass the same scenarios scored with judge-model X grading outputs from
+ * model X (in-family) and model Y (out-of-family). Non-zero delta
+ * indicates self-preference.
+ */
+declare function selfPreference(samples: Array<{
+    score: number;
+    inFamily: boolean;
+}>): SelfPreferenceResult;
+/**
+ * CI gate — evaluate a corpus against threshold contracts and generate
+ * a human-readable PR/build comment.
+ *
+ * Three layers:
+ *   1. `ThresholdContract` declarations (YAML-equivalent TS objects)
+ *   2. `evaluateContract` runs the contracts against a TraceStore and
+ *      returns a structured report + overall pass/fail.
+ *   3. `renderMarkdownReport` formats the report for GitHub PR comments.
+ *
+ * Consumers wrap this in their own `gh pr comment` / CI integration —
+ * we don't ship the GitHub Action binary, just the library call that
+ * the action invokes.
+ */
+interface ContractMetric {
+    /** Metric id matching either a predefined key or a custom extractor. */
+    metric: string;
+    higherIsBetter: boolean;
+    /** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
+    maxRegression?: number;
+    /** Optional extractor if the metric isn't in the default set. */
+    extract?: (run: Run, store: TraceStore) => Promise<number | null>;
+}
+interface ThresholdContract {
+    name: string;
+    baseline: RunFilter;
+    candidate: RunFilter;
+    metrics: ContractMetric[];
+    slos?: Slo[];
+}
+interface ContractReport {
+    name: string;
+    baselineReport: BaselineReport;
+    sloReport?: SloReport;
+    breaches: string[];
+    pass: boolean;
+}
+declare function evaluateContract(store: TraceStore, contract: ThresholdContract): Promise<ContractReport>;
+declare function renderMarkdownReport(reports: ContractReport[]): string;
+/**
+ * Observability adapters — bidirectional parity with production backends.
+ *
+ * `LangfuseAdapter` maps a Run's spans into Langfuse generation/score
+ * records (schema-compatible; we don't depend on the SDK — consumers
+ * POST the returned JSON to their Langfuse collector).
+ *
+ * `PrometheusEmitter` converts a TraceStore into a Prometheus text-
+ * exposition-format string (counters + gauges for runs, tool calls,
+ * errors, cost). Drop into a `/metrics` handler; no SDK needed.
+ *
+ * `replayTraceThroughJudge` is the canonical "re-score with a new
+ * judge" path — takes an existing run, runs a judge function over
+ * each LLM span, emits JudgeVerdict spans back into the store.
+ */
+interface LangfuseGeneration {
+    id: string;
+    traceId: string;
+    name: string;
+    model: string;
+    input: unknown;
+    output: unknown;
+    startTime: string;
+    endTime: string;
+    usage: {
+        input: number;
+        output: number;
+        total: number;
+        totalCost: number;
+    };
+    metadata: Record<string, unknown>;
+}
+interface LangfuseScore {
+    id: string;
+    traceId: string;
+    observationId: string;
+    name: string;
+    value: number;
+    comment?: string;
+}
+interface LangfuseEnvelope {
+    traceId: string;
+    generations: LangfuseGeneration[];
+    scores: LangfuseScore[];
+}
+declare function toLangfuseEnvelope(store: TraceStore, runId: string): Promise<LangfuseEnvelope>;
+declare function toPrometheusText(store: TraceStore): Promise<string>;
+interface JudgeReplayResult {
+    spanId: string;
+    targetSpanId: string;
+    dimension: string;
+    score: number;
+    rationale?: string;
+}
+/**
+ * Apply a judge function to every LLM span in a run and record the
+ * results as JudgeVerdict spans. This is the canonical "no re-execution"
+ * re-scoring path — you supply a pure judge `(llmSpan) → verdict`.
+ */
+declare function replayTraceThroughJudge(store: TraceStore, runId: string, judge: {
+    id: string;
+    dimension: string;
+    score: (span: LlmSpan) => Promise<{
+        score: number;
+        rationale?: string;
+        evidence?: string;
+    }>;
+}): Promise<JudgeReplayResult[]>;
+/**
+ * Paraphrase robustness — mutates a scenario prompt in structure-
+ * preserving ways, re-scores, and reports score variance.
+ *
+ * Mutators are pure functions `(prompt: string, seed: number) => string`.
+ * We ship a default set (`DEFAULT_MUTATORS`); consumers add domain-specific ones.
+ *
+ * Robustness score: 1 - stdDev(scores) / (mean if positive else 1).
+ * A perfect agent returns the same answer regardless of typo / case /
+ * reordering — any variance signals a brittle prompt.
+ */
+type Mutator = (prompt: string, seed: number) => string;
+interface RobustnessResult {
+    originalScore: number;
+    variantScores: Array<{
+        mutator: string;
+        score: number;
+        mutated: string;
+    }>;
+    meanScore: number;
+    stdDev: number;
+    robustness: number;
+}
+declare function paraphraseRobustness(prompt: string, mutators: Array<{
+    id: string;
+    fn: Mutator;
+}>, scoreFn: (prompt: string) => Promise<number>, options?: {
+    seed?: number;
+}): Promise<RobustnessResult>;
+/** Lowercase the whole prompt. Robust models ignore case. */
+declare const lowercaseMutator: Mutator;
+/** Reorder sentences. Robust models don't depend on sentence order. */
+declare const sentenceReorderMutator: Mutator;
+/** Swap adjacent letter pairs (1 per 40 chars, min 1). Robust models tolerate typos. */
+declare const typoMutator: Mutator;
+/** Add a benign politeness prefix. Robust models ignore flattery. */
+declare const politenessPrefixMutator: Mutator;
+/** Compact whitespace, strip newlines. Robust models don't depend on formatting. */
+declare const whitespaceCollapseMutator: Mutator;
+declare const DEFAULT_MUTATORS: Array<{
+    id: string;
+    fn: Mutator;
+}>;
+/**
+ * Visual diff — pixel-delta scoring for UI / visual outputs.
+ *
+ * Minimal dependency-free implementation: accepts two decoded RGBA
+ * pixel buffers + width/height and returns a Δ ratio + the max
+ * per-channel delta. Consumers supply the decoded pixel arrays (we
+ * don't pull a PNG decoder into the core — use `sharp`,
+ * `@napi-rs/canvas`, or Playwright in the driving test and pass the
+ * result here).
+ */
+interface ImageData {
+    width: number;
+    height: number;
+    /** Pixel data in RGBA order, 4 bytes per pixel. */
+    data: Uint8Array | Uint8ClampedArray;
+}
+interface VisualDiffResult {
+    /** Ratio of pixels differing beyond `tolerance` (0..1). */
+    diffRatio: number;
+    differingPixels: number;
+    totalPixels: number;
+    maxChannelDelta: number;
+    /** Status for dashboards: unchanged (< 0.1%), changed, or severely-changed (> 5%). */
+    status: 'unchanged' | 'changed' | 'severely-changed';
+}
+interface VisualDiffOptions {
+    /** Pixels whose max-channel delta is ≤ this are considered unchanged. Default 8/255. */
+    tolerance?: number;
+}
+declare function visualDiff(a: ImageData, b: ImageData, options?: VisualDiffOptions): VisualDiffResult;
+/** Convenience: diffs two RGBA arrays of identical dimensions and returns just the ratio. */
+declare function pixelDeltaRatio(a: Uint8Array, b: Uint8Array, width: number, height: number, tolerance?: number): number;
+/**
+ * BuilderSession — ties a builder-of-builders workflow together.
+ *
+ * Models agent-builder's shape: Project → Chat → Edit → Ship → App →
+ * AppAgent. Each layer is a Run (linked via parentRunId). The
+ * framework-enforced invariants:
+ *
+ *   - One Project → many Chats; chatId scopes runs within a project.
+ *   - One Chat = one builder Run with `layer='builder'`.
+ *   - One Ship = one child Run with `layer='app-build'` + SandboxHarness.
+ *   - One AppScenario = one grandchild Run with `layer='app-runtime'`.
+ *
+ * Consumers obtain a BuilderSession, call `startChat`, drive the
+ * builder agent (emitting spans), and call `ship` / `runAppScenario`
+ * as the workflow progresses. Session state can be reconstructed from
+ * trace data via `resumeBuilderSession(store, projectId)`.
+ *
+ * `BuilderSessionInit.chatId` is optional while `BuilderSession.chatId`
+ * is always a string, so an id is presumably generated when it is
+ * omitted — TODO confirm.
+ */
+interface BuilderSessionInit {
+    projectId: string;
+    chatId?: string;
+    /** Free-form: user's task description, project name, etc. Stored on the builder Run. */
+    tags?: Record<string, string>;
+}
+interface ShipOptions {
+    harness: HarnessConfig;
+    driver?: SandboxDriver; // presumably falls back to the session's default driver — TODO confirm
+    /** `scenarioId` recorded on this app-build run. Defaults to `${projectId}/build`. */
+    scenarioId?: string;
+}
+interface RunAppScenarioOptions {
+    scenario: TestGradedScenario;
+    /**
+     * Harness driver override; defaults to the one the session was created with
+     * (the optional `driver` argument of the `BuilderSession` constructor).
+     */
+    driver?: SandboxDriver;
+}
+declare class BuilderSession {
+    private store;
+    private builderEmitter;
+    readonly projectId: string;
+    readonly chatId: string;
+    private builderRunId?;
+    private lastBuildRunId?;
+    private defaultDriver?;
+    constructor(store: TraceStore, init: BuilderSessionInit, driver?: SandboxDriver);
+    /** Start the builder (L0) run for this chat. Resolves to the runId. */
+    startChat(scenarioId?: string): Promise<string>;
+    /** The emitter for builder-level spans (edits, LLM calls, tool invocations). */
+    get emitter(): TraceEmitter;
+    /**
+     * Ship the project's generated app: run the sandbox harness as a child
+     * Run (`layer='app-build'`). Resolves to the build result + runId.
+     */
+    ship(options: ShipOptions): Promise<{
+        runId: string;
+        result: SandboxHarnessResult;
+    }>;
+    /**
+     * Run a domain scenario against the just-built app as a grandchild Run
+     * (`layer='app-runtime'`). Call `ship` first so the parent is set
+     * correctly; if no build exists yet the session attaches directly to
+     * the builder run (useful for prototypes).
+     */
+    runAppScenario(options: RunAppScenarioOptions): Promise<TestGradedRunResult>;
+    /**
+     * Record an end-of-chat meta score (judge verdict on whether the builder
+     * served the user's intent). Accepts a numeric score + optional rationale.
+     */
+    recordMetaScore(score: number, rationale?: string): Promise<void>;
+    /** Close the builder Run with a final outcome. */
+    endChat(outcome: {
+        pass: boolean;
+        score?: number;
+        notes?: string;
+    }): Promise<void>;
+    /**
+     * Inline app-runtime run — for cases where the "scenario" isn't a
+     * SWE-bench-style test suite but a live agent interaction (LLM chat,
+     * domain flow). Resolves to an emitter bound to a fresh Run in the
+     * `app-runtime` layer; caller emits spans inside and calls
+     * `.endRun()` with the final verdict.
+     */
+    startAppRuntime(scenarioId: string): Promise<TraceEmitter>;
+    /**
+     * Lightweight "ship marker" — record an app-build Run with a caller-
+     * provided verdict. Use when there isn't a sandbox harness to run but
+     * you still want to mark the build state at publish time.
+     * Resolves to a string — presumably the new run's id; TODO confirm.
+     */
+    recordShipMarker(args: {
+        pass: boolean;
+        score: number;
+        scenarioId?: string;
+        notes?: string;
+    }): Promise<string>;
+    get lastBuildRunIdValue(): string | undefined;
+    get builderRunIdValue(): string | undefined;
+}
+/**
+ * Reconstruct the most recent BuilderSession state for a given project —
+ * resolves to { projectId, chatRuns, lastBuilderRun, lastBuildRun,
+ * lastAppRuntimeRuns }; `lastBuilderRun` and `lastBuildRun` are optional
+ * and may be absent. For chat-first UIs this is how a resumed session
+ * finds its place in the edit history.
+ */
+declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
+    projectId: string;
+    chatRuns: Run[];
+    lastBuilderRun?: Run;
+    lastBuildRun?: Run;
+    lastAppRuntimeRuns: Run[];
+}>;
+/**
+ * Three-layer evaluation — the canonical scoring breakdown for
+ * builder-of-builders workflows.
+ *
+ *   meta_score:    did the builder understand + satisfy user intent?
+ *                  (judge verdict attached to the builder run)
+ *   build_score:   did the generated scaffold build + pass its own tests?
+ *                  (outcome.score on the app-build child run)
+ *   runtime_score: did the generated agent pass its domain scenarios?
+ *                  (mean outcome.score over app-runtime grandchild runs)
+ *
+ * Returns a structured report per project. The cross-layer correlation
+ * is the highest-leverage signal the framework computes — if
+ * meta_score doesn't predict runtime_score, the builder's self-scoring
+ * is broken.
+ *
+ * Each score field is `null` when that layer produced no score;
+ * `complete` is true only when all three are present.
+ */
+interface ThreeLayerProjectReport {
+    projectId: string;
+    builderRunId?: string;
+    /** Judge-verdict score on the builder run (0..1 after normalization). */
+    metaScore: number | null;
+    buildRunId?: string;
+    /**
+     * 0..1 from the sandbox harness (testsPassed / testsTotal).
+     * NOTE(review): app-build runs recorded via `BuilderSession.recordShipMarker`
+     * carry a caller-provided score — confirm whether those are reported here too.
+     */
+    buildScore: number | null;
+    appRuntimeRunIds: string[];
+    /** Mean of outcome.score over app-runtime runs, 0..1. */
+    runtimeScore: number | null;
+    runtimePassRate: number | null; // presumably the share of app-runtime runs whose outcome passed — TODO confirm
+    /** True when all three layers produced a score. */
+    complete: boolean;
+}
+declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>; // Resolves to the three-layer report for the single project `projectId`.
+/**
+ * Aggregate scoring across every project in a corpus.
+ * Resolves to an array of `ThreeLayerProjectReport` — presumably one
+ * entry per project found in `store`; TODO confirm ordering.
+ */
+declare function scoreAllProjects(store: TraceStore): Promise<ThreeLayerProjectReport[]>;
+/**
+ * Meta-eval correlation — the highest-leverage signal in the framework.
+ *
+ * Given a corpus of three-layer project reports, compute how well each
+ * pair of layers correlates. The question we care about most:
+ *
+ *   Does `metaScore` (what the builder thinks it did) predict
+ *   `runtimeScore` (what the user actually gets)?
+ *
+ * If r < ~0.4, the builder's self-scoring is broken — it's optimizing
+ * for something other than real-world success. If r > 0.7, meta_score
+ * is a usable proxy and can drive CI gates cheaply.
+ *
+ * Non-parametric rank correlation (Spearman) is also reported because
+ * meta scores are often ordinal-ish.
+ *
+ * `LayerCorrelation` holds the result for one pair of layers: `n` is
+ * presumably the number of matched data points the coefficients were
+ * computed from — TODO confirm.
+ */
+interface LayerCorrelation {
+    n: number;
+    pearson: number;
+    spearman: number;
+}
+interface CorrelationReport {
+    /**
+     * Pairs present in the corpus (layers with ≥ 2 matched data points).
+     * NOTE(review): this presumably applies to all three optional pair
+     * fields below, each omitted when the pair has too few points — confirm.
+     */
+    metaVsBuild?: LayerCorrelation;
+    metaVsRuntime?: LayerCorrelation;
+    buildVsRuntime?: LayerCorrelation;
+    /** Number of complete projects (all 3 scores present). */
+    completeProjects: number;
+}
+declare function correlateLayers(reports: ThreeLayerProjectReport[]): CorrelationReport; // Synchronous; takes already-computed reports (e.g. the resolved output of `scoreAllProjects`).
+/**
+ * ProjectRegistry — project-level aggregation over the trace corpus.
+ *
+ * Thin reader over TraceStore that answers the questions a chat-first,
+ * resumable UI needs:
+ *   - listProjects() → project summaries (IDs, run counts, latest activity)
+ *   - projectTimeline(id) → chats + builds + runtime runs, chronological
+ *   - projectChats(id) → chat-level summaries (turn count, outcome)
+ *
+ * All queries are pure reads; no state duplication.
+ */
+interface ProjectSummary {
+    projectId: string;
+    chatCount: number;
+    buildCount: number;
+    appRuntimeCount: number;
+    lastActivityAt: number; // NOTE(review): numeric timestamp; unit (e.g. epoch ms) is not visible here — confirm
+    latestChatId?: string;
+    latestOutcome?: {
+        pass: boolean;
+        score?: number;
+    };
+}
+interface ChatSummary {
+    chatId: string;
+    projectId: string;
+    builderRunId: string;
+    startedAt: number;
+    endedAt?: number;
+    status: Run['status'];
+    outcome?: Run['outcome'];
+    /**
+     * Counts of spans emitted during the chat.
+     * Presumably covers both `llmTurns` (LLM spans) and `toolCalls`
+     * (tool spans) below — TODO confirm.
+     */
+    llmTurns?: number;
+    toolCalls?: number;
+    buildRunId?: string;
+    appRuntimeRunIds: string[];
+}
+interface ProjectTimelineEntry {
+    run: Run;
+    layerBucket: 'chat' | 'build' | 'runtime' | 'other'; // presumably derived from the run's layer (builder / app-build / app-runtime) — TODO confirm
+}
+declare class ProjectRegistry {
+    private store;
+    constructor(store: TraceStore);
+    listProjects(): Promise<ProjectSummary[]>; // resolves to project summaries
+    projectTimeline(projectId: string): Promise<ProjectTimelineEntry[]>; // resolves to the timeline entries for `projectId`
+    projectChats(projectId: string): Promise<ChatSummary[]>; // resolves to chat-level summaries for `projectId`
+}
-export { AgentDriver, type AgentDriverConfig, type ArtifactCheck, type ArtifactResult, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, type EvalResult, type ExecutorConfig, type FeedbackPattern, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, MODEL_PRICING, MetricsCollector, type PersonaConfig, ProductClient, type ProductClientConfig, type RouteMap, type RubricDimension, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type TestResult, TokenCounter, type Turn, type TurnMetrics, type TurnResult, adversarialJudge, codeExecutionJudge, coherenceJudge, confidenceInterval, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, formatBenchmarkReport, formatDriverReport, interRaterReliability, mannWhitneyU, normalizeScores, partialCredit, printDriverSummary, runE2EWorkflow, weightedMean };
+export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScore, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CostEntry, type CostSummary, CostTracker, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type EventFilter, type EventKind, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, FAILURE_CLASSES, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type ImageData, InMemoryExperimentStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InspectorContext, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, TRACE_SCHEMA_VERSION, type 
TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, benjaminiHochberg, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, canaryLeakView, checkCanaries, checkSlos, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, dominates, estimateCost, estimateTokens, evaluateContract, evaluateOracles, executeScenario, expectAgent, exportRunAsOtlp, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, lowercaseMutator, mannWhitneyU, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, pytestTestParser, redTeamDataset, 
redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdownReport, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runE2EWorkflow, runExpectations, runFailureClass, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, statusAdvanced, stuckLoopView, textInSnapshot, toLangfuseEnvelope, toPrometheusText, toolNamesForRun, toolSpans, toolWasteView, typoMutator, urlContains, verbosityBias, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };