npm - @tangle-network/agent-eval - Versions diffs - 0.5.0 → 0.6.0 - Mend

@tangle-network/agent-eval 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -3205,4 +3205,931 @@ declare class ProjectRegistry {
     projectChats(projectId: string): Promise<ChatSummary[]>;
 }
-export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScore, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CostEntry, type CostSummary, CostTracker, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type EventFilter, type EventKind, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, FAILURE_CLASSES, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type ImageData, InMemoryExperimentStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InspectorContext, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, TRACE_SCHEMA_VERSION, type 
TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, benjaminiHochberg, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, canaryLeakView, checkCanaries, checkSlos, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, dominates, estimateCost, estimateTokens, evaluateContract, evaluateOracles, executeScenario, expectAgent, exportRunAsOtlp, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, lowercaseMutator, mannWhitneyU, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, pytestTestParser, redTeamDataset, 
redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdownReport, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runE2EWorkflow, runExpectations, runFailureClass, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, statusAdvanced, stuckLoopView, textInSnapshot, toLangfuseEnvelope, toPrometheusText, toolNamesForRun, toolSpans, toolWasteView, typoMutator, urlContains, verbosityBias, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
+/**
+ * OutcomeStore — deployment outcomes attached to Run IDs.
+ *
+ * Outcomes arrive asynchronously from production telemetry after the
+ * eval run completed: user ratings, retention flags, conversion events,
+ * revenue, support-ticket rate, anything a product team can measure.
+ * The store is a peer to TraceStore — separate lifecycle, same runId
+ * foreign key.
+ *
+ * The whole point of this module is to make the meta-eval correlation
+ * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
+ */
+interface DeploymentOutcome {
+    runId: string;
+    capturedAt: number;
+    /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
+    metrics: Record<string, number>;
+    /** Dimensions for stratified analysis — cohort, region, user_segment. */
+    labels?: Record<string, string>;
+    /** Free-form provenance (source system, pipeline version). */
+    source?: string;
+}
+interface OutcomeFilter {
+    runIds?: string[];
+    since?: number;
+    until?: number;
+    label?: {
+        key: string;
+        value: string;
+    };
+    source?: string;
+}
+interface OutcomeStore {
+    append(outcome: DeploymentOutcome): Promise<void>;
+    /** All outcomes attached to this run (a single run can have many — multiple
+     *  capture windows over deployment time). */
+    forRun(runId: string): Promise<DeploymentOutcome[]>;
+    list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
+}
+declare class InMemoryOutcomeStore implements OutcomeStore {
+    private items;
+    append(outcome: DeploymentOutcome): Promise<void>;
+    forRun(runId: string): Promise<DeploymentOutcome[]>;
+    list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
+}
+interface FileSystemOutcomeStoreOptions {
+    dir: string;
+    maxBytes?: number;
+}
+declare class FileSystemOutcomeStore implements OutcomeStore {
+    private dir;
+    private maxBytes;
+    private memo?;
+    private loaded;
+    constructor(options: FileSystemOutcomeStoreOptions);
+    private ensureDir;
+    append(outcome: DeploymentOutcome): Promise<void>;
+    private load;
+    forRun(runId: string): Promise<DeploymentOutcome[]>;
+    list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
+}
+/**
+ * Correlation study — "does our eval score predict real-world outcomes?"
+ *
+ * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
+ * joins on runId, computes Pearson + Spearman + bootstrap CI for every
+ * (evalMetric, outcomeMetric) pair the caller declares.
+ *
+ * Without this number the framework is ornamental. With it and r > 0.6
+ * the framework is a moat — no other agent-eval tool publishes one.
+ */
+interface EvalMetricSpec {
+    id: string;
+    /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
+    extract?: (run: Run, store: TraceStore) => Promise<number | null>;
+}
+interface OutcomePair {
+    evalMetric: string;
+    outcomeMetric: string;
+}
+interface CorrelationResult {
+    evalMetric: string;
+    outcomeMetric: string;
+    n: number;
+    pearson: number;
+    spearman: number;
+    /** 95% bootstrap CI for Pearson. */
+    pearsonCi95: {
+        lower: number;
+        upper: number;
+    };
+    /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. NOTE(review): which statistic (pearson vs spearman, signed vs absolute) is thresholded is not stated here — confirm. */
+    verdict: 'strong' | 'moderate' | 'weak';
+}
+interface CorrelationStudyResult {
+    pairs: CorrelationResult[];
+    joinedSamples: number;
+    skippedRuns: number;
+}
+interface CorrelationStudyOptions {
+    /** Only join outcomes captured within this window after run.startedAt. */
+    maxCaptureLagMs?: number;
+    /** Restrict to a subset of outcomes (cohort, region, source). */
+    outcomeFilter?: OutcomeFilter;
+    /** How multiple outcomes for one run are reduced to a single value: 'latest', 'mean', or 'max'. Default 'latest'. */
+    reduction?: 'latest' | 'mean' | 'max';
+    /** Bootstrap iterations for the CI. Default 500. */
+    bootstrapIterations?: number;
+}
+declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
+/**
+ * Calibration curve — binned "if eval says X, what does reality show?"
+ *
+ * Companion to correlationStudy. Raw correlation is a single number;
+ * the calibration curve shows *where* the eval is well-calibrated vs
+ * overconfident / underconfident. Buckets the eval metric, computes
+ * mean outcome per bucket, reports expected-calibration-error (ECE).
+ */
+interface CalibrationBin {
+    lower: number;
+    upper: number;
+    n: number;
+    evalMean: number;
+    outcomeMean: number;
+    /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
+    gap: number;
+}
+interface CalibrationReport {
+    evalMetric: string;
+    outcomeMetric: string;
+    n: number;
+    bins: CalibrationBin[];
+    /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
+    ece: number;
+    /** Largest per-bin gap. Bounds `ece` from above when the bin weights n_i/N sum to 1, since ECE is then a weighted mean of the gaps. */
+    maxGap: number;
+}
+interface CalibrationOptions {
+    bins?: number;
+    /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
+    binning?: 'equal-width' | 'equal-frequency';
+    /** Clip eval values to [lo, hi] before binning. */
+    range?: {
+        lo: number;
+        hi: number;
+    };
+}
+declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
+/**
+ * Process Reward Modeling — per-step rubric grading.
+ *
+ * A StepRubric inspects one span and returns a score + rationale.
+ * PrmGrader applies an array of rubrics to every LLM span in a
+ * trajectory (consumers can broaden to tool/retrieval spans via the
+ * `kinds` filter on each rubric).
+ *
+ * Why this matters: outcome-only eval (did the final artifact work?)
+ * gives sparse reward — most agent turns are unattributable. PRMs
+ * densify the signal so optimizers and RL fine-tuning can assign
+ * credit per turn.
+ */
+interface StepContext {
+    trajectory: Trajectory;
+    step: TrajectoryStep;
+    /** Steps preceding `step` in trajectory order. */
+    prior: TrajectoryStep[];
+    /** Steps following `step`. */
+    next: TrajectoryStep[];
+}
+interface StepRubric {
+    id: string;
+    /** Only grade spans of these kinds (default: all). */
+    kinds?: Array<Span['kind']>;
+    /** Weight in the aggregate score. Default 1. */
+    weight?: number;
+    /** Returns score in 0..1 + optional rationale/evidence. Return `null` to
+     *  skip grading (rubric doesn't apply to this step). */
+    grade: (ctx: StepContext) => Promise<{
+        score: number;
+        rationale?: string;
+        evidence?: string;
+    } | null>;
+}
+interface GradedStep {
+    spanId: string;
+    rubricId: string;
+    score: number;
+    weight: number;
+    rationale?: string;
+    evidence?: string;
+}
+interface PrmGradedTrace {
+    runId: string;
+    steps: GradedStep[];
+    /** Weighted mean of all graded steps; 0..1. */
+    aggregateScore: number;
+    /** Number of spans graded — useful for sanity-checking coverage. */
+    gradedCount: number;
+    /** Number of spans in the trajectory that no rubric matched. */
+    ungradedCount: number;
+}
+declare class PrmGrader {
+    private rubrics;
+    constructor(rubrics: StepRubric[]);
+    /**
+     * Grade every eligible span in a run. Emits a JudgeVerdict span for each
+     * (rubric × span) verdict so the result is visible to downstream pipelines
+     * (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
+     */
+    grade(store: TraceStore, runId: string): Promise<PrmGradedTrace>;
+}
+/** Predicate over a single judge span: reports whether it is a verdict emitted by PRM, so downstream
+ *  pipelines can distinguish PRM verdicts from human or top-level LLM judges. */
+declare function isPrmVerdict(verdict: JudgeSpan): boolean;
+/**
+ * Built-in reference rubrics. Consumers combine these with domain
+ * rubrics. All are deterministic, rule-based — cheap to run + easy
+ * to unit-test. LLM-based rubrics are trivially authored by
+ * following the StepRubric contract.
+ */
+/** Penalize very short or very long assistant outputs. */
+declare function outputLengthRubric(args?: {
+    minChars?: number;
+    maxChars?: number;
+    weight?: number;
+}): StepRubric;
+/** Reward tool calls that succeeded (status='ok') with an informative result. */
+declare function toolSuccessRubric(args?: {
+    weight?: number;
+}): StepRubric;
+/** Penalize tool calls that duplicate a prior call with identical args. */
+declare function toolNonRedundantRubric(args?: {
+    weight?: number;
+}): StepRubric;
+/** Penalize LLM outputs that contain common refusal markers when a refusal
+ *  is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
+declare function nonRefusalRubric(args?: {
+    markers?: RegExp[];
+    weight?: number;
+}): StepRubric;
+/** Reward outputs that invoke the next-step tool the trajectory actually uses
+ *  (i.e. the LLM span announced "I will call X" and the following tool span IS X). */
+declare function toolIntentAlignmentRubric(args?: {
+    weight?: number;
+}): StepRubric;
+/**
+ * Export PRM-graded traces as training data for downstream reward-model
+ * fine-tuning. Canonical format is NDJSON of
+ * `{ trajectory_text, step_index, rubric, score }` so a small model can
+ * learn to predict step rewards from step context.
+ *
+ * The framework doesn't train the model — we emit the data; callers
+ * plug it into their preferred trainer (TRL, Unsloth, custom).
+ */
+interface PrmTrainingSample {
+    runId: string;
+    spanId: string;
+    rubricId: string;
+    score: number;
+    /** Serialized step context — step + surrounding conversation. */
+    context: {
+        priorTurns: Array<{
+            role: string;
+            content: string;
+        }>;
+        step: {
+            kind: Span['kind'];
+            text: string;
+        };
+    };
+    /** Optional evidence + rationale for auditability. */
+    rationale?: string;
+    evidence?: string;
+}
+declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
+    contextWindow?: number;
+}): Promise<PrmTrainingSample[]>;
+/** NDJSON serialization — write to file or stream directly to a trainer. */
+declare function toNdjson(samples: PrmTrainingSample[]): string;
+/**
+ * Inference-time PRM scoring — pick the best of N candidate trajectories
+ * using a trained reward model (or a rule-based PRM as a proxy).
+ *
+ * The canonical Best-of-N pattern: generate N completions, score each
+ * with a PRM, pick the winner. Here the scoring loop is framework-agnostic
+ * — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
+ */
+interface BestOfNResult {
+    winner: PrmGradedTrace;
+    ranked: PrmGradedTrace[];
+    /** Standard deviation of aggregate scores — a small value means the candidates were homogeneous. */
+    stdDev: number;
+}
+declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
+/**
+ * Weighted vote across multiple graders — use when you want a PRM ensemble
+ * (e.g. rule-based + LLM-based + trained model). Each grader produces its
+ * own ranking; we aggregate via rank-sum (Borda count) so no single grader
+ * dominates via a different score scale.
+ */
+declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
+/**
+ * Bisector — auto-locate the change that introduced an eval regression.
+ *
+ * Two shapes:
+ *   - `commitBisect` — walk an ordered SHA list, binary-search for the
+ *     first commit that fails.
+ *   - `promptBisect` — given a good and bad prompt, progressively port
+ *     paragraphs from good→bad to localize the breaking change.
+ *
+ * Generic `bisect<T>` lets callers drive any ordered state space
+ * (dataset versions, config files, CLI flag combinations).
+ */
+interface BisectOptions<T> {
+    /** State known to pass. */
+    good: T;
+    /** State known to fail. */
+    bad: T;
+    /** Equality test on state values — default Object.is. */
+    equals?: (a: T, b: T) => boolean;
+    /** Pick the halfway state between good + bad. Return null when no further
+     *  split is possible (e.g. adjacent commits). */
+    halfway: (good: T, bad: T) => T | null;
+    /** Produce a verdict for a state. */
+    runEval: (state: T) => Promise<{
+        score: number;
+        pass: boolean;
+    }>;
+    /** Hard cap on iterations (default 40 — covers ~1T ordered states). */
+    maxIterations?: number;
+}
+interface BisectStep<T> {
+    state: T;
+    score: number;
+    pass: boolean;
+}
+interface BisectResult<T> {
+    /** The first bad state — typically `bad` in the final (good, bad) adjacent pair. */
+    culprit: T;
+    /** Ordered trace of all states evaluated. */
+    path: BisectStep<T>[];
+    /** True when we narrowed to an adjacent (good, bad) pair. */
+    converged: boolean;
+    /** True when `good` itself failed or `bad` itself passed — the caller's
+     *  premise was broken. */
+    inputInconsistent: boolean;
+}
+declare function bisect<T>(options: BisectOptions<T>): Promise<BisectResult<T>>;
+/**
+ * Commit bisect — `commits` is an ordered SHA list, oldest to newest.
+ * `good` and `bad` must both be present in the list.
+ */
+declare function commitBisect(options: {
+    commits: string[];
+    good: string;
+    bad: string;
+    runEval: (sha: string) => Promise<{
+        score: number;
+        pass: boolean;
+    }>;
+    maxIterations?: number;
+}): Promise<BisectResult<string>>;
+/**
+ * Prompt bisect — splits the good and bad prompts into paragraphs, then
+ * progressively replaces paragraphs in `good` with their counterparts
+ * from `bad` to localize the offending change. Only works when the two
+ * prompts have the same paragraph count (a common editorial workflow
+ * constraint — one paragraph = one change unit).
+ */
+declare function promptBisect(options: {
+    good: string;
+    bad: string;
+    runEval: (prompt: string) => Promise<{
+        score: number;
+        pass: boolean;
+    }>;
+    maxIterations?: number;
+    paragraphSplitter?: (prompt: string) => string[];
+}): Promise<BisectResult<string> & {
+    offendingParagraphIndex?: number;
+}>;
+/**
+ * Counterfactual replay — "what would have happened if we'd changed
+ * exactly one thing at turn N?"
+ *
+ * The framework does NOT drive the agent — it sets up the replay
+ * context (prior spans, prior state, mutation spec) and records the
+ * resulting divergence. Consumers supply an `executeFrom(ctx)` callback
+ * that runs their agent starting from turn N with the mutation applied.
+ *
+ * Counterfactual runs are recorded as a new Run with `layer='meta'` and
+ * `parentRunId = originalRunId`, so downstream diff + correlation
+ * pipelines see them natively.
+ */
+type CounterfactualMutation = {
+    kind: 'swap-model';
+    at: number;
+    newModel: string;
+} | {
+    kind: 'swap-tool-result';
+    at: number;
+    newResult: unknown;
+} | {
+    kind: 'truncate-after';
+    at: number;
+} | {
+    kind: 'inject-system-message';
+    at: number;
+    content: string;
+} | {
+    kind: 'custom';
+    at: number;
+    describe: string;
+    apply: (step: TrajectoryStep) => TrajectoryStep;
+};
+interface CounterfactualContext {
+    originalRunId: string;
+    originalTrajectory: Trajectory;
+    /** Steps up to (but not including) the mutation point — the prefix the
+     *  replayed agent inherits as its prior conversation/tool history. */
+    prefix: TrajectoryStep[];
+    mutation: CounterfactualMutation;
+    /** The step at `mutation.at` with the mutation already applied. Consumers
+     *  treat it as the FIRST step of the replayed agent (they decide whether
+     *  to re-emit it or continue from there). */
+    mutatedStep: TrajectoryStep;
+}
+interface CounterfactualResult {
+    counterfactualRunId: string;
+    originalRunId: string;
+    mutation: CounterfactualMutation;
+    /** Structured delta summary — caller can extend via scoring. */
+    delta: {
+        originalOutcomeScore: number | null;
+        counterfactualOutcomeScore: number | null;
+        deltaScore: number | null;
+    };
+}
+interface CounterfactualRunner {
+    /**
+     * Execute the agent from `ctx.prefix` with the mutation applied.
+     * MUST emit spans into the provided emitter so they become part of
+     * the counterfactual run. MUST call emitter.endRun() with a verdict.
+     */
+    executeFrom: (ctx: CounterfactualContext, emitter: TraceEmitter) => Promise<void>;
+}
+declare function runCounterfactual(store: TraceStore, originalRunId: string, mutation: CounterfactualMutation, runner: CounterfactualRunner): Promise<CounterfactualResult>;
+/**
+ * Aggregate a batch of counterfactuals into a simple attribution table:
+ * which mutation kinds move outcomes most? (Useful when you run a grid
+ * over the same trajectory — swap-model at every llm span, swap-tool
+ * at every tool span — and want a ranked summary.)
+ */
+declare function attributeCounterfactuals(results: CounterfactualResult[]): Array<{
+    mutationKind: CounterfactualMutation['kind'];
+    n: number;
+    meanAbsDelta: number;
+    meanSignedDelta: number;
+}>;
+/**
+ * Full cross-trace diff — align two trajectories step-by-step, report
+ * per-step score deltas, attribute a variant's total outcome lead to
+ * specific turns.
+ *
+ * 0.5 shipped `firstDivergenceView` (finds the first differing step).
+ * This does the heavier work: full alignment via LCS and per-step
+ * contribution to score delta, using PRM verdicts when available and
+ * falling back to structural heuristics (latency, token count, tool
+ * outcome) otherwise.
+ */
+type AlignmentOp = {
+    op: 'match';
+    a: TrajectoryStep;
+    b: TrajectoryStep;
+} | {
+    op: 'insert';
+    b: TrajectoryStep;
+} | {
+    op: 'delete';
+    a: TrajectoryStep;
+} | {
+    op: 'replace';
+    a: TrajectoryStep;
+    b: TrajectoryStep;
+};
+interface StepAttribution {
+    op: AlignmentOp;
+    /** Difference in PRM score (or null when not scored by a matching judge). */
+    prmDelta: number | null;
+    /** Difference in latency (endedAt - startedAt). */
+    latencyDeltaMs: number | null;
+    /** Difference in token count (LLM spans). */
+    tokenDelta: number | null;
+    /** Reason this step is / isn't considered a contributor to the outcome delta. */
+    note: string;
+}
+interface CrossTraceDiff {
+    runA: string;
+    runB: string;
+    alignment: AlignmentOp[];
+    attributions: StepAttribution[];
+    /** Total score delta (B - A). */
+    totalScoreDelta: number | null;
+    /** Sum of PRM deltas across matched/replaced steps. Close to
+     *  `totalScoreDelta` when PRM covers the trajectory; gap indicates
+     *  unmodeled variance. */
+    prmDeltaSum: number;
+}
+interface CrossTraceDiffOptions {
+    stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
+}
+declare function crossTraceDiff(store: TraceStore, runA: string, runB: string, options?: CrossTraceDiffOptions): Promise<CrossTraceDiff>;
+/**
+ * Pre-registered hypotheses — declare what you're testing BEFORE the
+ * run, check it AFTER. Prevents p-hacking, optional stopping, and the
+ * "we ran until it looked good" failure mode.
+ *
+ * Manifest is a plain JSON-friendly object. Sign it with a content hash
+ * + timestamp; the registered record becomes immutable. Post-run,
+ * evaluate the manifest against observed results — the library refuses
+ * to let you re-interpret a different metric as the declared one.
+ */
+interface HypothesisManifest {
+    id: string;
+    /** Human prose — goes into the audit trail. */
+    hypothesis: string;
+    /** Metric the hypothesis claims to move. */
+    metric: string;
+    /** 'increase' = candidate should score higher than baseline; 'decrease' = lower. */
+    direction: 'increase' | 'decrease';
+    /** Minimum effect size to count (same units as the metric). */
+    minEffect: number;
+    /** Alpha threshold. */
+    alpha: number;
+    /** Target statistical power at which sample size was pre-computed. */
+    power: number;
+    /** Declared N per arm before running. */
+    preRegisteredN: number;
+    /** ISO8601 timestamp the manifest was registered. */
+    registeredAt: string;
+    /** Optional identifiers to tie into the trace corpus. */
+    baselineLabel?: string;
+    candidateLabel?: string;
+}
+interface SignedManifest extends HypothesisManifest {
+    /** sha256 hex of canonicalized manifest (everything except contentHash). */
+    contentHash: string;
+}
+interface HypothesisResult {
+    manifest: SignedManifest;
+    observedN: number;
+    observedEffect: number;
+    observedPValue: number;
+    /** True iff the observed effect hits the pre-declared direction with magnitude ≥ minEffect AND p < alpha.
+     *  NOTE(review): 'undersampled' is also a rejection reason, so observedN presumably matters too — confirm. */
+    confirmed: boolean;
+    /** Enumerated reasons the hypothesis was rejected (each a machine-tag). */
+    rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
+    notes?: string;
+}
+declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
+/** Verify that a signed manifest has not been tampered with. */
+declare function verifyManifest(m: SignedManifest): Promise<boolean>;
+/**
+ * Evaluate a pre-registered hypothesis against observed results.
+ * Mechanical — no re-interpretation permitted.
+ */
+declare function evaluateHypothesis(manifest: SignedManifest, observed: {
+    n: number;
+    effect: number;
+    pValue: number;
+}): Promise<HypothesisResult>;
+/**
+ * Self-play scenario evolution — agents generate adversarial scenarios
+ * against each other; survivors become part of the eval corpus.
+ *
+ * Framework-agnostic about how scenarios are generated. Caller supplies:
+ *   - `propose`: asks a "proposer" agent for candidate scenarios
+ *   - `scoreAgainst`: runs a target agent against a scenario and returns
+ *     its score
+ *
+ * A scenario *survives* if it reveals a meaningful score difference
+ * between two target agents (or between a target agent and itself on
+ * different runs). Survivors are promoted to a Dataset; the caller
+ * decides what to do with them (hold-out, training, regression set).
+ *
+ * Guard rails: minimum absolute score delta to consider a scenario
+ * informative; floor on absolute target score so degenerate break-all
+ * scenarios (noise, gibberish) don't flood the corpus.
+ */
+interface CandidateScenario {
+    id: string;
+    payload: unknown;
+    /** Free-form tags (domain, generation, parent). */
+    tags?: Record<string, string>;
+}
+interface ScoredTarget {
+    targetId: string;
+    score: number;
+}
+interface EvolutionRound {
+    round: number;
+    proposed: CandidateScenario[];
+    survived: CandidateScenario[];
+    rejected: Array<{
+        candidate: CandidateScenario;
+        reason: string;
+    }>;
+    scoredBreakdown: Array<{
+        candidate: CandidateScenario;
+        scores: ScoredTarget[];
+        spread: number;
+    }>;
+}
+interface SelfPlayOptions {
+    /** Minimum score spread across targets for a scenario to survive. Default 0.1. */
+    minSpread?: number;
+    /** Score floor: a scenario is discarded when every target scores below
+     *  this, keeping degenerate break-all scenarios out. Default 0.1. */
+    minAbsoluteFloor?: number;
+    /** Hard cap on survivors per round. Default 50. */
+    maxSurvivors?: number;
+    /** Rounds to run. Default 1. Each round's survivors can be fed back into
+     *  `propose` to compound. */
+    rounds?: number;
+    /** Seed for scenario id generation if proposer doesn't provide one. */
+    seed?: number;
+}
+interface SelfPlayProposer {
+    propose(round: number, priorSurvivors: CandidateScenario[]): Promise<CandidateScenario[]>;
+}
+interface SelfPlayScorer {
+    /** Score one candidate against every target; returns an array parallel to `targets`. */
+    scoreCandidate(candidate: CandidateScenario, targets: string[]): Promise<ScoredTarget[]>;
+}
+declare function runSelfPlay(proposer: SelfPlayProposer, scorer: SelfPlayScorer, targets: string[], options?: SelfPlayOptions): Promise<{
+    rounds: EvolutionRound[];
+    dataset: Dataset;
+}>;
+/**
+ * Causal attribution via factorial experiments.
+ *
+ * Run every combination of {model × prompt × scenario × seed}, then
+ * decompose observed score variance into main effects + interactions.
+ * Moves from correlational "variant B is better" to causal "the model
+ * swap accounts for 42% of the lead; the prompt change accounts for 28%;
+ * interaction is 30%."
+ *
+ * Minimal implementation: 2-way factorial (two factors at a time) with
+ * main-effect + interaction decomposition via variance of cell means.
+ * Consumers run the factorial design themselves (we don't schedule
+ * runs); this module consumes the (factorLevels, observedScores)
+ * table and does the attribution math.
+ */
+interface FactorialCell { // One cell of the factorial design: a combination of factor levels plus its observed score; input rows for causalAttribution.
+    /** Map factor name → level id. e.g. { model: 'claude', prompt: 'v2' } */
+    levels: Record<string, string>;
+    /** Observed score for this cell (mean over replications if n > 1). */
+    score: number;
+    /** Number of replications averaged to produce `score`. */
+    n: number;
+}
+interface FactorContribution { // Main-effect contribution of a single factor, as listed in CausalAttributionReport.mainEffects.
+    factor: string; // Factor name — presumably a key of FactorialCell.levels; confirm.
+    /** Variance attributed to this factor's main effect, as a fraction of total. */
+    shareOfVariance: number;
+    /** Range of cell means across levels of this factor. */
+    range: number;
+}
+interface InteractionContribution { // Contribution of a two-factor interaction, as listed in CausalAttributionReport.interactions.
+    factors: [string, string]; // Names of the two interacting factors.
+    shareOfVariance: number; // Presumably a fraction of total variance, mirroring FactorContribution.shareOfVariance; confirm.
+}
+interface CausalAttributionReport { // Result of causalAttribution: variance split into main effects, interactions and residual.
+    totalVariance: number; // Total variance that the shares below are fractions of.
+    mainEffects: FactorContribution[]; // Main-effect contributions, one entry per reported factor.
+    interactions: InteractionContribution[]; // Modeled two-factor interaction contributions.
+    /** Residual = variance unexplained by main effects + modeled interactions. */
+    residualShare: number;
+    /** Sanity: shares sum to 1 (within fp). */
+    sharesSum: number;
+}
+declare function causalAttribution(cells: FactorialCell[]): CausalAttributionReport; // Synchronous: takes the observed factorial cells and returns the variance attribution report.
+/**
+ * Active learning — agent-as-scenario-author.
+ *
+ * Analyzes an existing Dataset + trace corpus for coverage gaps and
+ * weak spots, returns a prioritized list of *synthesis targets*:
+ * (gap description, existing-neighbor examples, suggested direction).
+ *
+ * Does NOT call an LLM itself — the proposer agent is caller-supplied.
+ * This module's job is to identify WHERE new scenarios would compound
+ * the most information, not to author them.
+ *
+ * Gaps we detect:
+ *   - dimensions with high score variance (unstable, need more data)
+ *   - dimensions with low coverage count (undersampled)
+ *   - failure classes with clusters (systematic weakness)
+ *   - difficulty bins with no coverage
+ */
+type SynthesisReason = 'high-variance' | 'undersampled' | 'failure-cluster' | 'difficulty-gap'; // Kinds of coverage gap: unstable scores, low coverage count, clustered failures, empty difficulty bin.
+interface SynthesisTarget { // One prioritized gap where new scenarios should be authored; element type resolved by proposeSynthesisTargets.
+    reason: SynthesisReason; // Which kind of gap this target represents.
+    description: string; // Free-text description of the gap.
+    /** Existing scenarios that are closest to the gap; caller feeds these to
+     *  their LLM proposer as few-shot examples. */
+    neighbors: DatasetScenario[];
+    /** Suggested direction — e.g. "harder variants", "edge cases of X", "failure class Y". */
+    direction: string;
+    /** Priority score — higher = more information-dense gap. 0..1. */
+    priority: number;
+}
+interface ActiveLearningOptions { // Optional thresholds for proposeSynthesisTargets; defaults are not stated in this declaration.
+    /** Minimum scenarios per difficulty band to count as "covered". */
+    minPerBand?: number;
+    /** Variance threshold above which a scenario's dimension is "unstable". */
+    varianceThreshold?: number;
+    /** Max synthesis targets returned. */
+    topK?: number;
+}
+declare function proposeSynthesisTargets(dataset: Dataset, traceStore: TraceStore, options?: ActiveLearningOptions): Promise<SynthesisTarget[]>; // Resolves to synthesis targets derived from `dataset` and `traceStore`; `options.topK` caps how many are returned.
+/**
+ * Reward-model export — the productizable wrapper around PRM training
+ * data. Takes a TraceStore + PrmGrader, produces an embeddable
+ * inference scorer that customers plug into their own agent stack.
+ *
+ * Two export forms:
+ *   - `exportRewardModel(store, grader, runIds)` — serializes the
+ *     (step-context, score) corpus to a framework-agnostic payload.
+ *     Customer fine-tunes their own model; we ship the scaffolding.
+ *   - `loadScorerFromGrader(grader)` — a zero-deps "reward model"
+ *     that literally replays the trained rubric at inference time. Works
+ *     as a reference baseline + deterministic fallback.
+ */
+interface ExportedRewardModel { // Serialized reward-model training export; the value exportRewardModel resolves to.
+    /** Version of the export format. Bump when payload shape changes. */
+    version: '1.0';
+    /** Metadata about the training corpus. */
+    metadata: {
+        nTraces: number; // Trace count for the corpus.
+        nSamples: number; // Training-sample count for the corpus.
+        rubrics: string[]; // Rubric identifiers — presumably those applied by the grader; confirm.
+        exportedAt: string; // Export timestamp — presumably ISO 8601; confirm.
+        /** Mean reward across training corpus — use as sanity check at load. */
+        meanReward: number;
+    };
+    /** NDJSON training payload suitable for most fine-tuning frameworks. */
+    trainingNdjson: string;
+}
+declare function exportRewardModel(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<ExportedRewardModel>; // Takes a single `grader`; `runIds` presumably selects which runs from `store` are exported — confirm.
+/**
+ * Zero-deps inference scorer — apply a grader to a trajectory and return
+ * its aggregate score. This is the "reward model" customers embed when
+ * they don't want (or can't) fine-tune one. Deterministic + portable.
+ *
+ * Instances are obtained from `loadScorerFromGrader` and consumed by
+ * `replayScorerOverCorpus`.
+ */
+interface InferenceScorer {
+    /** Score a completed trajectory. Higher is better. */
+    score(trajectory: Trajectory, store: TraceStore): Promise<number>;
+    metadata: {
+        rubrics: string[]; // Rubric identifiers — presumably those of the wrapped grader; confirm.
+        deterministic: true; // Literal `true`: the type only admits scorers that declare themselves deterministic.
+    };
+}
+declare function loadScorerFromGrader(grader: PrmGrader): InferenceScorer; // Synchronously wraps `grader` as an InferenceScorer; no trace store is needed at construction time.
+/**
+ * Replay a trace corpus through a scorer — produces the canonical
+ * "what would this reward model have said about every run?" table.
+ * Callers use this to validate a trained model against the training
+ * corpus (expect high agreement; drift indicates overfitting).
+ *
+ * Resolves to rows carrying the `runId`, the scorer's `score`, and an
+ * `outcomeScore` that may be `null` (presumably when no outcome is
+ * recorded for that run — confirm against the implementation).
+ */
+declare function replayScorerOverCorpus(store: TraceStore, scorer: InferenceScorer, runIds: string[]): Promise<Array<{
+    runId: string;
+    score: number;
+    outcomeScore: number | null;
+}>>;
+/**
+ * Governance reporting — shared types.
+ *
+ * The framework collects a `GovernanceContext` (traces + outcomes +
+ * dataset manifests + red-team results + judge calibration) and each
+ * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
+ * structured report from it.
+ *
+ * Reports are machine-readable JSON first; human-readable Markdown is a
+ * pure transform on top. External auditors consume the Markdown; CI
+ * consumes the JSON.
+ */
+interface GovernanceContext { // Input bundle handed to every governance report template (nistAiRmfReport, soc2Report, euAiActReport).
+    /** Legal / org identity for the report. */
+    organization: string;
+    /** System / agent identifier. */
+    systemName: string;
+    /** ISO8601 period the report covers. */
+    periodStart: string;
+    periodEnd: string; // End of the covered period; same ISO8601 convention as periodStart.
+    /** Versioned dataset manifests used during the period. */
+    datasets: DatasetManifest[];
+    traceStore: TraceStore; // Trace corpus; required.
+    outcomeStore?: OutcomeStore; // Outcome data; optional.
+    /** Cached red-team results for the period, if available. */
+    redTeam?: RedTeamReport;
+    /** Judge-vs-human calibration results, if measured. */
+    judgeCalibration?: CalibrationResult[];
+    /** Responsible owner for the system — role + name + email. */
+    owner: {
+        role: string;
+        name: string;
+        email: string;
+    };
+}
+interface GovernanceFinding { // A single finding inside a GovernanceReport.
+    id: string; // Finding identifier; uniqueness scope is not stated here.
+    severity: 'info' | 'low' | 'medium' | 'high' | 'critical'; // Severity level of the finding.
+    /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
+    control: string;
+    summary: string; // Short statement of the finding.
+    evidence?: string; // Optional supporting evidence.
+    remediation?: string; // Optional suggested remediation.
+}
+/**
+ * Structured, machine-readable output of a governance template
+ * (NIST AI RMF, SOC2 or EU AI Act).
+ */
+interface GovernanceReport {
+    /** Which framework template produced the report. */
+    framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
+    /** Version string; what exactly it versions is not visible in this declaration. */
+    version: string;
+    /** Identity, period and owner fields taken from the GovernanceContext. */
+    context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
+    /** Roll-up of `findings`. */
+    summary: {
+        /** Number of findings. */
+        findings: number;
+        /** Finding count keyed by severity level. */
+        bySeverity: Record<GovernanceFinding['severity'], number>;
+        /** Overall compliance verdict. */
+        overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
+    };
+    /** Individual findings backing the summary. */
+    findings: GovernanceFinding[];
+    /** Framework-specific structured payload (mapped controls, risk class, etc.). */
+    payload: Record<string, unknown>;
+    /** When the report was generated — presumably an ISO 8601 timestamp; confirm. */
+    generatedAt: string;
+}
+declare function renderMarkdown(report: GovernanceReport): string; // Synchronous: returns the human-readable Markdown form of the given JSON report as a string.
+declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary']; // Builds the `summary` section of a GovernanceReport from a list of findings.
+/**
+ * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
+ *
+ * Each subcategory derives its status from concrete framework state:
+ *   MEASURE 2.x: do we have a calibration regime? contamination controls?
+ *   MEASURE 2.7: are red-team results available?
+ *   MANAGE 1.x: are outcome metrics captured? correlation measured?
+ *   GOVERN 1.x: dataset + prompt provenance recorded?
+ *
+ * We ship the mapping and the derivation rules; consumers supply the
+ * GovernanceContext.
+ *
+ * Resolves to a GovernanceReport — presumably with `framework` set to
+ * 'NIST-AI-RMF'; confirm against the implementation.
+ */
+declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
+/**
+ * SOC 2 — Common Criteria 7 (system operations + change management)
+ * audit trail derived from the trace corpus.
+ *
+ * This is NOT a formal SOC2 report — that requires an external
+ * auditor. What we ship is the machine-readable *evidence* package
+ * that an auditor consumes: run counts, deploy events, access log
+ * summary, anomaly tracking, response-time SLOs.
+ *
+ * Resolves to a GovernanceReport — presumably with `framework` set to
+ * 'SOC2'; confirm against the implementation.
+ */
+declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
+/**
+ * EU AI Act — risk-class classification + compliance checklist.
+ *
+ * Classification is declarative: caller supplies the domain/use-case
+ * signals (biometric? critical infrastructure? education? employment?
+ * access to services?) and we map to the Act's risk tiers:
+ *   - "unacceptable" (prohibited)
+ *   - "high"        (Annex III — strict obligations)
+ *   - "limited"     (transparency obligations)
+ *   - "minimal"     (voluntary codes of conduct)
+ *
+ * Then the compliance checklist enumerates Article 9 (risk mgmt),
+ * 10 (data + data governance), 11 (technical documentation), 13
+ * (transparency), 14 (human oversight), 15 (accuracy + robustness)
+ * requirements and flags gaps.
+ */
+type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal'; // EU AI Act risk tiers: prohibited, Annex III strict obligations, transparency obligations, voluntary codes of conduct.
+interface UseCaseSignals { // Caller-declared use-case flags fed to classifyEuAiRisk / euAiActReport; every flag is optional.
+    /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
+    biometricPublic?: boolean;
+    /** Social scoring by public authorities? (Art. 5). */
+    socialScoring?: boolean;
+    /** Subliminal manipulation? (Art. 5). */
+    subliminal?: boolean;
+    /** Annex III sector: critical infrastructure / education / employment /
+     *  access to essential services / law enforcement / migration /
+     *  administration of justice / democratic processes? */
+    annexIII?: boolean;
+    /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
+    chatbot?: boolean;
+    /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
+    generatesSyntheticMedia?: boolean;
+}
+declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass; // Synchronous: maps the declared use-case signals to one EU AI Act risk tier.
+declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>; // Unlike the NIST and SOC2 templates, this one also takes the use-case `signals` used for risk classification.
+export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type 
EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, 
type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type 
WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, crossTraceDiff, defaultJudges, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runSelfPlay, runTestGradedScenario, runsForScenario, 
scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };