@tangle-network/agent-eval 0.20.8 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,4 +1,6 @@
1
1
  import { TCloud } from '@tangle-network/tcloud';
2
+ import { R as RunRecord, a as RunSplitTag } from './index-1PZOtZFr.js';
3
+ export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, e as RunJudgeMetadata, f as RunOutcome, g as RunRecordValidationError, h as RunTokenUsage, i as benchmarkDeterministicSplit, j as benchmarks, k as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './index-1PZOtZFr.js';
2
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
3
5
 
4
6
  interface Scenario {
@@ -432,139 +434,6 @@ declare class Dataset {
432
434
  }
433
435
  declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
434
436
 
435
- /**
436
- * Paper-grade RunRecord schema + runtime validator.
437
- *
438
- * Every run that participates in a promotion gate, paper table, or
439
- * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
440
- * fields are exactly those the paper "Two Loops, Three Roles" requires
441
- * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
442
- * holdout split tag and either a `searchScore` or a `holdoutScore`.
443
- *
444
- * This is intentionally NOT a replacement for the rich `Run` /
445
- * `ProposeReviewReport` / `ScenarioResult` types already in the
446
- * package. Those are runtime structures with full provenance. A
447
- * `RunRecord` is the analysis-time projection — the JSON-friendly
448
- * row you'd put in a parquet file or paste into a notebook.
449
- *
450
- * Validate at the boundary:
451
- *
452
- * const rec = validateRunRecord(rawJson) // throws on missing
453
- * const ok = isRunRecord(rawJson) // boolean check
454
- * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }
455
- *
456
- * The validator runs in pure TS — zod is intentionally NOT a
457
- * dependency. Round-trip tested in `tests/run-record.test.ts`.
458
- */
459
- /** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
460
- * combined train+test pool that the optimizer is allowed to read. */
461
- type RunSplitTag = 'search' | 'dev' | 'holdout';
462
- interface RunTokenUsage {
463
- input: number;
464
- output: number;
465
- cached?: number;
466
- }
467
- interface RunJudgeMetadata {
468
- model: string;
469
- promptVersion: string;
470
- /** [0,1] confidence the judge declared. Constant judge confidence
471
- * across many runs is a fallback signal (see `canary.ts`). */
472
- confidence: number;
473
- /** True if the judge degraded to a fallback path (rules-only,
474
- * prior-call cache, etc.). The canary uses this to alert. */
475
- fallback: boolean;
476
- }
477
- interface RunOutcome$1 {
478
- /** Score on the search/optimization split. Optional because a
479
- * holdout-only evaluation only fills `holdoutScore`. */
480
- searchScore?: number;
481
- /** Score on the held-out split. Optional because a search-only run
482
- * only fills `searchScore`. At least one must be present. */
483
- holdoutScore?: number;
484
- /** Bag of any other metric the run produced — judge dimensions,
485
- * pass/fail counters, latency stats, etc. Numeric only — keeps
486
- * reporters honest. */
487
- raw: Record<string, number>;
488
- }
489
- /**
490
- * Mandatory paper-grade fields for a single evaluation run. Optional
491
- * fields are extension points; mandatory fields throw if missing.
492
- *
493
- * Hash discipline:
494
- * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
495
- * model (after any steering bundle merge).
496
- * - `configHash` is the sha256 of the effective run config (model,
497
- * temperature, tools, judges, splits). The pair (promptHash,
498
- * configHash) uniquely identifies an experimental cell.
499
- *
500
- * Model snapshot discipline:
501
- * - `model` MUST encode a snapshot version. Bare aliases like
502
- * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
503
- * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
504
- */
505
- interface RunRecord {
506
- /** UUID for the run. */
507
- runId: string;
508
- /** Logical experiment grouping (a treatment vs a baseline within
509
- * the same sweep should share `experimentId`). */
510
- experimentId: string;
511
- /** Stable identifier for the candidate (variant) being run. The
512
- * promotion gate compares two `candidateId`s on matched items. */
513
- candidateId: string;
514
- /** RNG seed for the run. Always recorded — silent re-seeding is
515
- * the most common cause of non-reproducible numbers. */
516
- seed: number;
517
- /** Model identifier WITH snapshot version. */
518
- model: string;
519
- /** sha256 of the effective prompt (post-steering). */
520
- promptHash: string;
521
- /** sha256 of the effective config. */
522
- configHash: string;
523
- /** Git SHA the harness was run from. */
524
- commitSha: string;
525
- /** End-to-end wall-clock duration in milliseconds. */
526
- wallMs: number;
527
- /** Time spent queued before execution started, if known. */
528
- queueMs?: number;
529
- /** Total USD cost. Mandatory — runs without a cost number are
530
- * unbounded by definition and must not be admitted into the gate. */
531
- costUsd: number;
532
- /** Token usage breakdown. */
533
- tokenUsage: RunTokenUsage;
534
- /** Judge-side metadata, if a judge was used. */
535
- judgeMetadata?: RunJudgeMetadata;
536
- /** Per-split scores + raw bag. */
537
- outcome: RunOutcome$1;
538
- /** Categorical failure tag, when the run failed and the harness
539
- * classified it. Free-form string; standard tags live in
540
- * `failure-taxonomy.ts`. */
541
- failureMode?: string;
542
- /** Which split this run was drawn from. */
543
- splitTag: RunSplitTag;
544
- }
545
- declare class RunRecordValidationError extends Error {
546
- readonly path: string;
547
- constructor(message: string, path?: string);
548
- }
549
- /**
550
- * Strict validator. Throws `RunRecordValidationError` on the first
551
- * missing or wrongly-typed field. Returns the input cast to
552
- * `RunRecord` on success — the validator does not coerce.
553
- */
554
- declare function validateRunRecord(input: unknown): RunRecord;
555
- /** Boolean validator — convenience for filtering arrays. */
556
- declare function isRunRecord(input: unknown): input is RunRecord;
557
- /** Non-throwing validator — returns a discriminated union. */
558
- declare function parseRunRecordSafe(input: unknown): {
559
- ok: true;
560
- value: RunRecord;
561
- } | {
562
- ok: false;
563
- error: RunRecordValidationError;
564
- };
565
- /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
566
- declare function roundTripRunRecord(record: RunRecord): RunRecord;
567
-
568
437
  /**
569
438
  * HeldOutGate — first-class held-out paired-delta promotion gate.
570
439
  *
@@ -7762,6 +7631,24 @@ interface Researcher {
7762
7631
  applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
7763
7632
  evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
7764
7633
  }
7634
+ interface CallbackResearcherOptions {
7635
+ inspectFailures: Researcher['inspectFailures'];
7636
+ proposeChange: Researcher['proposeChange'];
7637
+ applyChange: Researcher['applyChange'];
7638
+ evaluateChange: Researcher['evaluateChange'];
7639
+ }
7640
+ /**
7641
+ * Minimal concrete researcher for tests, scripts, and small integrations.
7642
+ * Larger autonomous researchers can still implement `Researcher` directly.
7643
+ */
7644
+ declare class CallbackResearcher implements Researcher {
7645
+ private readonly callbacks;
7646
+ constructor(callbacks: CallbackResearcherOptions);
7647
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
7648
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
7649
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
7650
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
7651
+ }
7765
7652
  /**
7766
7653
  * No-op researcher — fails loud on every method. Use as a placeholder
7767
7654
  * in code paths that wire the interface but don't have an implementation
@@ -8013,162 +7900,6 @@ interface CanaryOptions {
8013
7900
  */
8014
7901
  declare function runCanaries(runs: RunRecord[], opts?: CanaryOptions): CanaryReport;
8015
7902
 
8016
- /**
8017
- * Shared types for the reference benchmark wrappers under
8018
- * `src/benchmarks/`. Each wrapper exports the three functions in
8019
- * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.
8020
- */
8021
-
8022
- interface BenchmarkDatasetItem<TPayload = unknown> {
8023
- /** Stable dataset-local item id (used for split assignment + paper
8024
- * references). Unique within a benchmark. */
8025
- id: string;
8026
- /** Free-form payload. Each benchmark defines its own shape. */
8027
- payload: TPayload;
8028
- }
8029
- interface BenchmarkEvaluation {
8030
- /** [0, 1] score for the response on this item. Exact-match
8031
- * benchmarks use 0/1; partial-credit benchmarks may return
8032
- * fractional values. */
8033
- score: number;
8034
- /** Optional bag of raw scoring signals — e.g. parsed numeric
8035
- * answer, regex match, judge sub-scores. */
8036
- raw: Record<string, unknown>;
8037
- }
8038
- /** Common signature implemented by every adapter under `src/benchmarks/*`. */
8039
- interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {
8040
- /** Load the dataset for the given split. May hit the network on
8041
- * first call but should be cache-friendly. Adapters that don't
8042
- * ship the dataset itself MUST throw a clearly-marked error
8043
- * pointing the caller at the loader script. */
8044
- loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>;
8045
- /** Score a single response. Pure with respect to the inputs. */
8046
- evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>;
8047
- /** Deterministic split assignment via item id hashing. The
8048
- * fraction of items in each split is implementation-defined but
8049
- * MUST be stable across processes and platforms. */
8050
- assignSplit(itemId: string): RunSplitTag;
8051
- }
8052
- /** Split-assignment seed shared across all benchmarks. Bumping this
8053
- * value reshuffles every split — do NOT do that lightly. */
8054
- declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
8055
- /**
8056
- * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a
8057
- * stable 32-bit hash of `${seed}::${id}`. Default proportions:
8058
- *
8059
- * search: 60% (optimization-readable)
8060
- * dev: 20% (held-out for tuning, leak-on-purpose during dev)
8061
- * holdout:20% (paper-grade held-out, gated reads)
8062
- */
8063
- declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
8064
-
8065
- /**
8066
- * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
8067
- * deterministic, dependency-free benchmark for any router that maps a
8068
- * natural-language request to one of a fixed set of route labels.
8069
- *
8070
- * Format (see `routing/README.md` for prose):
8071
- *
8072
- * {
8073
- * id: stable per-task ID (matches across processes).
8074
- * category: one of the four route labels.
8075
- * prompt: the user-facing request the router must classify.
8076
- * route: the ground-truth route the router should pick.
8077
- * synonyms: other strings that count as a correct answer.
8078
- * hardNegatives:close-but-wrong route labels — used to detect the
8079
- * "always picks the popular route" failure mode.
8080
- * }
8081
- *
8082
- * The four categories are intentionally cross-domain (file ops,
8083
- * math, search, conversation) so a router that collapses to one
8084
- * category is easy to spot.
8085
- */
8086
- interface RoutingItem {
8087
- id: string;
8088
- category: 'file' | 'math' | 'search' | 'chat';
8089
- prompt: string;
8090
- /** Canonical correct route label. */
8091
- route: string;
8092
- /** Alternate route labels that also count as correct. */
8093
- synonyms: string[];
8094
- /** Wrong-but-tempting route labels (for analysis, not grading). */
8095
- hardNegatives: string[];
8096
- }
8097
- declare const ROUTING_DATASET: RoutingItem[];
8098
-
8099
- /**
8100
- * Routing benchmark — synthetic, dependency-free, ships in the
8101
- * package. 16 cross-category items in `dataset.ts`. See
8102
- * `routing/README.md` for the format.
8103
- *
8104
- * `evaluate` does case-insensitive exact match against the canonical
8105
- * route plus declared synonyms. The first valid route token in the
8106
- * response wins; everything else is ignored. Wrong answers also
8107
- * report whether they hit a hard negative — useful when triaging
8108
- * "always picks the popular route" failure modes.
8109
- */
8110
-
8111
- type RoutingPayload = RoutingItem;
8112
- type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>;
8113
- declare class RoutingAdapter implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload> {
8114
- loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]>;
8115
- evaluate(item: RoutingDatasetItem, response: string): Promise<BenchmarkEvaluation>;
8116
- assignSplit(itemId: string): RunSplitTag;
8117
- }
8118
- /**
8119
- * Pull route-shaped tokens out of a model response. Routes look like
8120
- * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics
8121
- * are not routes, but `category.action` patterns are robust to most
8122
- * model wrappers (JSON output, prose explanations, code fences).
8123
- */
8124
- declare function extractRouteTokens(response: string): string[];
8125
- declare const loadDataset: (split: RunSplitTag) => Promise<RoutingDatasetItem[]>;
8126
- declare const evaluate: (item: RoutingDatasetItem, response: string) => Promise<BenchmarkEvaluation>;
8127
- declare const assignSplit: (itemId: string) => RunSplitTag;
8128
-
8129
- declare const index$1_ROUTING_DATASET: typeof ROUTING_DATASET;
8130
- type index$1_RoutingAdapter = RoutingAdapter;
8131
- declare const index$1_RoutingAdapter: typeof RoutingAdapter;
8132
- type index$1_RoutingDatasetItem = RoutingDatasetItem;
8133
- type index$1_RoutingItem = RoutingItem;
8134
- type index$1_RoutingPayload = RoutingPayload;
8135
- declare const index$1_assignSplit: typeof assignSplit;
8136
- declare const index$1_evaluate: typeof evaluate;
8137
- declare const index$1_extractRouteTokens: typeof extractRouteTokens;
8138
- declare const index$1_loadDataset: typeof loadDataset;
8139
- declare namespace index$1 {
8140
- export { index$1_ROUTING_DATASET as ROUTING_DATASET, index$1_RoutingAdapter as RoutingAdapter, type index$1_RoutingDatasetItem as RoutingDatasetItem, type index$1_RoutingItem as RoutingItem, type index$1_RoutingPayload as RoutingPayload, index$1_assignSplit as assignSplit, index$1_evaluate as evaluate, index$1_extractRouteTokens as extractRouteTokens, index$1_loadDataset as loadDataset };
8141
- }
8142
-
8143
- /**
8144
- * Reference benchmark wrappers — entry point.
8145
- *
8146
- * Core surface (exported here):
8147
- * - The `BenchmarkAdapter` contract.
8148
- * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
8149
- * - `routing` — synthetic 16-task router benchmark. The only novel
8150
- * benchmark we built; ships in the package.
8151
- *
8152
- * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
8153
- * - `gsm8k` — exact-match math reasoning (HF mirror, dataset
8154
- * not bundled).
8155
- * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
8156
- * external grader).
8157
- *
8158
- * The example wrappers are reference implementations of `BenchmarkAdapter`.
8159
- * Read them, copy them, adapt them. They're intentionally not in the main
8160
- * entry — every team will configure them differently.
8161
- */
8162
-
8163
- declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
8164
- type index_BenchmarkAdapter<_TItem = unknown, TPayload = unknown> = BenchmarkAdapter<_TItem, TPayload>;
8165
- type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayload>;
8166
- type index_BenchmarkEvaluation = BenchmarkEvaluation;
8167
- declare const index_deterministicSplit: typeof deterministicSplit;
8168
- declare namespace index {
8169
- export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
8170
- }
8171
-
8172
7903
  interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
8173
7904
  bundleForRun?: (run: ReferenceReplayRun<Input>) => SteeringBundle;
8174
7905
  scoreForCase?: (caseRun: ReferenceReplayCaseRun<Input>, run: ReferenceReplayRun<Input>) => RunScore;
@@ -9057,18 +8788,26 @@ interface AnalyzeTracesResult {
9057
8788
  /** Total turns the actor took. */
9058
8789
  turnCount: number;
9059
8790
  /** Token usage by role. */
9060
- usage: {
9061
- actor: unknown[];
9062
- responder: unknown[];
9063
- };
8791
+ usage: TraceAnalystUsage;
9064
8792
  /** Full system + assistant + tool message log by role. */
9065
- chatLog: {
9066
- actor: unknown[];
9067
- responder: unknown[];
9068
- };
8793
+ chatLog: TraceAnalystChatLog;
9069
8794
  /** Prompt version that produced this run. */
9070
8795
  actorPromptVersion: string;
9071
8796
  }
8797
+ interface TraceAnalystUsage {
8798
+ actor: TraceAnalystUsageEntry[];
8799
+ responder: TraceAnalystUsageEntry[];
8800
+ }
8801
+ interface TraceAnalystUsageEntry {
8802
+ [key: string]: unknown;
8803
+ }
8804
+ interface TraceAnalystChatLog {
8805
+ actor: TraceAnalystChatMessage[];
8806
+ responder: TraceAnalystChatMessage[];
8807
+ }
8808
+ interface TraceAnalystChatMessage {
8809
+ [key: string]: unknown;
8810
+ }
9072
8811
  interface AnalyzeTracesTurnSnapshot {
9073
8812
  turn: number;
9074
8813
  isError: boolean;
@@ -9340,4 +9079,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
9340
9079
  declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
9341
9080
  declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
9342
9081
 
9343
- export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type TraceEmitterOptions, type TraceEvent, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, defaultTraceInsightPanel, deployGateLayer, describeTraceInsightScope, distillPlaybook, domainEvidencePattern, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, inferDomainKeywords, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, planTraceInsightQuestions, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, scoreTraceInsightReadiness, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, tokenizeDomainWords, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
9082
+ export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, CallbackResearcher, type CallbackResearcherOptions, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type TraceEmitterOptions, type TraceEvent, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, defaultTraceInsightPanel, deployGateLayer, describeTraceInsightScope, distillPlaybook, domainEvidencePattern, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, inferDomainKeywords, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, planTraceInsightQuestions, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, scoreTraceInsightReadiness, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, tokenizeDomainWords, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };