@tangle-network/agent-eval 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -944,6 +944,114 @@ interface RunDiff {
944
944
  }>;
945
945
  }
946
946
 
947
+ /**
948
+ * FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
949
+ *
950
+ * Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
951
+ * files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
952
+ * based rollover. Writes are append-only so the file log doubles as an audit
953
+ * trail of every state transition the tracker ever wrote.
954
+ *
955
+ * Reads lazy-load every NDJSON file in the directory (including rolled-over
956
+ * archives), latest-write-wins per `id`. Subsequent writes update the
957
+ * in-memory index in place so reads after writes are O(1).
958
+ *
959
+ * Node-only — imports `node:fs/promises`. Don't import this from a Worker;
960
+ * use the in-memory store or the D1 store from `./experiment-tracker-d1`.
961
+ */
962
+
963
+ interface FileSystemExperimentStoreOptions {
964
+ /** Directory the NDJSON files live in. Created on first write. */
965
+ dir: string;
966
+ /** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
967
+ maxBytes?: number;
968
+ }
969
+ declare class FileSystemExperimentStore implements ExperimentStore {
970
+ private readonly dir;
971
+ private readonly maxBytes;
972
+ private index?;
973
+ private loaded;
974
+ constructor(options: FileSystemExperimentStoreOptions);
975
+ saveExperiment(exp: Experiment): Promise<void>;
976
+ getExperiment(id: string): Promise<Experiment | null>;
977
+ listExperiments(): Promise<Experiment[]>;
978
+ saveRun(run: Run$1): Promise<void>;
979
+ getRun(id: string): Promise<Run$1 | null>;
980
+ listRuns(experimentId: string): Promise<Run$1[]>;
981
+ private ensureDir;
982
+ private append;
983
+ private load;
984
+ }
985
+
986
+ /**
987
+ * D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
988
+ *
989
+ * Workers-safe (uses only the `D1Database` binding the runtime injects). Two
990
+ * tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
991
+ * a Worker route can both write the row at run start and update it at run end
992
+ * without losing the original config — the row's lifecycle mirrors the
993
+ * `Run.status` field one-to-one.
994
+ *
995
+ * Why this lives next to `InMemoryExperimentStore`:
996
+ * - bad-app, legal-agent, gtm-agent, film-agent all run as Workers
997
+ * - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
998
+ * - Hand-rolling D1 SQL in every consumer is exactly the duplication this
999
+ * module exists to prevent
1000
+ *
1001
+ * Schema versioning: the `meta` table records `schema_version` so a future
1002
+ * column addition can be detected and migrated additively. Today's schema is
1003
+ * v1; bump only on breaking shape changes.
1004
+ */
1005
+
1006
+ /**
1007
+ * Minimal `D1Database` shape we depend on. Avoids pulling in
1008
+ * `@cloudflare/workers-types` as a hard dep — consumers that already have
1009
+ * those types installed can pass the binding directly.
1010
+ */
1011
+ interface D1Like {
1012
+ prepare(query: string): D1PreparedStatementLike;
1013
+ batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
1014
+ exec(query: string): Promise<unknown>;
1015
+ }
1016
+ interface D1PreparedStatementLike {
1017
+ bind(...values: unknown[]): D1PreparedStatementLike;
1018
+ first<T = Record<string, unknown>>(): Promise<T | null>;
1019
+ all<T = Record<string, unknown>>(): Promise<{
1020
+ results: T[];
1021
+ }>;
1022
+ run(): Promise<unknown>;
1023
+ }
1024
+ interface D1ExperimentStoreOptions {
1025
+ /** D1 binding from `env`. */
1026
+ db: D1Like;
1027
+ /**
1028
+ * Optional table-name prefix so multiple ExperimentStores can share a DB
1029
+ * without colliding (e.g. `tax_eval_experiments` vs `legal_eval_experiments`).
1030
+ * Default: `agent_eval_`.
1031
+ */
1032
+ tablePrefix?: string;
1033
+ }
1034
+ declare class D1ExperimentStore implements ExperimentStore {
1035
+ private readonly db;
1036
+ private readonly experimentsTable;
1037
+ private readonly runsTable;
1038
+ private readonly metaTable;
1039
+ private schemaReady;
1040
+ constructor(options: D1ExperimentStoreOptions);
1041
+ /**
1042
+ * Idempotent schema setup. Safe to call before every operation; the second
1043
+ * call short-circuits via `schemaReady`. Most consumers will call it once
1044
+ * during Worker bootstrap.
1045
+ */
1046
+ ensureSchema(): Promise<void>;
1047
+ saveExperiment(exp: Experiment): Promise<void>;
1048
+ getExperiment(id: string): Promise<Experiment | null>;
1049
+ listExperiments(): Promise<Experiment[]>;
1050
+ saveRun(run: Run$1): Promise<void>;
1051
+ getRun(id: string): Promise<Run$1 | null>;
1052
+ listRuns(experimentId: string): Promise<Run$1[]>;
1053
+ }
1054
+
947
1055
  /**
948
1056
  * Prompt optimizer — A/B test prompt variants with statistical rigor.
949
1057
  *
@@ -959,13 +1067,13 @@ interface RunDiff {
959
1067
  * and returns a number per scenario. This lets the optimizer stay small +
960
1068
  * testable.
961
1069
  */
962
- interface PromptVariant {
1070
+ interface PromptVariant$1 {
963
1071
  id: string;
964
1072
  prompt: string;
965
1073
  metadata?: Record<string, unknown>;
966
1074
  }
967
1075
  interface OptimizationConfig {
968
- variants: PromptVariant[];
1076
+ variants: PromptVariant$1[];
969
1077
  /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
970
1078
  trialsPerScenario?: number;
971
1079
  /** Significance threshold for pairwise comparison (default 0.05). */
@@ -976,7 +1084,7 @@ interface OptimizationConfig {
976
1084
  * monotonicity).
977
1085
  */
978
1086
  scoreVariant: (args: {
979
- variant: PromptVariant;
1087
+ variant: PromptVariant$1;
980
1088
  scenarioId: string;
981
1089
  trialIndex: number;
982
1090
  }) => Promise<number>;
@@ -1750,6 +1858,46 @@ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
1750
1858
  * iff no other candidate dominates it.
1751
1859
  */
1752
1860
  declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
1861
+ /**
1862
+ * Weighted-sum scalarisation. Use as a tie-break / single-winner selector
1863
+ * when callers don't want to consume a frontier. Each objective contributes
1864
+ * its normalised value (0..1 via min-max across the candidate pool) times
1865
+ * its weight; missing weights default to 1/N.
1866
+ *
1867
+ * Direction is honoured automatically — `minimize` axes have their values
1868
+ * inverted before scaling so "higher scalar = better" always holds.
1869
+ */
1870
+ declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
1871
+ weights?: Partial<Record<string, number>>;
1872
+ }): Array<{
1873
+ candidate: T;
1874
+ score: number;
1875
+ }>;
1876
+ /**
1877
+ * NSGA-II crowding distance — secondary sort for ties on the frontier.
1878
+ *
1879
+ * When the Pareto front collapses to a single point (or many candidates tie
1880
+ * on dominance), naive selection picks arbitrarily and the population
1881
+ * degenerates over generations. NSGA-II preserves diversity by preferring
1882
+ * candidates with more empty space around them on the frontier.
1883
+ *
1884
+ * Returns an array of `{ candidate, distance }` in the SAME order as the
1885
+ * input. Higher distance = more isolated = should be preferred when
1886
+ * preserving diversity.
1887
+ */
1888
+ declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
1889
+ candidate: T;
1890
+ distance: number;
1891
+ }>;
1892
+ /**
1893
+ * Pareto frontier with tie-break by crowding distance — the canonical
1894
+ * NSGA-II selection step. Returns the frontier sorted by descending crowding
1895
+ * distance so callers can `.slice(0, k)` to pick K diverse winners.
1896
+ */
1897
+ declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
1898
+ candidate: T;
1899
+ distance: number;
1900
+ }>;
1753
1901
 
1754
1902
  type HarnessIntervention = 'continue' | 'plan' | 'audit' | 'recover' | 'repair' | 'verify' | 'final_gate' | 'wait_for_measurement' | 'abort';
1755
1903
  interface WorkflowTopology {
@@ -6079,4 +6227,449 @@ interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
6079
6227
  declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: ReferenceReplayRun<Input>[], options?: ReferenceReplaySteeringRowsOptions<Input>): SteeringOptimizationRow[];
6080
6228
  declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
6081
6229
 
6082
- export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type 
DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, 
type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, 
analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, 
normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
6230
+ /**
6231
+ * PromptEvolutionLoop — population-based reflective-mutation evolution.
6232
+ *
6233
+ * Above the existing `AxGepaSteeringOptimizer` (which RANKS variants),
6234
+ * this loop GENERATES variants. Each generation:
6235
+ * 1. Score the population across (variant × scenario × rep).
6236
+ * 2. Pick survivors from the Pareto frontier (with crowding-distance tie-break).
6237
+ * 3. Ask the mutator for replacements until population size is restored.
6238
+ * 4. Repeat for N generations OR until convergence.
6239
+ *
6240
+ * Domain-agnostic. Consumers supply:
6241
+ * - A seed population of `PromptVariant`s.
6242
+ * - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
6243
+ * - A `MutateAdapter` that produces children given trace evidence.
6244
+ * - Pareto `Objective<TrialAggregate>[]` defining the multi-objective vector.
6245
+ *
6246
+ * The loop owns: population management, parallel scheduling (concurrency-
6247
+ * limited), Pareto selection with crowding distance, generation reporting.
6248
+ *
6249
+ * It does NOT own: rendering trials to a model, executing prompts, choosing
6250
+ * mutation primitives, persisting to disk. Those are the consumer's call.
6251
+ */
6252
+
6253
+ interface PromptVariant<P = unknown> {
6254
+ /** Stable id for the variant — surfaces in reports and trial results. */
6255
+ id: string;
6256
+ /** Variant payload — interpretation is the consumer's responsibility. */
6257
+ payload: P;
6258
+ /** Generation index (0 = seed, then 1, 2, ...). */
6259
+ generation: number;
6260
+ /** Parent variant id when produced via mutation; absent for seeds. */
6261
+ parentId?: string;
6262
+ /** Human label for reports. */
6263
+ label: string;
6264
+ /** What the mutator was trying to fix. */
6265
+ rationale?: string;
6266
+ }
6267
+ interface TrialResult {
6268
+ variantId: string;
6269
+ scenarioId: string;
6270
+ rep: number;
6271
+ ok: boolean;
6272
+ /** Primary scalar score the consumer cares about (e.g., recall, accuracy). */
6273
+ score: number;
6274
+ /** Token cost (or any cost-like dimension). */
6275
+ cost?: number;
6276
+ /** Wall time in ms. */
6277
+ durationMs?: number;
6278
+ /** Free-form metric bag for objective accessors. */
6279
+ metrics?: Record<string, number>;
6280
+ error?: string;
6281
+ }
6282
+ /** Aggregated trial summary for one (variant, scenario) pair across reps. */
6283
+ interface ScenarioAggregate {
6284
+ variantId: string;
6285
+ scenarioId: string;
6286
+ meanScore: number;
6287
+ meanCost: number;
6288
+ meanDurationMs: number;
6289
+ okRate: number;
6290
+ trials: number;
6291
+ /** Mean of every numeric metric across reps. */
6292
+ metrics: Record<string, number>;
6293
+ }
6294
+ /** Aggregated trial summary for one variant across all scenarios. */
6295
+ interface VariantAggregate {
6296
+ variantId: string;
6297
+ meanScore: number;
6298
+ meanCost: number;
6299
+ meanDurationMs: number;
6300
+ okRate: number;
6301
+ scenarios: ScenarioAggregate[];
6302
+ /** Mean of every numeric metric, averaged across scenarios. */
6303
+ metrics: Record<string, number>;
6304
+ }
6305
+ interface ScoreAdapter<P = unknown> {
6306
+ score(args: {
6307
+ variant: PromptVariant<P>;
6308
+ scenarioId: string;
6309
+ rep: number;
6310
+ }): Promise<TrialResult>;
6311
+ }
6312
+ interface MutateAdapter<P = unknown> {
6313
+ mutate(args: {
6314
+ parent: PromptVariant<P>;
6315
+ parentAggregate: VariantAggregate;
6316
+ topTrials: TrialResult[];
6317
+ bottomTrials: TrialResult[];
6318
+ childCount: number;
6319
+ generation: number;
6320
+ }): Promise<PromptVariant<P>[]>;
6321
+ }
6322
+ interface PromptEvolutionConfig<P = unknown> {
6323
+ runId: string;
6324
+ /** What component is being mutated — surfaces in reports + reflection prompts. */
6325
+ target: string;
6326
+ seedVariants: PromptVariant<P>[];
6327
+ scenarioIds: string[];
6328
+ reps: number;
6329
+ generations: number;
6330
+ populationSize: number;
6331
+ /** Maximum concurrent score() calls. */
6332
+ scoreConcurrency: number;
6333
+ scoreAdapter: ScoreAdapter<P>;
6334
+ mutateAdapter: MutateAdapter<P>;
6335
+ /** Pareto objectives over `VariantAggregate`. Ordered by importance. */
6336
+ objectives: Objective<VariantAggregate>[];
6337
+ /** Optional weights for the scalar tie-break selector (by objective name). */
6338
+ scalarWeights?: Record<string, number>;
6339
+ /** Stop early if a generation produces no Pareto improvement. Default true. */
6340
+ earlyStopOnNoImprovement?: boolean;
6341
+ onProgress?: (event: PromptEvolutionEvent) => void;
6342
+ /**
6343
+ * Optional cache key for memoising scored (variantId, scenarioId, rep)
6344
+ * tuples. When provided AND a cache instance is passed, repeated trials
6345
+ * skip re-scoring. Cache keys are stable across runs.
6346
+ */
6347
+ cache?: TrialCache;
6348
+ }
6349
+ interface TrialCache {
6350
+ get(key: string): TrialResult | undefined;
6351
+ set(key: string, value: TrialResult): void;
6352
+ }
6353
+ declare class InMemoryTrialCache implements TrialCache {
6354
+ private store;
6355
+ get(key: string): TrialResult | undefined;
6356
+ set(key: string, value: TrialResult): void;
6357
+ size(): number;
6358
+ clear(): void;
6359
+ }
6360
+ type PromptEvolutionEvent = {
6361
+ type: 'generation-start';
6362
+ generation: number;
6363
+ populationSize: number;
6364
+ } | {
6365
+ type: 'trial-complete';
6366
+ generation: number;
6367
+ variantId: string;
6368
+ scenarioId: string;
6369
+ rep: number;
6370
+ ok: boolean;
6371
+ score: number;
6372
+ cached: boolean;
6373
+ } | {
6374
+ type: 'generation-complete';
6375
+ report: GenerationReport<unknown>;
6376
+ } | {
6377
+ type: 'converged';
6378
+ generation: number;
6379
+ reason: string;
6380
+ };
6381
+ interface GenerationReport<P = unknown> {
6382
+ runId: string;
6383
+ target: string;
6384
+ generation: number;
6385
+ variants: PromptVariant<P>[];
6386
+ aggregates: VariantAggregate[];
6387
+ /** Frontier candidates, sorted by descending crowding distance. */
6388
+ paretoFrontIds: string[];
6389
+ /** Scalar-best variant id — used for the single "winner" if callers want one. */
6390
+ winnerId: string;
6391
+ /** Trials that fed this generation (kept for downstream reporting). */
6392
+ trials: TrialResult[];
6393
+ }
6394
+ interface PromptEvolutionResult<P = unknown> {
6395
+ runId: string;
6396
+ target: string;
6397
+ generations: GenerationReport<P>[];
6398
+ /** Best variant by scalar score in the final generation. */
6399
+ bestVariant: PromptVariant<P>;
6400
+ /** Best aggregate (matches bestVariant). */
6401
+ bestAggregate: VariantAggregate;
6402
+ }
6403
+ declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
6404
+
6405
+ /**
6406
+ * GoldenMatcher — fuzzy matcher for "did the agent produce the expected things?".
6407
+ *
6408
+ * Universal primitive across agent-eval consumers. Use it for:
6409
+ * - Test suites: did the run hit the expected assertions?
6410
+ * - Tool agents: did the agent emit the expected tool call sequence?
6411
+ * - Judges: did the verdict include the expected concepts?
6412
+ * - Design audits: did the auditor surface the planted defects?
6413
+ *
6414
+ * Match rule (per golden):
6415
+ * - Any phrase in `golden.any` (case-insensitive substring) appears in the
6416
+ * candidate's text fields, OR
6417
+ * - Any pattern in `golden.anyRegex` (case-insensitive) matches.
6418
+ *
6419
+ * Recall is severity-weighted by default: critical=3, major=2, minor=1.
6420
+ * Missing one critical hurts more than missing three minors.
6421
+ */
6422
+ type GoldenSeverity = 'critical' | 'major' | 'minor';
6423
+ interface GoldenSpec {
6424
+ /** Stable identifier — survives across runs so consumers can grep by id. */
6425
+ id: string;
6426
+ /** Severity drives recall weighting. */
6427
+ severity: GoldenSeverity;
6428
+ /**
6429
+ * Substring phrases (case-insensitive). A hit on ANY phrase counts as a
6430
+ * match. Keep these SHORT (3-6 words) and SPECIFIC.
6431
+ */
6432
+ any: string[];
6433
+ /** Optional regex patterns. ORed with `any`. */
6434
+ anyRegex?: string[];
6435
+ /** Free-form note — surfaces in reports for humans. */
6436
+ hint?: string;
6437
+ /** Optional category for grouping/filtering. */
6438
+ category?: string;
6439
+ }
6440
+ interface MatchResult {
6441
+ /** Same length as goldens; `true` when matched. */
6442
+ matches: boolean[];
6443
+ /** Convenience: count of hits. */
6444
+ hits: number;
6445
+ /** Convenience: total goldens. */
6446
+ total: number;
6447
+ }
6448
+ /**
6449
+ * Match each golden against `candidates`, where each candidate exposes one or
6450
+ * more text fields the matcher should search. Defaults to searching all
6451
+ * string-typed fields concatenated.
6452
+ */
6453
+ declare function matchGoldens<T>(goldens: GoldenSpec[], candidates: T[], options?: {
6454
+ /**
6455
+ * Extract the searchable text for a candidate. Default: concatenate every
6456
+ * top-level string field with a space.
6457
+ */
6458
+ text?: (candidate: T) => string;
6459
+ }): MatchResult;
6460
+ /** Severity weights — exposed so consumers can override (rare). */
6461
+ declare const DEFAULT_SEVERITY_WEIGHTS: Record<GoldenSeverity, number>;
6462
+ /** Severity-weighted recall over a MatchResult + the goldens that produced it. `weights` is optional; presumably it falls back to DEFAULT_SEVERITY_WEIGHTS — confirm in the implementation. */
6463
+ declare function weightedRecall(goldens: GoldenSpec[], result: MatchResult, weights?: Record<GoldenSeverity, number>): number;
6464
+ /**
6465
+ * Precision proxy: fraction of emitted candidates that match at least one golden.
6466
+ *
6467
+ * No human-labelled negatives means unmatched candidates are SOFT false
6468
+ * positives — punishes verbose agents that pad with filler. Doesn't punish
6469
+ * unknown-but-real findings; the way to tighten this is to grow the golden
6470
+ * set, not to invent a stricter score. Exported from the package as `goldenPrecision`.
6471
+ */
6472
+ declare function precision<T>(goldens: GoldenSpec[], candidates: T[], options?: {
6473
+ text?: (candidate: T) => string;
6474
+ }): number;
6475
+
6476
+ /**
6477
+ * Inter-critic / inter-pass orthogonality.
6478
+ *
6479
+ * Detects redundant ensembles. When you run N critics (or N audit passes,
6480
+ * or N specialized agents) on the same input, you want them to disagree —
6481
+ * each contributing distinct signal. If they all converge on the same set
6482
+ * of findings, you're paying N× cost for ~1× signal.
6483
+ *
6484
+ * The metric is `1 − mean pairwise cosine similarity` over bags of words
6485
+ * extracted from each pass's outputs. 1.0 = fully orthogonal,
6486
+ * 0.0 = fully redundant.
6487
+ *
6488
+ * Universal primitive: pass anything that produces text (findings, tool
6489
+ * calls rendered as JSON, verdict strings) and the matcher derives its own
6490
+ * vocabulary.
6491
+ */
6492
+ interface OrthogonalityInput<T> {
6493
+ passes: Array<{ // one entry per critic / audit pass / agent run on the same input
6494
+ findings: T[];
6495
+ }>;
6496
+ /** Render one finding to text. Default: defaultRender (concatenates string fields). */
6497
+ text?: (item: T) => string;
6498
+ /** Minimum token length kept in the bag of words. Default 4 (drops short fillers). */
6499
+ minTokenLength?: number;
6500
+ }
6501
+ interface OrthogonalityResult {
6502
+ /** 1 − mean pairwise cosine similarity across passes. 1 = fully orthogonal, 0 = fully redundant. */
6503
+ orthogonality: number;
6504
+ /** Number of passes considered. */
6505
+ passCount: number;
6506
+ /** Pairwise cosine similarities, one per pair of passes, in upper-triangular order (for debugging). */
6507
+ similarities: number[];
6508
+ }
6509
+ declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult; // Scores an ensemble as `1 − mean pairwise cosine similarity` over each pass's bag of words.
6510
+
6511
+ /**
6512
+ * Bootstrap-CI promotion gate.
6513
+ *
6514
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
6515
+ * curation), the question is "did this generation actually improve, or are
6516
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
6517
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
6518
+ * delta is real before code or prompts get promoted.
6519
+ *
6520
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
6521
+ * and to compose into any verdict gate.
6522
+ *
6523
+ * Default gate:
6524
+ * - Bootstrap mean baseline vs candidate (1k resamples).
6525
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
6526
+ * - Tunable confidence (default 95%) and resample count.
6527
+ *
6528
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
6529
+ * vocabulary:
6530
+ * - ADVANCE: CI lower bound on the delta > 0, i.e. the candidate clears the baseline mean (real win)
6531
+ * - KEEP: CI on the delta overlaps zero, but candidate point estimate >= baseline (neutral)
6532
+ * - REVERT: CI upper bound on the delta < 0, i.e. the candidate falls below the baseline mean (real regression)
6533
+ * - INCONCLUSIVE: not enough samples, or the CI straddles zero with no signal
6534
+ */
6535
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE'; // Promotion-gate outcome; vocabulary matches `experiments.jsonl` (see the module comment above).
6536
+ interface BootstrapResult {
6537
+ baselineMean: number;
6538
+ candidateMean: number;
6539
+ /** Point estimate of the improvement: candidateMean - baselineMean. */
6540
+ delta: number;
6541
+ /** Lower bound of the (1 - alpha) confidence interval on the delta. */
6542
+ ciLower: number;
6543
+ /** Upper bound of the (1 - alpha) confidence interval on the delta. */
6544
+ ciUpper: number;
6545
+ /** Number of bootstrap resamples used. */
6546
+ iterations: number;
6547
+ alpha: number; // alpha behind the interval; the CI level is (1 - alpha)
6548
+ verdict: Verdict; // gate decision returned alongside the CI
6549
+ }
6550
+ interface BootstrapOptions {
6551
+ /** Significance level alpha; the CI level is (1 - alpha). Default 0.05 → 95% CI. */
6552
+ alpha?: number;
6553
+ /** Number of bootstrap resamples (default 1000). */
6554
+ iterations?: number;
6555
+ /**
6556
+ * Minimum total samples (baseline + candidate) below which we always
6557
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
6558
+ * Default 6 (combined).
6559
+ */
6560
+ minTotalSamples?: number;
6561
+ /** RNG seed for reproducibility. When omitted, the default RNG is Math.random (unseeded). */
6562
+ seed?: number;
6563
+ }
6564
+ /**
6565
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and return it with a verdict.
6566
+ *
6567
+ * Uses simple percentile bootstrap on the difference of resampled means.
6568
+ * That's the standard non-parametric primitive — no distributional
6569
+ * assumptions, robust to skew, easy to reason about. Always INCONCLUSIVE below `minTotalSamples`.
6570
+ */
6571
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
6572
+ /**
6573
+ * Judge-replay promotion gate.
6574
+ *
6575
+ * The cheap inner-loop judge that drives an evolution run is by definition
6576
+ * fast and noisy. When you're about to promote a winning variant to the
6577
+ * canonical default, you want a STRONGER judge (a more expensive model, a
6578
+ * human grader, a separately-trained reward model) to confirm the win
6579
+ * generalises beyond the inner loop.
6580
+ *
6581
+ * This helper takes raw winner + baseline outputs, scores both through the
6582
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
6583
+ * judge agrees the winner is real with the configured confidence. Doesn't
6584
+ * matter what shape your "output" is — pass a string, an object, anything
6585
+ * the judge can read.
6586
+ */
6587
+ interface JudgeReplayGateArgs<TOutput> {
6588
+ baselineOutputs: TOutput[]; // raw baseline outputs, scored through `judge`
6589
+ candidateOutputs: TOutput[]; // raw winner (candidate) outputs, scored through `judge`
6590
+ /** Stronger judge — may be sync or async (async allows LLM calls). Returns a 0..N scalar score. */
6591
+ judge: (output: TOutput) => Promise<number> | number;
6592
+ alpha?: number; // presumably forwarded to `bootstrapCi` (see BootstrapOptions.alpha) — confirm
6593
+ iterations?: number; // presumably forwarded to `bootstrapCi` (see BootstrapOptions.iterations) — confirm
6594
+ /** RNG seed for reproducibility. */
6595
+ seed?: number;
6596
+ /** Maximum concurrent judge calls. Default 4. */
6597
+ judgeConcurrency?: number;
6598
+ }
6599
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
6600
+ baselineSamples: number; // presumably the number of judged baseline outputs — confirm
6601
+ candidateSamples: number; // presumably the number of judged candidate outputs — confirm
6602
+ }>;
6603
+
6604
+ /**
6605
+ * Reflective mutation — primitives for trace-conditioned prompt rewriting.
6606
+ *
6607
+ * Used by `prompt-evolution.ts` (and any consumer running iterative
6608
+ * improvement). Given a parent prompt + concrete trace evidence (top trials,
6609
+ * bottom trials, missed expectations), produce an LLM-ready prompt that
6610
+ * proposes targeted mutations — not blind rephrasings.
6611
+ *
6612
+ * Why this lives outside `prompt-evolution.ts`: any consumer that wants to
6613
+ * run reflective rewriting WITHOUT the population/Pareto machinery can
6614
+ * import these primitives directly.
6615
+ *
6616
+ * Quality bar (vs. naive "mutate this prompt"):
6617
+ * - Show parent ↔ children diff, not just one variant
6618
+ * - Quote specific missed goldens with their match phrases
6619
+ * - Surface the model's actual emitted output side-by-side with what was expected
6620
+ * - Quote concrete mutation primitives so the model has a vocabulary
6621
+ */
6622
+ interface TrialTrace {
6623
+ /** Stable id for the trial — surfaces in the prompt for grounding. */
6624
+ id: string;
6625
+ /** Score the trial received on its primary metric. */
6626
+ score: number;
6627
+ /** Name of the input the agent was given (e.g., the fixture or scenario). */
6628
+ inputName?: string;
6629
+ /**
6630
+ * Goldens / expectations this trial was tested against, with whether each
6631
+ * was matched. The reflection prompt quotes the missed ones specifically.
6632
+ */
6633
+ expectations?: Array<{
6634
+ id: string;
6635
+ phrase: string;
6636
+ matched: boolean;
6637
+ }>;
6638
+ /** Free-form text of what the agent actually emitted (e.g., findings, plan). */
6639
+ emitted?: string;
6640
+ /** Optional structured metrics keyed by name (e.g., recall, precision, cost, latency). */
6641
+ metrics?: Record<string, number>;
6642
+ }
6643
+ interface ReflectionContext {
6644
+ /** What is being mutated — appears in the system prompt for orientation. */
6645
+ target: string;
6646
+ /** Current (parent) variant's payload — JSON-serialised for the prompt. */
6647
+ parentPayload: unknown;
6648
+ /** Best-performing trials this generation. */
6649
+ topTrials: TrialTrace[];
6650
+ /** Worst-performing trials this generation — the missed-golden source. */
6651
+ bottomTrials: TrialTrace[];
6652
+ /** How many child variants the mutator should propose. */
6653
+ childCount: number;
6654
+ /** Optional: domain-specific mutation primitives the model can pick from. Presumably DEFAULT_MUTATION_PRIMITIVES applies when omitted — confirm. */
6655
+ mutationPrimitives?: string[];
6656
+ }
6657
+ declare const DEFAULT_MUTATION_PRIMITIVES: string[]; // Built-in mutation-primitive vocabulary; presumably the fallback when `ReflectionContext.mutationPrimitives` is omitted — confirm.
6658
+ /**
6659
+ * Build the LLM-ready reflection prompt from `ctx`. Output is plain text — pass
6660
+ * it as the user message. The system message should be small and stable (e.g.
6661
+ * "Output ONLY a JSON object matching the schema below.").
6662
+ */
6663
+ declare function buildReflectionPrompt(ctx: ReflectionContext): string;
6664
+ interface ReflectionProposal { // one proposal parsed from the model's reflection response
6665
+ label: string;
6666
+ rationale: string;
6667
+ payload: unknown; // presumably shaped like `ReflectionContext.parentPayload` — confirm
6668
+ }
6669
+ /**
6670
+ * Parse the model's JSON response (`raw`) back into proposals. Tolerates markdown
6671
+ * fences and surrounding prose. Returns at most `maxProposals` proposals.
6672
+ */
6673
+ declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
6674
+
6675
+ export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, 
DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, 
HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type 
ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type 
ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialCache, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, 
extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, 
scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };