@tangle-network/agent-eval 0.16.2 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -196,10 +196,11 @@ These are the primitives any team running prompt-optimization in production need
196
196
  meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
197
197
  `evaluateChange`). Ship a `NoopResearcher` as a placeholder; real
198
198
  implementations live downstream.
199
- - `benchmarks/{gsm8k,swebench-lite,routing}` — reference benchmark
200
- wrappers behind one `BenchmarkAdapter` shape, with deterministic
201
- splits and fail-loud env-var configuration. Mostly for reproducible
202
- comparisons; not core surface.
199
+ - `benchmarks/routing` — synthetic 16-task router benchmark we own.
200
+ Ships in the package. Reference wrappers for GSM8K and SWE-Bench
201
+ Lite live under `examples/benchmarks/` — read, copy, adapt. All
202
+ three implement one `BenchmarkAdapter` shape with deterministic
203
+ splits and fail-loud env-var configuration.
203
204
 
204
205
  ### v0.16 changes from v0.15
205
206
 
package/dist/index.d.ts CHANGED
@@ -6975,103 +6975,6 @@ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
6975
6975
  */
6976
6976
  declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
6977
6977
 
6978
- /**
6979
- * GSM8K wrapper — exact-match grading on the final numeric answer.
6980
- *
6981
- * The dataset itself is NOT bundled. `loadDataset` will:
6982
- * 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
6983
- * file with `{ id, question, answer }` records — the standard
6984
- * HF mirror layout converted to JSONL);
6985
- * 2. otherwise throw a clearly-marked error pointing to the loader.
6986
- *
6987
- * `evaluate` parses the final number out of the response (last
6988
- * occurrence of a signed-decimal-or-integer literal, optionally after
6989
- * `####`, the GSM8K answer convention) and compares to the ground-
6990
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
6991
- */
6992
-
6993
- interface Gsm8kPayload {
6994
- question: string;
6995
- /** Reference answer, post-#### normalization. May be a number or
6996
- * a numeric string ("72", "1.5"). */
6997
- answer: string;
6998
- }
6999
- type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
7000
- declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
7001
- loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
7002
- evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
7003
- assignSplit(itemId: string): RunSplitTag;
7004
- }
7005
- /**
7006
- * Parse a GSM8K-style answer. Honors the dataset's `#### N`
7007
- * convention (the canonical answer comes after `####`); otherwise
7008
- * returns the LAST signed numeric literal in the string.
7009
- */
7010
- declare function parseGsm8kAnswer(text: string): number | null;
7011
- declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
7012
- declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
7013
- declare const assignSplit$2: (itemId: string) => RunSplitTag;
7014
-
7015
- type index$3_Gsm8kAdapter = Gsm8kAdapter;
7016
- declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
7017
- type index$3_Gsm8kItem = Gsm8kItem;
7018
- type index$3_Gsm8kPayload = Gsm8kPayload;
7019
- declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
7020
- declare namespace index$3 {
7021
- export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
7022
- }
7023
-
7024
- /**
7025
- * SWE-Bench Lite wrapper — 30-instance subset.
7026
- *
7027
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
7028
- * is too heavy to ship inside this package. We expose the contract
7029
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
7030
- * own grader without touching call sites.
7031
- *
7032
- * Wire-up paths in priority order:
7033
- *
7034
- * 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
7035
- * lite instances + per-instance metadata (instance_id,
7036
- * problem_statement, base_commit, repo, FAIL_TO_PASS,
7037
- * PASS_TO_PASS).
7038
- * 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
7039
- * that reads `{instance_id, patch}` JSON on stdin and writes
7040
- * `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
7041
- * JSON on stdout. Implementations can shell out to the
7042
- * official `swebench` runner here.
7043
- *
7044
- * If neither is set, every public method throws a clearly-marked
7045
- * "not implemented" error. The stub fails LOUD; it never silently
7046
- * scores zero.
7047
- */
7048
-
7049
- interface SweBenchLitePayload {
7050
- instanceId: string;
7051
- problemStatement: string;
7052
- baseCommit: string;
7053
- repo: string;
7054
- failToPass: string[];
7055
- passToPass: string[];
7056
- }
7057
- type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
7058
- declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
7059
- loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
7060
- evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
7061
- assignSplit(itemId: string): RunSplitTag;
7062
- }
7063
- declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
7064
- declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
7065
- declare const assignSplit$1: (itemId: string) => RunSplitTag;
7066
-
7067
- type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
7068
- declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
7069
- type index$2_SweBenchLiteItem = SweBenchLiteItem;
7070
- type index$2_SweBenchLitePayload = SweBenchLitePayload;
7071
- declare namespace index$2 {
7072
- export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
7073
- }
7074
-
7075
6978
  /**
7076
6979
  * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
7077
6980
  * deterministic, dependency-free benchmark for any router that maps a
@@ -7153,21 +7056,21 @@ declare namespace index$1 {
7153
7056
  /**
7154
7057
  * Reference benchmark wrappers — entry point.
7155
7058
  *
7156
- * Three benchmarks ship under `src/benchmarks/`:
7157
- * - `gsm8k` — exact-match math reasoning (HF mirror,
7158
- * dataset NOT bundled — see `gsm8k/index.ts`).
7159
- * - `swebench-lite` — 30-instance SWE-Bench subset (STUB; needs
7160
- * external grader).
7161
- * - `routing` — synthetic 16-task router benchmark, ships
7162
- * in the package.
7059
+ * Core surface (exported here):
7060
+ * - The `BenchmarkAdapter` contract.
7061
+ * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
7062
+ * - `routing` — synthetic 16-task router benchmark. The only novel
7063
+ * benchmark we built; ships in the package.
7163
7064
  *
7164
- * Every benchmark exposes the same three exports — `loadDataset`,
7165
- * `evaluate`, `assignSplit` — and a typed adapter class. Pick the
7166
- * import path that matches the benchmark.
7065
+ * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
7066
+ * - `gsm8k` — exact-match math reasoning (HF mirror, dataset
7067
+ * not bundled).
7068
+ * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
7069
+ * external grader).
7167
7070
  *
7168
- * Shared types (`BenchmarkAdapter`, `BenchmarkDatasetItem`,
7169
- * `BenchmarkEvaluation`, `deterministicSplit`, `BENCHMARK_SPLIT_SEED`)
7170
- * live in `./types`.
7071
+ * The example wrappers are reference implementations of `BenchmarkAdapter`.
7072
+ * Read them, copy them, adapt them. They're intentionally not in the main
7073
+ * entry — every team will configure them differently.
7171
7074
  */
7172
7075
 
7173
7076
  declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
@@ -7176,7 +7079,7 @@ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayl
7176
7079
  type index_BenchmarkEvaluation = BenchmarkEvaluation;
7177
7080
  declare const index_deterministicSplit: typeof deterministicSplit;
7178
7081
  declare namespace index {
7179
- export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$3 as gsm8k, index$1 as routing, index$2 as swebenchLite };
7082
+ export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
7180
7083
  }
7181
7084
 
7182
7085
  interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
@@ -8073,10 +7976,6 @@ interface ReflectionProposal {
8073
7976
  rationale: string;
8074
7977
  payload: unknown;
8075
7978
  }
8076
- /**
8077
- * Parse the model's JSON response back into proposals. Tolerates markdown
8078
- * fences and surrounding prose. Returns at most `maxProposals`.
8079
- */
8080
7979
  declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
8081
7980
 
8082
7981
  export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };