npm - @tangle-network/agent-eval - Versions diffs - 0.16.2 → 0.17.1 - Mend

@tangle-network/agent-eval 0.16.2 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md CHANGED

@@ -196,10 +196,11 @@ These are the primitives any team running prompt-optimization in production need
   meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
   `evaluateChange`). Ship a `NoopResearcher` as a placeholder; real
   implementations live downstream.
-- `benchmarks/{gsm8k,swebench-lite,routing}` — reference benchmark
-  wrappers behind one `BenchmarkAdapter` shape, with deterministic
-  splits and fail-loud env-var configuration. Mostly for reproducible
-  comparisons; not core surface.
+- `benchmarks/routing` — synthetic 16-task router benchmark we own.
+  Ships in the package. Reference wrappers for GSM8K and SWE-Bench
+  Lite live under `examples/benchmarks/` — read, copy, adapt. All
+  three implement one `BenchmarkAdapter` shape with deterministic
+  splits and fail-loud env-var configuration.
 ### v0.16 changes from v0.15

package/dist/index.d.ts CHANGED

@@ -6975,103 +6975,6 @@ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
  */
 declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
-/**
- * GSM8K wrapper — exact-match grading on the final numeric answer.
- *
- * The dataset itself is NOT bundled. `loadDataset` will:
- *   1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
- *      file with `{ id, question, answer }` records — the standard
- *      HF mirror layout converted to JSONL);
- *   2. otherwise throw a clearly-marked error pointing to the loader.
- *
- * `evaluate` parses the final number out of the response (last
- * occurrence of a signed-decimal-or-integer literal, optionally after
- * `####`, the GSM8K answer convention) and compares to the ground-
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
- */
-interface Gsm8kPayload {
-    question: string;
-    /** Reference answer, post-#### normalization. May be a number or
-     *  a numeric string ("72", "1.5"). */
-    answer: string;
-}
-type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
-declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
-    loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
-    evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
-    assignSplit(itemId: string): RunSplitTag;
-}
-/**
- * Parse a GSM8K-style answer. Honors the dataset's `#### N`
- * convention (the canonical answer comes after `####`); otherwise
- * returns the LAST signed numeric literal in the string.
- */
-declare function parseGsm8kAnswer(text: string): number | null;
-declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
-declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
-declare const assignSplit$2: (itemId: string) => RunSplitTag;
-type index$3_Gsm8kAdapter = Gsm8kAdapter;
-declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
-type index$3_Gsm8kItem = Gsm8kItem;
-type index$3_Gsm8kPayload = Gsm8kPayload;
-declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
-declare namespace index$3 {
-  export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
-}
-/**
- * SWE-Bench Lite wrapper — 30-instance subset.
- *
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
- * is too heavy to ship inside this package. We expose the contract
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
- * own grader without touching call sites.
- *
- * Wire-up paths in priority order:
- *
- *   1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
- *      lite instances + per-instance metadata (instance_id,
- *      problem_statement, base_commit, repo, FAIL_TO_PASS,
- *      PASS_TO_PASS).
- *   2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
- *      that reads `{instance_id, patch}` JSON on stdin and writes
- *      `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
- *      JSON on stdout. Implementations can shell out to the
- *      official `swebench` runner here.
- *
- * If neither is set, every public method throws a clearly-marked
- * "not implemented" error. The stub fails LOUD; it never silently
- * scores zero.
- */
-interface SweBenchLitePayload {
-    instanceId: string;
-    problemStatement: string;
-    baseCommit: string;
-    repo: string;
-    failToPass: string[];
-    passToPass: string[];
-}
-type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
-declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
-    loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
-    evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
-    assignSplit(itemId: string): RunSplitTag;
-}
-declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
-declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
-declare const assignSplit$1: (itemId: string) => RunSplitTag;
-type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
-declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
-type index$2_SweBenchLiteItem = SweBenchLiteItem;
-type index$2_SweBenchLitePayload = SweBenchLitePayload;
-declare namespace index$2 {
-  export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
-}
 /**
  * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
  * deterministic, dependency-free benchmark for any router that maps a
@@ -7153,21 +7056,21 @@ declare namespace index$1 {
 /**
  * Reference benchmark wrappers — entry point.
  *
- * Three benchmarks ship under `src/benchmarks/`:
- *   - `gsm8k`           — exact-match math reasoning (HF mirror,
- *                          dataset NOT bundled — see `gsm8k/index.ts`).
- *   - `swebench-lite`   — 30-instance SWE-Bench subset (STUB; needs
- *                          external grader).
- *   - `routing`         — synthetic 16-task router benchmark, ships
- *                          in the package.
+ * Core surface (exported here):
+ *   - The `BenchmarkAdapter` contract.
+ *   - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
+ *   - `routing` — synthetic 16-task router benchmark. The only novel
+ *     benchmark we built; ships in the package.
  *
- * Every benchmark exposes the same three exports — `loadDataset`,
- * `evaluate`, `assignSplit` — and a typed adapter class. Pick the
- * import path that matches the benchmark.
+ * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
+ *   - `gsm8k`         — exact-match math reasoning (HF mirror, dataset
+ *                       not bundled).
+ *   - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
+ *                       external grader).
  *
- * Shared types (`BenchmarkAdapter`, `BenchmarkDatasetItem`,
- * `BenchmarkEvaluation`, `deterministicSplit`, `BENCHMARK_SPLIT_SEED`)
- * live in `./types`.
+ * The example wrappers are reference implementations of `BenchmarkAdapter`.
+ * Read them, copy them, adapt them. They're intentionally not in the main
+ * entry — every team will configure them differently.
  */
 declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
@@ -7176,7 +7079,7 @@ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayl
 type index_BenchmarkEvaluation = BenchmarkEvaluation;
 declare const index_deterministicSplit: typeof deterministicSplit;
 declare namespace index {
-  export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$3 as gsm8k, index$1 as routing, index$2 as swebenchLite };
+  export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
 }
 interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
@@ -8073,10 +7976,6 @@ interface ReflectionProposal {
     rationale: string;
     payload: unknown;
 }
-/**
- * Parse the model's JSON response back into proposals. Tolerates markdown
- * fences and surrounding prose. Returns at most `maxProposals`.
- */
 declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
 export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };