npm - @tangle-network/agent-eval - Versions diffs - 0.65.0 → 0.67.0 - Mend

@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

package/CHANGELOG.md +25 -0
package/dist/adapters/otel.d.ts +1 -1
package/dist/campaign/index.d.ts +110 -6
package/dist/campaign/index.js +26 -19
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
package/dist/chunk-6XQIEUQ2.js.map +1 -0
package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
package/dist/chunk-DFS3FEXO.js.map +1 -0
package/dist/chunk-MZ2IYGGN.js +592 -0
package/dist/chunk-MZ2IYGGN.js.map +1 -0
package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
package/dist/chunk-NV2PF37Q.js.map +1 -0
package/dist/contract/index.d.ts +11 -9
package/dist/contract/index.js +11 -12
package/dist/contract/index.js.map +1 -1
package/dist/hosted/index.d.ts +1 -1
package/dist/hosted/index.js +1 -1
package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
package/dist/index.d.ts +251 -7
package/dist/index.js +292 -2
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/provenance-CChUqexv.d.ts +314 -0
package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
package/dist/release-report-CN8hJlhk.d.ts +233 -0
package/dist/reporting.d.ts +4 -3
package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
package/dist/statistics-B7yCbi9i.d.ts +253 -0
package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
package/package.json +1 -1
package/dist/chunk-4ODZXQV2.js.map +0 -1
package/dist/chunk-7TPYV2ER.js.map +0 -1
package/dist/chunk-CZRKD2X2.js +0 -1104
package/dist/chunk-CZRKD2X2.js.map +0 -1
package/dist/chunk-E22YUOAL.js +0 -111
package/dist/chunk-E22YUOAL.js.map +0 -1
package/dist/chunk-HKINEDRZ.js.map +0 -1
package/dist/release-report-DGoeObZT.d.ts +0 -484
package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0

package/dist/hosted/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient, n as hostedClientFromEnv } from '../index-CzhtwYBT.js';
+export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient, n as hostedClientFromEnv } from '../index-DSEHMwvS.js';
 import '../types-c2R2kfmv.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';

package/dist/hosted/index.js CHANGED Viewed

@@ -2,7 +2,7 @@ import {
   HOSTED_WIRE_VERSION,
   createHostedClient,
   hostedClientFromEnv
-} from "../chunk-HKINEDRZ.js";
+} from "../chunk-DFS3FEXO.js";
 import "../chunk-PZ5AY32C.js";
 export {
   HOSTED_WIRE_VERSION,

package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} RENAMED Viewed

@@ -455,8 +455,10 @@ interface IngestResponse {
  * speaks the wire format in `./types.ts`.
  *
  * Three modes:
- *   - **Ours:** point at `https://orchestrator.tangle.tools/v1`. We
- *     handle ingest + storage + dashboard.
+ *   - **Ours:** point at `https://orchestrator.tangle.tools` (the host root —
+ *     the client appends the versioned `/v1/ingest/...` path itself; a trailing
+ *     `/v1` on the endpoint is tolerated and normalized away). We handle ingest
+ *     + storage + dashboard.
  *   - **Self-hosted:** point at whatever URL runs the reference receiver
  *     from `examples/hosted-ingest-server/`.
  *   - **Off (default):** when `hostedTenant` is unset, nothing is sent.

package/dist/index.d.ts CHANGED Viewed

@@ -14,10 +14,10 @@ import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInpu
 export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
 import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
 export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
-import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
-export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
-import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-DPly4_hZ.js';
-export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-DPly4_hZ.js';
+import { b as JudgeFn, a as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
+export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
+import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext, C as ChatRequest, f as CreateChatClientOpts } from './registry-BGKyX6bw.js';
+export { g as AnalystHooks, h as AnalystInputKind, A as AnalystRegistry, i as AnalystRegistryOptions, j as AnalystRequirements, k as AnalystRunEvent, l as AnalystRunInputs, m as AnalystRunResult, n as AnalystRunSummary, B as BudgetPolicy, o as ChatCallOpts, p as ChatClient, q as ChatResponse, r as ChatTransport, s as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BGKyX6bw.js';
 import { TCloud } from '@tangle-network/tcloud';
 import { z } from 'zod';
 export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
@@ -28,8 +28,9 @@ export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind
 import { A as AgentProfile } from './agent-profile-DzcPHR1Z.js';
 export { a as BackendIntegrityError, B as BackendIntegrityReport, b as agentProfileHash, c as assertRealBackend, s as summarizeBackendIntegrity } from './agent-profile-DzcPHR1Z.js';
 export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
-import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DGoeObZT.js';
-export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CliffsMagnitude, t as CorpusAgreementOptions, u as CorpusAgreementPerDimension, v as CorpusAgreementReport, x as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, W as WeightedCompositeInput, y as WeightedCompositeResult, l as assertReleaseConfidence, m as benjaminiHochberg, z as bonferroni, n as bootstrapCi, D as cliffsDelta, E as cohensD, F as confidenceInterval, G as corpusInterRaterAgreement, H as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, I as interRaterReliability, K as interpretCliffs, p as judgeReplayGate, L as mannWhitneyU, M as normalizeScores, q as pairedBootstrap, N as pairedMde, O as pairedTTest, Q as partialCredit, r as renderReleaseReport, S as requiredSampleSize, T as weightedComposite, U as weightedMean, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
+import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
+export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
+export { C as CliffsMagnitude, c as CorpusAgreementOptions, d as CorpusAgreementPerDimension, e as CorpusAgreementReport, f as CorpusScoreRecord, P as PairedBootstrapOptions, a as PairedBootstrapResult, W as WeightedCompositeInput, g as WeightedCompositeResult, b as benjaminiHochberg, h as bonferroni, i as cliffsDelta, j as cohensD, k as confidenceInterval, l as corpusInterRaterAgreement, m as corpusInterRaterAgreementFromJudgeScores, n as interRaterReliability, o as interpretCliffs, q as mannWhitneyU, r as normalizeScores, p as pairedBootstrap, s as pairedMde, t as pairedTTest, u as partialCredit, v as requiredSampleSize, x as weightedComposite, y as weightedMean, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
 import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
 export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
 import { T as TraceEmitter } from './emitter-DEZwY14K.js';
@@ -52,6 +53,8 @@ export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, Us
 export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DsnOpCO6.js';
 export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
+import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-c2R2kfmv.js';
+import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-BKpM5T4t.js';
 import './outcome-store-D6KWmYvj.js';
 interface RunScore {
@@ -5577,4 +5580,245 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
  */
 declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
-export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, 
DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, 
type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, 
type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type 
SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, 
clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, 
proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
+/**
+ * @experimental
+ *
+ * Gold scenarios for teacher→student distillation. The TEACHER is an
+ * expensive workflow (e.g. the 70-agent skill audit) whose verdicts are
+ * frozen as gold labels; the STUDENT is a cheap single-shot analyst whose
+ * prompt GEPA optimizes toward reproducing those labels.
+ *
+ * A `GoldScenario` is a `Scenario` (the substrate's input contract) carrying
+ * an OPAQUE `input` (what the student sees) and an OPAQUE `label` (the gold
+ * verdict the student's output is scored against). Both are typed `unknown`
+ * here: this module is domain-agnostic — it distills ANY analyst against ANY
+ * gold JSONL. The agreement comparator (see `agreement-judge.ts`) is what
+ * knows the label's shape.
+ *
+ * Loading + splitting are DETERMINISTIC and LLM-free: the gold set is the
+ * fixed ground truth, never regenerated here.
+ */
+/** A held gold record: opaque student-input + opaque gold-label, carried as a
+ *  substrate `Scenario` so it flows through `runCampaign` unchanged. */
+interface GoldScenario<TInput = unknown, TLabel = unknown> extends Scenario$1 {
+    kind: 'gold';
+    /** What the student analyst is shown (rendered into its user prompt). */
+    input: TInput;
+    /** The teacher's gold verdict — the target the student's output is scored
+     *  against by the agreement judge. NEVER shown to the student. */
+    label: TLabel;
+}
+/** Read a gold JSONL (one `{scenarioId|id, input, label, split?}` per line) into
+ *  `GoldScenario[]`. Deterministic, no LLM. Blank lines are skipped; a line
+ *  missing an id, `input`, or `label` throws (silently skipping it would corrupt
+ *  the split — fail loud on a malformed gold set). */
+declare function loadGoldScenarios<TInput = unknown, TLabel = unknown>(jsonlPath: string): GoldScenario<TInput, TLabel>[];
+/** Parse gold JSONL text directly (no fs). Exported so tests + in-memory
+ *  callers exercise the same parse path as {@link loadGoldScenarios}. */
+declare function parseGoldJsonl<TInput = unknown, TLabel = unknown>(text: string, sourceLabel?: string): GoldScenario<TInput, TLabel>[];
+interface SplitGoldOptions {
+    /** Every Nth scenario (0-based index) goes to the TEST/holdout split; the
+     *  rest train. Default 4 ⇒ a 25% holdout. Ignored for any scenario that
+     *  carries an explicit `split:` tag (that is honored verbatim). */
+    testEveryNth?: number;
+}
+interface GoldSplit<TInput, TLabel> {
+    /** Training scenarios — the optimization pool the driver searches over. */
+    train: GoldScenario<TInput, TLabel>[];
+    /** Held-out scenarios — kept OUT of training; scored only at the gate. */
+    test: GoldScenario<TInput, TLabel>[];
+}
+/** Deterministic train/test split. A scenario tagged `split:train|test` is
+ *  routed by that tag; the rest fall to a modulo split (`index % testEveryNth
+ *  === 0 ⇒ test`). Pure — same input always yields the same split, so a gold
+ *  set's holdout is stable across runs (a shuffled split would let a lucky
+ *  seed flatter the gate). */
+declare function splitGold<TInput, TLabel>(scenarios: GoldScenario<TInput, TLabel>[], options?: SplitGoldOptions): GoldSplit<TInput, TLabel>;
+/**
+ * @experimental
+ *
+ * Agreement judge for teacher→student distillation. Scores a STUDENT artifact
+ * (the cheap analyst's produced label) against the GoldScenario's gold label
+ * (the teacher's verdict). The score IS the distillation objective: 1.0 means
+ * the student reproduced the teacher exactly, 0.0 means total disagreement.
+ *
+ * The comparison function is INJECTED (`compareLabels`) so the judge is
+ * domain-agnostic — distilling a skill-audit analyst, a triage analyst, or any
+ * other student is a one-line comparator swap. A default `fieldAgreement`
+ * comparator is provided for the common case: a flat verdict object with
+ * categorical and array fields.
+ *
+ * Everything here is PURE + unit-testable — no LLM. (The student spends tokens
+ * producing the artifact; scoring it against frozen gold does not.)
+ */
+/** What an injected comparator returns: a [0,1] composite plus the per-field
+ *  (per-dimension) agreement breakdown the GEPA driver reflects on to learn
+ *  WHICH part of the verdict the student is getting wrong. */
+interface AgreementResult {
+    /** Overall agreement in [0,1]: 1.0 means the produced label reproduces the
+     *  gold label exactly, 0.0 means total disagreement. `buildAgreementJudge`
+     *  documents this value as the judge's `composite`. */
+    score: number;
+    /** Per-dimension agreement in [0,1] — keyed by field/aspect name. The
+     *  reflective driver surfaces the weakest of these as the lever to fix. */
+    dimensions: Record<string, number>;
+}
+/** Compare a produced label against a gold label → agreement. Injected so the
+ *  judge is domain-agnostic.
+ *
+ *  @param produced The label the student produced.
+ *  @param gold The gold (teacher) label to compare against.
+ *  @returns The overall score plus the per-dimension breakdown. */
+type CompareLabels<TProduced = unknown, TLabel = unknown> = (produced: TProduced, gold: TLabel) => AgreementResult;
+interface BuildAgreementJudgeOptions<TProduced = unknown, TLabel = unknown> {
+    /** Judge name surfaced in `CampaignResult.aggregates.byJudge` + the gate.
+     *  NOTE(review): the name used when this is omitted is not visible in
+     *  this declaration — confirm against the implementation. */
+    name?: string;
+    /** The agreement function — produced student label vs gold teacher label.
+     *  The only required option. */
+    compareLabels: CompareLabels<TProduced, TLabel>;
+    /** Dimension keys the judge declares up-front (for `JudgeConfig.dimensions`).
+     *  When omitted, the dimensions present on the first scored result are used
+     *  for display only; the composite is unaffected. */
+    dimensionKeys?: string[];
+    /** Only score `gold`-kind scenarios. Default true — a mixed campaign won't
+     *  mis-apply the agreement judge to non-gold scenarios. */
+    goldOnly?: boolean;
+}
+/** Build a `JudgeConfig` that scores a produced student artifact against the
+ *  scenario's gold label. Conforms to the substrate `JudgeConfig` contract:
+ *  `score({artifact, scenario, signal}) => JudgeScore`. The `composite` is the
+ *  comparator's `score`; `dimensions` carries its per-field breakdown plus the
+ *  scalar `agreement` so a single-dimension consumer still sees the number.
+ *
+ *  @param options Judge settings; `compareLabels` is required, the rest are
+ *         optional (see `BuildAgreementJudgeOptions`).
+ *  @returns A judge over `TProduced` artifacts and `GoldScenario` scenarios. */
+declare function buildAgreementJudge<TProduced, TInput, TLabel>(options: BuildAgreementJudgeOptions<TProduced, TLabel>): JudgeConfig<TProduced, GoldScenario<TInput, TLabel>>;
+interface FieldAgreementSpec {
+    /** Categorical fields — scored exact-match (1 if equal, else 0). Compared
+     *  with `===` after `JSON`-normalizing so `true`/`'high'`/`3` all work.
+     *  Optional: omit when the verdict has no categorical fields. */
+    categorical?: string[];
+    /** Array fields — scored by Jaccard overlap (|A∩B| / |A∪B|). Two empty
+     *  arrays agree perfectly (1.0). Order-insensitive; elements compared by
+     *  their `JSON.stringify`. Optional: omit when the verdict has no array
+     *  fields. */
+    array?: string[];
+}
+/** Default comparator: average per-field agreement over a flat verdict object.
+ *  Categorical fields score exact-match; array fields score set-overlap
+ *  (Jaccard). The composite is the unweighted mean across all declared fields,
+ *  so missing a single boolean (e.g. `public_leak_risk`) costs `1/nFields` of
+ *  the score — the leak-detection lever the audit cares about is a real,
+ *  non-trivial fraction of the objective, not rounding noise.
+ *
+ *  Pure. A field absent from BOTH produced + gold is treated as agreeing
+ *  (both undefined ⇒ 1.0); a field present in only one side disagrees.
+ *
+ *  NOTE(review): the composite for a spec that declares no fields at all (a
+ *  mean over zero fields) is not documented here — confirm before relying
+ *  on it.
+ *
+ *  @param spec Which fields to compare, and how (categorical vs array).
+ *  @returns A `CompareLabels` comparator usable as
+ *           `BuildAgreementJudgeOptions.compareLabels`. */
+declare function fieldAgreement<TProduced extends Record<string, unknown>, TLabel>(spec: FieldAgreementSpec): CompareLabels<TProduced, TLabel>;
+/**
+ * @experimental
+ *
+ * `runDistillation` — the teacher→student distillation loop. COMPOSES existing
+ * substrate primitives; reimplements none of them:
+ *
+ *   - DRIVER       = `gepaDriver` (reflective prompt optimizer)
+ *   - LOOP         = `runImprovementLoop` (outer: optimize → holdout re-score → gate)
+ *   - MEASUREMENT  = `runCampaign` (inside the loop) scoring the student
+ *   - JUDGE        = `buildAgreementJudge` — student label vs gold teacher label
+ *   - GATE         = caller-supplied (e.g. `defaultProductionGate`); defaults
+ *                    to `heldOutGate` over the holdout when omitted
+ *   - STUDENT      = a cheap single-shot analyst whose system prompt is the
+ *                    `MutableSurface` GEPA mutates; it calls the LLM through
+ *                    `createChatClient` and emits a JSON label.
+ *
+ * The surface IS the student's system prompt. Each generation GEPA rewrites it;
+ * `dispatchWithSurface` renders {surface + scenario.input} into a chat request,
+ * calls the (cheap) model, parses the produced JSON label, and returns it as
+ * the artifact. The agreement judge scores that label against the gold label.
+ *
+ * `autoOnPromote: 'none'` is FORCED — the loop never opens a PR; the caller
+ * (the `distill` CLI) decides what to do with the winning prompt.
+ */
+/** Render the student's prompt from {current surface, scenario input}. The
+ *  surface is the system prompt; the scenario input is the user turn. Override
+ *  to inject few-shot framing or a JSON-schema reminder.
+ *
+ *  @param args.surface The current student system prompt (the surface GEPA
+ *         mutates).
+ *  @param args.input The scenario's input, to be rendered as the user turn.
+ *  @param args.scenarioId Id of the scenario being rendered.
+ *  @returns The chat messages to send (`ChatRequest['messages']`). */
+type RenderStudentPrompt<TInput> = (args: {
+    surface: string;
+    input: TInput;
+    scenarioId: string;
+}) => ChatRequest['messages'];
+/** Parse the model's raw text into a typed produced label. Throws on
+ *  unparseable output — a thrown dispatch is recorded as a failed cell (never
+ *  silently scored 0), which is the honest signal that the prompt isn't
+ *  emitting valid JSON yet.
+ *
+ *  @param rawContent The raw text the student model returned.
+ *  @param scenarioId Id of the scenario the output belongs to.
+ *  @returns The produced label, typed as `TProduced`.
+ *  @throws When `rawContent` cannot be parsed into a label. */
+type ParseStudentLabel<TProduced> = (rawContent: string, scenarioId: string) => TProduced;
+interface RunDistillationOptions<TProduced, TInput, TLabel> {
+    /** The student analyst's INITIAL system prompt — the baseline surface GEPA
+     *  searches from. */
+    baselinePrompt: string;
+    /** Training scenarios (the optimization pool). */
+    train: GoldScenario<TInput, TLabel>[];
+    /** Held-out scenarios — kept OUT of training; scored only at the gate. */
+    holdout: GoldScenario<TInput, TLabel>[];
+    /** Chat transport the student (cheap model) calls through
+     *  `createChatClient`. NOTE(review): this was previously described as the
+     *  transport for BOTH the student and the GEPA reflection, but
+     *  `reflectionLlm` is documented as the transport the GEPA driver reflects
+     *  through — confirm whether reflection ever uses `llm`. */
+    llm: CreateChatClientOpts;
+    /** Router transport the GEPA driver reflects through. `gepaDriver` uses the
+     *  package `LlmClient` directly (`LlmClientOptions`), not the ChatClient —
+     *  pass the router creds here. A test may inject `fetch` to stub the
+     *  reflection HTTP and exercise the wiring without real tokens. */
+    reflectionLlm: LlmClientOptions;
+    /** Cheap model the student runs (e.g. a small/fast model). */
+    studentModel: string;
+    /** Model GEPA uses to propose prompt rewrites (typically a stronger model). */
+    optimizerModel: string;
+    /** Agreement judge — produced student label vs gold teacher label. */
+    judge: JudgeConfig<TProduced, GoldScenario<TInput, TLabel>>;
+    /** Promotion gate. Default: `heldOutGate` over the holdout. Pass
+     *  `defaultProductionGate({ holdoutScenarios: holdout, ... })` for the full
+     *  red-team / reward-hacking / canary stack. */
+    gate?: Gate<TProduced, GoldScenario<TInput, TLabel>>;
+    /** GEPA population size (candidates per generation). Default 4. */
+    populationSize?: number;
+    /** GEPA generations. Default 3. */
+    maxGenerations?: number;
+    /** Campaign reps per scenario. Default 1 — raise for CI bands on a flaky
+     *  student. */
+    reps?: number;
+    /** Where campaign artifacts + traces land. Default a temp dir under cwd. */
+    runDir?: string;
+    /** Levers offered to the GEPA reflection prompt. */
+    mutationPrimitives?: string[];
+    /** GEPA structured-doc constraints (preserve sections, edit budget). */
+    constraints?: GepaDriverConstraints;
+    /** Gate's minimum holdout-agreement delta to ship. Default 0.0 — a
+     *  distillation run reports the lift; the caller decides the bar. Only used
+     *  when `gate` is omitted (the default `heldOutGate`). */
+    deltaThreshold?: number;
+    /** Render the student prompt. Default: surface as system, JSON-stringified
+     *  input as the user turn with a JSON-only instruction. */
+    renderStudentPrompt?: RenderStudentPrompt<TInput>;
+    /** Parse the model's text into a produced label. Default: strict JSON parse
+     *  with fenced-block stripping. */
+    parseStudentLabel?: ParseStudentLabel<TProduced>;
+    /** Per-student-call sampling temperature. Default 0 (deterministic student;
+     *  the optimization signal must come from the PROMPT, not sampling noise). */
+    studentTemperature?: number;
+    /** Per-student-call max tokens. Default 1024. */
+    studentMaxTokens?: number;
+}
+interface RunDistillationResult<TProduced, TInput, TLabel> extends RunImprovementLoopResult<TProduced, GoldScenario<TInput, TLabel>> {
+    /** The winning student prompt (a string surface). */
+    winnerPrompt: string;
+    /** Mean agreement on the HOLDOUT — baseline vs winner. The headline number:
+     *  did distillation move the student closer to the teacher on UNSEEN gold?
+     *  NOTE(review): `delta` is presumably `winner - baseline` (positive ⇒ the
+     *  winner agrees more with the teacher) — confirm the sign against the
+     *  implementation. */
+    holdoutAgreement: {
+        baseline: number;
+        winner: number;
+        delta: number;
+    };
+}
+/** Run the teacher→student distillation loop: the student's system prompt is
+ *  the surface the optimizer rewrites, starting from `opts.baselinePrompt`;
+ *  produced labels are scored by `opts.judge` against the gold labels, and
+ *  the holdout scenarios are scored only at the gate.
+ *
+ *  @param opts Prompts, gold scenarios, transports, models, judge and
+ *         optional tuning knobs (see `RunDistillationOptions`).
+ *  @returns A promise of the improvement-loop result extended with the
+ *           winning prompt and the baseline-vs-winner holdout agreement. */
+declare function runDistillation<TProduced, TInput, TLabel>(opts: RunDistillationOptions<TProduced, TInput, TLabel>): Promise<RunDistillationResult<TProduced, TInput, TLabel>>;
+/** Default student prompt render: surface as system, JSON input as the user
+ *  turn, with a JSON-only output instruction.
+ *
+ *  @param args.surface The student system prompt to send as the system turn.
+ *  @param args.input The scenario input, serialized as JSON for the user turn.
+ *  @param args.scenarioId Id of the scenario being rendered.
+ *         NOTE(review): whether the default render uses it is not visible
+ *         here — confirm.
+ *  @returns The chat messages to send (`ChatRequest['messages']`). */
+declare function defaultRenderStudentPrompt<TInput>(args: {
+    surface: string;
+    input: TInput;
+    scenarioId: string;
+}): ChatRequest['messages'];
+/** Default label parse: strip a ```json fence if present, then `JSON.parse`.
+ *  Throws on failure so the cell is recorded as failed, not silently zeroed.
+ *
+ *  NOTE(review): no schema validation is described — the parsed value appears
+ *  to be returned typed as `TProduced` without a shape check; confirm.
+ *
+ *  @param rawContent The raw text the student model returned.
+ *  @param scenarioId Id of the scenario the output belongs to.
+ *  @returns The parsed label, typed as `TProduced`.
+ *  @throws When the (fence-stripped) text is not valid JSON. */
+declare function defaultParseStudentLabel<TProduced>(rawContent: string, scenarioId: string): TProduced;
+export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile, type AgreementResult, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, 
DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type 
HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type ParseStudentLabel, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type 
ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, 
ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SplitGoldOptions, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, 
appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, 
localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseGoldJsonl, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };