@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +25 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +110 -6
  4. package/dist/campaign/index.js +26 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/chunk-MZ2IYGGN.js +592 -0
  11. package/dist/chunk-MZ2IYGGN.js.map +1 -0
  12. package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
  13. package/dist/chunk-NV2PF37Q.js.map +1 -0
  14. package/dist/contract/index.d.ts +11 -9
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +251 -7
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-CChUqexv.d.ts +314 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
  26. package/dist/release-report-CN8hJlhk.d.ts +233 -0
  27. package/dist/reporting.d.ts +4 -3
  28. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  29. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  30. package/dist/statistics-B7yCbi9i.d.ts +253 -0
  31. package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
  32. package/package.json +1 -1
  33. package/dist/chunk-4ODZXQV2.js.map +0 -1
  34. package/dist/chunk-7TPYV2ER.js.map +0 -1
  35. package/dist/chunk-CZRKD2X2.js +0 -1104
  36. package/dist/chunk-CZRKD2X2.js.map +0 -1
  37. package/dist/chunk-E22YUOAL.js +0 -111
  38. package/dist/chunk-E22YUOAL.js.map +0 -1
  39. package/dist/chunk-HKINEDRZ.js.map +0 -1
  40. package/dist/release-report-DGoeObZT.d.ts +0 -484
  41. package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
@@ -1,4 +1,4 @@
1
- export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient, n as hostedClientFromEnv } from '../index-CzhtwYBT.js';
1
+ export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient, n as hostedClientFromEnv } from '../index-DSEHMwvS.js';
2
2
  import '../types-c2R2kfmv.js';
3
3
  import '../run-record-BgTFzO2r.js';
4
4
  import '../errors-Dwqw-T_m.js';
@@ -2,7 +2,7 @@ import {
2
2
  HOSTED_WIRE_VERSION,
3
3
  createHostedClient,
4
4
  hostedClientFromEnv
5
- } from "../chunk-HKINEDRZ.js";
5
+ } from "../chunk-DFS3FEXO.js";
6
6
  import "../chunk-PZ5AY32C.js";
7
7
  export {
8
8
  HOSTED_WIRE_VERSION,
@@ -455,8 +455,10 @@ interface IngestResponse {
455
455
  * speaks the wire format in `./types.ts`.
456
456
  *
457
457
  * Three modes:
458
- * - **Ours:** point at `https://orchestrator.tangle.tools/v1`. We
459
- * handle ingest + storage + dashboard.
458
+ * - **Ours:** point at `https://orchestrator.tangle.tools` (the host root —
459
+ * the client appends the versioned `/v1/ingest/...` path itself; a trailing
460
+ * `/v1` on the endpoint is tolerated and normalized away). We handle ingest
461
+ * + storage + dashboard.
460
462
  * - **Self-hosted:** point at whatever URL runs the reference receiver
461
463
  * from `examples/hosted-ingest-server/`.
462
464
  * - **Off (default):** when `hostedTenant` is unset, nothing is sent.
package/dist/index.d.ts CHANGED
@@ -14,10 +14,10 @@ import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInpu
14
14
  export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
15
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
16
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
17
- import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
18
- export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
19
- import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-DPly4_hZ.js';
20
- export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-DPly4_hZ.js';
17
+ import { b as JudgeFn, a as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
18
+ export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
19
+ import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext, C as ChatRequest, f as CreateChatClientOpts } from './registry-BGKyX6bw.js';
20
+ export { g as AnalystHooks, h as AnalystInputKind, A as AnalystRegistry, i as AnalystRegistryOptions, j as AnalystRequirements, k as AnalystRunEvent, l as AnalystRunInputs, m as AnalystRunResult, n as AnalystRunSummary, B as BudgetPolicy, o as ChatCallOpts, p as ChatClient, q as ChatResponse, r as ChatTransport, s as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BGKyX6bw.js';
21
21
  import { TCloud } from '@tangle-network/tcloud';
22
22
  import { z } from 'zod';
23
23
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
@@ -28,8 +28,9 @@ export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind
28
28
  import { A as AgentProfile } from './agent-profile-DzcPHR1Z.js';
29
29
  export { a as BackendIntegrityError, B as BackendIntegrityReport, b as agentProfileHash, c as assertRealBackend, s as summarizeBackendIntegrity } from './agent-profile-DzcPHR1Z.js';
30
30
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
31
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DGoeObZT.js';
32
- export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CliffsMagnitude, t as CorpusAgreementOptions, u as CorpusAgreementPerDimension, v as CorpusAgreementReport, x as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, W as WeightedCompositeInput, y as WeightedCompositeResult, l as assertReleaseConfidence, m as benjaminiHochberg, z as bonferroni, n as bootstrapCi, D as cliffsDelta, E as cohensD, F as confidenceInterval, G as corpusInterRaterAgreement, H as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, I as interRaterReliability, K as interpretCliffs, p as judgeReplayGate, L as mannWhitneyU, M as normalizeScores, q as pairedBootstrap, N as pairedMde, O as pairedTTest, Q as partialCredit, r as renderReleaseReport, S as requiredSampleSize, T as weightedComposite, U as weightedMean, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
31
+ import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
32
+ export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
33
+ export { C as CliffsMagnitude, c as CorpusAgreementOptions, d as CorpusAgreementPerDimension, e as CorpusAgreementReport, f as CorpusScoreRecord, P as PairedBootstrapOptions, a as PairedBootstrapResult, W as WeightedCompositeInput, g as WeightedCompositeResult, b as benjaminiHochberg, h as bonferroni, i as cliffsDelta, j as cohensD, k as confidenceInterval, l as corpusInterRaterAgreement, m as corpusInterRaterAgreementFromJudgeScores, n as interRaterReliability, o as interpretCliffs, q as mannWhitneyU, r as normalizeScores, p as pairedBootstrap, s as pairedMde, t as pairedTTest, u as partialCredit, v as requiredSampleSize, x as weightedComposite, y as weightedMean, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
33
34
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
34
35
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
35
36
  import { T as TraceEmitter } from './emitter-DEZwY14K.js';
@@ -52,6 +53,8 @@ export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, Us
52
53
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DsnOpCO6.js';
53
54
  export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
54
55
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
56
+ import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-c2R2kfmv.js';
57
+ import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-BKpM5T4t.js';
55
58
  import './outcome-store-D6KWmYvj.js';
56
59
 
57
60
  interface RunScore {
@@ -5577,4 +5580,245 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
5577
5580
  */
5578
5581
  declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
5579
5582
 
5580
- export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, 
DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, 
type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, 
type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type 
SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, 
clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, 
proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
5583
+ /**
5584
+ * @experimental
5585
+ *
5586
+ * Gold scenarios for teacher→student distillation. The TEACHER is an
5587
+ * expensive workflow (e.g. the 70-agent skill audit) whose verdicts are
5588
+ * frozen as gold labels; the STUDENT is a cheap single-shot analyst whose
5589
+ * prompt GEPA optimizes toward reproducing those labels.
5590
+ *
5591
+ * A `GoldScenario` is a `Scenario` (the substrate's input contract) carrying
5592
+ * an OPAQUE `input` (what the student sees) and an OPAQUE `label` (the gold
5593
+ * verdict the student's output is scored against). Both are typed `unknown`
5594
+ * here: this module is domain-agnostic — it distills ANY analyst against ANY
5595
+ * gold JSONL. The agreement comparator (see `agreement-judge.ts`) is what
5596
+ * knows the label's shape.
5597
+ *
5598
+ * Loading + splitting are DETERMINISTIC and LLM-free: the gold set is the
5599
+ * fixed ground truth, never regenerated here.
5600
+ */
5601
+
5602
+ /** A held gold record: opaque student-input + opaque gold-label, carried as a
5603
+ * substrate `Scenario` so it flows through `runCampaign` unchanged. */
5604
+ interface GoldScenario<TInput = unknown, TLabel = unknown> extends Scenario$1 {
5605
+ kind: 'gold';
5606
+ /** What the student analyst is shown (rendered into its user prompt). */
5607
+ input: TInput;
5608
+ /** The teacher's gold verdict — the target the student's output is scored
5609
+ * against by the agreement judge. NEVER shown to the student. */
5610
+ label: TLabel;
5611
+ }
5612
+ /** Read a gold JSONL (one `{scenarioId|id, input, label, split?}` per line) into
5613
+ * `GoldScenario[]`. Deterministic, no LLM. Blank lines are skipped; a line
5614
+ * missing an id, `input`, or `label` throws (a silent skip would corrupt the
5615
+ * split silently — fail loud on a malformed gold set). */
5616
+ declare function loadGoldScenarios<TInput = unknown, TLabel = unknown>(jsonlPath: string): GoldScenario<TInput, TLabel>[];
5617
+ /** Parse gold JSONL text directly (no fs). Exported so tests + in-memory
5618
+ * callers exercise the same parse path as {@link loadGoldScenarios}. */
5619
+ declare function parseGoldJsonl<TInput = unknown, TLabel = unknown>(text: string, sourceLabel?: string): GoldScenario<TInput, TLabel>[];
5620
+ interface SplitGoldOptions {
5621
+ /** Every Nth scenario (0-based index) goes to the TEST/holdout split; the
5622
+ * rest train. Default 4 ⇒ a 25% holdout. Ignored for any scenario that
5623
+ * carries an explicit `split:` tag (that is honored verbatim). */
5624
+ testEveryNth?: number;
5625
+ }
5626
+ interface GoldSplit<TInput, TLabel> {
5627
+ /** Training scenarios — the optimization pool the driver searches over. */
5628
+ train: GoldScenario<TInput, TLabel>[];
5629
+ /** Held-out scenarios — kept OUT of training; scored only at the gate. */
5630
+ test: GoldScenario<TInput, TLabel>[];
5631
+ }
5632
+ /** Deterministic train/test split. A scenario tagged `split:train|test` is
5633
+ * routed by that tag; the rest fall to a modulo split (`index % testEveryNth
5634
+ * === 0 ⇒ test`, with `testEveryNth` defaulting to 4). Pure — the same input always yields the same split, so a gold
5635
+ * set's holdout is stable across runs (a shuffled split would let a lucky
5636
+ * seed flatter the gate). */
5637
+ declare function splitGold<TInput, TLabel>(scenarios: GoldScenario<TInput, TLabel>[], options?: SplitGoldOptions): GoldSplit<TInput, TLabel>;
5638
+
5639
+ /**
5640
+ * @experimental
5641
+ *
5642
+ * Agreement judge for teacher→student distillation. Scores a STUDENT artifact
5643
+ * (the cheap analyst's produced label) against the GoldScenario's gold label
5644
+ * (the teacher's verdict). The score IS the distillation objective: 1.0 means
5645
+ * the student reproduced the teacher exactly, 0.0 means total disagreement.
5646
+ *
5647
+ * The comparison function is INJECTED (`compareLabels`) so the judge is
5648
+ * domain-agnostic — distilling a skill-audit analyst, a triage analyst, or any
5649
+ * other student is a one-line comparator swap. A default `fieldAgreement`
5650
+ * comparator is provided for the common case: a flat verdict object with
5651
+ * categorical and array fields.
5652
+ *
5653
+ * Everything here is PURE + unit-testable — no LLM. (The student spends tokens
5654
+ * producing the artifact; scoring it against frozen gold does not.)
5655
+ */
5656
+
5657
+ /** What an injected comparator returns: a [0,1] composite plus the per-field
5658
+ * (per-dimension) agreement breakdown the GEPA driver reflects on to learn
5659
+ * WHICH part of the verdict the student is getting wrong. */
5660
+ interface AgreementResult {
5661
+ /** Overall agreement in [0,1]: 1.0 means the student reproduced the teacher exactly, 0.0 means total disagreement. */
5662
+ score: number;
5663
+ /** Per-dimension agreement in [0,1] — keyed by field/aspect name. The
5664
+ * reflective driver surfaces the weakest of these as the lever to fix. */
5665
+ dimensions: Record<string, number>;
5666
+ }
5667
+ /** Compare a produced (student) label against a gold (teacher) label and return their agreement. Injected so the
5668
+ * judge is domain-agnostic. */
5669
+ type CompareLabels<TProduced = unknown, TLabel = unknown> = (produced: TProduced, gold: TLabel) => AgreementResult;
5670
+ interface BuildAgreementJudgeOptions<TProduced = unknown, TLabel = unknown> {
5671
+ /** Judge name surfaced in `CampaignResult.aggregates.byJudge` and the gate. NOTE(review): the default used when omitted is not stated here. */
5672
+ name?: string;
5673
+ /** The agreement function — produced student label vs gold teacher label. */
5674
+ compareLabels: CompareLabels<TProduced, TLabel>;
5675
+ /** Dimension keys the judge declares up-front (for `JudgeConfig.dimensions`).
5676
+ * When omitted, the dimensions present on the first scored result are used,
5677
+ * and only for display; the composite is unaffected either way. */
5678
+ dimensionKeys?: string[];
5679
+ /** Only score `gold`-kind scenarios. Default true, so that a mixed campaign does not
5680
+ * mis-apply the agreement judge to non-gold scenarios. */
5681
+ goldOnly?: boolean;
5682
+ }
5683
+ /** Build a `JudgeConfig` that scores a produced student artifact against the
5684
+ * scenario's gold label via `options.compareLabels`. Conforms to the substrate `JudgeConfig` contract:
5685
+ * `score({artifact, scenario, signal}) => JudgeScore`. The `composite` is the
5686
+ * comparator's `score`; `dimensions` carries its per-field breakdown plus the
5687
+ * scalar `agreement` so a single-dimension consumer still sees the number. */
5688
+ declare function buildAgreementJudge<TProduced, TInput, TLabel>(options: BuildAgreementJudgeOptions<TProduced, TLabel>): JudgeConfig<TProduced, GoldScenario<TInput, TLabel>>;
5689
+ interface FieldAgreementSpec {
5690
+ /** Categorical fields — scored exact-match (1 if equal, else 0). Values are
5691
+ * `JSON`-normalized and then compared with `===`, so booleans, strings, and numbers (`true`/`'high'`/`3`) all work. */
5692
+ categorical?: string[];
5693
+ /** Array fields — scored by Jaccard overlap (|A∩B| / |A∪B|). Two empty
5694
+ * arrays agree perfectly (1.0). Order-insensitive; elements are compared by
5695
+ * their `JSON.stringify` output. */
5696
+ array?: string[];
5697
+ }
5698
+ /** Default comparator: average per-field agreement over a flat verdict object.
5699
+ * Categorical fields score exact-match; array fields score set-overlap
5700
+ * (Jaccard). The composite is the unweighted mean across all declared fields,
5701
+ * so missing a single boolean (e.g. `public_leak_risk`) costs `1/nFields` of
5702
+ * the score — the leak-detection lever the audit cares about is a real,
5703
+ * non-trivial fraction of the objective, not rounding noise.
5704
+ *
5705
+ * Pure. A field absent from BOTH produced + gold is treated as agreeing
5706
+ * (both undefined ⇒ 1.0); a field present on only one side disagrees. */
5707
+ declare function fieldAgreement<TProduced extends Record<string, unknown>, TLabel>(spec: FieldAgreementSpec): CompareLabels<TProduced, TLabel>;
5708
+
5709
+ /**
5710
+ * @experimental
5711
+ *
5712
+ * `runDistillation` — the teacher→student distillation loop. COMPOSES existing
5713
+ * substrate primitives; reimplements none of them:
5714
+ *
5715
+ * - DRIVER = `gepaDriver` (reflective prompt optimizer)
5716
+ * - LOOP = `runImprovementLoop` (outer: optimize → holdout re-score → gate)
5717
+ * - MEASUREMENT = `runCampaign` (inside the loop) scoring the student
5718
+ * - JUDGE = `buildAgreementJudge` — student label vs gold teacher label
5719
+ * - GATE = caller-supplied (`heldOutGate` / `defaultProductionGate`)
5720
+ * - STUDENT = a cheap single-shot analyst whose system prompt is the
5721
+ * `MutableSurface` GEPA mutates; it calls the LLM through
5722
+ * `createChatClient` and emits a JSON label.
5723
+ *
5724
+ * The surface IS the student's system prompt. Each generation GEPA rewrites it;
5725
+ * `dispatchWithSurface` renders {surface + scenario.input} into a chat request,
5726
+ * calls the (cheap) model, parses the produced JSON label, and returns it as
5727
+ * the artifact. The agreement judge scores that label against the gold label.
5728
+ *
5729
+ * `autoOnPromote: 'none'` is FORCED — the loop never opens a PR; the caller
5730
+ * (the `distill` CLI) decides what to do with the winning prompt.
5731
+ */
5732
+
5733
+ /** Render the student's prompt from {current surface, scenario input} into chat messages. The
5734
+ * surface is the system prompt; the scenario input is the user turn. Override
5735
+ * to inject few-shot framing or a JSON-schema reminder. */
5736
+ type RenderStudentPrompt<TInput> = (args: {
5737
+ surface: string;
5738
+ input: TInput;
5739
+ scenarioId: string;
5740
+ }) => ChatRequest['messages'];
5741
+ /** Parse the model's raw text (`rawContent`) into a typed produced label. Throws on
5742
+ * unparseable output — a thrown dispatch is recorded as a failed cell (never
5743
+ * silently scored 0), which is the honest signal that the prompt isn't
5744
+ * emitting valid JSON yet. */
5745
+ type ParseStudentLabel<TProduced> = (rawContent: string, scenarioId: string) => TProduced;
5746
+ interface RunDistillationOptions<TProduced, TInput, TLabel> {
5747
+ /** The student analyst's INITIAL system prompt — the baseline surface GEPA
5748
+ * searches from. */
5749
+ baselinePrompt: string;
5750
+ /** Training scenarios (the optimization pool). */
5751
+ train: GoldScenario<TInput, TLabel>[];
5752
+ /** Held-out scenarios — kept OUT of training; scored only at the gate. */
5753
+ holdout: GoldScenario<TInput, TLabel>[];
5754
+ /** Chat transport the student (cheap model) calls via `createChatClient`. NOTE(review): this was
5755
+ * described as serving BOTH the student and the GEPA reflection (the optimizer model), yet `reflectionLlm` below is documented as the reflection transport — confirm which applies. */
5756
+ llm: CreateChatClientOpts;
5757
+ /** Router transport the GEPA driver reflects through. `gepaDriver` uses the
5758
+ * package `LlmClient` directly (`LlmClientOptions`), not the ChatClient —
5759
+ * pass the router creds here. A test may inject `fetch` to stub the
5760
+ * reflection HTTP and exercise the wiring without real tokens. */
5761
+ reflectionLlm: LlmClientOptions;
5762
+ /** Cheap model the student runs (e.g. a small/fast model). */
5763
+ studentModel: string;
5764
+ /** Model GEPA uses to propose prompt rewrites (typically a stronger model). */
5765
+ optimizerModel: string;
5766
+ /** Agreement judge — produced student label vs gold teacher label. */
5767
+ judge: JudgeConfig<TProduced, GoldScenario<TInput, TLabel>>;
5768
+ /** Promotion gate. Default: `heldOutGate` over the holdout. Pass
5769
+ * `defaultProductionGate({ holdoutScenarios: holdout, ... })` for the full
5770
+ * red-team / reward-hacking / canary stack. */
5771
+ gate?: Gate<TProduced, GoldScenario<TInput, TLabel>>;
5772
+ /** GEPA population size (candidates per generation). Default 4. */
5773
+ populationSize?: number;
5774
+ /** Number of GEPA generations. Default 3. */
5775
+ maxGenerations?: number;
5776
+ /** Campaign reps per scenario. Default 1 — raise for CI bands on a flaky
5777
+ * student. */
5778
+ reps?: number;
5779
+ /** Where campaign artifacts + traces land. Default: a temp dir under cwd. */
5780
+ runDir?: string;
5781
+ /** Levers offered to the GEPA reflection prompt. NOTE(review): presumably defaults to `DEFAULT_MUTATION_PRIMITIVES` — confirm. */
5782
+ mutationPrimitives?: string[];
5783
+ /** GEPA structured-doc constraints (preserve sections, edit budget). */
5784
+ constraints?: GepaDriverConstraints;
5785
+ /** Gate's minimum holdout-agreement delta to ship. Default 0.0 — a
5786
+ * distillation run reports the lift; the caller decides the bar. Only used
5787
+ * when `gate` is omitted (i.e. with the default `heldOutGate`). */
5788
+ deltaThreshold?: number;
5789
+ /** Render the student prompt. Default (see `defaultRenderStudentPrompt`): surface as system, JSON-stringified
5790
+ * input as the user turn with a JSON-only instruction. */
5791
+ renderStudentPrompt?: RenderStudentPrompt<TInput>;
5792
+ /** Parse the model's text into a produced label. Default (see `defaultParseStudentLabel`): strict JSON parse
5793
+ * with fenced-block stripping. */
5794
+ parseStudentLabel?: ParseStudentLabel<TProduced>;
5795
+ /** Per-student-call sampling temperature. Default 0 (deterministic student;
5796
+ * the optimization signal must come from the PROMPT, not sampling noise). */
5797
+ studentTemperature?: number;
5798
+ /** Per-student-call max tokens. Default 1024. */
5799
+ studentMaxTokens?: number;
5800
+ }
5801
+ interface RunDistillationResult<TProduced, TInput, TLabel> extends RunImprovementLoopResult<TProduced, GoldScenario<TInput, TLabel>> {
5802
+ /** The winning student prompt (a string surface). */
5803
+ winnerPrompt: string;
5804
+ /** Mean agreement on the HOLDOUT — baseline vs winner. The headline number:
5805
+ * did distillation move the student closer to the teacher on UNSEEN gold? NOTE(review): `delta` is presumably `winner - baseline` — confirm. */
5806
+ holdoutAgreement: {
5807
+ baseline: number;
5808
+ winner: number;
5809
+ delta: number;
5810
+ };
5811
+ }
5812
+ declare function runDistillation<TProduced, TInput, TLabel>(opts: RunDistillationOptions<TProduced, TInput, TLabel>): Promise<RunDistillationResult<TProduced, TInput, TLabel>>; // Runs the teacher→student distillation loop; resolves to the improvement-loop result extended with `winnerPrompt` and `holdoutAgreement`.
5813
+ /** Default student prompt render: the surface becomes the system message, the JSON input the user
5814
+ * turn, with a JSON-only output instruction. Returns the chat messages. */
5815
+ declare function defaultRenderStudentPrompt<TInput>(args: {
5816
+ surface: string;
5817
+ input: TInput;
5818
+ scenarioId: string;
5819
+ }): ChatRequest['messages'];
5820
+ /** Default label parse: strip a ```json fence if present, then `JSON.parse`.
5821
+ * Throws on failure so the cell is recorded as failed, not silently zeroed. NOTE(review): `scenarioId` is presumably used only to label the thrown error — confirm. */
5822
+ declare function defaultParseStudentLabel<TProduced>(rawContent: string, scenarioId: string): TProduced;
5823
+
5824
+ export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile, type AgreementResult, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, 
DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type 
HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type ParseStudentLabel, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type 
ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, 
ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SplitGoldOptions, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, 
appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, 
localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseGoldJsonl, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };