@tangle-network/agent-eval 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +134 -0
  2. package/README.md +13 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
  5. package/dist/chunk-7EAUOUQS.js.map +1 -0
  6. package/dist/chunk-AXHNWLIX.js +246 -0
  7. package/dist/chunk-AXHNWLIX.js.map +1 -0
  8. package/dist/chunk-EXGR4XEM.js +283 -0
  9. package/dist/chunk-EXGR4XEM.js.map +1 -0
  10. package/dist/chunk-LZKIOBG2.js +2026 -0
  11. package/dist/chunk-LZKIOBG2.js.map +1 -0
  12. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  13. package/dist/chunk-QBW3YBTR.js.map +1 -0
  14. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  15. package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
  16. package/dist/chunk-VQQSPGSM.js.map +1 -0
  17. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  18. package/dist/control.d.ts +3 -3
  19. package/dist/control.js +2 -2
  20. package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
  21. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  22. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  23. package/dist/index-ekBXweiQ.d.ts +1894 -0
  24. package/dist/index.d.ts +18 -154
  25. package/dist/index.js +125 -25
  26. package/dist/index.js.map +1 -1
  27. package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
  28. package/dist/openapi.json +1 -1
  29. package/dist/optimization.d.ts +5 -5
  30. package/dist/optimization.js +7 -5
  31. package/dist/reporting.d.ts +294 -4
  32. package/dist/reporting.js +6 -4
  33. package/dist/rl.d.ts +8 -0
  34. package/dist/rl.js +113 -0
  35. package/dist/rl.js.map +1 -0
  36. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  37. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  38. package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
  39. package/dist/traces.d.ts +2 -2
  40. package/dist/traces.js +5 -5
  41. package/docs/auto-research-loop-end-to-end.md +186 -0
  42. package/docs/three-package-architecture.md +180 -0
  43. package/package.json +6 -1
  44. package/dist/chunk-UAND2LOT.js.map +0 -1
  45. package/dist/chunk-USHQBPMH.js.map +0 -1
  46. package/dist/chunk-YUFXO3TU.js.map +0 -1
  47. package/dist/reporting-B82RSv9C.d.ts +0 -593
  48. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -1,25 +1,29 @@
1
1
  import { TCloud } from '@tangle-network/tcloud';
2
- import { R as ReleaseConfidenceThresholds, a as ReleaseConfidenceScorecard, O as OutcomeFilter, b as OutcomeStore } from './reporting-B82RSv9C.js';
3
- export { B as BootstrapOptions, c as BootstrapResult, D as DeploymentOutcome, F as FileSystemOutcomeStore, d as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, e as InterimReleaseConfidence, f as InterimReleaseConfidenceInput, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, g as PairedBootstrapResult, h as PairedEvalueOptions, i as PairedEvalueSequence, j as PairedEvalueStep, k as ReleaseConfidenceAxis, l as ReleaseConfidenceAxisName, m as ReleaseConfidenceInput, n as ReleaseConfidenceIssue, o as ReleaseConfidenceMetrics, p as ReleaseConfidenceStatus, q as ReleaseTraceEvidence, r as RenderReleaseReportOptions, s as RubricOutcomePair, t as RubricPredictiveValidityInput, u as RubricPredictiveValidityReport, v as RubricRanking, S as SequentialDecision, V as Verdict, w as assertReleaseConfidence, x as bhAdjust, y as bootstrapCi, z as evaluateInterimReleaseConfidence, A as evaluateReleaseConfidence, C as judgeReplayGate, E as pairedBootstrap, G as pairedEvalueSequence, H as pairedWilcoxon, K as releaseTraceEvidenceFromMultiShotTrials, L as renderReleaseReport, M as rubricPredictiveValidity } from './reporting-B82RSv9C.js';
4
- import { F as FeedbackLabel, a as FeedbackTrajectoryStore, b as FeedbackTrajectory, C as ControlSeverity, c as ControlEvalResult } from './feedback-trajectory-CB0A32o3.js';
5
- export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, l as ControlStep, m as ControlStopPolicies, n as FeedbackArtifactType, o as FeedbackAttempt, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, y as FeedbackTrajectoryFilter, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, S as StopDecision, B as allCriticalPassed, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, M as objectiveEval, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, T as runAgentControlLoop, U as serializeFeedbackTrajectoriesJsonl, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-CB0A32o3.js';
6
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-cxwMOAsy.js';
2
+ import { ReleaseConfidenceThresholds, ReleaseConfidenceScorecard } from './reporting.js';
3
+ export { BootstrapOptions, BootstrapResult, JudgeReplayGateArgs, PairedBootstrapOptions, PairedBootstrapResult, ReleaseConfidenceAxis, ReleaseConfidenceAxisName, ReleaseConfidenceInput, ReleaseConfidenceIssue, ReleaseConfidenceMetrics, ReleaseConfidenceStatus, ReleaseTraceEvidence, RenderReleaseReportOptions, Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, judgeReplayGate, pairedBootstrap, pairedWilcoxon, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport } from './reporting.js';
4
+ import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory, E as ControlSeverity, G as ControlEvalResult } from './feedback-trajectory-c43WGtTX.js';
5
+ export { H as ControlActionFailureMode, J as ControlActionOutcome, K as ControlBudget, L as ControlContext, M as ControlDecision, N as ControlRunResult, O as ControlRuntimeConfig, Q as ControlRuntimeError, R as ControlStep, S as ControlStopPolicies, F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, T as StopDecision, U as allCriticalPassed, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, V as objectiveEval, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, W as runAgentControlLoop, B as serializeFeedbackTrajectoriesJsonl, X as stopOnNoProgress, Y as stopOnRepeatedAction, Z as subjectiveEval, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-c43WGtTX.js';
6
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DvkH87qJ.js';
7
7
  import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
8
8
  export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
9
- import { A as ActionableSideInfo, O as Objective, P as ParetoResult, T as TrialCache, a as TrialResult, E as EvolvableVariant, M as MutateAdapter, V as VariantAggregate } from './summary-report-D4p7RlDu.js';
10
- export { b as AsiSeverity, D as DEFAULT_FAILURE_RULES, c as DEFAULT_MUTATION_PRIMITIVES, d as Direction, F as FailureClassification, e as FailureCluster, f as FailureClusterReport, g as FailureContext, h as FailureRule, G as GainDistributionBin, i as GainDistributionFigureSpec, j as GainDistributionOptions, k as GateDecision, l as GateEvidence, m as GenerationReport, H as HeldOutGate, n as HeldOutGateConfig, o as HeldOutGateRejectionCode, I as InMemoryTrialCache, p as MultiShotGateConfig, q as MultiShotGateResult, r as MultiShotMutateAdapter, s as MultiShotOptimizationConfig, t as MultiShotOptimizationResult, u as MultiShotRun, v as MultiShotRunInput, w as MultiShotRunner, x as MultiShotScore, y as MultiShotScorer, z as MultiShotSplit, B as MultiShotTrace, C as MultiShotTrialResult, J as MultiShotVariant, K as ParetoFigureSpec, L as ParetoPoint, N as PromptEvolutionConfig, Q as PromptEvolutionEvent, R as PromptEvolutionResult, S as RESEARCH_REPORT_HARD_PAIR_FLOOR, U as ReflectionContext, W as ReflectionProposal, X as ResearchReport, Y as ResearchReportCandidate, Z as ResearchReportDecision, _ as ResearchReportMethodology, $ as ResearchReportOptions, a0 as ResearchReportRecommendation, a1 as ScenarioAggregate, a2 as ScoreAdapter, a3 as SummaryTable, a4 as SummaryTableOptions, a5 as SummaryTableRow, a6 as TrialTrace, a7 as buildReflectionPrompt, a8 as classifyFailure, a9 as crowdingDistance, aa as defaultMultiShotObjectives, ab as dominates, ac as failureClusterView, ad as gainHistogram, ae as paretoChart, af as paretoFrontier, ag as paretoFrontierWithCrowding, ah as parseReflectionResponse, ai as researchReport, aj as runMultiShotOptimization, ak as runPromptEvolution, al as scalarScore, am as summaryTable, an as trialTraceFromMultiShotTrial } from './summary-report-D4p7RlDu.js';
9
+ import { A as ActionableSideInfo, O as Objective, J as ParetoResult, T as TrialCache, t as TrialResult, E as EvolvableVariant, o as MutateAdapter, V as VariantAggregate } from './summary-report-Ce1r4EYo.js';
10
+ export { a as AsiSeverity, K as DEFAULT_FAILURE_RULES, D as DEFAULT_MUTATION_PRIMITIVES, L as Direction, N as FailureClassification, Q as FailureCluster, U as FailureClusterReport, W as FailureContext, X as FailureRule, Y as GainDistributionBin, Z as GainDistributionFigureSpec, _ as GainDistributionOptions, C as GateDecision, $ as GateEvidence, G as GenerationReport, a0 as HeldOutGate, a1 as HeldOutGateConfig, a2 as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, a3 as ParetoFigureSpec, a4 as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, a5 as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, H as ResearchReport, a6 as ResearchReportCandidate, a7 as ResearchReportDecision, a8 as ResearchReportMethodology, F as ResearchReportOptions, a9 as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, aa as SummaryTable, ab as SummaryTableOptions, ac as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ad as classifyFailure, ae as crowdingDistance, w as defaultMultiShotObjectives, af as dominates, ag as failureClusterView, ah as gainHistogram, ai as paretoChart, aj as paretoFrontier, ak as paretoFrontierWithCrowding, x as parseReflectionResponse, al as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, am as scalarScore, an as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-Ce1r4EYo.js';
11
11
  import { a as Run$1, S as Span, f as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, F as FailureClass, g as BudgetSpec, c as ToolSpan, h as RunFilter, L as LlmSpan, J as JudgeSpan } from './store-u47QaJ9G.js';
12
12
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, m as RunLayer, n as RunStatus, e as SandboxSpan, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
13
13
  import { llmSpans } from './traces.js';
14
14
  export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
- export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, c as RawProviderEvent, d as RawProviderSink, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
15
+ export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
16
16
  import { a as DatasetScenario, b as Dataset, c as DatasetManifest } from './dataset-B9qvlm_o.js';
17
17
  export { d as DatasetDifficulty, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B9qvlm_o.js';
18
- import { L as LlmClientOptions } from './optimization-UVDNKaO6.js';
19
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, r as LlmRouteAssertionError, s as LlmRouteRequirements, t as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, u as assertLlmRoute, v as callLlm, w as callLlmJson, x as probeLlm, y as runEvalCampaign, z as stripFencedJson } from './optimization-UVDNKaO6.js';
20
- import { a as RunRecord } from './run-record-CX_jcAyr.js';
21
- export { b as RunJudgeMetadata, c as RunOutcome, d as RunRecordValidationError, R as RunSplitTag, e as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CX_jcAyr.js';
22
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-c5saLbKD.js';
18
+ import { O as OutcomeFilter, a as OutcomeStore } from './sequential-DgU2mFsE.js';
19
+ export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, c as InterimReleaseConfidence, d as InterimReleaseConfidenceInput, P as PairedEvalueOptions, e as PairedEvalueSequence, f as PairedEvalueStep, R as RubricOutcomePair, g as RubricPredictiveValidityInput, h as RubricPredictiveValidityReport, i as RubricRanking, S as SequentialDecision, j as evaluateInterimReleaseConfidence, p as pairedEvalueSequence, r as rubricPredictiveValidity } from './sequential-DgU2mFsE.js';
20
+ import { L as LlmClientOptions } from './eval-campaign-Ds5QljIh.js';
21
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './eval-campaign-Ds5QljIh.js';
22
+ import { L as LayerResult, V as VerifyContext, a as Layer, S as Severity } from './index-ekBXweiQ.js';
23
+ export { A as AdaptationCurve, b as AdaptationPoint, c as AdaptationRunner, d as AdapterContext, e as AdversarialMutation, f as AdversarialScenario, g as AdversarialSearchOptions, h as AdversarialSearchReport, i as AnalyzeOptimizationResultOptions, j as AnalyzeOptimizationResultReport, B as BradleyTerryFit, k as BradleyTerryRating, l as BuildPairwiseFromCampaignInput, C as CellObservation, m as CompareCurvesResult, n as ComputeBestOfNOptions, o as ComputeBestOfNResult, p as ComputeCurve, q as ComputeCurveBudget, r as ComputeCurvePoint, s as ContaminationProbeInput, t as ContaminationProbeOptions, u as ContaminationProbeReport, v as CurriculumAllocation, D as DetectRewardHackingInput, w as DpoExportRow, x as DpoLookups, E as EloOptions, y as ExtractPreferencesOptions, z as ExtractStepRewardsOptions, F as Finding, G as GrpoExportRow, H as GrpoLookups, I as LayerStatus, M as MultiLayerVerifier, O as OffPolicyEstimate, J as OffPolicyOptions, K as OffPolicyTrajectory, P as PairwiseOutcome, N as ParetoPointInput, Q as PredictiveValidityResearcher, R as PredictiveValidityResearcherOptions, T as PreferenceExtractionReport, U as PreferenceStrategy, W as PreferenceTriple, X as PrmExportRow, Y as PrmLookups, Z as PrmTrainingTriple, _ as RLCampaignResult, $ as RewardHackingFinding, a0 as RewardHackingReport, a1 as RewardHackingSignal, a2 as RunAdaptationCurveOptions, a3 as RunComputeCurveOptions, a4 as RunRLCampaignOptions, a5 as RunwiseStepSummary, a6 as ScenarioPerturbation, a7 as ScenarioPerturbationKind, a8 as SelfConsistencyOptions, a9 as SelfConsistencyResult, aa as SftExportRow, ab as SftLookups, ac as StepReward, ad as StepRewardJsonlRow, ae as StepScorer, af as ThompsonCurriculumOptions, ag as VarianceCurriculumOptions, ah as VerifiableReward, ai as VerifiableRewardExtractionOptions, aj as VerifiableRewardSource, ak as VerificationReport, al as VerifyOptions, am as adversarialScenarioSearch, an as analyzeOptimizationResult, ao as applyEloUpdate, ap as bestOfN, aq as buildPairwiseFromCampaign, ar as compareAdaptationCurves, as as detectRewardHacking, at as doublyRobust, au as extractPreferences, av as extractStepRewards, aw as extractVerifiableReward, ax as extractVerifiableRewardsFromRecords, ay as filterDeterministicallyRewarded, az as firstPassK, aA as fitBradleyTerry, aB as gradeSemanticStatus, aC as injectIrrelevantClause, aD as inverseProbabilityWeighting, aE as observationsFromRunRecords, aF as offPolicyEstimateAll, aG as prmTrainingPairs, aH as renameVariables, aI as runAdaptationCurve, aJ as runComputeCurve, aK as runContaminationProbe, aL as runRLCampaign, aM as runwiseStepRewardSummary, aN as selfConsistency, aO as selfNormalizedImportanceWeighting, aP as shuffleOrder, aQ as stepRewardsToJsonl, aR as thompsonCurriculum, aS as toAnthropicFormat, aT as toDpoJsonl, aU as toDpoRows, aV as toGrpoJsonl, aW as toGrpoRows, aX as toPrmJsonl, aY as toPrmRows, aZ as toSftJsonl, a_ as toSftRows, a$ as toTRLFormat, b0 as trialToRunRecord, b1 as trialsToRunRecords, b2 as varianceBasedCurriculum, b3 as variantAggregateToRunRecord, b4 as verificationReportToRunRecord } from './index-ekBXweiQ.js';
24
+ import { R as RunRecord } from './run-record-DNiOMBrZ.js';
25
+ export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-DNiOMBrZ.js';
26
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DDTlbHEK.js';
23
27
  import '@ax-llm/ax';
24
28
 
25
29
  interface Scenario {
@@ -4226,146 +4230,6 @@ interface UseCaseSignals {
4226
4230
  declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
4227
4231
  declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
4228
4232
 
4229
- /**
4230
- * Multi-layer verifier — ordered pipeline of verification layers.
4231
- *
4232
- * Different contract from {@link JudgeRunner} (which runs parallel
4233
- * specs against a sandbox). MultiLayerVerifier is a DAG of layers
4234
- * (install → typecheck → build → lint → serve → semantic → …) with
4235
- * dependency-based skip, per-layer findings, soft-fail semantics, and
4236
- * an aggregated `blendedScore` across all passed layers.
4237
- *
4238
- * Use when you want:
4239
- * - ordered stages where a failing upstream stage skips downstream ones
4240
- * - each stage produces rich `findings` (severity + message + evidence)
4241
- * - a single composite score across stages with per-stage weights
4242
- * - soft-fail stages whose failure doesn't abort the pipeline
4243
- *
4244
- * Use {@link JudgeRunner} when you want:
4245
- * - N independent judges running in parallel against the same artifact
4246
- * - no inter-judge dependencies
4247
- * - boolean `passed` per judge + overall
4248
- *
4249
- * Both primitives compose — JudgeRunner can be invoked as a single
4250
- * layer inside a MultiLayerVerifier if that suits the caller.
4251
- */
4252
- type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
4253
- type Severity = 'critical' | 'major' | 'minor' | 'info';
4254
- interface Finding {
4255
- severity: Severity;
4256
- message: string;
4257
- evidence?: string;
4258
- /** Optional layer name the finding belongs to (set by the verifier if omitted). */
4259
- layer?: string;
4260
- /**
4261
- * Free-form structured payload — used by `multiToolchainLayer` to attach
4262
- * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
4263
- * Renderers MAY interrogate; agent-eval primitives never assume shape.
4264
- */
4265
- detail?: Record<string, unknown>;
4266
- }
4267
- interface LayerResult {
4268
- layer: string;
4269
- status: LayerStatus;
4270
- /** 0..1 score, optional — layers that don't produce a numeric score omit. */
4271
- score?: number;
4272
- durationMs: number;
4273
- findings: Finding[];
4274
- /** Short human-readable summary (one line). */
4275
- reason?: string;
4276
- /**
4277
- * Numeric layer-level diagnostics: error counts, warning counts,
4278
- * cyclomatic complexity, total adapter wall-time, etc. Keyed by
4279
- * diagnostic name; null = "diagnostic not applicable / not measured."
4280
- * Renderers that know the keys can display them; ones that don't,
4281
- * ignore. Free-form on purpose — consumers type the value shape in
4282
- * their own namespace. Added in 0.10.
4283
- */
4284
- diagnostics?: Record<string, number | null>;
4285
- /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
4286
- detail?: Record<string, unknown>;
4287
- }
4288
- interface VerifyContext<Env = unknown> {
4289
- /** Per-run opaque context the caller provides. Layers destructure what they need. */
4290
- env: Env;
4291
- /** Previously-computed results from layers that already ran. */
4292
- prior: Record<string, LayerResult>;
4293
- /** Signal — if aborted, layers MUST bail within reasonable wall. */
4294
- signal: AbortSignal;
4295
- }
4296
- interface Layer<Env = unknown> {
4297
- name: string;
4298
- /** Stages that must have `status: 'pass'` before this layer runs. */
4299
- dependsOn?: string[];
4300
- /**
4301
- * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
4302
- * contribute findings but not score.
4303
- */
4304
- weight?: number;
4305
- /**
4306
- * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
4307
- * being dropped — use for layers whose failure is a real signal. Default:
4308
- * fail drops from numerator + denominator, matching VB's existing semantics.
4309
- */
4310
- failContributesToScore?: boolean;
4311
- /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
4312
- capMs?: number;
4313
- run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
4314
- }
4315
- interface VerifyOptions<Env = unknown> {
4316
- env: Env;
4317
- /**
4318
- * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
4319
- * omits a cap. The verifier short-circuits remaining layers on overall cap.
4320
- */
4321
- overallCapMs?: number;
4322
- /** Called with each layer result as it completes. */
4323
- onLayer?: (result: LayerResult) => void;
4324
- }
4325
- interface VerificationReport {
4326
- layers: LayerResult[];
4327
- passCount: number;
4328
- failCount: number;
4329
- skippedCount: number;
4330
- errorCount: number;
4331
- /** True iff at least one scored layer ran AND every scored layer passed. */
4332
- allPass: boolean;
4333
- /**
4334
- * Weighted mean of `score` across contributing layers. 0 when no layers
4335
- * contributed. See {@link Layer.failContributesToScore} for fail semantics.
4336
- */
4337
- blendedScore: number;
4338
- durationMs: number;
4339
- startedAt: string;
4340
- finishedAt: string;
4341
- }
4342
- /**
4343
- * Grade a semantic-concept-style judge result into a single layer status.
4344
- *
4345
- * Pass when overall score >= threshold AND no critical-severity concept gap.
4346
- * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
4347
- *
4348
- * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
4349
- * too strict — a single concept at 6/10 failed the entire layer despite
4350
- * overall score being >= 0.7. Now we trust the judge's own `severity` field:
4351
- * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
4352
- */
4353
- declare function gradeSemanticStatus(input: {
4354
- score: number;
4355
- findings: Array<{
4356
- severity: Severity;
4357
- present?: boolean;
4358
- score?: number;
4359
- }>;
4360
- available: boolean;
4361
- threshold?: number;
4362
- }): LayerStatus;
4363
- declare class MultiLayerVerifier<Env = unknown> {
4364
- private readonly layers;
4365
- constructor(layers: Layer<Env>[]);
4366
- run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
4367
- }
4368
-
4369
4233
  /**
4370
4234
  * CommandRunner — abstract subprocess execution surface.
4371
4235
  *
@@ -6038,4 +5902,4 @@ interface OrthogonalityResult {
6038
5902
  }
6039
5903
  declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
6040
5904
 
6041
- export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
5905
+ export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, type LayerCorrelation, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  stopOnNoProgress,
20
20
  stopOnRepeatedAction,
21
21
  subjectiveEval
22
- } from "./chunk-ARZ6BEV6.js";
22
+ } from "./chunk-V5QSWN7L.js";
23
23
  import {
24
24
  CallbackResearcher,
25
25
  DEFAULT_MUTATION_PRIMITIVES,
@@ -46,7 +46,6 @@ import {
46
46
  renderPreferenceMemoryMarkdown,
47
47
  replayFeedbackTrajectories,
48
48
  replayFeedbackTrajectory,
49
- runEvalCampaign,
50
49
  runMultiShotOptimization,
51
50
  runPromptEvolution,
52
51
  scalarScore,
@@ -54,25 +53,89 @@ import {
54
53
  summarizePreferenceMemory,
55
54
  trialTraceFromMultiShotTrial,
56
55
  withAssignedFeedbackSplit
57
- } from "./chunk-USHQBPMH.js";
56
+ } from "./chunk-VQQSPGSM.js";
58
57
  import {
59
58
  RunRecordValidationError,
60
59
  isRunRecord,
61
60
  parseRunRecordSafe,
62
61
  roundTripRunRecord,
63
62
  validateRunRecord
64
- } from "./chunk-YUFXO3TU.js";
63
+ } from "./chunk-QBW3YBTR.js";
65
64
  import {
66
65
  assertReleaseConfidence,
67
66
  bootstrapCi,
68
- evaluateInterimReleaseConfidence,
69
67
  evaluateReleaseConfidence,
70
68
  judgeReplayGate,
71
- pairedEvalueSequence,
72
69
  releaseTraceEvidenceFromMultiShotTrials,
73
- renderReleaseReport,
70
+ renderReleaseReport
71
+ } from "./chunk-7EAUOUQS.js";
72
+ import {
73
+ PredictiveValidityResearcher,
74
+ adversarialScenarioSearch,
75
+ analyzeOptimizationResult,
76
+ applyEloUpdate,
77
+ bestOfN,
78
+ buildPairwiseFromCampaign,
79
+ compareAdaptationCurves,
80
+ detectRewardHacking,
81
+ doublyRobust,
82
+ extractPreferences,
83
+ extractStepRewards,
84
+ extractVerifiableReward,
85
+ extractVerifiableRewardsFromRecords,
86
+ filterDeterministicallyRewarded,
87
+ firstPassK,
88
+ fitBradleyTerry,
89
+ injectIrrelevantClause,
90
+ inverseProbabilityWeighting,
91
+ observationsFromRunRecords,
92
+ offPolicyEstimateAll,
93
+ prmTrainingPairs,
94
+ renameVariables,
95
+ runAdaptationCurve,
96
+ runComputeCurve,
97
+ runContaminationProbe,
98
+ runRLCampaign,
99
+ runwiseStepRewardSummary,
100
+ selfConsistency,
101
+ selfNormalizedImportanceWeighting,
102
+ shuffleOrder,
103
+ stepRewardsToJsonl,
104
+ thompsonCurriculum,
105
+ toAnthropicFormat,
106
+ toDpoJsonl,
107
+ toDpoRows,
108
+ toGrpoJsonl,
109
+ toGrpoRows,
110
+ toPrmJsonl,
111
+ toPrmRows,
112
+ toSftJsonl,
113
+ toSftRows,
114
+ toTRLFormat,
115
+ trialToRunRecord,
116
+ trialsToRunRecords,
117
+ varianceBasedCurriculum,
118
+ variantAggregateToRunRecord,
119
+ verificationReportToRunRecord
120
+ } from "./chunk-LZKIOBG2.js";
121
+ import {
122
+ runEvalCampaign
123
+ } from "./chunk-EXGR4XEM.js";
124
+ import {
125
+ LlmCallError,
126
+ LlmClient,
127
+ LlmRouteAssertionError,
128
+ assertLlmRoute,
129
+ callLlm,
130
+ callLlmJson,
131
+ probeLlm,
132
+ stripFencedJson
133
+ } from "./chunk-KAO3Q65R.js";
134
+ import {
135
+ evaluateInterimReleaseConfidence,
136
+ pairedEvalueSequence,
74
137
  rubricPredictiveValidity
75
- } from "./chunk-UAND2LOT.js";
138
+ } from "./chunk-AXHNWLIX.js";
76
139
  import {
77
140
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
78
141
  benjaminiHochberg,
@@ -151,6 +214,13 @@ import {
151
214
  assertRunCaptured,
152
215
  throwIfRunIncomplete
153
216
  } from "./chunk-QUKKGHTZ.js";
217
+ import {
218
+ FileSystemRawProviderSink,
219
+ InMemoryRawProviderSink,
220
+ NoopRawProviderSink,
221
+ defaultProviderRedactor,
222
+ providerFromBaseUrl
223
+ } from "./chunk-SQQLHODJ.js";
154
224
  import {
155
225
  TraceEmitter,
156
226
  llmSpanFromProvider
@@ -162,23 +232,6 @@ import {
162
232
  signManifest,
163
233
  verifyManifest
164
234
  } from "./chunk-6M774GY6.js";
165
- import {
166
- LlmCallError,
167
- LlmClient,
168
- LlmRouteAssertionError,
169
- assertLlmRoute,
170
- callLlm,
171
- callLlmJson,
172
- probeLlm,
173
- stripFencedJson
174
- } from "./chunk-KAO3Q65R.js";
175
- import {
176
- FileSystemRawProviderSink,
177
- InMemoryRawProviderSink,
178
- NoopRawProviderSink,
179
- defaultProviderRedactor,
180
- providerFromBaseUrl
181
- } from "./chunk-SQQLHODJ.js";
182
235
  import "./chunk-PZ5AY32C.js";
183
236
 
184
237
  // src/client.ts
@@ -10498,6 +10551,7 @@ export {
10498
10551
  OTEL_AGENT_EVAL_SCOPE,
10499
10552
  OtlpFileTraceStore,
10500
10553
  PairwiseSteeringOptimizer,
10554
+ PredictiveValidityResearcher,
10501
10555
  PrmGrader,
10502
10556
  ProductClient,
10503
10557
  ProjectRegistry,
@@ -10527,12 +10581,15 @@ export {
10527
10581
  UNIVERSAL_FINDERS,
10528
10582
  acquisitionPlansForKnowledgeGaps,
10529
10583
  adversarialJudge,
10584
+ adversarialScenarioSearch,
10530
10585
  aggregateLlm,
10531
10586
  aggregateRunScore,
10532
10587
  allCriticalPassed,
10533
10588
  analyzeAntiSlop,
10589
+ analyzeOptimizationResult,
10534
10590
  analyzeSeries,
10535
10591
  analyzeTraces,
10592
+ applyEloUpdate,
10536
10593
  argHash,
10537
10594
  assertLlmRoute,
10538
10595
  assertReleaseConfidence,
@@ -10542,12 +10599,14 @@ export {
10542
10599
  deterministicSplit as benchmarkDeterministicSplit,
10543
10600
  benchmarks_exports as benchmarks,
10544
10601
  benjaminiHochberg,
10602
+ bestOfN,
10545
10603
  bhAdjust,
10546
10604
  bisect,
10547
10605
  blockingKnowledgeEval,
10548
10606
  bonferroni,
10549
10607
  bootstrapCi,
10550
10608
  budgetBreachView,
10609
+ buildPairwiseFromCampaign,
10551
10610
  buildReflectionPrompt,
10552
10611
  buildReviewerPrompt,
10553
10612
  buildTraceAnalystTools,
@@ -10573,6 +10632,7 @@ export {
10573
10632
  coherenceJudge,
10574
10633
  collectionPreserved,
10575
10634
  commitBisect,
10635
+ compareAdaptationCurves,
10576
10636
  compareReferenceReplay,
10577
10637
  compareToBaseline,
10578
10638
  compilerJudge,
@@ -10609,9 +10669,11 @@ export {
10609
10669
  defaultTraceInsightPanel,
10610
10670
  deployGateLayer,
10611
10671
  describeTraceInsightScope,
10672
+ detectRewardHacking,
10612
10673
  distillPlaybook,
10613
10674
  domainEvidencePattern,
10614
10675
  dominates,
10676
+ doublyRobust,
10615
10677
  estimateCost,
10616
10678
  estimateTokens,
10617
10679
  euAiActReport,
@@ -10628,6 +10690,10 @@ export {
10628
10690
  exportTrainingData,
10629
10691
  extractAssetUrls,
10630
10692
  extractErrorCount,
10693
+ extractPreferences,
10694
+ extractStepRewards,
10695
+ extractVerifiableReward,
10696
+ extractVerifiableRewardsFromRecords,
10631
10697
  failureClusterView,
10632
10698
  feedbackTrajectoriesToDatasetScenarios,
10633
10699
  feedbackTrajectoriesToOptimizerRows,
@@ -10635,12 +10701,15 @@ export {
10635
10701
  feedbackTrajectoryToOptimizerRow,
10636
10702
  fileContains,
10637
10703
  fileExists,
10704
+ filterDeterministicallyRewarded,
10638
10705
  findAutoMatchNoExpectation,
10639
10706
  findConstructorCwdDropped,
10640
10707
  findFallbackToPass,
10641
10708
  findLiteralTruePass,
10642
10709
  findSkipCountsAsPass,
10643
10710
  firstDivergenceView,
10711
+ firstPassK,
10712
+ fitBradleyTerry,
10644
10713
  flowLayer,
10645
10714
  formatBenchmarkReport,
10646
10715
  formatDriverReport,
@@ -10656,12 +10725,14 @@ export {
10656
10725
  inMemoryReferenceReplayStore,
10657
10726
  inMemoryReviewStore,
10658
10727
  inferDomainKeywords,
10728
+ injectIrrelevantClause,
10659
10729
  integrationAsi,
10660
10730
  integrationGateEvals,
10661
10731
  integrationInvokeFailedPayload,
10662
10732
  integrationManifestResolvedPayload,
10663
10733
  integrationManifestValidatedPayload,
10664
10734
  interRaterReliability,
10735
+ inverseProbabilityWeighting,
10665
10736
  iqr,
10666
10737
  isJudgeSpan,
10667
10738
  isLlmSpan,
@@ -10697,6 +10768,8 @@ export {
10697
10768
  normalizeScores,
10698
10769
  notBlocked,
10699
10770
  objectiveEval,
10771
+ observationsFromRunRecords,
10772
+ offPolicyEstimateAll,
10700
10773
  outputLengthRubric,
10701
10774
  pairedBootstrap,
10702
10775
  pairedEvalueSequence,
@@ -10719,6 +10792,7 @@ export {
10719
10792
  printDriverSummary,
10720
10793
  prmBestOfN,
10721
10794
  prmEnsembleBestOfN,
10795
+ prmTrainingPairs,
10722
10796
  probeLlm,
10723
10797
  promptBisect,
10724
10798
  proposeSynthesisTargets,
@@ -10734,6 +10808,7 @@ export {
10734
10808
  regexMatches,
10735
10809
  regressionView,
10736
10810
  releaseTraceEvidenceFromMultiShotTrials,
10811
+ renameVariables,
10737
10812
  renderMarkdown,
10738
10813
  renderMarkdownReport,
10739
10814
  renderPlaybookMarkdown,
@@ -10752,10 +10827,13 @@ export {
10752
10827
  rowCount,
10753
10828
  rowWhere,
10754
10829
  rubricPredictiveValidity,
10830
+ runAdaptationCurve,
10755
10831
  runAgentControlLoop,
10756
10832
  runAssertions,
10757
10833
  runBehavioralCanaries,
10758
10834
  runCanaries,
10835
+ runComputeCurve,
10836
+ runContaminationProbe,
10759
10837
  runCounterfactual,
10760
10838
  runE2EWorkflow,
10761
10839
  runEvalCampaign,
@@ -10771,11 +10849,13 @@ export {
10771
10849
  runPromptEvolution,
10772
10850
  runProposeReview,
10773
10851
  runProposeReviewAsControlLoop,
10852
+ runRLCampaign,
10774
10853
  runReferenceReplay,
10775
10854
  runSelfPlay,
10776
10855
  runSemanticConceptJudge,
10777
10856
  runTestGradedScenario,
10778
10857
  runsForScenario,
10858
+ runwiseStepRewardSummary,
10779
10859
  scalarScore,
10780
10860
  scanForMuffledGates,
10781
10861
  scoreAllProjects,
@@ -10788,12 +10868,16 @@ export {
10788
10868
  scoreTraceInsightReadiness,
10789
10869
  securityJudge,
10790
10870
  selectHarnessVariant,
10871
+ selfConsistency,
10872
+ selfNormalizedImportanceWeighting,
10791
10873
  selfPreference,
10792
10874
  sentenceReorderMutator,
10793
10875
  serializeFeedbackTrajectoriesJsonl,
10876
+ shuffleOrder,
10794
10877
  signManifest,
10795
10878
  soc2Report,
10796
10879
  statusAdvanced,
10880
+ stepRewardsToJsonl,
10797
10881
  stopOnNoProgress,
10798
10882
  stopOnRepeatedAction,
10799
10883
  stripFencedJson,
@@ -10805,10 +10889,21 @@ export {
10805
10889
  summaryTable,
10806
10890
  testJudge,
10807
10891
  textInSnapshot,
10892
+ thompsonCurriculum,
10808
10893
  throwIfRunIncomplete,
10894
+ toAnthropicFormat,
10895
+ toDpoJsonl,
10896
+ toDpoRows,
10897
+ toGrpoJsonl,
10898
+ toGrpoRows,
10809
10899
  toLangfuseEnvelope,
10810
10900
  toNdjson,
10901
+ toPrmJsonl,
10902
+ toPrmRows,
10811
10903
  toPrometheusText,
10904
+ toSftJsonl,
10905
+ toSftRows,
10906
+ toTRLFormat,
10812
10907
  tokenizeDomainWords,
10813
10908
  toolIntentAlignmentRubric,
10814
10909
  toolNamesForRun,
@@ -10818,12 +10913,17 @@ export {
10818
10913
  toolWasteView,
10819
10914
  traceAnalystFunctionGroup,
10820
10915
  traceAnalystOnRunComplete,
10916
+ trialToRunRecord,
10821
10917
  trialTraceFromMultiShotTrial,
10918
+ trialsToRunRecords,
10822
10919
  typoMutator,
10823
10920
  urlContains,
10824
10921
  userQuestionsForKnowledgeGaps,
10825
10922
  validateRunRecord,
10923
+ varianceBasedCurriculum,
10924
+ variantAggregateToRunRecord,
10826
10925
  verbosityBias,
10926
+ verificationReportToRunRecord,
10827
10927
  verifyManifest,
10828
10928
  visualDiff,
10829
10929
  viteDeployRunner,