@tangle-network/agent-eval 0.22.0 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +156 -0
- package/README.md +13 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
- package/dist/chunk-VQQSPGSM.js.map +1 -0
- package/dist/{chunk-4W4NCYM2.js → chunk-XPHOZPOM.js} +4 -2
- package/dist/chunk-XPHOZPOM.js.map +1 -0
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +18 -154
- package/dist/index.js +126 -26
- package/dist/index.js.map +1 -1
- package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +7 -5
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +6 -4
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +6 -6
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/three-package-architecture.md +180 -0
- package/package.json +22 -10
- package/dist/chunk-4W4NCYM2.js.map +0 -1
- package/dist/chunk-UAND2LOT.js.map +0 -1
- package/dist/chunk-USHQBPMH.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-B82RSv9C.d.ts +0 -593
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -1,25 +1,29 @@
|
|
|
1
1
|
import { TCloud } from '@tangle-network/tcloud';
|
|
2
|
-
import {
|
|
3
|
-
export {
|
|
4
|
-
import {
|
|
5
|
-
export {
|
|
6
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
2
|
+
import { ReleaseConfidenceThresholds, ReleaseConfidenceScorecard } from './reporting.js';
|
|
3
|
+
export { BootstrapOptions, BootstrapResult, JudgeReplayGateArgs, PairedBootstrapOptions, PairedBootstrapResult, ReleaseConfidenceAxis, ReleaseConfidenceAxisName, ReleaseConfidenceInput, ReleaseConfidenceIssue, ReleaseConfidenceMetrics, ReleaseConfidenceStatus, ReleaseTraceEvidence, RenderReleaseReportOptions, Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, judgeReplayGate, pairedBootstrap, pairedWilcoxon, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport } from './reporting.js';
|
|
4
|
+
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory, E as ControlSeverity, G as ControlEvalResult } from './feedback-trajectory-c43WGtTX.js';
|
|
5
|
+
export { H as ControlActionFailureMode, J as ControlActionOutcome, K as ControlBudget, L as ControlContext, M as ControlDecision, N as ControlRunResult, O as ControlRuntimeConfig, Q as ControlRuntimeError, R as ControlStep, S as ControlStopPolicies, F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, T as StopDecision, U as allCriticalPassed, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, V as objectiveEval, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, W as runAgentControlLoop, B as serializeFeedbackTrajectoriesJsonl, X as stopOnNoProgress, Y as stopOnRepeatedAction, Z as subjectiveEval, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-c43WGtTX.js';
|
|
6
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DvkH87qJ.js';
|
|
7
7
|
import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
|
|
8
8
|
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
|
|
9
|
-
import { A as ActionableSideInfo, O as Objective,
|
|
10
|
-
export {
|
|
9
|
+
import { A as ActionableSideInfo, O as Objective, J as ParetoResult, T as TrialCache, t as TrialResult, E as EvolvableVariant, o as MutateAdapter, V as VariantAggregate } from './summary-report-Ce1r4EYo.js';
|
|
10
|
+
export { a as AsiSeverity, K as DEFAULT_FAILURE_RULES, D as DEFAULT_MUTATION_PRIMITIVES, L as Direction, N as FailureClassification, Q as FailureCluster, U as FailureClusterReport, W as FailureContext, X as FailureRule, Y as GainDistributionBin, Z as GainDistributionFigureSpec, _ as GainDistributionOptions, C as GateDecision, $ as GateEvidence, G as GenerationReport, a0 as HeldOutGate, a1 as HeldOutGateConfig, a2 as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, a3 as ParetoFigureSpec, a4 as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, a5 as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, H as ResearchReport, a6 as ResearchReportCandidate, a7 as ResearchReportDecision, a8 as ResearchReportMethodology, F as ResearchReportOptions, a9 as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, aa as SummaryTable, ab as SummaryTableOptions, ac as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ad as classifyFailure, ae as crowdingDistance, w as defaultMultiShotObjectives, af as dominates, ag as failureClusterView, ah as gainHistogram, ai as paretoChart, aj as paretoFrontier, ak as paretoFrontierWithCrowding, x as parseReflectionResponse, al as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, am as scalarScore, an as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-Ce1r4EYo.js';
|
|
11
11
|
import { a as Run$1, S as Span, f as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, F as FailureClass, g as BudgetSpec, c as ToolSpan, h as RunFilter, L as LlmSpan, J as JudgeSpan } from './store-u47QaJ9G.js';
|
|
12
12
|
export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, m as RunLayer, n as RunStatus, e as SandboxSpan, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
|
|
13
13
|
import { llmSpans } from './traces.js';
|
|
14
14
|
export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
15
|
-
export { F as FileSystemRawProviderSink,
|
|
15
|
+
export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
|
|
16
16
|
import { a as DatasetScenario, b as Dataset, c as DatasetManifest } from './dataset-B9qvlm_o.js';
|
|
17
17
|
export { d as DatasetDifficulty, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B9qvlm_o.js';
|
|
18
|
-
import {
|
|
19
|
-
export {
|
|
20
|
-
import {
|
|
21
|
-
export { b as
|
|
22
|
-
|
|
18
|
+
import { O as OutcomeFilter, a as OutcomeStore } from './sequential-DgU2mFsE.js';
|
|
19
|
+
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, c as InterimReleaseConfidence, d as InterimReleaseConfidenceInput, P as PairedEvalueOptions, e as PairedEvalueSequence, f as PairedEvalueStep, R as RubricOutcomePair, g as RubricPredictiveValidityInput, h as RubricPredictiveValidityReport, i as RubricRanking, S as SequentialDecision, j as evaluateInterimReleaseConfidence, p as pairedEvalueSequence, r as rubricPredictiveValidity } from './sequential-DgU2mFsE.js';
|
|
20
|
+
import { L as LlmClientOptions } from './eval-campaign-Ds5QljIh.js';
|
|
21
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './eval-campaign-Ds5QljIh.js';
|
|
22
|
+
import { L as LayerResult, V as VerifyContext, a as Layer, S as Severity } from './index-ekBXweiQ.js';
|
|
23
|
+
export { A as AdaptationCurve, b as AdaptationPoint, c as AdaptationRunner, d as AdapterContext, e as AdversarialMutation, f as AdversarialScenario, g as AdversarialSearchOptions, h as AdversarialSearchReport, i as AnalyzeOptimizationResultOptions, j as AnalyzeOptimizationResultReport, B as BradleyTerryFit, k as BradleyTerryRating, l as BuildPairwiseFromCampaignInput, C as CellObservation, m as CompareCurvesResult, n as ComputeBestOfNOptions, o as ComputeBestOfNResult, p as ComputeCurve, q as ComputeCurveBudget, r as ComputeCurvePoint, s as ContaminationProbeInput, t as ContaminationProbeOptions, u as ContaminationProbeReport, v as CurriculumAllocation, D as DetectRewardHackingInput, w as DpoExportRow, x as DpoLookups, E as EloOptions, y as ExtractPreferencesOptions, z as ExtractStepRewardsOptions, F as Finding, G as GrpoExportRow, H as GrpoLookups, I as LayerStatus, M as MultiLayerVerifier, O as OffPolicyEstimate, J as OffPolicyOptions, K as OffPolicyTrajectory, P as PairwiseOutcome, N as ParetoPointInput, Q as PredictiveValidityResearcher, R as PredictiveValidityResearcherOptions, T as PreferenceExtractionReport, U as PreferenceStrategy, W as PreferenceTriple, X as PrmExportRow, Y as PrmLookups, Z as PrmTrainingTriple, _ as RLCampaignResult, $ as RewardHackingFinding, a0 as RewardHackingReport, a1 as RewardHackingSignal, a2 as RunAdaptationCurveOptions, a3 as RunComputeCurveOptions, a4 as RunRLCampaignOptions, a5 as RunwiseStepSummary, a6 as ScenarioPerturbation, a7 as ScenarioPerturbationKind, a8 as SelfConsistencyOptions, a9 as SelfConsistencyResult, aa as SftExportRow, ab as SftLookups, ac as StepReward, ad as StepRewardJsonlRow, ae as StepScorer, af as ThompsonCurriculumOptions, ag as VarianceCurriculumOptions, ah as VerifiableReward, ai as VerifiableRewardExtractionOptions, aj as VerifiableRewardSource, ak as VerificationReport, al as VerifyOptions, am as adversarialScenarioSearch, an as analyzeOptimizationResult, ao as applyEloUpdate, ap as bestOfN, aq as 
buildPairwiseFromCampaign, ar as compareAdaptationCurves, as as detectRewardHacking, at as doublyRobust, au as extractPreferences, av as extractStepRewards, aw as extractVerifiableReward, ax as extractVerifiableRewardsFromRecords, ay as filterDeterministicallyRewarded, az as firstPassK, aA as fitBradleyTerry, aB as gradeSemanticStatus, aC as injectIrrelevantClause, aD as inverseProbabilityWeighting, aE as observationsFromRunRecords, aF as offPolicyEstimateAll, aG as prmTrainingPairs, aH as renameVariables, aI as runAdaptationCurve, aJ as runComputeCurve, aK as runContaminationProbe, aL as runRLCampaign, aM as runwiseStepRewardSummary, aN as selfConsistency, aO as selfNormalizedImportanceWeighting, aP as shuffleOrder, aQ as stepRewardsToJsonl, aR as thompsonCurriculum, aS as toAnthropicFormat, aT as toDpoJsonl, aU as toDpoRows, aV as toGrpoJsonl, aW as toGrpoRows, aX as toPrmJsonl, aY as toPrmRows, aZ as toSftJsonl, a_ as toSftRows, a$ as toTRLFormat, b0 as trialToRunRecord, b1 as trialsToRunRecords, b2 as varianceBasedCurriculum, b3 as variantAggregateToRunRecord, b4 as verificationReportToRunRecord } from './index-ekBXweiQ.js';
|
|
24
|
+
import { R as RunRecord } from './run-record-DNiOMBrZ.js';
|
|
25
|
+
export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-DNiOMBrZ.js';
|
|
26
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DDTlbHEK.js';
|
|
23
27
|
import '@ax-llm/ax';
|
|
24
28
|
|
|
25
29
|
interface Scenario {
|
|
@@ -4226,146 +4230,6 @@ interface UseCaseSignals {
|
|
|
4226
4230
|
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
4227
4231
|
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
4228
4232
|
|
|
4229
|
-
/**
|
|
4230
|
-
* Multi-layer verifier — ordered pipeline of verification layers.
|
|
4231
|
-
*
|
|
4232
|
-
* Different contract from {@link JudgeRunner} (which runs parallel
|
|
4233
|
-
* specs against a sandbox). MultiLayerVerifier is a DAG of layers
|
|
4234
|
-
* (install → typecheck → build → lint → serve → semantic → …) with
|
|
4235
|
-
* dependency-based skip, per-layer findings, soft-fail semantics, and
|
|
4236
|
-
* an aggregated `blendedScore` across all passed layers.
|
|
4237
|
-
*
|
|
4238
|
-
* Use when you want:
|
|
4239
|
-
* - ordered stages where a failing upstream stage skips downstream ones
|
|
4240
|
-
* - each stage produces rich `findings` (severity + message + evidence)
|
|
4241
|
-
* - a single composite score across stages with per-stage weights
|
|
4242
|
-
* - soft-fail stages whose failure doesn't abort the pipeline
|
|
4243
|
-
*
|
|
4244
|
-
* Use {@link JudgeRunner} when you want:
|
|
4245
|
-
* - N independent judges running in parallel against the same artifact
|
|
4246
|
-
* - no inter-judge dependencies
|
|
4247
|
-
* - boolean `passed` per judge + overall
|
|
4248
|
-
*
|
|
4249
|
-
* Both primitives compose — JudgeRunner can be invoked as a single
|
|
4250
|
-
* layer inside a MultiLayerVerifier if that suits the caller.
|
|
4251
|
-
*/
|
|
4252
|
-
type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
|
|
4253
|
-
type Severity = 'critical' | 'major' | 'minor' | 'info';
|
|
4254
|
-
interface Finding {
|
|
4255
|
-
severity: Severity;
|
|
4256
|
-
message: string;
|
|
4257
|
-
evidence?: string;
|
|
4258
|
-
/** Optional layer name the finding belongs to (set by the verifier if omitted). */
|
|
4259
|
-
layer?: string;
|
|
4260
|
-
/**
|
|
4261
|
-
* Free-form structured payload — used by `multiToolchainLayer` to attach
|
|
4262
|
-
* `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
|
|
4263
|
-
* Renderers MAY interrogate; agent-eval primitives never assume shape.
|
|
4264
|
-
*/
|
|
4265
|
-
detail?: Record<string, unknown>;
|
|
4266
|
-
}
|
|
4267
|
-
interface LayerResult {
|
|
4268
|
-
layer: string;
|
|
4269
|
-
status: LayerStatus;
|
|
4270
|
-
/** 0..1 score, optional — layers that don't produce a numeric score omit. */
|
|
4271
|
-
score?: number;
|
|
4272
|
-
durationMs: number;
|
|
4273
|
-
findings: Finding[];
|
|
4274
|
-
/** Short human-readable summary (one line). */
|
|
4275
|
-
reason?: string;
|
|
4276
|
-
/**
|
|
4277
|
-
* Numeric layer-level diagnostics: error counts, warning counts,
|
|
4278
|
-
* cyclomatic complexity, total adapter wall-time, etc. Keyed by
|
|
4279
|
-
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
4280
|
-
* Renderers that know the keys can display them; ones that don't,
|
|
4281
|
-
* ignore. Free-form on purpose — consumers type the value shape in
|
|
4282
|
-
* their own namespace. Added in 0.10.
|
|
4283
|
-
*/
|
|
4284
|
-
diagnostics?: Record<string, number | null>;
|
|
4285
|
-
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
|
4286
|
-
detail?: Record<string, unknown>;
|
|
4287
|
-
}
|
|
4288
|
-
interface VerifyContext<Env = unknown> {
|
|
4289
|
-
/** Per-run opaque context the caller provides. Layers destructure what they need. */
|
|
4290
|
-
env: Env;
|
|
4291
|
-
/** Previously-computed results from layers that already ran. */
|
|
4292
|
-
prior: Record<string, LayerResult>;
|
|
4293
|
-
/** Signal — if aborted, layers MUST bail within reasonable wall. */
|
|
4294
|
-
signal: AbortSignal;
|
|
4295
|
-
}
|
|
4296
|
-
interface Layer<Env = unknown> {
|
|
4297
|
-
name: string;
|
|
4298
|
-
/** Stages that must have `status: 'pass'` before this layer runs. */
|
|
4299
|
-
dependsOn?: string[];
|
|
4300
|
-
/**
|
|
4301
|
-
* Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
|
|
4302
|
-
* contribute findings but not score.
|
|
4303
|
-
*/
|
|
4304
|
-
weight?: number;
|
|
4305
|
-
/**
|
|
4306
|
-
* If true, a `fail` status contributes to `blendedScore` (as 0) instead of
|
|
4307
|
-
* being dropped — use for layers whose failure is a real signal. Default:
|
|
4308
|
-
* fail drops from numerator + denominator, matching VB's existing semantics.
|
|
4309
|
-
*/
|
|
4310
|
-
failContributesToScore?: boolean;
|
|
4311
|
-
/** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
|
|
4312
|
-
capMs?: number;
|
|
4313
|
-
run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
4314
|
-
}
|
|
4315
|
-
interface VerifyOptions<Env = unknown> {
|
|
4316
|
-
env: Env;
|
|
4317
|
-
/**
|
|
4318
|
-
* Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
|
|
4319
|
-
* omits a cap. The verifier short-circuits remaining layers on overall cap.
|
|
4320
|
-
*/
|
|
4321
|
-
overallCapMs?: number;
|
|
4322
|
-
/** Called with each layer result as it completes. */
|
|
4323
|
-
onLayer?: (result: LayerResult) => void;
|
|
4324
|
-
}
|
|
4325
|
-
interface VerificationReport {
|
|
4326
|
-
layers: LayerResult[];
|
|
4327
|
-
passCount: number;
|
|
4328
|
-
failCount: number;
|
|
4329
|
-
skippedCount: number;
|
|
4330
|
-
errorCount: number;
|
|
4331
|
-
/** True iff at least one scored layer ran AND every scored layer passed. */
|
|
4332
|
-
allPass: boolean;
|
|
4333
|
-
/**
|
|
4334
|
-
* Weighted mean of `score` across contributing layers. 0 when no layers
|
|
4335
|
-
* contributed. See {@link Layer.failContributesToScore} for fail semantics.
|
|
4336
|
-
*/
|
|
4337
|
-
blendedScore: number;
|
|
4338
|
-
durationMs: number;
|
|
4339
|
-
startedAt: string;
|
|
4340
|
-
finishedAt: string;
|
|
4341
|
-
}
|
|
4342
|
-
/**
|
|
4343
|
-
* Grade a semantic-concept-style judge result into a single layer status.
|
|
4344
|
-
*
|
|
4345
|
-
* Pass when overall score >= threshold AND no critical-severity concept gap.
|
|
4346
|
-
* Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
|
|
4347
|
-
*
|
|
4348
|
-
* Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
|
|
4349
|
-
* too strict — a single concept at 6/10 failed the entire layer despite
|
|
4350
|
-
* overall score being >= 0.7. Now we trust the judge's own `severity` field:
|
|
4351
|
-
* `critical` findings veto; `major`/`minor` reduce the score but don't veto.
|
|
4352
|
-
*/
|
|
4353
|
-
declare function gradeSemanticStatus(input: {
|
|
4354
|
-
score: number;
|
|
4355
|
-
findings: Array<{
|
|
4356
|
-
severity: Severity;
|
|
4357
|
-
present?: boolean;
|
|
4358
|
-
score?: number;
|
|
4359
|
-
}>;
|
|
4360
|
-
available: boolean;
|
|
4361
|
-
threshold?: number;
|
|
4362
|
-
}): LayerStatus;
|
|
4363
|
-
declare class MultiLayerVerifier<Env = unknown> {
|
|
4364
|
-
private readonly layers;
|
|
4365
|
-
constructor(layers: Layer<Env>[]);
|
|
4366
|
-
run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
|
|
4367
|
-
}
|
|
4368
|
-
|
|
4369
4233
|
/**
|
|
4370
4234
|
* CommandRunner — abstract subprocess execution surface.
|
|
4371
4235
|
*
|
|
@@ -6038,4 +5902,4 @@ interface OrthogonalityResult {
|
|
|
6038
5902
|
}
|
|
6039
5903
|
declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
|
|
6040
5904
|
|
|
6041
|
-
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type 
CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, 
type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type 
ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, 
type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type 
WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, 
jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, 
wilcoxonSignedRank, wranglerDeployRunner };
|
|
5905
|
+
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type 
CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type 
HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, type LayerCorrelation, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type 
ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type 
SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, 
linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
stopOnNoProgress,
|
|
20
20
|
stopOnRepeatedAction,
|
|
21
21
|
subjectiveEval
|
|
22
|
-
} from "./chunk-ARZ6BEV6.js";
|
|
22
|
+
} from "./chunk-V5QSWN7L.js";
|
|
23
23
|
import {
|
|
24
24
|
CallbackResearcher,
|
|
25
25
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
@@ -46,7 +46,6 @@ import {
|
|
|
46
46
|
renderPreferenceMemoryMarkdown,
|
|
47
47
|
replayFeedbackTrajectories,
|
|
48
48
|
replayFeedbackTrajectory,
|
|
49
|
-
runEvalCampaign,
|
|
50
49
|
runMultiShotOptimization,
|
|
51
50
|
runPromptEvolution,
|
|
52
51
|
scalarScore,
|
|
@@ -54,25 +53,89 @@ import {
|
|
|
54
53
|
summarizePreferenceMemory,
|
|
55
54
|
trialTraceFromMultiShotTrial,
|
|
56
55
|
withAssignedFeedbackSplit
|
|
57
|
-
} from "./chunk-USHQBPMH.js";
|
|
56
|
+
} from "./chunk-VQQSPGSM.js";
|
|
58
57
|
import {
|
|
59
58
|
RunRecordValidationError,
|
|
60
59
|
isRunRecord,
|
|
61
60
|
parseRunRecordSafe,
|
|
62
61
|
roundTripRunRecord,
|
|
63
62
|
validateRunRecord
|
|
64
|
-
} from "./chunk-YUFXO3TU.js";
|
|
63
|
+
} from "./chunk-QBW3YBTR.js";
|
|
65
64
|
import {
|
|
66
65
|
assertReleaseConfidence,
|
|
67
66
|
bootstrapCi,
|
|
68
|
-
evaluateInterimReleaseConfidence,
|
|
69
67
|
evaluateReleaseConfidence,
|
|
70
68
|
judgeReplayGate,
|
|
71
|
-
pairedEvalueSequence,
|
|
72
69
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
73
|
-
renderReleaseReport,
|
|
70
|
+
renderReleaseReport
|
|
71
|
+
} from "./chunk-7EAUOUQS.js";
|
|
72
|
+
import {
|
|
73
|
+
PredictiveValidityResearcher,
|
|
74
|
+
adversarialScenarioSearch,
|
|
75
|
+
analyzeOptimizationResult,
|
|
76
|
+
applyEloUpdate,
|
|
77
|
+
bestOfN,
|
|
78
|
+
buildPairwiseFromCampaign,
|
|
79
|
+
compareAdaptationCurves,
|
|
80
|
+
detectRewardHacking,
|
|
81
|
+
doublyRobust,
|
|
82
|
+
extractPreferences,
|
|
83
|
+
extractStepRewards,
|
|
84
|
+
extractVerifiableReward,
|
|
85
|
+
extractVerifiableRewardsFromRecords,
|
|
86
|
+
filterDeterministicallyRewarded,
|
|
87
|
+
firstPassK,
|
|
88
|
+
fitBradleyTerry,
|
|
89
|
+
injectIrrelevantClause,
|
|
90
|
+
inverseProbabilityWeighting,
|
|
91
|
+
observationsFromRunRecords,
|
|
92
|
+
offPolicyEstimateAll,
|
|
93
|
+
prmTrainingPairs,
|
|
94
|
+
renameVariables,
|
|
95
|
+
runAdaptationCurve,
|
|
96
|
+
runComputeCurve,
|
|
97
|
+
runContaminationProbe,
|
|
98
|
+
runRLCampaign,
|
|
99
|
+
runwiseStepRewardSummary,
|
|
100
|
+
selfConsistency,
|
|
101
|
+
selfNormalizedImportanceWeighting,
|
|
102
|
+
shuffleOrder,
|
|
103
|
+
stepRewardsToJsonl,
|
|
104
|
+
thompsonCurriculum,
|
|
105
|
+
toAnthropicFormat,
|
|
106
|
+
toDpoJsonl,
|
|
107
|
+
toDpoRows,
|
|
108
|
+
toGrpoJsonl,
|
|
109
|
+
toGrpoRows,
|
|
110
|
+
toPrmJsonl,
|
|
111
|
+
toPrmRows,
|
|
112
|
+
toSftJsonl,
|
|
113
|
+
toSftRows,
|
|
114
|
+
toTRLFormat,
|
|
115
|
+
trialToRunRecord,
|
|
116
|
+
trialsToRunRecords,
|
|
117
|
+
varianceBasedCurriculum,
|
|
118
|
+
variantAggregateToRunRecord,
|
|
119
|
+
verificationReportToRunRecord
|
|
120
|
+
} from "./chunk-LZKIOBG2.js";
|
|
121
|
+
import {
|
|
122
|
+
runEvalCampaign
|
|
123
|
+
} from "./chunk-EXGR4XEM.js";
|
|
124
|
+
import {
|
|
125
|
+
LlmCallError,
|
|
126
|
+
LlmClient,
|
|
127
|
+
LlmRouteAssertionError,
|
|
128
|
+
assertLlmRoute,
|
|
129
|
+
callLlm,
|
|
130
|
+
callLlmJson,
|
|
131
|
+
probeLlm,
|
|
132
|
+
stripFencedJson
|
|
133
|
+
} from "./chunk-KAO3Q65R.js";
|
|
134
|
+
import {
|
|
135
|
+
evaluateInterimReleaseConfidence,
|
|
136
|
+
pairedEvalueSequence,
|
|
74
137
|
rubricPredictiveValidity
|
|
75
|
-
} from "./chunk-UAND2LOT.js";
|
|
138
|
+
} from "./chunk-AXHNWLIX.js";
|
|
76
139
|
import {
|
|
77
140
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
78
141
|
benjaminiHochberg,
|
|
@@ -145,12 +208,19 @@ import {
|
|
|
145
208
|
toolSpans,
|
|
146
209
|
traceAnalystFunctionGroup,
|
|
147
210
|
traceAnalystOnRunComplete
|
|
148
|
-
} from "./chunk-4W4NCYM2.js";
|
|
211
|
+
} from "./chunk-XPHOZPOM.js";
|
|
149
212
|
import {
|
|
150
213
|
RunIntegrityError,
|
|
151
214
|
assertRunCaptured,
|
|
152
215
|
throwIfRunIncomplete
|
|
153
216
|
} from "./chunk-QUKKGHTZ.js";
|
|
217
|
+
import {
|
|
218
|
+
FileSystemRawProviderSink,
|
|
219
|
+
InMemoryRawProviderSink,
|
|
220
|
+
NoopRawProviderSink,
|
|
221
|
+
defaultProviderRedactor,
|
|
222
|
+
providerFromBaseUrl
|
|
223
|
+
} from "./chunk-SQQLHODJ.js";
|
|
154
224
|
import {
|
|
155
225
|
TraceEmitter,
|
|
156
226
|
llmSpanFromProvider
|
|
@@ -162,23 +232,6 @@ import {
|
|
|
162
232
|
signManifest,
|
|
163
233
|
verifyManifest
|
|
164
234
|
} from "./chunk-6M774GY6.js";
|
|
165
|
-
import {
|
|
166
|
-
LlmCallError,
|
|
167
|
-
LlmClient,
|
|
168
|
-
LlmRouteAssertionError,
|
|
169
|
-
assertLlmRoute,
|
|
170
|
-
callLlm,
|
|
171
|
-
callLlmJson,
|
|
172
|
-
probeLlm,
|
|
173
|
-
stripFencedJson
|
|
174
|
-
} from "./chunk-KAO3Q65R.js";
|
|
175
|
-
import {
|
|
176
|
-
FileSystemRawProviderSink,
|
|
177
|
-
InMemoryRawProviderSink,
|
|
178
|
-
NoopRawProviderSink,
|
|
179
|
-
defaultProviderRedactor,
|
|
180
|
-
providerFromBaseUrl
|
|
181
|
-
} from "./chunk-SQQLHODJ.js";
|
|
182
235
|
import "./chunk-PZ5AY32C.js";
|
|
183
236
|
|
|
184
237
|
// src/client.ts
|
|
@@ -10498,6 +10551,7 @@ export {
|
|
|
10498
10551
|
OTEL_AGENT_EVAL_SCOPE,
|
|
10499
10552
|
OtlpFileTraceStore,
|
|
10500
10553
|
PairwiseSteeringOptimizer,
|
|
10554
|
+
PredictiveValidityResearcher,
|
|
10501
10555
|
PrmGrader,
|
|
10502
10556
|
ProductClient,
|
|
10503
10557
|
ProjectRegistry,
|
|
@@ -10527,12 +10581,15 @@ export {
|
|
|
10527
10581
|
UNIVERSAL_FINDERS,
|
|
10528
10582
|
acquisitionPlansForKnowledgeGaps,
|
|
10529
10583
|
adversarialJudge,
|
|
10584
|
+
adversarialScenarioSearch,
|
|
10530
10585
|
aggregateLlm,
|
|
10531
10586
|
aggregateRunScore,
|
|
10532
10587
|
allCriticalPassed,
|
|
10533
10588
|
analyzeAntiSlop,
|
|
10589
|
+
analyzeOptimizationResult,
|
|
10534
10590
|
analyzeSeries,
|
|
10535
10591
|
analyzeTraces,
|
|
10592
|
+
applyEloUpdate,
|
|
10536
10593
|
argHash,
|
|
10537
10594
|
assertLlmRoute,
|
|
10538
10595
|
assertReleaseConfidence,
|
|
@@ -10542,12 +10599,14 @@ export {
|
|
|
10542
10599
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
10543
10600
|
benchmarks_exports as benchmarks,
|
|
10544
10601
|
benjaminiHochberg,
|
|
10602
|
+
bestOfN,
|
|
10545
10603
|
bhAdjust,
|
|
10546
10604
|
bisect,
|
|
10547
10605
|
blockingKnowledgeEval,
|
|
10548
10606
|
bonferroni,
|
|
10549
10607
|
bootstrapCi,
|
|
10550
10608
|
budgetBreachView,
|
|
10609
|
+
buildPairwiseFromCampaign,
|
|
10551
10610
|
buildReflectionPrompt,
|
|
10552
10611
|
buildReviewerPrompt,
|
|
10553
10612
|
buildTraceAnalystTools,
|
|
@@ -10573,6 +10632,7 @@ export {
|
|
|
10573
10632
|
coherenceJudge,
|
|
10574
10633
|
collectionPreserved,
|
|
10575
10634
|
commitBisect,
|
|
10635
|
+
compareAdaptationCurves,
|
|
10576
10636
|
compareReferenceReplay,
|
|
10577
10637
|
compareToBaseline,
|
|
10578
10638
|
compilerJudge,
|
|
@@ -10609,9 +10669,11 @@ export {
|
|
|
10609
10669
|
defaultTraceInsightPanel,
|
|
10610
10670
|
deployGateLayer,
|
|
10611
10671
|
describeTraceInsightScope,
|
|
10672
|
+
detectRewardHacking,
|
|
10612
10673
|
distillPlaybook,
|
|
10613
10674
|
domainEvidencePattern,
|
|
10614
10675
|
dominates,
|
|
10676
|
+
doublyRobust,
|
|
10615
10677
|
estimateCost,
|
|
10616
10678
|
estimateTokens,
|
|
10617
10679
|
euAiActReport,
|
|
@@ -10628,6 +10690,10 @@ export {
|
|
|
10628
10690
|
exportTrainingData,
|
|
10629
10691
|
extractAssetUrls,
|
|
10630
10692
|
extractErrorCount,
|
|
10693
|
+
extractPreferences,
|
|
10694
|
+
extractStepRewards,
|
|
10695
|
+
extractVerifiableReward,
|
|
10696
|
+
extractVerifiableRewardsFromRecords,
|
|
10631
10697
|
failureClusterView,
|
|
10632
10698
|
feedbackTrajectoriesToDatasetScenarios,
|
|
10633
10699
|
feedbackTrajectoriesToOptimizerRows,
|
|
@@ -10635,12 +10701,15 @@ export {
|
|
|
10635
10701
|
feedbackTrajectoryToOptimizerRow,
|
|
10636
10702
|
fileContains,
|
|
10637
10703
|
fileExists,
|
|
10704
|
+
filterDeterministicallyRewarded,
|
|
10638
10705
|
findAutoMatchNoExpectation,
|
|
10639
10706
|
findConstructorCwdDropped,
|
|
10640
10707
|
findFallbackToPass,
|
|
10641
10708
|
findLiteralTruePass,
|
|
10642
10709
|
findSkipCountsAsPass,
|
|
10643
10710
|
firstDivergenceView,
|
|
10711
|
+
firstPassK,
|
|
10712
|
+
fitBradleyTerry,
|
|
10644
10713
|
flowLayer,
|
|
10645
10714
|
formatBenchmarkReport,
|
|
10646
10715
|
formatDriverReport,
|
|
@@ -10656,12 +10725,14 @@ export {
|
|
|
10656
10725
|
inMemoryReferenceReplayStore,
|
|
10657
10726
|
inMemoryReviewStore,
|
|
10658
10727
|
inferDomainKeywords,
|
|
10728
|
+
injectIrrelevantClause,
|
|
10659
10729
|
integrationAsi,
|
|
10660
10730
|
integrationGateEvals,
|
|
10661
10731
|
integrationInvokeFailedPayload,
|
|
10662
10732
|
integrationManifestResolvedPayload,
|
|
10663
10733
|
integrationManifestValidatedPayload,
|
|
10664
10734
|
interRaterReliability,
|
|
10735
|
+
inverseProbabilityWeighting,
|
|
10665
10736
|
iqr,
|
|
10666
10737
|
isJudgeSpan,
|
|
10667
10738
|
isLlmSpan,
|
|
@@ -10697,6 +10768,8 @@ export {
|
|
|
10697
10768
|
normalizeScores,
|
|
10698
10769
|
notBlocked,
|
|
10699
10770
|
objectiveEval,
|
|
10771
|
+
observationsFromRunRecords,
|
|
10772
|
+
offPolicyEstimateAll,
|
|
10700
10773
|
outputLengthRubric,
|
|
10701
10774
|
pairedBootstrap,
|
|
10702
10775
|
pairedEvalueSequence,
|
|
@@ -10719,6 +10792,7 @@ export {
|
|
|
10719
10792
|
printDriverSummary,
|
|
10720
10793
|
prmBestOfN,
|
|
10721
10794
|
prmEnsembleBestOfN,
|
|
10795
|
+
prmTrainingPairs,
|
|
10722
10796
|
probeLlm,
|
|
10723
10797
|
promptBisect,
|
|
10724
10798
|
proposeSynthesisTargets,
|
|
@@ -10734,6 +10808,7 @@ export {
|
|
|
10734
10808
|
regexMatches,
|
|
10735
10809
|
regressionView,
|
|
10736
10810
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
10811
|
+
renameVariables,
|
|
10737
10812
|
renderMarkdown,
|
|
10738
10813
|
renderMarkdownReport,
|
|
10739
10814
|
renderPlaybookMarkdown,
|
|
@@ -10752,10 +10827,13 @@ export {
|
|
|
10752
10827
|
rowCount,
|
|
10753
10828
|
rowWhere,
|
|
10754
10829
|
rubricPredictiveValidity,
|
|
10830
|
+
runAdaptationCurve,
|
|
10755
10831
|
runAgentControlLoop,
|
|
10756
10832
|
runAssertions,
|
|
10757
10833
|
runBehavioralCanaries,
|
|
10758
10834
|
runCanaries,
|
|
10835
|
+
runComputeCurve,
|
|
10836
|
+
runContaminationProbe,
|
|
10759
10837
|
runCounterfactual,
|
|
10760
10838
|
runE2EWorkflow,
|
|
10761
10839
|
runEvalCampaign,
|
|
@@ -10771,11 +10849,13 @@ export {
|
|
|
10771
10849
|
runPromptEvolution,
|
|
10772
10850
|
runProposeReview,
|
|
10773
10851
|
runProposeReviewAsControlLoop,
|
|
10852
|
+
runRLCampaign,
|
|
10774
10853
|
runReferenceReplay,
|
|
10775
10854
|
runSelfPlay,
|
|
10776
10855
|
runSemanticConceptJudge,
|
|
10777
10856
|
runTestGradedScenario,
|
|
10778
10857
|
runsForScenario,
|
|
10858
|
+
runwiseStepRewardSummary,
|
|
10779
10859
|
scalarScore,
|
|
10780
10860
|
scanForMuffledGates,
|
|
10781
10861
|
scoreAllProjects,
|
|
@@ -10788,12 +10868,16 @@ export {
|
|
|
10788
10868
|
scoreTraceInsightReadiness,
|
|
10789
10869
|
securityJudge,
|
|
10790
10870
|
selectHarnessVariant,
|
|
10871
|
+
selfConsistency,
|
|
10872
|
+
selfNormalizedImportanceWeighting,
|
|
10791
10873
|
selfPreference,
|
|
10792
10874
|
sentenceReorderMutator,
|
|
10793
10875
|
serializeFeedbackTrajectoriesJsonl,
|
|
10876
|
+
shuffleOrder,
|
|
10794
10877
|
signManifest,
|
|
10795
10878
|
soc2Report,
|
|
10796
10879
|
statusAdvanced,
|
|
10880
|
+
stepRewardsToJsonl,
|
|
10797
10881
|
stopOnNoProgress,
|
|
10798
10882
|
stopOnRepeatedAction,
|
|
10799
10883
|
stripFencedJson,
|
|
@@ -10805,10 +10889,21 @@ export {
|
|
|
10805
10889
|
summaryTable,
|
|
10806
10890
|
testJudge,
|
|
10807
10891
|
textInSnapshot,
|
|
10892
|
+
thompsonCurriculum,
|
|
10808
10893
|
throwIfRunIncomplete,
|
|
10894
|
+
toAnthropicFormat,
|
|
10895
|
+
toDpoJsonl,
|
|
10896
|
+
toDpoRows,
|
|
10897
|
+
toGrpoJsonl,
|
|
10898
|
+
toGrpoRows,
|
|
10809
10899
|
toLangfuseEnvelope,
|
|
10810
10900
|
toNdjson,
|
|
10901
|
+
toPrmJsonl,
|
|
10902
|
+
toPrmRows,
|
|
10811
10903
|
toPrometheusText,
|
|
10904
|
+
toSftJsonl,
|
|
10905
|
+
toSftRows,
|
|
10906
|
+
toTRLFormat,
|
|
10812
10907
|
tokenizeDomainWords,
|
|
10813
10908
|
toolIntentAlignmentRubric,
|
|
10814
10909
|
toolNamesForRun,
|
|
@@ -10818,12 +10913,17 @@ export {
|
|
|
10818
10913
|
toolWasteView,
|
|
10819
10914
|
traceAnalystFunctionGroup,
|
|
10820
10915
|
traceAnalystOnRunComplete,
|
|
10916
|
+
trialToRunRecord,
|
|
10821
10917
|
trialTraceFromMultiShotTrial,
|
|
10918
|
+
trialsToRunRecords,
|
|
10822
10919
|
typoMutator,
|
|
10823
10920
|
urlContains,
|
|
10824
10921
|
userQuestionsForKnowledgeGaps,
|
|
10825
10922
|
validateRunRecord,
|
|
10923
|
+
varianceBasedCurriculum,
|
|
10924
|
+
variantAggregateToRunRecord,
|
|
10826
10925
|
verbosityBias,
|
|
10926
|
+
verificationReportToRunRecord,
|
|
10827
10927
|
verifyManifest,
|
|
10828
10928
|
visualDiff,
|
|
10829
10929
|
viteDeployRunner,
|