@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +102 -1
  2. package/README.md +4 -0
  3. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  6. package/dist/chunk-6M774GY6.js +53 -0
  7. package/dist/chunk-6M774GY6.js.map +1 -0
  8. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  9. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  10. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  11. package/dist/chunk-QUKKGHTZ.js +121 -0
  12. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  13. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  14. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  15. package/dist/chunk-UAND2LOT.js +738 -0
  16. package/dist/chunk-UAND2LOT.js.map +1 -0
  17. package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
  18. package/dist/chunk-USHQBPMH.js.map +1 -0
  19. package/dist/cli.js +3 -3
  20. package/dist/index.d.ts +10 -284
  21. package/dist/index.js +39 -19
  22. package/dist/index.js.map +1 -1
  23. package/dist/integrity-K2oVlF57.d.ts +210 -0
  24. package/dist/openapi.json +1 -1
  25. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  26. package/dist/optimization.d.ts +6 -144
  27. package/dist/optimization.js +9 -2
  28. package/dist/reporting-B82RSv9C.d.ts +593 -0
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/reporting.js +15 -8
  31. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  32. package/dist/traces.d.ts +101 -181
  33. package/dist/traces.js +16 -5
  34. package/dist/wire/index.js +3 -3
  35. package/docs/research-report-methodology.md +19 -4
  36. package/docs/wire-protocol.md +1 -1
  37. package/package.json +2 -2
  38. package/dist/chunk-3IX6QTB7.js.map +0 -1
  39. package/dist/chunk-HRZELXCR.js.map +0 -1
  40. package/dist/chunk-KRR4VMH7.js +0 -423
  41. package/dist/chunk-KRR4VMH7.js.map +0 -1
  42. package/dist/chunk-WOK2RTWG.js.map +0 -1
  43. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  44. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  45. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -1,22 +1,24 @@
1
1
  import { TCloud } from '@tangle-network/tcloud';
2
- import { R as ReleaseConfidenceThresholds, a as ReleaseConfidenceScorecard } from './reporting-Da2ihlcM.js';
3
- export { B as BootstrapOptions, b as BootstrapResult, D as DEFAULT_FAILURE_RULES, F as FailureClassification, c as FailureCluster, d as FailureClusterReport, e as FailureContext, f as FailureRule, G as GainDistributionBin, g as GainDistributionFigureSpec, h as GainDistributionOptions, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, i as PairedBootstrapResult, j as ParetoFigureSpec, k as ParetoPoint, l as RESEARCH_REPORT_HARD_PAIR_FLOOR, m as ReleaseConfidenceAxis, n as ReleaseConfidenceAxisName, o as ReleaseConfidenceInput, p as ReleaseConfidenceIssue, q as ReleaseConfidenceMetrics, r as ReleaseConfidenceStatus, s as ReleaseTraceEvidence, t as RenderReleaseReportOptions, u as ResearchReport, v as ResearchReportCandidate, w as ResearchReportDecision, x as ResearchReportMethodology, y as ResearchReportOptions, z as ResearchReportRecommendation, S as SummaryTable, A as SummaryTableOptions, C as SummaryTableRow, V as Verdict, E as assertReleaseConfidence, H as bhAdjust, I as bootstrapCi, K as classifyFailure, L as evaluateReleaseConfidence, M as failureClusterView, N as gainHistogram, O as judgeReplayGate, Q as pairedBootstrap, T as pairedWilcoxon, U as paretoChart, W as releaseTraceEvidenceFromMultiShotTrials, X as renderReleaseReport, Y as researchReport, Z as summaryTable } from './reporting-Da2ihlcM.js';
2
+ import { R as ReleaseConfidenceThresholds, a as ReleaseConfidenceScorecard, O as OutcomeFilter, b as OutcomeStore } from './reporting-B82RSv9C.js';
3
+ export { B as BootstrapOptions, c as BootstrapResult, D as DeploymentOutcome, F as FileSystemOutcomeStore, d as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, e as InterimReleaseConfidence, f as InterimReleaseConfidenceInput, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, g as PairedBootstrapResult, h as PairedEvalueOptions, i as PairedEvalueSequence, j as PairedEvalueStep, k as ReleaseConfidenceAxis, l as ReleaseConfidenceAxisName, m as ReleaseConfidenceInput, n as ReleaseConfidenceIssue, o as ReleaseConfidenceMetrics, p as ReleaseConfidenceStatus, q as ReleaseTraceEvidence, r as RenderReleaseReportOptions, s as RubricOutcomePair, t as RubricPredictiveValidityInput, u as RubricPredictiveValidityReport, v as RubricRanking, S as SequentialDecision, V as Verdict, w as assertReleaseConfidence, x as bhAdjust, y as bootstrapCi, z as evaluateInterimReleaseConfidence, A as evaluateReleaseConfidence, C as judgeReplayGate, E as pairedBootstrap, G as pairedEvalueSequence, H as pairedWilcoxon, K as releaseTraceEvidenceFromMultiShotTrials, L as renderReleaseReport, M as rubricPredictiveValidity } from './reporting-B82RSv9C.js';
4
4
  import { F as FeedbackLabel, a as FeedbackTrajectoryStore, b as FeedbackTrajectory, C as ControlSeverity, c as ControlEvalResult } from './feedback-trajectory-CB0A32o3.js';
5
5
  export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, l as ControlStep, m as ControlStopPolicies, n as FeedbackArtifactType, o as FeedbackAttempt, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, y as FeedbackTrajectoryFilter, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, S as StopDecision, B as allCriticalPassed, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, M as objectiveEval, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, T as runAgentControlLoop, U as serializeFeedbackTrajectoriesJsonl, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-CB0A32o3.js';
6
6
  export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-cxwMOAsy.js';
7
7
  import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
8
8
  export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
9
- import { A as ActionableSideInfo, O as Objective, P as ParetoResult, T as TrialCache, a as TrialResult, E as EvolvableVariant, M as MutateAdapter, V as VariantAggregate } from './multi-shot-optimization-Bvtz294B.js';
10
- export { b as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, c as Direction, G as GateDecision, d as GateEvidence, e as GenerationReport, H as HeldOutGate, f as HeldOutGateConfig, g as HeldOutGateRejectionCode, I as InMemoryTrialCache, h as MultiShotGateConfig, i as MultiShotGateResult, j as MultiShotMutateAdapter, k as MultiShotOptimizationConfig, l as MultiShotOptimizationResult, m as MultiShotRun, n as MultiShotRunInput, o as MultiShotRunner, p as MultiShotScore, q as MultiShotScorer, r as MultiShotSplit, s as MultiShotTrace, t as MultiShotTrialResult, u as MultiShotVariant, v as PromptEvolutionConfig, w as PromptEvolutionEvent, x as PromptEvolutionResult, R as ReflectionContext, y as ReflectionProposal, S as ScenarioAggregate, z as ScoreAdapter, B as TrialTrace, C as buildReflectionPrompt, F as crowdingDistance, J as defaultMultiShotObjectives, K as dominates, L as paretoFrontier, N as paretoFrontierWithCrowding, Q as parseReflectionResponse, U as runMultiShotOptimization, W as runPromptEvolution, X as scalarScore, Y as trialTraceFromMultiShotTrial } from './multi-shot-optimization-Bvtz294B.js';
9
+ import { A as ActionableSideInfo, O as Objective, P as ParetoResult, T as TrialCache, a as TrialResult, E as EvolvableVariant, M as MutateAdapter, V as VariantAggregate } from './summary-report-D4p7RlDu.js';
10
+ export { b as AsiSeverity, D as DEFAULT_FAILURE_RULES, c as DEFAULT_MUTATION_PRIMITIVES, d as Direction, F as FailureClassification, e as FailureCluster, f as FailureClusterReport, g as FailureContext, h as FailureRule, G as GainDistributionBin, i as GainDistributionFigureSpec, j as GainDistributionOptions, k as GateDecision, l as GateEvidence, m as GenerationReport, H as HeldOutGate, n as HeldOutGateConfig, o as HeldOutGateRejectionCode, I as InMemoryTrialCache, p as MultiShotGateConfig, q as MultiShotGateResult, r as MultiShotMutateAdapter, s as MultiShotOptimizationConfig, t as MultiShotOptimizationResult, u as MultiShotRun, v as MultiShotRunInput, w as MultiShotRunner, x as MultiShotScore, y as MultiShotScorer, z as MultiShotSplit, B as MultiShotTrace, C as MultiShotTrialResult, J as MultiShotVariant, K as ParetoFigureSpec, L as ParetoPoint, N as PromptEvolutionConfig, Q as PromptEvolutionEvent, R as PromptEvolutionResult, S as RESEARCH_REPORT_HARD_PAIR_FLOOR, U as ReflectionContext, W as ReflectionProposal, X as ResearchReport, Y as ResearchReportCandidate, Z as ResearchReportDecision, _ as ResearchReportMethodology, $ as ResearchReportOptions, a0 as ResearchReportRecommendation, a1 as ScenarioAggregate, a2 as ScoreAdapter, a3 as SummaryTable, a4 as SummaryTableOptions, a5 as SummaryTableRow, a6 as TrialTrace, a7 as buildReflectionPrompt, a8 as classifyFailure, a9 as crowdingDistance, aa as defaultMultiShotObjectives, ab as dominates, ac as failureClusterView, ad as gainHistogram, ae as paretoChart, af as paretoFrontier, ag as paretoFrontierWithCrowding, ah as parseReflectionResponse, ai as researchReport, aj as runMultiShotOptimization, ak as runPromptEvolution, al as scalarScore, am as summaryTable, an as trialTraceFromMultiShotTrial } from './summary-report-D4p7RlDu.js';
11
11
  import { a as Run$1, S as Span, f as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, F as FailureClass, g as BudgetSpec, c as ToolSpan, h as RunFilter, L as LlmSpan, J as JudgeSpan } from './store-u47QaJ9G.js';
12
12
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, m as RunLayer, n as RunStatus, e as SandboxSpan, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
13
- import { llmSpans, RawProviderSink, ProviderRedactor } from './traces.js';
14
- export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, FileSystemRawProviderSink, FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, InMemoryRawProviderSinkOptions, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RawProviderDirection, RawProviderEvent, RawProviderSinkFilter, RedactionReport, RedactionRule, RunIntegrityError, RunIntegrityExpectations, RunIntegrityIssue, RunIntegrityIssueCode, RunIntegrityReport, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, judgeSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
13
+ import { llmSpans } from './traces.js';
14
+ export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
+ export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, c as RawProviderEvent, d as RawProviderSink, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
15
16
  import { a as DatasetScenario, b as Dataset, c as DatasetManifest } from './dataset-B9qvlm_o.js';
16
17
  export { d as DatasetDifficulty, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B9qvlm_o.js';
18
+ import { L as LlmClientOptions } from './optimization-UVDNKaO6.js';
19
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, r as LlmRouteAssertionError, s as LlmRouteRequirements, t as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, u as assertLlmRoute, v as callLlm, w as callLlmJson, x as probeLlm, y as runEvalCampaign, z as stripFencedJson } from './optimization-UVDNKaO6.js';
17
20
  import { a as RunRecord } from './run-record-CX_jcAyr.js';
18
21
  export { b as RunJudgeMetadata, c as RunOutcome, d as RunRecordValidationError, R as RunSplitTag, e as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CX_jcAyr.js';
19
- export { CallbackResearcher, CallbackResearcherOptions, ExperimentPlan, ExperimentResult, FailureMode, NoopResearcher, Researcher, SteeringChange } from './optimization.js';
20
22
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-c5saLbKD.js';
21
23
  import '@ax-llm/ax';
22
24
 
@@ -3289,68 +3291,6 @@ declare class ProjectRegistry {
3289
3291
  projectChats(projectId: string): Promise<ChatSummary[]>;
3290
3292
  }
3291
3293
 
3292
- /**
3293
- * OutcomeStore — deployment outcomes attached to Run IDs.
3294
- *
3295
- * Outcomes arrive asynchronously from production telemetry after the
3296
- * eval run completed: user ratings, retention flags, conversion events,
3297
- * revenue, support-ticket rate, anything a product team can measure.
3298
- * The store is a peer to TraceStore — separate lifecycle, same runId
3299
- * foreign key.
3300
- *
3301
- * The whole point of this module is to make the meta-eval correlation
3302
- * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
3303
- */
3304
- interface DeploymentOutcome {
3305
- runId: string;
3306
- capturedAt: number;
3307
- /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
3308
- metrics: Record<string, number>;
3309
- /** Dimensions for stratified analysis — cohort, region, user_segment. */
3310
- labels?: Record<string, string>;
3311
- /** Free-form provenance (source system, pipeline version). */
3312
- source?: string;
3313
- }
3314
- interface OutcomeFilter {
3315
- runIds?: string[];
3316
- since?: number;
3317
- until?: number;
3318
- label?: {
3319
- key: string;
3320
- value: string;
3321
- };
3322
- source?: string;
3323
- }
3324
- interface OutcomeStore {
3325
- append(outcome: DeploymentOutcome): Promise<void>;
3326
- /** All outcomes attached to this run (a single run can have many — multiple
3327
- * capture windows over deployment time). */
3328
- forRun(runId: string): Promise<DeploymentOutcome[]>;
3329
- list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3330
- }
3331
- declare class InMemoryOutcomeStore implements OutcomeStore {
3332
- private items;
3333
- append(outcome: DeploymentOutcome): Promise<void>;
3334
- forRun(runId: string): Promise<DeploymentOutcome[]>;
3335
- list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3336
- }
3337
- interface FileSystemOutcomeStoreOptions {
3338
- dir: string;
3339
- maxBytes?: number;
3340
- }
3341
- declare class FileSystemOutcomeStore implements OutcomeStore {
3342
- private dir;
3343
- private maxBytes;
3344
- private memo?;
3345
- private loaded;
3346
- constructor(options: FileSystemOutcomeStoreOptions);
3347
- private ensureDir;
3348
- append(outcome: DeploymentOutcome): Promise<void>;
3349
- private load;
3350
- forRun(runId: string): Promise<DeploymentOutcome[]>;
3351
- list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3352
- }
3353
-
3354
3294
  /**
3355
3295
  * Correlation study — "does our eval score predict real-world outcomes?"
3356
3296
  *
@@ -4286,220 +4226,6 @@ interface UseCaseSignals {
4286
4226
  declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
4287
4227
  declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
4288
4228
 
4289
- /**
4290
- * LLM client with graceful degrade.
4291
- *
4292
- * OpenAI-compatible `/v1/chat/completions` client with:
4293
- * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
4294
- * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
4295
- * - Graceful json_schema → json_object degrade on 400 with schema-reject body.
4296
- * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
4297
- * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
4298
- * directly, cli-bridge subscriptions, and any router that speaks the spec.
4299
- *
4300
- * Usage:
4301
- * const { value, result } = await callLlmJson<MyType>(
4302
- * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
4303
- * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
4304
- * )
4305
- *
4306
- * This is THE llm-calling seam for agent-eval primitives that need structured
4307
- * output (semantic concept judge, reviewer directives, critic scores). Primitives
4308
- * that need free-form text use `callLlm` and parse output themselves.
4309
- */
4310
-
4311
- interface LlmMessage {
4312
- role: 'system' | 'user' | 'assistant';
4313
- /**
4314
- * Either a plain text content string OR a multimodal content array
4315
- * (text + image_url parts) for vision-capable models.
4316
- */
4317
- content: string | Array<{
4318
- type: 'text';
4319
- text: string;
4320
- } | {
4321
- type: 'image_url';
4322
- image_url: {
4323
- url: string;
4324
- detail?: 'auto' | 'low' | 'high';
4325
- };
4326
- }>;
4327
- }
4328
- interface LlmCallRequest {
4329
- model: string;
4330
- messages: LlmMessage[];
4331
- /** Optional JSON-mode response format (response_format: json_object). */
4332
- jsonMode?: boolean;
4333
- /** Optional structured output via JSON Schema. Falls back to json_object on 400. */
4334
- jsonSchema?: {
4335
- name: string;
4336
- schema: Record<string, unknown>;
4337
- };
4338
- temperature?: number;
4339
- maxTokens?: number;
4340
- /** Per-call timeout, default 60s. */
4341
- timeoutMs?: number;
4342
- }
4343
- interface LlmUsage {
4344
- promptTokens: number;
4345
- completionTokens: number;
4346
- totalTokens: number;
4347
- /** Proxies populate this when prompt caching is on. */
4348
- cachedPromptTokens?: number;
4349
- }
4350
- interface LlmCallResult {
4351
- /** The text content of the first choice. Empty string if none. */
4352
- content: string;
4353
- usage: LlmUsage;
4354
- /**
4355
- * Cost in USD. Pulled from proxy's `_response_cost` field when present;
4356
- * `null` when neither the proxy nor the caller can derive it.
4357
- */
4358
- costUsd: number | null;
4359
- /** Model name actually used (echoed from response). */
4360
- model: string;
4361
- /** Wall-clock duration of the HTTP call (last attempt, if retried). */
4362
- durationMs: number;
4363
- /** Raw response body. */
4364
- raw: Record<string, unknown>;
4365
- }
4366
- declare class LlmCallError extends Error {
4367
- readonly status: number;
4368
- readonly body: string;
4369
- readonly model: string;
4370
- constructor(message: string, status: number, body: string, model: string);
4371
- }
4372
- interface LlmClientOptions {
4373
- /** Base URL (without trailing slash). Must end at the `/v1` prefix. */
4374
- baseUrl?: string;
4375
- /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
4376
- apiKey?: string;
4377
- bearer?: string;
4378
- /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
4379
- authHeader?: {
4380
- name: string;
4381
- value: string;
4382
- };
4383
- /** Default timeout in ms. Per-call can override. */
4384
- defaultTimeoutMs?: number;
4385
- /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
4386
- maxRetries?: number;
4387
- /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
4388
- fetch?: typeof fetch;
4389
- /**
4390
- * Optional raw HTTP capture sink. When provided, every request, response,
4391
- * and error (across all retry attempts) is recorded to the sink, with auth
4392
- * headers and credential-shaped body fields redacted by default. This is
4393
- * the layer-1 forensics primitive: structured `LlmSpan`s record intent,
4394
- * raw events record what actually crossed the wire.
4395
- */
4396
- rawSink?: RawProviderSink;
4397
- /**
4398
- * Logical provider id attached to raw events. When omitted, derived from
4399
- * `baseUrl` via `providerFromBaseUrl`.
4400
- */
4401
- provider?: string;
4402
- /** Trace context attached to raw events; populated by emitter-aware callers. */
4403
- traceContext?: {
4404
- runId?: string;
4405
- spanId?: string;
4406
- };
4407
- /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
4408
- redactor?: ProviderRedactor;
4409
- }
4410
- /**
4411
- * Strip a ```json / ``` code fence if the model emitted one.
4412
- * Idempotent for naked JSON. Some models (claude-code via router, certain
4413
- * deepseek models) wrap output even under json_object.
4414
- */
4415
- declare function stripFencedJson(raw: string): string;
4416
- /**
4417
- * Low-level call. Returns raw content + usage + cost. Retries on transient
4418
- * failures; does NOT degrade schema here — callers that want graceful
4419
- * degrade use `callLlmJson`.
4420
- */
4421
- declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
4422
- /**
4423
- * Structured-output call. Returns parsed JSON plus the raw result envelope.
4424
- * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
4425
- * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
4426
- * the `response_format.json_schema` shape but DO accept `json_object`.
4427
- */
4428
- declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
4429
- value: T;
4430
- result: LlmCallResult;
4431
- }>;
4432
- declare class LlmRouteAssertionError extends Error {
4433
- readonly code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
4434
- readonly baseUrl: string;
4435
- constructor(message: string, code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider', baseUrl: string);
4436
- }
4437
- interface LlmRouteRequirements {
4438
- /**
4439
- * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
4440
- * `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
4441
- * the public/free-tier router is a defect — the launch reviewer needs to
4442
- * know exactly which provider answered.
4443
- */
4444
- requireExplicitBaseUrl?: boolean;
4445
- /**
4446
- * Allowlist of acceptable base URLs. Strings match by prefix
4447
- * (case-insensitive); RegExps test against the full base URL.
4448
- */
4449
- allowedBaseUrls?: Array<string | RegExp>;
4450
- /** Blocklist that takes precedence over `allowedBaseUrls`. */
4451
- blockedBaseUrls?: Array<string | RegExp>;
4452
- /** Throw if no auth header / api key is configured. */
4453
- requireAuth?: boolean;
4454
- /**
4455
- * Logical provider id the configured `baseUrl` is expected to match (via
4456
- * `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
4457
- */
4458
- expectedProvider?: string;
4459
- }
4460
- /**
4461
- * Fail-loud assertion that the configured LLM client points at the route
4462
- * the caller intends. Designed for the matrix-runner preflight: invoke
4463
- * once before any LLM call to catch misconfiguration before a sweep burns
4464
- * dollars on the wrong provider.
4465
- *
4466
- * Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
4467
- * from constructors and CI gates.
4468
- */
4469
- declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
4470
- /**
4471
- * Probe whether a model is reachable. Returns latency + null error on
4472
- * success; `ok=false` + error message on any failure (HTTP, timeout,
4473
- * network, parse). Designed for sweep preflights — fail loud at the
4474
- * boundary before burning a 30-leaf run on a misconfigured router.
4475
- *
4476
- * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
4477
- * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
4478
- * for short prompts, so don't tighten this further. We don't validate
4479
- * content; HTTP 200 means reachable.
4480
- */
4481
- declare function probeLlm(model: string, opts?: LlmClientOptions & {
4482
- timeoutMs?: number;
4483
- }): Promise<{
4484
- ok: boolean;
4485
- latencyMs: number;
4486
- error: string | null;
4487
- }>;
4488
- /**
4489
- * Stateful client — construct once with defaults, call many times.
4490
- * Thin wrapper around the free functions; exists for callers that want
4491
- * to inject a single configured instance into multiple primitives.
4492
- */
4493
- declare class LlmClient {
4494
- private readonly opts;
4495
- constructor(opts?: LlmClientOptions);
4496
- call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
4497
- callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
4498
- value: T;
4499
- result: LlmCallResult;
4500
- }>;
4501
- }
4502
-
4503
4229
  /**
4504
4230
  * Multi-layer verifier — ordered pipeline of verification layers.
4505
4231
  *
@@ -6312,4 +6038,4 @@ interface OrthogonalityResult {
6312
6038
  }
6313
6039
  declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
6314
6040
 
6315
- export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmMessage, LlmRouteAssertionError, type LlmRouteRequirements, LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OutcomeFilter, type OutcomePair, type OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, ProviderRedactor, RawProviderSink, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, assertLlmRoute, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
6041
+ export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -46,6 +46,7 @@ import {
46
46
  renderPreferenceMemoryMarkdown,
47
47
  replayFeedbackTrajectories,
48
48
  replayFeedbackTrajectory,
49
+ runEvalCampaign,
49
50
  runMultiShotOptimization,
50
51
  runPromptEvolution,
51
52
  scalarScore,
@@ -53,7 +54,7 @@ import {
53
54
  summarizePreferenceMemory,
54
55
  trialTraceFromMultiShotTrial,
55
56
  withAssignedFeedbackSplit
56
- } from "./chunk-HRZELXCR.js";
57
+ } from "./chunk-USHQBPMH.js";
57
58
  import {
58
59
  RunRecordValidationError,
59
60
  isRunRecord,
@@ -62,40 +63,38 @@ import {
62
63
  validateRunRecord
63
64
  } from "./chunk-YUFXO3TU.js";
64
65
  import {
65
- RESEARCH_REPORT_HARD_PAIR_FLOOR,
66
66
  assertReleaseConfidence,
67
67
  bootstrapCi,
68
- canonicalize,
69
- evaluateHypothesis,
68
+ evaluateInterimReleaseConfidence,
70
69
  evaluateReleaseConfidence,
71
- gainHistogram,
72
- hashJson,
73
70
  judgeReplayGate,
74
- paretoChart,
71
+ pairedEvalueSequence,
75
72
  releaseTraceEvidenceFromMultiShotTrials,
76
73
  renderReleaseReport,
77
- researchReport,
78
- signManifest,
79
- summaryTable,
80
- verifyManifest
81
- } from "./chunk-3IX6QTB7.js";
74
+ rubricPredictiveValidity
75
+ } from "./chunk-UAND2LOT.js";
82
76
  import {
77
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
83
78
  benjaminiHochberg,
84
79
  bhAdjust,
85
80
  bonferroni,
86
81
  cohensD,
87
82
  confidenceInterval,
83
+ gainHistogram,
88
84
  interRaterReliability,
89
85
  mannWhitneyU,
90
86
  normalizeScores,
91
87
  pairedBootstrap,
92
88
  pairedTTest,
93
89
  pairedWilcoxon,
90
+ paretoChart,
94
91
  partialCredit,
95
92
  requiredSampleSize,
93
+ researchReport,
94
+ summaryTable,
96
95
  weightedMean,
97
96
  wilcoxonSignedRank
98
- } from "./chunk-KRR4VMH7.js";
97
+ } from "./chunk-IOXMGMHQ.js";
99
98
  import {
100
99
  DEFAULT_REDACTION_RULES,
101
100
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -105,7 +104,8 @@ import {
105
104
  OTEL_AGENT_EVAL_SCOPE,
106
105
  OtlpFileTraceStore,
107
106
  REDACTION_VERSION,
108
- RunIntegrityError,
107
+ ReplayCache,
108
+ ReplayCacheMissError,
109
109
  SpanNotFoundError,
110
110
  TRACE_ANALYST_ACTOR_DESCRIPTION,
111
111
  TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -117,10 +117,10 @@ import {
117
117
  aggregateLlm,
118
118
  analyzeTraces,
119
119
  argHash,
120
- assertRunCaptured,
121
120
  buildTraceAnalystTools,
122
121
  buildTraceInsightContext,
123
122
  buildTraceInsightPrompt,
123
+ createReplayFetch,
124
124
  defaultTraceInsightPanel,
125
125
  describeTraceInsightScope,
126
126
  domainEvidencePattern,
@@ -132,6 +132,7 @@ import {
132
132
  isRetrievalSpan,
133
133
  isSandboxSpan,
134
134
  isToolSpan,
135
+ iterateRawCalls,
135
136
  judgeSpans,
136
137
  llmSpans,
137
138
  planTraceInsightQuestions,
@@ -140,16 +141,27 @@ import {
140
141
  runFailureClass,
141
142
  runsForScenario,
142
143
  scoreTraceInsightReadiness,
143
- throwIfRunIncomplete,
144
144
  tokenizeDomainWords,
145
145
  toolSpans,
146
146
  traceAnalystFunctionGroup,
147
147
  traceAnalystOnRunComplete
148
- } from "./chunk-WOK2RTWG.js";
148
+ } from "./chunk-4W4NCYM2.js";
149
+ import {
150
+ RunIntegrityError,
151
+ assertRunCaptured,
152
+ throwIfRunIncomplete
153
+ } from "./chunk-QUKKGHTZ.js";
149
154
  import {
150
155
  TraceEmitter,
151
156
  llmSpanFromProvider
152
157
  } from "./chunk-5IIQKMD5.js";
158
+ import {
159
+ canonicalize,
160
+ evaluateHypothesis,
161
+ hashJson,
162
+ signManifest,
163
+ verifyManifest
164
+ } from "./chunk-6M774GY6.js";
153
165
  import {
154
166
  LlmCallError,
155
167
  LlmClient,
@@ -159,14 +171,14 @@ import {
159
171
  callLlmJson,
160
172
  probeLlm,
161
173
  stripFencedJson
162
- } from "./chunk-3GN6U53I.js";
174
+ } from "./chunk-KAO3Q65R.js";
163
175
  import {
164
176
  FileSystemRawProviderSink,
165
177
  InMemoryRawProviderSink,
166
178
  NoopRawProviderSink,
167
179
  defaultProviderRedactor,
168
180
  providerFromBaseUrl
169
- } from "./chunk-SNUHRBDL.js";
181
+ } from "./chunk-SQQLHODJ.js";
170
182
  import "./chunk-PZ5AY32C.js";
171
183
 
172
184
  // src/client.ts
@@ -10492,6 +10504,8 @@ export {
10492
10504
  PromptRegistry,
10493
10505
  REDACTION_VERSION,
10494
10506
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
10507
+ ReplayCache,
10508
+ ReplayCacheMissError,
10495
10509
  RunCritic,
10496
10510
  RunIntegrityError,
10497
10511
  RunRecordValidationError,
@@ -10580,6 +10594,7 @@ export {
10580
10594
  createFeedbackTrajectory,
10581
10595
  createIntentMatchJudge,
10582
10596
  createLlmReviewer,
10597
+ createReplayFetch,
10583
10598
  createSandboxCodeMutator,
10584
10599
  createSandboxPool,
10585
10600
  createSemanticConceptJudge,
@@ -10603,6 +10618,7 @@ export {
10603
10618
  evaluateActionPolicy,
10604
10619
  evaluateContract,
10605
10620
  evaluateHypothesis,
10621
+ evaluateInterimReleaseConfidence,
10606
10622
  evaluateOracles,
10607
10623
  evaluateReleaseConfidence,
10608
10624
  executeScenario,
@@ -10654,6 +10670,7 @@ export {
10654
10670
  isRunRecord,
10655
10671
  isSandboxSpan,
10656
10672
  isToolSpan,
10673
+ iterateRawCalls,
10657
10674
  jestTestParser,
10658
10675
  jsonHasKeys,
10659
10676
  jsonShape,
@@ -10682,6 +10699,7 @@ export {
10682
10699
  objectiveEval,
10683
10700
  outputLengthRubric,
10684
10701
  pairedBootstrap,
10702
+ pairedEvalueSequence,
10685
10703
  pairedTTest,
10686
10704
  pairedWilcoxon,
10687
10705
  paraphraseRobustness,
@@ -10733,12 +10751,14 @@ export {
10733
10751
  roundTripRunRecord,
10734
10752
  rowCount,
10735
10753
  rowWhere,
10754
+ rubricPredictiveValidity,
10736
10755
  runAgentControlLoop,
10737
10756
  runAssertions,
10738
10757
  runBehavioralCanaries,
10739
10758
  runCanaries,
10740
10759
  runCounterfactual,
10741
10760
  runE2EWorkflow,
10761
+ runEvalCampaign,
10742
10762
  runExpectations,
10743
10763
  runFailureClass,
10744
10764
  runHarnessExperiment,