@tangle-network/agent-eval 0.20.3 → 0.20.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,4 +1,5 @@
1
1
  import { TCloud } from '@tangle-network/tcloud';
2
+ import { AxAIService, AxFunction } from '@ax-llm/ax';
2
3
 
3
4
  interface Scenario {
4
5
  id: string;
@@ -8822,4 +8823,444 @@ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): P
8822
8823
  candidateSamples: number;
8823
8824
  }>;
8824
8825
 
8825
- export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type 
CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type 
FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, 
type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, 
OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type 
ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, 
type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type 
WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, 
findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, 
runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
8826
+ /**
8827
+ * Shared types for the trace-analyst module.
8828
+ *
8829
+ * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
8830
+ * line per span, OTLP-shaped. We do NOT depend on a specific tracing
8831
+ * vendor at the type level. Adapter
8832
+ * layers map upstream shapes onto this interface.
8833
+ *
8834
+ * Design constraint. Every read operation that can return arbitrary
8835
+ * payload must carry a byte budget so the agent's tool result stays
8836
+ * bounded regardless of input trace size. Oversized responses
8837
+ * substitute a deterministic summary instead of bytes — see
8838
+ * `ViewTraceOversized`.
8839
+ */
8840
+ /** OTLP span kind (subset we actually use). */
8841
+ type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
8842
+ type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
8843
+ /** Subset of OTLP span fields the analyst exposes to the agent. The
8844
+ * store's job is to project upstream's full span shape down to this
8845
+ * view — the analyst never sees vendor extensions directly. */
8846
+ interface TraceAnalystSpan {
8847
+ trace_id: string;
8848
+ span_id: string;
8849
+ parent_span_id: string | null;
8850
+ name: string;
8851
+ kind: TraceAnalystSpanKind;
8852
+ start_time: string;
8853
+ end_time: string;
8854
+ duration_ms: number;
8855
+ status: TraceAnalystSpanStatus;
8856
+ status_message?: string;
8857
+ service_name: string | null;
8858
+ agent_name: string | null;
8859
+ model_name: string | null;
8860
+ tool_name: string | null;
8861
+ /** Raw JSON-serialisable attribute map. May contain large strings;
8862
+ * callers must respect the per-attribute byte cap. */
8863
+ attributes: Record<string, unknown>;
8864
+ }
8865
+ interface TraceAnalystTraceSummary {
8866
+ trace_id: string;
8867
+ service_name: string | null;
8868
+ agent_name: string | null;
8869
+ span_count: number;
8870
+ has_errors: boolean;
8871
+ start_time: string;
8872
+ end_time: string;
8873
+ duration_ms: number;
8874
+ raw_jsonl_bytes: number;
8875
+ models: string[];
8876
+ tools: string[];
8877
+ }
8878
+ interface TraceAnalystFilters {
8879
+ /** Restrict to traces that contain at least one error span. */
8880
+ has_errors?: boolean;
8881
+ /** Match if any span's `service.name` is in this list. */
8882
+ service_names?: string[];
8883
+ /** Match if any span's `agent.name` is in this list. */
8884
+ agent_names?: string[];
8885
+ /** Match if any LLM span's `llm.model_name` is in this list. */
8886
+ model_names?: string[];
8887
+ /** Match if any tool span's `tool.name` is in this list. */
8888
+ tool_names?: string[];
8889
+ /** ISO-8601 lower bound on the trace's earliest start time. */
8890
+ start_time_after?: string;
8891
+ /** ISO-8601 upper bound on the trace's earliest start time. */
8892
+ start_time_before?: string;
8893
+ /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
8894
+ * expensive on large datasets. Use the indexed filters above first. */
8895
+ regex_pattern?: string;
8896
+ }
8897
+ interface DatasetOverview {
8898
+ total_traces: number;
8899
+ raw_jsonl_bytes: number;
8900
+ services: string[];
8901
+ agents: string[];
8902
+ models: string[];
8903
+ tool_names: string[];
8904
+ /** Up to 20 real trace ids the agent may pass to view/search tools. */
8905
+ sample_trace_ids: string[];
8906
+ errors: {
8907
+ trace_count: number;
8908
+ span_count: number;
8909
+ };
8910
+ time_range: {
8911
+ earliest: string;
8912
+ latest: string;
8913
+ } | null;
8914
+ }
8915
+ interface QueryTracesPage {
8916
+ traces: TraceAnalystTraceSummary[];
8917
+ total: number;
8918
+ has_more: boolean;
8919
+ }
8920
+ /** Full-trace view. When the response would exceed the per-call byte
8921
+ * budget, `oversized` is populated INSTEAD of `spans` so the agent
8922
+ * knows to switch to `searchTrace` / `viewSpans`. */
8923
+ interface ViewTraceResult {
8924
+ trace_id: string;
8925
+ spans?: TraceAnalystSpan[];
8926
+ oversized?: ViewTraceOversized;
8927
+ }
8928
+ interface ViewTraceOversized {
8929
+ span_count: number;
8930
+ /** Span names with their occurrence counts, sorted by count descending. Capped at 20 entries. */
8931
+ top_span_names: Array<[string, number]>;
8932
+ /** Largest single span body (bytes after attribute-cap projection). */
8933
+ span_response_bytes_max: number;
8934
+ error_span_count: number;
8935
+ }
8936
+ interface ViewSpansResult {
8937
+ trace_id: string;
8938
+ spans: TraceAnalystSpan[];
8939
+ /** Requested span ids that were not found in the trace. */
8940
+ missing_span_ids: string[];
8941
+ /** Number of attribute fields truncated to fit the per-attribute cap. */
8942
+ truncated_attribute_count: number;
8943
+ }
8944
+ interface SpanMatchRecord {
8945
+ trace_id: string;
8946
+ span_id: string;
8947
+ span_name: string;
8948
+ span_kind: TraceAnalystSpanKind;
8949
+ /** JSON pointer-style path to the matched value, e.g.
8950
+ * `attributes."llm.input_messages"[2].content`. */
8951
+ attribute_path: string;
8952
+ matched_text: string;
8953
+ context_before: string;
8954
+ context_after: string;
8955
+ match_offset: number;
8956
+ }
8957
+ interface SearchTraceResult {
8958
+ trace_id: string;
8959
+ hits: SpanMatchRecord[];
8960
+ total_matches: number;
8961
+ has_more: boolean;
8962
+ }
8963
+ interface SearchSpanResult {
8964
+ trace_id: string;
8965
+ span_id: string;
8966
+ hits: SpanMatchRecord[];
8967
+ total_matches: number;
8968
+ has_more: boolean;
8969
+ }
8970
+ /** Tunable byte budgets for bounded RLM tool output. */
8971
+ interface TraceAnalystByteBudgets {
8972
+ /** Max bytes any single tool response may emit. Hard ceiling enforced
8973
+ * by the store; oversized → summary. Default 150_000. */
8974
+ perCallByteCeiling: number;
8975
+ /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
8976
+ * Default 4096. */
8977
+ perAttributeViewBudget: number;
8978
+ /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
8979
+ * Default 16384. */
8980
+ perAttributeSpanBudget: number;
8981
+ /** Per-match byte cap on a single match record's `matched_text` and
8982
+ * context window. Default 1024. */
8983
+ perMatchTextBudget: number;
8984
+ }
8985
+ declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
8986
+ /** Marker substituted in place of truncated string payloads. Callers
8987
+ * parsing tool output can detect it deterministically. */
8988
+ declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
8989
+
8990
+ /**
8991
+ * `TraceAnalysisStore` — read-side interface the trace-analyst calls
8992
+ * through. Seven operations, all bounded:
8993
+ *
8994
+ * - `getOverview(filters?)` — dataset rollup + sample trace ids.
8995
+ * - `queryTraces(filters?, limit, offset)` — paginated summaries.
8996
+ * - `countTraces(filters?)` — cheap count without materialisation.
8997
+ * - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
8998
+ * - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
8999
+ * - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
9000
+ * - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
9001
+ *
9002
+ * The core ships `OtlpFileTraceStore` as a built-in implementation.
9003
+ * Downstream callers can supply their own — e.g. a DuckDB-backed
9004
+ * adapter or an in-memory adapter for tests — by implementing this
9005
+ * interface.
9006
+ *
9007
+ * Filters compose with AND semantics. Empty/undefined fields impose
9008
+ * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
9009
+ * implementations may skip it in `countTraces`/`getOverview` when it is not set.
9010
+ */
9011
+
9012
+ interface TraceAnalysisStore {
9013
+ getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
9014
+ queryTraces(opts: {
9015
+ filters?: TraceAnalystFilters;
9016
+ limit: number;
9017
+ offset?: number;
9018
+ }): Promise<QueryTracesPage>;
9019
+ countTraces(filters?: TraceAnalystFilters): Promise<number>;
9020
+ viewTrace(opts: {
9021
+ trace_id: string;
9022
+ /** Override per-attribute byte cap. Defaults to discovery budget. */
9023
+ per_attribute_byte_cap?: number;
9024
+ }): Promise<ViewTraceResult>;
9025
+ viewSpans(opts: {
9026
+ trace_id: string;
9027
+ span_ids: readonly string[];
9028
+ /** Override per-attribute byte cap. Defaults to surgical budget. */
9029
+ per_attribute_byte_cap?: number;
9030
+ }): Promise<ViewSpansResult>;
9031
+ searchTrace(opts: {
9032
+ trace_id: string;
9033
+ regex_pattern: string;
9034
+ /** Hard cap on matches returned. Default 50. */
9035
+ max_matches?: number;
9036
+ }): Promise<SearchTraceResult>;
9037
+ searchSpan(opts: {
9038
+ trace_id: string;
9039
+ span_id: string;
9040
+ regex_pattern: string;
9041
+ max_matches?: number;
9042
+ }): Promise<SearchSpanResult>;
9043
+ }
9044
+
9045
+ interface AnalyzeTracesInput {
9046
+ /** The user-facing question. Domain framing belongs here, not in the
9047
+ * actor description. */
9048
+ question: string;
9049
+ }
9050
+ interface AnalyzeTracesResult {
9051
+ /** The responder's prose answer. */
9052
+ answer: string;
9053
+ /** Bulleted findings extracted from the responder's structured output. */
9054
+ findings: string[];
9055
+ /** Per-actor-turn snapshots captured via `actorTurnCallback`. */
9056
+ turns: AnalyzeTracesTurnSnapshot[];
9057
+ /** Total turns the actor took. */
9058
+ turnCount: number;
9059
+ /** Token usage by role. */
9060
+ usage: {
9061
+ actor: unknown[];
9062
+ responder: unknown[];
9063
+ };
9064
+ /** Full system + assistant + tool message log by role. */
9065
+ chatLog: {
9066
+ actor: unknown[];
9067
+ responder: unknown[];
9068
+ };
9069
+ /** Prompt version that produced this run. */
9070
+ actorPromptVersion: string;
9071
+ }
9072
+ interface AnalyzeTracesTurnSnapshot {
9073
+ turn: number;
9074
+ isError: boolean;
9075
+ /** The JS code the actor produced for this turn. */
9076
+ code: string;
9077
+ /** The formatted action-log entry the actor sees on the next turn. */
9078
+ output: string;
9079
+ /** Provider thought (when `actorOptions.showThoughts` is true and the
9080
+ * provider returns it). */
9081
+ thought?: string;
9082
+ }
9083
+ interface AnalyzeTracesOptions {
9084
+ /** Trace data source. Pass either an OTLP-JSONL path or a custom store. */
9085
+ source: string | TraceAnalysisStore;
9086
+ /** Caller-provided AxAIService. */
9087
+ ai: AxAIService;
9088
+ /** Model id forwarded to actor + responder. */
9089
+ model?: string;
9090
+ /** Recursion depth. 0 = no sub-agent dispatch. Default 1. */
9091
+ maxDepth?: number;
9092
+ /** Maximum actor turns. Default 12. */
9093
+ maxTurns?: number;
9094
+ /** Maximum parallel sub-agent calls in batched llmQuery. Default 2. */
9095
+ maxParallelSubagents?: number;
9096
+ /** Override the actor description. */
9097
+ actorDescription?: string;
9098
+ /** Override the subagent description. */
9099
+ subagentDescription?: string;
9100
+ /** Per-turn observability hook. */
9101
+ onTurn?: (turn: AnalyzeTracesTurnSnapshot) => void | Promise<void>;
9102
+ /** Override max runtime characters per turn. Default 6000. */
9103
+ maxRuntimeChars?: number;
9104
+ }
9105
+ /**
9106
+ * Run the trace analyst.
9107
+ *
9108
+ * Throws:
9109
+ * - `TraceFileMissingError` if `source` is a path and doesn't exist.
9110
+ * - `AxAgentClarificationError` if the analyst asks for clarification.
9111
+ * - Provider errors (auth, rate limits) propagate from the AI service.
9112
+ */
9113
+ declare function analyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions): Promise<AnalyzeTracesResult>;
9114
+
9115
+ /**
9116
+ * `OtlpFileTraceStore` — read-only OTLP-JSONL trace store for the
9117
+ * trace-analyst.
9118
+ *
9119
+ * Wire shape. Each line of the input file is one OTLP-shaped span. The
9120
+ * store understands flattened OTLP JSONL plus the OpenInference vocab.
9121
+ * We project upstream's full
9122
+ * span shape down to `TraceAnalystSpan` lazily — full materialisation
9123
+ * only happens for the spans the agent actually requests.
9124
+ *
9125
+ * Indexing. On first read the store builds an in-memory index keyed
9126
+ * by `trace_id` carrying:
9127
+ * - byte offsets + lengths for each span line (for surgical reads
9128
+ * without re-parsing the whole file)
9129
+ * - a `TraceAnalystTraceSummary` rollup
9130
+ * - sets of services / agents / models / tools / has_errors
9131
+ * - byte size of the trace's JSONL slab
9132
+ *
9133
+ * Memory bound. The index keeps span metadata only — names, kinds,
9134
+ * offsets, status. Attribute payloads stay on disk until requested.
9135
+ * For a 50MB JSONL with 50k spans, the index is ~5MB.
9136
+ *
9137
+ * Concurrency. The store builds the index once on first read and
9138
+ * caches it. Subsequent reads reuse the index. The file contents are
9139
+ * likewise read once and cached (`bufferPromise`); no long-lived FD is held.
9140
+ */
9141
+
9142
+ interface OtlpFileTraceStoreOptions {
9143
+ /** Path to the OTLP-JSONL file. */
9144
+ path: string;
9145
+ /** Override the discovery (`viewTrace`) per-attribute byte cap (the actor prompt quotes ~4KB). */
9146
+ perAttributeViewBudget?: number;
9147
+ /** Override the surgical (`viewSpans`) per-attribute byte cap (the actor prompt quotes ~16KB). */
9148
+ perAttributeSpanBudget?: number;
9149
+ /** Override the per-call ceiling that triggers oversized summaries (presumably ~150_000 bytes per the actor prompt — confirm). */
9150
+ perCallByteCeiling?: number;
9151
+ /** Override the per-match text budget (NOTE(review): presumably bounds text returned per regex match — confirm). */
9152
+ perMatchTextBudget?: number;
9153
+ }
9154
+ declare class OtlpFileTraceStore implements TraceAnalysisStore {
9155
+ private readonly path;
9156
+ private readonly perAttributeViewBudget;
9157
+ private readonly perAttributeSpanBudget;
9158
+ private readonly perCallByteCeiling;
9159
+ private readonly perMatchTextBudget;
9160
+ private indexPromise?;
9161
+ /** Cached UTF-8 buffer of the file, pinned once because every read
9162
+ * needs slice access and re-reading per call balloons the syscall
9163
+ * count. NOTE(review): implies the whole file is held in memory — confirm. */
9164
+ private bufferPromise?;
9165
+ constructor(opts: OtlpFileTraceStoreOptions);
9166
+ getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
9167
+ queryTraces(opts: {
9168
+ filters?: TraceAnalystFilters;
9169
+ limit: number;
9170
+ offset?: number;
9171
+ }): Promise<QueryTracesPage>;
9172
+ countTraces(filters?: TraceAnalystFilters): Promise<number>;
9173
+ viewTrace(opts: {
9174
+ trace_id: string;
9175
+ per_attribute_byte_cap?: number;
9176
+ }): Promise<ViewTraceResult>;
9177
+ viewSpans(opts: {
9178
+ trace_id: string;
9179
+ span_ids: readonly string[];
9180
+ per_attribute_byte_cap?: number;
9181
+ }): Promise<ViewSpansResult>;
9182
+ searchTrace(opts: {
9183
+ trace_id: string;
9184
+ regex_pattern: string;
9185
+ max_matches?: number;
9186
+ }): Promise<SearchTraceResult>;
9187
+ searchSpan(opts: {
9188
+ trace_id: string;
9189
+ span_id: string;
9190
+ regex_pattern: string;
9191
+ max_matches?: number;
9192
+ }): Promise<SearchSpanResult>;
9193
+ /** Force the index to materialise now instead of on first read.
9194
+ * Useful to amortise startup cost before the first agent call. */
9195
+ ensureIndexed(): Promise<void>;
9196
+ private buffer;
9197
+ private index;
9198
+ private buildIndex;
9199
+ private matchedTraces;
9200
+ private toSummary;
9201
+ private projectSpan;
9202
+ private buildOversizedSummary;
9203
+ private scanSpanForMatches;
9204
+ }
9205
+ /** Thrown by `analyzeTraces` when `source` is a path that doesn't exist. */ declare class TraceFileMissingError extends Error {
9206
+ constructor(path: string);
9207
+ }
9208
+ /** NOTE(review): presumably thrown when `trace_id` matches no trace in the store — confirm. */ declare class TraceNotFoundError extends Error {
9209
+ readonly trace_id: string;
9210
+ constructor(trace_id: string);
9211
+ }
9212
+ /** NOTE(review): presumably thrown when `span_id` is not found within trace `trace_id` — confirm. */ declare class SpanNotFoundError extends Error {
9213
+ readonly trace_id: string;
9214
+ readonly span_id: string;
9215
+ constructor(trace_id: string, span_id: string);
9216
+ }
9217
+
9218
+ /**
9219
+ * Trace-analyst tool surface — seven namespaced AxFunctions the analyst
9220
+ * agent calls from generated JS code via `traces.<name>(...)`.
9221
+ *
9222
+ * Discovery → narrow → deep-read protocol. Tool names + ordering
9223
+ * support RLM discovery:
9224
+ *
9225
+ * 1. `getDatasetOverview` (cheap) — first call, sizes the dataset
9226
+ * 2. `queryTraces` — paginated summaries with `raw_jsonl_bytes`
9227
+ * 3. `countTraces` — cheap pre-flight before regex
9228
+ * 4. `viewTrace` — full span list, oversized → summary
9229
+ * 5. `viewSpans` — surgical 16KB-cap reads
9230
+ * 6. `searchTrace` / `searchSpan` — bounded regex hits
9231
+ *
9232
+ * Failure mode. Tool handlers throw on bad input (invalid trace ids,
9233
+ * out-of-range pagination, malformed regex). Ax converts thrown errors
9234
+ * into actor-visible `[ERROR]` strings so the analyst can adjust on
9235
+ * the next turn instead of looping.
9236
+ */
9237
+
9238
+ interface BuildTraceAnalystToolsOpts {
9239
+ store: TraceAnalysisStore;
9240
+ /** Override the default sample-trace-id slot count (20); presumably sizes `sample_trace_ids` in the overview — confirm. Mostly for tests. */
9241
+ sampleTraceLimit?: number;
9242
+ }
9243
+ /**
9244
+ * Build the trace-analyst function set (an `AxFunction[]`) from `opts`.
9245
+ * Pass the result into `agent(...).functions.local`.
9246
+ */
9247
+ declare function buildTraceAnalystTools(opts: BuildTraceAnalystToolsOpts): AxFunction[];
9248
+ /**
9249
+ * Convenience: takes the same options as `buildTraceAnalystTools` but
9250
+ * returns the functions wrapped in a group object — the form expected
9251
+ * when registering trace tools alongside other agent function modules. */
9252
+ declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
9253
+ namespace: string;
9254
+ title: string;
9255
+ selectionCriteria: string;
9256
+ description: string;
9257
+ functions: AxFunction[];
9258
+ };
9259
+
9260
+ /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
9261
+ declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules (RLM stdout mode):\n- Each non-final actor turn must emit EXACTLY ONE `console.log(...)` and stop.\n- Don't combine `console.log` with `final(...)` or `askClarification(...)` in the same turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. 
The final call goes through the responder which formats output fields.\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
9262
+ /** Version tag for `TRACE_ANALYST_ACTOR_DESCRIPTION` (presumably bumped when the prompt text changes — confirm). */ declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v1-2026-05-05";
9263
+ /** Subagent prompt for focused trace-inspection subtasks (presumably the default that `AnalyzeTracesOptions.subagentDescription` overrides — confirm). */
9264
+ declare const TRACE_ANALYST_SUBAGENT_DESCRIPTION = "You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.\n\nCite trace ids and span ids for every claim. Do NOT invent ids.";
9265
+
9266
+ export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type 
ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, 
FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, 
InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type 
MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type 
ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type 
TraceEmitterOptions, type TraceEvent, TraceFileMissingError, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, 
createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, 
parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, 
userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };