@tangle-network/agent-eval 0.20.4 → 0.20.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +56 -3
- package/dist/index.js +245 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -9101,6 +9101,12 @@ interface AnalyzeTracesOptions {
|
|
|
9101
9101
|
onTurn?: (turn: AnalyzeTracesTurnSnapshot) => void | Promise<void>;
|
|
9102
9102
|
/** Override max runtime characters per turn. Default 6000. */
|
|
9103
9103
|
maxRuntimeChars?: number;
|
|
9104
|
+
/** When set, every turn's snapshot is appended to this JSONL file
|
|
9105
|
+
* immediately. If the analyst crashes mid-loop (provider 503,
|
|
9106
|
+
* network error, validator reject) the partial reasoning is still
|
|
9107
|
+
* on disk. Replay the file with the responder afterward to recover
|
|
9108
|
+
* evidence. */
|
|
9109
|
+
progressLogPath?: string;
|
|
9104
9110
|
}
|
|
9105
9111
|
/**
|
|
9106
9112
|
* Run the trace analyst.
|
|
@@ -9258,9 +9264,56 @@ declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
|
|
|
9258
9264
|
};
|
|
9259
9265
|
|
|
9260
9266
|
/** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
|
|
9261
|
-
declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules
|
|
9262
|
-
declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-
|
|
9267
|
+
declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
|
|
9268
|
+
declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
|
|
9263
9269
|
/** Subagent prompt for focused trace-inspection subtasks. */
|
|
9264
9270
|
declare const TRACE_ANALYST_SUBAGENT_DESCRIPTION = "You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.\n\nCite trace ids and span ids for every claim. Do NOT invent ids.";
|
|
9265
9271
|
|
|
9266
|
-
export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type TraceEmitterOptions, type TraceEvent, TraceFileMissingError, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
|
|
9272
|
+
interface TraceInsightTask {
|
|
9273
|
+
id: string;
|
|
9274
|
+
name: string;
|
|
9275
|
+
prompt?: string;
|
|
9276
|
+
difficulty?: string;
|
|
9277
|
+
tags?: string[];
|
|
9278
|
+
outcome?: string;
|
|
9279
|
+
score?: number;
|
|
9280
|
+
gaps?: string[];
|
|
9281
|
+
}
|
|
9282
|
+
interface TraceInsightSuite {
|
|
9283
|
+
name: string;
|
|
9284
|
+
collectionId?: string;
|
|
9285
|
+
tasks: TraceInsightTask[];
|
|
9286
|
+
}
|
|
9287
|
+
interface TraceInsightFinding {
|
|
9288
|
+
kind: string;
|
|
9289
|
+
severity?: string;
|
|
9290
|
+
taskIds: string[];
|
|
9291
|
+
evidence?: string;
|
|
9292
|
+
proposedFixClass?: string;
|
|
9293
|
+
}
|
|
9294
|
+
interface TraceInsightQuestion {
|
|
9295
|
+
id: string;
|
|
9296
|
+
question: string;
|
|
9297
|
+
why: string;
|
|
9298
|
+
}
|
|
9299
|
+
interface TraceInsightPanelRole {
|
|
9300
|
+
id: string;
|
|
9301
|
+
name: string;
|
|
9302
|
+
responsibility: string;
|
|
9303
|
+
}
|
|
9304
|
+
interface TraceInsightPromptInput {
|
|
9305
|
+
suite: TraceInsightSuite;
|
|
9306
|
+
findings?: TraceInsightFinding[];
|
|
9307
|
+
agent?: Record<string, unknown>;
|
|
9308
|
+
totals?: Record<string, unknown>;
|
|
9309
|
+
maxRepresentativeTraces?: number;
|
|
9310
|
+
}
|
|
9311
|
+
declare function tokenizeDomainWords(value: string): string[];
|
|
9312
|
+
declare function inferDomainKeywords(suite: TraceInsightSuite): string[];
|
|
9313
|
+
declare function domainEvidencePattern(keywords: string[]): RegExp;
|
|
9314
|
+
declare function describeTraceInsightScope(suite: TraceInsightSuite): string;
|
|
9315
|
+
declare function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[];
|
|
9316
|
+
declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
|
|
9317
|
+
declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
|
|
9318
|
+
|
|
9319
|
+
export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type TraceEmitterOptions, type TraceEvent, TraceFileMissingError, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQuestion, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTraceInsightPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, defaultTraceInsightPanel, deployGateLayer, describeTraceInsightScope, distillPlaybook, domainEvidencePattern, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, inferDomainKeywords, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, planTraceInsightQuestions, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, tokenizeDomainWords, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
|
package/dist/index.js
CHANGED
|
@@ -15150,6 +15150,8 @@ DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:
|
|
|
15150
15150
|
|
|
15151
15151
|
5. ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.
|
|
15152
15152
|
|
|
15153
|
+
5a. **Result-shape contract** \u2014 searchTrace and searchSpan return \`{ trace_id, hits, total_matches, has_more }\`. Iterate \`result.hits\` (NOT result.matches). Each hit has \`{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }\`. viewTrace returns \`{ trace_id, spans }\` (or \`oversized\`). viewSpans returns \`{ trace_id, spans, missing_span_ids, truncated_attribute_count }\`. Never assume a field name \u2014 log the result shape first if unsure.
|
|
15154
|
+
|
|
15153
15155
|
6. If viewTrace returns an \`oversized\` summary instead of \`spans\`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.
|
|
15154
15156
|
|
|
15155
15157
|
7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.
|
|
@@ -15165,11 +15167,26 @@ DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:
|
|
|
15165
15167
|
{ query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },
|
|
15166
15168
|
]);
|
|
15167
15169
|
|
|
15168
|
-
OBSERVABILITY rules
|
|
15169
|
-
- Each non-final actor turn must emit
|
|
15170
|
-
-
|
|
15170
|
+
OBSERVABILITY rules:
|
|
15171
|
+
- Each non-final actor turn must emit at least one \`console.log(...)\` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).
|
|
15172
|
+
- Do NOT combine \`console.log\` with \`final(...)\` or \`askClarification(...)\` in the same turn \u2014 finish gathering data first, then call final on its own turn.
|
|
15171
15173
|
- Reuse runtime variables across turns; don't recompute.
|
|
15172
|
-
- When done, call \`await final(answer)\` with the fully-formed report. The
|
|
15174
|
+
- When done, call \`await final(answer)\` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.
|
|
15175
|
+
|
|
15176
|
+
CRITICAL \u2014 \`final()\` payload contract for evidence-grounded analysis tasks:
|
|
15177
|
+
- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.
|
|
15178
|
+
- Example for per-item verdict tasks:
|
|
15179
|
+
\`\`\`js
|
|
15180
|
+
await final("Format the per-item verdict report from the evidence below.", {
|
|
15181
|
+
findings: [
|
|
15182
|
+
{ id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },
|
|
15183
|
+
...all items
|
|
15184
|
+
],
|
|
15185
|
+
systemic_summary: '3 sentences I wrote based on the evidence above'
|
|
15186
|
+
});
|
|
15187
|
+
\`\`\`
|
|
15188
|
+
- Calling \`final("answer", {})\` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.
|
|
15189
|
+
- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. \`spans[i].attributes['redteam.finding.title']\`), and for each one perform the requested cross-reference (e.g. read the source SPAN's \`attributes['source.content']\`).
|
|
15173
15190
|
|
|
15174
15191
|
OUTPUT contract \u2014 your final answer must include:
|
|
15175
15192
|
- A clear prose conclusion answering the user's question.
|
|
@@ -15177,7 +15194,7 @@ OUTPUT contract \u2014 your final answer must include:
|
|
|
15177
15194
|
- Failure modes named in the user's domain language, with frequency and concrete examples.
|
|
15178
15195
|
|
|
15179
15196
|
Do NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.`;
|
|
15180
|
-
var TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-
|
|
15197
|
+
var TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
|
|
15181
15198
|
var TRACE_ANALYST_SUBAGENT_DESCRIPTION = `You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.
|
|
15182
15199
|
|
|
15183
15200
|
Cite trace ids and span ids for every claim. Do NOT invent ids.`;
|
|
@@ -15215,7 +15232,7 @@ function buildTraceAnalystTools(opts) {
|
|
|
15215
15232
|
})
|
|
15216
15233
|
).build();
|
|
15217
15234
|
const searchTrace = fn("searchTrace").description(
|
|
15218
|
-
"Regex search across all spans of one trace. Returns
|
|
15235
|
+
"Regex search across all spans of one trace. Returns `{trace_id, hits: SpanMatchRecord[], total_matches, has_more}`. **Iterate `result.hits`, NOT `result.matches`** \u2014 the field is `hits`. Each hit has `{span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset}`. Bounded regardless of trace size by max_matches (1..500, default 50). If has_more=true, REFINE the regex rather than blindly raising max_matches."
|
|
15219
15236
|
).namespace(NAMESPACE).arg("trace_id", f.string("Real trace id")).arg("regex_pattern", f.string("JS-compatible regex, multiline")).arg("max_matches", f.number("Max records returned, 1..500; default 50").optional()).returns(f.json("SearchTraceResult")).handler(
|
|
15220
15237
|
async ({ trace_id, regex_pattern, max_matches }) => store.searchTrace({
|
|
15221
15238
|
trace_id: assertString(trace_id, "trace_id"),
|
|
@@ -15224,7 +15241,7 @@ function buildTraceAnalystTools(opts) {
|
|
|
15224
15241
|
})
|
|
15225
15242
|
).build();
|
|
15226
15243
|
const searchSpan = fn("searchSpan").description(
|
|
15227
|
-
"Regex search inside a single span. Use when viewSpans returned a 16KB-truncated payload and you need to narrow further."
|
|
15244
|
+
"Regex search inside a single span. Use when viewSpans returned a 16KB-truncated payload and you need to narrow further. Returns `{trace_id, span_id, hits: SpanMatchRecord[], total_matches, has_more}` \u2014 iterate `result.hits`, NOT `result.matches`."
|
|
15228
15245
|
).namespace(NAMESPACE).arg("trace_id", f.string("Real trace id")).arg("span_id", f.string("Real span id within trace")).arg("regex_pattern", f.string("JS-compatible regex, multiline")).arg("max_matches", f.number("Max records, 1..500; default 50").optional()).returns(f.json("SearchSpanResult")).handler(
|
|
15229
15246
|
async ({ trace_id, span_id, regex_pattern, max_matches }) => store.searchSpan({
|
|
15230
15247
|
trace_id: assertString(trace_id, "trace_id"),
|
|
@@ -15334,6 +15351,14 @@ async function analyzeTraces(input, options) {
|
|
|
15334
15351
|
}
|
|
15335
15352
|
const tools = buildTraceAnalystTools({ store });
|
|
15336
15353
|
const turns = [];
|
|
15354
|
+
let progressFs;
|
|
15355
|
+
if (options.progressLogPath) {
|
|
15356
|
+
const { createWriteStream } = await import("fs");
|
|
15357
|
+
const { mkdir } = await import("fs/promises");
|
|
15358
|
+
const { dirname: dirname6 } = await import("path");
|
|
15359
|
+
await mkdir(dirname6(options.progressLogPath), { recursive: true });
|
|
15360
|
+
progressFs = createWriteStream(options.progressLogPath, { flags: "a" });
|
|
15361
|
+
}
|
|
15337
15362
|
const actorTurnCallback = async (turn) => {
|
|
15338
15363
|
const snap = {
|
|
15339
15364
|
turn: turn.turn,
|
|
@@ -15343,6 +15368,13 @@ async function analyzeTraces(input, options) {
|
|
|
15343
15368
|
thought: turn.thought
|
|
15344
15369
|
};
|
|
15345
15370
|
turns.push(snap);
|
|
15371
|
+
if (progressFs) {
|
|
15372
|
+
try {
|
|
15373
|
+
progressFs.write(`${JSON.stringify({ ...snap, ts: Date.now() })}
|
|
15374
|
+
`);
|
|
15375
|
+
} catch {
|
|
15376
|
+
}
|
|
15377
|
+
}
|
|
15346
15378
|
if (options.onTurn) await options.onTurn(snap);
|
|
15347
15379
|
};
|
|
15348
15380
|
const maxDepth = options.maxDepth ?? 1;
|
|
@@ -15363,7 +15395,8 @@ async function analyzeTraces(input, options) {
|
|
|
15363
15395
|
allowedModules: [],
|
|
15364
15396
|
freezeIntrinsics: true,
|
|
15365
15397
|
blockShadowRealm: true,
|
|
15366
|
-
|
|
15398
|
+
// RLM stdout mode relies on runtime bindings persisting across turns.
|
|
15399
|
+
preventGlobalThisExtensions: false
|
|
15367
15400
|
}),
|
|
15368
15401
|
mode: maxDepth > 0 ? "advanced" : "simple",
|
|
15369
15402
|
recursionOptions: maxDepth > 0 ? { maxDepth } : void 0,
|
|
@@ -15371,21 +15404,33 @@ async function analyzeTraces(input, options) {
|
|
|
15371
15404
|
maxRuntimeChars,
|
|
15372
15405
|
maxBatchedLlmQueryConcurrency: maxParallelSubagents,
|
|
15373
15406
|
promptLevel: "detailed",
|
|
15374
|
-
|
|
15407
|
+
// Trace analysis depends on exact prior tool results and runtime variables.
|
|
15408
|
+
contextPolicy: { preset: "full", budget: "balanced" },
|
|
15375
15409
|
functions: { local: tools },
|
|
15376
15410
|
actorOptions: {
|
|
15377
15411
|
description: options.actorDescription ?? TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
15378
|
-
...options.model ? { model: options.model } : {}
|
|
15412
|
+
...options.model ? { model: options.model } : {},
|
|
15413
|
+
// Keep actor messages tool-call/content shaped across reasoning models.
|
|
15414
|
+
showThoughts: false,
|
|
15415
|
+
thinkingTokenBudget: "none"
|
|
15379
15416
|
},
|
|
15380
15417
|
responderOptions: {
|
|
15381
15418
|
...options.model ? { model: options.model } : {},
|
|
15382
|
-
description: options.subagentDescription ?? TRACE_ANALYST_SUBAGENT_DESCRIPTION
|
|
15419
|
+
description: options.subagentDescription ?? TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
15420
|
+
showThoughts: false
|
|
15383
15421
|
},
|
|
15384
15422
|
actorTurnCallback,
|
|
15385
15423
|
bubbleErrors: [TraceFileMissingError]
|
|
15386
15424
|
}
|
|
15387
15425
|
);
|
|
15388
|
-
|
|
15426
|
+
let result;
|
|
15427
|
+
try {
|
|
15428
|
+
result = await analyst.forward(options.ai, { question: input.question });
|
|
15429
|
+
} finally {
|
|
15430
|
+
if (progressFs) {
|
|
15431
|
+
await new Promise((resolve) => progressFs.end(() => resolve()));
|
|
15432
|
+
}
|
|
15433
|
+
}
|
|
15389
15434
|
return {
|
|
15390
15435
|
answer: typeof result.answer === "string" ? result.answer : String(result.answer ?? ""),
|
|
15391
15436
|
findings: Array.isArray(result.findings) ? result.findings.filter((s) => typeof s === "string") : [],
|
|
@@ -15396,6 +15441,187 @@ async function analyzeTraces(input, options) {
|
|
|
15396
15441
|
actorPromptVersion: TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION
|
|
15397
15442
|
};
|
|
15398
15443
|
}
|
|
15444
|
+
|
|
15445
|
+
// src/trace-analyst/insights.ts
|
|
15446
|
+
var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
|
|
15447
|
+
"and",
|
|
15448
|
+
"app",
|
|
15449
|
+
"build",
|
|
15450
|
+
"create",
|
|
15451
|
+
"for",
|
|
15452
|
+
"from",
|
|
15453
|
+
"implementation",
|
|
15454
|
+
"integrate",
|
|
15455
|
+
"project",
|
|
15456
|
+
"task",
|
|
15457
|
+
"the",
|
|
15458
|
+
"this",
|
|
15459
|
+
"with",
|
|
15460
|
+
"workflow"
|
|
15461
|
+
]);
|
|
15462
|
+
function tokenizeDomainWords(value) {
|
|
15463
|
+
return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)].map((match) => match[0].toLowerCase()).filter((word) => !DOMAIN_STOP_WORDS.has(word));
|
|
15464
|
+
}
|
|
15465
|
+
function inferDomainKeywords(suite) {
|
|
15466
|
+
const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ""}`));
|
|
15467
|
+
const source = [
|
|
15468
|
+
suite.name,
|
|
15469
|
+
suite.collectionId ?? "",
|
|
15470
|
+
...suite.tasks.flatMap((task) => [
|
|
15471
|
+
task.id,
|
|
15472
|
+
task.name,
|
|
15473
|
+
task.prompt ?? "",
|
|
15474
|
+
task.difficulty ?? "",
|
|
15475
|
+
...task.tags ?? [],
|
|
15476
|
+
...task.gaps ?? []
|
|
15477
|
+
])
|
|
15478
|
+
].join(" ");
|
|
15479
|
+
const counts = /* @__PURE__ */ new Map();
|
|
15480
|
+
for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1);
|
|
15481
|
+
return [...counts.entries()].filter(([word, count]) => count >= 2 || suiteWords.has(word)).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).map(([word]) => word).slice(0, 18);
|
|
15482
|
+
}
|
|
15483
|
+
function domainEvidencePattern(keywords) {
|
|
15484
|
+
const escaped = keywords.filter((keyword) => keyword.length >= 3).map((keyword) => keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
|
|
15485
|
+
return escaped.length > 0 ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join("|")})(?![A-Za-z0-9])`, "i") : /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
|
|
15486
|
+
}
|
|
15487
|
+
function describeTraceInsightScope(suite) {
|
|
15488
|
+
const taskLabel = suite.tasks.length === 1 ? "1 implementation task" : `${suite.tasks.length} implementation tasks`;
|
|
15489
|
+
const tags = /* @__PURE__ */ new Map();
|
|
15490
|
+
for (const task of suite.tasks) {
|
|
15491
|
+
for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1);
|
|
15492
|
+
}
|
|
15493
|
+
const topTags = [...tags.entries()].sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).slice(0, 8).map(([tag]) => tag);
|
|
15494
|
+
if (topTags.length > 0) return `${taskLabel} across ${topTags.join(", ")}.`;
|
|
15495
|
+
const difficulties = [...new Set(suite.tasks.map((task) => task.difficulty).filter((value) => Boolean(value)))].join(", ");
|
|
15496
|
+
return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
|
|
15497
|
+
}
|
|
15498
|
+
function planTraceInsightQuestions(input) {
|
|
15499
|
+
const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== "satisfied");
|
|
15500
|
+
const hasMultipleShots = input.suite.tasks.some((task) => (task.gaps ?? []).some((gap) => /shot|review|retry|continue/i.test(gap)));
|
|
15501
|
+
const questions = [
|
|
15502
|
+
{
|
|
15503
|
+
id: "execution-path",
|
|
15504
|
+
question: "What did the worker actually do before the first meaningful implementation edit?",
|
|
15505
|
+
why: "Separates grounded execution from polished but shallow output."
|
|
15506
|
+
},
|
|
15507
|
+
{
|
|
15508
|
+
id: "research-grounding",
|
|
15509
|
+
question: "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
|
|
15510
|
+
why: "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
|
|
15511
|
+
},
|
|
15512
|
+
{
|
|
15513
|
+
id: "domain-proof",
|
|
15514
|
+
question: "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
|
|
15515
|
+
why: "Keeps product-quality claims tied to concrete evidence."
|
|
15516
|
+
},
|
|
15517
|
+
{
|
|
15518
|
+
id: "root-cause",
|
|
15519
|
+
question: "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
|
|
15520
|
+
why: "Turns trace observations into actionable ownership."
|
|
15521
|
+
},
|
|
15522
|
+
{
|
|
15523
|
+
id: "evidence-quality",
|
|
15524
|
+
question: "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
|
|
15525
|
+
why: "Prevents unsupported customer-report conclusions."
|
|
15526
|
+
}
|
|
15527
|
+
];
|
|
15528
|
+
if (hasMultipleShots) {
|
|
15529
|
+
questions.push({
|
|
15530
|
+
id: "reviewer-lift",
|
|
15531
|
+
question: "Where did reviewer feedback improve score, stall, or regress across shots?",
|
|
15532
|
+
why: "Shows whether the driver loop is learning or merely repeating work."
|
|
15533
|
+
});
|
|
15534
|
+
}
|
|
15535
|
+
if (hasFailures) {
|
|
15536
|
+
questions.push({
|
|
15537
|
+
id: "optimization-targets",
|
|
15538
|
+
question: "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
|
|
15539
|
+
why: "Connects benchmark evidence to the optimization loop."
|
|
15540
|
+
});
|
|
15541
|
+
}
|
|
15542
|
+
return questions;
|
|
15543
|
+
}
|
|
15544
|
+
function defaultTraceInsightPanel() {
|
|
15545
|
+
return [
|
|
15546
|
+
{
|
|
15547
|
+
id: "trace-forensics",
|
|
15548
|
+
name: "Trace Forensics",
|
|
15549
|
+
responsibility: "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
|
|
15550
|
+
},
|
|
15551
|
+
{
|
|
15552
|
+
id: "root-cause",
|
|
15553
|
+
name: "Root Cause",
|
|
15554
|
+
responsibility: "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
|
|
15555
|
+
},
|
|
15556
|
+
{
|
|
15557
|
+
id: "optimization",
|
|
15558
|
+
name: "Optimization",
|
|
15559
|
+
responsibility: "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
|
|
15560
|
+
},
|
|
15561
|
+
{
|
|
15562
|
+
id: "external-evidence",
|
|
15563
|
+
name: "External Evidence",
|
|
15564
|
+
responsibility: "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
|
|
15565
|
+
}
|
|
15566
|
+
];
|
|
15567
|
+
}
|
|
15568
|
+
function buildTraceInsightPrompt(input) {
|
|
15569
|
+
const questions = planTraceInsightQuestions(input);
|
|
15570
|
+
const keywords = inferDomainKeywords(input.suite);
|
|
15571
|
+
const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;
|
|
15572
|
+
return `Analyze this benchmark run and produce evidence-backed trace intelligence.
|
|
15573
|
+
|
|
15574
|
+
Audience:
|
|
15575
|
+
- internal AI/product leadership
|
|
15576
|
+
- possible customer-facing report for ${input.suite.name}
|
|
15577
|
+
|
|
15578
|
+
Investigation plan:
|
|
15579
|
+
${questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join("\n")}
|
|
15580
|
+
|
|
15581
|
+
Analyst panel:
|
|
15582
|
+
${defaultTraceInsightPanel().map((role) => `- ${role.name}: ${role.responsibility}`).join("\n")}
|
|
15583
|
+
|
|
15584
|
+
If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.
|
|
15585
|
+
|
|
15586
|
+
Required output:
|
|
15587
|
+
1. Executive verdict: what this run proves and does not prove.
|
|
15588
|
+
2. The investigation questions you answered and the evidence used.
|
|
15589
|
+
3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.
|
|
15590
|
+
4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.
|
|
15591
|
+
5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.
|
|
15592
|
+
6. What is safe for an external report versus what must stay internal.
|
|
15593
|
+
7. One rerun plan that would validate lift after optimization.
|
|
15594
|
+
|
|
15595
|
+
Budget:
|
|
15596
|
+
- Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.
|
|
15597
|
+
- Prefer traces named in the failure summary over broad exploration.
|
|
15598
|
+
- Do not do exhaustive trace sweeps.
|
|
15599
|
+
- Return the final report as soon as the taxonomy and examples are supported.
|
|
15600
|
+
|
|
15601
|
+
Run summary:
|
|
15602
|
+
${JSON.stringify({
|
|
15603
|
+
suite: input.suite.name,
|
|
15604
|
+
scope: describeTraceInsightScope(input.suite),
|
|
15605
|
+
inferredKeywords: keywords,
|
|
15606
|
+
agent: input.agent ?? null,
|
|
15607
|
+
totals: input.totals ?? null,
|
|
15608
|
+
findings: (input.findings ?? []).map((finding) => ({
|
|
15609
|
+
kind: finding.kind,
|
|
15610
|
+
severity: finding.severity,
|
|
15611
|
+
taskCount: finding.taskIds.length,
|
|
15612
|
+
proposedFixClass: finding.proposedFixClass
|
|
15613
|
+
})),
|
|
15614
|
+
failures: input.suite.tasks.filter((task) => task.outcome && task.outcome !== "satisfied").map((task) => ({
|
|
15615
|
+
task: task.id,
|
|
15616
|
+
difficulty: task.difficulty,
|
|
15617
|
+
outcome: task.outcome,
|
|
15618
|
+
score: task.score,
|
|
15619
|
+
gaps: task.gaps ?? []
|
|
15620
|
+
}))
|
|
15621
|
+
}, null, 2)}
|
|
15622
|
+
|
|
15623
|
+
Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`;
|
|
15624
|
+
}
|
|
15399
15625
|
export {
|
|
15400
15626
|
AgentDriver,
|
|
15401
15627
|
AxGepaSteeringOptimizer,
|
|
@@ -15502,6 +15728,7 @@ export {
|
|
|
15502
15728
|
buildReflectionPrompt,
|
|
15503
15729
|
buildReviewerPrompt,
|
|
15504
15730
|
buildTraceAnalystTools,
|
|
15731
|
+
buildTraceInsightPrompt,
|
|
15505
15732
|
buildTrajectory,
|
|
15506
15733
|
byteLengthRange,
|
|
15507
15734
|
calibrateJudge,
|
|
@@ -15551,8 +15778,11 @@ export {
|
|
|
15551
15778
|
defaultJudges,
|
|
15552
15779
|
defaultMultiShotObjectives,
|
|
15553
15780
|
defaultReferenceReplayMatcher,
|
|
15781
|
+
defaultTraceInsightPanel,
|
|
15554
15782
|
deployGateLayer,
|
|
15783
|
+
describeTraceInsightScope,
|
|
15555
15784
|
distillPlaybook,
|
|
15785
|
+
domainEvidencePattern,
|
|
15556
15786
|
dominates,
|
|
15557
15787
|
estimateCost,
|
|
15558
15788
|
estimateTokens,
|
|
@@ -15595,6 +15825,7 @@ export {
|
|
|
15595
15825
|
htmlContainsElement,
|
|
15596
15826
|
inMemoryReferenceReplayStore,
|
|
15597
15827
|
inMemoryReviewStore,
|
|
15828
|
+
inferDomainKeywords,
|
|
15598
15829
|
interRaterReliability,
|
|
15599
15830
|
iqr,
|
|
15600
15831
|
isJudgeSpan,
|
|
@@ -15644,6 +15875,7 @@ export {
|
|
|
15644
15875
|
partialCredit,
|
|
15645
15876
|
passOrthogonality,
|
|
15646
15877
|
pixelDeltaRatio,
|
|
15878
|
+
planTraceInsightQuestions,
|
|
15647
15879
|
politenessPrefixMutator,
|
|
15648
15880
|
positionalBias,
|
|
15649
15881
|
printDriverSummary,
|
|
@@ -15731,6 +15963,7 @@ export {
|
|
|
15731
15963
|
toLangfuseEnvelope,
|
|
15732
15964
|
toNdjson,
|
|
15733
15965
|
toPrometheusText,
|
|
15966
|
+
tokenizeDomainWords,
|
|
15734
15967
|
toolIntentAlignmentRubric,
|
|
15735
15968
|
toolNamesForRun,
|
|
15736
15969
|
toolNonRedundantRubric,
|