@tangle-network/agent-eval 0.20.6 → 0.20.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -9308,12 +9308,36 @@ interface TraceInsightPromptInput {
9308
9308
  totals?: Record<string, unknown>;
9309
9309
  maxRepresentativeTraces?: number;
9310
9310
  }
9311
+ interface TraceInsightContext {
9312
+ suite: TraceInsightSuite;
9313
+ scope: string;
9314
+ keywords: string[];
9315
+ questions: TraceInsightQuestion[];
9316
+ panel: TraceInsightPanelRole[];
9317
+ findings: TraceInsightFinding[];
9318
+ agent: Record<string, unknown> | null;
9319
+ totals: Record<string, unknown> | null;
9320
+ }
9321
+ interface TraceInsightQualityGate {
9322
+ id: string;
9323
+ label: string;
9324
+ passed: boolean;
9325
+ severity: 'critical' | 'high' | 'medium' | 'low';
9326
+ detail: string;
9327
+ }
9328
+ interface TraceInsightReadiness {
9329
+ score: number;
9330
+ grade: 'external-ready' | 'internal-review' | 'raw-analysis';
9331
+ gates: TraceInsightQualityGate[];
9332
+ }
9311
9333
  declare function tokenizeDomainWords(value: string): string[];
9312
9334
  declare function inferDomainKeywords(suite: TraceInsightSuite): string[];
9313
9335
  declare function domainEvidencePattern(keywords: string[]): RegExp;
9314
9336
  declare function describeTraceInsightScope(suite: TraceInsightSuite): string;
9315
9337
  declare function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[];
9338
+ declare function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext;
9339
+ declare function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness;
9316
9340
  declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
9317
9341
  declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
9318
9342
 
9319
- export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type 
ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, 
FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, 
InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type 
MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type 
ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type 
TraceEmitterOptions, type TraceEvent, TraceFileMissingError, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQuestion, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTraceInsightPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, 
controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, defaultTraceInsightPanel, deployGateLayer, describeTraceInsightScope, distillPlaybook, domainEvidencePattern, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, inferDomainKeywords, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, 
normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, planTraceInsightQuestions, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, 
summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, tokenizeDomainWords, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
9343
+ export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type 
ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, 
FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, 
InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type 
MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome$1 as RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type 
ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type 
TraceEmitterOptions, type TraceEvent, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, 
composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, defaultTraceInsightPanel, deployGateLayer, describeTraceInsightScope, distillPlaybook, domainEvidencePattern, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, inferDomainKeywords, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, 
matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, planTraceInsightQuestions, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, scoreTraceInsightReadiness, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, 
stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, tokenizeDomainWords, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -15445,13 +15445,19 @@ async function analyzeTraces(input, options) {
15445
15445
  // src/trace-analyst/insights.ts
15446
15446
  var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
15447
15447
  "and",
15448
+ "advanced",
15448
15449
  "app",
15449
15450
  "build",
15450
15451
  "create",
15452
+ "easy",
15453
+ "expert",
15454
+ "extreme",
15451
15455
  "for",
15452
15456
  "from",
15457
+ "hard",
15453
15458
  "implementation",
15454
15459
  "integrate",
15460
+ "medium",
15455
15461
  "project",
15456
15462
  "task",
15457
15463
  "the",
@@ -15541,6 +15547,67 @@ function planTraceInsightQuestions(input) {
15541
15547
  }
15542
15548
  return questions;
15543
15549
  }
15550
/**
 * Assemble the shared trace-insight context for a prompt input.
 *
 * The suite is passed through unchanged; scope, keywords, questions and the
 * analyst panel are derived by the sibling helpers. Optional inputs are
 * normalised so consumers never see `undefined`: `findings` falls back to an
 * empty array, `agent` and `totals` fall back to `null`.
 *
 * @param input Prompt input carrying `suite` and optional `findings`, `agent`, `totals`.
 * @returns Context object with `suite`, `scope`, `keywords`, `questions`,
 *          `panel`, `findings`, `agent` and `totals`.
 */
function buildTraceInsightContext(input) {
  const suite = input.suite;
  // Derive the computed fields in the same order the result lists them.
  const scope = describeTraceInsightScope(suite);
  const keywords = inferDomainKeywords(suite);
  const questions = planTraceInsightQuestions(input);
  const panel = defaultTraceInsightPanel();
  // Normalise the optional pass-through fields.
  const findings = input.findings ?? [];
  const agent = input.agent ?? null;
  const totals = input.totals ?? null;
  return { suite, scope, keywords, questions, panel, findings, agent, totals };
}
15562
+ function scoreTraceInsightReadiness(context) {
15563
+ const failedTasks = context.suite.tasks.filter((task) => task.outcome && task.outcome !== "satisfied");
15564
+ const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds));
15565
+ const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id));
15566
+ const tasksWithGaps = context.suite.tasks.filter((task) => (task.gaps ?? []).length > 0);
15567
+ const gates = [
15568
+ {
15569
+ id: "domain-context",
15570
+ label: "Domain context inferred",
15571
+ passed: context.keywords.length > 0,
15572
+ severity: "high",
15573
+ detail: context.keywords.length > 0 ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}` : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
15574
+ },
15575
+ {
15576
+ id: "panel-coverage",
15577
+ label: "Analyst panel planned",
15578
+ passed: context.panel.length >= 4 && context.questions.length >= 5,
15579
+ severity: "high",
15580
+ detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
15581
+ },
15582
+ {
15583
+ id: "failure-coverage",
15584
+ label: "Failures mapped to findings",
15585
+ passed: failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,
15586
+ severity: "critical",
15587
+ detail: failedTasks.length === 0 ? "No failed tasks in suite." : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
15588
+ },
15589
+ {
15590
+ id: "gap-evidence",
15591
+ label: "Task gaps captured",
15592
+ passed: failedTasks.length === 0 || tasksWithGaps.length / failedTasks.length >= 0.5,
15593
+ severity: "medium",
15594
+ detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`
15595
+ }
15596
+ ];
15597
+ const penalty = gates.reduce((sum2, gate) => {
15598
+ if (gate.passed) return sum2;
15599
+ if (gate.severity === "critical") return sum2 + 35;
15600
+ if (gate.severity === "high") return sum2 + 20;
15601
+ if (gate.severity === "medium") return sum2 + 10;
15602
+ return sum2 + 5;
15603
+ }, 0);
15604
+ const score = Math.max(0, Math.min(1, 1 - penalty / 100));
15605
+ return {
15606
+ score,
15607
+ grade: score >= 0.9 ? "external-ready" : score >= 0.7 ? "internal-review" : "raw-analysis",
15608
+ gates
15609
+ };
15610
+ }
15544
15611
  function defaultTraceInsightPanel() {
15545
15612
  return [
15546
15613
  {
@@ -15566,8 +15633,7 @@ function defaultTraceInsightPanel() {
15566
15633
  ];
15567
15634
  }
15568
15635
  function buildTraceInsightPrompt(input) {
15569
- const questions = planTraceInsightQuestions(input);
15570
- const keywords = inferDomainKeywords(input.suite);
15636
+ const context = buildTraceInsightContext(input);
15571
15637
  const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;
15572
15638
  return `Analyze this benchmark run and produce evidence-backed trace intelligence.
15573
15639
 
@@ -15576,10 +15642,10 @@ Audience:
15576
15642
  - possible customer-facing report for ${input.suite.name}
15577
15643
 
15578
15644
  Investigation plan:
15579
- ${questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join("\n")}
15645
+ ${context.questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join("\n")}
15580
15646
 
15581
15647
  Analyst panel:
15582
- ${defaultTraceInsightPanel().map((role) => `- ${role.name}: ${role.responsibility}`).join("\n")}
15648
+ ${context.panel.map((role) => `- ${role.name}: ${role.responsibility}`).join("\n")}
15583
15649
 
15584
15650
  If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.
15585
15651
 
@@ -15601,11 +15667,11 @@ Budget:
15601
15667
  Run summary:
15602
15668
  ${JSON.stringify({
15603
15669
  suite: input.suite.name,
15604
- scope: describeTraceInsightScope(input.suite),
15605
- inferredKeywords: keywords,
15606
- agent: input.agent ?? null,
15607
- totals: input.totals ?? null,
15608
- findings: (input.findings ?? []).map((finding) => ({
15670
+ scope: context.scope,
15671
+ inferredKeywords: context.keywords,
15672
+ agent: context.agent,
15673
+ totals: context.totals,
15674
+ findings: context.findings.map((finding) => ({
15609
15675
  kind: finding.kind,
15610
15676
  severity: finding.severity,
15611
15677
  taskCount: finding.taskIds.length,
@@ -15728,6 +15794,7 @@ export {
15728
15794
  buildReflectionPrompt,
15729
15795
  buildReviewerPrompt,
15730
15796
  buildTraceAnalystTools,
15797
+ buildTraceInsightContext,
15731
15798
  buildTraceInsightPrompt,
15732
15799
  buildTrajectory,
15733
15800
  byteLengthRange,
@@ -15941,6 +16008,7 @@ export {
15941
16008
  scoreProject,
15942
16009
  scoreRedTeamOutput,
15943
16010
  scoreReferenceReplay,
16011
+ scoreTraceInsightReadiness,
15944
16012
  securityJudge,
15945
16013
  selectHarnessVariant,
15946
16014
  selfPreference,