@tangle-network/agent-eval 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -2828,6 +2828,26 @@ declare class CostTracker {
2828
2828
  timestamp?: number;
2829
2829
  }): CostEntry;
2830
2830
  markOutcome(scenarioId: string, completed: boolean): void;
2831
+ /**
2832
+ * Convenience: record + markOutcome in one call from a
2833
+ * `{ usage, verdict }`-shaped response (starter-foundry's
2834
+ * `invokeMetaJudge` returns this shape; consumers that wrap any
2835
+ * judge/critic can follow the same convention).
2836
+ *
2837
+ * `usage.model` must be present in `MODEL_PRICING` for cost math to
2838
+ * populate; otherwise totalCostUsd stays at 0 for the entry but
2839
+ * tokens still aggregate.
2840
+ */
2841
+ recordVerdict(verdict: {
2842
+ usage?: { // when absent the call returns null before recording or marking anything
2843
+ inputTokens: number;
2844
+ outputTokens: number;
2845
+ model: string; // per the doc comment above: must be a MODEL_PRICING key for cost to populate
2846
+ cachedTokens?: number;
2847
+ reasoningTokens?: number;
2848
+ };
2849
+ verdict?: 'pass' | 'fail' | 'borderline' | string; // only the exact value 'pass' marks the scenario completed; anything else (including undefined) marks it not completed
2850
+ }, scenarioId: string, tags?: Record<string, string>): CostEntry | null;
2831
2851
  get(scenarioId: string): ScenarioCost | undefined;
2832
2852
  list(): ScenarioCost[];
2833
2853
  summary(): CostSummary;
@@ -2843,6 +2863,108 @@ interface CostSummary {
2843
2863
  costPerCompletedTaskUsd: number | null;
2844
2864
  }
2845
2865
 
2866
+ /**
2867
+ * muffled-gate-scanner — test helper that greps consumer source for
2868
+ * gate + measurement anti-patterns and fails with file:line locations.
2869
+ *
2870
+ * Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`;
2871
+ * same shape applies to every consumer (a gate that should fail loud
2872
+ * returns silent success; a metric that should emit a real number
2873
+ * reports noise/empty).
2874
+ *
2875
+ * Usage (in a consumer project's test file):
2876
+ *
2877
+ * import { scanForMuffledGates, formatFindings, DEFAULT_FINDERS } from '@tangle-network/agent-eval'
2878
+ *
2879
+ * test('no muffled gates in eval surface', () => {
2880
+ * const findings = scanForMuffledGates({
2881
+ * repoRoot: process.cwd(),
2882
+ * scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'],
2883
+ * finders: DEFAULT_FINDERS,
2884
+ * })
2885
+ * if (findings.length) assert.fail(formatFindings(findings))
2886
+ * })
2887
+ *
2888
+ * Customize by passing your own `finders` — each finder is
2889
+ * `(file, text) => Finding[]` and runs per-file.
2890
+ *
2891
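+ * Escape hatch: any line containing `muffle-ok:` is skipped by every
2892
+ * built-in finder, letting consumers opt a legitimate fallback out explicitly. The check is made inside each built-in finder, so a custom finder that wants the same opt-out should make it too.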
2893
+ */
2894
+ interface MuffledFinding { // one reported match; the notes below describe what the built-in finders emit
2895
+ file: string; // the file label handed to the finder, echoed back unchanged
2896
+ line: number; // 1-based line number of the match
2897
+ lineText: string; // the matching source line with surrounding whitespace trimmed
2898
+ pattern: string; // human-readable name of the anti-pattern that matched
2899
+ }
2900
+ type MuffledFinder = (file: string, text: string) => MuffledFinding[]; // called once per file with the file label and its full text; returns zero or more findings
2901
+ interface ScanOptions {
2902
+ /** Absolute path to the repo root. */
2903
+ repoRoot: string;
2904
+ /** Explicit file list (paths relative to repoRoot) for context-specific finders. Listed files that do not exist are skipped. */
2905
+ scanFiles: string[];
2906
+ /**
2907
+ * Auto-derived scan: walk `roots` for files whose name matches
2908
+ * `extensions` and whose text contains the string `importsContain`, and
2909
+ * run the universal finders on them. Pattern from starter-foundry H4
2910
+ * (research/decisions/001) — catches new files with agent-eval import
2911
+ * that would otherwise escape context-specific scan lists. The walk skips
2912
+ * `node_modules`, `dist`, `dist-tests`, dot-directories and `*.test.{ts,mjs,js}` files.
+ */
2913
+ autoDerive?: {
2914
+ roots: string[]; // directories to walk, relative to repoRoot
2915
+ extensions: RegExp; // tested against each file's base name, not its full path
2916
+ importsContain: string; // plain substring searched for in the file text
2917
+ universalFinders: MuffledFinder[];
2918
+ };
2919
+ /** Per-file finders (context-specific patterns). */
2920
+ finders: MuffledFinder[];
2921
+ }
2922
+ /**
2923
+ * Default finder: `command || true` in a testCommand/setupCommand/cmd/command
2924
+ * string. Swallows exit codes.
2925
+ */
2926
+ declare const findFallbackToPass: MuffledFinder;
2927
+ /**
2928
+ * `testCommand: 'true'` literal silent-pass — an unknown-language dispatch
2929
+ * arm that returns a no-op instead of throwing.
2930
+ */
2931
+ declare const findLiteralTruePass: MuffledFinder;
2932
+ /**
2933
+ * `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently
2934
+ * dropped in agent-eval <0.7.1. 0.7.1+ honors as fallback, but the form
2935
+ * still invites confusion; prefer `new SubprocessSandboxDriver()` with
2936
+ * cwd in the per-call HarnessConfig.
2937
+ */
2938
+ declare const findConstructorCwdDropped: MuffledFinder;
2939
+ /**
2940
+ * `if (!expected) return true` — matcher auto-passes when ground truth is
2941
+ * absent. Inflates accuracy metrics for scenarios without expectations.
2942
+ */
2943
+ declare const findAutoMatchNoExpectation: MuffledFinder;
2944
+ /**
2945
+ * `if (p.skipped) return true` — skip-counts-as-pass in quality scorers.
2946
+ * Use three-valued `true | false | 'skipped'` return + explicit partial
2947
+ * credit instead.
2948
+ */
2949
+ declare const findSkipCountsAsPass: MuffledFinder;
2950
+ /**
2951
+ * The canonical default bundle. Callers can import these individually,
2952
+ * replace them, or append custom finders for project-specific patterns.
2953
+ */
2954
+ declare const DEFAULT_FINDERS: MuffledFinder[];
2955
+ /** Finders that should run on EVERY file with the target import, not just the explicit `scanFiles` list. */
2956
+ declare const UNIVERSAL_FINDERS: MuffledFinder[];
2957
+ /**
2958
+ * Run all finders against the configured files. Returns a flat list of
2959
+ * findings. Callers format + assert as they prefer.
2960
+ */
2961
+ declare function scanForMuffledGates(opts: ScanOptions): MuffledFinding[];
2962
+ /**
2963
+ * Format findings into a single assert.fail-ready message. Each finding
2964
+ * carries file:line + pattern name + the offending line.
2965
+ */
2966
+ declare function formatFindings(findings: MuffledFinding[]): string;
2967
+
2846
2968
  /**
2847
2969
  * Series convergence — detects whether a sequence of scalar measurements
2848
2970
  * is stabilizing, drifting, or noisy.
@@ -4674,4 +4796,4 @@ interface UseCaseSignals {
4674
4796
  declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
4675
4797
  declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
4676
4798
 
4677
- export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type 
DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, 
type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type 
SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, 
collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, 
statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
4799
+ export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type 
DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, 
MetricsCollector, type MuffledFinder, type MuffledFinding, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, 
checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, formatBenchmarkReport, formatDriverReport, formatFindings, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, 
runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
package/dist/index.js CHANGED
@@ -4853,6 +4853,30 @@ var CostTracker = class {
4853
4853
  if (!bucket) throw new Error(`CostTracker.markOutcome: unknown scenario "${scenarioId}"`);
4854
4854
  bucket.completed = completed;
4855
4855
  }
4856
+ /**
4857
+ * Convenience: record + markOutcome in one call from a
4858
+ * `{ usage, verdict }`-shaped response (starter-foundry's
4859
+ * `invokeMetaJudge` returns this shape; consumers that wrap any
4860
+ * judge/critic can follow the same convention).
4861
+ *
4862
+ * `usage.model` must be present in `MODEL_PRICING` for cost math to
4863
+ * populate; otherwise totalCostUsd stays at 0 for the entry but
4864
+ * tokens still aggregate.
4865
+ */
4866
+ recordVerdict(verdict, scenarioId, tags) {
4867
+ if (!verdict.usage) return null;
4868
+ const entry = this.record({
4869
+ scenarioId,
4870
+ model: verdict.usage.model,
4871
+ inputTokens: verdict.usage.inputTokens,
4872
+ outputTokens: verdict.usage.outputTokens,
4873
+ cachedTokens: verdict.usage.cachedTokens,
4874
+ reasoningTokens: verdict.usage.reasoningTokens,
4875
+ tags
4876
+ });
4877
+ this.markOutcome(scenarioId, verdict.verdict === "pass");
4878
+ return entry;
4879
+ }
4856
4880
  get(scenarioId) {
4857
4881
  return this.byScenario.get(scenarioId);
4858
4882
  }
@@ -4889,6 +4913,179 @@ function assertNonNegative(n, name) {
4889
4913
  }
4890
4914
  }
4891
4915
 
4916
+ // src/muffled-gate-scanner.ts
4917
+ import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
4918
+ import { join } from "path";
4919
/**
 * Reduce one source line to the part the finders should pattern-match:
 * the text before a trailing `//` comment, or "" when the line is a
 * block-comment continuation (optional whitespace followed by `*`).
 *
 * Two cases are handled that a plain `/\/\/.*$/` replace gets wrong:
 *  - `//` inside a quoted string (for example a URL in a command string)
 *    is not a comment start, so the rest of that line stays visible to the
 *    finders instead of being cut off.
 *  - lines ending in "\r" (CRLF files) are stripped like any other line;
 *    `.` does not match "\r", so a regex anchored with `.*$` never fires
 *    on them.
 *
 * Quote tracking is per line and deliberately simple. If a quote is still
 * open at the end of the line the tracking cannot be trusted, so the
 * function falls back to cutting at the first `//`.
 *
 * @param line one line of source text, without its "\n"
 * @returns the code portion of the line, or "" for a comment-only line
 */
function codeOf(line) {
  let openQuote = "";
  let commentStart = -1;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (openQuote) {
      // Inside a string: skip escaped characters, look only for the closer.
      if (ch === "\\") i++;
      else if (ch === openQuote) openQuote = "";
    } else if (ch === "'" || ch === '"' || ch === "`") {
      openQuote = ch;
    } else if (ch === "/" && line[i + 1] === "/") {
      commentStart = i;
      break;
    }
  }
  // Unterminated quote: fall back to the first `//` anywhere on the line.
  if (openQuote) commentStart = line.indexOf("//");
  const code = commentStart === -1 ? line : line.slice(0, commentStart);
  return /^\s*\*/.test(code) ? "" : code;
}
4922
/**
 * Escape-hatch check: true when the raw line carries the `muffle-ok:`
 * marker anywhere in it (code or comment).
 */
function isMuffleOk(line) {
  const MARKER = "muffle-ok:";
  return line.indexOf(MARKER) !== -1;
}
4925
/**
 * Finder: a `|| true` fallback on a line that also mentions
 * testCommand / setupCommand / cmd / command, which swallows the exit code
 * of the command in front of it.
 *
 * The fallback is matched with any amount of whitespace after `||`
 * (`||true`, `|| true`, `||   true`). The shell accepts all of these, so
 * requiring exactly one space lets the tighter spellings slip through.
 * Every line the single-space pattern matched is still matched.
 *
 * Lines carrying `muffle-ok:` and comment-only lines are skipped.
 *
 * @param file label copied into each finding
 * @param text full file contents
 * @returns one finding per offending line, with 1-based line numbers
 */
var findFallbackToPass = (file, text) => {
  const fallbackToTrue = /\|\|\s*true/;
  const commandContext = /(testCommand|setupCommand|cmd|command)/;
  const out = [];
  text.split("\n").forEach((line, index) => {
    if (isMuffleOk(line)) return;
    const code = codeOf(line);
    if (!code.trim()) return;
    if (fallbackToTrue.test(code) && commandContext.test(code)) {
      out.push({
        file,
        line: index + 1,
        lineText: line.trim(),
        pattern: "fallback-to-pass (|| true in command string)"
      });
    }
  });
  return out;
};
4939
/**
 * Finder: `testCommand: 'true'` (single or double quoted) — a test command
 * that is the literal no-op `true`.
 *
 * Lines carrying `muffle-ok:` and lines with no code left after comment
 * stripping are ignored. Line numbers in the findings are 1-based.
 */
var findLiteralTruePass = (file, text) => {
  const literalTrue = /testCommand\s*:\s*['"]true['"]/;
  const findings = [];
  let lineNumber = 0;
  for (const rawLine of text.split("\n")) {
    lineNumber += 1;
    if (isMuffleOk(rawLine)) continue;
    const stripped = codeOf(rawLine);
    if (!stripped.trim()) continue;
    if (!literalTrue.test(stripped)) continue;
    findings.push({
      file,
      line: lineNumber,
      lineText: rawLine.trim(),
      pattern: 'literal-true-pass (testCommand: "true")'
    });
  }
  return findings;
};
4953
/**
 * Finder: `new SubprocessSandboxDriver({ ... cwd: ... })` written on a
 * single line, i.e. `cwd` passed through the constructor options object.
 *
 * Each line is checked independently; lines carrying `muffle-ok:` and
 * comment-only lines produce nothing. Line numbers are 1-based.
 */
var findConstructorCwdDropped = (file, text) => {
  const ctorWithCwd = /new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/;
  return text.split("\n").flatMap((rawLine, index) => {
    if (isMuffleOk(rawLine)) return [];
    const code = codeOf(rawLine);
    const flagged = Boolean(code.trim()) && ctorWithCwd.test(code);
    if (!flagged) return [];
    return [
      {
        file,
        line: index + 1,
        lineText: rawLine.trim(),
        pattern: "construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)"
      }
    ];
  });
};
4972
/**
 * Finder: `if (!expected) return true` — a matcher that reports success
 * when there is no expectation to compare against.
 *
 * Walks the file line by line, skipping `muffle-ok:` lines and lines that
 * are empty once comments are stripped. Line numbers are 1-based.
 */
var findAutoMatchNoExpectation = (file, text) => {
  const autoPass = /if\s*\(\s*!expected\s*\)\s*return\s+true/;
  return text.split("\n").reduce((found, rawLine, index) => {
    if (isMuffleOk(rawLine)) return found;
    const code = codeOf(rawLine);
    if (code.trim() && autoPass.test(code)) {
      found.push({
        file,
        line: index + 1,
        lineText: rawLine.trim(),
        pattern: "auto-match-no-expectation (if (!expected) return true)"
      });
    }
    return found;
  }, []);
};
4991
/**
 * Finder: `if (<name>.skipped) return true` — a scorer that counts a
 * skipped item as a pass.
 *
 * `muffle-ok:` lines and comment-only lines are ignored. Line numbers in
 * the findings are 1-based.
 */
var findSkipCountsAsPass = (file, text) => {
  const skipAsPass = /if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/;
  const rawLines = text.split("\n");
  const findings = [];
  let index = 0;
  while (index < rawLines.length) {
    const rawLine = rawLines[index];
    index += 1; // from here on `index` is the 1-based number of rawLine
    if (isMuffleOk(rawLine)) continue;
    const code = codeOf(rawLine);
    if (!code.trim() || !skipAsPass.test(code)) continue;
    findings.push({
      file,
      line: index,
      lineText: rawLine.trim(),
      pattern: "skip-counts-as-pass (if (.skipped) return true)"
    });
  }
  return findings;
};
5010
// Finder set exported as DEFAULT_FINDERS; order is preserved because findings
// are appended in finder order by the scanner.
var DEFAULT_FINDERS = [findFallbackToPass, findLiteralTruePass, findAutoMatchNoExpectation, findSkipCountsAsPass];
5016
// Finder set exported as UNIVERSAL_FINDERS (currently only the constructor-cwd check).
var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
5019
/**
 * Recursively walks each entry of `roots` (paths relative to `repoRoot`) and
 * collects the repo-relative paths of files whose name matches `extensions`
 * and whose text contains the substring `importsContain`.
 *
 * Skipped while walking: directories named `node_modules`, `dist`,
 * `dist-tests`, or starting with "."; files ending in `.test.ts`,
 * `.test.mjs`, or `.test.js`; entries that cannot be stat'ed or read; roots
 * that do not exist.
 *
 * @param repoRoot Base directory that every relative path is joined onto.
 * @param roots Relative directories to walk.
 * @param extensions RegExp tested against each file's base name.
 * @param importsContain Substring a file must contain to be returned.
 * @returns Relative paths (built with `join`) of the matching files.
 */
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
  const matches2 = [];
  const walk = (rel) => {
    const abs = join(repoRoot, rel);
    if (!existsSync2(abs)) return;
    for (const entry of readdirSync(abs)) {
      const sub = join(rel, entry);
      const subAbs = join(repoRoot, sub);
      let st;
      try {
        st = statSync(subAbs);
      } catch {
        continue;
      }
      if (st.isDirectory()) {
        const ignoredDir = entry === "node_modules" || entry === "dist" || entry === "dist-tests" || entry.startsWith(".");
        if (!ignoredDir) walk(sub);
        continue;
      }
      if (!st.isFile()) continue;
      // A RegExp carrying the `g` or `y` flag remembers `lastIndex` between
      // `.test()` calls; without this reset a match on one file name makes the
      // next file name start matching past its end and get skipped.
      extensions.lastIndex = 0;
      if (!extensions.test(entry)) continue;
      if (entry.endsWith(".test.ts") || entry.endsWith(".test.mjs") || entry.endsWith(".test.js")) continue;
      let text;
      try {
        text = readFileSync2(subAbs, "utf8");
      } catch {
        continue;
      }
      if (text.includes(importsContain)) matches2.push(sub);
    }
  };
  for (const r of roots) walk(r);
  return matches2;
}
5051
+ function scanForMuffledGates(opts) {
5052
+ const findings = [];
5053
+ const scanned = /* @__PURE__ */ new Set();
5054
+ for (const file of opts.scanFiles) {
5055
+ const abs = join(opts.repoRoot, file);
5056
+ if (!existsSync2(abs)) continue;
5057
+ const text = readFileSync2(abs, "utf8");
5058
+ for (const find of opts.finders) findings.push(...find(file, text));
5059
+ scanned.add(file);
5060
+ }
5061
+ if (opts.autoDerive) {
5062
+ const importers = autoDeriveImporters(
5063
+ opts.repoRoot,
5064
+ opts.autoDerive.roots,
5065
+ opts.autoDerive.extensions,
5066
+ opts.autoDerive.importsContain
5067
+ );
5068
+ for (const file of importers) {
5069
+ if (scanned.has(file)) continue;
5070
+ const abs = join(opts.repoRoot, file);
5071
+ if (!existsSync2(abs)) continue;
5072
+ const text = readFileSync2(abs, "utf8");
5073
+ for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
5074
+ }
5075
+ }
5076
+ return findings;
5077
+ }
5078
/**
 * Renders scanner findings as a human-readable, newline-joined report.
 *
 * Returns the empty string when `findings` is empty. Otherwise the report is:
 * a count line, a line explaining the `// muffle-ok: <reason>` annotation, a
 * blank line, and then one entry per finding showing `file:line — pattern`
 * followed by the offending line text on the next line.
 *
 * @param findings Array of `{ file, line, lineText, pattern }` objects.
 * @returns The formatted report, or "" when there is nothing to report.
 */
+ function formatFindings(findings) {
5079
+ if (findings.length === 0) return "";
5080
+ return [
5081
+ `Found ${findings.length} muffled-gate pattern(s).`,
5082
+ `Fix each or annotate the line with "// muffle-ok: <reason>".`,
5083
+ "",
5084
+ ...findings.map((f) => ` ${f.file}:${f.line} \u2014 ${f.pattern}
5085
+ ${f.lineText}`)
5086
+ ].join("\n");
5087
+ }
5088
+
4892
5089
  // src/series-convergence.ts
4893
5090
  function analyzeSeries(values, options = {}) {
4894
5091
  const window = options.window ?? 5;
@@ -6858,7 +7055,7 @@ async function commitBisect(options) {
6858
7055
  }
6859
7056
  async function promptBisect(options) {
6860
7057
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
6861
- const join = (paragraphs) => paragraphs.join("\n\n");
7058
+ const join2 = (paragraphs) => paragraphs.join("\n\n");
6862
7059
  const goodParas = split(options.good);
6863
7060
  const badParas = split(options.bad);
6864
7061
  if (goodParas.length !== badParas.length) {
@@ -6876,7 +7073,7 @@ async function promptBisect(options) {
6876
7073
  const result = await bisect({
6877
7074
  good: goodMask,
6878
7075
  bad: badMask,
6879
- runEval: (mask) => options.runEval(join(paragraphsFor(mask))),
7076
+ runEval: (mask) => options.runEval(join2(paragraphsFor(mask))),
6880
7077
  maxIterations: options.maxIterations ?? n + 5,
6881
7078
  halfway: (g, b) => {
6882
7079
  for (let i = 0; i < g.length; i++) {
@@ -6907,12 +7104,12 @@ async function promptBisect(options) {
6907
7104
  }
6908
7105
  }
6909
7106
  const materializedPath = result.path.map((s) => ({
6910
- state: join(paragraphsFor(s.state)),
7107
+ state: join2(paragraphsFor(s.state)),
6911
7108
  score: s.score,
6912
7109
  pass: s.pass
6913
7110
  }));
6914
7111
  return {
6915
- culprit: join(paragraphsFor(culprit)),
7112
+ culprit: join2(paragraphsFor(culprit)),
6916
7113
  path: materializedPath,
6917
7114
  converged: result.converged,
6918
7115
  inputInconsistent: result.inputInconsistent,
@@ -7821,6 +8018,7 @@ export {
7821
8018
  CostTracker,
7822
8019
  DEFAULT_AGENT_SLOS,
7823
8020
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
8021
+ DEFAULT_FINDERS,
7824
8022
  DEFAULT_HARNESS_OBJECTIVES,
7825
8023
  DEFAULT_MUTATORS,
7826
8024
  DEFAULT_REDACTION_RULES,
@@ -7858,6 +8056,7 @@ export {
7858
8056
  TRACE_SCHEMA_VERSION,
7859
8057
  TokenCounter,
7860
8058
  TraceEmitter,
8059
+ UNIVERSAL_FINDERS,
7861
8060
  adversarialJudge,
7862
8061
  aggregateLlm,
7863
8062
  aggregateRunScore,
@@ -7916,9 +8115,15 @@ export {
7916
8115
  failureClusterView,
7917
8116
  fileContains,
7918
8117
  fileExists,
8118
+ findAutoMatchNoExpectation,
8119
+ findConstructorCwdDropped,
8120
+ findFallbackToPass,
8121
+ findLiteralTruePass,
8122
+ findSkipCountsAsPass,
7919
8123
  firstDivergenceView,
7920
8124
  formatBenchmarkReport,
7921
8125
  formatDriverReport,
8126
+ formatFindings,
7922
8127
  groupBy,
7923
8128
  hashContent,
7924
8129
  hashScenarios,
@@ -7991,6 +8196,7 @@ export {
7991
8196
  runSelfPlay,
7992
8197
  runTestGradedScenario,
7993
8198
  runsForScenario,
8199
+ scanForMuffledGates,
7994
8200
  scoreAllProjects,
7995
8201
  scoreContinuity,
7996
8202
  scoreProject,