npm - @tangle-network/agent-eval - Versions diffs - 0.7.1 → 0.7.2 - Mend

@tangle-network/agent-eval 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -2828,6 +2828,26 @@ declare class CostTracker {
         timestamp?: number;
     }): CostEntry;
     markOutcome(scenarioId: string, completed: boolean): void;
+    /**
+     * Convenience: record + markOutcome in one call from a `{ usage, verdict }`-shaped
+     * response (starter-foundry's `invokeMetaJudge` returns this shape; consumers that
+     * wrap any judge/critic can follow the same convention).
+     *
+     * Returns `null` — nothing recorded, no outcome marked — when `usage` is absent.
+     * Only `verdict === 'pass'` marks the scenario completed. `usage.model` must be
+     * present in `MODEL_PRICING` for cost math to populate; otherwise totalCostUsd
+     * stays at 0 for the entry but tokens still aggregate.
+     */
+    recordVerdict(verdict: {
+        usage?: {
+            inputTokens: number;
+            outputTokens: number;
+            model: string;
+            cachedTokens?: number;
+            reasoningTokens?: number;
+        };
+        verdict?: 'pass' | 'fail' | 'borderline' | string;
+    }, scenarioId: string, tags?: Record<string, string>): CostEntry | null;
     get(scenarioId: string): ScenarioCost | undefined;
     list(): ScenarioCost[];
     summary(): CostSummary;
@@ -2843,6 +2863,108 @@ interface CostSummary {
     costPerCompletedTaskUsd: number | null;
 }
+/**
+ * muffled-gate-scanner — test helper that greps consumer source for
+ * gate + measurement anti-patterns and fails with file:line locations.
+ *
+ * Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`;
+ * same shape applies to every consumer (a gate that should fail loud
+ * returns silent success; a metric that should emit a real number
+ * reports noise/empty).
+ *
+ * Usage (in a consumer project's test file):
+ *
+ *   import { scanForMuffledGates, formatFindings, DEFAULT_FINDERS } from '@tangle-network/agent-eval'
+ *
+ *   test('no muffled gates in eval surface', () => {
+ *     const findings = scanForMuffledGates({
+ *       repoRoot: process.cwd(),
+ *       scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'],
+ *       finders: DEFAULT_FINDERS,
+ *     })
+ *     if (findings.length) assert.fail(formatFindings(findings))
+ *   })
+ *
+ * Customize by passing your own `finders` — each finder is
+ * `(file, text) => MuffledFinding[]` and runs per-file.
+ *
+ * Escape hatch: any line containing `muffle-ok:` is skipped by every built-in
+ * finder, letting consumers opt a legitimate fallback out explicitly.
+ */
+interface MuffledFinding {
+    file: string;
+    line: number;
+    lineText: string;
+    pattern: string;
+}
+type MuffledFinder = (file: string, text: string) => MuffledFinding[];
+interface ScanOptions {
+    /** Absolute path to the repo root. */
+    repoRoot: string;
+    /** Explicit file list (paths relative to repoRoot) for context-specific finders. */
+    scanFiles: string[];
+    /**
+     * Auto-derived scan: walk the `roots` dirs for files whose name matches
+     * `extensions` and whose text contains the string `importsContain`, then
+     * run `universalFinders` on them. Pattern from starter-foundry H4
+     * (research/decisions/001) — catches new files with an agent-eval import
+     * that would otherwise escape context-specific scan lists.
+     */
+    autoDerive?: {
+        roots: string[];
+        extensions: RegExp;
+        importsContain: string;
+        universalFinders: MuffledFinder[];
+    };
+    /** Per-file finders (context-specific patterns). */
+    finders: MuffledFinder[];
+}
+/**
+ * Default finder: `command || true` in a testCommand/setupCommand/cmd/command
+ * string. Swallows exit codes.
+ */
+declare const findFallbackToPass: MuffledFinder;
+/**
+ * `testCommand: 'true'` literal silent-pass — an unknown-language dispatch
+ * arm that returns a no-op instead of throwing.
+ */
+declare const findLiteralTruePass: MuffledFinder;
+/**
+ * `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently
+ * dropped in agent-eval <0.7.1. 0.7.1+ honors as fallback, but the form
+ * still invites confusion; prefer `new SubprocessSandboxDriver()` with
+ * cwd in the per-call HarnessConfig.
+ */
+declare const findConstructorCwdDropped: MuffledFinder;
+/**
+ * `if (!expected) return true` — matcher auto-passes when ground truth is
+ * absent. Inflates accuracy metrics for scenarios without expectations.
+ */
+declare const findAutoMatchNoExpectation: MuffledFinder;
+/**
+ * `if (p.skipped) return true` — skip-counts-as-pass in quality scorers.
+ * Use three-valued `true | false | 'skipped'` return + explicit partial
+ * credit instead.
+ */
+declare const findSkipCountsAsPass: MuffledFinder;
+/**
+ * The canonical default bundle. Callers can import these individually,
+ * replace them, or append custom finders for project-specific patterns.
+ */
+declare const DEFAULT_FINDERS: MuffledFinder[];
+/** Finders meant for `autoDerive.universalFinders`: run on auto-derived files containing the target import that are not already listed in `scanFiles`. */
+declare const UNIVERSAL_FINDERS: MuffledFinder[];
+/**
+ * Run all finders against the configured files. Returns a flat list of
+ * findings. Callers format + assert as they prefer.
+ */
+declare function scanForMuffledGates(opts: ScanOptions): MuffledFinding[];
+/**
+ * Format findings into a single assert.fail-ready message. Each finding
+ * carries file:line + pattern name + the offending line.
+ */
+declare function formatFindings(findings: MuffledFinding[]): string;
 /**
  * Series convergence — detects whether a sequence of scalar measurements
  * is stabilizing, drifting, or noisy.
@@ -4674,4 +4796,4 @@ interface UseCaseSignals {
 declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
 declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
-export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type 
DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, 
type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type 
SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, 
collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, 
statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
+export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type 
DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, 
MetricsCollector, type MuffledFinder, type MuffledFinding, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, 
checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, formatBenchmarkReport, formatDriverReport, formatFindings, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, 
runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };

package/dist/index.js CHANGED Viewed

@@ -4853,6 +4853,30 @@ var CostTracker = class {
     if (!bucket) throw new Error(`CostTracker.markOutcome: unknown scenario "${scenarioId}"`);
     bucket.completed = completed;
   }
+  /**
+   * Convenience: record + markOutcome in one call from a `{ usage, verdict }`-shaped
+   * response (starter-foundry's `invokeMetaJudge` returns this shape; consumers that
+   * wrap any judge/critic can follow the same convention).
+   *
+   * Returns `null` — nothing recorded, no outcome marked — when `usage` is absent.
+   * Only `verdict === "pass"` marks the scenario completed. `usage.model` must be
+   * present in `MODEL_PRICING` for cost math to populate; otherwise totalCostUsd
+   * stays at 0 for the entry but tokens still aggregate.
+   */
+  recordVerdict(verdict, scenarioId, tags) {
+    if (!verdict.usage) return null;
+    const entry = this.record({
+      scenarioId,
+      model: verdict.usage.model,
+      inputTokens: verdict.usage.inputTokens,
+      outputTokens: verdict.usage.outputTokens,
+      cachedTokens: verdict.usage.cachedTokens,
+      reasoningTokens: verdict.usage.reasoningTokens,
+      tags
+    });
+    this.markOutcome(scenarioId, verdict.verdict === "pass");
+    return entry;
+  }
   get(scenarioId) {
     return this.byScenario.get(scenarioId);
   }
@@ -4889,6 +4913,179 @@ function assertNonNegative(n, name) {
   }
 }
+// src/muffled-gate-scanner.ts
+import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
+import { join } from "path";
+function codeOf(line) {
+  return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
+}
+function isMuffleOk(line) {
+  return line.includes("muffle-ok:");
+}
+var findFallbackToPass = (file, text) => {
+  const out = [];
+  const lines = text.split("\n");
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (isMuffleOk(line)) continue;
+    const code = codeOf(line);
+    if (!code.trim()) continue;
+    if (/\|\| true/.test(code) && /(testCommand|setupCommand|cmd|command)/.test(code)) {
+      out.push({ file, line: i + 1, lineText: line.trim(), pattern: "fallback-to-pass (|| true in command string)" });
+    }
+  }
+  return out;
+};
+var findLiteralTruePass = (file, text) => {
+  const out = [];
+  const lines = text.split("\n");
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (isMuffleOk(line)) continue;
+    const code = codeOf(line);
+    if (!code.trim()) continue;
+    if (/testCommand\s*:\s*['"]true['"]/.test(code)) {
+      out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'literal-true-pass (testCommand: "true")' });
+    }
+  }
+  return out;
+};
+var findConstructorCwdDropped = (file, text) => {
+  const out = [];
+  const lines = text.split("\n");
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (isMuffleOk(line)) continue;
+    const code = codeOf(line);
+    if (!code.trim()) continue;
+    if (/new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/.test(code)) {
+      out.push({
+        file,
+        line: i + 1,
+        lineText: line.trim(),
+        pattern: "construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)"
+      });
+    }
+  }
+  return out;
+};
+var findAutoMatchNoExpectation = (file, text) => {
+  const out = [];
+  const lines = text.split("\n");
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (isMuffleOk(line)) continue;
+    const code = codeOf(line);
+    if (!code.trim()) continue;
+    if (/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(code)) {
+      out.push({
+        file,
+        line: i + 1,
+        lineText: line.trim(),
+        pattern: "auto-match-no-expectation (if (!expected) return true)"
+      });
+    }
+  }
+  return out;
+};
+var findSkipCountsAsPass = (file, text) => {
+  const out = [];
+  const lines = text.split("\n");
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (isMuffleOk(line)) continue;
+    const code = codeOf(line);
+    if (!code.trim()) continue;
+    if (/if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(code)) {
+      out.push({
+        file,
+        line: i + 1,
+        lineText: line.trim(),
+        pattern: "skip-counts-as-pass (if (.skipped) return true)"
+      });
+    }
+  }
+  return out;
+};
+var DEFAULT_FINDERS = [
+  findFallbackToPass,
+  findLiteralTruePass,
+  findAutoMatchNoExpectation,
+  findSkipCountsAsPass
+];
+var UNIVERSAL_FINDERS = [
+  findConstructorCwdDropped
+];
+function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
+  const matches2 = [];
+  const walk = (rel) => {
+    const abs = join(repoRoot, rel);
+    if (!existsSync2(abs)) return;
+    for (const entry of readdirSync(abs)) {
+      const sub = join(rel, entry);
+      const subAbs = join(repoRoot, sub);
+      let st;
+      try {
+        st = statSync(subAbs);
+      } catch {
+        continue;
+      }
+      if (st.isDirectory()) {
+        if (entry === "node_modules" || entry === "dist" || entry === "dist-tests" || entry.startsWith(".")) continue;
+        walk(sub);
+      } else if (st.isFile() && extensions.test(entry)) {
+        if (entry.endsWith(".test.ts") || entry.endsWith(".test.mjs") || entry.endsWith(".test.js")) continue;
+        let text;
+        try {
+          text = readFileSync2(subAbs, "utf8");
+        } catch {
+          continue;
+        }
+        if (text.includes(importsContain)) matches2.push(sub);
+      }
+    }
+  };
+  for (const r of roots) walk(r);
+  return matches2;
+}
+function scanForMuffledGates(opts) {
+  const findings = [];
+  const scanned = /* @__PURE__ */ new Set();
+  for (const file of opts.scanFiles) {
+    const abs = join(opts.repoRoot, file);
+    if (!existsSync2(abs)) continue;
+    const text = readFileSync2(abs, "utf8");
+    for (const find of opts.finders) findings.push(...find(file, text));
+    scanned.add(file);
+  }
+  if (opts.autoDerive) {
+    const importers = autoDeriveImporters(
+      opts.repoRoot,
+      opts.autoDerive.roots,
+      opts.autoDerive.extensions,
+      opts.autoDerive.importsContain
+    );
+    for (const file of importers) {
+      if (scanned.has(file)) continue;
+      const abs = join(opts.repoRoot, file);
+      if (!existsSync2(abs)) continue;
+      const text = readFileSync2(abs, "utf8");
+      for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
+    }
+  }
+  return findings;
+}
+function formatFindings(findings) {
+  if (findings.length === 0) return "";
+  return [
+    `Found ${findings.length} muffled-gate pattern(s).`,
+    `Fix each or annotate the line with "// muffle-ok: <reason>".`,
+    "",
+    ...findings.map((f) => `  ${f.file}:${f.line} \u2014 ${f.pattern}
+    ${f.lineText}`)
+  ].join("\n");
+}
 // src/series-convergence.ts
 function analyzeSeries(values, options = {}) {
   const window = options.window ?? 5;
@@ -6858,7 +7055,7 @@ async function commitBisect(options) {
 }
 async function promptBisect(options) {
   const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
-  const join = (paragraphs) => paragraphs.join("\n\n");
+  const join2 = (paragraphs) => paragraphs.join("\n\n");
   const goodParas = split(options.good);
   const badParas = split(options.bad);
   if (goodParas.length !== badParas.length) {
@@ -6876,7 +7073,7 @@ async function promptBisect(options) {
   const result = await bisect({
     good: goodMask,
     bad: badMask,
-    runEval: (mask) => options.runEval(join(paragraphsFor(mask))),
+    runEval: (mask) => options.runEval(join2(paragraphsFor(mask))),
     maxIterations: options.maxIterations ?? n + 5,
     halfway: (g, b) => {
       for (let i = 0; i < g.length; i++) {
@@ -6907,12 +7104,12 @@ async function promptBisect(options) {
     }
   }
   const materializedPath = result.path.map((s) => ({
-    state: join(paragraphsFor(s.state)),
+    state: join2(paragraphsFor(s.state)),
     score: s.score,
     pass: s.pass
   }));
   return {
-    culprit: join(paragraphsFor(culprit)),
+    culprit: join2(paragraphsFor(culprit)),
     path: materializedPath,
     converged: result.converged,
     inputInconsistent: result.inputInconsistent,
@@ -7821,6 +8018,7 @@ export {
   CostTracker,
   DEFAULT_AGENT_SLOS,
   DEFAULT_RULES as DEFAULT_FAILURE_RULES,
+  DEFAULT_FINDERS,
   DEFAULT_HARNESS_OBJECTIVES,
   DEFAULT_MUTATORS,
   DEFAULT_REDACTION_RULES,
@@ -7858,6 +8056,7 @@ export {
   TRACE_SCHEMA_VERSION,
   TokenCounter,
   TraceEmitter,
+  UNIVERSAL_FINDERS,
   adversarialJudge,
   aggregateLlm,
   aggregateRunScore,
@@ -7916,9 +8115,15 @@ export {
   failureClusterView,
   fileContains,
   fileExists,
+  findAutoMatchNoExpectation,
+  findConstructorCwdDropped,
+  findFallbackToPass,
+  findLiteralTruePass,
+  findSkipCountsAsPass,
   firstDivergenceView,
   formatBenchmarkReport,
   formatDriverReport,
+  formatFindings,
   groupBy,
   hashContent,
   hashScenarios,
@@ -7991,6 +8196,7 @@ export {
   runSelfPlay,
   runTestGradedScenario,
   runsForScenario,
+  scanForMuffledGates,
   scoreAllProjects,
   scoreContinuity,
   scoreProject,