@tangle-network/agent-eval 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +123 -1
- package/dist/index.js +210 -4
- package/dist/index.js.map +1 -1
- package/package.json +10 -9
package/dist/index.d.ts
CHANGED
|
@@ -2828,6 +2828,26 @@ declare class CostTracker {
|
|
|
2828
2828
|
timestamp?: number;
|
|
2829
2829
|
}): CostEntry;
|
|
2830
2830
|
markOutcome(scenarioId: string, completed: boolean): void;
|
|
2831
|
+
/**
|
|
2832
|
+
* Convenience: record + markOutcome in one call from a
|
|
2833
|
+
* `{ usage, verdict }`-shaped response (starter-foundry's
|
|
2834
|
+
* `invokeMetaJudge` returns this shape; consumers that wrap any
|
|
2835
|
+
* judge/critic can follow the same convention).
|
|
2836
|
+
*
|
|
2837
|
+
* `usage.model` must be present in `MODEL_PRICING` for cost math to
|
|
2838
|
+
* populate; otherwise totalCostUsd stays at 0 for the entry but
|
|
2839
|
+
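* tokens still aggregate. Returns null without recording anything when `usage` is absent; only a verdict of 'pass' marks the scenario completed.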
|
|
2840
|
+
*/
|
|
2841
|
+
recordVerdict(verdict: {
|
|
2842
|
+
usage?: {
|
|
2843
|
+
inputTokens: number;
|
|
2844
|
+
outputTokens: number;
|
|
2845
|
+
model: string;
|
|
2846
|
+
cachedTokens?: number;
|
|
2847
|
+
reasoningTokens?: number;
|
|
2848
|
+
};
|
|
2849
|
+
verdict?: 'pass' | 'fail' | 'borderline' | string;
|
|
2850
|
+
}, scenarioId: string, tags?: Record<string, string>): CostEntry | null;
|
|
2831
2851
|
get(scenarioId: string): ScenarioCost | undefined;
|
|
2832
2852
|
list(): ScenarioCost[];
|
|
2833
2853
|
summary(): CostSummary;
|
|
@@ -2843,6 +2863,108 @@ interface CostSummary {
|
|
|
2843
2863
|
costPerCompletedTaskUsd: number | null;
|
|
2844
2864
|
}
|
|
2845
2865
|
|
|
2866
|
+
/**
|
|
2867
|
+
* muffled-gate-scanner — test helper that greps consumer source for
|
|
2868
|
+
* gate + measurement anti-patterns and fails with file:line locations.
|
|
2869
|
+
*
|
|
2870
|
+
* Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`;
|
|
2871
|
+
* same shape applies to every consumer (a gate that should fail loud
|
|
2872
|
+
* returns silent success; a metric that should emit a real number
|
|
2873
|
+
* reports noise/empty).
|
|
2874
|
+
*
|
|
2875
|
+
* Usage (in a consumer project's test file):
|
|
2876
|
+
*
|
|
2877
|
+
* import { scanForMuffledGates, DEFAULT_FINDERS, formatFindings } from '@tangle-network/agent-eval'
|
|
2878
|
+
*
|
|
2879
|
+
* test('no muffled gates in eval surface', () => {
|
|
2880
|
+
* const findings = scanForMuffledGates({
|
|
2881
|
+
* repoRoot: process.cwd(),
|
|
2882
|
+
* scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'],
|
|
2883
|
+
* finders: DEFAULT_FINDERS,
|
|
2884
|
+
* })
|
|
2885
|
+
* if (findings.length) assert.fail(formatFindings(findings))
|
|
2886
|
+
* })
|
|
2887
|
+
*
|
|
2888
|
+
* Customize by passing your own `finders` — each finder is
|
|
2889
|
+
* `(file, text) => Finding[]` and runs per-file.
|
|
2890
|
+
*
|
|
2891
|
+
* Escape hatch: any line containing `muffle-ok:` is excluded from all
|
|
2892
|
+
* finders, letting consumers opt a legitimate fallback out explicitly.
|
|
2893
|
+
*/
|
|
2894
|
+
interface MuffledFinding {
|
|
2895
|
+
file: string;
|
|
2896
|
+
line: number;
|
|
2897
|
+
lineText: string;
|
|
2898
|
+
pattern: string;
|
|
2899
|
+
}
|
|
2900
|
+
type MuffledFinder = (file: string, text: string) => MuffledFinding[];
|
|
2901
|
+
interface ScanOptions {
|
|
2902
|
+
/** Absolute path to the repo root. */
|
|
2903
|
+
repoRoot: string;
|
|
2904
|
+
/** Explicit file list (paths relative to repoRoot) for context-specific finders. */
|
|
2905
|
+
scanFiles: string[];
|
|
2906
|
+
/**
|
|
2907
|
+
* Auto-derived scan: walk `roots` for files matching `extensions` + the
|
|
2908
|
+
* string `importsContain` and run the universal finders on them. Pattern
|
|
2909
|
+
* from starter-foundry H4 (research/decisions/001) — catches new files
|
|
2910
|
+
* with an agent-eval import that would otherwise escape context-specific
|
|
2911
|
+
* scan lists.
|
|
2912
|
+
*/
|
|
2913
|
+
autoDerive?: {
|
|
2914
|
+
roots: string[];
|
|
2915
|
+
extensions: RegExp;
|
|
2916
|
+
importsContain: string;
|
|
2917
|
+
universalFinders: MuffledFinder[];
|
|
2918
|
+
};
|
|
2919
|
+
/** Per-file finders (context-specific patterns). */
|
|
2920
|
+
finders: MuffledFinder[];
|
|
2921
|
+
}
|
|
2922
|
+
/**
|
|
2923
|
+
* Default finder: `command || true` in a testCommand/setupCommand/cmd/command
|
|
2924
|
+
* string. Swallows exit codes.
|
|
2925
|
+
*/
|
|
2926
|
+
declare const findFallbackToPass: MuffledFinder;
|
|
2927
|
+
/**
|
|
2928
|
+
* `testCommand: 'true'` literal silent-pass — an unknown-language dispatch
|
|
2929
|
+
* arm that returns a no-op instead of throwing.
|
|
2930
|
+
*/
|
|
2931
|
+
declare const findLiteralTruePass: MuffledFinder;
|
|
2932
|
+
/**
|
|
2933
|
+
* `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently
|
|
2934
|
+
* dropped in agent-eval <0.7.1. 0.7.1+ honors it as a fallback, but the form
|
|
2935
|
+
* still invites confusion; prefer `new SubprocessSandboxDriver()` with
|
|
2936
|
+
* cwd in the per-call HarnessConfig.
|
|
2937
|
+
*/
|
|
2938
|
+
declare const findConstructorCwdDropped: MuffledFinder;
|
|
2939
|
+
/**
|
|
2940
|
+
* `if (!expected) return true` — matcher auto-passes when ground truth is
|
|
2941
|
+
* absent. Inflates accuracy metrics for scenarios without expectations.
|
|
2942
|
+
*/
|
|
2943
|
+
declare const findAutoMatchNoExpectation: MuffledFinder;
|
|
2944
|
+
/**
|
|
2945
|
+
* `if (p.skipped) return true` — skip-counts-as-pass in quality scorers.
|
|
2946
|
+
* Use three-valued `true | false | 'skipped'` return + explicit partial
|
|
2947
|
+
* credit instead.
|
|
2948
|
+
*/
|
|
2949
|
+
declare const findSkipCountsAsPass: MuffledFinder;
|
|
2950
|
+
/**
|
|
2951
|
+
* The canonical default bundle. Callers can import these individually,
|
|
2952
|
+
* replace them, or append custom finders for project-specific patterns.
|
|
2953
|
+
*/
|
|
2954
|
+
declare const DEFAULT_FINDERS: MuffledFinder[];
|
|
2955
|
+
/** Finders that should run on EVERY file with the target import, not just the files listed in `scanFiles`. */
|
|
2956
|
+
declare const UNIVERSAL_FINDERS: MuffledFinder[];
|
|
2957
|
+
/**
|
|
2958
|
+
* Run all finders against the configured files. Returns a flat list of
|
|
2959
|
+
* findings. Callers format + assert as they prefer.
|
|
2960
|
+
*/
|
|
2961
|
+
declare function scanForMuffledGates(opts: ScanOptions): MuffledFinding[];
|
|
2962
|
+
/**
|
|
2963
|
+
* Format findings into a single assert.fail-ready message. Each finding
|
|
2964
|
+
* carries file:line + pattern name + the offending line.
|
|
2965
|
+
*/
|
|
2966
|
+
declare function formatFindings(findings: MuffledFinding[]): string;
|
|
2967
|
+
|
|
2846
2968
|
/**
|
|
2847
2969
|
* Series convergence — detects whether a sequence of scalar measurements
|
|
2848
2970
|
* is stabilizing, drifting, or noisy.
|
|
@@ -4674,4 +4796,4 @@ interface UseCaseSignals {
|
|
|
4674
4796
|
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
4675
4797
|
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
4676
4798
|
|
|
4677
|
-
export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type 
DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, 
type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type 
SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, 
collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, 
statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
4799
|
+
export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type 
DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, 
MetricsCollector, type MuffledFinder, type MuffledFinding, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, 
checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, formatBenchmarkReport, formatDriverReport, formatFindings, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, 
runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
package/dist/index.js
CHANGED
|
@@ -4853,6 +4853,30 @@ var CostTracker = class {
|
|
|
4853
4853
|
if (!bucket) throw new Error(`CostTracker.markOutcome: unknown scenario "${scenarioId}"`);
|
|
4854
4854
|
bucket.completed = completed;
|
|
4855
4855
|
}
|
|
4856
|
+
/**
|
|
4857
|
+
* Convenience: record + markOutcome in one call from a
|
|
4858
|
+
* `{ usage, verdict }`-shaped response (starter-foundry's
|
|
4859
|
+
* `invokeMetaJudge` returns this shape; consumers that wrap any
|
|
4860
|
+
* judge/critic can follow the same convention).
|
|
4861
|
+
*
|
|
4862
|
+
* `usage.model` must be present in `MODEL_PRICING` for cost math to
|
|
4863
|
+
* populate; otherwise totalCostUsd stays at 0 for the entry but
|
|
4864
|
+
* tokens still aggregate.
|
|
4865
|
+
*/
|
|
4866
|
+
recordVerdict(verdict, scenarioId, tags) {
|
|
4867
|
+
if (!verdict.usage) return null;
|
|
4868
|
+
const entry = this.record({
|
|
4869
|
+
scenarioId,
|
|
4870
|
+
model: verdict.usage.model,
|
|
4871
|
+
inputTokens: verdict.usage.inputTokens,
|
|
4872
|
+
outputTokens: verdict.usage.outputTokens,
|
|
4873
|
+
cachedTokens: verdict.usage.cachedTokens,
|
|
4874
|
+
reasoningTokens: verdict.usage.reasoningTokens,
|
|
4875
|
+
tags
|
|
4876
|
+
});
|
|
4877
|
+
this.markOutcome(scenarioId, verdict.verdict === "pass");
|
|
4878
|
+
return entry;
|
|
4879
|
+
}
|
|
4856
4880
|
get(scenarioId) {
|
|
4857
4881
|
return this.byScenario.get(scenarioId);
|
|
4858
4882
|
}
|
|
@@ -4889,6 +4913,179 @@ function assertNonNegative(n, name) {
|
|
|
4889
4913
|
}
|
|
4890
4914
|
}
|
|
4891
4915
|
|
|
4916
|
+
// src/muffled-gate-scanner.ts
|
|
4917
|
+
import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
|
|
4918
|
+
import { join } from "path";
|
|
4919
|
+
function codeOf(line) {
|
|
4920
|
+
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
4921
|
+
}
|
|
4922
|
+
function isMuffleOk(line) {
|
|
4923
|
+
return line.includes("muffle-ok:");
|
|
4924
|
+
}
|
|
4925
|
+
var findFallbackToPass = (file, text) => {
|
|
4926
|
+
const out = [];
|
|
4927
|
+
const lines = text.split("\n");
|
|
4928
|
+
for (let i = 0; i < lines.length; i++) {
|
|
4929
|
+
const line = lines[i];
|
|
4930
|
+
if (isMuffleOk(line)) continue;
|
|
4931
|
+
const code = codeOf(line);
|
|
4932
|
+
if (!code.trim()) continue;
|
|
4933
|
+
if (/\|\| true/.test(code) && /(testCommand|setupCommand|cmd|command)/.test(code)) {
|
|
4934
|
+
out.push({ file, line: i + 1, lineText: line.trim(), pattern: "fallback-to-pass (|| true in command string)" });
|
|
4935
|
+
}
|
|
4936
|
+
}
|
|
4937
|
+
return out;
|
|
4938
|
+
};
|
|
4939
|
+
var findLiteralTruePass = (file, text) => {
|
|
4940
|
+
const out = [];
|
|
4941
|
+
const lines = text.split("\n");
|
|
4942
|
+
for (let i = 0; i < lines.length; i++) {
|
|
4943
|
+
const line = lines[i];
|
|
4944
|
+
if (isMuffleOk(line)) continue;
|
|
4945
|
+
const code = codeOf(line);
|
|
4946
|
+
if (!code.trim()) continue;
|
|
4947
|
+
if (/testCommand\s*:\s*['"]true['"]/.test(code)) {
|
|
4948
|
+
out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'literal-true-pass (testCommand: "true")' });
|
|
4949
|
+
}
|
|
4950
|
+
}
|
|
4951
|
+
return out;
|
|
4952
|
+
};
|
|
4953
|
+
/**
 * Finder: flag lines that construct `new SubprocessSandboxDriver({ ... cwd:`
 * with the `cwd` key inside the object literal on that same line.
 *
 * Matching is per line, so only single-line constructor calls are seen.
 * Lines marked via `isMuffleOk` and lines that are empty once `codeOf` has
 * stripped comments are ignored.
 *
 * @param file Path label copied into each finding.
 * @param text Full file contents; split on "\n".
 * @returns Findings `{ file, line, lineText, pattern }` with 1-based `line`.
 */
var findConstructorCwdDropped = (file, text) => {
  const isOffending = (raw) => {
    if (isMuffleOk(raw)) return false;
    const stripped = codeOf(raw);
    if (stripped.trim() === "") return false;
    return /new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/.test(stripped);
  };
  return text
    .split("\n")
    .map((raw, idx) => ({ raw, lineNo: idx + 1 }))
    .filter(({ raw }) => isOffending(raw))
    .map(({ raw, lineNo }) => ({
      file,
      line: lineNo,
      lineText: raw.trim(),
      pattern: "construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)"
    }));
};
|
|
4972
|
+
/**
 * Finder: flag lines whose code contains `if (!expected) return true`
 * (whitespace-tolerant).
 *
 * Lines marked via `isMuffleOk` and lines that are empty once `codeOf` has
 * stripped comments are ignored.
 *
 * @param file Path label copied into each finding.
 * @param text Full file contents; split on "\n".
 * @returns Findings `{ file, line, lineText, pattern }` with 1-based `line`.
 */
var findAutoMatchNoExpectation = (file, text) => {
  return text.split("\n").reduce((acc, raw, idx) => {
    if (isMuffleOk(raw)) return acc;
    const stripped = codeOf(raw);
    if (stripped.trim() === "") return acc;
    if (/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(stripped)) {
      acc.push({
        file,
        line: idx + 1,
        lineText: raw.trim(),
        pattern: "auto-match-no-expectation (if (!expected) return true)"
      });
    }
    return acc;
  }, []);
};
|
|
4991
|
+
/**
 * Finder: flag lines whose code contains `if (<identifier>.skipped) return true`
 * (whitespace-tolerant).
 *
 * Lines marked via `isMuffleOk` and lines that are empty once `codeOf` has
 * stripped comments are ignored.
 *
 * @param file Path label copied into each finding.
 * @param text Full file contents; split on "\n".
 * @returns Findings `{ file, line, lineText, pattern }` with 1-based `line`.
 */
var findSkipCountsAsPass = (file, text) => {
  const results = [];
  const rows = text.split("\n");
  let lineNo = 0;
  while (lineNo < rows.length) {
    const raw = rows[lineNo];
    lineNo += 1; // from here on lineNo is the 1-based number of `raw`
    if (isMuffleOk(raw)) continue;
    const stripped = codeOf(raw);
    if (stripped.trim() === "") continue;
    const matched = /if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(stripped);
    if (!matched) continue;
    results.push({
      file,
      line: lineNo,
      lineText: raw.trim(),
      pattern: "skip-counts-as-pass (if (.skipped) return true)"
    });
  }
  return results;
};
|
|
5010
|
+
/**
 * Finder set covering the fallback-to-pass, literal-true-pass,
 * auto-match-no-expectation and skip-counts-as-pass patterns.
 * `findConstructorCwdDropped` is not part of this list.
 */
var DEFAULT_FINDERS = [findFallbackToPass, findLiteralTruePass, findAutoMatchNoExpectation, findSkipCountsAsPass];
|
|
5016
|
+
/**
 * Finder set containing only `findConstructorCwdDropped`.
 */
var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
|
|
5019
|
+
/**
 * Walk `roots` (relative to `repoRoot`) and collect the files whose name
 * matches `extensions` and whose contents contain the substring
 * `importsContain`.
 *
 * Skipped while walking:
 * - directories named `node_modules`, `dist`, `dist-tests`, or starting with ".";
 * - files ending in `.test.ts`, `.test.mjs` or `.test.js`;
 * - any entry that cannot be listed, stat'ed or read (best-effort walk).
 *
 * @param repoRoot       Base directory that every relative path is joined to.
 * @param roots          Paths, relative to `repoRoot`, to start walking from.
 * @param extensions     RegExp tested against each file NAME (not the path).
 * @param importsContain Substring a file must contain to be reported.
 * @returns Matching paths relative to `repoRoot`, in traversal order.
 */
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
  const matches2 = [];
  const walk = (rel) => {
    const abs = join(repoRoot, rel);
    if (!existsSync2(abs)) return;
    let entries;
    try {
      // A root that is a plain file or an unreadable directory makes
      // readdirSync throw; skip it like the stat/read failures below.
      entries = readdirSync(abs);
    } catch {
      return;
    }
    for (const entry of entries) {
      const sub = join(rel, entry);
      const subAbs = join(repoRoot, sub);
      let st;
      try {
        st = statSync(subAbs);
      } catch {
        continue;
      }
      if (st.isDirectory()) {
        if (entry === "node_modules" || entry === "dist" || entry === "dist-tests" || entry.startsWith(".")) continue;
        walk(sub);
        continue;
      }
      if (!st.isFile()) continue;
      // RegExp#test is stateful for global/sticky regexes; reset lastIndex
      // so a caller-supplied /…/g does not skip files.
      extensions.lastIndex = 0;
      if (!extensions.test(entry)) continue;
      if (entry.endsWith(".test.ts") || entry.endsWith(".test.mjs") || entry.endsWith(".test.js")) continue;
      let text;
      try {
        text = readFileSync2(subAbs, "utf8");
      } catch {
        continue;
      }
      if (text.includes(importsContain)) matches2.push(sub);
    }
  };
  for (const r of roots) walk(r);
  return matches2;
}
|
|
5051
|
+
function scanForMuffledGates(opts) {
|
|
5052
|
+
const findings = [];
|
|
5053
|
+
const scanned = /* @__PURE__ */ new Set();
|
|
5054
|
+
for (const file of opts.scanFiles) {
|
|
5055
|
+
const abs = join(opts.repoRoot, file);
|
|
5056
|
+
if (!existsSync2(abs)) continue;
|
|
5057
|
+
const text = readFileSync2(abs, "utf8");
|
|
5058
|
+
for (const find of opts.finders) findings.push(...find(file, text));
|
|
5059
|
+
scanned.add(file);
|
|
5060
|
+
}
|
|
5061
|
+
if (opts.autoDerive) {
|
|
5062
|
+
const importers = autoDeriveImporters(
|
|
5063
|
+
opts.repoRoot,
|
|
5064
|
+
opts.autoDerive.roots,
|
|
5065
|
+
opts.autoDerive.extensions,
|
|
5066
|
+
opts.autoDerive.importsContain
|
|
5067
|
+
);
|
|
5068
|
+
for (const file of importers) {
|
|
5069
|
+
if (scanned.has(file)) continue;
|
|
5070
|
+
const abs = join(opts.repoRoot, file);
|
|
5071
|
+
if (!existsSync2(abs)) continue;
|
|
5072
|
+
const text = readFileSync2(abs, "utf8");
|
|
5073
|
+
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
5074
|
+
}
|
|
5075
|
+
}
|
|
5076
|
+
return findings;
|
|
5077
|
+
}
|
|
5078
|
+
/**
 * Render findings as a multi-line, human-readable report.
 *
 * @param findings Array of `{ file, line, pattern, lineText }` records.
 * @returns "" when there are no findings; otherwise a header with the count,
 *          a hint about the `// muffle-ok: <reason>` annotation, a blank
 *          line, and for each finding a `file:line — pattern` row followed
 *          by the offending line text, all joined with "\n".
 */
function formatFindings(findings) {
  const total = findings.length;
  if (total === 0) return "";
  const report = [
    `Found ${total} muffled-gate pattern(s).`,
    `Fix each or annotate the line with "// muffle-ok: <reason>".`,
    ""
  ];
  for (const { file, line, pattern, lineText } of findings) {
    report.push(` ${file}:${line} \u2014 ${pattern}\n${lineText}`);
  }
  return report.join("\n");
}
|
|
5088
|
+
|
|
4892
5089
|
// src/series-convergence.ts
|
|
4893
5090
|
function analyzeSeries(values, options = {}) {
|
|
4894
5091
|
const window = options.window ?? 5;
|
|
@@ -6858,7 +7055,7 @@ async function commitBisect(options) {
|
|
|
6858
7055
|
}
|
|
6859
7056
|
async function promptBisect(options) {
|
|
6860
7057
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
6861
|
-
const
|
|
7058
|
+
const join2 = (paragraphs) => paragraphs.join("\n\n");
|
|
6862
7059
|
const goodParas = split(options.good);
|
|
6863
7060
|
const badParas = split(options.bad);
|
|
6864
7061
|
if (goodParas.length !== badParas.length) {
|
|
@@ -6876,7 +7073,7 @@ async function promptBisect(options) {
|
|
|
6876
7073
|
const result = await bisect({
|
|
6877
7074
|
good: goodMask,
|
|
6878
7075
|
bad: badMask,
|
|
6879
|
-
runEval: (mask) => options.runEval(
|
|
7076
|
+
runEval: (mask) => options.runEval(join2(paragraphsFor(mask))),
|
|
6880
7077
|
maxIterations: options.maxIterations ?? n + 5,
|
|
6881
7078
|
halfway: (g, b) => {
|
|
6882
7079
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -6907,12 +7104,12 @@ async function promptBisect(options) {
|
|
|
6907
7104
|
}
|
|
6908
7105
|
}
|
|
6909
7106
|
const materializedPath = result.path.map((s) => ({
|
|
6910
|
-
state:
|
|
7107
|
+
state: join2(paragraphsFor(s.state)),
|
|
6911
7108
|
score: s.score,
|
|
6912
7109
|
pass: s.pass
|
|
6913
7110
|
}));
|
|
6914
7111
|
return {
|
|
6915
|
-
culprit:
|
|
7112
|
+
culprit: join2(paragraphsFor(culprit)),
|
|
6916
7113
|
path: materializedPath,
|
|
6917
7114
|
converged: result.converged,
|
|
6918
7115
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7821,6 +8018,7 @@ export {
|
|
|
7821
8018
|
CostTracker,
|
|
7822
8019
|
DEFAULT_AGENT_SLOS,
|
|
7823
8020
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
8021
|
+
DEFAULT_FINDERS,
|
|
7824
8022
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
7825
8023
|
DEFAULT_MUTATORS,
|
|
7826
8024
|
DEFAULT_REDACTION_RULES,
|
|
@@ -7858,6 +8056,7 @@ export {
|
|
|
7858
8056
|
TRACE_SCHEMA_VERSION,
|
|
7859
8057
|
TokenCounter,
|
|
7860
8058
|
TraceEmitter,
|
|
8059
|
+
UNIVERSAL_FINDERS,
|
|
7861
8060
|
adversarialJudge,
|
|
7862
8061
|
aggregateLlm,
|
|
7863
8062
|
aggregateRunScore,
|
|
@@ -7916,9 +8115,15 @@ export {
|
|
|
7916
8115
|
failureClusterView,
|
|
7917
8116
|
fileContains,
|
|
7918
8117
|
fileExists,
|
|
8118
|
+
findAutoMatchNoExpectation,
|
|
8119
|
+
findConstructorCwdDropped,
|
|
8120
|
+
findFallbackToPass,
|
|
8121
|
+
findLiteralTruePass,
|
|
8122
|
+
findSkipCountsAsPass,
|
|
7919
8123
|
firstDivergenceView,
|
|
7920
8124
|
formatBenchmarkReport,
|
|
7921
8125
|
formatDriverReport,
|
|
8126
|
+
formatFindings,
|
|
7922
8127
|
groupBy,
|
|
7923
8128
|
hashContent,
|
|
7924
8129
|
hashScenarios,
|
|
@@ -7991,6 +8196,7 @@ export {
|
|
|
7991
8196
|
runSelfPlay,
|
|
7992
8197
|
runTestGradedScenario,
|
|
7993
8198
|
runsForScenario,
|
|
8199
|
+
scanForMuffledGates,
|
|
7994
8200
|
scoreAllProjects,
|
|
7995
8201
|
scoreContinuity,
|
|
7996
8202
|
scoreProject,
|