@tangle-network/agent-eval 0.58.0 → 0.58.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1384,14 +1384,24 @@ interface LiveProofResult extends TestResult {
1384
1384
  }
1385
1385
  declare function runLiveProof(config: LiveProofConfig): Promise<LiveProofResult>;
1386
1386
 
1387
- /** Per-1K token pricing for common models */
1388
- declare const MODEL_PRICING: Record<string, {
1387
+ interface TokenPrice {
1389
1388
  input: number;
1390
1389
  output: number;
1391
- }>;
1390
+ }
1391
+ /** Per-1K token pricing for exact model ids. */
1392
+ declare const MODEL_PRICING: Record<string, TokenPrice>;
1393
+ /** Resolve pricing for a model id: exact table, then family fallback.
1394
+ * Returns null when the id matches nothing (caller decides — never a
1395
+ * silent-zero masquerading as a real $0 cost). */
1396
+ declare function resolveModelPricing(model: string): TokenPrice | null;
1397
+ /** True when `model` has known pricing (exact or family). Lets cost-aware
1398
+ * callers distinguish a real $0 from an unpriced model. */
1399
+ declare function isModelPriced(model: string): boolean;
1392
1400
  /** Estimate token count from string length (chars / 4 approximation) */
1393
1401
  declare function estimateTokens(text: string): number;
1394
- /** Calculate cost in USD from token counts and model */
1402
+ /** Calculate cost in USD from token counts and model. Unknown models warn
1403
+ * once (not a silent zero) and return 0 so callers that ignore pricing keep
1404
+ * working; cost-sensitive callers should gate on {@link isModelPriced}. */
1395
1405
  declare function estimateCost(inputTokens: number, outputTokens: number, model: string): number;
1396
1406
  /**
1397
1407
  * TokenCounter — accumulates token usage and cost across turns.
@@ -5591,4 +5601,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
5591
5601
  */
5592
5602
  declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
5593
5603
 
5594
- export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type 
HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, 
type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertRealBackend, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, 
coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, 
referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
5604
+ export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type 
HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, 
type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertRealBackend, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, 
coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, 
referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -1,10 +1,13 @@
1
1
  import {
2
+ BackendIntegrityError,
2
3
  HoldoutAuditor,
4
+ assertRealBackend,
3
5
  canaryLeakView,
4
6
  checkBehavioralCanary,
5
7
  checkCanaries,
6
- runBehavioralCanaries
7
- } from "./chunk-SHTXZ4O2.js";
8
+ runBehavioralCanaries,
9
+ summarizeBackendIntegrity
10
+ } from "./chunk-ZWEQJIM6.js";
8
11
  import {
9
12
  DEFAULT_MUTATION_PRIMITIVES,
10
13
  DEFAULT_RED_TEAM_CORPUS,
@@ -2803,12 +2806,51 @@ var MODEL_PRICING = {
2803
2806
  "claude-opus-4-20250514": { input: 0.015, output: 0.075 },
2804
2807
  "claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
2805
2808
  };
2809
+ var FAMILY_PRICING = [
2810
+ [/claude.*opus/, { input: 0.015, output: 0.075 }],
2811
+ [/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
2812
+ [/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
2813
+ [/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
2814
+ [/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
2815
+ [/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
2816
+ [/deepseek/, { input: 3e-4, output: 11e-4 }],
2817
+ [/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
2818
+ [/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
2819
+ [/qwen/, { input: 4e-4, output: 12e-4 }],
2820
+ [/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
2821
+ [/gemini/, { input: 125e-5, output: 5e-3 }],
2822
+ [/llama/, { input: 2e-4, output: 6e-4 }]
2823
+ ];
2824
+ function normalizeModelId(model) {
2825
+ return (model.split("@")[0] ?? model).trim().toLowerCase();
2826
+ }
2827
+ function resolveModelPricing(model) {
2828
+ if (MODEL_PRICING[model]) return MODEL_PRICING[model];
2829
+ const id = normalizeModelId(model);
2830
+ if (MODEL_PRICING[id]) return MODEL_PRICING[id];
2831
+ for (const [pattern, price] of FAMILY_PRICING) {
2832
+ if (pattern.test(id)) return price;
2833
+ }
2834
+ return null;
2835
+ }
2836
+ function isModelPriced(model) {
2837
+ return resolveModelPricing(model) !== null;
2838
+ }
2839
+ var warnedUnpricedModels = /* @__PURE__ */ new Set();
2806
2840
  function estimateTokens(text) {
2807
2841
  return Math.ceil(text.length / 4);
2808
2842
  }
2809
2843
  function estimateCost(inputTokens, outputTokens, model) {
2810
- const pricing = MODEL_PRICING[model];
2811
- if (!pricing) return 0;
2844
+ const pricing = resolveModelPricing(model);
2845
+ if (!pricing) {
2846
+ if (!warnedUnpricedModels.has(model)) {
2847
+ warnedUnpricedModels.add(model);
2848
+ console.warn(
2849
+ `estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
2850
+ );
2851
+ }
2852
+ return 0;
2853
+ }
2812
2854
  return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
2813
2855
  }
2814
2856
  var TokenCounter = class {
@@ -3493,107 +3535,6 @@ function canonicalize2(value) {
3493
3535
  return out;
3494
3536
  }
3495
3537
 
3496
- // src/integrity/backend-integrity.ts
3497
/**
 * Error raised by the backend-integrity checks.
 *
 * Forwards "backend_integrity" and the message to the AgentEvalError
 * constructor (presumably as an error code; confirm against AgentEvalError)
 * and keeps the integrity report on the instance.
 */
var BackendIntegrityError = class extends AgentEvalError {
  /** Integrity report supplied by the thrower. */
  report;
  /**
   * @param message Human-readable failure description.
   * @param report Integrity report to attach to the error.
   */
  constructor(message, report) {
    super("backend_integrity", message);
    this.report = report;
  }
};
3504
/**
 * Tell whether a record shows no token usage at all.
 *
 * @param rec Record exposing `tokenUsage.input` and `tokenUsage.output`.
 * @returns true when both counts are exactly 0.
 */
function isStubRecord(rec) {
  const anyTokens = rec.tokenUsage.input !== 0 || rec.tokenUsage.output !== 0;
  return !anyTokens;
}
3507
/**
 * Tell whether a record produced output tokens yet carries a zero cost.
 *
 * @param rec Record exposing `tokenUsage.output` and `costUsd`.
 * @returns true when `tokenUsage.output > 0` and `costUsd` is exactly 0.
 */
function isUncostedRecord(rec) {
  if (!(rec.tokenUsage.output > 0)) return false;
  return rec.costUsd === 0;
}
3510
/**
 * Aggregate a list of run records into a backend-integrity report.
 *
 * Sums token usage and cost, counts stub / real / uncosted records via
 * isStubRecord and isUncostedRecord, and derives a verdict:
 *   - "stub"  when there are no records or every record is a stub,
 *   - "real"  when no record is a stub,
 *   - "mixed" otherwise.
 * The diagnosis text comes from buildDiagnosis.
 *
 * @param records Records exposing `tokenUsage.input`, `tokenUsage.output` and `costUsd`.
 * @returns The counters and totals plus `verdict` and `diagnosis`.
 */
function summarizeBackendIntegrity(records) {
  const tally = {
    totalRecords: records.length,
    stubRecords: 0,
    realRecords: 0,
    uncostedRecords: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUsd: 0
  };
  for (const entry of records) {
    tally.totalInputTokens += entry.tokenUsage.input;
    tally.totalOutputTokens += entry.tokenUsage.output;
    tally.totalCostUsd += entry.costUsd;
    if (isStubRecord(entry)) {
      tally.stubRecords += 1;
    } else {
      tally.realRecords += 1;
    }
    if (isUncostedRecord(entry)) {
      tally.uncostedRecords += 1;
    }
  }
  let verdict;
  if (tally.totalRecords === 0 || tally.stubRecords === tally.totalRecords) {
    verdict = "stub";
  } else if (tally.stubRecords === 0) {
    verdict = "real";
  } else {
    verdict = "mixed";
  }
  const diagnosis = buildDiagnosis({ ...tally, verdict });
  return { ...tally, verdict, diagnosis };
}
3549
/**
 * Turn integrity counters into a one-line human-readable diagnosis.
 *
 * Checked in order: no records, verdict "stub", verdict "mixed", any
 * uncosted records, and finally the plain "real activity" summary.
 *
 * @param r Counters and verdict: `totalRecords`, `stubRecords`,
 *   `uncostedRecords`, `totalInputTokens`, `totalOutputTokens`,
 *   `totalCostUsd`, `verdict`.
 * @returns The diagnosis sentence(s), joined with single spaces.
 */
function buildDiagnosis(r) {
  // Share of all records, as a whole-number percentage string.
  const shareOfTotal = (count) => (count / r.totalRecords * 100).toFixed(0);
  if (r.totalRecords === 0) {
    return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
  }
  switch (r.verdict) {
    case "stub":
      return [
        `all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
        "common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
        "auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
        "or boot the cli-bridge / sandbox before invoking the eval."
      ].join(" ");
    case "mixed": {
      const pct = shareOfTotal(r.stubRecords);
      return [
        `${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
        "common causes: rate-limit cascade (429s after the first N personas);",
        "transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
      ].join(" ");
    }
    default:
      break;
  }
  if (r.uncostedRecords > 0) {
    const pct = shareOfTotal(r.uncostedRecords);
    return [
      `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
      `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
      "propagation from the runtime stream into RunRecord)."
    ].join(" ");
  }
  return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
}
3579
/**
 * Summarize the records and throw when the backend looks fake.
 *
 * Throws BackendIntegrityError when the verdict is "stub", and also when it is
 * "mixed" while `opts.allowMixed` is explicitly false (a missing or nullish
 * `allowMixed` counts as true).
 *
 * @param records Records passed through to summarizeBackendIntegrity.
 * @param opts Optional settings; only `allowMixed` is read.
 * @returns The integrity report when no error is thrown.
 */
function assertRealBackend(records, opts = {}) {
  const report = summarizeBackendIntegrity(records);
  const mixedIsFatal = !(opts.allowMixed ?? true);
  let failure = null;
  if (report.verdict === "stub") {
    failure = `backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`;
  } else if (mixedIsFatal && report.verdict === "mixed") {
    failure = `backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`;
  }
  if (failure !== null) {
    throw new BackendIntegrityError(failure, report);
  }
  return report;
}
3596
-
3597
3538
  // src/integrity/single-backend.ts
3598
3539
  var SingleBackendError = class extends AgentEvalError {
3599
3540
  constructor(message, report) {
@@ -10579,6 +10520,7 @@ export {
10579
10520
  iqr,
10580
10521
  isJudgeSpan,
10581
10522
  isLlmSpan,
10523
+ isModelPriced,
10582
10524
  isOtelConfigured,
10583
10525
  isRetrievalSpan,
10584
10526
  isRunRecord,
@@ -10669,6 +10611,7 @@ export {
10669
10611
  requiredSampleSize,
10670
10612
  researchReport,
10671
10613
  resetLockedAppendersForTesting,
10614
+ resolveModelPricing,
10672
10615
  roundTripRunRecord,
10673
10616
  rowCount,
10674
10617
  rowWhere,