@tangle-network/agent-eval 0.58.0 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-ZWEQJIM6.js +220 -0
- package/dist/chunk-ZWEQJIM6.js.map +1 -0
- package/dist/contract/index.js +18 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/index.d.ts +15 -5
- package/dist/index.js +48 -105
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +1 -1
- package/dist/chunk-SHTXZ4O2.js +0 -113
- package/dist/chunk-SHTXZ4O2.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -1384,14 +1384,24 @@ interface LiveProofResult extends TestResult {
|
|
|
1384
1384
|
}
|
|
1385
1385
|
declare function runLiveProof(config: LiveProofConfig): Promise<LiveProofResult>;
|
|
1386
1386
|
|
|
1387
|
-
|
|
1388
|
-
declare const MODEL_PRICING: Record<string, {
|
|
1387
|
+
interface TokenPrice {
|
|
1389
1388
|
input: number;
|
|
1390
1389
|
output: number;
|
|
1391
|
-
}>;
|
|
1390
|
+
}
|
|
1391
|
+
/** Per-1K token pricing for exact model ids. */
|
|
1392
|
+
declare const MODEL_PRICING: Record<string, TokenPrice>;
|
|
1393
|
+
/** Resolve pricing for a model id: exact table, then family fallback.
|
|
1394
|
+
* Returns null when the id matches nothing (caller decides — never a
|
|
1395
|
+
* silent-zero masquerading as a real $0 cost). */
|
|
1396
|
+
declare function resolveModelPricing(model: string): TokenPrice | null;
|
|
1397
|
+
/** True when `model` has known pricing (exact or family). Lets cost-aware
|
|
1398
|
+
* callers distinguish a real $0 from an unpriced model. */
|
|
1399
|
+
declare function isModelPriced(model: string): boolean;
|
|
1392
1400
|
/** Estimate token count from string length (chars / 4 approximation) */
|
|
1393
1401
|
declare function estimateTokens(text: string): number;
|
|
1394
|
-
/** Calculate cost in USD from token counts and model */
|
|
1402
|
+
/** Calculate cost in USD from token counts and model. Unknown models warn
|
|
1403
|
+
* once (not a silent zero) and return 0 so callers that ignore pricing keep
|
|
1404
|
+
* working; cost-sensitive callers should gate on {@link isModelPriced}. */
|
|
1395
1405
|
declare function estimateCost(inputTokens: number, outputTokens: number, model: string): number;
|
|
1396
1406
|
/**
|
|
1397
1407
|
* TokenCounter — accumulates token usage and cost across turns.
|
|
@@ -5591,4 +5601,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
|
|
|
5591
5601
|
*/
|
|
5592
5602
|
declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
|
|
5593
5603
|
|
|
5594
|
-
export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type 
HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, 
type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertRealBackend, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, 
coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, 
referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|
|
5604
|
+
export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type 
HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, 
type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertRealBackend, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, 
coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, 
referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|
package/dist/index.js
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import {
|
|
2
|
+
BackendIntegrityError,
|
|
2
3
|
HoldoutAuditor,
|
|
4
|
+
assertRealBackend,
|
|
3
5
|
canaryLeakView,
|
|
4
6
|
checkBehavioralCanary,
|
|
5
7
|
checkCanaries,
|
|
6
|
-
runBehavioralCanaries
|
|
7
|
-
|
|
8
|
+
runBehavioralCanaries,
|
|
9
|
+
summarizeBackendIntegrity
|
|
10
|
+
} from "./chunk-ZWEQJIM6.js";
|
|
8
11
|
import {
|
|
9
12
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
10
13
|
DEFAULT_RED_TEAM_CORPUS,
|
|
@@ -2803,12 +2806,51 @@ var MODEL_PRICING = {
|
|
|
2803
2806
|
"claude-opus-4-20250514": { input: 0.015, output: 0.075 },
|
|
2804
2807
|
"claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
|
|
2805
2808
|
};
|
|
2809
|
+
var FAMILY_PRICING = [
|
|
2810
|
+
[/claude.*opus/, { input: 0.015, output: 0.075 }],
|
|
2811
|
+
[/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
|
|
2812
|
+
[/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
|
|
2813
|
+
[/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
|
|
2814
|
+
[/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
|
|
2815
|
+
[/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
|
|
2816
|
+
[/deepseek/, { input: 3e-4, output: 11e-4 }],
|
|
2817
|
+
[/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
|
|
2818
|
+
[/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
|
|
2819
|
+
[/qwen/, { input: 4e-4, output: 12e-4 }],
|
|
2820
|
+
[/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
|
|
2821
|
+
[/gemini/, { input: 125e-5, output: 5e-3 }],
|
|
2822
|
+
[/llama/, { input: 2e-4, output: 6e-4 }]
|
|
2823
|
+
];
|
|
2824
|
+
function normalizeModelId(model) {
|
|
2825
|
+
return (model.split("@")[0] ?? model).trim().toLowerCase();
|
|
2826
|
+
}
|
|
2827
|
+
function resolveModelPricing(model) {
|
|
2828
|
+
if (MODEL_PRICING[model]) return MODEL_PRICING[model];
|
|
2829
|
+
const id = normalizeModelId(model);
|
|
2830
|
+
if (MODEL_PRICING[id]) return MODEL_PRICING[id];
|
|
2831
|
+
for (const [pattern, price] of FAMILY_PRICING) {
|
|
2832
|
+
if (pattern.test(id)) return price;
|
|
2833
|
+
}
|
|
2834
|
+
return null;
|
|
2835
|
+
}
|
|
2836
|
+
function isModelPriced(model) {
|
|
2837
|
+
return resolveModelPricing(model) !== null;
|
|
2838
|
+
}
|
|
2839
|
+
var warnedUnpricedModels = /* @__PURE__ */ new Set();
|
|
2806
2840
|
function estimateTokens(text) {
|
|
2807
2841
|
return Math.ceil(text.length / 4);
|
|
2808
2842
|
}
|
|
2809
2843
|
function estimateCost(inputTokens, outputTokens, model) {
|
|
2810
|
-
const pricing = MODEL_PRICING[model];
|
|
2811
|
-
if (!pricing) return 0;
|
|
2844
|
+
const pricing = resolveModelPricing(model);
|
|
2845
|
+
if (!pricing) {
|
|
2846
|
+
if (!warnedUnpricedModels.has(model)) {
|
|
2847
|
+
warnedUnpricedModels.add(model);
|
|
2848
|
+
console.warn(
|
|
2849
|
+
`estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
|
|
2850
|
+
);
|
|
2851
|
+
}
|
|
2852
|
+
return 0;
|
|
2853
|
+
}
|
|
2812
2854
|
return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
|
|
2813
2855
|
}
|
|
2814
2856
|
var TokenCounter = class {
|
|
@@ -3493,107 +3535,6 @@ function canonicalize2(value) {
|
|
|
3493
3535
|
return out;
|
|
3494
3536
|
}
|
|
3495
3537
|
|
|
3496
|
-
// src/integrity/backend-integrity.ts
|
|
3497
|
-
/**
 * Error raised for backend-integrity failures. It is constructed with the
 * "backend_integrity" code and carries the integrity report that triggered it.
 */
var BackendIntegrityError = class extends AgentEvalError {
  /** Integrity report passed to the constructor. */
  report;
  constructor(message, report) {
    super("backend_integrity", message);
    this.report = report;
  }
};
|
|
3504
|
-
/** A record counts as a stub when both its input and output token counts are exactly 0. */
function isStubRecord(rec) {
  const { input, output } = rec.tokenUsage;
  return input === 0 && output === 0;
}
|
|
3507
|
-
/** A record is uncosted when it reports output tokens but a cost of exactly 0. */
function isUncostedRecord(rec) {
  const producedOutput = rec.tokenUsage.output > 0;
  return producedOutput && rec.costUsd === 0;
}
|
|
3510
|
-
/**
 * Aggregate run records into a backend-integrity report.
 *
 * Sums token usage and cost, counts stub / real / uncosted records, derives a
 * verdict ("stub" when there are no records or every record is a stub, "real"
 * when none is, "mixed" otherwise), and attaches the text from buildDiagnosis.
 */
function summarizeBackendIntegrity(records) {
  const tally = {
    totalRecords: records.length,
    stubRecords: 0,
    realRecords: 0,
    uncostedRecords: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUsd: 0
  };
  for (const rec of records) {
    tally.totalInputTokens += rec.tokenUsage.input;
    tally.totalOutputTokens += rec.tokenUsage.output;
    tally.totalCostUsd += rec.costUsd;
    if (isStubRecord(rec)) {
      tally.stubRecords++;
    } else {
      tally.realRecords++;
    }
    if (isUncostedRecord(rec)) {
      tally.uncostedRecords++;
    }
  }
  let verdict;
  if (tally.stubRecords === tally.totalRecords) {
    // Also covers the empty case: zero stubs out of zero records.
    verdict = "stub";
  } else if (tally.stubRecords === 0) {
    verdict = "real";
  } else {
    verdict = "mixed";
  }
  const summary = { ...tally, verdict };
  return { ...summary, diagnosis: buildDiagnosis(summary) };
}
|
|
3549
|
-
/**
 * Turn integrity counters into a one-line human-readable diagnosis.
 *
 * Branches are checked in order: no records, "stub" verdict, "mixed" verdict,
 * any uncosted records, and finally the plain summary with total cost.
 */
function buildDiagnosis(r) {
  const total = r.totalRecords;
  // Share of `count` among all records, as a whole-number percentage string.
  const percentOf = (count) => (count / total * 100).toFixed(0);
  if (total === 0) {
    return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
  }
  switch (r.verdict) {
    case "stub": {
      const parts = [
        `all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
        "common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
        "auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
        "or boot the cli-bridge / sandbox before invoking the eval."
      ];
      return parts.join(" ");
    }
    case "mixed": {
      const pct = percentOf(r.stubRecords);
      const parts = [
        `${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
        "common causes: rate-limit cascade (429s after the first N personas);",
        "transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
      ];
      return parts.join(" ");
    }
    default:
      break;
  }
  if (r.uncostedRecords > 0) {
    const pct = percentOf(r.uncostedRecords);
    const parts = [
      `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
      `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
      "propagation from the runtime stream into RunRecord)."
    ];
    return parts.join(" ");
  }
  return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
}
|
|
3579
|
-
/**
 * Summarize `records` and throw BackendIntegrityError when the run did not hit
 * a real backend.
 *
 * A "stub" verdict always throws. A "mixed" verdict throws only when
 * `opts.allowMixed` is explicitly false (it defaults to true). Otherwise the
 * integrity report is returned.
 */
function assertRealBackend(records, opts = {}) {
  const report = summarizeBackendIntegrity(records);
  const rejectMixed = !(opts.allowMixed ?? true);
  let failure = null;
  if (report.verdict === "stub") {
    failure = `backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`;
  } else if (rejectMixed && report.verdict === "mixed") {
    failure = `backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`;
  }
  if (failure !== null) {
    throw new BackendIntegrityError(failure, report);
  }
  return report;
}
|
|
3596
|
-
|
|
3597
3538
|
// src/integrity/single-backend.ts
|
|
3598
3539
|
var SingleBackendError = class extends AgentEvalError {
|
|
3599
3540
|
constructor(message, report) {
|
|
@@ -10579,6 +10520,7 @@ export {
|
|
|
10579
10520
|
iqr,
|
|
10580
10521
|
isJudgeSpan,
|
|
10581
10522
|
isLlmSpan,
|
|
10523
|
+
isModelPriced,
|
|
10582
10524
|
isOtelConfigured,
|
|
10583
10525
|
isRetrievalSpan,
|
|
10584
10526
|
isRunRecord,
|
|
@@ -10669,6 +10611,7 @@ export {
|
|
|
10669
10611
|
requiredSampleSize,
|
|
10670
10612
|
researchReport,
|
|
10671
10613
|
resetLockedAppendersForTesting,
|
|
10614
|
+
resolveModelPricing,
|
|
10672
10615
|
roundTripRunRecord,
|
|
10673
10616
|
rowCount,
|
|
10674
10617
|
rowWhere,
|