@tangle-network/agent-eval 0.55.0 → 0.57.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/dist/campaign/index.js +3 -3
  2. package/dist/{chunk-MAOZCN36.js → chunk-5GLYP2IQ.js} +207 -1
  3. package/dist/chunk-5GLYP2IQ.js.map +1 -0
  4. package/dist/{chunk-J3EIOI3O.js → chunk-74Y2EMNH.js} +2 -2
  5. package/dist/{chunk-UBQGWD3O.js → chunk-AIXHUIHG.js} +2 -2
  6. package/dist/{chunk-LYL4SOKT.js → chunk-GM476SZU.js} +2 -2
  7. package/dist/{chunk-YXD7GWJI.js → chunk-JB4UWIM6.js} +3 -3
  8. package/dist/{chunk-EGIPWXHL.js → chunk-OLIBRKRD.js} +2 -2
  9. package/dist/{chunk-H4TOS272.js → chunk-QDOSODID.js} +2 -2
  10. package/dist/{chunk-WP7SY7AI.js → chunk-S3SDD56V.js} +48 -1
  11. package/dist/chunk-S3SDD56V.js.map +1 -0
  12. package/dist/contract/index.d.ts +98 -1
  13. package/dist/contract/index.js +78 -4
  14. package/dist/contract/index.js.map +1 -1
  15. package/dist/index.d.ts +109 -4
  16. package/dist/index.js +144 -6
  17. package/dist/index.js.map +1 -1
  18. package/dist/openapi.json +1 -1
  19. package/dist/pipelines/index.js +2 -2
  20. package/dist/{release-report-B6l5fi7T.d.ts → release-report-DmPjIce3.d.ts} +44 -1
  21. package/dist/reporting.d.ts +1 -1
  22. package/dist/reporting.js +3 -3
  23. package/dist/rl.js +3 -3
  24. package/dist/{run-campaign-6UEVBPP3.js → run-campaign-ZURVWMMI.js} +3 -3
  25. package/dist/traces.d.ts +86 -3
  26. package/dist/traces.js +5 -1
  27. package/package.json +1 -1
  28. package/dist/chunk-MAOZCN36.js.map +0 -1
  29. package/dist/chunk-WP7SY7AI.js.map +0 -1
  30. /package/dist/{chunk-J3EIOI3O.js.map → chunk-74Y2EMNH.js.map} +0 -0
  31. /package/dist/{chunk-UBQGWD3O.js.map → chunk-AIXHUIHG.js.map} +0 -0
  32. /package/dist/{chunk-LYL4SOKT.js.map → chunk-GM476SZU.js.map} +0 -0
  33. /package/dist/{chunk-YXD7GWJI.js.map → chunk-JB4UWIM6.js.map} +0 -0
  34. /package/dist/{chunk-EGIPWXHL.js.map → chunk-OLIBRKRD.js.map} +0 -0
  35. /package/dist/{chunk-H4TOS272.js.map → chunk-QDOSODID.js.map} +0 -0
  36. /package/dist/{run-campaign-6UEVBPP3.js.map → run-campaign-ZURVWMMI.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -11,7 +11,7 @@ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreO
11
11
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
12
12
  export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
13
13
  import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
14
- export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
14
+ export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
15
  import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
16
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
17
17
  import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
@@ -26,8 +26,8 @@ export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError,
26
26
  import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-DpUmE90J.js';
27
27
  export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DpUmE90J.js';
28
28
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
29
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-B6l5fi7T.js';
30
- export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
29
+ import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DmPjIce3.js';
30
+ export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CliffsMagnitude, t as CorpusAgreementOptions, u as CorpusAgreementPerDimension, v as CorpusAgreementReport, x as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, W as WeightedCompositeInput, y as WeightedCompositeResult, l as assertReleaseConfidence, m as benjaminiHochberg, z as bonferroni, n as bootstrapCi, D as cliffsDelta, E as cohensD, F as confidenceInterval, G as corpusInterRaterAgreement, H as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, I as interRaterReliability, K as interpretCliffs, p as judgeReplayGate, L as mannWhitneyU, M as normalizeScores, q as pairedBootstrap, N as pairedMde, O as pairedTTest, Q as partialCredit, r as renderReleaseReport, S as requiredSampleSize, T as weightedComposite, U as weightedMean, w as wilcoxonSignedRank } from './release-report-DmPjIce3.js';
31
31
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
32
32
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
33
33
  import { T as TraceEmitter } from './emitter-DEZwY14K.js';
@@ -1191,6 +1191,111 @@ declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
1191
1191
  allowMixed?: boolean;
1192
1192
  }): BackendIntegrityReport;
1193
1193
 
1194
+ /**
1195
+ * Single-backend guard: assert the agent and the rubric judge run through the
1196
+ * SAME backend config, so the judge can't silently re-route through a
1197
+ * different (often paid) backend than the agent.
1198
+ *
1199
+ * The bug class: `--backend cli-bridge` rewires the agent, but the judge still
1200
+ * reads `process.env.TANGLE_API_KEY` → router. Cost is billed against the
1201
+ * router, the eval reports the cli-bridge model, and the data is unusable.
1202
+ * Four consumers hand-roll this comparison (e.g. the legal consumer at
1203
+ * `canonical.ts:702-795`); this is the one shared substrate copy.
1204
+ *
1205
+ * Complements `assertRealBackend` (records → stub vs real) and
1206
+ * `assertCrossFamily` (judge-ensemble family diversity): this one compares two
1207
+ * backend *configs* before the run.
1208
+ */
1209
+
1210
+ /**
1211
+ * Minimal backend-config shape the assertion reads. Consumers may pass richer
1212
+ * types — only these five fields are inspected.
1213
+ */
1214
+ interface BackendDescriptor {
1215
+ /** Backend route — e.g. `'tcloud' | 'cli-bridge' | 'sandbox' | 'direct-provider'`;
1216
+ * free-form for consumer extensibility. */
1217
+ kind: string;
1218
+ /** Resolved base URL. Compared lexically (trailing slash stripped). */
1219
+ baseUrl: string;
1220
+ /** Model id (with snapshot suffix). Compared lexically. */
1221
+ model: string;
1222
+ /** Optional provider override. Compared when both set; flagged when only
1223
+ * one side sets it. */
1224
+ provider?: string;
1225
+ /** Bearer token. Values are NEVER compared (security) — only presence is:
1226
+ * both must be set or both empty. Mismatched presence is a divergence. */
1227
+ apiKey?: string;
1228
+ }
1229
+ interface AssertSingleBackendOptions {
1230
+ /** When true, ANY field divergence fails. When false (default), only
1231
+ * `kind` / `baseUrl` / `provider` / `apiKeyPresence` divergence throws —
1232
+ * a different judge `model` on the same route is allowed (the legal
1233
+ * pattern: a cheaper judge model). */
1234
+ strict?: boolean;
1235
+ agentLabel?: string;
1236
+ judgeLabel?: string;
1237
+ }
1238
+ type SingleBackendField = 'kind' | 'baseUrl' | 'model' | 'provider' | 'apiKeyPresence';
1239
+ interface SingleBackendDivergence {
1240
+ field: SingleBackendField;
1241
+ agent: string | undefined;
1242
+ judge: string | undefined;
1243
+ }
1244
+ interface SingleBackendReport {
1245
+ /** True when agent + judge agree per the configured strictness. */
1246
+ ok: boolean;
1247
+ /** Every divergence detected (includes `model` even when non-blocking). */
1248
+ divergences: ReadonlyArray<SingleBackendDivergence>;
1249
+ }
1250
+ declare class SingleBackendError extends AgentEvalError {
1251
+ readonly report: SingleBackendReport;
1252
+ constructor(message: string, report: SingleBackendReport);
1253
+ }
1254
+ /**
1255
+ * Throw `SingleBackendError` when the agent and judge backends diverge in a
1256
+ * way that would re-route the judge through a different backend than the
1257
+ * agent. Returns the report so callers can log it in either case.
1258
+ */
1259
+ declare function assertSingleBackend(agent: BackendDescriptor, judge: BackendDescriptor, opts?: AssertSingleBackendOptions): SingleBackendReport;
1260
+
1261
+ /**
1262
+ * Judge model-family classification + cross-family enforcement.
1263
+ *
1264
+ * A judge ensemble built entirely from one provider family shares that
1265
+ * family's blind spots and self-preference — its "agreement" is correlated
1266
+ * bias, not independent signal. `assertCrossFamily` makes the consumer prove
1267
+ * the ensemble spans ≥2 families; `judgeFamily` is the single regex map that
1268
+ * replaces the per-consumer copies (tax/legal/creative/gtm each ship one).
1269
+ */
1270
+ /** Provider family a model belongs to. `unknown` when no rule matches. */
1271
+ type JudgeFamily = 'anthropic' | 'openai' | 'google' | 'meta' | 'mistral' | 'deepseek' | 'xai' | 'qwen' | 'cohere' | 'amazon' | 'unknown';
1272
+ /**
1273
+ * Classify a model id into its provider family. Strips a `@snapshot` suffix
1274
+ * and prefers an explicit `provider/...` prefix; otherwise matches the model
1275
+ * name. Returns `unknown` when nothing matches (callers decide whether that's
1276
+ * acceptable — `assertCrossFamily` ignores it unless `allowUnknown` is set).
1277
+ */
1278
+ declare function judgeFamily(modelId: string): JudgeFamily;
1279
+ interface AssertCrossFamilyOptions {
1280
+ /** Minimum number of distinct families the ensemble must span. Default 2. */
1281
+ minFamilies?: number;
1282
+ /** When false (default), `unknown`-family models do NOT count toward the
1283
+ * family total — an ensemble of all-unclassifiable models is not provably
1284
+ * cross-family. Set true to count `unknown` as one shared family. */
1285
+ allowUnknown?: boolean;
1286
+ }
1287
+ declare class CrossFamilyError extends Error {
1288
+ readonly families: JudgeFamily[];
1289
+ readonly models: string[];
1290
+ constructor(message: string, families: JudgeFamily[], models: string[]);
1291
+ }
1292
+ /**
1293
+ * Throw unless the judge models span at least `minFamilies` distinct provider
1294
+ * families. Pass the model ids backing your judge ensemble. Fail-loud by
1295
+ * design — a correlated single-family ensemble silently inflates agreement.
1296
+ */
1297
+ declare function assertCrossFamily(models: string[], opts?: AssertCrossFamilyOptions): JudgeFamily[];
1298
+
1194
1299
  /**
1195
1300
  * Create a domain expert judge with a configurable domain.
1196
1301
  *
@@ -5486,4 +5591,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
5486
5591
  */
5487
5592
  declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
5488
5593
 
5489
- export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, 
DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, 
InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type 
RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type 
SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, 
createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, 
rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
5594
+ export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type 
HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, 
type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertRealBackend, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, 
coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, 
referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -31,7 +31,7 @@ import {
31
31
  computeToolUseMetrics,
32
32
  iqr,
33
33
  welchsTTest
34
- } from "./chunk-H4TOS272.js";
34
+ } from "./chunk-QDOSODID.js";
35
35
  import {
36
36
  exportTrainingData,
37
37
  toNdjson
@@ -89,10 +89,10 @@ import {
89
89
  evaluateReleaseConfidence,
90
90
  judgeReplayGate,
91
91
  renderReleaseReport
92
- } from "./chunk-UBQGWD3O.js";
92
+ } from "./chunk-AIXHUIHG.js";
93
93
  import {
94
94
  runEvalCampaign
95
- } from "./chunk-LYL4SOKT.js";
95
+ } from "./chunk-GM476SZU.js";
96
96
  import {
97
97
  AGENT_PROFILE_KINDS,
98
98
  AgentProfileCellValidationError,
@@ -122,18 +122,20 @@ import {
122
122
  paretoChart,
123
123
  researchReport,
124
124
  summaryTable
125
- } from "./chunk-EGIPWXHL.js";
125
+ } from "./chunk-OLIBRKRD.js";
126
126
  import {
127
127
  benjaminiHochberg,
128
128
  bonferroni,
129
129
  calibrateJudge,
130
130
  calibrateJudgeContinuous,
131
+ cliffsDelta,
131
132
  cohensD,
132
133
  confidenceInterval,
133
134
  continuousAgreement,
134
135
  corpusInterRaterAgreement,
135
136
  corpusInterRaterAgreementFromJudgeScores,
136
137
  interRaterReliability,
138
+ interpretCliffs,
137
139
  mannWhitneyU,
138
140
  normalizeScores,
139
141
  pairedBootstrap,
@@ -144,9 +146,10 @@ import {
144
146
  requiredSampleSize,
145
147
  selfPreference,
146
148
  verbosityBias,
149
+ weightedComposite,
147
150
  weightedMean,
148
151
  wilcoxonSignedRank
149
- } from "./chunk-WP7SY7AI.js";
152
+ } from "./chunk-S3SDD56V.js";
150
153
  import {
151
154
  DEFAULT_TRACE_ANALYST_BUDGETS,
152
155
  FileSystemTraceStore,
@@ -166,6 +169,7 @@ import {
166
169
  buildTraceAnalystTools,
167
170
  buildTraceInsightContext,
168
171
  buildTraceInsightPrompt,
172
+ captureFetchToRawSink,
169
173
  createOtelExporter,
170
174
  createOtelTracingStore,
171
175
  createReplayFetch,
@@ -173,6 +177,7 @@ import {
173
177
  describeTraceInsightScope,
174
178
  domainEvidencePattern,
175
179
  exportRunAsOtlp,
180
+ flattenOtlpExportToNdjson,
176
181
  inferDomainKeywords,
177
182
  iterateRawCalls,
178
183
  otelRunCompleteHook,
@@ -181,7 +186,7 @@ import {
181
186
  tokenizeDomainWords,
182
187
  traceAnalystFunctionGroup,
183
188
  traceAnalystOnRunComplete
184
- } from "./chunk-MAOZCN36.js";
189
+ } from "./chunk-5GLYP2IQ.js";
185
190
  import {
186
191
  DEFAULT_REDACTION_RULES,
187
192
  REDACTION_VERSION,
@@ -3589,6 +3594,129 @@ function assertRealBackend(records, opts = {}) {
3589
3594
  return report;
3590
3595
  }
3591
3596
 
3597
+ // src/integrity/single-backend.ts
3598
/**
 * Error thrown by `assertSingleBackend` when the agent and judge backend
 * descriptors diverge on a blocking field.
 *
 * The base `AgentEvalError` is constructed with the literal
 * "backend_integrity" as its first argument (presumably an error code;
 * confirm against the AgentEvalError definition) followed by the message.
 */
var SingleBackendError = class extends AgentEvalError {
  /** The `{ ok, divergences }` report that triggered the failure. */
  report;

  /**
   * @param {string} message Human-readable description of the divergence.
   * @param {object} report  Report object attached to the error for callers to inspect.
   */
  constructor(message, report) {
    super("backend_integrity", message);
    this.name = "SingleBackendError";
    this.report = report;
  }
};
3606
/**
 * Remove every trailing "/" from a URL so that "https://host/v1" and
 * "https://host/v1/" compare equal.
 *
 * Implemented as a linear scan from the end instead of the regex `/\/+$/`,
 * which backtracks quadratically on long runs of slashes that are not at the
 * end of the string.
 *
 * @param {string} url URL (or any string) to normalise.
 * @returns {string} `url` without trailing slashes. Non-string input (for
 *   example a descriptor with no `baseUrl`) is returned unchanged so callers
 *   can still compare it and report it, rather than crashing with a TypeError.
 */
function stripSlash(url) {
  if (typeof url !== "string") {
    return url;
  }
  const SLASH = 47; // "/".charCodeAt(0)
  let end = url.length;
  while (end > 0 && url.charCodeAt(end - 1) === SLASH) {
    end -= 1;
  }
  return end === url.length ? url : url.slice(0, end);
}
3609
/**
 * Verify that the agent and the judge are configured against the same backend.
 *
 * Compares `kind`, `baseUrl` (ignoring trailing slashes), `model`, `provider`
 * and whether an `apiKey` is present on each side. Every mismatch is recorded
 * in `divergences`; a `model` mismatch only blocks when `opts.strict` is truthy.
 *
 * @param {object} agent2 Agent backend descriptor.
 * @param {object} judge  Judge backend descriptor.
 * @param {object} [opts]
 * @param {boolean} [opts.strict]     Treat a `model` mismatch as blocking too.
 * @param {string}  [opts.agentLabel] Label for the agent in the error message (default "agent").
 * @param {string}  [opts.judgeLabel] Label for the judge in the error message (default "judge").
 * @returns {{ ok: boolean, divergences: object[] }} Report; `divergences` may be
 *   non-empty while `ok` is true when only non-blocking fields differ.
 * @throws {SingleBackendError} When at least one blocking divergence exists.
 */
function assertSingleBackend(agent2, judge, opts = {}) {
  const strictlyEqual = (left, right) => left === right;
  const sameEndpoint = (left, right) => stripSlash(left) === stripSlash(right);

  // Checked in this order; the order is reflected in the report and the message.
  const fieldChecks = [
    ["kind", strictlyEqual],
    ["baseUrl", sameEndpoint],
    ["model", strictlyEqual],
    ["provider", strictlyEqual]
  ];
  const divergences = fieldChecks
    .filter(([field, matches]) => !matches(agent2[field], judge[field]))
    .map(([field]) => ({ field, agent: agent2[field], judge: judge[field] }));

  // Only the presence of a key is compared and reported, never its value.
  const keyState = (backend) => (backend.apiKey ? "set" : "empty");
  const agentKeyState = keyState(agent2);
  const judgeKeyState = keyState(judge);
  if (agentKeyState !== judgeKeyState) {
    divergences.push({ field: "apiKeyPresence", agent: agentKeyState, judge: judgeKeyState });
  }

  const blocking = opts.strict ? divergences : divergences.filter(({ field }) => field !== "model");
  const report = { ok: blocking.length === 0, divergences };
  if (report.ok) {
    return report;
  }

  const agentLabel = opts.agentLabel ?? "agent";
  const judgeLabel = opts.judgeLabel ?? "judge";
  const detail = blocking.map((d) => `${d.field}: ${agentLabel}=${d.agent ?? "\u2205"} vs ${judgeLabel}=${d.judge ?? "\u2205"}`).join("; ");
  throw new SingleBackendError(
    `single-backend: ${agentLabel} and ${judgeLabel} backends diverge \u2014 the judge would re-route through a different backend than the agent (${detail})`,
    report
  );
}
3646
+
3647
+ // src/judge-families.ts
3648
/**
 * Maps the lower-cased segment before the first "/" of a model id to a
 * canonical provider family. Several prefixes may share one family.
 */
var PROVIDER_PREFIX = {
  anthropic: "anthropic",
  openai: "openai",
  "azure-openai": "openai",
  google: "google",
  "google-vertex": "google",
  meta: "meta",
  "meta-llama": "meta",
  mistral: "mistral",
  mistralai: "mistral",
  deepseek: "deepseek",
  xai: "xai",
  qwen: "qwen",
  alibaba: "qwen",
  cohere: "cohere",
  amazon: "amazon",
  bedrock: "amazon"
};

/**
 * Ordered [pattern, family] pairs tried against the whole lower-cased model
 * id when the prefix lookup does not resolve. The first matching pattern wins.
 */
var NAME_PATTERNS = [
  [/claude/i, "anthropic"],
  [/\b(gpt|davinci|babbage)\b|^o[134]\b|[-/]o[134]\b|gpt-/i, "openai"],
  [/gemini|palm|gemma|bison/i, "google"],
  // "llama" must not be preceded by "o": otherwise the "ollama/" host prefix
  // would classify every Ollama-served model (mistral, deepseek, ...) as meta.
  [/(^|[^o])llama/i, "meta"],
  [/mi(s|x)tral|codestral|magistral/i, "mistral"],
  [/deepseek/i, "deepseek"],
  [/grok/i, "xai"],
  [/qwen/i, "qwen"],
  [/command-?(r|a)?/i, "cohere"],
  [/\b(nova|titan)\b/i, "amazon"]
];

/**
 * Resolve the provider family of a model id.
 *
 * The id is trimmed, cut at the first "@" and lower-cased. If it has a
 * "provider/" prefix listed in PROVIDER_PREFIX, that mapping is returned;
 * otherwise the NAME_PATTERNS are tried in order.
 *
 * @param {string} modelId Model identifier, e.g. "anthropic/claude-3" or "gpt-4o".
 * @returns {string} The family name, or "unknown" when nothing matches.
 */
function judgeFamily(modelId) {
  const id = modelId.trim().split("@")[0].toLowerCase();
  const slash = id.indexOf("/");
  if (slash > 0) {
    const prefix = id.slice(0, slash);
    // Own-property check: a bare PROVIDER_PREFIX[prefix] would also hit
    // inherited keys such as "constructor" or "__proto__" and return a
    // function/object instead of a family string.
    if (Object.hasOwn(PROVIDER_PREFIX, prefix)) {
      return PROVIDER_PREFIX[prefix];
    }
  }
  for (const [pattern, family] of NAME_PATTERNS) {
    if (pattern.test(id)) return family;
  }
  return "unknown";
}
3691
/**
 * Error thrown by `assertCrossFamily` when a judge ensemble does not span
 * enough distinct provider families.
 */
var CrossFamilyError = class extends Error {
  /** Sorted list of families that were detected. */
  families;
  /** The model ids that were passed to the check, as given by the caller. */
  models;
  name = "CrossFamilyError";

  /**
   * @param {string} message    Human-readable explanation.
   * @param {string[]} families Families detected in the ensemble.
   * @param {Iterable<string>} models Model ids that were inspected.
   */
  constructor(message, families, models) {
    super(message);
    this.families = families;
    this.models = models;
  }
};
3701
/**
 * Require that a set of judge models spans enough distinct provider families.
 *
 * Each model id is classified with `judgeFamily`. Models classified as
 * "unknown" are ignored unless `opts.allowUnknown` is truthy, in which case
 * they all count as the single family "unknown".
 *
 * @param {Iterable<string>} models Model ids in the ensemble.
 * @param {object} [opts]
 * @param {number}  [opts.minFamilies=2] Minimum number of distinct families.
 * @param {boolean} [opts.allowUnknown]  Count "unknown" as a family.
 * @returns {string[]} Sorted list of distinct families found.
 * @throws {CrossFamilyError} When fewer than `minFamilies` families are present.
 */
function assertCrossFamily(models, opts = {}) {
  const minFamilies = opts.minFamilies ?? 2;
  const counted = [...models]
    .map((model) => judgeFamily(model))
    .filter((family) => family !== "unknown" || opts.allowUnknown);
  const distinct = Array.from(new Set(counted)).sort();

  if (distinct.length < minFamilies) {
    throw new CrossFamilyError(
      `judge ensemble spans ${distinct.length} provider famil${distinct.length === 1 ? "y" : "ies"} (${distinct.join(", ") || "none"}) but ${minFamilies} required \u2014 a single-family ensemble is correlated bias, not independent signal`,
      distinct,
      models
    );
  }
  return distinct;
}
3719
+
3592
3720
  // src/judges.ts
3593
3721
  function createDomainExpertJudge(domain) {
3594
3722
  return async (tc, { scenario, turns }) => {
@@ -10182,6 +10310,7 @@ export {
10182
10310
  ConfigError,
10183
10311
  ConvergenceTracker,
10184
10312
  CostTracker,
10313
+ CrossFamilyError,
10185
10314
  D1ExperimentStore,
10186
10315
  DEFAULT_AGENT_SLOS,
10187
10316
  DEFAULT_COMPLEXITY_WEIGHTS,
@@ -10256,6 +10385,7 @@ export {
10256
10385
  SEMANTIC_CONCEPT_JUDGE_VERSION,
10257
10386
  SandboxHarness,
10258
10387
  ScenarioRegistry,
10388
+ SingleBackendError,
10259
10389
  SpanNotFoundError,
10260
10390
  SubprocessSandboxDriver,
10261
10391
  TRACE_ANALYST_ACTOR_DESCRIPTION,
@@ -10284,11 +10414,13 @@ export {
10284
10414
  analyzeTraces,
10285
10415
  appendScorecard,
10286
10416
  argHash,
10417
+ assertCrossFamily,
10287
10418
  assertLlmRoute,
10288
10419
  assertRealBackend,
10289
10420
  assertReleaseConfidence,
10290
10421
  assertRunAgentProfileCell,
10291
10422
  assertRunCaptured,
10423
+ assertSingleBackend,
10292
10424
  assignFeedbackSplit,
10293
10425
  attributeCounterfactuals,
10294
10426
  backoffMs,
@@ -10316,6 +10448,7 @@ export {
10316
10448
  callLlmJson,
10317
10449
  canaryLeakView,
10318
10450
  canonicalize,
10451
+ captureFetchToRawSink,
10319
10452
  causalAttribution,
10320
10453
  checkBehavioralCanary,
10321
10454
  checkCanaries,
@@ -10323,6 +10456,7 @@ export {
10323
10456
  clamp01,
10324
10457
  classifyEuAiRisk,
10325
10458
  classifyFailure,
10459
+ cliffsDelta,
10326
10460
  codeExecutionJudge,
10327
10461
  cohensD,
10328
10462
  coherenceJudge,
@@ -10409,6 +10543,7 @@ export {
10409
10543
  findFallbackToPass,
10410
10544
  findLiteralTruePass,
10411
10545
  findSkipCountsAsPass,
10546
+ flattenOtlpExportToNdjson,
10412
10547
  flowLayer,
10413
10548
  formatBenchmarkReport,
10414
10549
  formatDriverReport,
@@ -10429,6 +10564,7 @@ export {
10429
10564
  inMemoryReviewStore,
10430
10565
  inferDomainKeywords,
10431
10566
  interRaterReliability,
10567
+ interpretCliffs,
10432
10568
  iqr,
10433
10569
  isJudgeSpan,
10434
10570
  isLlmSpan,
@@ -10444,6 +10580,7 @@ export {
10444
10580
  jsonShape,
10445
10581
  jsonlReferenceReplayStore,
10446
10582
  jsonlReviewStore,
10583
+ judgeFamily,
10447
10584
  judgeReplayGate,
10448
10585
  judgeSpans,
10449
10586
  keyPreserved,
@@ -10600,6 +10737,7 @@ export {
10600
10737
  visualDiff,
10601
10738
  viteDeployRunner,
10602
10739
  vitestTestParser,
10740
+ weightedComposite,
10603
10741
  weightedMean,
10604
10742
  weightedRecall,
10605
10743
  welchsTTest,