@tangle-network/agent-eval 0.66.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,14 @@
1
1
  import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-c2R2kfmv.js';
2
2
  export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, f as CampaignResult, g as CampaignTraceWriter, h as CodeSurface, D as Dispatch, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-c2R2kfmv.js';
3
- import { L as LoopProvenanceRecord } from '../provenance-BZUFC1_D.js';
4
- export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, R as RunEvalOptions, c as composeGate, d as defaultProductionGate, e as evolutionaryDriver, h as heldOutGate, r as runEval } from '../provenance-BZUFC1_D.js';
3
+ import { L as LoopProvenanceRecord } from '../provenance-CChUqexv.js';
4
+ export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, R as RunEvalOptions, c as composeGate, d as defaultProductionGate, e as evolutionaryDriver, h as heldOutGate, r as runEval } from '../provenance-CChUqexv.js';
5
5
  import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-BKpM5T4t.js';
6
6
  export { G as GepaDriverOptions, a as RunCampaignOptions, b as RunImprovementLoopOptions, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, r as runCampaign, c as runImprovementLoop } from '../run-improvement-loop-BKpM5T4t.js';
7
7
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
8
8
  import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-DSEHMwvS.js';
9
9
  export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-DSEHMwvS.js';
10
10
  import { R as RunRecord, b as RunSplitTag } from '../run-record-BgTFzO2r.js';
11
- import { A as AnalystRegistry } from '../registry-BzAEvqAt.js';
11
+ import { A as AnalystRegistry } from '../registry-BGKyX6bw.js';
12
12
  import { a as DatasetScenario } from '../dataset-B2kL-fSM.js';
13
13
  import '../red-team-DW9Ca_tj.js';
14
14
  import '../store-CKUAgsJz.js';
@@ -20,7 +20,7 @@ import '../summary-report-ByiOUrHj.js';
20
20
  import '../failure-cluster-CL7IVgkJ.js';
21
21
  import '../judge-calibration-DilmB3Ml.js';
22
22
  import '../store-jzKpMl16.js';
23
- import '../types-DhqpAi_z.js';
23
+ import '../types-Croy5h7V.js';
24
24
  import '@tangle-network/tcloud';
25
25
 
26
26
  /**
@@ -5,7 +5,7 @@ import {
5
5
  evolutionaryDriver,
6
6
  runEval,
7
7
  surfaceContentHash
8
- } from "../chunk-RDK3P4JE.js";
8
+ } from "../chunk-MZ2IYGGN.js";
9
9
  import {
10
10
  createHostedClient
11
11
  } from "../chunk-DFS3FEXO.js";
@@ -16,7 +16,7 @@ import {
16
16
  gepaDriver,
17
17
  heldOutGate,
18
18
  runImprovementLoop
19
- } from "../chunk-Q56RRLEC.js";
19
+ } from "../chunk-NV2PF37Q.js";
20
20
  import {
21
21
  fsCampaignStorage,
22
22
  inMemoryCampaignStorage,
package/dist/index.d.ts CHANGED
@@ -14,10 +14,10 @@ import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInpu
14
14
  export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
15
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
16
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
17
- import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
18
- export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
19
- import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext, C as ChatRequest, f as CreateChatClientOpts } from './registry-BzAEvqAt.js';
20
- export { g as AnalystHooks, h as AnalystInputKind, A as AnalystRegistry, i as AnalystRegistryOptions, j as AnalystRequirements, k as AnalystRunEvent, l as AnalystRunInputs, m as AnalystRunResult, n as AnalystRunSummary, B as BudgetPolicy, o as ChatCallOpts, p as ChatClient, q as ChatResponse, r as ChatTransport, s as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BzAEvqAt.js';
17
+ import { b as JudgeFn, a as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
18
+ export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
19
+ import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext, C as ChatRequest, f as CreateChatClientOpts } from './registry-BGKyX6bw.js';
20
+ export { g as AnalystHooks, h as AnalystInputKind, A as AnalystRegistry, i as AnalystRegistryOptions, j as AnalystRequirements, k as AnalystRunEvent, l as AnalystRunInputs, m as AnalystRunResult, n as AnalystRunSummary, B as BudgetPolicy, o as ChatCallOpts, p as ChatClient, q as ChatResponse, r as ChatTransport, s as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BGKyX6bw.js';
21
21
  import { TCloud } from '@tangle-network/tcloud';
22
22
  import { z } from 'zod';
23
23
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
@@ -28,8 +28,9 @@ export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind
28
28
  import { A as AgentProfile } from './agent-profile-DzcPHR1Z.js';
29
29
  export { a as BackendIntegrityError, B as BackendIntegrityReport, b as agentProfileHash, c as assertRealBackend, s as summarizeBackendIntegrity } from './agent-profile-DzcPHR1Z.js';
30
30
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
31
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DGoeObZT.js';
32
- export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CliffsMagnitude, t as CorpusAgreementOptions, u as CorpusAgreementPerDimension, v as CorpusAgreementReport, x as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, W as WeightedCompositeInput, y as WeightedCompositeResult, l as assertReleaseConfidence, m as benjaminiHochberg, z as bonferroni, n as bootstrapCi, D as cliffsDelta, E as cohensD, F as confidenceInterval, G as corpusInterRaterAgreement, H as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, I as interRaterReliability, K as interpretCliffs, p as judgeReplayGate, L as mannWhitneyU, M as normalizeScores, q as pairedBootstrap, N as pairedMde, O as pairedTTest, Q as partialCredit, r as renderReleaseReport, S as requiredSampleSize, T as weightedComposite, U as weightedMean, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
31
+ import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
32
+ export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
33
+ export { C as CliffsMagnitude, c as CorpusAgreementOptions, d as CorpusAgreementPerDimension, e as CorpusAgreementReport, f as CorpusScoreRecord, P as PairedBootstrapOptions, a as PairedBootstrapResult, W as WeightedCompositeInput, g as WeightedCompositeResult, b as benjaminiHochberg, h as bonferroni, i as cliffsDelta, j as cohensD, k as confidenceInterval, l as corpusInterRaterAgreement, m as corpusInterRaterAgreementFromJudgeScores, n as interRaterReliability, o as interpretCliffs, q as mannWhitneyU, r as normalizeScores, p as pairedBootstrap, s as pairedMde, t as pairedTTest, u as partialCredit, v as requiredSampleSize, x as weightedComposite, y as weightedMean, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
33
34
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
34
35
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
35
36
  import { T as TraceEmitter } from './emitter-DEZwY14K.js';
package/dist/index.js CHANGED
@@ -30,7 +30,7 @@ import {
30
30
  scalarScore,
31
31
  scoreRedTeamOutput,
32
32
  toolNamesForRun
33
- } from "./chunk-Q56RRLEC.js";
33
+ } from "./chunk-NV2PF37Q.js";
34
34
  import {
35
35
  BackendIntegrityError,
36
36
  assertRealBackend,
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.66.0",
5
+ "version": "0.67.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -59,8 +59,30 @@ interface DefaultProductionGateOptions {
59
59
  /** Required: scenarios held out from training; substrate compares
60
60
  * candidate-on-holdout vs baseline-on-holdout. */
61
61
  holdoutScenarios: Scenario[];
62
- /** Minimum mean-composite improvement required to ship. Default 0.5. */
62
+ /** Minimum held-out lift the **paired-bootstrap CI lower bound** must clear
63
+ * to ship — NOT a point estimate. Default 0 ⇒ "confidently positive at the
64
+ * confidence level". Interpreted in the judge's native composite scale (set
65
+ * e.g. 2 for a 0-100 rubric to require a ≥2-point significant gain). */
63
66
  deltaThreshold?: number;
67
+ /** Confidence level for the held-out + dimension bootstraps. Default 0.95. */
68
+ confidence?: number;
69
+ /** Bootstrap resamples. Default 2000. */
70
+ bootstrapResamples?: number;
71
+ /** Fixed bootstrap seed for a deterministic verdict. Default 1337. */
72
+ bootstrapSeed?: number;
73
+ /** Minimum paired holdout observations (scenarios × reps) before a
74
+ * significance claim is allowed; below it the gate HOLDS with `few_runs`
75
+ * rather than reading a degenerate CI. Default 3. */
76
+ minProductiveRuns?: number;
77
+ /** Critical judge dimensions that must NOT significantly regress even when
78
+ * the net composite rises (anti-Goodhart). The gate HOLDS if any listed
79
+ * dimension's paired-delta CI lower bound < −`regressionTolerance`. E.g.
80
+ * `['hallucination_free']` for a legal agent. */
81
+ criticalDimensions?: string[];
82
+ /** Tolerance for the per-dimension regression guard, in the dimension's
83
+ * native scale. When omitted it auto-scales off observed magnitudes:
84
+ * 0.05 on [0,1], 5 on 0-100. */
85
+ regressionTolerance?: number;
64
86
  /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
65
87
  * Composite verdict refuses to ship when spend exceeded budget. */
66
88
  budgetUsd?: number;
@@ -1,7 +1,7 @@
1
1
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
2
2
  import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
3
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
4
- import { J as JudgeInput } from './types-DhqpAi_z.js';
4
+ import { a as JudgeInput } from './types-Croy5h7V.js';
5
5
 
6
6
  /**
7
7
  * ChatClient — the single LLM abstraction analysts call.
@@ -0,0 +1,233 @@
1
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
2
+ import { m as GateDecision } from './summary-report-ByiOUrHj.js';
3
+ import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
4
+
5
+ /**
6
+ * Release confidence gate.
7
+ *
8
+ * This is the production-facing composition layer over the lower-level
9
+ * primitives:
10
+ * - Dataset manifests prove corpus/version coverage.
11
+ * - RunRecord rows prove reproducible search/holdout outcomes.
12
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
13
+ * - HeldOutGate decisions remain the paired promotion authority.
14
+ *
15
+ * The gate is intentionally pure and conservative. Missing declared evidence
16
+ * fails closed instead of being treated as a neutral zero.
17
+ */
18
+
19
+ /** Severity of an actionable finding attached to a run/trace. */
20
+ type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
21
+ /** Actionable side-info — a diagnosed finding the loop can act on. */
22
+ interface ActionableSideInfo {
23
+ /** Stable expectation/check id when available. */
24
+ expectationId?: string;
25
+ /** Human-readable diagnosis of what happened. */
26
+ message: string;
27
+ severity?: AsiSeverity;
28
+ /** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
29
+ evidence?: string;
30
+ /** Prompt/tool/context surface likely responsible. */
31
+ responsibleSurface?: string;
32
+ /** Suggested fix in natural language. */
33
+ suggestion?: string;
34
+ /** Whether this expectation was satisfied. Defaults to false for ASI rows. */
35
+ matched?: boolean;
36
+ metadata?: Record<string, unknown>;
37
+ }
38
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
39
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
40
+ interface ReleaseTraceEvidence {
41
+ scenarioId: string;
42
+ candidateId?: string;
43
+ split?: RunSplitTag;
44
+ score?: number;
45
+ ok?: boolean;
46
+ turnCount?: number;
47
+ costUsd?: number;
48
+ durationMs?: number;
49
+ failureMode?: string;
50
+ asi?: ActionableSideInfo[];
51
+ metadata?: Record<string, unknown>;
52
+ }
53
+ interface ReleaseConfidenceThresholds {
54
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
55
+ requireCorpus?: boolean;
56
+ minScenarioCount?: number;
57
+ minSearchRuns?: number;
58
+ minHoldoutRuns?: number;
59
+ /** Require at least one holdout scenario/run. Default true. */
60
+ requireHoldout?: boolean;
61
+ minPassRate?: number;
62
+ minMeanScore?: number;
63
+ /** Search mean may exceed holdout mean by at most this much. */
64
+ maxOverfitGap?: number;
65
+ maxMeanCostUsd?: number;
66
+ maxP95WallMs?: number;
67
+ /** Low-score/failed rows must carry ASI. Default true. */
68
+ requireAsiForFailures?: boolean;
69
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
70
+ failureScoreThreshold?: number;
71
+ }
72
+ interface ReleaseConfidenceInput {
73
+ target: string;
74
+ candidateId?: string;
75
+ baselineId?: string;
76
+ dataset?: DatasetManifest;
77
+ scenarios?: readonly DatasetScenario[];
78
+ runs?: readonly RunRecord[];
79
+ traces?: readonly ReleaseTraceEvidence[];
80
+ gateDecision?: GateDecision | null;
81
+ thresholds?: ReleaseConfidenceThresholds;
82
+ }
83
+ interface ReleaseConfidenceAxis {
84
+ name: ReleaseConfidenceAxisName;
85
+ status: ReleaseConfidenceStatus;
86
+ score: number;
87
+ detail: string;
88
+ }
89
+ interface ReleaseConfidenceIssue {
90
+ axis: ReleaseConfidenceAxisName;
91
+ severity: 'critical' | 'warning';
92
+ code: string;
93
+ detail: string;
94
+ }
95
+ interface ReleaseConfidenceMetrics {
96
+ scenarioCount: number;
97
+ searchRuns: number;
98
+ holdoutRuns: number;
99
+ passRate: number;
100
+ meanScore: number;
101
+ searchMeanScore: number;
102
+ holdoutMeanScore: number;
103
+ overfitGap: number;
104
+ meanCostUsd: number;
105
+ p95WallMs: number;
106
+ failedRows: number;
107
+ failuresWithAsi: number;
108
+ singleShotTraces: number;
109
+ multiShotTraces: number;
110
+ splitCounts: Record<DatasetSplit, number>;
111
+ domainCounts: Record<string, number>;
112
+ failureModeCounts: Record<string, number>;
113
+ responsibleSurfaceCounts: Record<string, number>;
114
+ }
115
+ interface ReleaseConfidenceScorecard {
116
+ target: string;
117
+ candidateId: string | null;
118
+ baselineId: string | null;
119
+ status: ReleaseConfidenceStatus;
120
+ promote: boolean;
121
+ axes: ReleaseConfidenceAxis[];
122
+ issues: ReleaseConfidenceIssue[];
123
+ metrics: ReleaseConfidenceMetrics;
124
+ dataset: DatasetManifest | null;
125
+ gateDecision: GateDecision | null;
126
+ summary: string;
127
+ }
128
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
129
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
130
+
131
+ /**
132
+ * Bootstrap-CI promotion gate.
133
+ *
134
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
135
+ * curation), the question is "did this generation actually improve, or are
136
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
137
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
138
+ * delta is real before code or prompts get promoted.
139
+ *
140
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
141
+ * and to compose into any verdict gate.
142
+ *
143
+ * Default gate:
144
+ * - Bootstrap mean baseline vs candidate (1k resamples).
145
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
146
+ * - Tunable confidence (default 95%) and resample count.
147
+ *
148
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
149
+ * vocabulary:
150
+ * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
151
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
152
+ * - REVERT: candidate's CI upper bound < baseline mean (real regression)
153
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
154
+ */
155
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
156
+ interface BootstrapResult {
157
+ baselineMean: number;
158
+ candidateMean: number;
159
+ /** candidateMean - baselineMean, point estimate. */
160
+ delta: number;
161
+ /** Lower bound of the (1 - alpha) CI on the delta. */
162
+ ciLower: number;
163
+ /** Upper bound of the (1 - alpha) CI on the delta. */
164
+ ciUpper: number;
165
+ /** Number of bootstrap resamples used. */
166
+ iterations: number;
167
+ alpha: number;
168
+ verdict: Verdict;
169
+ }
170
+ interface BootstrapOptions {
171
+ /** Significance level alpha; the CI covers (1 - alpha) (default 0.05 → 95% CI). */
172
+ alpha?: number;
173
+ /** Number of resamples (default 1000). */
174
+ iterations?: number;
175
+ /**
176
+ * Minimum total samples (baseline + candidate) below which we always
177
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
178
+ * Default 6 (combined).
179
+ */
180
+ minTotalSamples?: number;
181
+ /** RNG seed for reproducibility. When omitted, Math.random is used (not reproducible). */
182
+ seed?: number;
183
+ }
184
+ /**
185
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
186
+ *
187
+ * Uses simple percentile bootstrap on the difference of resampled means.
188
+ * That's the standard non-parametric primitive — no distributional
189
+ * assumptions, robust to skew, easy to reason about.
190
+ */
191
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
192
+ /**
193
+ * Judge-replay promotion gate.
194
+ *
195
+ * The cheap inner-loop judge that drives an evolution run is by definition
196
+ * fast and noisy. When you're about to promote a winning variant to the
197
+ * canonical default, you want a STRONGER judge (a more expensive model, a
198
+ * human grader, a separately-trained reward model) to confirm the win
199
+ * generalises beyond the inner loop.
200
+ *
201
+ * This helper takes raw winner + baseline outputs, scores both through the
202
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
203
+ * judge agrees the winner is real with the configured confidence. Doesn't
204
+ * matter what shape your "output" is — pass a string, an object, anything
205
+ * the judge can read.
206
+ */
207
+ interface JudgeReplayGateArgs<TOutput> {
208
+ baselineOutputs: TOutput[];
209
+ candidateOutputs: TOutput[];
210
+ /** Stronger judge — may be sync or async (async allows LLM calls). Return a 0..N scalar score. */
211
+ judge: (output: TOutput) => Promise<number> | number;
212
+ alpha?: number;
213
+ iterations?: number;
214
+ /** RNG seed for reproducibility. */
215
+ seed?: number;
216
+ /** Maximum concurrent judge calls. Default 4. */
217
+ judgeConcurrency?: number;
218
+ }
219
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
220
+ baselineSamples: number;
221
+ candidateSamples: number;
222
+ }>;
223
+
224
+ interface RenderReleaseReportOptions {
225
+ title?: string;
226
+ runs?: readonly RunRecord[];
227
+ comparator?: string;
228
+ traceAnalystFindings?: readonly string[];
229
+ nextActions?: readonly string[];
230
+ }
231
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
232
+
233
+ export { type ActionableSideInfo as A, type BootstrapOptions as B, type JudgeReplayGateArgs as J, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type ReleaseConfidenceAxisName as b, type ReleaseConfidenceInput as c, type ReleaseConfidenceIssue as d, type ReleaseConfidenceMetrics as e, type ReleaseConfidenceScorecard as f, type ReleaseConfidenceStatus as g, type ReleaseConfidenceThresholds as h, type ReleaseTraceEvidence as i, type RenderReleaseReportOptions as j, assertReleaseConfidence as k, bootstrapCi as l, evaluateReleaseConfidence as m, judgeReplayGate as n, type AsiSeverity as o, renderReleaseReport as r };
@@ -1,14 +1,15 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
+ export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
4
5
  export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
5
6
  import './run-record-BgTFzO2r.js';
6
7
  import './errors-Dwqw-T_m.js';
7
8
  import './schema-m0gsnbt3.js';
8
9
  import './outcome-store-D6KWmYvj.js';
10
+ import './dataset-B2kL-fSM.js';
9
11
  import './judge-calibration-DilmB3Ml.js';
10
- import './types-DhqpAi_z.js';
12
+ import './types-Croy5h7V.js';
11
13
  import '@tangle-network/tcloud';
12
- import './dataset-B2kL-fSM.js';
13
14
  import './failure-cluster-CL7IVgkJ.js';
14
15
  import './store-CKUAgsJz.js';
@@ -0,0 +1,253 @@
1
+ import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
+ import { J as JudgeScore } from './types-Croy5h7V.js';
3
+
4
+ /**
5
+ * Normalize scores so all dimensions follow "higher = better".
6
+ * Inverted dimensions (hallucination, false_confidence, worst_failure)
7
+ * already use inverted scoring in the prompt (10 = no hallucination),
8
+ * but this function ensures consistency if raw scores leak through.
9
+ */
10
+ declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
11
+ /** Weighted mean — falls back to uniform weights when omitted */
12
+ declare function weightedMean(scores: {
13
+ score: number;
14
+ weight?: number;
15
+ }[]): number;
16
+ /** Bootstrap confidence interval */
17
+ declare function confidenceInterval(scores: number[], confidence?: number, opts?: {
18
+ seed?: number;
19
+ resamples?: number;
20
+ }): {
21
+ mean: number;
22
+ lower: number;
23
+ upper: number;
24
+ };
25
+ /**
26
+ * Inter-rater reliability — simplified Krippendorff's alpha.
27
+ *
28
+ * Each inner array is one judge's scores for all items.
29
+ * All arrays must have the same length (same items scored).
30
+ */
31
+ declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
32
+ /**
33
+ * Mann-Whitney U test for comparing two independent groups.
34
+ * Returns U statistic and approximate p-value (normal approximation).
35
+ */
36
+ declare function mannWhitneyU(a: number[], b: number[]): {
37
+ u: number;
38
+ p: number;
39
+ };
40
+ /** Partial credit: returns 0-1 ratio of current toward target */
41
+ declare function partialCredit(current: number, target: number): number;
42
+ /**
43
+ * Paired t-test — before/after measurements on the SAME items.
44
+ * Pairing removes inter-item variance, giving tighter significance than
45
+ * an unpaired test when comparing prompt v1 vs prompt v2 on identical
46
+ * scenarios.
47
+ */
48
+ declare function pairedTTest(before: number[], after: number[]): {
49
+ t: number;
50
+ df: number;
51
+ p: number;
52
+ };
53
+ /**
54
+ * Wilcoxon signed-rank test — paired non-parametric alternative.
55
+ * Use when the differences aren't normally distributed.
56
+ */
57
+ declare function wilcoxonSignedRank(before: number[], after: number[]): {
58
+ w: number;
59
+ p: number;
60
+ };
61
+ /**
62
+ * Cohen's d — standardized effect size for two independent groups.
63
+ * Positive d means group b has higher mean than group a.
64
+ * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
65
+ */
66
+ declare function cohensD(a: number[], b: number[]): number;
67
+ type CliffsMagnitude = 'negligible' | 'small' | 'medium' | 'large';
68
+ /**
69
+ * Cliff's delta — a non-parametric effect size for two independent samples.
70
+ * `δ = (#(after > before) − #(after < before)) / (n_before · n_after)`,
71
+ * ranging [-1, 1]. Positive ⇒ `after` tends to exceed `before` (improvement).
72
+ *
73
+ * Distribution-free counterpart to Cohen's d: no normality assumption, robust
74
+ * to the bounded/skewed score distributions judges produce. Pairs with
75
+ * `pairedBootstrap` / `wilcoxonSignedRank` for the non-parametric reporting
76
+ * path. Returns 0 when either sample is empty.
77
+ */
78
+ declare function cliffsDelta(before: number[], after: number[]): number;
79
+ /**
80
+ * Map a Cliff's delta to a qualitative magnitude using the standard
81
+ * Romano et al. thresholds (|δ|): <0.147 negligible, <0.33 small,
82
+ * <0.474 medium, else large.
83
+ */
84
+ declare function interpretCliffs(delta: number): CliffsMagnitude;
85
+ interface WeightedCompositeInput {
86
+ /** Per-dimension scores (typically 0..1). */
87
+ dims: Record<string, number>;
88
+ /** Weight per dimension. Every weighted dimension MUST be present in
89
+ * `dims` — a weight for an absent dimension is a config error and throws,
90
+ * because silently dropping it would renormalise the composite onto a
91
+ * different denominator than intended. */
92
+ weights: Record<string, number>;
93
+ /** Optional pass threshold; when set, the result reports `pass`. */
94
+ threshold?: number;
95
+ }
96
+ interface WeightedCompositeResult {
97
+ composite: number;
98
+ pass?: boolean;
99
+ }
100
+ /**
101
+ * Weighted composite over judge dimensions: `Σ(score_d · w_d) / Σ(w_d)` across
102
+ * the weighted dimensions. The canonical replacement for the per-consumer
103
+ * hand-rolled composite math (tax/legal/creative/gtm each ship a copy).
104
+ *
105
+ * Fail-loud: throws if a weighted dimension is missing from `dims`, if any
106
+ * weight is negative, or if the weights sum to 0 — none of which can produce
107
+ * a meaningful composite.
108
+ */
109
+ declare function weightedComposite(input: WeightedCompositeInput): WeightedCompositeResult;
110
+ interface CorpusScoreRecord {
111
+ /** Stable identifier for the rated item (scenario, span, turn, …). */
112
+ itemId: string;
113
+ /** Identifier for the judge that produced this score. */
114
+ judgeName: string;
115
+ /** Dimension name (matches `JudgeScore.dimension`). */
116
+ dimension: string;
117
+ /** Numeric score; must be finite. */
118
+ score: number;
119
+ }
120
+ interface CorpusAgreementPerDimension extends ContinuousAgreement {
121
+ dimension: string;
122
+ /** Item IDs that contributed to this dimension's matrix (every judge scored them). */
123
+ itemIds: string[];
124
+ /** Judge IDs that contributed to this dimension's matrix. */
125
+ judgeIds: string[];
126
+ }
127
+ interface CorpusAgreementReport {
128
+ /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
129
+ perDimension: CorpusAgreementPerDimension[];
130
+ /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
131
+ overallIcc: number;
132
+ /** Mean weighted κ across dimensions (NaN if none finite). */
133
+ overallWeightedKappa: number;
134
+ /** Dimensions evaluated (sorted). */
135
+ dimensions: string[];
136
+ /** Judges seen across the corpus (sorted). */
137
+ judgeIds: string[];
138
+ }
139
+ interface CorpusAgreementOptions extends ContinuousAgreementOptions {
140
+ /**
141
+ * Restrict the audit to these dimensions. Default = every dimension
142
+ * that appears in the input. A dimension named here but absent from
143
+ * the input throws — silent omission would corrupt the overall metric.
144
+ */
145
+ dimensions?: string[];
146
+ /**
147
+ * Restrict the audit to these judges. Default = every judge that
148
+ * appears in the input. A judge named here but absent from a
149
+ * dimension throws (see "fail loud" below).
150
+ */
151
+ judges?: string[];
152
+ }
153
+ /**
154
+ * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
155
+ *
156
+ * For each dimension, builds the [n_items][n_judges] matrix of scores
157
+ * (keeping only items every judge rated on that dimension), then runs
158
+ * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
159
+ * bootstrap CIs. Reports a pooled mean across dimensions as a single
160
+ * "is this judge panel reliable on this corpus?" number.
161
+ *
162
+ * Fail-loud contract:
163
+ * - Empty input throws.
164
+ * - Fewer than 2 judges or fewer than 2 items per dimension throws.
165
+ * - A judge present in some dimensions but with zero scored items on
166
+ * another dimension throws (would silently shrink the matrix).
167
+ * - Duplicate (itemId, judgeName, dimension) records throw.
168
+ */
169
+ declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
170
+ /**
171
+ * Convenience adapter for `JudgeScore[]` data keyed externally by item.
172
+ *
173
+ * Use when you have per-item arrays of `JudgeScore[]` (e.g. one
174
+ * `ScenarioResult.judgeScores` per scenario) and want corpus-wide
175
+ * agreement without manually flattening. `itemId` must be unique per
176
+ * row of `itemsScores`.
177
+ */
178
+ declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
179
+ itemId: string;
180
+ scores: JudgeScore[];
181
+ }>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
182
+ /**
183
+ * Required N per arm for a two-sample comparison at target effect size,
184
+ * alpha, and power. Normal-approximation formula:
185
+ * n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
186
+ * where d is Cohen's d. Returns Infinity for effect ≤ 0.
187
+ */
188
+ declare function requiredSampleSize(opts: {
189
+ effect: number;
190
+ alpha?: number;
191
+ power?: number;
192
+ twoSided?: boolean;
193
+ }): number;
194
+ /**
195
+ * Minimum detectable paired effect (standardised units) for a target paired
196
+ * sample size: d_min = (z_{1-α/2} + z_{1-β}) / sqrt(n_paired). Multiply by
197
+ * sd(deltas) for score units; treat as a lower bound — Wilcoxon and bootstrap
198
+ * have asymptotic relative efficiency below 1 vs the t-test on near-normal deltas.
199
+ */
200
+ declare function pairedMde(opts: {
201
+ nPaired: number;
202
+ alpha?: number;
203
+ power?: number;
204
+ twoSided?: boolean;
205
+ }): number;
206
+ /** Bonferroni adjustment: multiply every p-value by the test count, clamp at 1. */
207
+ declare function bonferroni(pValues: number[], alpha?: number): {
208
+ adjusted: number[];
209
+ significant: boolean[];
210
+ };
211
+ /**
212
+ * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
213
+ * significance at the target FDR; handles ties and preserves q monotonicity.
214
+ */
215
+ declare function benjaminiHochberg(pValues: number[], fdr?: number): {
216
+ qValues: number[];
217
+ significant: boolean[];
218
+ };
219
+ interface PairedBootstrapResult {
220
+ /** Number of paired observations. */
221
+ n: number;
222
+ /** Median of paired deltas (after − before). */
223
+ median: number;
224
+ /** Mean of paired deltas. */
225
+ mean: number;
226
+ /** Lower bound of the bootstrap CI on the chosen statistic. */
227
+ low: number;
228
+ /** Upper bound of the bootstrap CI on the chosen statistic. */
229
+ high: number;
230
+ /** Confidence level used (e.g. 0.95). */
231
+ confidence: number;
232
+ /** Number of bootstrap resamples used. */
233
+ resamples: number;
234
+ }
235
+ interface PairedBootstrapOptions {
236
+ /** Confidence level. Default 0.95. */
237
+ confidence?: number;
238
+ /** Bootstrap resample count. Default 2000. */
239
+ resamples?: number;
240
+ /** Statistic to bootstrap. Default 'median'. */
241
+ statistic?: 'median' | 'mean';
242
+ /** Deterministic seed. If omitted, uses Math.random(). */
243
+ seed?: number;
244
+ }
245
+ /**
246
+ * Paired bootstrap on (after − before) deltas. Returns a CI on the chosen
247
+ * statistic (median by default); pairs are resampled with replacement. The
248
+ * lower bound is what the promotion gate checks — `low > threshold` means the
249
+ * gain is real at the confidence level. Throws on unequal sample sizes.
250
+ */
251
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
252
+
253
+ export { type CliffsMagnitude as C, type PairedBootstrapOptions as P, type WeightedCompositeInput as W, type PairedBootstrapResult as a, benjaminiHochberg as b, type CorpusAgreementOptions as c, type CorpusAgreementPerDimension as d, type CorpusAgreementReport as e, type CorpusScoreRecord as f, type WeightedCompositeResult as g, bonferroni as h, cliffsDelta as i, cohensD as j, confidenceInterval as k, corpusInterRaterAgreement as l, corpusInterRaterAgreementFromJudgeScores as m, interRaterReliability as n, interpretCliffs as o, pairedBootstrap as p, mannWhitneyU as q, normalizeScores as r, pairedMde as s, pairedTTest as t, partialCredit as u, requiredSampleSize as v, wilcoxonSignedRank as w, weightedComposite as x, weightedMean as y };