@tangle-network/agent-eval 0.49.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/langchain.d.ts +1 -1
  3. package/dist/adapters/otel.d.ts +8 -2
  4. package/dist/campaign/index.d.ts +3 -3
  5. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  6. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  7. package/dist/chunk-EGIPWXHL.js.map +1 -0
  8. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  9. package/dist/chunk-FQK2CCIM.js.map +1 -0
  10. package/dist/chunk-MAZ26DC7.js +99 -0
  11. package/dist/chunk-MAZ26DC7.js.map +1 -0
  12. package/dist/chunk-SHTXZ4O2.js +113 -0
  13. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  14. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  15. package/dist/contract/index.d.ts +206 -9
  16. package/dist/contract/index.js +751 -3
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/governance/index.d.ts +1 -1
  19. package/dist/hosted/index.d.ts +8 -192
  20. package/dist/hosted/index.js +1 -1
  21. package/dist/index-BRxz6qov.d.ts +409 -0
  22. package/dist/index.d.ts +18 -462
  23. package/dist/index.js +14 -106
  24. package/dist/index.js.map +1 -1
  25. package/dist/meta-eval/index.d.ts +3 -3
  26. package/dist/openapi.json +1 -1
  27. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  28. package/dist/registry-8KAs18kY.d.ts +457 -0
  29. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  30. package/dist/reporting.d.ts +6 -4
  31. package/dist/reporting.js +6 -4
  32. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  33. package/dist/rl.d.ts +9 -8
  34. package/dist/rl.js +3 -2
  35. package/dist/rl.js.map +1 -1
  36. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  37. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  38. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  39. package/dist/store-CJbzDxZ2.d.ts +220 -0
  40. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  41. package/dist/traces.d.ts +3 -220
  42. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  43. package/dist/types-DhqpAi_z.d.ts +296 -0
  44. package/package.json +1 -1
  45. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  46. package/dist/chunk-OYI6RZJK.js.map +0 -1
  47. package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  48. package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
@@ -1,302 +1,9 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
- import { TCloud } from '@tangle-network/tcloud';
2
+ import { a as JudgeScore } from './types-DhqpAi_z.js';
3
3
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
4
- import { w as GateDecision } from './sequential-CbFH___X.js';
4
+ import { m as GateDecision } from './summary-report-B7gNRX-r.js';
5
5
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
6
6
 
7
- interface Scenario {
8
- id: string;
9
- persona: string;
10
- label: string;
11
- thesis: string;
12
- dimensions: string[];
13
- turns: Turn[];
14
- artifactChecks: ArtifactCheck[];
15
- systemPromptAppend?: string;
16
- }
17
- interface Turn {
18
- user: string;
19
- expectedBehaviors: string[];
20
- adversarial?: boolean;
21
- feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
22
- }
23
- interface ArtifactCheck {
24
- type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
25
- target: string;
26
- contains?: string;
27
- minCount?: number;
28
- description: string;
29
- }
30
- interface JudgeConfig {
31
- model: string;
32
- temperature: number;
33
- rubric: JudgeRubric;
34
- }
35
- interface JudgeRubric {
36
- name: string;
37
- description: string;
38
- dimensions: RubricDimension[];
39
- }
40
- interface RubricDimension {
41
- name: string;
42
- description: string;
43
- anchor_low: string;
44
- anchor_high: string;
45
- weight: number;
46
- }
47
- interface ScenarioResult {
48
- scenarioId: string;
49
- persona: string;
50
- turns: TurnResult[];
51
- artifactResults: ArtifactResult[];
52
- judgeScores: JudgeScore[];
53
- judgeErrors: number;
54
- overallScore: number;
55
- totalDurationMs: number;
56
- artifacts: CollectedArtifacts;
57
- }
58
- interface TurnResult {
59
- turnIndex: number;
60
- userMessage: string;
61
- agentResponse: string;
62
- durationMs: number;
63
- blocksExtracted: {
64
- type: string;
65
- title: string;
66
- }[];
67
- containsCode: boolean;
68
- containsToolCall: boolean;
69
- }
70
- interface ArtifactResult {
71
- check: ArtifactCheck;
72
- passed: boolean;
73
- detail?: string;
74
- }
75
- interface JudgeScore {
76
- judgeName: string;
77
- dimension: string;
78
- score: number;
79
- reasoning: string;
80
- evidence?: string;
81
- }
82
- interface CollectedArtifacts {
83
- vaultFiles: {
84
- path: string;
85
- content: string;
86
- }[];
87
- blocksExtracted: {
88
- type: string;
89
- fields: Record<string, string>;
90
- }[];
91
- codeBlocks: {
92
- language: string;
93
- code: string;
94
- }[];
95
- toolCalls: string[];
96
- }
97
- interface BenchmarkReport {
98
- timestamp: string;
99
- generation: number;
100
- promptVersion: string;
101
- scenarioCount: number;
102
- results: ScenarioResult[];
103
- summary: {
104
- overallAvg: number;
105
- byPersona: Record<string, {
106
- avg: number;
107
- passed: number;
108
- total: number;
109
- }>;
110
- byDimension: Record<string, {
111
- avg: number;
112
- scores: number[];
113
- }>;
114
- weakest: {
115
- scenario: string;
116
- score: number;
117
- reason: string;
118
- }[];
119
- strongest: {
120
- scenario: string;
121
- score: number;
122
- reason: string;
123
- }[];
124
- };
125
- }
126
- interface RouteMap {
127
- signup?: string;
128
- login?: string;
129
- workspaces?: string;
130
- threads?: string;
131
- chat?: string;
132
- tasks?: string;
133
- events?: string;
134
- approvals?: string;
135
- vault?: string;
136
- generations?: string;
137
- [key: string]: string | undefined;
138
- }
139
- interface ProductClientConfig {
140
- baseUrl: string;
141
- routes: RouteMap;
142
- }
143
- interface ScenarioFile {
144
- id: string;
145
- category: string;
146
- persona: string;
147
- label: string;
148
- thesis: string;
149
- isControl?: boolean;
150
- rubric?: {
151
- dimensions: {
152
- name: string;
153
- description: string;
154
- weight: number;
155
- }[];
156
- };
157
- turns: Turn[];
158
- artifactChecks: ArtifactCheck[];
159
- }
160
- interface CompletionCriterion {
161
- name: string;
162
- check: (state: DriverState) => boolean;
163
- progress?: (state: DriverState) => number;
164
- }
165
- interface FeedbackPattern {
166
- trigger: string;
167
- response: string;
168
- }
169
- /**
170
- * How hard the simulated user pushes back. The driver LLM scales its tone
171
- * and follow-up aggression to this:
172
- * cooperative — forgiving early adopter; accepts reasonable answers.
173
- * demanding — experienced professional; rejects vague or hedged answers.
174
- * relentless — senior partner reviewing for a client who will litigate;
175
- * interrogates every claim, accepts nothing undefended.
176
- */
177
- type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
178
- interface PersonaConfig {
179
- id: string;
180
- role: string;
181
- goal: string;
182
- completionCriteria: CompletionCriterion[];
183
- feedbackPatterns?: FeedbackPattern[];
184
- maxTurns: number;
185
- driverModel?: string;
186
- /** How adversarial the simulated user is. Defaults to 'demanding'. */
187
- rigor?: PersonaRigor;
188
- /**
189
- * Domain expertise the simulated user holds — quoted into the driver
190
- * prompt so it challenges the agent with authority instead of vague
191
- * dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
192
- * working-capital mechanics cold".
193
- */
194
- expertise?: string;
195
- /**
196
- * Substantive issues a senior professional in this role would
197
- * interrogate — traps the scenario hides, claims that must be defended.
198
- * The driver probes these without revealing them verbatim; the agent
199
- * must surface them on its own.
200
- */
201
- pressurePoints?: string[];
202
- /**
203
- * Curveballs the driver may inject once the agent is coasting — changed
204
- * facts, a hostile counterparty position, a new constraint. Forces the
205
- * agent to re-derive rather than recite.
206
- */
207
- curveballs?: string[];
208
- }
209
- interface DriverState {
210
- tasks: number;
211
- events: number;
212
- proposals: {
213
- pending: number;
214
- approved: number;
215
- rejected: number;
216
- };
217
- vaultFiles: string[];
218
- codeBlocks: number;
219
- generations: number;
220
- }
221
- interface TurnMetrics {
222
- turn: number;
223
- timestamp: string;
224
- tasks: number;
225
- events: number;
226
- proposals: {
227
- pending: number;
228
- approved: number;
229
- rejected: number;
230
- };
231
- vaultFiles: number;
232
- responseLatencyMs: number;
233
- responseChars: number;
234
- codeBlocksProduced: number;
235
- blocksExtracted: number;
236
- qualityScore?: number;
237
- inputTokens: number;
238
- outputTokens: number;
239
- estimatedCostUsd: number;
240
- totalCostUsd: number;
241
- completionPercent: number;
242
- }
243
- interface DriverResult {
244
- personaId: string;
245
- /** True when the simulated user professionally signed off (driver said DONE). */
246
- completed: boolean;
247
- /** Turn at which the simulated user signed off, or null if it never did. */
248
- turnsToCompletion: number | null;
249
- /**
250
- * Turn at which nominal completionCriteria were first all met, or null.
251
- * Distinct from turnsToCompletion: criteria can be met while the
252
- * simulated professional is still unsatisfied with the work's rigor.
253
- */
254
- criteriaMetAtTurn: number | null;
255
- totalTurns: number;
256
- metrics: TurnMetrics[];
257
- finalState: DriverState;
258
- convergenceCurve: number[];
259
- totalCostUsd: number;
260
- finalQualityScore: number | null;
261
- }
262
- interface BenchmarkRunnerConfig {
263
- scenarios: Scenario[];
264
- judges: JudgeFn[];
265
- systemPrompt: string;
266
- model?: string;
267
- judgeModel?: string;
268
- passThreshold?: number;
269
- generation?: number;
270
- promptVersion?: string;
271
- }
272
- interface JudgeInput {
273
- scenario: Scenario;
274
- turns: TurnResult[];
275
- artifacts: CollectedArtifacts;
276
- }
277
- type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
278
-
279
- interface TestResult {
280
- name: string;
281
- passed: boolean;
282
- duration: number;
283
- detail?: string;
284
- checks: CheckResult[];
285
- }
286
- interface CheckResult {
287
- name: string;
288
- passed: boolean;
289
- expected: string;
290
- actual: string;
291
- }
292
- interface EvalResult {
293
- scenario: string;
294
- status: 'pass' | 'fail' | 'skip';
295
- duration: number;
296
- detail?: string;
297
- artifact?: string;
298
- }
299
-
300
7
  /**
301
8
  * Release confidence gate.
302
9
  *
@@ -731,4 +438,4 @@ interface RenderReleaseReportOptions {
731
438
  }
732
439
  declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
733
440
 
734
- export { type PersonaRigor as $, type CollectedArtifacts as A, type BootstrapOptions as B, type CheckResult as C, type DriverResult as D, type ScenarioResult as E, type TurnMetrics as F, type ScenarioFile as G, type CompletionCriterion as H, type ActionableSideInfo as I, type JudgeReplayGateArgs as J, type ArtifactCheck as K, type ArtifactResult as L, type AsiSeverity as M, type CorpusAgreementOptions as N, type CorpusAgreementPerDimension as O, type PairedBootstrapOptions as P, type CorpusAgreementReport as Q, type ReleaseConfidenceAxis as R, type Scenario as S, type TestResult as T, type CorpusScoreRecord as U, type Verdict as V, type EvalResult as W, type FeedbackPattern as X, type JudgeConfig as Y, type JudgeRubric as Z, type JudgeScore as _, type BootstrapResult as a, type RouteMap as a0, type RubricDimension as a1, type Turn as a2, type TurnResult as a3, bonferroni as a4, cohensD as a5, confidenceInterval as a6, corpusInterRaterAgreement as a7, corpusInterRaterAgreementFromJudgeScores as a8, interRaterReliability as a9, mannWhitneyU as aa, normalizeScores as ab, pairedMde as ac, pairedTTest as ad, partialCredit as ae, requiredSampleSize as af, weightedMean as ag, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type JudgeInput as s, type JudgeFn as t, type BenchmarkRunnerConfig as u, type BenchmarkReport as v, wilcoxonSignedRank as w, type ProductClientConfig as x, type PersonaConfig as y, type DriverState as z };
441
+ export { type ActionableSideInfo as A, type BootstrapOptions as B, type CorpusAgreementOptions as C, corpusInterRaterAgreement as D, corpusInterRaterAgreementFromJudgeScores as E, interRaterReliability as F, mannWhitneyU as G, normalizeScores as H, pairedMde as I, type JudgeReplayGateArgs as J, pairedTTest as K, partialCredit as L, requiredSampleSize as M, weightedMean as N, type PairedBootstrapOptions as P, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type AsiSeverity as s, type CorpusAgreementPerDimension as t, type CorpusAgreementReport as u, type CorpusScoreRecord as v, wilcoxonSignedRank as w, bonferroni as x, cohensD as y, confidenceInterval as z };
@@ -1,10 +1,12 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CJ08tGwq.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DBB8lB1P.js';
3
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-CbFH___X.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
3
+ export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
4
5
  import './run-record-BGY6bHRh.js';
5
6
  import './errors-mje_cKOs.js';
6
- import './outcome-store-BxJ3DQKJ.js';
7
+ import './outcome-store-D6KWmYvj.js';
7
8
  import './judge-calibration-DilmB3Ml.js';
9
+ import './types-DhqpAi_z.js';
8
10
  import '@tangle-network/tcloud';
9
11
  import './dataset-BlwAtYYf.js';
10
12
  import './failure-cluster-Cw65_5FY.js';
package/dist/reporting.js CHANGED
@@ -4,19 +4,21 @@ import {
4
4
  evaluateReleaseConfidence,
5
5
  judgeReplayGate,
6
6
  renderReleaseReport
7
- } from "./chunk-KQ26DYTQ.js";
7
+ } from "./chunk-UBQGWD3O.js";
8
8
  import {
9
9
  rubricPredictiveValidity
10
10
  } from "./chunk-YRZ4M5GS.js";
11
11
  import {
12
- RESEARCH_REPORT_HARD_PAIR_FLOOR,
13
12
  evaluateInterimReleaseConfidence,
13
+ pairedEvalueSequence
14
+ } from "./chunk-MAZ26DC7.js";
15
+ import {
16
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
14
17
  gainHistogram,
15
- pairedEvalueSequence,
16
18
  paretoChart,
17
19
  researchReport,
18
20
  summaryTable
19
- } from "./chunk-MNL6LXGQ.js";
21
+ } from "./chunk-EGIPWXHL.js";
20
22
  import {
21
23
  benjaminiHochberg,
22
24
  pairedBootstrap,
@@ -1,6 +1,6 @@
1
1
  import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
2
2
  import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
3
- import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-CbFH___X.js';
3
+ import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-B7gNRX-r.js';
4
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
5
5
  import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
package/dist/rl.d.ts CHANGED
@@ -1,18 +1,19 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { d as CampaignResult } from './types-8u72Gc76.js';
3
- import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CHMO56K0.js';
4
- export { r as runEvalCampaign } from './researcher-CHMO56K0.js';
2
+ import { j as CampaignResult } from './types-Dbj5gu8n.js';
3
+ import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-LZD0qHEa.js';
4
+ export { r as runEvalCampaign } from './researcher-LZD0qHEa.js';
5
5
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
6
- import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
7
- export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-BxJ3DQKJ.js';
8
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CJ08tGwq.js';
9
- import { I as InterimReleaseConfidence } from './sequential-CbFH___X.js';
6
+ import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
7
+ export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
8
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
9
+ import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
10
10
  import './errors-mje_cKOs.js';
11
11
  import './llm-client-BXVRUZyX.js';
12
12
  import './raw-provider-sink-C46HDghv.js';
13
+ import './summary-report-B7gNRX-r.js';
14
+ import './failure-cluster-Cw65_5FY.js';
13
15
  import './emitter-DP_cSSiw.js';
14
16
  import './integrity-CTDhR1Sg.js';
15
- import './failure-cluster-Cw65_5FY.js';
16
17
 
17
18
  /**
18
19
  * Test-time compute scaling curves.
package/dist/rl.js CHANGED
@@ -10,14 +10,15 @@ import {
10
10
  } from "./chunk-3RF76KTD.js";
11
11
  import {
12
12
  runEvalCampaign
13
- } from "./chunk-PD3MH6WU.js";
13
+ } from "./chunk-5KSDYBYH.js";
14
14
  import "./chunk-BWZEGTES.js";
15
15
  import {
16
16
  rubricPredictiveValidity
17
17
  } from "./chunk-YRZ4M5GS.js";
18
18
  import {
19
19
  evaluateInterimReleaseConfidence
20
- } from "./chunk-MNL6LXGQ.js";
20
+ } from "./chunk-MAZ26DC7.js";
21
+ import "./chunk-EGIPWXHL.js";
21
22
  import {
22
23
  benjaminiHochberg,
23
24
  wilcoxonSignedRank