@tangle-network/agent-eval 0.49.0 → 0.50.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +135 -0
- package/README.md +235 -331
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +8 -2
- package/dist/campaign/index.d.ts +3 -3
- package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
- package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
- package/dist/chunk-EGIPWXHL.js.map +1 -0
- package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
- package/dist/chunk-FQK2CCIM.js.map +1 -0
- package/dist/chunk-MAZ26DC7.js +99 -0
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
- package/dist/contract/index.d.ts +206 -9
- package/dist/contract/index.js +751 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +8 -192
- package/dist/hosted/index.js +1 -1
- package/dist/index-BRxz6qov.d.ts +409 -0
- package/dist/index.d.ts +18 -462
- package/dist/index.js +14 -106
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
- package/dist/registry-8KAs18kY.d.ts +457 -0
- package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +6 -4
- package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
- package/dist/rl.d.ts +9 -8
- package/dist/rl.js +3 -2
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
- package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
- package/dist/sequential-5iSVfzl2.d.ts +139 -0
- package/dist/store-CJbzDxZ2.d.ts +220 -0
- package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
- package/dist/traces.d.ts +3 -220
- package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
- package/dist/types-DhqpAi_z.d.ts +296 -0
- package/docs/concepts.md +20 -0
- package/docs/customer-journeys.md +208 -0
- package/docs/insight-report.md +337 -0
- package/package.json +1 -1
- package/dist/chunk-MNL6LXGQ.js.map +0 -1
- package/dist/chunk-OYI6RZJK.js.map +0 -1
- /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
- /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
|
@@ -1,302 +1,9 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
|
-
import {
|
|
2
|
+
import { a as JudgeScore } from './types-DhqpAi_z.js';
|
|
3
3
|
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
4
|
-
import {
|
|
4
|
+
import { m as GateDecision } from './summary-report-B7gNRX-r.js';
|
|
5
5
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
6
6
|
|
|
7
|
-
interface Scenario {
|
|
8
|
-
id: string;
|
|
9
|
-
persona: string;
|
|
10
|
-
label: string;
|
|
11
|
-
thesis: string;
|
|
12
|
-
dimensions: string[];
|
|
13
|
-
turns: Turn[];
|
|
14
|
-
artifactChecks: ArtifactCheck[];
|
|
15
|
-
systemPromptAppend?: string;
|
|
16
|
-
}
|
|
17
|
-
interface Turn {
|
|
18
|
-
user: string;
|
|
19
|
-
expectedBehaviors: string[];
|
|
20
|
-
adversarial?: boolean;
|
|
21
|
-
feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
|
|
22
|
-
}
|
|
23
|
-
interface ArtifactCheck {
|
|
24
|
-
type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
|
|
25
|
-
target: string;
|
|
26
|
-
contains?: string;
|
|
27
|
-
minCount?: number;
|
|
28
|
-
description: string;
|
|
29
|
-
}
|
|
30
|
-
interface JudgeConfig {
|
|
31
|
-
model: string;
|
|
32
|
-
temperature: number;
|
|
33
|
-
rubric: JudgeRubric;
|
|
34
|
-
}
|
|
35
|
-
interface JudgeRubric {
|
|
36
|
-
name: string;
|
|
37
|
-
description: string;
|
|
38
|
-
dimensions: RubricDimension[];
|
|
39
|
-
}
|
|
40
|
-
interface RubricDimension {
|
|
41
|
-
name: string;
|
|
42
|
-
description: string;
|
|
43
|
-
anchor_low: string;
|
|
44
|
-
anchor_high: string;
|
|
45
|
-
weight: number;
|
|
46
|
-
}
|
|
47
|
-
interface ScenarioResult {
|
|
48
|
-
scenarioId: string;
|
|
49
|
-
persona: string;
|
|
50
|
-
turns: TurnResult[];
|
|
51
|
-
artifactResults: ArtifactResult[];
|
|
52
|
-
judgeScores: JudgeScore[];
|
|
53
|
-
judgeErrors: number;
|
|
54
|
-
overallScore: number;
|
|
55
|
-
totalDurationMs: number;
|
|
56
|
-
artifacts: CollectedArtifacts;
|
|
57
|
-
}
|
|
58
|
-
interface TurnResult {
|
|
59
|
-
turnIndex: number;
|
|
60
|
-
userMessage: string;
|
|
61
|
-
agentResponse: string;
|
|
62
|
-
durationMs: number;
|
|
63
|
-
blocksExtracted: {
|
|
64
|
-
type: string;
|
|
65
|
-
title: string;
|
|
66
|
-
}[];
|
|
67
|
-
containsCode: boolean;
|
|
68
|
-
containsToolCall: boolean;
|
|
69
|
-
}
|
|
70
|
-
interface ArtifactResult {
|
|
71
|
-
check: ArtifactCheck;
|
|
72
|
-
passed: boolean;
|
|
73
|
-
detail?: string;
|
|
74
|
-
}
|
|
75
|
-
interface JudgeScore {
|
|
76
|
-
judgeName: string;
|
|
77
|
-
dimension: string;
|
|
78
|
-
score: number;
|
|
79
|
-
reasoning: string;
|
|
80
|
-
evidence?: string;
|
|
81
|
-
}
|
|
82
|
-
interface CollectedArtifacts {
|
|
83
|
-
vaultFiles: {
|
|
84
|
-
path: string;
|
|
85
|
-
content: string;
|
|
86
|
-
}[];
|
|
87
|
-
blocksExtracted: {
|
|
88
|
-
type: string;
|
|
89
|
-
fields: Record<string, string>;
|
|
90
|
-
}[];
|
|
91
|
-
codeBlocks: {
|
|
92
|
-
language: string;
|
|
93
|
-
code: string;
|
|
94
|
-
}[];
|
|
95
|
-
toolCalls: string[];
|
|
96
|
-
}
|
|
97
|
-
interface BenchmarkReport {
|
|
98
|
-
timestamp: string;
|
|
99
|
-
generation: number;
|
|
100
|
-
promptVersion: string;
|
|
101
|
-
scenarioCount: number;
|
|
102
|
-
results: ScenarioResult[];
|
|
103
|
-
summary: {
|
|
104
|
-
overallAvg: number;
|
|
105
|
-
byPersona: Record<string, {
|
|
106
|
-
avg: number;
|
|
107
|
-
passed: number;
|
|
108
|
-
total: number;
|
|
109
|
-
}>;
|
|
110
|
-
byDimension: Record<string, {
|
|
111
|
-
avg: number;
|
|
112
|
-
scores: number[];
|
|
113
|
-
}>;
|
|
114
|
-
weakest: {
|
|
115
|
-
scenario: string;
|
|
116
|
-
score: number;
|
|
117
|
-
reason: string;
|
|
118
|
-
}[];
|
|
119
|
-
strongest: {
|
|
120
|
-
scenario: string;
|
|
121
|
-
score: number;
|
|
122
|
-
reason: string;
|
|
123
|
-
}[];
|
|
124
|
-
};
|
|
125
|
-
}
|
|
126
|
-
interface RouteMap {
|
|
127
|
-
signup?: string;
|
|
128
|
-
login?: string;
|
|
129
|
-
workspaces?: string;
|
|
130
|
-
threads?: string;
|
|
131
|
-
chat?: string;
|
|
132
|
-
tasks?: string;
|
|
133
|
-
events?: string;
|
|
134
|
-
approvals?: string;
|
|
135
|
-
vault?: string;
|
|
136
|
-
generations?: string;
|
|
137
|
-
[key: string]: string | undefined;
|
|
138
|
-
}
|
|
139
|
-
interface ProductClientConfig {
|
|
140
|
-
baseUrl: string;
|
|
141
|
-
routes: RouteMap;
|
|
142
|
-
}
|
|
143
|
-
interface ScenarioFile {
|
|
144
|
-
id: string;
|
|
145
|
-
category: string;
|
|
146
|
-
persona: string;
|
|
147
|
-
label: string;
|
|
148
|
-
thesis: string;
|
|
149
|
-
isControl?: boolean;
|
|
150
|
-
rubric?: {
|
|
151
|
-
dimensions: {
|
|
152
|
-
name: string;
|
|
153
|
-
description: string;
|
|
154
|
-
weight: number;
|
|
155
|
-
}[];
|
|
156
|
-
};
|
|
157
|
-
turns: Turn[];
|
|
158
|
-
artifactChecks: ArtifactCheck[];
|
|
159
|
-
}
|
|
160
|
-
interface CompletionCriterion {
|
|
161
|
-
name: string;
|
|
162
|
-
check: (state: DriverState) => boolean;
|
|
163
|
-
progress?: (state: DriverState) => number;
|
|
164
|
-
}
|
|
165
|
-
interface FeedbackPattern {
|
|
166
|
-
trigger: string;
|
|
167
|
-
response: string;
|
|
168
|
-
}
|
|
169
|
-
/**
|
|
170
|
-
* How hard the simulated user pushes back. The driver LLM scales its tone
|
|
171
|
-
* and follow-up aggression to this:
|
|
172
|
-
* cooperative — forgiving early adopter; accepts reasonable answers.
|
|
173
|
-
* demanding — experienced professional; rejects vague or hedged answers.
|
|
174
|
-
* relentless — senior partner reviewing for a client who will litigate;
|
|
175
|
-
* interrogates every claim, accepts nothing undefended.
|
|
176
|
-
*/
|
|
177
|
-
type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
|
|
178
|
-
interface PersonaConfig {
|
|
179
|
-
id: string;
|
|
180
|
-
role: string;
|
|
181
|
-
goal: string;
|
|
182
|
-
completionCriteria: CompletionCriterion[];
|
|
183
|
-
feedbackPatterns?: FeedbackPattern[];
|
|
184
|
-
maxTurns: number;
|
|
185
|
-
driverModel?: string;
|
|
186
|
-
/** How adversarial the simulated user is. Defaults to 'demanding'. */
|
|
187
|
-
rigor?: PersonaRigor;
|
|
188
|
-
/**
|
|
189
|
-
* Domain expertise the simulated user holds — quoted into the driver
|
|
190
|
-
* prompt so it challenges the agent with authority instead of vague
|
|
191
|
-
* dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
|
|
192
|
-
* working-capital mechanics cold".
|
|
193
|
-
*/
|
|
194
|
-
expertise?: string;
|
|
195
|
-
/**
|
|
196
|
-
* Substantive issues a senior professional in this role would
|
|
197
|
-
* interrogate — traps the scenario hides, claims that must be defended.
|
|
198
|
-
* The driver probes these without revealing them verbatim; the agent
|
|
199
|
-
* must surface them on its own.
|
|
200
|
-
*/
|
|
201
|
-
pressurePoints?: string[];
|
|
202
|
-
/**
|
|
203
|
-
* Curveballs the driver may inject once the agent is coasting — changed
|
|
204
|
-
* facts, a hostile counterparty position, a new constraint. Forces the
|
|
205
|
-
* agent to re-derive rather than recite.
|
|
206
|
-
*/
|
|
207
|
-
curveballs?: string[];
|
|
208
|
-
}
|
|
209
|
-
interface DriverState {
|
|
210
|
-
tasks: number;
|
|
211
|
-
events: number;
|
|
212
|
-
proposals: {
|
|
213
|
-
pending: number;
|
|
214
|
-
approved: number;
|
|
215
|
-
rejected: number;
|
|
216
|
-
};
|
|
217
|
-
vaultFiles: string[];
|
|
218
|
-
codeBlocks: number;
|
|
219
|
-
generations: number;
|
|
220
|
-
}
|
|
221
|
-
interface TurnMetrics {
|
|
222
|
-
turn: number;
|
|
223
|
-
timestamp: string;
|
|
224
|
-
tasks: number;
|
|
225
|
-
events: number;
|
|
226
|
-
proposals: {
|
|
227
|
-
pending: number;
|
|
228
|
-
approved: number;
|
|
229
|
-
rejected: number;
|
|
230
|
-
};
|
|
231
|
-
vaultFiles: number;
|
|
232
|
-
responseLatencyMs: number;
|
|
233
|
-
responseChars: number;
|
|
234
|
-
codeBlocksProduced: number;
|
|
235
|
-
blocksExtracted: number;
|
|
236
|
-
qualityScore?: number;
|
|
237
|
-
inputTokens: number;
|
|
238
|
-
outputTokens: number;
|
|
239
|
-
estimatedCostUsd: number;
|
|
240
|
-
totalCostUsd: number;
|
|
241
|
-
completionPercent: number;
|
|
242
|
-
}
|
|
243
|
-
interface DriverResult {
|
|
244
|
-
personaId: string;
|
|
245
|
-
/** True when the simulated user professionally signed off (driver said DONE). */
|
|
246
|
-
completed: boolean;
|
|
247
|
-
/** Turn at which the simulated user signed off, or null if it never did. */
|
|
248
|
-
turnsToCompletion: number | null;
|
|
249
|
-
/**
|
|
250
|
-
* Turn at which nominal completionCriteria were first all met, or null.
|
|
251
|
-
* Distinct from turnsToCompletion: criteria can be met while the
|
|
252
|
-
* simulated professional is still unsatisfied with the work's rigor.
|
|
253
|
-
*/
|
|
254
|
-
criteriaMetAtTurn: number | null;
|
|
255
|
-
totalTurns: number;
|
|
256
|
-
metrics: TurnMetrics[];
|
|
257
|
-
finalState: DriverState;
|
|
258
|
-
convergenceCurve: number[];
|
|
259
|
-
totalCostUsd: number;
|
|
260
|
-
finalQualityScore: number | null;
|
|
261
|
-
}
|
|
262
|
-
interface BenchmarkRunnerConfig {
|
|
263
|
-
scenarios: Scenario[];
|
|
264
|
-
judges: JudgeFn[];
|
|
265
|
-
systemPrompt: string;
|
|
266
|
-
model?: string;
|
|
267
|
-
judgeModel?: string;
|
|
268
|
-
passThreshold?: number;
|
|
269
|
-
generation?: number;
|
|
270
|
-
promptVersion?: string;
|
|
271
|
-
}
|
|
272
|
-
interface JudgeInput {
|
|
273
|
-
scenario: Scenario;
|
|
274
|
-
turns: TurnResult[];
|
|
275
|
-
artifacts: CollectedArtifacts;
|
|
276
|
-
}
|
|
277
|
-
type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
|
|
278
|
-
|
|
279
|
-
interface TestResult {
|
|
280
|
-
name: string;
|
|
281
|
-
passed: boolean;
|
|
282
|
-
duration: number;
|
|
283
|
-
detail?: string;
|
|
284
|
-
checks: CheckResult[];
|
|
285
|
-
}
|
|
286
|
-
interface CheckResult {
|
|
287
|
-
name: string;
|
|
288
|
-
passed: boolean;
|
|
289
|
-
expected: string;
|
|
290
|
-
actual: string;
|
|
291
|
-
}
|
|
292
|
-
interface EvalResult {
|
|
293
|
-
scenario: string;
|
|
294
|
-
status: 'pass' | 'fail' | 'skip';
|
|
295
|
-
duration: number;
|
|
296
|
-
detail?: string;
|
|
297
|
-
artifact?: string;
|
|
298
|
-
}
|
|
299
|
-
|
|
300
7
|
/**
|
|
301
8
|
* Release confidence gate.
|
|
302
9
|
*
|
|
@@ -731,4 +438,4 @@ interface RenderReleaseReportOptions {
|
|
|
731
438
|
}
|
|
732
439
|
declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
|
|
733
440
|
|
|
734
|
-
export { type
|
|
441
|
+
export { type ActionableSideInfo as A, type BootstrapOptions as B, type CorpusAgreementOptions as C, corpusInterRaterAgreement as D, corpusInterRaterAgreementFromJudgeScores as E, interRaterReliability as F, mannWhitneyU as G, normalizeScores as H, pairedMde as I, type JudgeReplayGateArgs as J, pairedTTest as K, partialCredit as L, requiredSampleSize as M, weightedMean as N, type PairedBootstrapOptions as P, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type AsiSeverity as s, type CorpusAgreementPerDimension as t, type CorpusAgreementReport as u, type CorpusScoreRecord as v, wilcoxonSignedRank as w, bonferroni as x, cohensD as y, confidenceInterval as z };
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-
|
|
3
|
-
export {
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
|
|
3
|
+
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
|
|
4
5
|
import './run-record-BGY6bHRh.js';
|
|
5
6
|
import './errors-mje_cKOs.js';
|
|
6
|
-
import './outcome-store-
|
|
7
|
+
import './outcome-store-D6KWmYvj.js';
|
|
7
8
|
import './judge-calibration-DilmB3Ml.js';
|
|
9
|
+
import './types-DhqpAi_z.js';
|
|
8
10
|
import '@tangle-network/tcloud';
|
|
9
11
|
import './dataset-BlwAtYYf.js';
|
|
10
12
|
import './failure-cluster-Cw65_5FY.js';
|
package/dist/reporting.js
CHANGED
|
@@ -4,19 +4,21 @@ import {
|
|
|
4
4
|
evaluateReleaseConfidence,
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
renderReleaseReport
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-UBQGWD3O.js";
|
|
8
8
|
import {
|
|
9
9
|
rubricPredictiveValidity
|
|
10
10
|
} from "./chunk-YRZ4M5GS.js";
|
|
11
11
|
import {
|
|
12
|
-
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
13
12
|
evaluateInterimReleaseConfidence,
|
|
13
|
+
pairedEvalueSequence
|
|
14
|
+
} from "./chunk-MAZ26DC7.js";
|
|
15
|
+
import {
|
|
16
|
+
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
14
17
|
gainHistogram,
|
|
15
|
-
pairedEvalueSequence,
|
|
16
18
|
paretoChart,
|
|
17
19
|
researchReport,
|
|
18
20
|
summaryTable
|
|
19
|
-
} from "./chunk-
|
|
21
|
+
} from "./chunk-EGIPWXHL.js";
|
|
20
22
|
import {
|
|
21
23
|
benjaminiHochberg,
|
|
22
24
|
pairedBootstrap,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
2
2
|
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
|
|
3
|
-
import {
|
|
3
|
+
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-B7gNRX-r.js';
|
|
4
4
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
5
5
|
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
|
|
6
6
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,18 +1,19 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import {
|
|
3
|
-
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-
|
|
4
|
-
export { r as runEvalCampaign } from './researcher-
|
|
2
|
+
import { j as CampaignResult } from './types-Dbj5gu8n.js';
|
|
3
|
+
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-LZD0qHEa.js';
|
|
4
|
+
export { r as runEvalCampaign } from './researcher-LZD0qHEa.js';
|
|
5
5
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
6
|
-
import { O as OutcomeStore } from './outcome-store-
|
|
7
|
-
export { D as DeploymentOutcome, F as FileSystemOutcomeStore,
|
|
8
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-
|
|
9
|
-
import { I as InterimReleaseConfidence } from './sequential-
|
|
6
|
+
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
7
|
+
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
8
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
|
|
9
|
+
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
10
10
|
import './errors-mje_cKOs.js';
|
|
11
11
|
import './llm-client-BXVRUZyX.js';
|
|
12
12
|
import './raw-provider-sink-C46HDghv.js';
|
|
13
|
+
import './summary-report-B7gNRX-r.js';
|
|
14
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
13
15
|
import './emitter-DP_cSSiw.js';
|
|
14
16
|
import './integrity-CTDhR1Sg.js';
|
|
15
|
-
import './failure-cluster-Cw65_5FY.js';
|
|
16
17
|
|
|
17
18
|
/**
|
|
18
19
|
* Test-time compute scaling curves.
|
package/dist/rl.js
CHANGED
|
@@ -10,14 +10,15 @@ import {
|
|
|
10
10
|
} from "./chunk-3RF76KTD.js";
|
|
11
11
|
import {
|
|
12
12
|
runEvalCampaign
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-5KSDYBYH.js";
|
|
14
14
|
import "./chunk-BWZEGTES.js";
|
|
15
15
|
import {
|
|
16
16
|
rubricPredictiveValidity
|
|
17
17
|
} from "./chunk-YRZ4M5GS.js";
|
|
18
18
|
import {
|
|
19
19
|
evaluateInterimReleaseConfidence
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-MAZ26DC7.js";
|
|
21
|
+
import "./chunk-EGIPWXHL.js";
|
|
21
22
|
import {
|
|
22
23
|
benjaminiHochberg,
|
|
23
24
|
wilcoxonSignedRank
|