@tangle-network/agent-eval 0.20.10 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +129 -126
  2. package/dist/benchmarks/index.d.ts +2 -1
  3. package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
  4. package/dist/chunk-75MCTH7P.js.map +1 -0
  5. package/dist/chunk-HKYRWNHV.js +1354 -0
  6. package/dist/chunk-HKYRWNHV.js.map +1 -0
  7. package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
  8. package/dist/chunk-IKFVX537.js +717 -0
  9. package/dist/chunk-IKFVX537.js.map +1 -0
  10. package/dist/chunk-KWUAAIHR.js +1764 -0
  11. package/dist/chunk-KWUAAIHR.js.map +1 -0
  12. package/dist/chunk-MCMV7DUL.js +1310 -0
  13. package/dist/chunk-MCMV7DUL.js.map +1 -0
  14. package/dist/chunk-ODFINDLQ.js +413 -0
  15. package/dist/chunk-ODFINDLQ.js.map +1 -0
  16. package/dist/chunk-PKCVBYTQ.js +200 -0
  17. package/dist/chunk-PKCVBYTQ.js.map +1 -0
  18. package/dist/chunk-YUFXO3TU.js +148 -0
  19. package/dist/chunk-YUFXO3TU.js.map +1 -0
  20. package/dist/cli.js +2 -2
  21. package/dist/control-C8NKbF3w.d.ts +258 -0
  22. package/dist/control.d.ts +5 -0
  23. package/dist/control.js +30 -0
  24. package/dist/control.js.map +1 -0
  25. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  26. package/dist/emitter-BYO2nSDA.d.ts +387 -0
  27. package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
  28. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  29. package/dist/index.d.ts +115 -2870
  30. package/dist/index.js +1049 -6156
  31. package/dist/index.js.map +1 -1
  32. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  33. package/dist/openapi.json +1 -1
  34. package/dist/optimization.d.ts +145 -0
  35. package/dist/optimization.js +60 -0
  36. package/dist/optimization.js.map +1 -0
  37. package/dist/reporting.d.ts +426 -0
  38. package/dist/reporting.js +32 -0
  39. package/dist/reporting.js.map +1 -0
  40. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  41. package/dist/traces.d.ts +658 -0
  42. package/dist/traces.js +100 -0
  43. package/dist/traces.js.map +1 -0
  44. package/dist/wire/index.js +2 -2
  45. package/docs/concepts.md +16 -11
  46. package/docs/feature-guide.md +10 -17
  47. package/docs/integration-launch-gates.md +77 -0
  48. package/docs/product-eval-adoption.md +221 -0
  49. package/docs/trace-analysis.md +75 -0
  50. package/package.json +21 -1
  51. package/dist/chunk-JAOLXRIA.js.map +0 -1
  52. /package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
@@ -0,0 +1,345 @@
1
+ import { D as DatasetSplit, a as DatasetScenario } from './dataset-B9qvlm_o.js';
2
+ import { T as TraceEmitter, F as FailureClass, a as TraceStore } from './emitter-BYO2nSDA.js';
3
+
4
+ /**
5
+ * Policy-based agent control runtime.
6
+ *
7
+ * This is the minimal reusable loop behind driver-agent patterns:
8
+ *
9
+ * observe state -> validate -> decide next action -> act -> observe -> ...
10
+ *
11
+ * It deliberately does not model named "topologies". Direct execution,
12
+ * critic/revise, driver intervention, specialist calls, and human escalation
13
+ * are all just actions chosen by the control policy.
14
+ */
15
+
16
+ type ControlSeverity = 'info' | 'warning' | 'error' | 'critical'; // Severity scale used by the optional ControlEvalResult.severity field.
17
+ type ControlActionFailureMode = 'continue' | 'stop'; // Type of ControlRuntimeConfig.actionFailure, whose doc comment names `continue` as the default.
18
+ interface ControlEvalResult { // Outcome of a single validator or judge check; only id and passed are required.
19
+ /** Stable validator or judge id. */
20
+ id: string;
21
+ /** Whether this check passed. */
22
+ passed: boolean;
23
+ /** Optional normalized score. 1 = best, 0 = worst. */
24
+ score?: number;
25
+ /** Objective validators should usually be "error" or "critical" when failed. */
26
+ severity?: ControlSeverity;
27
+ /** Human-readable result. */
28
+ detail?: string;
29
+ /** Small evidence string or pointer. Avoid large payloads. */
30
+ evidence?: string;
31
+ /** True when the result came from deterministic state, not LLM judgment. */
32
+ objective?: boolean; // objectiveEval and subjectiveEval accept input without this field.
33
+ /** Structured details for downstream control policies and reports. */
34
+ metadata?: Record<string, unknown>;
35
+ }
36
+ interface ControlBudget { // Limits for one control run: a required step cap plus optional time and cost caps.
37
+ maxSteps: number; // Required here; ControlRuntimeConfig.budget takes a Partial, so callers may omit it there.
38
+ maxWallMs?: number; // NOTE(review): presumably wall-clock milliseconds, going by the name - confirm.
39
+ maxCostUsd?: number; // Enforced using costs reported by ControlRuntimeConfig.getActionCostUsd (per that field's doc).
40
+ }
41
+ interface ControlStopPolicies<TState, TAction> { // Generic stuck-loop stop rules; every field is optional.
42
+ /**
43
+ * Stop after N consecutive steps with no state fingerprint change and
44
+ * less than `minScoreDelta` score movement. Disabled when omitted.
45
+ */
46
+ maxNoProgressSteps?: number;
47
+ /**
48
+ * Stop after the same action fingerprint is selected N consecutive
49
+ * times. Disabled when omitted.
50
+ */
51
+ maxRepeatedActions?: number;
52
+ /** Minimum score movement that counts as progress. Default 0.001. */
53
+ minScoreDelta?: number;
54
+ /** Override the default JSON/string fingerprint for state comparisons. */
55
+ stateFingerprint?: (state: TState) => string; // Feeds the maxNoProgressSteps check described in this interface.
56
+ /** Override the default JSON/string fingerprint for repeated-action checks. */
57
+ actionFingerprint?: (action: TAction) => string; // Feeds the maxRepeatedActions check described in this interface.
58
+ }
59
+ interface ControlContext<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> { // Snapshot handed to the decide, act and shouldStop callbacks of ControlRuntimeConfig.
60
+ intent: string;
61
+ state: TState;
62
+ evals: TEval[];
63
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
64
+ budget: ControlBudget; // Full (non-partial) budget, unlike the Partial accepted in the config.
65
+ stepIndex: number;
66
+ wallMs: number;
67
+ spentCostUsd: number;
68
+ remainingCostUsd?: number; // NOTE(review): presumably set only when budget.maxCostUsd is configured - confirm.
69
+ abortSignal: AbortSignal;
70
+ emitter?: TraceEmitter;
71
+ }
72
+ type ControlDecision<TAction> = { // Policy output, discriminated on `type`: continue with an action, or stop.
73
+ type: 'continue';
74
+ action: TAction;
75
+ reason?: string; // Optional on the continue branch.
76
+ } | {
77
+ type: 'stop';
78
+ reason: string; // Required on the stop branch.
79
+ pass?: boolean;
80
+ score?: number;
81
+ };
82
+ interface StopDecision { // Return type of the optional ControlRuntimeConfig.shouldStop callback.
83
+ stop: boolean;
84
+ pass: boolean;
85
+ reason: string;
86
+ score?: number;
87
+ failureClass?: FailureClass; // FailureClass is imported from the emitter declarations.
88
+ }
89
+ interface ControlActionOutcome<TActionResult> { // Recorded result of executing one action; stored as ControlStep.actionOutcome.
90
+ ok: boolean;
91
+ result?: TActionResult;
92
+ error?: string;
93
+ costUsd?: number;
94
+ durationMs: number; // Always present, unlike the optional result, error and costUsd.
95
+ }
96
+ interface ControlRuntimeError { // One captured runtime error; collected in ControlRunResult.runtimeErrors.
97
+ phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace'; // Loop phase the error is attributed to.
98
+ stepIndex: number;
99
+ message: string;
100
+ }
101
+ interface ControlStep<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> { // Record of one loop step: the decision plus state and evals before and after it.
102
+ index: number;
103
+ decision: ControlDecision<TAction>;
104
+ beforeState: TState;
105
+ afterState: TState;
106
+ evalsBefore: TEval[];
107
+ evalsAfter: TEval[];
108
+ actionOutcome?: ControlActionOutcome<TActionResult>; // NOTE(review): optional - presumably absent when no action ran; confirm.
109
+ startedAt: string; // NOTE(review): timestamp string; format is not shown here - confirm (likely ISO-8601).
110
+ endedAt: string;
111
+ }
112
+ interface ControlRunResult<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> { // Final summary that runAgentControlLoop resolves to.
113
+ intent: string;
114
+ pass: boolean;
115
+ completed: boolean;
116
+ reason: string;
117
+ score?: number;
118
+ steps: ControlStep<TState, TAction, TActionResult, TEval>[];
119
+ finalState: TState | undefined; // NOTE(review): may be undefined - presumably when no observation succeeded; confirm.
120
+ finalEvals: TEval[];
121
+ wallMs: number;
122
+ spentCostUsd: number;
123
+ runId: string | null; // NOTE(review): nullable - presumably null when no trace store is configured; confirm.
124
+ failureClass?: FailureClass;
125
+ runtimeErrors: ControlRuntimeError[];
126
+ stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error'; // Which mechanism ended the run.
127
+ }
128
+ interface ControlRuntimeConfig<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> { // Input to runAgentControlLoop: intent, observe, validate, decide and act are required; everything else is optional.
129
+ intent: string;
130
+ budget?: Partial<ControlBudget>; // Any subset of the budget fields may be supplied.
131
+ signal?: AbortSignal;
132
+ /** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */
133
+ actionFailure?: ControlActionFailureMode;
134
+ /**
135
+ * Extract cost from an action result. Used for `maxCostUsd` budget
136
+ * enforcement and trace budget ledger emission.
137
+ */
138
+ getActionCostUsd?: (ctx: {
139
+ action: TAction;
140
+ result: TActionResult;
141
+ state: TState;
142
+ evals: TEval[];
143
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
144
+ }) => number | undefined; // May return undefined when no cost is known for the action.
145
+ /** Read typed task/product state. Prefer structured state over transcript-only context. */
146
+ observe: (ctx: {
147
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
148
+ abortSignal: AbortSignal;
149
+ }) => Promise<TState> | TState; // May be synchronous or asynchronous.
150
+ /** Objective validators first, subjective judges only where objective state is insufficient. */
151
+ validate: (ctx: {
152
+ intent: string;
153
+ state: TState;
154
+ history: ControlStep<TState, TAction, TActionResult, TEval>[];
155
+ abortSignal: AbortSignal;
156
+ }) => Promise<TEval[]> | TEval[]; // May be synchronous or asynchronous.
157
+ /** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */
158
+ decide: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>;
159
+ /** Execute the action selected by the policy. */
160
+ act: (action: TAction, ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<TActionResult> | TActionResult;
161
+ /** Final stopping policy. Called before decide and after each action. */
162
+ shouldStop?: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<StopDecision> | StopDecision;
163
+ /** Optional hook for tracing or live progress updates. */
164
+ onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void;
165
+ /** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */
166
+ stopPolicies?: ControlStopPolicies<TState, TAction>;
167
+ /** Optional trace sink. Emits one run plus one span per control step. */
168
+ store?: TraceStore;
169
+ scenarioId?: string; // NOTE(review): scenarioId, projectId and variantId are presumably attached to the emitted trace run - confirm.
170
+ projectId?: string;
171
+ variantId?: string;
172
+ }
173
+ declare function runAgentControlLoop<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>>; // Runs the control loop described by config and resolves to its ControlRunResult.
174
+ declare function stopOnNoProgress<TState, TAction>(maxNoProgressSteps: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'>): ControlStopPolicies<TState, TAction>; // Returns a ControlStopPolicies for the given maxNoProgressSteps; options may carry the other policy fields.
175
+ declare function stopOnRepeatedAction<TState, TAction>(maxRepeatedActions: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'>): ControlStopPolicies<TState, TAction>; // Returns a ControlStopPolicies for the given maxRepeatedActions; options may carry the other policy fields.
176
+ declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult; // Takes an eval without the objective flag; NOTE(review): presumably returns it with objective set to true - confirm.
177
+ declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult; // Takes an eval without the objective flag; NOTE(review): presumably returns it with objective set to false - confirm.
178
+ declare function allCriticalPassed(evals: ControlEvalResult[]): boolean; // NOTE(review): presumably true when no critical-severity eval failed; empty-array behaviour is not shown - confirm.
179
+
180
+ type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other'; // Category of the artifact carried by a FeedbackAttempt (artifactType field).
181
+ type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system'; // Origin of a FeedbackLabel (source field).
182
+ type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request'; // Kind of feedback a FeedbackLabel expresses (kind field).
183
+ type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical'; // Same four literals as ControlSeverity; used by FeedbackLabel.severity.
184
+ interface FeedbackTask { // The task a trajectory addresses: an intent string plus optional untyped context.
185
+ intent: string;
186
+ context?: unknown; // Untyped; consumers must narrow before use.
187
+ }
188
+ interface ProposedSideEffect { // Describes an action an attempt proposes (FeedbackAttempt.proposedAction); only type is required.
189
+ type: string;
190
+ risk?: 'low' | 'medium' | 'high';
191
+ costUsd?: number;
192
+ externalSideEffect?: boolean;
193
+ requiresApproval?: boolean;
194
+ metadata?: Record<string, unknown>;
195
+ }
196
+ interface FeedbackLabel { // One piece of feedback, identified by source and kind, carrying an untyped value.
197
+ id?: string;
198
+ source: FeedbackLabelSource;
199
+ kind: FeedbackLabelKind;
200
+ value: unknown; // Shape is not constrained by the type - NOTE(review): presumably depends on kind; confirm.
201
+ reason?: string;
202
+ severity?: FeedbackSeverity;
203
+ createdAt: string; // NOTE(review): timestamp string; format is not shown here - confirm.
204
+ metadata?: Record<string, unknown>;
205
+ }
206
+ interface FeedbackAttempt { // One attempt within a trajectory: an artifact tied to a step, plus optional evals and feedback.
207
+ id: string;
208
+ stepIndex: number;
209
+ artifactType: FeedbackArtifactType;
210
+ artifact: unknown; // Untyped payload; artifactType says how to interpret it.
211
+ options?: unknown[];
212
+ proposedAction?: ProposedSideEffect;
213
+ evals?: ControlEvalResult[]; // Reuses the control-loop eval result type.
214
+ feedback?: FeedbackLabel[];
215
+ createdAt: string;
216
+ metadata?: Record<string, unknown>;
217
+ }
218
+ interface FeedbackOutcome { // Observed result attached to a trajectory or a replay; every field is optional.
219
+ success?: boolean;
220
+ score?: number;
221
+ metrics?: Record<string, number>; // Named numeric metrics only.
222
+ costUsd?: number;
223
+ detail?: string;
224
+ observedAt?: string;
225
+ metadata?: Record<string, unknown>;
226
+ }
227
+ interface FeedbackTrajectory { // A task with its attempts, labels and optional outcome; the unit saved by FeedbackTrajectoryStore.
228
+ id: string;
229
+ projectId?: string;
230
+ scenarioId?: string;
231
+ task: FeedbackTask;
232
+ attempts: FeedbackAttempt[];
233
+ labels: FeedbackLabel[]; // Trajectory-level labels; attempts carry their own optional feedback array.
234
+ outcome?: FeedbackOutcome;
235
+ split?: DatasetSplit; // DatasetSplit is imported from the dataset declarations.
236
+ tags?: Record<string, string>;
237
+ createdAt: string;
238
+ updatedAt?: string;
239
+ metadata?: Record<string, unknown>;
240
+ }
241
+ interface FeedbackTrajectoryStore { // Async persistence contract for trajectories; every method returns a Promise.
242
+ save(trajectory: FeedbackTrajectory): Promise<void>;
243
+ get(id: string): Promise<FeedbackTrajectory | null>; // Resolves to null rather than rejecting - NOTE(review): presumably when the id is unknown; confirm.
244
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
245
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>; // Resolves to a trajectory - NOTE(review): presumably the updated one; confirm.
246
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>; // NOTE(review): attemptId presumably targets one attempt instead of the trajectory - confirm.
247
+ }
248
+ interface FeedbackTrajectoryFilter { // Optional criteria accepted by FeedbackTrajectoryStore.list.
249
+ projectId?: string;
250
+ scenarioId?: string;
251
+ split?: DatasetSplit;
252
+ tag?: [string, string]; // NOTE(review): presumably a [key, value] pair matched against FeedbackTrajectory.tags - confirm.
253
+ }
254
+ interface FeedbackSplitPolicy { // Optional per-split shares accepted by assignFeedbackSplit and withAssignedFeedbackSplit.
255
+ trainPct?: number; // NOTE(review): units (0-1 vs 0-100) and defaults are not shown here - confirm.
256
+ devPct?: number;
257
+ testPct?: number;
258
+ holdoutPct?: number;
259
+ }
260
+ interface PreferenceMemoryEntry { // Entry type returned by summarizePreferenceMemory and rendered by renderPreferenceMemoryMarkdown.
261
+ instruction: string;
262
+ rationale: string;
263
+ weight: number;
264
+ sourceTrajectoryId: string; // Links the entry back to a trajectory id.
265
+ sourceLabelId?: string;
266
+ category?: string;
267
+ }
268
+ interface FeedbackOptimizerRow { // Per-trajectory row returned by feedbackTrajectoryToOptimizerRow.
269
+ scenarioId: string; // Required here although FeedbackTrajectory.scenarioId is optional.
270
+ trajectoryId: string;
271
+ labelKinds: FeedbackLabelKind[];
272
+ score?: number;
273
+ metadata?: Record<string, unknown>;
274
+ }
275
+ interface FeedbackReplayResult { // Result of replaying one trajectory, as returned by replayFeedbackTrajectory.
276
+ trajectoryId: string; // Not supplied by the adapter: FeedbackReplayAdapter.replay omits it from its return type.
277
+ pass: boolean;
278
+ score?: number;
279
+ labels: FeedbackLabel[];
280
+ outcome?: FeedbackOutcome;
281
+ metadata?: Record<string, unknown>;
282
+ }
283
+ interface FeedbackReplayAdapter { // Caller-supplied replay implementation used by replayFeedbackTrajectory and replayFeedbackTrajectories.
284
+ replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>; // May return synchronously or as a Promise; the result carries no trajectoryId.
285
+ }
286
+ declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore { // FeedbackTrajectoryStore implementation with no constructor arguments; NOTE(review): by its name, holds data in process memory only - confirm.
287
+ private readonly trajectories; // Private storage; its type is erased in the declaration output.
288
+ save(trajectory: FeedbackTrajectory): Promise<void>;
289
+ get(id: string): Promise<FeedbackTrajectory | null>;
290
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
291
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
292
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
293
+ }
294
+ declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore { // FeedbackTrajectoryStore implementation constructed with a directory path.
295
+ private readonly dir;
296
+ private readonly memory; // NOTE(review): memory, loaded and load suggest a lazily loaded in-memory cache - inferred from names only; confirm.
297
+ private loaded;
298
+ constructor(options: {
299
+ dir: string; // NOTE(review): presumably the directory where trajectories are persisted; on-disk layout is not shown - confirm.
300
+ });
301
+ save(trajectory: FeedbackTrajectory): Promise<void>;
302
+ get(id: string): Promise<FeedbackTrajectory | null>;
303
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
304
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
305
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
306
+ private append;
307
+ private load;
308
+ }
309
+ declare function createFeedbackTrajectory(input: { // Builds a FeedbackTrajectory; only task is required in the input.
310
+ id?: string; // Optional here but required on the result - NOTE(review): presumably generated when omitted; confirm.
311
+ projectId?: string;
312
+ scenarioId?: string;
313
+ task: FeedbackTask;
314
+ attempts?: FeedbackAttempt[]; // Optional here but required on the result - NOTE(review): presumably defaults to an empty array; confirm.
315
+ labels?: FeedbackLabel[];
316
+ outcome?: FeedbackOutcome;
317
+ split?: DatasetSplit;
318
+ tags?: Record<string, string>;
319
+ createdAt?: string; // Optional here but required on the result - NOTE(review): presumably defaults to the current time; confirm.
320
+ metadata?: Record<string, unknown>;
321
+ }): FeedbackTrajectory;
322
+ declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit; // Chooses a DatasetSplit from the id, projectId, scenarioId and task only; NOTE(review): presumably deterministic for equal inputs - confirm.
323
+ declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory; // Returns a trajectory; NOTE(review): presumably a copy with split set via assignFeedbackSplit - confirm whether an existing split is kept.
324
+ declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario; // Converts one trajectory into a DatasetScenario (type imported from the dataset declarations).
325
+ declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[]; // Array form of feedbackTrajectoryToDatasetScenario; NOTE(review): whether output is one-to-one with input is not shown - confirm.
326
+ declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow; // Converts one trajectory into a FeedbackOptimizerRow.
327
+ declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[]; // Array form of feedbackTrajectoryToOptimizerRow; NOTE(review): whether output is one-to-one with input is not shown - confirm.
328
+ declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>; // Replays one trajectory through the adapter; the resolved result includes trajectoryId, which the adapter result lacks.
329
+ declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>; // Replays several trajectories with one adapter; NOTE(review): ordering and sequential-vs-parallel execution are not shown - confirm.
330
+ declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: { // Derives PreferenceMemoryEntry items from trajectories.
331
+ maxEntries?: number; // NOTE(review): presumably caps the number of returned entries; default is not shown - confirm.
332
+ }): PreferenceMemoryEntry[];
333
+ declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string; // Renders the entries to a single string; Markdown per the function name.
334
+ declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string; // Serializes trajectories to one string; NOTE(review): presumably one JSON document per line (JSONL) - confirm.
335
+ declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[]; // Counterpart of serializeFeedbackTrajectoriesJsonl; NOTE(review): validation and malformed-line handling are not shown - confirm.
336
+ declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: { // Converts a control-loop run result into a FeedbackTrajectory; all options are optional.
337
+ projectId?: string;
338
+ scenarioId?: string;
339
+ artifactType?: FeedbackArtifactType;
340
+ artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown; // Per-step extractor; NOTE(review): presumably supplies FeedbackAttempt.artifact - confirm.
341
+ proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined; // Per-step extractor; may return undefined.
342
+ createdAt?: string;
343
+ }): FeedbackTrajectory;
344
+
345
+ export { type ProposedSideEffect as A, allCriticalPassed as B, type ControlSeverity as C, assignFeedbackSplit as D, controlRunToFeedbackTrajectory as E, type FeedbackLabel as F, createFeedbackTrajectory as G, feedbackTrajectoriesToDatasetScenarios as H, InMemoryFeedbackTrajectoryStore as I, feedbackTrajectoriesToOptimizerRows as J, feedbackTrajectoryToDatasetScenario as K, feedbackTrajectoryToOptimizerRow as L, objectiveEval as M, parseFeedbackTrajectoriesJsonl as N, renderPreferenceMemoryMarkdown as O, type PreferenceMemoryEntry as P, replayFeedbackTrajectories as Q, replayFeedbackTrajectory as R, type StopDecision as S, runAgentControlLoop as T, serializeFeedbackTrajectoriesJsonl as U, stopOnNoProgress as V, stopOnRepeatedAction as W, subjectiveEval as X, summarizePreferenceMemory as Y, withAssignedFeedbackSplit as Z, type FeedbackTrajectoryStore as a, type FeedbackTrajectory as b, type ControlEvalResult as c, type ControlActionFailureMode as d, type ControlActionOutcome as e, type ControlBudget as f, type ControlContext as g, type ControlDecision as h, type ControlRunResult as i, type ControlRuntimeConfig as j, type ControlRuntimeError as k, type ControlStep as l, type ControlStopPolicies as m, type FeedbackArtifactType as n, type FeedbackAttempt as o, type FeedbackLabelKind as p, type FeedbackLabelSource as q, type FeedbackOptimizerRow as r, type FeedbackOutcome as s, type FeedbackReplayAdapter as t, type FeedbackReplayResult as u, type FeedbackSeverity as v, type FeedbackSplitPolicy as w, type FeedbackTask as x, type FeedbackTrajectoryFilter as y, FileSystemFeedbackTrajectoryStore as z };
@@ -1,135 +1,4 @@
1
- /**
2
- * Paper-grade RunRecord schema + runtime validator.
3
- *
4
- * Every run that participates in a promotion gate, paper table, or
5
- * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
6
- * fields are exactly those the paper "Two Loops, Three Roles" requires
7
- * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
8
- * holdout split tag and either a `searchScore` or a `holdoutScore`.
9
- *
10
- * This is intentionally NOT a replacement for the rich `Run` /
11
- * `ProposeReviewReport` / `ScenarioResult` types already in the
12
- * package. Those are runtime structures with full provenance. A
13
- * `RunRecord` is the analysis-time projection — the JSON-friendly
14
- * row you'd put in a parquet file or paste into a notebook.
15
- *
16
- * Validate at the boundary:
17
- *
18
- * const rec = validateRunRecord(rawJson) // throws on missing
19
- * const ok = isRunRecord(rawJson) // boolean check
20
- * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }
21
- *
22
- * The validator runs in pure TS — zod is intentionally NOT a
23
- * dependency. Round-trip tested in `tests/run-record.test.ts`.
24
- */
25
- /** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
26
- * combined train+test pool that the optimizer is allowed to read. */
27
- type RunSplitTag = 'search' | 'dev' | 'holdout'; // Type of RunRecord.splitTag.
28
- interface RunTokenUsage { // Token usage breakdown stored in RunRecord.tokenUsage.
29
- input: number;
30
- output: number;
31
- cached?: number; // Optional, unlike input and output.
32
- }
33
- interface RunJudgeMetadata { // Judge-side details stored in the optional RunRecord.judgeMetadata; all four fields are required.
34
- model: string;
35
- promptVersion: string;
36
- /** [0,1] confidence the judge declared. Constant judge confidence
37
- * across many runs is a fallback signal (see `canary.ts`). */
38
- confidence: number;
39
- /** True if the judge degraded to a fallback path (rules-only,
40
- * prior-call cache, etc.). The canary uses this to alert. */
41
- fallback: boolean;
42
- }
43
- interface RunOutcome { // Per-split scores plus a raw metric bag, stored in RunRecord.outcome.
44
- /** Score on the search/optimization split. Optional because a
45
- * holdout-only evaluation only fills `holdoutScore`. */
46
- searchScore?: number;
47
- /** Score on the held-out split. Optional because a search-only run
48
- * only fills `searchScore`. At least one must be present. */
49
- holdoutScore?: number; // The at-least-one rule is not expressed in the type; it has to be checked at runtime.
50
- /** Bag of any other metric the run produced — judge dimensions,
51
- * pass/fail counters, latency stats, etc. Numeric only — keeps
52
- * reporters honest. */
53
- raw: Record<string, number>;
54
- }
55
- /**
56
- * Mandatory paper-grade fields for a single evaluation run. Optional
57
- * fields are extension points; mandatory fields throw if missing.
58
- *
59
- * Hash discipline:
60
- * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
61
- * model (after any steering bundle merge).
62
- * - `configHash` is the sha256 of the effective run config (model,
63
- * temperature, tools, judges, splits). The pair (promptHash,
64
- * configHash) uniquely identifies an experimental cell.
65
- *
66
- * Model snapshot discipline:
67
- * - `model` MUST encode a snapshot version. Bare aliases like
68
- * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
69
- * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
70
- */
71
- interface RunRecord { // Analysis-time row for one evaluation run; only queueMs, judgeMetadata and failureMode are optional.
72
- /** UUID for the run. */
73
- runId: string;
74
- /** Logical experiment grouping (a treatment vs a baseline within
75
- * the same sweep should share `experimentId`). */
76
- experimentId: string;
77
- /** Stable identifier for the candidate (variant) being run. The
78
- * promotion gate compares two `candidateId`s on matched items. */
79
- candidateId: string;
80
- /** RNG seed for the run. Always recorded — silent re-seeding is
81
- * the most common cause of non-reproducible numbers. */
82
- seed: number;
83
- /** Model identifier WITH snapshot version. */
84
- model: string;
85
- /** sha256 of the effective prompt (post-steering). */
86
- promptHash: string;
87
- /** sha256 of the effective config. */
88
- configHash: string;
89
- /** Git SHA the harness was run from. */
90
- commitSha: string;
91
- /** End-to-end wall-clock duration in milliseconds. */
92
- wallMs: number;
93
- /** Time spent queued before execution started, if known. */
94
- queueMs?: number; // NOTE(review): presumably milliseconds, like wallMs - confirm.
95
- /** Total USD cost. Mandatory — runs without a cost number are
96
- * unbounded by definition and must not be admitted into the gate. */
97
- costUsd: number;
98
- /** Token usage breakdown. */
99
- tokenUsage: RunTokenUsage;
100
- /** Judge-side metadata, if a judge was used. */
101
- judgeMetadata?: RunJudgeMetadata;
102
- /** Per-split scores + raw bag. */
103
- outcome: RunOutcome;
104
- /** Categorical failure tag, when the run failed and the harness
105
- * classified it. Free-form string; standard tags live in
106
- * `failure-taxonomy.ts`. */
107
- failureMode?: string;
108
- /** Which split this run was drawn from. */
109
- splitTag: RunSplitTag;
110
- }
111
- declare class RunRecordValidationError extends Error { // Error subclass used by validateRunRecord and by the failure branch of parseRunRecordSafe.
112
- readonly path: string; // NOTE(review): presumably locates the offending field within the record - confirm.
113
- constructor(message: string, path?: string); // path is optional here; the default given to the readonly field is not shown.
114
- }
115
- /**
116
- * Strict validator. Throws `RunRecordValidationError` on the first
117
- * missing or wrongly-typed field. Returns the input cast to
118
- * `RunRecord` on success — the validator does not coerce.
119
- */
120
- declare function validateRunRecord(input: unknown): RunRecord; // input is untyped on purpose: callers pass raw data such as parsed JSON and get a typed RunRecord back or an exception.
121
- /** Boolean validator — convenience for filtering arrays. */
122
- declare function isRunRecord(input: unknown): input is RunRecord; // Type guard: a true result narrows input to RunRecord at the call site.
123
- /** Non-throwing validator — returns a discriminated union. */
124
- declare function parseRunRecordSafe(input: unknown): { // Result is discriminated on ok: value exists when true, error when false.
125
- ok: true;
126
- value: RunRecord;
127
- } | {
128
- ok: false;
129
- error: RunRecordValidationError;
130
- };
131
- /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
132
- declare function roundTripRunRecord(record: RunRecord): RunRecord; // Returns the validated copy; NOTE(review): presumably throws like validateRunRecord if the serialized copy fails validation - confirm.
1
+ import { R as RunSplitTag } from './run-record-CX_jcAyr.js';
133
2
 
134
3
  /**
135
4
  * Shared types for the reference benchmark wrappers under
@@ -287,4 +156,4 @@ declare namespace index {
287
156
  export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
288
157
  }
289
158
 
290
- export { BENCHMARK_SPLIT_SEED as B, type RunRecord as R, type RunSplitTag as a, type BenchmarkAdapter as b, type BenchmarkDatasetItem as c, type BenchmarkEvaluation as d, type RunJudgeMetadata as e, type RunOutcome as f, RunRecordValidationError as g, type RunTokenUsage as h, deterministicSplit as i, index as j, isRunRecord as k, index$1 as l, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
159
+ export { BENCHMARK_SPLIT_SEED as B, type BenchmarkAdapter as a, type BenchmarkDatasetItem as b, type BenchmarkEvaluation as c, deterministicSplit as d, index$1 as e, index as i };