@tangle-network/agent-eval 0.14.2 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -0
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-PZ5AY32C.js.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +963 -4
- package/dist/index.js +1456 -132
- package/dist/index.js.map +1 -1
- package/dist/telemetry/file.js +2 -0
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +2 -0
- package/dist/telemetry/index.js.map +1 -1
- package/dist/wire/index.js +1 -0
- package/package.json +10 -12
package/dist/index.d.ts
CHANGED
|
@@ -1215,7 +1215,7 @@ interface BudgetSpec {
|
|
|
1215
1215
|
calls?: number;
|
|
1216
1216
|
usd?: number;
|
|
1217
1217
|
}
|
|
1218
|
-
interface RunOutcome {
|
|
1218
|
+
interface RunOutcome$1 {
|
|
1219
1219
|
score?: number;
|
|
1220
1220
|
pass?: boolean;
|
|
1221
1221
|
failureClass?: FailureClass;
|
|
@@ -1257,7 +1257,7 @@ interface Run {
|
|
|
1257
1257
|
startedAt: number;
|
|
1258
1258
|
endedAt?: number;
|
|
1259
1259
|
status: RunStatus;
|
|
1260
|
-
outcome?: RunOutcome;
|
|
1260
|
+
outcome?: RunOutcome$1;
|
|
1261
1261
|
budget?: BudgetSpec;
|
|
1262
1262
|
/** Free-form labels for downstream grouping. */
|
|
1263
1263
|
tags?: Record<string, string>;
|
|
@@ -1514,7 +1514,7 @@ declare class TraceEmitter {
|
|
|
1514
1514
|
constructor(store: TraceStore, options?: TraceEmitterOptions);
|
|
1515
1515
|
get runId(): string;
|
|
1516
1516
|
startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
|
|
1517
|
-
endRun(outcome?: RunOutcome): Promise<void>;
|
|
1517
|
+
endRun(outcome?: RunOutcome$1): Promise<void>;
|
|
1518
1518
|
abortRun(reason: string): Promise<void>;
|
|
1519
1519
|
span<S extends Span = Span>(init: {
|
|
1520
1520
|
kind: SpanKind;
|
|
@@ -5907,6 +5907,35 @@ interface ViteDeployRunnerInput {
|
|
|
5907
5907
|
* factory for {@link deployGateLayer}.
|
|
5908
5908
|
*/
|
|
5909
5909
|
declare function viteDeployRunner(input: ViteDeployRunnerInput): DeployRunner;
|
|
5910
|
+
interface WranglerDeployRunnerInput {
|
|
5911
|
+
workdir: string;
|
|
5912
|
+
exec: (cmd: string, opts?: {
|
|
5913
|
+
cwd?: string;
|
|
5914
|
+
timeoutMs?: number;
|
|
5915
|
+
}) => Promise<{
|
|
5916
|
+
stdout: string;
|
|
5917
|
+
stderr: string;
|
|
5918
|
+
exitCode: number;
|
|
5919
|
+
}>;
|
|
5920
|
+
exists: (relativePath: string) => boolean | Promise<boolean>;
|
|
5921
|
+
/** Build command. Default `npm run build`. */
|
|
5922
|
+
buildCommand?: string;
|
|
5923
|
+
/** Wrangler dry-run command. Default `npx wrangler deploy --dry-run --outdir dist`. */
|
|
5924
|
+
dryRunCommand?: string;
|
|
5925
|
+
/** Per-step timeout in milliseconds. Default 120000 (120 s). */
|
|
5926
|
+
timeoutMs?: number;
|
|
5927
|
+
}
|
|
5928
|
+
/**
|
|
5929
|
+
* Canonical runner for the `fullstack-ts` family on Cloudflare Workers
|
|
5930
|
+
* (Remix / React Router v7 / Hono on Workers). Detects wrangler.toml or
|
|
5931
|
+
* wrangler.jsonc in the workdir, builds, then `wrangler deploy --dry-run`
|
|
5932
|
+
* to catch missing bindings, syntax errors in wrangler config, and
|
|
5933
|
+
* import-time crashes that don't surface in `tsc`.
|
|
5934
|
+
*
|
|
5935
|
+
* No wrangler config = skip with "no wrangler" evidence (not a failure
|
|
5936
|
+
* — the gate caller decides whether to require deploy validation).
|
|
5937
|
+
*/
|
|
5938
|
+
declare function wranglerDeployRunner(input: WranglerDeployRunnerInput): DeployRunner;
|
|
5910
5939
|
|
|
5911
5940
|
/**
|
|
5912
5941
|
* Keyword-coverage judge — baseline complement to the semantic concept
|
|
@@ -6220,6 +6249,936 @@ declare function compareReferenceReplay(baseline: ReferenceReplayScore, candidat
|
|
|
6220
6249
|
declare function decideReferenceReplayPromotion(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
|
|
6221
6250
|
declare function defaultReferenceReplayMatcher(reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate): ReferenceMatchResult;
|
|
6222
6251
|
|
|
6252
|
+
/**
|
|
6253
|
+
* Paper-grade paired statistics for held-out promotion gates.
|
|
6254
|
+
*
|
|
6255
|
+
* The promotion gate (`HeldOutGate`) needs three things:
|
|
6256
|
+
*
|
|
6257
|
+
* 1. A bootstrap confidence interval on the per-item paired delta
|
|
6258
|
+
* (`pairedBootstrap`). Median delta is the headline number; the
|
|
6259
|
+
* CI lower bound is what the gate checks against `pairedDeltaThreshold`.
|
|
6260
|
+
* 2. A non-parametric significance test on the paired deltas
|
|
6261
|
+
* (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
|
|
6262
|
+
* paper-style name).
|
|
6263
|
+
* 3. False-discovery-rate correction across simultaneously-tested
|
|
6264
|
+
* candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
|
|
6265
|
+
*
|
|
6266
|
+
* Why a separate file: every existing primitive lives in `statistics.ts`
|
|
6267
|
+
* (general) or `power-analysis.ts` (correction). Paired-bootstrap is
|
|
6268
|
+
* paired-only, paper-grade, and load-bearing for the promotion gate.
|
|
6269
|
+
* Putting it inside `statistics.ts` would require editing that file;
|
|
6270
|
+
* the brief forbids that. New file, new exports, no change to the existing surface.
|
|
6271
|
+
*/
|
|
6272
|
+
interface PairedBootstrapResult {
|
|
6273
|
+
/** Number of paired observations (unequal-length inputs are rejected, not truncated). */
|
|
6274
|
+
n: number;
|
|
6275
|
+
/** Median of paired deltas (after − before). */
|
|
6276
|
+
median: number;
|
|
6277
|
+
/** Mean of paired deltas. */
|
|
6278
|
+
mean: number;
|
|
6279
|
+
/** Lower bound of the bootstrap CI on the median delta. */
|
|
6280
|
+
low: number;
|
|
6281
|
+
/** Upper bound of the bootstrap CI on the median delta. */
|
|
6282
|
+
high: number;
|
|
6283
|
+
/** Confidence level used (e.g. 0.95). */
|
|
6284
|
+
confidence: number;
|
|
6285
|
+
/** Number of bootstrap resamples used. */
|
|
6286
|
+
resamples: number;
|
|
6287
|
+
}
|
|
6288
|
+
interface PairedBootstrapOptions {
|
|
6289
|
+
/** Confidence level. Default 0.95. */
|
|
6290
|
+
confidence?: number;
|
|
6291
|
+
/** Bootstrap resample count. Default 2000. */
|
|
6292
|
+
resamples?: number;
|
|
6293
|
+
/** Statistic to bootstrap. Default 'median'. */
|
|
6294
|
+
statistic?: 'median' | 'mean';
|
|
6295
|
+
/** Deterministic seed. If omitted, uses Math.random(). */
|
|
6296
|
+
seed?: number;
|
|
6297
|
+
}
|
|
6298
|
+
/**
|
|
6299
|
+
* Paired bootstrap on (after - before) deltas. Returns a CI on the
|
|
6300
|
+
* chosen statistic (median by default). Pairs are resampled with
|
|
6301
|
+
* replacement. The lower bound is what the promotion gate checks: if
|
|
6302
|
+
* `low > pairedDeltaThreshold`, the gain is real at the chosen
|
|
6303
|
+
* confidence level.
|
|
6304
|
+
*
|
|
6305
|
+
* Throws on unequal sample sizes — caller must align pairs upstream.
|
|
6306
|
+
*/
|
|
6307
|
+
declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
|
|
6308
|
+
/**
|
|
6309
|
+
* Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
|
|
6310
|
+
* paired deltas is the standard non-parametric significance test for
|
|
6311
|
+
* "candidate beats baseline on matched items." Use alongside the
|
|
6312
|
+
* bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
|
|
6313
|
+
*/
|
|
6314
|
+
declare function pairedWilcoxon(before: number[], after: number[]): {
|
|
6315
|
+
w: number;
|
|
6316
|
+
p: number;
|
|
6317
|
+
};
|
|
6318
|
+
/**
|
|
6319
|
+
* Paper-style alias for `benjaminiHochberg`. Use to correct p-values
|
|
6320
|
+
* across multiple candidate-vs-baseline comparisons run in the same
|
|
6321
|
+
* promotion sweep. Returns BH-adjusted q-values and significance at
|
|
6322
|
+
* the requested FDR (default 0.05).
|
|
6323
|
+
*/
|
|
6324
|
+
declare function bhAdjust(pValues: number[], fdr?: number): {
|
|
6325
|
+
qValues: number[];
|
|
6326
|
+
significant: boolean[];
|
|
6327
|
+
};
|
|
6328
|
+
|
|
6329
|
+
/**
|
|
6330
|
+
* Paper-grade RunRecord schema + runtime validator.
|
|
6331
|
+
*
|
|
6332
|
+
* Every run that participates in a promotion gate, paper table, or
|
|
6333
|
+
* researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
|
|
6334
|
+
* fields are exactly those the paper "Two Loops, Three Roles" requires
|
|
6335
|
+
* for reproducibility: who/what/when/cost/seed/hash, plus the search vs
|
|
6336
|
+
* holdout split tag and either a `searchScore` or a `holdoutScore`.
|
|
6337
|
+
*
|
|
6338
|
+
* This is intentionally NOT a replacement for the rich `Run` /
|
|
6339
|
+
* `ProposeReviewReport` / `ScenarioResult` types already in the
|
|
6340
|
+
* package. Those are runtime structures with full provenance. A
|
|
6341
|
+
* `RunRecord` is the analysis-time projection — the JSON-friendly
|
|
6342
|
+
* row you'd put in a parquet file or paste into a notebook.
|
|
6343
|
+
*
|
|
6344
|
+
* Validate at the boundary:
|
|
6345
|
+
*
|
|
6346
|
+
* const rec = validateRunRecord(rawJson) // throws on a missing or wrongly-typed field
|
|
6347
|
+
* const ok = isRunRecord(rawJson) // boolean check
|
|
6348
|
+
* const rec = parseRunRecordSafe(rawJson) // { ok, value | error }
|
|
6349
|
+
*
|
|
6350
|
+
* The validator runs in pure TS — zod is intentionally NOT a
|
|
6351
|
+
* dependency. Round-trip tested in `tests/run-record.test.ts`.
|
|
6352
|
+
*/
|
|
6353
|
+
/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
|
|
6354
|
+
* combined train+test pool that the optimizer is allowed to read. */
|
|
6355
|
+
type RunSplitTag = 'search' | 'dev' | 'holdout';
|
|
6356
|
+
interface RunTokenUsage {
|
|
6357
|
+
input: number;
|
|
6358
|
+
output: number;
|
|
6359
|
+
cached?: number;
|
|
6360
|
+
}
|
|
6361
|
+
interface RunJudgeMetadata {
|
|
6362
|
+
model: string;
|
|
6363
|
+
promptVersion: string;
|
|
6364
|
+
/** [0,1] confidence the judge declared. Constant judge confidence
|
|
6365
|
+
* across many runs is a fallback signal (see `canary.ts`). */
|
|
6366
|
+
confidence: number;
|
|
6367
|
+
/** True if the judge degraded to a fallback path (rules-only,
|
|
6368
|
+
* prior-call cache, etc.). The canary uses this to alert. */
|
|
6369
|
+
fallback: boolean;
|
|
6370
|
+
}
|
|
6371
|
+
interface RunOutcome {
|
|
6372
|
+
/** Score on the search/optimization split. Optional because a
|
|
6373
|
+
* holdout-only evaluation only fills `holdoutScore`. */
|
|
6374
|
+
searchScore?: number;
|
|
6375
|
+
/** Score on the held-out split. Optional because a search-only run
|
|
6376
|
+
* only fills `searchScore`. At least one must be present. */
|
|
6377
|
+
holdoutScore?: number;
|
|
6378
|
+
/** Bag of any other metric the run produced — judge dimensions,
|
|
6379
|
+
* pass/fail counters, latency stats, etc. Numeric only — keeps
|
|
6380
|
+
* reporters honest. */
|
|
6381
|
+
raw: Record<string, number>;
|
|
6382
|
+
}
|
|
6383
|
+
/**
|
|
6384
|
+
* Mandatory paper-grade fields for a single evaluation run. Optional
|
|
6385
|
+
* fields are extension points; validation throws if a mandatory field is missing.
|
|
6386
|
+
*
|
|
6387
|
+
* Hash discipline:
|
|
6388
|
+
* - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
|
|
6389
|
+
* model (after any steering bundle merge).
|
|
6390
|
+
* - `configHash` is the sha256 of the effective run config (model,
|
|
6391
|
+
* temperature, tools, judges, splits). The pair (promptHash,
|
|
6392
|
+
* configHash) uniquely identifies an experimental cell.
|
|
6393
|
+
*
|
|
6394
|
+
* Model snapshot discipline:
|
|
6395
|
+
* - `model` MUST encode a snapshot version. Bare aliases like
|
|
6396
|
+
* `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
|
|
6397
|
+
* Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
|
|
6398
|
+
*/
|
|
6399
|
+
interface RunRecord {
|
|
6400
|
+
/** UUID for the run. */
|
|
6401
|
+
runId: string;
|
|
6402
|
+
/** Logical experiment grouping (a treatment vs a baseline within
|
|
6403
|
+
* the same sweep should share `experimentId`). */
|
|
6404
|
+
experimentId: string;
|
|
6405
|
+
/** Stable identifier for the candidate (variant) being run. The
|
|
6406
|
+
* promotion gate compares two `candidateId`s on matched items. */
|
|
6407
|
+
candidateId: string;
|
|
6408
|
+
/** RNG seed for the run. Always recorded — silent re-seeding is
|
|
6409
|
+
* the most common cause of non-reproducible numbers. */
|
|
6410
|
+
seed: number;
|
|
6411
|
+
/** Model identifier WITH snapshot version. */
|
|
6412
|
+
model: string;
|
|
6413
|
+
/** sha256 of the effective prompt (post-steering). */
|
|
6414
|
+
promptHash: string;
|
|
6415
|
+
/** sha256 of the effective config. */
|
|
6416
|
+
configHash: string;
|
|
6417
|
+
/** Git SHA the harness was run from. */
|
|
6418
|
+
commitSha: string;
|
|
6419
|
+
/** End-to-end wall-clock duration in milliseconds. */
|
|
6420
|
+
wallMs: number;
|
|
6421
|
+
/** Time spent queued before execution started, if known. */
|
|
6422
|
+
queueMs?: number;
|
|
6423
|
+
/** Total USD cost. Mandatory — runs without a cost number are
|
|
6424
|
+
* unbounded by definition and must not be admitted into the gate. */
|
|
6425
|
+
costUsd: number;
|
|
6426
|
+
/** Token usage breakdown. */
|
|
6427
|
+
tokenUsage: RunTokenUsage;
|
|
6428
|
+
/** Judge-side metadata, if a judge was used. */
|
|
6429
|
+
judgeMetadata?: RunJudgeMetadata;
|
|
6430
|
+
/** Per-split scores + raw bag. */
|
|
6431
|
+
outcome: RunOutcome;
|
|
6432
|
+
/** Categorical failure tag, when the run failed and the harness
|
|
6433
|
+
* classified it. Free-form string; standard tags live in
|
|
6434
|
+
* `failure-taxonomy.ts`. */
|
|
6435
|
+
failureMode?: string;
|
|
6436
|
+
/** Which split this run was drawn from. */
|
|
6437
|
+
splitTag: RunSplitTag;
|
|
6438
|
+
}
|
|
6439
|
+
declare class RunRecordValidationError extends Error {
|
|
6440
|
+
readonly path: string;
|
|
6441
|
+
constructor(message: string, path?: string);
|
|
6442
|
+
}
|
|
6443
|
+
/**
|
|
6444
|
+
* Strict validator. Throws `RunRecordValidationError` on the first
|
|
6445
|
+
* missing or wrongly-typed field. Returns the input cast to
|
|
6446
|
+
* `RunRecord` on success — the validator does not coerce.
|
|
6447
|
+
*/
|
|
6448
|
+
declare function validateRunRecord(input: unknown): RunRecord;
|
|
6449
|
+
/** Boolean validator — convenience for filtering arrays. */
|
|
6450
|
+
declare function isRunRecord(input: unknown): input is RunRecord;
|
|
6451
|
+
/** Non-throwing validator — returns a discriminated union. */
|
|
6452
|
+
declare function parseRunRecordSafe(input: unknown): {
|
|
6453
|
+
ok: true;
|
|
6454
|
+
value: RunRecord;
|
|
6455
|
+
} | {
|
|
6456
|
+
ok: false;
|
|
6457
|
+
error: RunRecordValidationError;
|
|
6458
|
+
};
|
|
6459
|
+
/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
|
|
6460
|
+
declare function roundTripRunRecord(record: RunRecord): RunRecord;
|
|
6461
|
+
|
|
6462
|
+
/**
|
|
6463
|
+
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
6464
|
+
*
|
|
6465
|
+
* Encodes the "honesty override" pattern that lived inline in
|
|
6466
|
+
* `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
|
|
6467
|
+
* The optimizer's best-guess is one thing; what we should actually
|
|
6468
|
+
* ship is another. The gate is the line between them.
|
|
6469
|
+
*
|
|
6470
|
+
* A candidate is promoted iff ALL three pass:
|
|
6471
|
+
*
|
|
6472
|
+
* 1. **Productive runs**: the candidate has at least
|
|
6473
|
+
* `minProductiveRuns` paired observations on items where BOTH
|
|
6474
|
+
* candidate and baseline produced a real (non-silent) score.
|
|
6475
|
+
* 2. **Paired delta**: the lower bound of the bootstrap CI on the
|
|
6476
|
+
* median per-item delta (candidate − baseline) on the HOLDOUT
|
|
6477
|
+
* split is strictly greater than `pairedDeltaThreshold`.
|
|
6478
|
+
* 3. **Overfit gap**: the candidate's gap between search-split
|
|
6479
|
+
* score and holdout-split score is no worse (more positive)
|
|
6480
|
+
* than the baseline's gap by more than `overfitGapThreshold`.
|
|
6481
|
+
* "Better on search, worse on holdout" is the canonical
|
|
6482
|
+
* overfit pattern; this catches it.
|
|
6483
|
+
*
|
|
6484
|
+
* The decision carries a machine-readable `rejectionCode` plus an
|
|
6485
|
+
* `evidence` block with every number the gate looked at, so the
|
|
6486
|
+
* downstream researcher / paper / dashboard can re-derive the
|
|
6487
|
+
* verdict without re-running.
|
|
6488
|
+
*
|
|
6489
|
+
* See also:
|
|
6490
|
+
* - `src/paired-stats.ts` for `pairedBootstrap` + `pairedWilcoxon`
|
|
6491
|
+
* - `src/run-record.ts` for the input row schema
|
|
6492
|
+
* - `src/reference-replay.ts` for the older, reference-replay-
|
|
6493
|
+
* specific promotion path (still useful for replay-style evals).
|
|
6494
|
+
*/
|
|
6495
|
+
|
|
6496
|
+
type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
|
|
6497
|
+
interface HeldOutGateConfig {
|
|
6498
|
+
/** Minimum number of paired (candidate, baseline) holdout observations
|
|
6499
|
+
* required before the gate will even consider promoting. Default 3. */
|
|
6500
|
+
minProductiveRuns?: number;
|
|
6501
|
+
/** The bootstrap-CI lower bound on the median paired holdout delta
|
|
6502
|
+
* must exceed this to promote. Default 0. */
|
|
6503
|
+
pairedDeltaThreshold?: number;
|
|
6504
|
+
/** Maximum allowed worsening of (search − holdout) gap relative to
|
|
6505
|
+
* baseline. Default 0.15 (i.e. candidate may overfit by up to 0.15
|
|
6506
|
+
* absolute score points more than baseline before rejection). */
|
|
6507
|
+
overfitGapThreshold?: number;
|
|
6508
|
+
/** Stable label of the baseline candidate. Required — paper-grade
|
|
6509
|
+
* evaluation never compares two unlabelled candidates. */
|
|
6510
|
+
baselineKey: string;
|
|
6511
|
+
/** Confidence level for the bootstrap CI. Default 0.95. */
|
|
6512
|
+
confidence?: number;
|
|
6513
|
+
/** Bootstrap resamples. Default 2000. */
|
|
6514
|
+
bootstrapResamples?: number;
|
|
6515
|
+
/** Optional deterministic seed for the bootstrap. Default undefined
|
|
6516
|
+
* (Math.random). */
|
|
6517
|
+
seed?: number;
|
|
6518
|
+
}
|
|
6519
|
+
interface GateEvidence {
|
|
6520
|
+
/** Number of paired (candidate, baseline) holdout observations used. */
|
|
6521
|
+
productiveRuns: number;
|
|
6522
|
+
/** Median of (candidate − baseline) paired holdout deltas. */
|
|
6523
|
+
medianPairedDelta: number;
|
|
6524
|
+
/** Bootstrap CI on the median paired holdout delta. */
|
|
6525
|
+
pairedCI: {
|
|
6526
|
+
low: number;
|
|
6527
|
+
high: number;
|
|
6528
|
+
};
|
|
6529
|
+
/** Wilcoxon signed-rank p-value on the paired holdout deltas. */
|
|
6530
|
+
pairedPValue: number;
|
|
6531
|
+
/** Mean candidate score on the search split (NaN if none). */
|
|
6532
|
+
searchScore: number;
|
|
6533
|
+
/** Mean candidate score on the holdout split (NaN if none). */
|
|
6534
|
+
holdoutScore: number;
|
|
6535
|
+
/** Candidate (search − holdout) gap. */
|
|
6536
|
+
overfitGap: number;
|
|
6537
|
+
/** Baseline (search − holdout) gap. */
|
|
6538
|
+
baselineOverfitGap: number;
|
|
6539
|
+
}
|
|
6540
|
+
interface GateDecision {
|
|
6541
|
+
/** Final promote/no-promote verdict. */
|
|
6542
|
+
promote: boolean;
|
|
6543
|
+
/** The candidate that was evaluated. */
|
|
6544
|
+
candidateId: string;
|
|
6545
|
+
/** The baseline it was compared against. */
|
|
6546
|
+
baselineId: string;
|
|
6547
|
+
/** Every number the gate looked at, for audit + paper export. */
|
|
6548
|
+
evidence: GateEvidence;
|
|
6549
|
+
/** Human-readable reason. */
|
|
6550
|
+
reason: string;
|
|
6551
|
+
/** Machine-readable rejection code, or null on promote. */
|
|
6552
|
+
rejectionCode: HeldOutGateRejectionCode | null;
|
|
6553
|
+
}
|
|
6554
|
+
/**
|
|
6555
|
+
* Held-out paired-delta promotion gate. Construct once with config,
|
|
6556
|
+
* call `evaluate(candidateRuns, baselineRuns)` per (candidate,
|
|
6557
|
+
* baseline) pair. Stateless across calls.
|
|
6558
|
+
*/
|
|
6559
|
+
declare class HeldOutGate {
|
|
6560
|
+
private readonly minProductiveRuns;
|
|
6561
|
+
private readonly pairedDeltaThreshold;
|
|
6562
|
+
private readonly overfitGapThreshold;
|
|
6563
|
+
private readonly baselineKey;
|
|
6564
|
+
private readonly confidence;
|
|
6565
|
+
private readonly resamples;
|
|
6566
|
+
private readonly seed?;
|
|
6567
|
+
constructor(config: HeldOutGateConfig);
|
|
6568
|
+
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
6569
|
+
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
6570
|
+
* the candidate run with the matching baseline run. Pairs without
|
|
6571
|
+
* a holdout score on both sides are dropped. */
|
|
6572
|
+
evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
|
|
6573
|
+
}
|
|
6574
|
+
|
|
6575
|
+
/**
|
|
6576
|
+
* Researcher interface — stable hook for an external autonomous-research
|
|
6577
|
+
* agent to drive the meta-loop.
|
|
6578
|
+
*
|
|
6579
|
+
* Implementations live downstream (typically in a private repo that
|
|
6580
|
+
* runs the actual LLM). This package ships only the contract + a
|
|
6581
|
+
* `NoopResearcher` so consumers can wire the surface without being
|
|
6582
|
+
* forced to implement every method up front.
|
|
6583
|
+
*
|
|
6584
|
+
* The four methods mirror the four stages of the paper "Two Loops,
|
|
6585
|
+
* Three Roles":
|
|
6586
|
+
*
|
|
6587
|
+
* inspectFailures — given the observed runs, what failure modes
|
|
6588
|
+
* are present? (data → diagnosis)
|
|
6589
|
+
* proposeChange — given diagnosed failure modes, what
|
|
6590
|
+
* structural changes should we try?
|
|
6591
|
+
* (diagnosis → plan delta)
|
|
6592
|
+
* applyChange — fold the proposed deltas into a concrete
|
|
6593
|
+
* experiment plan against an existing baseline.
|
|
6594
|
+
* (plan delta → executable plan)
|
|
6595
|
+
* evaluateChange — run the plan, return runs + the gate verdict.
|
|
6596
|
+
* (executable plan → verdict)
|
|
6597
|
+
*
|
|
6598
|
+
* Composition is the discipline: a Researcher implementation MUST
|
|
6599
|
+
* keep these four steps separate and inspectable. Conflating
|
|
6600
|
+
* "diagnose + propose + run" into a single LLM call defeats the
|
|
6601
|
+
* point of the framework — you can't audit which step lied.
|
|
6602
|
+
*
|
|
6603
|
+
* THIS INTERFACE IS STABLE. Breaking changes require a new module
|
|
6604
|
+
* (e.g. `Researcher2`) so existing implementations keep working.
|
|
6605
|
+
*/
|
|
6606
|
+
|
|
6607
|
+
/** A diagnosed failure mode with the run-IDs that exhibit it. */
|
|
6608
|
+
interface FailureMode {
|
|
6609
|
+
/** Short machine-readable code. Must be stable across runs of the
|
|
6610
|
+
* same researcher to enable longitudinal tracking. */
|
|
6611
|
+
code: string;
|
|
6612
|
+
/** Human-readable description for the paper / dashboard. */
|
|
6613
|
+
description: string;
|
|
6614
|
+
evidence: {
|
|
6615
|
+
/** Run IDs (from `RunRecord.runId`) where this failure mode was
|
|
6616
|
+
* observed. */
|
|
6617
|
+
runIds: string[];
|
|
6618
|
+
/** Number of run samples that informed the diagnosis. */
|
|
6619
|
+
samples: number;
|
|
6620
|
+
};
|
|
6621
|
+
}
|
|
6622
|
+
/** A single steering change the researcher wants to try. */
|
|
6623
|
+
interface SteeringChange {
|
|
6624
|
+
kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
|
|
6625
|
+
/** Implementation-specific payload. Researcher implementations
|
|
6626
|
+
* define the schema — keep this `unknown` here to avoid coupling
|
|
6627
|
+
* the public interface to any one researcher's internal model. */
|
|
6628
|
+
payload: unknown;
|
|
6629
|
+
/** Why the researcher proposed this change. Goes into the audit
|
|
6630
|
+
* trail next to the failure-mode evidence. */
|
|
6631
|
+
rationale: string;
|
|
6632
|
+
/** Optional self-reported expected delta on the headline metric. */
|
|
6633
|
+
expectedDelta?: number;
|
|
6634
|
+
}
|
|
6635
|
+
/** A single experiment plan, mapped onto the search/holdout splits. */
|
|
6636
|
+
interface ExperimentPlan {
|
|
6637
|
+
baselineCandidateId: string;
|
|
6638
|
+
proposedCandidateId: string;
|
|
6639
|
+
changes: SteeringChange[];
|
|
6640
|
+
/** USD ceiling for the entire experiment. The runner must stop
|
|
6641
|
+
* before exceeding this and report a partial result. */
|
|
6642
|
+
evaluationBudgetUsd: number;
|
|
6643
|
+
/** Item IDs (your dataset keys) for the search vs holdout splits. */
|
|
6644
|
+
splits: {
|
|
6645
|
+
search: string[];
|
|
6646
|
+
holdout: string[];
|
|
6647
|
+
};
|
|
6648
|
+
}
|
|
6649
|
+
/** Result of running a plan: every run, plus the gate verdict. */
|
|
6650
|
+
interface ExperimentResult {
|
|
6651
|
+
plan: ExperimentPlan;
|
|
6652
|
+
runs: RunRecord[];
|
|
6653
|
+
gateDecision: GateDecision;
|
|
6654
|
+
}
|
|
6655
|
+
/**
|
|
6656
|
+
* The researcher loop. Stable, four-step, inspectable.
|
|
6657
|
+
*
|
|
6658
|
+
* ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
|
|
6659
|
+
* │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
|
|
6660
|
+
* └──────────┘ └──────────┘ └────┬─────┘
|
|
6661
|
+
* │
|
|
6662
|
+
* ▼
|
|
6663
|
+
* ┌────────────────┐ applyChange ┌────────┐
|
|
6664
|
+
* │ ExperimentPlan │ ◀────────────│ base │
|
|
6665
|
+
* └────────┬───────┘ └────────┘
|
|
6666
|
+
* │
|
|
6667
|
+
* evaluateChange ▼
|
|
6668
|
+
* ┌────────────────┐
|
|
6669
|
+
* │ ExperimentResult│
|
|
6670
|
+
* └────────────────┘
|
|
6671
|
+
*/
|
|
6672
|
+
interface Researcher {
|
|
6673
|
+
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
6674
|
+
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
6675
|
+
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
6676
|
+
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
6677
|
+
}
|
|
6678
|
+
/**
|
|
6679
|
+
* No-op researcher — fails loud on every method. Use as a placeholder
|
|
6680
|
+
* in code paths that wire the interface but don't have an implementation
|
|
6681
|
+
* yet. Importantly, this does NOT silently succeed: a no-op researcher
|
|
6682
|
+
* that returned empty arrays would muffle the loop's signal that
|
|
6683
|
+
* nobody implemented the brain.
|
|
6684
|
+
*/
|
|
6685
|
+
declare class NoopResearcher implements Researcher {
|
|
6686
|
+
private readonly hint;
|
|
6687
|
+
constructor(hint?: string);
|
|
6688
|
+
inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
|
|
6689
|
+
proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
6690
|
+
applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
6691
|
+
evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
6692
|
+
}
|
|
6693
|
+
|
|
6694
|
+
/**
|
|
6695
|
+
* Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
|
|
6696
|
+
* than replacing it.
|
|
6697
|
+
*
|
|
6698
|
+
* Three artefacts:
|
|
6699
|
+
*
|
|
6700
|
+
* - `summaryTable` Markdown table of per-candidate means,
|
|
6701
|
+
* 95% bootstrap CIs, BH-adjusted Wilcoxon
|
|
6702
|
+
* p-values, and Cohen's d versus a
|
|
6703
|
+
* comparator candidate.
|
|
6704
|
+
* - `paretoChart` Abstract spec for a cost vs quality
|
|
6705
|
+
* scatter, with gate decisions overlaid.
|
|
6706
|
+
* Returns numbers + labels — caller
|
|
6707
|
+
* chooses the plotting library.
|
|
6708
|
+
* - `gainHistogram`
|
|
6709
|
+
* Per-item paired holdout deltas as a
|
|
6710
|
+
* histogram spec (bins + counts + median +
|
|
6711
|
+
* CI). Same "data, not images" contract.
|
|
6712
|
+
*
|
|
6713
|
+
* The figure types are PlotSpecs — JSON-friendly, library-agnostic.
|
|
6714
|
+
* They aren't React components and they aren't PNGs; they are
|
|
6715
|
+
* what you'd hand to vega-lite, plotly, matplotlib, or your own
|
|
6716
|
+
* Canvas renderer to draw the actual figure.
|
|
6717
|
+
*/
|
|
6718
|
+
|
|
6719
|
+
interface SummaryTableOptions {
|
|
6720
|
+
/** Comparator candidate id. Wilcoxon + Cohen's d are computed
|
|
6721
|
+
* versus this candidate. Required for paired stats columns. */
|
|
6722
|
+
comparator?: string;
|
|
6723
|
+
/** Which split to read scores from. Default 'holdout'. */
|
|
6724
|
+
split?: 'search' | 'holdout';
|
|
6725
|
+
/** Confidence level for the bootstrap CI on the mean. Default 0.95. */
|
|
6726
|
+
confidence?: number;
|
|
6727
|
+
/** FDR for BH adjustment of the comparison p-values. Default 0.05. */
|
|
6728
|
+
fdr?: number;
|
|
6729
|
+
}
|
|
6730
|
+
interface SummaryTableRow {
|
|
6731
|
+
candidateId: string;
|
|
6732
|
+
n: number;
|
|
6733
|
+
mean: number;
|
|
6734
|
+
ciLow: number;
|
|
6735
|
+
ciHigh: number;
|
|
6736
|
+
/** BH-adjusted q-value vs comparator. NaN if no comparator. */
|
|
6737
|
+
qValue: number;
|
|
6738
|
+
/** Cohen's d vs comparator. NaN if no comparator. */
|
|
6739
|
+
cohensD: number;
|
|
6740
|
+
}
|
|
6741
|
+
interface SummaryTable {
|
|
6742
|
+
rows: SummaryTableRow[];
|
|
6743
|
+
comparator: string | null;
|
|
6744
|
+
split: 'search' | 'holdout';
|
|
6745
|
+
/** Pre-rendered markdown — drop into a paper or PR. */
|
|
6746
|
+
markdown: string;
|
|
6747
|
+
}
|
|
6748
|
+
/**
|
|
6749
|
+
* Table 1 helper. Buckets runs by `candidateId`, computes mean +
|
|
6750
|
+
* bootstrap CI on the chosen split, and (when a comparator is given)
|
|
6751
|
+
* BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
|
|
6752
|
+
*/
|
|
6753
|
+
declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
|
|
6754
|
+
interface ParetoPoint {
|
|
6755
|
+
candidateId: string;
|
|
6756
|
+
/** Mean USD cost per run on the chosen split. */
|
|
6757
|
+
cost: number;
|
|
6758
|
+
/** Mean score on the chosen split. */
|
|
6759
|
+
quality: number;
|
|
6760
|
+
/** Number of runs that informed this point. */
|
|
6761
|
+
n: number;
|
|
6762
|
+
/** Whether this candidate is on the Pareto frontier — high
|
|
6763
|
+
* quality, low cost, no dominator. */
|
|
6764
|
+
onFrontier: boolean;
|
|
6765
|
+
/** Optional gate verdict for this candidate, if a `GateDecision`
|
|
6766
|
+
* for it was passed in. */
|
|
6767
|
+
gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
|
|
6768
|
+
}
|
|
6769
|
+
interface ParetoFigureSpec {
|
|
6770
|
+
kind: 'pareto-cost-quality';
|
|
6771
|
+
split: 'search' | 'holdout';
|
|
6772
|
+
points: ParetoPoint[];
|
|
6773
|
+
axes: {
|
|
6774
|
+
x: 'costUsd';
|
|
6775
|
+
y: 'score';
|
|
6776
|
+
};
|
|
6777
|
+
}
|
|
6778
|
+
/**
|
|
6779
|
+
* Cost vs quality scatter spec. `gateDecisions` is keyed by
|
|
6780
|
+
* candidate id; if present, every point picks up the gate verdict
|
|
6781
|
+
* for overlay.
|
|
6782
|
+
*/
|
|
6783
|
+
declare function paretoChart(runs: RunRecord[], opts?: {
|
|
6784
|
+
split?: 'search' | 'holdout';
|
|
6785
|
+
gateDecisions?: Record<string, GateDecision>;
|
|
6786
|
+
}): ParetoFigureSpec;
|
|
6787
|
+
interface GainDistributionBin {
|
|
6788
|
+
/** Inclusive lower edge. */
|
|
6789
|
+
lo: number;
|
|
6790
|
+
/** Exclusive upper edge (or inclusive if it's the last bin). */
|
|
6791
|
+
hi: number;
|
|
6792
|
+
/** Number of pairs whose delta lands in this bin. */
|
|
6793
|
+
count: number;
|
|
6794
|
+
}
|
|
6795
|
+
/** Histogram figure spec; this is the return type of `gainHistogram`. */
interface GainDistributionFigureSpec {
|
|
6796
|
+
/** Literal tag naming this figure type. */
kind: 'gain-distribution';
|
|
6797
|
+
/** Candidate id, mirroring the `candidateId` argument of `gainHistogram`. */
candidateId: string;
|
|
6798
|
+
/** Comparator id, mirroring the `comparator` argument of `gainHistogram`. */
comparator: string;
|
|
6799
|
+
/** Split the figure refers to. */
split: 'search' | 'holdout';
|
|
6800
|
+
/** Number of pairs used. */
|
|
6801
|
+
n: number;
|
|
6802
|
+
/** Histogram bins (see `GainDistributionBin` for edge semantics). */
bins: GainDistributionBin[];
|
|
6803
|
+
/** Presumably the median of the per-pair deltas — confirm against the implementation. */
median: number;
|
|
6804
|
+
/** Interval bounds; per the `gainHistogram` notes this is the bootstrap CI on the median delta. */
ci: {
|
|
6805
|
+
low: number;
|
|
6806
|
+
high: number;
|
|
6807
|
+
};
|
|
6808
|
+
}
|
|
6809
|
+
interface GainDistributionOptions {
|
|
6810
|
+
/** Number of histogram bins. Default 11 (so the centre is exact at 0). */
|
|
6811
|
+
bins?: number;
|
|
6812
|
+
/** Which split to use. Default 'holdout'. */
|
|
6813
|
+
split?: 'search' | 'holdout';
|
|
6814
|
+
/** Confidence level for the CI. Default 0.95. */
|
|
6815
|
+
confidence?: number;
|
|
6816
|
+
/** Bootstrap resamples. Default 2000. */
|
|
6817
|
+
resamples?: number;
|
|
6818
|
+
/** Deterministic seed. */
|
|
6819
|
+
seed?: number;
|
|
6820
|
+
}
|
|
6821
|
+
/**
|
|
6822
|
+
* Held-out improvement distribution: per-pair delta (candidate −
|
|
6823
|
+
* comparator), histogrammed. Includes the bootstrap CI on the median
|
|
6824
|
+
* delta — same primitive the promotion gate uses.
|
|
6825
|
+
*/
|
|
6826
|
+
declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
|
|
6827
|
+
|
|
6828
|
+
/**
|
|
6829
|
+
* Liveness canaries — cheap statistical checks that catch the failure
|
|
6830
|
+
* modes a green test suite never sees.
|
|
6831
|
+
*
|
|
6832
|
+
* Three canary types in this module:
|
|
6833
|
+
*
|
|
6834
|
+
* 1. **Silent judge fallback** — the judge degraded to a fallback
|
|
6835
|
+
* path (rules-only / cached / heuristic) without anyone
|
|
6836
|
+
* noticing. Signature: a string of consecutive runs whose
|
|
6837
|
+
* `judgeMetadata.confidence` equals a known fallback constant
|
|
6838
|
+
* (default 0.30) OR whose `judgeMetadata.fallback` is true.
|
|
6839
|
+
*
|
|
6840
|
+
* 2. **Judge calibration drift** — the judge's confidence
|
|
6841
|
+
* distribution has drifted from a historical window. Two-sample
|
|
6842
|
+
* Kolmogorov-Smirnov test on the recent vs historical confidences,
|
|
6843
|
+
* with the empirical-CDF max-difference statistic.
|
|
6844
|
+
*
|
|
6845
|
+
* 3. **Eval-set distribution shift** — the mix of categories /
|
|
6846
|
+
* buckets in the recent runs differs significantly from the
|
|
6847
|
+
* historical mix. Chi-square test on the binned counts.
|
|
6848
|
+
*
|
|
6849
|
+
* Outputs are alerts. The canary does NOT fail loud the way a test
|
|
6850
|
+
* does — failing tests are reserved for hard correctness violations.
|
|
6851
|
+
* A canary that fires is a *signal* to investigate, not a verdict.
|
|
6852
|
+
*
|
|
6853
|
+
* Why this lives here rather than in `observability.ts`: that module
|
|
6854
|
+
* already has exports of its own, and is a pure-fanout-to-Langfuse/Prometheus
|
|
6855
|
+
* adapter. Canaries are statistical detectors, not adapters.
|
|
6856
|
+
*/
|
|
6857
|
+
|
|
6858
|
+
type CanaryKind = 'silent_judge_fallback' | 'judge_calibration_drift' | 'distribution_shift';
|
|
6859
|
+
type CanarySeverity = 'info' | 'warn' | 'error';
|
|
6860
|
+
/** One canary finding. Per the module notes, an alert is a signal to
* investigate, not a verdict. */
interface CanaryAlert {
|
|
6861
|
+
/** Which canary type this alert belongs to. */
kind: CanaryKind;
|
|
6862
|
+
/** One of 'info' | 'warn' | 'error'. */
severity: CanarySeverity;
|
|
6863
|
+
/** Text description of the alert (presumably human-readable — confirm). */
message: string;
|
|
6864
|
+
/** Numbers that informed the decision — drop straight into a
|
|
6865
|
+
* dashboard / paper figure. */
|
|
6866
|
+
evidence: Record<string, unknown>;
|
|
6867
|
+
}
|
|
6868
|
+
/** Return type of `runCanaries`: the alerts plus a per-kind tally. */
interface CanaryReport {
|
|
6869
|
+
/** Alerts included in this report. */
alerts: CanaryAlert[];
|
|
6870
|
+
/** Per-kind summary count. */
|
|
6871
|
+
counts: Record<CanaryKind, number>;
|
|
6872
|
+
}
|
|
6873
|
+
interface CanaryOptions {
|
|
6874
|
+
/**
|
|
6875
|
+
* Silent-fallback detection.
|
|
6876
|
+
* - `constant`: confidence value treated as the fallback signal.
|
|
6877
|
+
* Default 0.30 (matches the soft-fail default in
|
|
6878
|
+
* `propose-review.ts`).
|
|
6879
|
+
* - `consecutiveThreshold`: trip the alert after this many
|
|
6880
|
+
* consecutive runs at `constant` (or `fallback === true`).
|
|
6881
|
+
* Default 3.
|
|
6882
|
+
*/
|
|
6883
|
+
silentFallback?: {
|
|
6884
|
+
constant?: number;
|
|
6885
|
+
consecutiveThreshold?: number;
|
|
6886
|
+
/** Floating-point tolerance when comparing against `constant`. */
|
|
6887
|
+
epsilon?: number;
|
|
6888
|
+
};
|
|
6889
|
+
/**
|
|
6890
|
+
* Calibration-drift detection.
|
|
6891
|
+
* - `historyWindow`: number of past runs (oldest-first) treated as
|
|
6892
|
+
* the historical baseline. Default 50.
|
|
6893
|
+
* - `recentWindow`: number of recent runs (newest-first) compared
|
|
6894
|
+
* against history. Default 20.
|
|
6895
|
+
* - `ksAlpha`: alpha for the KS statistic vs critical value.
|
|
6896
|
+
* Default 0.05.
|
|
6897
|
+
* - `minRecent`: minimum recent runs required to even attempt the
|
|
6898
|
+
* check. Default 10.
|
|
6899
|
+
*/
|
|
6900
|
+
calibrationDrift?: {
|
|
6901
|
+
historyWindow?: number;
|
|
6902
|
+
recentWindow?: number;
|
|
6903
|
+
ksAlpha?: number;
|
|
6904
|
+
minRecent?: number;
|
|
6905
|
+
};
|
|
6906
|
+
/**
|
|
6907
|
+
* Distribution-shift detection.
|
|
6908
|
+
* - `category`: function that maps a run to a categorical bucket.
|
|
6909
|
+
* Required to enable this canary; if omitted the chi-square check
|
|
6910
|
+
* is skipped entirely.
|
|
6911
|
+
* - `chiSquareAlpha`: alpha. Default 0.05.
|
|
6912
|
+
* - `historyWindow`, `recentWindow`, `minRecent`: like above.
|
|
6913
|
+
*/
|
|
6914
|
+
distributionShift?: {
|
|
6915
|
+
category: (run: RunRecord) => string | null;
|
|
6916
|
+
chiSquareAlpha?: number;
|
|
6917
|
+
historyWindow?: number;
|
|
6918
|
+
recentWindow?: number;
|
|
6919
|
+
minRecent?: number;
|
|
6920
|
+
};
|
|
6921
|
+
}
|
|
6922
|
+
/**
|
|
6923
|
+
* Run all configured canaries against a chronological run list.
|
|
6924
|
+
* Runs MUST be sorted oldest-to-newest by the caller — the order of
|
|
6925
|
+
* the input is used to define "recent" vs "historical" windows.
|
|
6926
|
+
*/
|
|
6927
|
+
declare function runCanaries(runs: RunRecord[], opts?: CanaryOptions): CanaryReport;
|
|
6928
|
+
|
|
6929
|
+
/**
|
|
6930
|
+
* Shared types for the reference benchmark wrappers under
|
|
6931
|
+
* `src/benchmarks/`. Each wrapper exports the three functions in
|
|
6932
|
+
* `BenchmarkAdapter` plus its own typed `DatasetItem` shape.
|
|
6933
|
+
*/
|
|
6934
|
+
|
|
6935
|
+
interface BenchmarkDatasetItem<TPayload = unknown> {
|
|
6936
|
+
/** Stable dataset-local item id (used for split assignment + paper
|
|
6937
|
+
* references). Unique within a benchmark. */
|
|
6938
|
+
id: string;
|
|
6939
|
+
/** Free-form payload. Each benchmark defines its own shape. */
|
|
6940
|
+
payload: TPayload;
|
|
6941
|
+
}
|
|
6942
|
+
interface BenchmarkEvaluation {
|
|
6943
|
+
/** [0, 1] score for the response on this item. Exact-match
|
|
6944
|
+
* benchmarks use 0/1; partial-credit benchmarks may return
|
|
6945
|
+
* fractional values. */
|
|
6946
|
+
score: number;
|
|
6947
|
+
/** Optional bag of raw scoring signals — e.g. parsed numeric
|
|
6948
|
+
* answer, regex match, judge sub-scores. */
|
|
6949
|
+
raw: Record<string, unknown>;
|
|
6950
|
+
}
|
|
6951
|
+
/** Common signature implemented by every adapter under `src/benchmarks/*`. */
|
|
6952
|
+
interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {
|
|
6953
|
+
/** Load the dataset for the given split. May hit the network on
|
|
6954
|
+
* first call but should be cache-friendly. Adapters that don't
|
|
6955
|
+
* ship the dataset itself MUST throw a clearly-marked error
|
|
6956
|
+
* pointing the caller at the loader script. */
|
|
6957
|
+
loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>;
|
|
6958
|
+
/** Score a single response. Pure with respect to the inputs. */
|
|
6959
|
+
evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>;
|
|
6960
|
+
/** Deterministic split assignment via item id hashing. The
|
|
6961
|
+
* fraction of items in each split is implementation-defined but
|
|
6962
|
+
* MUST be stable across processes and platforms. */
|
|
6963
|
+
assignSplit(itemId: string): RunSplitTag;
|
|
6964
|
+
}
|
|
6965
|
+
/** Split-assignment seed shared across all benchmarks. Bumping this
|
|
6966
|
+
* value reshuffles every split — do NOT do that lightly. */
|
|
6967
|
+
declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
|
|
6968
|
+
/**
|
|
6969
|
+
* Assign an item id to one of `'search' | 'dev' | 'holdout'` using a
|
|
6970
|
+
* stable 32-bit hash of `${seed}::${id}`. Default proportions:
|
|
6971
|
+
*
|
|
6972
|
+
* search: 60% (optimization-readable)
|
|
6973
|
+
* dev: 20% (held-out for tuning, leak-on-purpose during dev)
|
|
6974
|
+
* holdout: 20% (paper-grade held-out, gated reads)
|
|
6975
|
+
*/
|
|
6976
|
+
declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
|
|
6977
|
+
|
|
6978
|
+
/**
|
|
6979
|
+
* GSM8K wrapper — exact-match grading on the final numeric answer.
|
|
6980
|
+
*
|
|
6981
|
+
* The dataset itself is NOT bundled. `loadDataset` will:
|
|
6982
|
+
* 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
|
|
6983
|
+
* file with `{ id, question, answer }` records — the standard
|
|
6984
|
+
* HF mirror layout converted to JSONL);
|
|
6985
|
+
* 2. otherwise throw a clearly-marked error pointing to the loader.
|
|
6986
|
+
*
|
|
6987
|
+
* `evaluate` parses the final number out of the response (last
|
|
6988
|
+
* occurrence of a signed-decimal-or-integer literal, optionally after
|
|
6989
|
+
* `####`, the GSM8K answer convention) and compares to the ground-
|
|
6990
|
+
* truth answer. Floating-point comparisons use a 1e-6 tolerance.
|
|
6991
|
+
*/
|
|
6992
|
+
|
|
6993
|
+
/** Payload carried by each GSM8K dataset item. */
interface Gsm8kPayload {
|
|
6994
|
+
/** Question text (the `question` field of the JSONL record described in the module notes). */
question: string;
|
|
6995
|
+
/** Reference answer, post-#### normalization. Typed as a string, so
|
|
6996
|
+
* numeric values are carried as numeric strings ("72", "1.5"). */
|
|
6997
|
+
answer: string;
|
|
6998
|
+
}
|
|
6999
|
+
type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
|
|
7000
|
+
/** `BenchmarkAdapter` implementation for GSM8K. The dataset is not bundled
* with the package (see the module notes). */
declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
|
|
7001
|
+
/** Loads items for `split`. Per the module notes: reads the JSONL file named by
* `process.env.AGENT_EVAL_GSM8K_PATH` when set, otherwise throws a marked error. */
loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
|
|
7002
|
+
/** Scores one response. Per the module notes: parses the final number out of
* `response` and compares it to the reference answer (1e-6 tolerance). */
evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
|
|
7003
|
+
/** Maps an item id to a split; the `BenchmarkAdapter` contract requires this
* to be deterministic and stable across processes and platforms. */
assignSplit(itemId: string): RunSplitTag;
|
|
7004
|
+
}
|
|
7005
|
+
/**
|
|
7006
|
+
* Parse a GSM8K-style answer. Honors the dataset's `#### N`
|
|
7007
|
+
* convention (the canonical answer comes after `####`); otherwise
|
|
7008
|
+
* returns the LAST signed numeric literal in the string.
|
|
7009
|
+
*/
|
|
7010
|
+
declare function parseGsm8kAnswer(text: string): number | null;
|
|
7011
|
+
declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
|
|
7012
|
+
declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
|
|
7013
|
+
declare const assignSplit$2: (itemId: string) => RunSplitTag;
|
|
7014
|
+
|
|
7015
|
+
type index$3_Gsm8kAdapter = Gsm8kAdapter;
|
|
7016
|
+
declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
|
|
7017
|
+
type index$3_Gsm8kItem = Gsm8kItem;
|
|
7018
|
+
type index$3_Gsm8kPayload = Gsm8kPayload;
|
|
7019
|
+
declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
|
|
7020
|
+
declare namespace index$3 {
|
|
7021
|
+
export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
|
|
7022
|
+
}
|
|
7023
|
+
|
|
7024
|
+
/**
|
|
7025
|
+
* SWE-Bench Lite wrapper — 30-instance subset.
|
|
7026
|
+
*
|
|
7027
|
+
* Status: STUB. The actual SWE-Bench harness needs a Docker host and
|
|
7028
|
+
* is too heavy to ship inside this package. We expose the contract
|
|
7029
|
+
* (loadDataset, evaluate, assignSplit) so consumers can plug in their
|
|
7030
|
+
* own grader without touching call sites.
|
|
7031
|
+
*
|
|
7032
|
+
* Wire-up paths in priority order:
|
|
7033
|
+
*
|
|
7034
|
+
* 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
|
|
7035
|
+
* lite instances + per-instance metadata (instance_id,
|
|
7036
|
+
* problem_statement, base_commit, repo, FAIL_TO_PASS,
|
|
7037
|
+
* PASS_TO_PASS).
|
|
7038
|
+
* 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
|
|
7039
|
+
* that reads `{instance_id, patch}` JSON on stdin and writes
|
|
7040
|
+
* `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
|
|
7041
|
+
* JSON on stdout. Implementations can shell out to the
|
|
7042
|
+
* official `swebench` runner here.
|
|
7043
|
+
*
|
|
7044
|
+
* If neither is set, every public method throws a clearly-marked
|
|
7045
|
+
* "not implemented" error. The stub fails LOUD; it never silently
|
|
7046
|
+
* scores zero.
|
|
7047
|
+
*/
|
|
7048
|
+
|
|
7049
|
+
/** Payload carried by each SWE-Bench Lite item.
* NOTE(review): the fields look like camelCase forms of the JSONL metadata keys
* listed in the module notes (instance_id, problem_statement, base_commit, repo,
* FAIL_TO_PASS, PASS_TO_PASS) — confirm against the loader. */
interface SweBenchLitePayload {
|
|
7050
|
+
/** Presumably `instance_id` from the JSONL record. */
instanceId: string;
|
|
7051
|
+
/** Presumably `problem_statement` from the JSONL record. */
problemStatement: string;
|
|
7052
|
+
/** Presumably `base_commit` from the JSONL record. */
baseCommit: string;
|
|
7053
|
+
/** Presumably `repo` from the JSONL record. */
repo: string;
|
|
7054
|
+
/** Presumably `FAIL_TO_PASS` from the JSONL record. */
failToPass: string[];
|
|
7055
|
+
/** Presumably `PASS_TO_PASS` from the JSONL record. */
passToPass: string[];
|
|
7056
|
+
}
|
|
7057
|
+
type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
|
|
7058
|
+
/** `BenchmarkAdapter` implementation for the SWE-Bench Lite subset. The module
* notes mark this as a STUB that fails loud rather than silently scoring zero
* when it is not wired up. */
declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
|
|
7059
|
+
/** Loads items for `split`. Per the module notes, the instances come from the
* JSONL file named by `process.env.AGENT_EVAL_SWEBENCH_PATH`. */
loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
|
|
7060
|
+
/** Scores one response. Per the module notes, grading is delegated to the
* executable named by `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD`. */
evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
|
|
7061
|
+
/** Maps an item id to a split; the `BenchmarkAdapter` contract requires this
* to be deterministic and stable across processes and platforms. */
assignSplit(itemId: string): RunSplitTag;
|
|
7062
|
+
}
|
|
7063
|
+
declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
|
|
7064
|
+
declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
|
|
7065
|
+
declare const assignSplit$1: (itemId: string) => RunSplitTag;
|
|
7066
|
+
|
|
7067
|
+
type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
|
|
7068
|
+
declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
|
|
7069
|
+
type index$2_SweBenchLiteItem = SweBenchLiteItem;
|
|
7070
|
+
type index$2_SweBenchLitePayload = SweBenchLitePayload;
|
|
7071
|
+
declare namespace index$2 {
|
|
7072
|
+
export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
|
|
7073
|
+
}
|
|
7074
|
+
|
|
7075
|
+
/**
|
|
7076
|
+
* Synthetic routing dataset. 16 tasks across 4 categories. Used as a
|
|
7077
|
+
* deterministic, dependency-free benchmark for any router that maps a
|
|
7078
|
+
* natural-language request to one of a fixed set of route labels.
|
|
7079
|
+
*
|
|
7080
|
+
* Format (see `routing/README.md` for prose):
|
|
7081
|
+
*
|
|
7082
|
+
* {
|
|
7083
|
+
* id: stable per-task ID (matches across processes).
|
|
7084
|
+
* category: one of the four route labels.
|
|
7085
|
+
* prompt: the user-facing request the router must classify.
|
|
7086
|
+
* route: the ground-truth route the router should pick.
|
|
7087
|
+
* synonyms: other strings that count as a correct answer.
|
|
7088
|
+
* hardNegatives: close-but-wrong route labels — used to detect the
|
|
7089
|
+
* "always picks the popular route" failure mode.
|
|
7090
|
+
* }
|
|
7091
|
+
*
|
|
7092
|
+
* The four categories are intentionally cross-domain (file ops,
|
|
7093
|
+
* math, search, conversation) so a router that collapses to one
|
|
7094
|
+
* category is easy to spot.
|
|
7095
|
+
*/
|
|
7096
|
+
interface RoutingItem {
|
|
7097
|
+
id: string;
|
|
7098
|
+
category: 'file' | 'math' | 'search' | 'chat';
|
|
7099
|
+
prompt: string;
|
|
7100
|
+
/** Canonical correct route label. */
|
|
7101
|
+
route: string;
|
|
7102
|
+
/** Alternate route labels that also count as correct. */
|
|
7103
|
+
synonyms: string[];
|
|
7104
|
+
/** Wrong-but-tempting route labels (for analysis, not grading). */
|
|
7105
|
+
hardNegatives: string[];
|
|
7106
|
+
}
|
|
7107
|
+
declare const ROUTING_DATASET: RoutingItem[];
|
|
7108
|
+
|
|
7109
|
+
/**
|
|
7110
|
+
* Routing benchmark — synthetic, dependency-free, ships in the
|
|
7111
|
+
* package. 16 cross-category items in `dataset.ts`. See
|
|
7112
|
+
* `routing/README.md` for the format.
|
|
7113
|
+
*
|
|
7114
|
+
* `evaluate` does case-insensitive exact match against the canonical
|
|
7115
|
+
* route plus declared synonyms. The first valid route token in the
|
|
7116
|
+
* response wins; everything else is ignored. Wrong answers also
|
|
7117
|
+
* report whether they hit a hard negative — useful when triaging
|
|
7118
|
+
* "always picks the popular route" failure modes.
|
|
7119
|
+
*/
|
|
7120
|
+
|
|
7121
|
+
type RoutingPayload = RoutingItem;
|
|
7122
|
+
type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>;
|
|
7123
|
+
/** `BenchmarkAdapter` implementation for the synthetic routing benchmark,
* whose dataset ships in the package (see the module notes). */
declare class RoutingAdapter implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload> {
|
|
7124
|
+
/** Loads the routing items for `split`. */
loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]>;
|
|
7125
|
+
/** Scores one response. Per the module notes: case-insensitive exact match
* against the canonical route plus declared synonyms; the first valid route
* token in the response wins. */
evaluate(item: RoutingDatasetItem, response: string): Promise<BenchmarkEvaluation>;
|
|
7126
|
+
/** Maps an item id to a split; the `BenchmarkAdapter` contract requires this
* to be deterministic and stable across processes and platforms. */
assignSplit(itemId: string): RunSplitTag;
|
|
7127
|
+
}
|
|
7128
|
+
/**
|
|
7129
|
+
* Pull route-shaped tokens out of a model response. Routes look like
|
|
7130
|
+
* `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics
|
|
7131
|
+
* are not routes, but `category.action` patterns are robust to most
|
|
7132
|
+
* model wrappers (JSON output, prose explanations, code fences).
|
|
7133
|
+
*/
|
|
7134
|
+
declare function extractRouteTokens(response: string): string[];
|
|
7135
|
+
declare const loadDataset: (split: RunSplitTag) => Promise<RoutingDatasetItem[]>;
|
|
7136
|
+
declare const evaluate: (item: RoutingDatasetItem, response: string) => Promise<BenchmarkEvaluation>;
|
|
7137
|
+
declare const assignSplit: (itemId: string) => RunSplitTag;
|
|
7138
|
+
|
|
7139
|
+
declare const index$1_ROUTING_DATASET: typeof ROUTING_DATASET;
|
|
7140
|
+
type index$1_RoutingAdapter = RoutingAdapter;
|
|
7141
|
+
declare const index$1_RoutingAdapter: typeof RoutingAdapter;
|
|
7142
|
+
type index$1_RoutingDatasetItem = RoutingDatasetItem;
|
|
7143
|
+
type index$1_RoutingItem = RoutingItem;
|
|
7144
|
+
type index$1_RoutingPayload = RoutingPayload;
|
|
7145
|
+
declare const index$1_assignSplit: typeof assignSplit;
|
|
7146
|
+
declare const index$1_evaluate: typeof evaluate;
|
|
7147
|
+
declare const index$1_extractRouteTokens: typeof extractRouteTokens;
|
|
7148
|
+
declare const index$1_loadDataset: typeof loadDataset;
|
|
7149
|
+
declare namespace index$1 {
|
|
7150
|
+
export { index$1_ROUTING_DATASET as ROUTING_DATASET, index$1_RoutingAdapter as RoutingAdapter, type index$1_RoutingDatasetItem as RoutingDatasetItem, type index$1_RoutingItem as RoutingItem, type index$1_RoutingPayload as RoutingPayload, index$1_assignSplit as assignSplit, index$1_evaluate as evaluate, index$1_extractRouteTokens as extractRouteTokens, index$1_loadDataset as loadDataset };
|
|
7151
|
+
}
|
|
7152
|
+
|
|
7153
|
+
/**
|
|
7154
|
+
* Reference benchmark wrappers — entry point.
|
|
7155
|
+
*
|
|
7156
|
+
* Three benchmarks ship under `src/benchmarks/`:
|
|
7157
|
+
* - `gsm8k` — exact-match math reasoning (HF mirror,
|
|
7158
|
+
* dataset NOT bundled — see `gsm8k/index.ts`).
|
|
7159
|
+
* - `swebench-lite` — 30-instance SWE-Bench subset (STUB; needs
|
|
7160
|
+
* external grader).
|
|
7161
|
+
* - `routing` — synthetic 16-task router benchmark, ships
|
|
7162
|
+
* in the package.
|
|
7163
|
+
*
|
|
7164
|
+
* Every benchmark exposes the same three exports — `loadDataset`,
|
|
7165
|
+
* `evaluate`, `assignSplit` — and a typed adapter class. Pick the
|
|
7166
|
+
* import path that matches the benchmark.
|
|
7167
|
+
*
|
|
7168
|
+
* Shared types (`BenchmarkAdapter`, `BenchmarkDatasetItem`,
|
|
7169
|
+
* `BenchmarkEvaluation`, `deterministicSplit`, `BENCHMARK_SPLIT_SEED`)
|
|
7170
|
+
* live in `./types`.
|
|
7171
|
+
*/
|
|
7172
|
+
|
|
7173
|
+
declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
|
|
7174
|
+
type index_BenchmarkAdapter<_TItem = unknown, TPayload = unknown> = BenchmarkAdapter<_TItem, TPayload>;
|
|
7175
|
+
type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayload>;
|
|
7176
|
+
type index_BenchmarkEvaluation = BenchmarkEvaluation;
|
|
7177
|
+
declare const index_deterministicSplit: typeof deterministicSplit;
|
|
7178
|
+
declare namespace index {
|
|
7179
|
+
export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$3 as gsm8k, index$1 as routing, index$2 as swebenchLite };
|
|
7180
|
+
}
|
|
7181
|
+
|
|
6223
7182
|
interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
|
|
6224
7183
|
bundleForRun?: (run: ReferenceReplayRun<Input>) => SteeringBundle;
|
|
6225
7184
|
scoreForCase?: (caseRun: ReferenceReplayCaseRun<Input>, run: ReferenceReplayRun<Input>) => RunScore;
|
|
@@ -7120,4 +8079,4 @@ interface ReflectionProposal {
|
|
|
7120
8079
|
*/
|
|
7121
8080
|
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
7122
8081
|
|
|
7123
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type 
D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type 
HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, OTEL_AGENT_EVAL_SCOPE, type Objective, type 
OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type 
ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type 
StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, 
createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, 
referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
8082
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type 
CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type 
GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, 
MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type 
ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type 
WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, 
isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, 
toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|