@tangle-network/agent-eval 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +184 -11
- package/dist/chunk-ITN4YOZY.js +215 -0
- package/dist/chunk-ITN4YOZY.js.map +1 -0
- package/dist/chunk-OZPRSK4A.js +594 -0
- package/dist/chunk-OZPRSK4A.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +104 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +548 -1
- package/dist/index.js +876 -210
- package/dist/index.js.map +1 -1
- package/dist/wire/index.d.ts +211 -0
- package/dist/wire/index.js +56 -0
- package/dist/wire/index.js.map +1 -0
- package/package.json +17 -3
package/dist/index.d.ts
CHANGED
|
@@ -944,6 +944,114 @@ interface RunDiff {
|
|
|
944
944
|
}>;
|
|
945
945
|
}
|
|
946
946
|
|
|
947
|
+
/**
|
|
948
|
+
* FileSystemExperimentStore — NDJSON-backed `ExperimentStore` for local + CI.
|
|
949
|
+
*
|
|
950
|
+
* Mirrors the file layout of `FileSystemTraceStore`: two append-only NDJSON
|
|
951
|
+
* files (`experiments.ndjson` + `runs.ndjson`) under one directory, with size-
|
|
952
|
+
* based rollover. Writes are append-only so the file log doubles as an audit
|
|
953
|
+
* trail of every state transition the tracker ever wrote.
|
|
954
|
+
*
|
|
955
|
+
* Reads lazy-load every NDJSON file in the directory (including rolled-over
|
|
956
|
+
* archives), latest-write-wins per `id`. Subsequent writes update the
|
|
957
|
+
* in-memory index in place so reads after writes are O(1).
|
|
958
|
+
*
|
|
959
|
+
* Node-only — imports `node:fs/promises`. Don't import this from a Worker;
|
|
960
|
+
* use the in-memory store or the D1 store from `./experiment-tracker-d1`.
|
|
961
|
+
*/
|
|
962
|
+
|
|
963
|
+
interface FileSystemExperimentStoreOptions {
|
|
964
|
+
/** Directory the NDJSON files live in. Created on first write. */
|
|
965
|
+
dir: string;
|
|
966
|
+
/** Bytes after which a file is rolled over. Default 32 MB (matches FileSystemTraceStore). */
|
|
967
|
+
maxBytes?: number;
|
|
968
|
+
}
|
|
969
|
+
declare class FileSystemExperimentStore implements ExperimentStore {
|
|
970
|
+
private readonly dir;
|
|
971
|
+
private readonly maxBytes;
|
|
972
|
+
private index?;
|
|
973
|
+
private loaded;
|
|
974
|
+
constructor(options: FileSystemExperimentStoreOptions);
|
|
975
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
976
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
977
|
+
listExperiments(): Promise<Experiment[]>;
|
|
978
|
+
saveRun(run: Run$1): Promise<void>;
|
|
979
|
+
getRun(id: string): Promise<Run$1 | null>;
|
|
980
|
+
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
981
|
+
private ensureDir;
|
|
982
|
+
private append;
|
|
983
|
+
private load;
|
|
984
|
+
}
|
|
985
|
+
|
|
986
|
+
/**
|
|
987
|
+
* D1ExperimentStore — Cloudflare D1-backed `ExperimentStore`.
|
|
988
|
+
*
|
|
989
|
+
* Workers-safe (uses only the `D1Database` binding the runtime injects). Two
|
|
990
|
+
* tables, no joins, no migrations beyond `ensureSchema()`. Schema designed so
|
|
991
|
+
* a Worker route can both write the row at run start and update it at run end
|
|
992
|
+
* without losing the original config — the row's lifecycle mirrors the
|
|
993
|
+
* `Run.status` field one-to-one.
|
|
994
|
+
*
|
|
995
|
+
* Why this lives next to `InMemoryExperimentStore`:
|
|
996
|
+
* - bad-app, legal-agent, gtm-agent, film-agent all run as Workers
|
|
997
|
+
* - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
|
|
998
|
+
* - Hand-rolling D1 SQL in every consumer is exactly the duplication this
|
|
999
|
+
* module exists to prevent
|
|
1000
|
+
*
|
|
1001
|
+
* Schema versioning: the `meta` table records `schema_version` so a future
|
|
1002
|
+
* column addition can be detected and migrated additively. Today's schema is
|
|
1003
|
+
* v1; bump only on breaking shape changes.
|
|
1004
|
+
*/
|
|
1005
|
+
|
|
1006
|
+
/**
|
|
1007
|
+
* Minimal `D1Database` shape we depend on. Avoids pulling in
|
|
1008
|
+
* `@cloudflare/workers-types` as a hard dep — consumers that already have
|
|
1009
|
+
* those types installed can pass the binding directly.
|
|
1010
|
+
*/
|
|
1011
|
+
interface D1Like {
|
|
1012
|
+
prepare(query: string): D1PreparedStatementLike;
|
|
1013
|
+
batch?(statements: D1PreparedStatementLike[]): Promise<unknown[]>;
|
|
1014
|
+
exec(query: string): Promise<unknown>;
|
|
1015
|
+
}
|
|
1016
|
+
interface D1PreparedStatementLike {
|
|
1017
|
+
bind(...values: unknown[]): D1PreparedStatementLike;
|
|
1018
|
+
first<T = Record<string, unknown>>(): Promise<T | null>;
|
|
1019
|
+
all<T = Record<string, unknown>>(): Promise<{
|
|
1020
|
+
results: T[];
|
|
1021
|
+
}>;
|
|
1022
|
+
run(): Promise<unknown>;
|
|
1023
|
+
}
|
|
1024
|
+
interface D1ExperimentStoreOptions {
|
|
1025
|
+
/** D1 binding from `env`. */
|
|
1026
|
+
db: D1Like;
|
|
1027
|
+
/**
|
|
1028
|
+
* Optional table-name prefix so multiple ExperimentStores can share a DB
|
|
1029
|
+
* without colliding (e.g. `tax_eval_experiments` vs `legal_eval_experiments`).
|
|
1030
|
+
* Default: `agent_eval_`.
|
|
1031
|
+
*/
|
|
1032
|
+
tablePrefix?: string;
|
|
1033
|
+
}
|
|
1034
|
+
declare class D1ExperimentStore implements ExperimentStore {
|
|
1035
|
+
private readonly db;
|
|
1036
|
+
private readonly experimentsTable;
|
|
1037
|
+
private readonly runsTable;
|
|
1038
|
+
private readonly metaTable;
|
|
1039
|
+
private schemaReady;
|
|
1040
|
+
constructor(options: D1ExperimentStoreOptions);
|
|
1041
|
+
/**
|
|
1042
|
+
* Idempotent schema setup. Safe to call before every operation; the second
|
|
1043
|
+
* call short-circuits via `schemaReady`. Most consumers will call it once
|
|
1044
|
+
* during Worker bootstrap.
|
|
1045
|
+
*/
|
|
1046
|
+
ensureSchema(): Promise<void>;
|
|
1047
|
+
saveExperiment(exp: Experiment): Promise<void>;
|
|
1048
|
+
getExperiment(id: string): Promise<Experiment | null>;
|
|
1049
|
+
listExperiments(): Promise<Experiment[]>;
|
|
1050
|
+
saveRun(run: Run$1): Promise<void>;
|
|
1051
|
+
getRun(id: string): Promise<Run$1 | null>;
|
|
1052
|
+
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
1053
|
+
}
|
|
1054
|
+
|
|
947
1055
|
/**
|
|
948
1056
|
* Prompt optimizer — A/B test prompt variants with statistical rigor.
|
|
949
1057
|
*
|
|
@@ -6294,6 +6402,445 @@ interface PromptEvolutionResult<P = unknown> {
|
|
|
6294
6402
|
}
|
|
6295
6403
|
declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
|
|
6296
6404
|
|
|
6405
|
+
/**
|
|
6406
|
+
* concurrency — small primitives the evolution loop needs.
|
|
6407
|
+
*
|
|
6408
|
+
* `Mutex` is a zero-dep async lock with FIFO fairness. The evolution loop
|
|
6409
|
+
* uses it to serialise checkout/build/commit sequences inside a single
|
|
6410
|
+
* pool slot, and to gate concurrent JSONL writers (see
|
|
6411
|
+
* `lockedJsonlReferenceReplayStore`).
|
|
6412
|
+
*
|
|
6413
|
+
* Deliberately minimal — no priority queue, no timeouts. If you need
|
|
6414
|
+
* those, swap to `async-mutex` at the call site.
|
|
6415
|
+
*/
|
|
6416
|
+
declare class Mutex {
|
|
6417
|
+
private locked;
|
|
6418
|
+
private readonly waiters;
|
|
6419
|
+
acquire(): Promise<() => void>;
|
|
6420
|
+
private release;
|
|
6421
|
+
runExclusive<T>(fn: () => Promise<T> | T): Promise<T>;
|
|
6422
|
+
/** True iff someone holds the lock right now. Diagnostics only. */
|
|
6423
|
+
get isLocked(): boolean;
|
|
6424
|
+
/** Pending waiter count. Diagnostics only. */
|
|
6425
|
+
get pending(): number;
|
|
6426
|
+
}
|
|
6427
|
+
|
|
6428
|
+
/**
|
|
6429
|
+
* JsonlTrialCache — `TrialCache` backed by a JSONL append-only file so a
|
|
6430
|
+
* crashed `runPromptEvolution` can resume without re-running expensive
|
|
6431
|
+
* trials. Last write wins on key collision; the file is forward-swept at
|
|
6432
|
+
* construction.
|
|
6433
|
+
*
|
|
6434
|
+
* Tail corruption (partial line at the bottom from a hard kill) is
|
|
6435
|
+
* tolerated — we skip unparseable lines and continue.
|
|
6436
|
+
*
|
|
6437
|
+
* The cache surface (`get` / `set`) is synchronous because `TrialCache`
|
|
6438
|
+
* is. Writes are mutex-serialised through a `LockedJsonlAppender`
|
|
6439
|
+
* (kicked off with `void`) so two in-process callers can't tear a long
|
|
6440
|
+
* line that exceeds POSIX `PIPE_BUF`. Cross-process safety still
|
|
6441
|
+
* requires fcntl/flock and is deliberately out of scope.
|
|
6442
|
+
*/
|
|
6443
|
+
|
|
6444
|
+
declare class JsonlTrialCache implements TrialCache {
|
|
6445
|
+
private readonly map;
|
|
6446
|
+
private readonly path;
|
|
6447
|
+
private readonly appender;
|
|
6448
|
+
constructor(path: string);
|
|
6449
|
+
get(key: string): TrialResult | undefined;
|
|
6450
|
+
set(key: string, value: TrialResult): void;
|
|
6451
|
+
size(): number;
|
|
6452
|
+
/**
|
|
6453
|
+
* Synchronous fallback path for tests / CLI tools that want to be sure
|
|
6454
|
+
* the line is on disk before returning. Bypasses the mutex (single-
|
|
6455
|
+
* threaded callers only).
|
|
6456
|
+
*/
|
|
6457
|
+
setSync(key: string, value: TrialResult): void;
|
|
6458
|
+
}
|
|
6459
|
+
|
|
6460
|
+
/**
|
|
6461
|
+
* LockedJsonlAppender — mutex-serialized JSONL append helper for arbitrary
|
|
6462
|
+
* payloads. The reference-replay store does the same thing for typed
|
|
6463
|
+
* `ReferenceReplayRun` rows; this is the generic version used by
|
|
6464
|
+
* `MutationTelemetry`, `TrialTelemetry`, and any other consumer that wants
|
|
6465
|
+
* append-only durable telemetry without rolling its own lock.
|
|
6466
|
+
*
|
|
6467
|
+
* Locks are per absolute file path (process-local). Cross-process
|
|
6468
|
+
* concurrency is NOT addressed — that's an fcntl/flock problem.
|
|
6469
|
+
*/
|
|
6470
|
+
declare class LockedJsonlAppender {
|
|
6471
|
+
readonly path: string;
|
|
6472
|
+
private readonly mutex;
|
|
6473
|
+
constructor(path: string);
|
|
6474
|
+
append(entry: unknown): Promise<void>;
|
|
6475
|
+
}
|
|
6476
|
+
/** Reset all internal mutex state — tests only. */
|
|
6477
|
+
declare function resetLockedAppendersForTesting(): void;
|
|
6478
|
+
|
|
6479
|
+
/**
|
|
6480
|
+
* evolution-telemetry — durable JSONL/JSON sinks for the evolution loop.
|
|
6481
|
+
*
|
|
6482
|
+
* `runPromptEvolution` exposes generation-level events but doesn't persist
|
|
6483
|
+
* the per-mutation, per-trial, lineage, or cost breakdown. These four
|
|
6484
|
+
* sinks fill that gap so a finished autoresearch run leaves a forensically
|
|
6485
|
+
* complete trail under one directory:
|
|
6486
|
+
*
|
|
6487
|
+
* - `mutations.jsonl` — every mutate attempt (success + failure) with
|
|
6488
|
+
* latency, agent steps, diff stats, cost.
|
|
6489
|
+
* - `trials.jsonl` — every TrialResult including cache hits, with
|
|
6490
|
+
* provenance (channel, runtime slot, generation).
|
|
6491
|
+
* - `lineage.json` — variant tree {id → {parent, generation, kind, …}},
|
|
6492
|
+
* incremental upsert.
|
|
6493
|
+
* - `cost-ledger.json` — running $ totals per source (mutator-prompt,
|
|
6494
|
+
* mutator-code, scorer-prompt, scorer-code) plus pool utilisation.
|
|
6495
|
+
*
|
|
6496
|
+
* All writes are mutex-serialised. The append-only sinks (mutations,
|
|
6497
|
+
* trials) survive a hard kill; the cost-ledger snapshot is rewritten on every
|
|
6498
|
+
* update, and lineage is an append-only upsert log with an on-demand snapshot via `compact()`.
|
|
6499
|
+
*
|
|
6500
|
+
* Generic over a payload P so any consumer of `runPromptEvolution<P>` can
|
|
6501
|
+
* record lineage without leaking domain types.
|
|
6502
|
+
*/
|
|
6503
|
+
|
|
6504
|
+
type MutationChannel = 'prompt' | 'code';
|
|
6505
|
+
interface MutationAttempt {
|
|
6506
|
+
ts: number;
|
|
6507
|
+
channel: MutationChannel;
|
|
6508
|
+
generation: number;
|
|
6509
|
+
parentId: string;
|
|
6510
|
+
/** Successful child variant id, or null if the attempt failed. */
|
|
6511
|
+
childId: string | null;
|
|
6512
|
+
ok: boolean;
|
|
6513
|
+
/**
|
|
6514
|
+
* One of: 'parse_failure' | 'typecheck_failure' | 'no_changes' |
|
|
6515
|
+
* 'agent_error' | 'commit_failure' | 'no_api_key' | 'no_valid_proposals'
|
|
6516
|
+
* | 'reproduce_parent_failed' | 'branch_failed' | 'other'.
|
|
6517
|
+
* Free-form to allow consumer-specific reasons.
|
|
6518
|
+
*/
|
|
6519
|
+
failureReason?: string;
|
|
6520
|
+
/** Free-form description of what the agent said it did. */
|
|
6521
|
+
description?: string;
|
|
6522
|
+
/** Latency of the LLM call (ms). */
|
|
6523
|
+
latencyMs: number;
|
|
6524
|
+
/** Bytes of generated diff (code channel only). */
|
|
6525
|
+
diffBytes?: number;
|
|
6526
|
+
/** Files touched (code channel only). */
|
|
6527
|
+
filesTouched?: number;
|
|
6528
|
+
/** Steps the agent ran (tool calls). */
|
|
6529
|
+
agentSteps?: number;
|
|
6530
|
+
/** Approx $ spent on this mutation (LLM tokens). */
|
|
6531
|
+
costUsd?: number;
|
|
6532
|
+
/** Runtime slot used (code channel only). */
|
|
6533
|
+
runtimeSandboxId?: string;
|
|
6534
|
+
}
|
|
6535
|
+
declare class MutationTelemetry {
|
|
6536
|
+
private readonly appender;
|
|
6537
|
+
constructor(path: string);
|
|
6538
|
+
record(attempt: MutationAttempt): Promise<void>;
|
|
6539
|
+
}
|
|
6540
|
+
interface TrialAttempt {
|
|
6541
|
+
ts: number;
|
|
6542
|
+
channel: MutationChannel;
|
|
6543
|
+
generation: number;
|
|
6544
|
+
variantId: string;
|
|
6545
|
+
scenarioId: string;
|
|
6546
|
+
rep: number;
|
|
6547
|
+
ok: boolean;
|
|
6548
|
+
score: number;
|
|
6549
|
+
costUsd: number;
|
|
6550
|
+
durationMs: number;
|
|
6551
|
+
cached: boolean;
|
|
6552
|
+
runtimeSandboxId?: string;
|
|
6553
|
+
error?: string;
|
|
6554
|
+
metrics?: Record<string, number>;
|
|
6555
|
+
}
|
|
6556
|
+
declare class TrialTelemetry {
|
|
6557
|
+
private readonly appender;
|
|
6558
|
+
constructor(path: string);
|
|
6559
|
+
record(attempt: TrialAttempt): Promise<void>;
|
|
6560
|
+
}
|
|
6561
|
+
type LineageKind = 'seed' | 'prompt' | 'code';
|
|
6562
|
+
interface LineageNode {
|
|
6563
|
+
id: string;
|
|
6564
|
+
parentId: string | null;
|
|
6565
|
+
generation: number;
|
|
6566
|
+
kind: LineageKind;
|
|
6567
|
+
rationale?: string;
|
|
6568
|
+
/** Filled when scoring lands. */
|
|
6569
|
+
meanScore?: number;
|
|
6570
|
+
promotedToFrontier?: boolean;
|
|
6571
|
+
}
|
|
6572
|
+
/**
|
|
6573
|
+
* `kindOf` decides whether a variant is a seed (no parent), code mutation,
|
|
6574
|
+
* or prompt mutation. Default looks at `variant.payload.codeMutation` —
|
|
6575
|
+
* that field is part of the audit-bench convention but cheap enough to
|
|
6576
|
+
* accept any payload that mirrors it. Override by passing your own.
|
|
6577
|
+
*/
|
|
6578
|
+
type LineageKindResolver<P> = (variant: PromptVariant<P>) => LineageKind;
|
|
6579
|
+
/**
|
|
6580
|
+
* Persistence shape:
|
|
6581
|
+
*
|
|
6582
|
+
* `<path>` — JSONL of upserts (event log). Each line is a
|
|
6583
|
+
* partial node; replay folds them into the current
|
|
6584
|
+
* state. Append-only, so cost is O(1) per upsert
|
|
6585
|
+
* instead of the previous O(n²) full rewrite.
|
|
6586
|
+
* `<path>.snapshot` — Optional consolidated snapshot, written on
|
|
6587
|
+
* demand via `compact()` (e.g. at end of run).
|
|
6588
|
+
* Read by external tools that don't want to
|
|
6589
|
+
* replay the log.
|
|
6590
|
+
*
|
|
6591
|
+
* Loaded at construction time: if `<path>.snapshot` exists, parse it
|
|
6592
|
+
* first; then replay any newer log lines on top. Falls back to log-only
|
|
6593
|
+
* when no snapshot is present.
|
|
6594
|
+
*/
|
|
6595
|
+
declare class LineageRecorder<P = unknown> {
|
|
6596
|
+
private readonly path;
|
|
6597
|
+
private readonly snapshotPath;
|
|
6598
|
+
private readonly mutex;
|
|
6599
|
+
private readonly nodes;
|
|
6600
|
+
private readonly kindOf;
|
|
6601
|
+
constructor(path: string, kindOf?: LineageKindResolver<P>);
|
|
6602
|
+
upsert(node: LineageNode): Promise<void>;
|
|
6603
|
+
upsertVariant(variant: PromptVariant<P>): Promise<void>;
|
|
6604
|
+
snapshot(): LineageNode[];
|
|
6605
|
+
/**
|
|
6606
|
+
* Write the current consolidated state to `<path>.snapshot` so external
|
|
6607
|
+
* tools can read it without replaying the event log. Idempotent.
|
|
6608
|
+
*/
|
|
6609
|
+
compact(): Promise<void>;
|
|
6610
|
+
}
|
|
6611
|
+
/** Per-generation cost rollup. Same shape as the totals, scoped to one gen. */
|
|
6612
|
+
interface CostLedgerGeneration {
|
|
6613
|
+
generation: number;
|
|
6614
|
+
mutatorPromptUsd: number;
|
|
6615
|
+
mutatorCodeUsd: number;
|
|
6616
|
+
scorerPromptUsd: number;
|
|
6617
|
+
scorerCodeUsd: number;
|
|
6618
|
+
trialsCounted: number;
|
|
6619
|
+
cachedTrials: number;
|
|
6620
|
+
}
|
|
6621
|
+
interface CostLedgerSnapshot {
|
|
6622
|
+
totalUsd: number;
|
|
6623
|
+
mutatorPromptUsd: number;
|
|
6624
|
+
mutatorCodeUsd: number;
|
|
6625
|
+
scorerPromptUsd: number;
|
|
6626
|
+
scorerCodeUsd: number;
|
|
6627
|
+
trialsCounted: number;
|
|
6628
|
+
cachedTrials: number;
|
|
6629
|
+
poolBusyMs?: number;
|
|
6630
|
+
poolUtilizationPct?: number;
|
|
6631
|
+
/** Per-generation breakdown, sorted ascending. Empty when generations
|
|
6632
|
+
* weren't supplied to addMutation/addTrial. */
|
|
6633
|
+
byGeneration: CostLedgerGeneration[];
|
|
6634
|
+
}
|
|
6635
|
+
declare class CostLedger {
|
|
6636
|
+
private totals;
|
|
6637
|
+
private readonly path;
|
|
6638
|
+
private readonly mutex;
|
|
6639
|
+
constructor(path: string);
|
|
6640
|
+
private genBucket;
|
|
6641
|
+
addMutation(channel: MutationChannel, usd: number, opts?: {
|
|
6642
|
+
generation?: number;
|
|
6643
|
+
}): Promise<void>;
|
|
6644
|
+
addTrial(channel: MutationChannel, usd: number, cached: boolean, opts?: {
|
|
6645
|
+
generation?: number;
|
|
6646
|
+
}): Promise<void>;
|
|
6647
|
+
setPoolUtilization(busyMs: number, totalMs: number): Promise<void>;
|
|
6648
|
+
snapshot(): CostLedgerSnapshot;
|
|
6649
|
+
private persist;
|
|
6650
|
+
}
|
|
6651
|
+
|
|
6652
|
+
/**
|
|
6653
|
+
* createCompositeMutator — combines two `MutateAdapter<P>`s under a policy.
|
|
6654
|
+
*
|
|
6655
|
+
* primary-only — every generation runs `primary` (typical: a reflective
|
|
6656
|
+
* prompt mutator). The default.
|
|
6657
|
+
* secondary-only — every generation runs `secondary` (typical: a coding
|
|
6658
|
+
* agent that edits the harness itself). Slow + expensive.
|
|
6659
|
+
* alternate — even gens run `primary`, odd gens run `secondary`.
|
|
6660
|
+
* plateau — start with `primary`; switch to a 50/50 split between
|
|
6661
|
+
* `primary` and `secondary` after K gens with less than
|
|
6662
|
+
* Δ improvement (auto-detect when prompt evolution has
|
|
6663
|
+
* hit a structural ceiling).
|
|
6664
|
+
*
|
|
6665
|
+
* Naming is generic: the original audit-bench version called the channels
|
|
6666
|
+
* "prompt" and "code" — those are the canonical use cases, but the
|
|
6667
|
+
* primitive doesn't care what each mutator actually does.
|
|
6668
|
+
*/
|
|
6669
|
+
|
|
6670
|
+
type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
|
|
6671
|
+
interface CreateCompositeMutatorOpts<P> {
|
|
6672
|
+
primary: MutateAdapter<P>;
|
|
6673
|
+
secondary?: MutateAdapter<P>;
|
|
6674
|
+
policy: CompositePolicy;
|
|
6675
|
+
/** For 'plateau': minimum improvement (Δ meanScore) to count as progress. Default 0.02. */
|
|
6676
|
+
plateauThreshold?: number;
|
|
6677
|
+
/** For 'plateau': consecutive gens without progress that trigger split mode. Default 2. */
|
|
6678
|
+
plateauPatience?: number;
|
|
6679
|
+
/** Optional progress hook. */
|
|
6680
|
+
onPolicyDecision?: (info: {
|
|
6681
|
+
generation: number;
|
|
6682
|
+
chose: 'primary' | 'secondary' | 'split';
|
|
6683
|
+
reason: string;
|
|
6684
|
+
}) => void;
|
|
6685
|
+
}
|
|
6686
|
+
declare function createCompositeMutator<P>(opts: CreateCompositeMutatorOpts<P>): MutateAdapter<P>;
|
|
6687
|
+
|
|
6688
|
+
/**
|
|
6689
|
+
* SandboxPool — bounded checkout/release pool for mutation slots.
|
|
6690
|
+
*
|
|
6691
|
+
* The composite-mutator's `code` channel needs an isolated workspace per
|
|
6692
|
+
* mutation attempt: a git worktree, a sandbox container, a tmpdir clone —
|
|
6693
|
+
* whatever the consumer's runtime is. Without a pool, every consumer
|
|
6694
|
+
* re-implements the same machinery (mint N slots, check one out per
|
|
6695
|
+
* mutation, reset before reuse, drain at the end, track utilisation for
|
|
6696
|
+
* the cost ledger). This primitive ships that machinery so consumers
|
|
6697
|
+
* supply only a `SlotFactory`.
|
|
6698
|
+
*
|
|
6699
|
+
* Generic over a slot resource `T` so the same pool serves git worktrees
|
|
6700
|
+
* (T = path), Tangle sandboxes (T = SandboxBox), or anything else with
|
|
6701
|
+
* the create/reset/destroy lifecycle.
|
|
6702
|
+
*
|
|
6703
|
+
* Concurrency: FIFO via the shared `Mutex` primitive. Each `checkout()`
|
|
6704
|
+
* either takes an idle slot or queues until one is released. Lifecycle
|
|
6705
|
+
* is single-process — multi-process pools need external coordination
|
|
6706
|
+
* (file locks, etc.) and are deliberately out of scope.
|
|
6707
|
+
*/
|
|
6708
|
+
interface PoolSlot<T> {
|
|
6709
|
+
/** Stable id assigned at slot creation. Use for telemetry / lineage. */
|
|
6710
|
+
readonly id: string;
|
|
6711
|
+
/** Consumer-defined resource. */
|
|
6712
|
+
readonly resource: T;
|
|
6713
|
+
}
|
|
6714
|
+
interface SlotFactory<T> {
|
|
6715
|
+
/** Build a new slot. Called lazily as the pool grows up to `size`. */
|
|
6716
|
+
create(slotId: string): Promise<T>;
|
|
6717
|
+
/**
|
|
6718
|
+
* Reset a slot to a clean state before reuse. Called BEFORE every
|
|
6719
|
+
* checkout returns it (including the first — so the factory's
|
|
6720
|
+
* `create` can leave the slot dirty and let `reset` normalise).
|
|
6721
|
+
* Optional; default is a no-op.
|
|
6722
|
+
*/
|
|
6723
|
+
reset?(slot: PoolSlot<T>): Promise<void>;
|
|
6724
|
+
/** Tear the slot down. Called by `drain()`. */
|
|
6725
|
+
destroy(slot: PoolSlot<T>): Promise<void>;
|
|
6726
|
+
}
|
|
6727
|
+
interface SandboxPool<T> {
|
|
6728
|
+
/**
|
|
6729
|
+
* Take a slot. If all slots are busy, the promise resolves when one
|
|
6730
|
+
* is released. Always pair with the returned `release` (or wrap with
|
|
6731
|
+
* `withSlot`).
|
|
6732
|
+
*/
|
|
6733
|
+
checkout(): Promise<{
|
|
6734
|
+
slot: PoolSlot<T>;
|
|
6735
|
+
release: () => void;
|
|
6736
|
+
}>;
|
|
6737
|
+
/**
|
|
6738
|
+
* Run `fn` with a checked-out slot, releasing on completion or throw.
|
|
6739
|
+
* The convenience wrapper most callers should use.
|
|
6740
|
+
*/
|
|
6741
|
+
withSlot<R>(fn: (slot: PoolSlot<T>) => Promise<R>): Promise<R>;
|
|
6742
|
+
/** Destroy every slot. Idempotent. */
|
|
6743
|
+
drain(): Promise<void>;
|
|
6744
|
+
/** How many slots have been minted (≤ `size`). */
|
|
6745
|
+
poolSize(): number;
|
|
6746
|
+
/** How many checkouts are currently outstanding. */
|
|
6747
|
+
activeCheckouts(): number;
|
|
6748
|
+
/** Snapshot of busy/total durations for the cost ledger. */
|
|
6749
|
+
utilization(): {
|
|
6750
|
+
busyMs: number;
|
|
6751
|
+
totalMs: number;
|
|
6752
|
+
checkouts: number;
|
|
6753
|
+
};
|
|
6754
|
+
}
|
|
6755
|
+
interface CreateSandboxPoolOpts<T> {
|
|
6756
|
+
/** Maximum concurrent slots. Slots are minted on first need, not eagerly. */
|
|
6757
|
+
size: number;
|
|
6758
|
+
factory: SlotFactory<T>;
|
|
6759
|
+
}
|
|
6760
|
+
declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPool<T>;
|
|
6761
|
+
|
|
6762
|
+
/**
|
|
6763
|
+
* createSandboxCodeMutator — `MutateAdapter<P>` that runs a coding agent
|
|
6764
|
+
* inside a SandboxPool slot to produce code-channel variants.
|
|
6765
|
+
*
|
|
6766
|
+
* Composable shape (matches `reflective-mutation.ts`'s separation of
|
|
6767
|
+
* "build the prompt" from "run the model"):
|
|
6768
|
+
*
|
|
6769
|
+
* pool → where mutations execute (any SlotFactory)
|
|
6770
|
+
* runner → consumer-supplied: invokes the coding agent in a slot,
|
|
6771
|
+
* returns the diff/branch/whatever as `CodeMutationOutcome`s
|
|
6772
|
+
* toVariantPayload → maps outcome → P (consumer encodes the diff their
|
|
6773
|
+
* way — patch string, branch ref, file map, etc)
|
|
6774
|
+
*
|
|
6775
|
+
* What this primitive owns (so consumers don't reinvent it every time):
|
|
6776
|
+
* - Pool checkout / release with reset between attempts
|
|
6777
|
+
* - Per-attempt mutex so a single slot can't be invoked concurrently
|
|
6778
|
+
* - Telemetry write-through (mutations.jsonl, lineage.json,
|
|
6779
|
+
* cost-ledger.json) when sinks are passed
|
|
6780
|
+
* - Stable child-id generation
|
|
6781
|
+
* - Failure capture (every attempt produces either a successful child
|
|
6782
|
+
* or a recorded failure with reason — never a silent drop)
|
|
6783
|
+
*
|
|
6784
|
+
* Consumers stay focused on the actual interesting parts: building the
|
|
6785
|
+
* agent prompt, running the agent, capturing the diff.
|
|
6786
|
+
*/
|
|
6787
|
+
|
|
6788
|
+
/**
|
|
6789
|
+
* Result of one coding-agent invocation. The runner produces 1..N of
|
|
6790
|
+
* these per `runner` call (a single agent session can sometimes
|
|
6791
|
+
* produce multiple sibling diffs cheaply — runner decides).
|
|
6792
|
+
*/
|
|
6793
|
+
interface CodeMutationOutcome {
|
|
6794
|
+
ok: boolean;
|
|
6795
|
+
/** Stable id for the child variant if `ok`. The mutator falls back to
|
|
6796
|
+
* a generated id when omitted. */
|
|
6797
|
+
childId?: string;
|
|
6798
|
+
/** Free-form one-liner: "tightened tool descriptions in forge-tools.ts". */
|
|
6799
|
+
description?: string;
|
|
6800
|
+
/** What the runner was trying to fix (carried into PromptVariant.rationale). */
|
|
6801
|
+
rationale?: string;
|
|
6802
|
+
/** Caller-defined diff payload. Mapped into the variant's payload by
|
|
6803
|
+
* `toVariantPayload`; agent-eval treats it as opaque. */
|
|
6804
|
+
artifact?: unknown;
|
|
6805
|
+
/** When ok === false. Free-form: 'parse_failure' / 'agent_error' /
|
|
6806
|
+
* 'no_changes' / 'commit_failure' / etc. */
|
|
6807
|
+
failureReason?: string;
|
|
6808
|
+
/** Telemetry stats. */
|
|
6809
|
+
diffBytes?: number;
|
|
6810
|
+
filesTouched?: number;
|
|
6811
|
+
agentSteps?: number;
|
|
6812
|
+
costUsd?: number;
|
|
6813
|
+
latencyMs: number;
|
|
6814
|
+
}
|
|
6815
|
+
type CodeMutationRunner<T, P> = (args: {
|
|
6816
|
+
slot: PoolSlot<T>;
|
|
6817
|
+
parent: PromptVariant<P>;
|
|
6818
|
+
parentAggregate: VariantAggregate;
|
|
6819
|
+
topTrials: TrialResult[];
|
|
6820
|
+
bottomTrials: TrialResult[];
|
|
6821
|
+
childCount: number;
|
|
6822
|
+
generation: number;
|
|
6823
|
+
}) => Promise<CodeMutationOutcome[]>;
|
|
6824
|
+
interface CreateSandboxCodeMutatorOpts<T, P> {
|
|
6825
|
+
pool: SandboxPool<T>;
|
|
6826
|
+
runner: CodeMutationRunner<T, P>;
|
|
6827
|
+
/**
|
|
6828
|
+
* Map an outcome into the variant payload `P`. Lets the consumer
|
|
6829
|
+
* encode the diff however they want (file map, patch string, branch
|
|
6830
|
+
* ref, snapshot id) without agent-eval taking a stance.
|
|
6831
|
+
*/
|
|
6832
|
+
toVariantPayload(outcome: CodeMutationOutcome, parent: PromptVariant<P>): P;
|
|
6833
|
+
/** Optional telemetry sinks. */
|
|
6834
|
+
mutationTelemetry?: MutationTelemetry;
|
|
6835
|
+
costLedger?: CostLedger;
|
|
6836
|
+
lineage?: LineageRecorder<P>;
|
|
6837
|
+
/** Override id generation. Default: `${parent.id}.g${generation}.code.${i}`. */
|
|
6838
|
+
childIdFor?(parent: PromptVariant<P>, generation: number, index: number): string;
|
|
6839
|
+
/** Default label for the variant (visible in reports). */
|
|
6840
|
+
labelFor?(outcome: CodeMutationOutcome, parent: PromptVariant<P>, generation: number, index: number): string;
|
|
6841
|
+
}
|
|
6842
|
+
declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOpts<T, P>): MutateAdapter<P>;
|
|
6843
|
+
|
|
6297
6844
|
/**
|
|
6298
6845
|
* GoldenMatcher — fuzzy matcher for "did the agent produce the expected things?".
|
|
6299
6846
|
*
|
|
@@ -6564,4 +7111,4 @@ interface ReflectionProposal {
|
|
|
6564
7111
|
*/
|
|
6565
7112
|
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
6566
7113
|
|
|
6567
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type 
DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type 
HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, 
type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type 
TrialCache, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, 
firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, 
signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
7114
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type 
D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type 
HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, OTEL_AGENT_EVAL_SCOPE, type Objective, type 
OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type 
ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type 
StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, 
createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, 
referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|