@tangle-network/agent-eval 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -0
- package/dist/index.d.ts +449 -1
- package/dist/index.js +633 -8
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -78,8 +78,96 @@ The recipe for a code-generator eval is in [`SKILL.md` §Minimal working path](.
|
|
|
78
78
|
| `clients/python/` | First-party Python client (`tangle-agent-eval` on PyPI). Version-locked to npm. | clients/python/README.md |
|
|
79
79
|
| `BenchmarkRunner`, `executeScenario`, `ConvergenceTracker` | Multi-turn scenario execution + cross-run tracking. | SKILL.md |
|
|
80
80
|
| `ExperimentTracker`, `PromptOptimizer`, `bisector` | A/B prompts, optimize steering, bisect regressions. | SKILL.md |
|
|
81
|
+
| `runPromptEvolution`, `createCompositeMutator`, `createSandboxPool`, `createSandboxCodeMutator`, `MutationTelemetry`, `LineageRecorder`, `CostLedger`, `JsonlTrialCache` | Prompt + code evolution loops with bounded sandbox pools, durable JSONL telemetry, plateau-detecting composite mutators, crash-resumable trial cache. | §Evolution loop |
|
|
82
|
+
| `reflective-mutation` (`buildReflectionPrompt`, `parseReflectionResponse`, `DEFAULT_MUTATION_PRIMITIVES`) | Trace-conditioned LLM mutator that reasons over top/bottom trials instead of blind rewrites. | inline JSDoc |
|
|
83
|
+
| `correlationStudy`, `OutcomeStore`, `ProductRegistry` | Meta-eval: do our scores predict deployment outcomes (revenue, retention)? | inline JSDoc |
|
|
81
84
|
| Telemetry (`telemetry/`, `telemetry/file`) | OTLP export, trace replay, file sinks. | inline JSDoc |
|
|
82
85
|
|
|
86
|
+
## Evolution loop
|
|
87
|
+
|
|
88
|
+
Closing the loop on a prompt or codebase takes **two adapters + a config**. Compose `runPromptEvolution` with `createCompositeMutator` (plateau policy) and you get prompt-only optimization until improvement stalls, then an automatic switch to a 50/50 split between prompt mutations and code-channel mutations produced by a coding agent inside a `SandboxPool`.
|
|
89
|
+
|
|
90
|
+
```ts
|
|
91
|
+
import {
|
|
92
|
+
createSandboxPool,
|
|
93
|
+
createSandboxCodeMutator,
|
|
94
|
+
createCompositeMutator,
|
|
95
|
+
buildReflectionPrompt,
|
|
96
|
+
parseReflectionResponse,
|
|
97
|
+
runPromptEvolution,
|
|
98
|
+
MutationTelemetry,
|
|
99
|
+
LineageRecorder,
|
|
100
|
+
CostLedger,
|
|
101
|
+
JsonlTrialCache,
|
|
102
|
+
} from '@tangle-network/agent-eval'
|
|
103
|
+
|
|
104
|
+
// 1. Prompt mutator — reflective-mutation reasons over top/bottom trials
|
|
105
|
+
const promptMutator = {
|
|
106
|
+
async mutate({ parent, topTrials, bottomTrials, childCount }) {
|
|
107
|
+
const ctx = { target: 'forge-prompt', parentPayload: parent.payload, topTrials, bottomTrials, childCount }
|
|
108
|
+
const reflection = buildReflectionPrompt(ctx)
|
|
109
|
+
const raw = await yourLlm(reflection)
|
|
110
|
+
return parseReflectionResponse(raw, childCount).map((p, i) => ({
|
|
111
|
+
id: `${parent.id}.g${parent.generation + 1}.prompt.${i}`,
|
|
112
|
+
payload: p.payload,
|
|
113
|
+
generation: parent.generation + 1,
|
|
114
|
+
parentId: parent.id,
|
|
115
|
+
label: p.label,
|
|
116
|
+
rationale: p.rationale,
|
|
117
|
+
}))
|
|
118
|
+
},
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// 2. Code mutator — runs a coding agent in a sandbox slot, captures the diff
|
|
122
|
+
const pool = createSandboxPool({
|
|
123
|
+
size: 4,
|
|
124
|
+
factory: {
|
|
125
|
+
async create(id) { return await yourSandboxClient.create({ name: id }) },
|
|
126
|
+
async reset(slot) { await slot.resource.exec('git reset --hard origin/main && git clean -fd') },
|
|
127
|
+
async destroy(slot) { await slot.resource.delete() },
|
|
128
|
+
},
|
|
129
|
+
})
|
|
130
|
+
const codeMutator = createSandboxCodeMutator({
|
|
131
|
+
pool,
|
|
132
|
+
runner: async ({ slot, parent, topTrials, bottomTrials }) => {
|
|
133
|
+
const result = await slot.resource.task(`Improve the prompt at /repo/forge-prompt.ts...`)
|
|
134
|
+
return [{ ok: true, latencyMs: result.durationMs, costUsd: result.costUsd, artifact: { diff: result.diff } }]
|
|
135
|
+
},
|
|
136
|
+
toVariantPayload: (outcome, parent) => ({ ...parent.payload, codeMutation: outcome.artifact }),
|
|
137
|
+
})
|
|
138
|
+
|
|
139
|
+
// 3. Compose — plateau policy auto-switches when prompt evolution stalls
|
|
140
|
+
const composite = createCompositeMutator({
|
|
141
|
+
primary: promptMutator,
|
|
142
|
+
secondary: codeMutator,
|
|
143
|
+
policy: 'plateau',
|
|
144
|
+
plateauThreshold: 0.02,
|
|
145
|
+
plateauPatience: 2,
|
|
146
|
+
})
|
|
147
|
+
|
|
148
|
+
// 4. Run — durable telemetry to disk, crash-resumable
|
|
149
|
+
const result = await runPromptEvolution({
|
|
150
|
+
runId: `forge_${Date.now()}`,
|
|
151
|
+
target: 'forge-prompt',
|
|
152
|
+
seedVariants: [{ id: 'v0', payload: { text: currentPrompt }, generation: 0, label: 'baseline' }],
|
|
153
|
+
scenarioIds: referenceCorpus.map(s => s.id),
|
|
154
|
+
reps: 3,
|
|
155
|
+
generations: 5,
|
|
156
|
+
populationSize: 4,
|
|
157
|
+
scoreAdapter: { /* runs your eval against (variant, scenario, rep) */ },
|
|
158
|
+
mutateAdapter: composite,
|
|
159
|
+
cache: new JsonlTrialCache('.evolve/cache.jsonl'),
|
|
160
|
+
objectives: [
|
|
161
|
+
{ name: 'score', direction: 'maximize', value: a => a.meanScore },
|
|
162
|
+
{ name: 'cost', direction: 'minimize', value: a => a.meanCost },
|
|
163
|
+
],
|
|
164
|
+
})
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
`MutationTelemetry`, `LineageRecorder`, and `CostLedger` can be passed to `createSandboxCodeMutator` via its optional `mutationTelemetry`, `lineage`, and `costLedger` options (or used directly by any consumer that wants them). They record every mutation attempt (success, or failure with reason) as append-only JSONL, plus the variant lineage tree and running cost totals, so a finished run leaves a forensically complete trail under one directory.
|
|
168
|
+
|
|
169
|
+
For the full primitive surface and rationale, read each module's JSDoc — `prompt-evolution.ts`, `composite-mutator.ts`, `sandbox-pool.ts`, `code-mutator.ts`, `reflective-mutation.ts`, `evolution-telemetry.ts`.
|
|
170
|
+
|
|
83
171
|
## Tech stack
|
|
84
172
|
|
|
85
173
|
- TypeScript strict, no semicolons, single quotes, 2-space indent
|
package/dist/index.d.ts
CHANGED
|
@@ -6402,6 +6402,454 @@ interface PromptEvolutionResult<P = unknown> {
|
|
|
6402
6402
|
}
|
|
6403
6403
|
declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
|
|
6404
6404
|
|
|
6405
|
+
/**
|
|
6406
|
+
* concurrency — small primitives the evolution loop needs.
|
|
6407
|
+
*
|
|
6408
|
+
* `Mutex` is a zero-dep async lock with FIFO fairness. The evolution loop
|
|
6409
|
+
* uses it to serialise checkout/build/commit sequences inside a single
|
|
6410
|
+
* pool slot, and to gate concurrent JSONL writers (see
|
|
6411
|
+
* `lockedJsonlReferenceReplayStore`).
|
|
6412
|
+
*
|
|
6413
|
+
* Deliberately minimal — no priority queue, no timeouts. If you need
|
|
6414
|
+
* those, swap to `async-mutex` at the call site.
|
|
6415
|
+
*/
|
|
6416
|
+
declare class Mutex {
|
|
6417
|
+
private locked;
|
|
6418
|
+
private readonly waiters;
|
|
6419
|
+
acquire(): Promise<() => void>;
|
|
6420
|
+
private release;
|
|
6421
|
+
runExclusive<T>(fn: () => Promise<T> | T): Promise<T>;
|
|
6422
|
+
/** True iff someone holds the lock right now. Diagnostics only. */
|
|
6423
|
+
get isLocked(): boolean;
|
|
6424
|
+
/** Pending waiter count. Diagnostics only. */
|
|
6425
|
+
get pending(): number;
|
|
6426
|
+
}
|
|
6427
|
+
|
|
6428
|
+
/**
|
|
6429
|
+
* JsonlTrialCache — `TrialCache` backed by a JSONL append-only file so a
|
|
6430
|
+
* crashed `runPromptEvolution` can resume without re-running expensive
|
|
6431
|
+
* trials. Last write wins on key collision; the file is forward-swept at
|
|
6432
|
+
* construction.
|
|
6433
|
+
*
|
|
6434
|
+
* Tail corruption (partial line at the bottom from a hard kill) is
|
|
6435
|
+
* tolerated — we skip unparseable lines and continue.
|
|
6436
|
+
*
|
|
6437
|
+
* The cache surface (`get` / `set`) is synchronous because `TrialCache`
|
|
6438
|
+
* is. Writes are mutex-serialised through a `LockedJsonlAppender`
|
|
6439
|
+
* (kicked off with `void`) so two in-process callers can't tear a long
|
|
6440
|
+
* line that exceeds POSIX `PIPE_BUF`. Cross-process safety still
|
|
6441
|
+
* requires fcntl/flock and is deliberately out of scope.
|
|
6442
|
+
*/
|
|
6443
|
+
|
|
6444
|
+
declare class JsonlTrialCache implements TrialCache {
|
|
6445
|
+
private readonly map;
|
|
6446
|
+
private readonly path;
|
|
6447
|
+
private readonly appender;
|
|
6448
|
+
constructor(path: string);
|
|
6449
|
+
get(key: string): TrialResult | undefined;
|
|
6450
|
+
set(key: string, value: TrialResult): void;
|
|
6451
|
+
size(): number;
|
|
6452
|
+
/**
|
|
6453
|
+
* Synchronous fallback path for tests / CLI tools that want to be sure
|
|
6454
|
+
* the line is on disk before returning. Bypasses the mutex (single-
|
|
6455
|
+
* threaded callers only).
|
|
6456
|
+
*/
|
|
6457
|
+
setSync(key: string, value: TrialResult): void;
|
|
6458
|
+
}
|
|
6459
|
+
|
|
6460
|
+
/**
|
|
6461
|
+
* LockedJsonlAppender — mutex-serialized JSONL append helper for arbitrary
|
|
6462
|
+
* payloads. The reference-replay store does the same thing for typed
|
|
6463
|
+
* `ReferenceReplayRun` rows; this is the generic version used by
|
|
6464
|
+
* `MutationTelemetry`, `TrialTelemetry`, and any other consumer that wants
|
|
6465
|
+
* append-only durable telemetry without rolling its own lock.
|
|
6466
|
+
*
|
|
6467
|
+
* Locks are per absolute file path (process-local). Cross-process
|
|
6468
|
+
* concurrency is NOT addressed — that's an fcntl/flock problem.
|
|
6469
|
+
*/
|
|
6470
|
+
declare class LockedJsonlAppender {
|
|
6471
|
+
readonly path: string;
|
|
6472
|
+
private readonly mutex;
|
|
6473
|
+
constructor(path: string);
|
|
6474
|
+
append(entry: unknown): Promise<void>;
|
|
6475
|
+
}
|
|
6476
|
+
/** Reset all internal mutex state — tests only. */
|
|
6477
|
+
declare function resetLockedAppendersForTesting(): void;
|
|
6478
|
+
|
|
6479
|
+
/**
|
|
6480
|
+
* evolution-telemetry — durable JSONL/JSON sinks for the evolution loop.
|
|
6481
|
+
*
|
|
6482
|
+
* `runPromptEvolution` exposes generation-level events but doesn't persist
|
|
6483
|
+
* the per-mutation, per-trial, lineage, or cost breakdown. These four
|
|
6484
|
+
* sinks fill that gap so a finished autoresearch run leaves a forensically
|
|
6485
|
+
* complete trail under one directory:
|
|
6486
|
+
*
|
|
6487
|
+
* - `mutations.jsonl` — every mutate attempt (success + failure) with
|
|
6488
|
+
* latency, agent steps, diff stats, cost.
|
|
6489
|
+
* - `trials.jsonl` — every TrialResult including cache hits, with
|
|
6490
|
+
* provenance (channel, runtime slot, generation).
|
|
6491
|
+
* - `lineage.json` — variant tree {id → {parent, generation, kind, …}},
|
|
6492
|
+
* incremental upsert.
|
|
6493
|
+
* - `cost-ledger.json` — running $ totals per source (mutator-prompt,
|
|
6494
|
+
* mutator-code, scorer-prompt, scorer-code) plus pool utilisation.
|
|
6495
|
+
*
|
|
6496
|
+
* All writes are mutex-serialised. The append-only sinks (mutations,
|
|
6497
|
+
 * trials) survive a hard kill; the cost-ledger snapshot is rewritten on every
|
|
6498
|
+
 * update so the latest state is always on disk. Lineage is an append-only event log with an optional `compact()` snapshot (see `LineageRecorder`).
|
|
6499
|
+
*
|
|
6500
|
+
* Generic over a payload P so any consumer of `runPromptEvolution<P>` can
|
|
6501
|
+
* record lineage without leaking domain types.
|
|
6502
|
+
*/
|
|
6503
|
+
|
|
6504
|
+
type MutationChannel = 'prompt' | 'code';
|
|
6505
|
+
interface MutationAttempt {
|
|
6506
|
+
ts: number;
|
|
6507
|
+
channel: MutationChannel;
|
|
6508
|
+
generation: number;
|
|
6509
|
+
parentId: string;
|
|
6510
|
+
/** Successful child variant id, or null if the attempt failed. */
|
|
6511
|
+
childId: string | null;
|
|
6512
|
+
ok: boolean;
|
|
6513
|
+
/**
|
|
6514
|
+
* One of: 'parse_failure' | 'typecheck_failure' | 'no_changes' |
|
|
6515
|
+
* 'agent_error' | 'commit_failure' | 'no_api_key' | 'no_valid_proposals'
|
|
6516
|
+
* | 'reproduce_parent_failed' | 'branch_failed' | 'other'.
|
|
6517
|
+
* Free-form to allow consumer-specific reasons.
|
|
6518
|
+
*/
|
|
6519
|
+
failureReason?: string;
|
|
6520
|
+
/** Free-form description of what the agent said it did. */
|
|
6521
|
+
description?: string;
|
|
6522
|
+
/** Latency of the LLM call (ms). */
|
|
6523
|
+
latencyMs: number;
|
|
6524
|
+
/** Bytes of generated diff (code channel only). */
|
|
6525
|
+
diffBytes?: number;
|
|
6526
|
+
/** Files touched (code channel only). */
|
|
6527
|
+
filesTouched?: number;
|
|
6528
|
+
/** Steps the agent ran (tool calls). */
|
|
6529
|
+
agentSteps?: number;
|
|
6530
|
+
/** Approx $ spent on this mutation (LLM tokens). */
|
|
6531
|
+
costUsd?: number;
|
|
6532
|
+
/** Runtime slot used (code channel only). */
|
|
6533
|
+
runtimeSandboxId?: string;
|
|
6534
|
+
}
|
|
6535
|
+
declare class MutationTelemetry {
|
|
6536
|
+
private readonly appender;
|
|
6537
|
+
constructor(path: string);
|
|
6538
|
+
record(attempt: MutationAttempt): Promise<void>;
|
|
6539
|
+
}
|
|
6540
|
+
interface TrialAttempt {
|
|
6541
|
+
ts: number;
|
|
6542
|
+
channel: MutationChannel;
|
|
6543
|
+
generation: number;
|
|
6544
|
+
variantId: string;
|
|
6545
|
+
scenarioId: string;
|
|
6546
|
+
rep: number;
|
|
6547
|
+
ok: boolean;
|
|
6548
|
+
score: number;
|
|
6549
|
+
costUsd: number;
|
|
6550
|
+
durationMs: number;
|
|
6551
|
+
cached: boolean;
|
|
6552
|
+
runtimeSandboxId?: string;
|
|
6553
|
+
error?: string;
|
|
6554
|
+
metrics?: Record<string, number>;
|
|
6555
|
+
}
|
|
6556
|
+
declare class TrialTelemetry {
|
|
6557
|
+
private readonly appender;
|
|
6558
|
+
constructor(path: string);
|
|
6559
|
+
record(attempt: TrialAttempt): Promise<void>;
|
|
6560
|
+
}
|
|
6561
|
+
type LineageKind = 'seed' | 'prompt' | 'code';
|
|
6562
|
+
interface LineageNode {
|
|
6563
|
+
id: string;
|
|
6564
|
+
parentId: string | null;
|
|
6565
|
+
generation: number;
|
|
6566
|
+
kind: LineageKind;
|
|
6567
|
+
rationale?: string;
|
|
6568
|
+
/** Filled when scoring lands. */
|
|
6569
|
+
meanScore?: number;
|
|
6570
|
+
promotedToFrontier?: boolean;
|
|
6571
|
+
/**
|
|
6572
|
+
* The variant payload (e.g. evolved persona text, code mutation diff).
|
|
6573
|
+
* Persisted so a winning variant can be reproduced after a run completes
|
|
6574
|
+
* without re-running the optimizer. Optional — pass `omitPayload: true` to
|
|
6575
|
+
* `upsertVariant` for cases where the payload is too large to log.
|
|
6576
|
+
*/
|
|
6577
|
+
payload?: unknown;
|
|
6578
|
+
}
|
|
6579
|
+
/**
|
|
6580
|
+
* `kindOf` decides whether a variant is a seed (no parent), code mutation,
|
|
6581
|
+
* or prompt mutation. Default looks at `variant.payload.codeMutation` —
|
|
6582
|
+
* that field is part of the audit-bench convention but cheap enough to
|
|
6583
|
+
* accept any payload that mirrors it. Override by passing your own.
|
|
6584
|
+
*/
|
|
6585
|
+
type LineageKindResolver<P> = (variant: PromptVariant<P>) => LineageKind;
|
|
6586
|
+
/**
|
|
6587
|
+
* Persistence shape:
|
|
6588
|
+
*
|
|
6589
|
+
* `<path>` — JSONL of upserts (event log). Each line is a
|
|
6590
|
+
* partial node; replay folds them into the current
|
|
6591
|
+
* state. Append-only, so cost is O(1) per upsert
|
|
6592
|
+
* instead of the previous O(n²) full rewrite.
|
|
6593
|
+
* `<path>.snapshot` — Optional consolidated snapshot, written on
|
|
6594
|
+
* demand via `compact()` (e.g. at end of run).
|
|
6595
|
+
* Read by external tools that don't want to
|
|
6596
|
+
* replay the log.
|
|
6597
|
+
*
|
|
6598
|
+
* Loaded at construction time: if `<path>.snapshot` exists, parse it
|
|
6599
|
+
* first; then replay any newer log lines on top. Falls back to log-only
|
|
6600
|
+
* when no snapshot is present.
|
|
6601
|
+
*/
|
|
6602
|
+
declare class LineageRecorder<P = unknown> {
|
|
6603
|
+
private readonly path;
|
|
6604
|
+
private readonly snapshotPath;
|
|
6605
|
+
private readonly mutex;
|
|
6606
|
+
private readonly nodes;
|
|
6607
|
+
private readonly kindOf;
|
|
6608
|
+
constructor(path: string, kindOf?: LineageKindResolver<P>);
|
|
6609
|
+
upsert(node: LineageNode): Promise<void>;
|
|
6610
|
+
upsertVariant(variant: PromptVariant<P>, opts?: {
|
|
6611
|
+
omitPayload?: boolean;
|
|
6612
|
+
}): Promise<void>;
|
|
6613
|
+
snapshot(): LineageNode[];
|
|
6614
|
+
/**
|
|
6615
|
+
* Write the current consolidated state to `<path>.snapshot` so external
|
|
6616
|
+
* tools can read it without replaying the event log. Idempotent.
|
|
6617
|
+
*/
|
|
6618
|
+
compact(): Promise<void>;
|
|
6619
|
+
}
|
|
6620
|
+
/** Per-generation cost rollup. Same shape as the totals, scoped to one gen. */
|
|
6621
|
+
interface CostLedgerGeneration {
|
|
6622
|
+
generation: number;
|
|
6623
|
+
mutatorPromptUsd: number;
|
|
6624
|
+
mutatorCodeUsd: number;
|
|
6625
|
+
scorerPromptUsd: number;
|
|
6626
|
+
scorerCodeUsd: number;
|
|
6627
|
+
trialsCounted: number;
|
|
6628
|
+
cachedTrials: number;
|
|
6629
|
+
}
|
|
6630
|
+
interface CostLedgerSnapshot {
|
|
6631
|
+
totalUsd: number;
|
|
6632
|
+
mutatorPromptUsd: number;
|
|
6633
|
+
mutatorCodeUsd: number;
|
|
6634
|
+
scorerPromptUsd: number;
|
|
6635
|
+
scorerCodeUsd: number;
|
|
6636
|
+
trialsCounted: number;
|
|
6637
|
+
cachedTrials: number;
|
|
6638
|
+
poolBusyMs?: number;
|
|
6639
|
+
poolUtilizationPct?: number;
|
|
6640
|
+
/** Per-generation breakdown, sorted ascending. Empty when generations
|
|
6641
|
+
* weren't supplied to addMutation/addTrial. */
|
|
6642
|
+
byGeneration: CostLedgerGeneration[];
|
|
6643
|
+
}
|
|
6644
|
+
declare class CostLedger {
|
|
6645
|
+
private totals;
|
|
6646
|
+
private readonly path;
|
|
6647
|
+
private readonly mutex;
|
|
6648
|
+
constructor(path: string);
|
|
6649
|
+
private genBucket;
|
|
6650
|
+
addMutation(channel: MutationChannel, usd: number, opts?: {
|
|
6651
|
+
generation?: number;
|
|
6652
|
+
}): Promise<void>;
|
|
6653
|
+
addTrial(channel: MutationChannel, usd: number, cached: boolean, opts?: {
|
|
6654
|
+
generation?: number;
|
|
6655
|
+
}): Promise<void>;
|
|
6656
|
+
setPoolUtilization(busyMs: number, totalMs: number): Promise<void>;
|
|
6657
|
+
snapshot(): CostLedgerSnapshot;
|
|
6658
|
+
private persist;
|
|
6659
|
+
}
|
|
6660
|
+
|
|
6661
|
+
/**
|
|
6662
|
+
* createCompositeMutator — combines two `MutateAdapter<P>`s under a policy.
|
|
6663
|
+
*
|
|
6664
|
+
 * primary-only — every generation runs `primary` (typical: a reflective
|
|
6665
|
+
* prompt mutator). The default.
|
|
6666
|
+
* secondary-only — every generation runs `secondary` (typical: a coding
|
|
6667
|
+
* agent that edits the harness itself). Slow + expensive.
|
|
6668
|
+
* alternate — even gens run `primary`, odd gens run `secondary`.
|
|
6669
|
+
* plateau — start with `primary`; switch to a 50/50 split between
|
|
6670
|
+
* `primary` and `secondary` after K gens with less than
|
|
6671
|
+
* Δ improvement (auto-detect when prompt evolution has
|
|
6672
|
+
* hit a structural ceiling).
|
|
6673
|
+
*
|
|
6674
|
+
* Naming is generic: the original audit-bench version called the channels
|
|
6675
|
+
* "prompt" and "code" — those are the canonical use cases, but the
|
|
6676
|
+
* primitive doesn't care what each mutator actually does.
|
|
6677
|
+
*/
|
|
6678
|
+
|
|
6679
|
+
type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
|
|
6680
|
+
interface CreateCompositeMutatorOpts<P> {
|
|
6681
|
+
primary: MutateAdapter<P>;
|
|
6682
|
+
secondary?: MutateAdapter<P>;
|
|
6683
|
+
policy: CompositePolicy;
|
|
6684
|
+
/** For 'plateau': minimum improvement (Δ meanScore) to count as progress. Default 0.02. */
|
|
6685
|
+
plateauThreshold?: number;
|
|
6686
|
+
/** For 'plateau': consecutive gens without progress that trigger split mode. Default 2. */
|
|
6687
|
+
plateauPatience?: number;
|
|
6688
|
+
/** Optional progress hook. */
|
|
6689
|
+
onPolicyDecision?: (info: {
|
|
6690
|
+
generation: number;
|
|
6691
|
+
chose: 'primary' | 'secondary' | 'split';
|
|
6692
|
+
reason: string;
|
|
6693
|
+
}) => void;
|
|
6694
|
+
}
|
|
6695
|
+
declare function createCompositeMutator<P>(opts: CreateCompositeMutatorOpts<P>): MutateAdapter<P>;
|
|
6696
|
+
|
|
6697
|
+
/**
|
|
6698
|
+
* SandboxPool — bounded checkout/release pool for mutation slots.
|
|
6699
|
+
*
|
|
6700
|
+
* The composite-mutator's `code` channel needs an isolated workspace per
|
|
6701
|
+
* mutation attempt: a git worktree, a sandbox container, a tmpdir clone —
|
|
6702
|
+
* whatever the consumer's runtime is. Without a pool, every consumer
|
|
6703
|
+
* re-implements the same machinery (mint N slots, check one out per
|
|
6704
|
+
* mutation, reset before reuse, drain at the end, track utilisation for
|
|
6705
|
+
* the cost ledger). This primitive ships that machinery so consumers
|
|
6706
|
+
* supply only a `SlotFactory`.
|
|
6707
|
+
*
|
|
6708
|
+
* Generic over a slot resource `T` so the same pool serves git worktrees
|
|
6709
|
+
* (T = path), Tangle sandboxes (T = SandboxBox), or anything else with
|
|
6710
|
+
* the create/reset/destroy lifecycle.
|
|
6711
|
+
*
|
|
6712
|
+
* Concurrency: FIFO via the shared `Mutex` primitive. Each `checkout()`
|
|
6713
|
+
* either takes an idle slot or queues until one is released. Lifecycle
|
|
6714
|
+
* is single-process — multi-process pools need external coordination
|
|
6715
|
+
* (file locks, etc.) and are deliberately out of scope.
|
|
6716
|
+
*/
|
|
6717
|
+
interface PoolSlot<T> {
|
|
6718
|
+
/** Stable id assigned at slot creation. Use for telemetry / lineage. */
|
|
6719
|
+
readonly id: string;
|
|
6720
|
+
/** Consumer-defined resource. */
|
|
6721
|
+
readonly resource: T;
|
|
6722
|
+
}
|
|
6723
|
+
interface SlotFactory<T> {
|
|
6724
|
+
/** Build a new slot. Called lazily as the pool grows up to `size`. */
|
|
6725
|
+
create(slotId: string): Promise<T>;
|
|
6726
|
+
/**
|
|
6727
|
+
* Reset a slot to a clean state before reuse. Called BEFORE every
|
|
6728
|
+
* checkout returns it (including the first — so the factory's
|
|
6729
|
+
* `create` can leave the slot dirty and let `reset` normalise).
|
|
6730
|
+
* Optional; default is a no-op.
|
|
6731
|
+
*/
|
|
6732
|
+
reset?(slot: PoolSlot<T>): Promise<void>;
|
|
6733
|
+
/** Tear the slot down. Called by `drain()`. */
|
|
6734
|
+
destroy(slot: PoolSlot<T>): Promise<void>;
|
|
6735
|
+
}
|
|
6736
|
+
interface SandboxPool<T> {
|
|
6737
|
+
/**
|
|
6738
|
+
* Take a slot. If all slots are busy, the promise resolves when one
|
|
6739
|
+
* is released. Always pair with the returned `release` (or wrap with
|
|
6740
|
+
* `withSlot`).
|
|
6741
|
+
*/
|
|
6742
|
+
checkout(): Promise<{
|
|
6743
|
+
slot: PoolSlot<T>;
|
|
6744
|
+
release: () => void;
|
|
6745
|
+
}>;
|
|
6746
|
+
/**
|
|
6747
|
+
* Run `fn` with a checked-out slot, releasing on completion or throw.
|
|
6748
|
+
* The convenience wrapper most callers should use.
|
|
6749
|
+
*/
|
|
6750
|
+
withSlot<R>(fn: (slot: PoolSlot<T>) => Promise<R>): Promise<R>;
|
|
6751
|
+
/** Destroy every slot. Idempotent. */
|
|
6752
|
+
drain(): Promise<void>;
|
|
6753
|
+
/** How many slots have been minted (≤ `size`). */
|
|
6754
|
+
poolSize(): number;
|
|
6755
|
+
/** How many checkouts are currently outstanding. */
|
|
6756
|
+
activeCheckouts(): number;
|
|
6757
|
+
/** Snapshot of busy/total durations for the cost ledger. */
|
|
6758
|
+
utilization(): {
|
|
6759
|
+
busyMs: number;
|
|
6760
|
+
totalMs: number;
|
|
6761
|
+
checkouts: number;
|
|
6762
|
+
};
|
|
6763
|
+
}
|
|
6764
|
+
interface CreateSandboxPoolOpts<T> {
|
|
6765
|
+
/** Maximum concurrent slots. Slots are minted on first need, not eagerly. */
|
|
6766
|
+
size: number;
|
|
6767
|
+
factory: SlotFactory<T>;
|
|
6768
|
+
}
|
|
6769
|
+
declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPool<T>;
|
|
6770
|
+
|
|
6771
|
+
/**
|
|
6772
|
+
* createSandboxCodeMutator — `MutateAdapter<P>` that runs a coding agent
|
|
6773
|
+
* inside a SandboxPool slot to produce code-channel variants.
|
|
6774
|
+
*
|
|
6775
|
+
* Composable shape (matches `reflective-mutation.ts`'s separation of
|
|
6776
|
+
* "build the prompt" from "run the model"):
|
|
6777
|
+
*
|
|
6778
|
+
* pool → where mutations execute (any SlotFactory)
|
|
6779
|
+
* runner → consumer-supplied: invokes the coding agent in a slot,
|
|
6780
|
+
* returns the diff/branch/whatever as `CodeMutationOutcome`s
|
|
6781
|
+
* toVariantPayload → maps outcome → P (consumer encodes the diff their
|
|
6782
|
+
* way — patch string, branch ref, file map, etc)
|
|
6783
|
+
*
|
|
6784
|
+
* What this primitive owns (so consumers don't reinvent it every time):
|
|
6785
|
+
* - Pool checkout / release with reset between attempts
|
|
6786
|
+
* - Per-attempt mutex so a single slot can't be invoked concurrently
|
|
6787
|
+
* - Telemetry write-through (mutations.jsonl, lineage.json,
|
|
6788
|
+
* cost-ledger.json) when sinks are passed
|
|
6789
|
+
* - Stable child-id generation
|
|
6790
|
+
* - Failure capture (every attempt produces either a successful child
|
|
6791
|
+
* or a recorded failure with reason — never a silent drop)
|
|
6792
|
+
*
|
|
6793
|
+
* Consumers stay focused on the actual interesting parts: building the
|
|
6794
|
+
* agent prompt, running the agent, capturing the diff.
|
|
6795
|
+
*/
|
|
6796
|
+
|
|
6797
|
+
/**
|
|
6798
|
+
* Result of one coding-agent invocation. The runner produces 1..N of
|
|
6799
|
+
* these per `runner` call (a single agent session can sometimes
|
|
6800
|
+
* produce multiple sibling diffs cheaply — runner decides).
|
|
6801
|
+
*/
|
|
6802
|
+
interface CodeMutationOutcome {
|
|
6803
|
+
ok: boolean;
|
|
6804
|
+
/** Stable id for the child variant if `ok`. The mutator falls back to
|
|
6805
|
+
* a generated id when omitted. */
|
|
6806
|
+
childId?: string;
|
|
6807
|
+
/** Free-form one-liner: "tightened tool descriptions in forge-tools.ts". */
|
|
6808
|
+
description?: string;
|
|
6809
|
+
/** What the runner was trying to fix (carried into PromptVariant.rationale). */
|
|
6810
|
+
rationale?: string;
|
|
6811
|
+
/** Caller-defined diff payload. Mapped into the variant's payload by
|
|
6812
|
+
* `toVariantPayload`; agent-eval treats it as opaque. */
|
|
6813
|
+
artifact?: unknown;
|
|
6814
|
+
/** When ok === false. Free-form: 'parse_failure' / 'agent_error' /
|
|
6815
|
+
 * 'no_changes' / 'commit_failure' / etc. */
|
|
6816
|
+
failureReason?: string;
|
|
6817
|
+
/** Telemetry stats. */
|
|
6818
|
+
diffBytes?: number;
|
|
6819
|
+
filesTouched?: number;
|
|
6820
|
+
agentSteps?: number;
|
|
6821
|
+
costUsd?: number;
|
|
6822
|
+
latencyMs: number;
|
|
6823
|
+
}
|
|
6824
|
+
type CodeMutationRunner<T, P> = (args: {
|
|
6825
|
+
slot: PoolSlot<T>;
|
|
6826
|
+
parent: PromptVariant<P>;
|
|
6827
|
+
parentAggregate: VariantAggregate;
|
|
6828
|
+
topTrials: TrialResult[];
|
|
6829
|
+
bottomTrials: TrialResult[];
|
|
6830
|
+
childCount: number;
|
|
6831
|
+
generation: number;
|
|
6832
|
+
}) => Promise<CodeMutationOutcome[]>;
|
|
6833
|
+
interface CreateSandboxCodeMutatorOpts<T, P> {
|
|
6834
|
+
pool: SandboxPool<T>;
|
|
6835
|
+
runner: CodeMutationRunner<T, P>;
|
|
6836
|
+
/**
|
|
6837
|
+
* Map an outcome into the variant payload `P`. Lets the consumer
|
|
6838
|
+
* encode the diff however they want (file map, patch string, branch
|
|
6839
|
+
* ref, snapshot id) without agent-eval taking a stance.
|
|
6840
|
+
*/
|
|
6841
|
+
toVariantPayload(outcome: CodeMutationOutcome, parent: PromptVariant<P>): P;
|
|
6842
|
+
/** Optional telemetry sinks. */
|
|
6843
|
+
mutationTelemetry?: MutationTelemetry;
|
|
6844
|
+
costLedger?: CostLedger;
|
|
6845
|
+
lineage?: LineageRecorder<P>;
|
|
6846
|
+
/** Override id generation. Default: `${parent.id}.g${generation}.code.${i}`. */
|
|
6847
|
+
childIdFor?(parent: PromptVariant<P>, generation: number, index: number): string;
|
|
6848
|
+
/** Default label for the variant (visible in reports). */
|
|
6849
|
+
labelFor?(outcome: CodeMutationOutcome, parent: PromptVariant<P>, generation: number, index: number): string;
|
|
6850
|
+
}
|
|
6851
|
+
declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOpts<T, P>): MutateAdapter<P>;
|
|
6852
|
+
|
|
6405
6853
|
/**
|
|
6406
6854
|
* GoldenMatcher — fuzzy matcher for "did the agent produce the expected things?".
|
|
6407
6855
|
*
|
|
@@ -6672,4 +7120,4 @@ interface ReflectionProposal {
|
|
|
6672
7120
|
*/
|
|
6673
7121
|
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
6674
7122
|
|
|
6675
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, 
DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, 
HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type 
ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type 
ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialCache, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, 
extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, 
scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
7123
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type 
D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type 
HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, OTEL_AGENT_EVAL_SCOPE, type Objective, type 
OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type 
ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type 
StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, 
createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, 
referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|