@tangle-network/agent-eval 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -0
- package/dist/index.d.ts +449 -1
- package/dist/index.js +633 -8
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.js
CHANGED
|
@@ -9538,6 +9538,49 @@ function extractErrorCount(text, opts = {}) {
|
|
|
9538
9538
|
// src/reference-replay.ts
|
|
9539
9539
|
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
9540
9540
|
import { dirname as dirname2 } from "path";
|
|
9541
|
+
|
|
9542
|
+
// src/concurrency.ts
|
|
9543
|
+
/**
 * Minimal FIFO async mutex for serialising critical sections inside one
 * JavaScript realm.
 *
 * `acquire()` resolves with a release function; `runExclusive(fn)` wraps
 * acquire/release around `fn` and always releases, even when `fn` throws.
 */
var Mutex = class {
  /** True while some caller holds the lock. */
  locked = false;
  /** Callbacks of callers waiting for the lock, oldest first. */
  waiters = [];
  /**
   * Wait for the lock.
   *
   * @returns {Promise<() => void>} Resolves with a release function once the
   *   lock is held. The release function is idempotent: calling it more than
   *   once is a no-op, so a double release can never hand the lock to two
   *   waiters at the same time.
   */
  async acquire() {
    // Each acquisition gets its own one-shot releaser.
    const makeReleaser = () => {
      let released = false;
      return () => {
        if (released) return;
        released = true;
        this.release();
      };
    };
    if (!this.locked) {
      this.locked = true;
      return makeReleaser();
    }
    return new Promise((resolve) => {
      this.waiters.push(() => {
        resolve(makeReleaser());
      });
    });
  }
  /**
   * Hand the lock to the oldest waiter, or mark it free when nobody waits.
   * Prefer the function returned by `acquire()`; this low-level method does
   * not guard against being called by a non-holder.
   */
  release() {
    const next = this.waiters.shift();
    if (next) {
      // Ownership passes directly to the waiter, so `locked` stays true.
      next();
    } else {
      this.locked = false;
    }
  }
  /**
   * Run `fn` while holding the lock.
   *
   * @param {() => any} fn Sync or async callback.
   * @returns {Promise<any>} Whatever `fn` returns (awaited); rejects with
   *   whatever `fn` throws. The lock is released in both cases.
   */
  async runExclusive(fn) {
    const release = await this.acquire();
    try {
      return await fn();
    } finally {
      release();
    }
  }
  /** True iff someone holds the lock right now. Diagnostics only. */
  get isLocked() {
    return this.locked;
  }
  /** Pending waiter count. Diagnostics only. */
  get pending() {
    return this.waiters.length;
  }
};
|
|
9582
|
+
|
|
9583
|
+
// src/reference-replay.ts
|
|
9541
9584
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
9542
9585
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
9543
9586
|
async function runReferenceReplay(cases, options) {
|
|
@@ -9638,15 +9681,29 @@ function inMemoryReferenceReplayStore(initial = []) {
|
|
|
9638
9681
|
}
|
|
9639
9682
|
};
|
|
9640
9683
|
}
|
|
9684
|
+
// One Mutex per store path, created on first use and kept for the module lifetime.
var jsonlStoreLocks = /* @__PURE__ */ new Map();
/**
 * Return the Mutex registered for `path`, creating and registering it on
 * the first request. Paths are compared as given (no normalisation).
 */
function getJsonlStoreLock(path) {
  const existing = jsonlStoreLocks.get(path);
  if (existing) return existing;
  const created = new Mutex();
  jsonlStoreLocks.set(path, created);
  return created;
}
|
|
9641
9693
|
/**
 * Reference-replay store backed by a JSONL file at `path`.
 *
 * Both operations run under the lock registered for `path`, so saves and
 * lists issued through this module do not interleave.
 *
 * @param {string} path File the runs are appended to, one JSON document per line.
 * @returns {{ save(run: any): Promise<void>, list(): Promise<any[]> }}
 */
function jsonlReferenceReplayStore(path) {
  const fileLock = getJsonlStoreLock(path);
  // Append one serialised run, creating the parent directory when needed.
  const appendRun = (run) => {
    mkdirSync2(dirname2(path), { recursive: true });
    appendFileSync2(path, JSON.stringify(run) + "\n");
  };
  // A store whose file does not exist yet is simply empty.
  const readRuns = () => existsSync4(path) ? readJsonl(path) : [];
  return {
    async save(run) {
      await fileLock.runExclusive(() => appendRun(run));
    },
    async list() {
      return fileLock.runExclusive(readRuns);
    }
  };
}
|
|
@@ -10195,13 +10252,14 @@ function aggregateTrials(population, scenarioIds, trials) {
|
|
|
10195
10252
|
const scenarios = scenarioIds.map((sid) => {
|
|
10196
10253
|
const scenarioTrials = variantTrials.filter((t) => t.scenarioId === sid);
|
|
10197
10254
|
const okTrials = scenarioTrials.filter((t) => t.ok);
|
|
10198
|
-
const
|
|
10255
|
+
const gradedTrials = scenarioTrials.filter((t) => !t.error);
|
|
10256
|
+
const metrics = aggregateMetrics(gradedTrials.map((t) => t.metrics ?? {}));
|
|
10199
10257
|
return {
|
|
10200
10258
|
variantId: variant.id,
|
|
10201
10259
|
scenarioId: sid,
|
|
10202
|
-
meanScore: mean5(
|
|
10203
|
-
meanCost: mean5(
|
|
10204
|
-
meanDurationMs: mean5(
|
|
10260
|
+
meanScore: mean5(gradedTrials.map((t) => t.score)),
|
|
10261
|
+
meanCost: mean5(gradedTrials.map((t) => t.cost ?? 0)),
|
|
10262
|
+
meanDurationMs: mean5(gradedTrials.map((t) => t.durationMs ?? 0)),
|
|
10205
10263
|
okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
|
|
10206
10264
|
trials: scenarioTrials.length,
|
|
10207
10265
|
metrics
|
|
@@ -10272,6 +10330,562 @@ function samePopulation(a, b) {
|
|
|
10272
10330
|
return b.every((id) => setA.has(id));
|
|
10273
10331
|
}
|
|
10274
10332
|
|
|
10333
|
+
// src/jsonl-trial-cache.ts
|
|
10334
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
10335
|
+
import { dirname as dirname4 } from "path";
|
|
10336
|
+
|
|
10337
|
+
// src/locked-jsonl-appender.ts
|
|
10338
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
|
|
10339
|
+
import { dirname as dirname3 } from "path";
|
|
10340
|
+
// Registry of per-path mutexes shared by every appender created in this module.
var mutexes = /* @__PURE__ */ new Map();
/**
 * Look up the Mutex for `path`, creating and registering one when the path
 * has not been seen before. Paths are compared as given.
 */
function getMutex(path) {
  const known = mutexes.get(path);
  if (known) return known;
  const fresh = new Mutex();
  mutexes.set(path, fresh);
  return fresh;
}
|
|
10349
|
+
/**
 * Appends JSON lines to a file while holding the mutex registered for that
 * path, so appenders built for the same path string serialise their writes.
 */
var LockedJsonlAppender = class {
  /** Target file. */
  path;
  /** Mutex obtained from `getMutex(path)`. */
  mutex;
  /**
   * @param {string} path File to append to; its parent directory is created
   *   when it does not exist yet.
   */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parentDir = dirname3(path);
    if (!existsSync5(parentDir)) {
      mkdirSync3(parentDir, { recursive: true });
    }
  }
  /**
   * Serialise `entry` and append it as a single line.
   * Serialisation happens before the lock is taken; only the write is exclusive.
   */
  async append(entry) {
    const serialized = `${JSON.stringify(entry)}\n`;
    const writeLine = () => {
      appendFileSync3(this.path, serialized);
    };
    await this.mutex.runExclusive(writeLine);
  }
};
|
|
10367
|
+
/**
 * Test helper: empty the per-path mutex registry so later appenders start
 * with fresh locks. Appenders that already exist keep the mutex they hold.
 */
function resetLockedAppendersForTesting() {
  const registry = mutexes;
  registry.clear();
}
|
|
10370
|
+
|
|
10371
|
+
// src/jsonl-trial-cache.ts
|
|
10372
|
+
/**
 * Trial cache held in memory and mirrored to a JSONL file.
 *
 * On construction every parseable line of the existing file is replayed into
 * the map (later lines win). `set` updates the map synchronously and appends
 * to the file in the background; `setSync` appends before returning.
 */
var JsonlTrialCache = class {
  /** key -> cached result. */
  map = /* @__PURE__ */ new Map();
  /** Backing JSONL file. */
  path;
  /** Locked appender used by `set`. */
  appender;
  /**
   * @param {string} path Backing file. Its parent directory is created when
   *   the file does not exist yet.
   */
  constructor(path) {
    this.path = path;
    if (existsSync6(path)) {
      for (const line of readFileSync5(path, "utf-8").split("\n")) {
        if (!line.trim()) continue;
        let entry;
        try {
          entry = JSON.parse(line);
        } catch {
          // Best effort: a torn or corrupt line must not prevent loading the rest.
          continue;
        }
        // Valid JSON that is not a cache record (e.g. `5`, `null`, `{}`) is
        // skipped instead of being stored under the key `undefined`.
        if (entry === null || typeof entry !== "object" || entry.key === void 0) continue;
        this.map.set(entry.key, entry.result);
      }
    } else {
      mkdirSync4(dirname4(path), { recursive: true });
    }
    this.appender = new LockedJsonlAppender(path);
  }
  /** @returns the cached result for `key`, or `undefined` when absent. */
  get(key) {
    return this.map.get(key);
  }
  /**
   * Cache `value` under `key`. The in-memory map is updated immediately; the
   * disk append runs in the background. A failed append is logged rather than
   * left as a floating rejection, so the entry stays available in memory.
   */
  set(key, value) {
    this.map.set(key, value);
    const line = { key, result: value, writtenAt: Date.now() };
    this.appender.append(line).catch((err) => {
      console.warn(`[jsonl-trial-cache] failed to persist entry to ${this.path}:`, err);
    });
  }
  /** Number of entries currently held in memory. */
  size() {
    return this.map.size;
  }
  /**
   * Synchronous fallback path for tests / CLI tools that want to be sure
   * the line is on disk before returning. Bypasses the mutex (single-
   * threaded callers only).
   */
  setSync(key, value) {
    this.map.set(key, value);
    const line = { key, result: value, writtenAt: Date.now() };
    appendFileSync4(this.path, `${JSON.stringify(line)}\n`);
  }
};
|
|
10415
|
+
|
|
10416
|
+
// src/evolution-telemetry.ts
|
|
10417
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
10418
|
+
import { dirname as dirname5 } from "path";
|
|
10419
|
+
/**
 * Append-only JSONL log of mutation attempts. Every record goes through a
 * `LockedJsonlAppender` bound to the given path.
 */
var MutationTelemetry = class {
  /** Appender bound to the telemetry file. */
  appender;
  /** @param {string} path Telemetry file to append to. */
  constructor(path) {
    this.appender = new LockedJsonlAppender(path);
  }
  /** Append one attempt record; resolves once the appender has written it. */
  async record(attempt) {
    const { appender } = this;
    await appender.append(attempt);
  }
};
|
|
10428
|
+
/**
 * Append-only JSONL log of trial records. Every record goes through a
 * `LockedJsonlAppender` bound to the given path.
 */
var TrialTelemetry = class {
  /** Appender bound to the telemetry file. */
  appender;
  /** @param {string} path Telemetry file to append to. */
  constructor(path) {
    this.appender = new LockedJsonlAppender(path);
  }
  /** Append one record; resolves once the appender has written it. */
  async record(attempt) {
    const { appender } = this;
    await appender.append(attempt);
  }
};
|
|
10437
|
+
/**
 * Records variant lineage as an append-only JSONL event log at `path`, with
 * an optional consolidated snapshot at `<path>.snapshot`.
 *
 * State is rebuilt on construction from the snapshot (if any), then the event
 * log is replayed on top of it. A legacy log holding one JSON array is also
 * understood, and is converted to JSONL on the first `upsert`.
 */
var LineageRecorder = class {
  /** Event-log file (JSONL). */
  path;
  /** Consolidated snapshot file written by `compact()`. */
  snapshotPath;
  /** Serialises `upsert` and `compact` on this instance. */
  mutex = new Mutex();
  /** id -> merged node. */
  nodes = /* @__PURE__ */ new Map();
  /** Maps a variant to its lineage kind. */
  kindOf;
  /**
   * @param {string} path Event-log file; its parent directory is created.
   * @param {(variant: any) => string} [kindOf] Kind classifier; defaults to
   *   `defaultKindOf`.
   */
  constructor(path, kindOf) {
    this.path = path;
    this.snapshotPath = `${path}.snapshot`;
    this.kindOf = kindOf ?? defaultKindOf;
    mkdirSync5(dirname5(path), { recursive: true });
    // Only objects carrying an id are lineage nodes. Anything else that
    // happens to be valid JSON (numbers, arrays, null) is ignored.
    const isNode = (v) => v !== null && typeof v === "object" && !Array.isArray(v) && v.id != null;
    if (existsSync7(this.snapshotPath)) {
      try {
        const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
        for (const n of parsed) {
          if (isNode(n)) this.nodes.set(n.id, n);
        }
      } catch {
        // Best effort: an unreadable snapshot just means replaying the log alone.
      }
    }
    if (existsSync7(path)) {
      let raw = "";
      try {
        raw = readFileSync6(path, "utf-8");
      } catch {
        // Best effort: an unreadable log leaves whatever the snapshot provided.
      }
      for (const line of raw.split("\n")) {
        if (!line.trim()) continue;
        let entry;
        try {
          entry = JSON.parse(line);
        } catch {
          continue; // torn / non-JSON line
        }
        // A single-line legacy array parses fine here but is not a node; it
        // must fall through to the legacy branch below instead of being
        // stored under the key `undefined`.
        if (!isNode(entry)) continue;
        this.nodes.set(entry.id, { ...this.nodes.get(entry.id), ...entry });
      }
      // Legacy format: the whole file is one JSON array of nodes.
      const trimmed = raw.trim();
      if (this.nodes.size === 0 && trimmed.startsWith("[")) {
        try {
          for (const n of JSON.parse(trimmed)) {
            if (isNode(n)) this.nodes.set(n.id, n);
          }
        } catch {
          // Best effort: a corrupt legacy file yields an empty recorder.
        }
      }
    }
  }
  /**
   * Merge `node` into the node stored under the same id and persist the
   * merged record.
   *
   * When the file on disk is still a legacy JSON array, it is rewritten as
   * JSONL containing every node currently in memory (including this one), so
   * the legacy nodes are kept on disk rather than dropped.
   */
  async upsert(node) {
    await this.mutex.runExclusive(() => {
      const merged = { ...this.nodes.get(node.id), ...node };
      this.nodes.set(node.id, merged);
      let legacyArrayOnDisk = false;
      try {
        if (existsSync7(this.path)) {
          const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
          legacyArrayOnDisk = head === "[";
        }
      } catch {
        // Best effort: if the probe fails, fall through to a plain append.
      }
      if (legacyArrayOnDisk) {
        const lines = [...this.nodes.values()].map((n) => `${JSON.stringify(n)}\n`).join("");
        writeFileSync(this.path, lines);
        return;
      }
      appendFileSync5(this.path, `${JSON.stringify(merged)}\n`);
    });
  }
  /**
   * Upsert the lineage node derived from `variant`.
   *
   * @param {object} variant Reads `id`, `parentId`, `generation`, `rationale`
   *   and `payload`.
   * @param {{ omitPayload?: boolean }} [opts] `omitPayload` leaves the payload
   *   out of the stored node.
   */
  async upsertVariant(variant, opts = {}) {
    await this.upsert({
      id: variant.id,
      parentId: variant.parentId ?? null,
      generation: variant.generation,
      kind: this.kindOf(variant),
      ...variant.rationale ? { rationale: variant.rationale } : {},
      ...opts.omitPayload || variant.payload === void 0 ? {} : { payload: variant.payload }
    });
  }
  /** @returns a new array holding every node currently in memory. */
  snapshot() {
    return [...this.nodes.values()];
  }
  /**
   * Write the current consolidated state to `<path>.snapshot` so external
   * tools can read it without replaying the event log. Idempotent.
   */
  async compact() {
    await this.mutex.runExclusive(() => {
      writeFileSync(this.snapshotPath, JSON.stringify([...this.nodes.values()], null, 2));
    });
  }
};
|
|
10520
|
+
/**
 * Default lineage-kind classifier.
 *
 * - no parent (`parentId` undefined or null)   -> "seed"
 * - object payload with a truthy `codeMutation` -> "code"
 * - anything else                               -> "prompt"
 *
 * `null` counts as "no parent" because `upsertVariant` stores a missing
 * parent as `null`, so a variant read back from lineage data stays a seed.
 */
function defaultKindOf(variant) {
  if (variant.parentId == null) return "seed";
  const payload = variant.payload;
  if (payload && typeof payload === "object" && payload.codeMutation) return "code";
  return "prompt";
}
|
|
10526
|
+
/**
 * Build a fresh per-generation cost bucket with every counter at zero.
 * A new object is returned on each call so buckets never share state.
 */
function emptyGenBucket() {
  const counters = [
    "mutatorPromptUsd",
    "mutatorCodeUsd",
    "scorerPromptUsd",
    "scorerCodeUsd",
    "trialsCounted",
    "cachedTrials"
  ];
  const bucket = {};
  for (const name of counters) {
    bucket[name] = 0;
  }
  return bucket;
}
|
|
10536
|
+
/**
 * Running cost totals (overall and per generation), rewritten to a JSON file
 * after every update. Updates on one instance are serialised by its mutex.
 */
var CostLedger = class {
  /** Accumulated totals; `byGeneration` maps String(generation) -> bucket. */
  totals = {
    mutatorPromptUsd: 0,
    mutatorCodeUsd: 0,
    scorerPromptUsd: 0,
    scorerCodeUsd: 0,
    trialsCounted: 0,
    cachedTrials: 0,
    poolBusyMs: 0,
    poolUtilizationPct: 0,
    byGeneration: {}
  };
  /** JSON file the totals are persisted to. */
  path;
  mutex = new Mutex();
  /**
   * @param {string} path Ledger file. When it exists, finite numeric totals
   *   and an object-valued `byGeneration` are restored from it; otherwise its
   *   parent directory is created.
   */
  constructor(path) {
    this.path = path;
    if (!existsSync7(path)) {
      mkdirSync5(dirname5(path), { recursive: true });
      return;
    }
    try {
      const loaded = JSON.parse(readFileSync6(path, "utf-8"));
      for (const field of Object.keys(this.totals)) {
        const stored = loaded[field];
        if (field === "byGeneration") {
          if (stored && typeof stored === "object") {
            this.totals.byGeneration = stored;
          }
        } else if (typeof stored === "number" && Number.isFinite(stored)) {
          this.totals[field] = stored;
        }
      }
    } catch {
      // Unreadable or corrupt ledger: keep whatever was restored so far.
    }
  }
  /**
   * Bucket for `generation`, created on demand.
   * @returns the bucket, or `null` when `generation` is undefined.
   */
  genBucket(generation) {
    if (generation === void 0) return null;
    const buckets = this.totals.byGeneration;
    const key = String(generation);
    if (!buckets[key]) {
      buckets[key] = emptyGenBucket();
    }
    return buckets[key];
  }
  /**
   * Add mutator spend.
   * @param {string} channel "prompt" books to the prompt counter; any other
   *   value books to the code counter.
   * @param {number} usd Amount to add.
   * @param {{ generation?: number }} [opts] Also books to that generation's bucket.
   */
  async addMutation(channel, usd, opts = {}) {
    await this.mutex.runExclusive(() => {
      const field = channel === "prompt" ? "mutatorPromptUsd" : "mutatorCodeUsd";
      const bucket = this.genBucket(opts.generation);
      this.totals[field] += usd;
      if (bucket) bucket[field] += usd;
      this.persist();
    });
  }
  /**
   * Count one trial. A cached trial bumps the cached counter and adds no
   * spend; an uncached one adds `usd` to the scorer counter for `channel`.
   */
  async addTrial(channel, usd, cached, opts = {}) {
    await this.mutex.runExclusive(() => {
      const field = channel === "prompt" ? "scorerPromptUsd" : "scorerCodeUsd";
      const bucket = this.genBucket(opts.generation);
      const targets = bucket ? [this.totals, bucket] : [this.totals];
      for (const target of targets) {
        if (cached) {
          target.cachedTrials++;
        } else {
          target[field] += usd;
        }
        target.trialsCounted++;
      }
      this.persist();
    });
  }
  /** Overwrite pool busy time and the derived utilisation percentage (0 when `totalMs` is not positive). */
  async setPoolUtilization(busyMs, totalMs) {
    await this.mutex.runExclusive(() => {
      const pct = totalMs > 0 ? 100 * busyMs / totalMs : 0;
      this.totals.poolBusyMs = busyMs;
      this.totals.poolUtilizationPct = pct;
      this.persist();
    });
  }
  /**
   * @returns a plain object with the scalar totals, their sum `totalUsd`, and
   *   `byGeneration` as an array sorted by ascending generation number.
   */
  snapshot() {
    const { byGeneration: buckets, ...scalars } = this.totals;
    const totalUsd = scalars.mutatorPromptUsd + scalars.mutatorCodeUsd + scalars.scorerPromptUsd + scalars.scorerCodeUsd;
    const byGeneration = Object.entries(buckets).map(([g, b]) => ({ generation: Number(g), ...b }));
    byGeneration.sort((a, b) => a.generation - b.generation);
    return { totalUsd, ...scalars, byGeneration };
  }
  /** Rewrite the ledger file with the current totals (pretty-printed JSON). */
  persist() {
    writeFileSync(this.path, JSON.stringify(this.totals, null, 2));
  }
};
|
|
10646
|
+
|
|
10647
|
+
// src/composite-mutator.ts
|
|
10648
|
+
/**
 * Build a mutator that routes each `mutate` call to a primary mutator, a
 * secondary mutator, or a split between both, according to `opts.policy`.
 *
 * Policies:
 * - "primary-only"   always primary.
 * - "secondary-only" always secondary.
 * - "alternate"      odd generations -> secondary, even -> primary.
 * - "plateau"        primary until the last `plateauPatience` score deltas
 *                    are all below `plateauThreshold`, then split.
 * Every policy falls back to primary when no secondary mutator is given, and
 * so does an unknown or missing policy (it previously crashed with a
 * TypeError while destructuring the decision).
 *
 * @param {object} opts
 * @param {{ mutate(args: any): Promise<any[]> }} opts.primary
 * @param {{ mutate(args: any): Promise<any[]> }} [opts.secondary]
 * @param {string} [opts.policy]
 * @param {number} [opts.plateauThreshold=0.02]
 * @param {number} [opts.plateauPatience=2]
 * @param {(d: { generation: number, chose: string, reason: string }) => void} [opts.onPolicyDecision]
 *   Called once per `mutate` with the routing decision.
 * @returns {{ mutate(args: any): Promise<any[]> }}
 */
function createCompositeMutator(opts) {
  // Parent mean scores in call order, shared by every call on this mutator.
  const recentScores = [];
  const plateauThreshold = opts.plateauThreshold ?? 0.02;
  const plateauPatience = opts.plateauPatience ?? 2;
  function pickMode(args) {
    recentScores.push(args.parentAggregate.meanScore);
    switch (opts.policy) {
      case "primary-only":
        return { mode: "primary", reason: "policy=primary-only" };
      case "secondary-only":
        if (!opts.secondary) return { mode: "primary", reason: "secondary-only requested but no secondary mutator wired" };
        return { mode: "secondary", reason: "policy=secondary-only" };
      case "alternate":
        if (!opts.secondary) return { mode: "primary", reason: "alternate requested but no secondary mutator wired" };
        return args.generation % 2 === 1 ? { mode: "secondary", reason: `alternate: gen${args.generation} odd \u2192 secondary` } : { mode: "primary", reason: `alternate: gen${args.generation} even \u2192 primary` };
      case "plateau": {
        if (!opts.secondary) return { mode: "primary", reason: "plateau requested but no secondary mutator wired" };
        if (recentScores.length <= plateauPatience) {
          return { mode: "primary", reason: "plateau: warming up with primary mutations" };
        }
        // Compare the last `plateauPatience` consecutive score changes.
        const window = recentScores.slice(-plateauPatience - 1);
        const deltas = window.slice(1).map((v, i) => v - window[i]);
        const stagnant = deltas.every((d) => d < plateauThreshold);
        if (stagnant) {
          return {
            mode: "split",
            reason: `plateau detected (${deltas.map((d) => d.toFixed(3)).join(", ")}) \u2192 split`
          };
        }
        return {
          mode: "primary",
          reason: `plateau: still improving (${deltas[deltas.length - 1].toFixed(3)})`
        };
      }
      default:
        return { mode: "primary", reason: `unknown policy ${String(opts.policy)}; defaulting to primary` };
    }
  }
  return {
    async mutate(args) {
      const { mode, reason } = pickMode(args);
      opts.onPolicyDecision?.({ generation: args.generation, chose: mode, reason });
      if (mode === "primary") return opts.primary.mutate(args);
      if (mode === "secondary" && opts.secondary) return opts.secondary.mutate(args);
      if (mode === "split" && opts.secondary) {
        // Secondary gets the larger half when childCount is odd.
        const secondaryShare = Math.ceil(args.childCount / 2);
        const primaryShare = args.childCount - secondaryShare;
        // A mutator with a zero share is not invoked at all, so no call is
        // made for zero children (e.g. childCount === 1).
        const run = (mutator, share) => share > 0 ? mutator.mutate({ ...args, childCount: share }) : [];
        const [primaryChildren, secondaryChildren] = await Promise.all([
          run(opts.primary, primaryShare),
          run(opts.secondary, secondaryShare)
        ]);
        return [...primaryChildren, ...secondaryChildren];
      }
      return opts.primary.mutate(args);
    }
  };
}
|
|
10703
|
+
|
|
10704
|
+
// src/sandbox-pool.ts
|
|
10705
|
+
/**
 * Create a bounded pool of lazily-minted sandbox slots.
 *
 * Slots are created on demand through `opts.factory.create(id)` up to
 * `opts.size`. Once the pool is full, further checkouts wait in FIFO order
 * for a release.
 *
 * Fixes over the previous implementation:
 * - Slots being created are counted against `size`, so concurrent checkouts
 *   can no longer mint more than `size` slots.
 * - A caller that has to wait is queued inside the same critical section
 *   that found the pool full, so a release in between cannot be missed.
 * - When minting fails, the oldest waiter retries acquisition instead of
 *   waiting for a release that may never come.
 * - `release` is idempotent per checkout.
 *
 * @param {object} opts
 * @param {number} opts.size Maximum number of slots (>= 1).
 * @param {{ create(id: string): Promise<any>, reset?(slot: any): Promise<void>, destroy(slot: any): Promise<void> }} opts.factory
 * @returns {{ checkout(): Promise<{ slot: any, release(): void }>, withSlot(fn: (slot: any) => any): Promise<any>, drain(): Promise<void>, poolSize(): number, activeCheckouts(): number, utilization(): { busyMs: number, totalMs: number, checkouts: number } }}
 * @throws {Error} When `opts.size` is below 1.
 */
function createSandboxPool(opts) {
  if (opts.size < 1) throw new Error(`sandbox pool size must be >= 1 (got ${opts.size})`);
  const slots = [];
  const waiters = [];
  const mutex = new Mutex();
  let nextSlotId = 0;
  // Slots whose factory.create() is still in flight; they already use capacity.
  let pendingMints = 0;
  let totalCheckouts = 0;
  let busyMs = 0;
  const startedAt = Date.now();
  async function acquireSlot() {
    const decision = await mutex.runExclusive(() => {
      const idle = slots.find((s) => !s.busy);
      if (idle) {
        idle.busy = true;
        return { state: idle };
      }
      if (slots.length + pendingMints < opts.size) {
        pendingMints++;
        return { mintId: `slot_${nextSlotId++}` };
      }
      // Pool is full: queue before leaving the critical section. The promise
      // is wrapped in an object so runExclusive does not await it, which
      // would hold the mutex until a slot is granted.
      return {
        queued: new Promise((resolve, reject) => {
          waiters.push({
            grant: (s) => {
              s.busy = true;
              resolve(s);
            },
            retry: () => {
              acquireSlot().then(resolve, reject);
            }
          });
        })
      };
    });
    if (decision.state) return decision.state;
    if (decision.queued) return decision.queued;
    let resource;
    try {
      resource = await opts.factory.create(decision.mintId);
    } catch (err) {
      // Give the reserved capacity back and let the oldest waiter try again.
      pendingMints--;
      const next = waiters.shift();
      if (next) next.retry();
      throw err;
    }
    const state = {
      slot: { id: decision.mintId, resource },
      busy: true
    };
    await mutex.runExclusive(() => {
      pendingMints--;
      slots.push(state);
    });
    return state;
  }
  function releaseSlot(state) {
    // Fire-and-forget: reset failures are logged and the slot is reused anyway.
    void (async () => {
      try {
        if (opts.factory.reset) await opts.factory.reset(state.slot);
      } catch (err) {
        console.warn(`[sandbox-pool] reset failed for slot ${state.slot.id}:`, err);
      }
      state.busy = false;
      const next = waiters.shift();
      if (next) next.grant(state);
    })();
  }
  async function checkout() {
    const state = await acquireSlot();
    const checkoutStart = Date.now();
    totalCheckouts++;
    let released = false;
    return {
      slot: state.slot,
      release: () => {
        // A second call is ignored, so busy time and hand-offs are counted once.
        if (released) return;
        released = true;
        busyMs += Date.now() - checkoutStart;
        releaseSlot(state);
      }
    };
  }
  async function withSlot(fn) {
    const { slot, release } = await checkout();
    try {
      return await fn(slot);
    } finally {
      release();
    }
  }
  /**
   * Remove every slot from the pool and destroy it through the factory.
   * Destroy failures are ignored. Queued waiters are dropped without being
   * settled, as before, so drain only once no checkout is pending.
   */
  async function drain() {
    const snapshot = await mutex.runExclusive(() => {
      const taken = slots.splice(0, slots.length);
      waiters.splice(0, waiters.length);
      return taken;
    });
    await Promise.allSettled(snapshot.map((s) => opts.factory.destroy(s.slot)));
  }
  function utilization() {
    return {
      busyMs,
      totalMs: Date.now() - startedAt,
      checkouts: totalCheckouts
    };
  }
  return {
    checkout,
    withSlot,
    drain,
    poolSize: () => slots.length,
    activeCheckouts: () => slots.filter((s) => s.busy).length,
    utilization
  };
}
|
|
10805
|
+
|
|
10806
|
+
// src/code-mutator.ts
|
|
10807
|
+
/**
 * Build a mutator that produces code variants by running `opts.runner` inside
 * a slot checked out from `opts.pool`.
 *
 * For every outcome the runner returns, in order: the attempt is recorded to
 * `opts.mutationTelemetry` (if given), its cost is booked to `opts.costLedger`
 * (if given and `costUsd` is defined), and, for successful outcomes, a
 * variant is built and upserted into `opts.lineage` (if given). A runner that
 * throws is reported as a single failed outcome with reason "runner_error".
 *
 * @param {object} opts
 * @param {{ withSlot(fn: Function): Promise<any>, utilization(): { busyMs: number, totalMs: number } }} opts.pool
 * @param {(input: object) => Promise<Iterable<object>>} opts.runner
 * @param {(outcome: object, parent: object) => any} opts.toVariantPayload
 * @param {(parent: object, generation: number, index: number) => string} [opts.childIdFor]
 * @param {(outcome: object, parent: object, generation: number, index: number) => string} [opts.labelFor]
 * @returns {{ mutate(args: object): Promise<object[]> }} `mutate` resolves with the successful variants only.
 */
function createSandboxCodeMutator(opts) {
  const defaultChildId = (parent, generation, index) => `${parent.id}.g${generation}.code.${index}`;
  const defaultLabel = (outcome, parent, _generation, index) => outcome.description?.slice(0, 80) ?? `${parent.label} \u2192 code.${index}`;
  const childIdFor = opts.childIdFor ?? defaultChildId;
  const labelFor = opts.labelFor ?? defaultLabel;
  // Run the user-supplied runner in `slot`; turn a thrown error into one failed outcome.
  async function runInSlot(slot, args, startedAt) {
    const { parent, parentAggregate, topTrials, bottomTrials, childCount, generation } = args;
    try {
      return await opts.runner({
        slot,
        parent,
        parentAggregate,
        topTrials,
        bottomTrials,
        childCount,
        generation
      });
    } catch (err) {
      const description = err instanceof Error ? err.message : String(err);
      return [{
        ok: false,
        failureReason: "runner_error",
        description,
        latencyMs: Date.now() - startedAt
      }];
    }
  }
  // Telemetry record for one outcome; failed outcomes carry a null childId.
  function telemetryRecord(outcome, childId, parent, generation) {
    return {
      ts: Date.now(),
      channel: "code",
      generation,
      parentId: parent.id,
      childId: outcome.ok ? childId : null,
      ok: outcome.ok,
      failureReason: outcome.failureReason,
      description: outcome.description,
      latencyMs: outcome.latencyMs,
      diffBytes: outcome.diffBytes,
      filesTouched: outcome.filesTouched,
      agentSteps: outcome.agentSteps,
      costUsd: outcome.costUsd
    };
  }
  async function mutate(args) {
    const { parent, generation } = args;
    const startedAt = Date.now();
    const outcomes = await opts.pool.withSlot((slot) => runInSlot(slot, args, startedAt));
    const variants = [];
    let index = -1;
    for (const outcome of outcomes) {
      index += 1;
      const childId = outcome.childId ?? childIdFor(parent, generation, index);
      if (opts.mutationTelemetry) {
        await opts.mutationTelemetry.record(telemetryRecord(outcome, childId, parent, generation));
      }
      if (opts.costLedger && outcome.costUsd !== void 0) {
        await opts.costLedger.addMutation("code", outcome.costUsd, { generation });
      }
      if (!outcome.ok) continue;
      const variant = {
        id: childId,
        payload: opts.toVariantPayload(outcome, parent),
        generation,
        parentId: parent.id,
        label: labelFor(outcome, parent, generation, index),
        ...outcome.rationale ? { rationale: outcome.rationale } : {}
      };
      variants.push(variant);
      if (opts.lineage) {
        await opts.lineage.upsert({
          id: variant.id,
          parentId: variant.parentId ?? null,
          generation: variant.generation,
          kind: "code",
          ...variant.rationale ? { rationale: variant.rationale } : {}
        });
      }
    }
    if (opts.costLedger) {
      const { busyMs, totalMs } = opts.pool.utilization();
      await opts.costLedger.setPoolUtilization(busyMs, totalMs);
    }
    return variants;
  }
  return { mutate };
}
|
|
10888
|
+
|
|
10275
10889
|
// src/golden-matcher.ts
|
|
10276
10890
|
function matchGoldens(goldens, candidates, options = {}) {
|
|
10277
10891
|
const extract = options.text ?? defaultExtract5;
|
|
@@ -10634,6 +11248,7 @@ export {
|
|
|
10634
11248
|
BudgetGuard,
|
|
10635
11249
|
BuilderSession,
|
|
10636
11250
|
ConvergenceTracker,
|
|
11251
|
+
CostLedger,
|
|
10637
11252
|
CostTracker,
|
|
10638
11253
|
D1ExperimentStore,
|
|
10639
11254
|
DEFAULT_AGENT_SLOS,
|
|
@@ -10664,12 +11279,17 @@ export {
|
|
|
10664
11279
|
InMemoryTraceStore,
|
|
10665
11280
|
InMemoryTrialCache,
|
|
10666
11281
|
InMemoryWorkspaceInspector,
|
|
11282
|
+
JsonlTrialCache,
|
|
10667
11283
|
JudgeRunner,
|
|
11284
|
+
LineageRecorder,
|
|
10668
11285
|
LlmCallError,
|
|
10669
11286
|
LlmClient,
|
|
11287
|
+
LockedJsonlAppender,
|
|
10670
11288
|
MODEL_PRICING,
|
|
10671
11289
|
MetricsCollector,
|
|
10672
11290
|
MultiLayerVerifier,
|
|
11291
|
+
MutationTelemetry,
|
|
11292
|
+
Mutex,
|
|
10673
11293
|
OTEL_AGENT_EVAL_SCOPE,
|
|
10674
11294
|
OptimizationLoop,
|
|
10675
11295
|
PairwiseSteeringOptimizer,
|
|
@@ -10687,6 +11307,7 @@ export {
|
|
|
10687
11307
|
TRACE_SCHEMA_VERSION,
|
|
10688
11308
|
TokenCounter,
|
|
10689
11309
|
TraceEmitter,
|
|
11310
|
+
TrialTelemetry,
|
|
10690
11311
|
UNIVERSAL_FINDERS,
|
|
10691
11312
|
adversarialJudge,
|
|
10692
11313
|
aggregateLlm,
|
|
@@ -10731,11 +11352,14 @@ export {
|
|
|
10731
11352
|
correlateLayers,
|
|
10732
11353
|
correlationStudy,
|
|
10733
11354
|
createAntiSlopJudge,
|
|
11355
|
+
createCompositeMutator,
|
|
10734
11356
|
createCustomJudge,
|
|
10735
11357
|
createDefaultReviewer,
|
|
10736
11358
|
createDomainExpertJudge,
|
|
10737
11359
|
createIntentMatchJudge,
|
|
10738
11360
|
createLlmReviewer,
|
|
11361
|
+
createSandboxCodeMutator,
|
|
11362
|
+
createSandboxPool,
|
|
10739
11363
|
createSemanticConceptJudge,
|
|
10740
11364
|
crossTraceDiff,
|
|
10741
11365
|
crowdingDistance,
|
|
@@ -10846,6 +11470,7 @@ export {
|
|
|
10846
11470
|
replayScorerOverCorpus,
|
|
10847
11471
|
replayTraceThroughJudge,
|
|
10848
11472
|
requiredSampleSize,
|
|
11473
|
+
resetLockedAppendersForTesting,
|
|
10849
11474
|
resumeBuilderSession,
|
|
10850
11475
|
rowCount,
|
|
10851
11476
|
rowWhere,
|