@tangle-network/agent-eval 0.17.2 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -16
- package/dist/index.d.ts +271 -75
- package/dist/index.js +393 -16
- package/dist/index.js.map +1 -1
- package/docs/concepts.md +155 -0
- package/docs/control-runtime.md +351 -0
- package/docs/feature-guide.md +213 -0
- package/docs/feedback-trajectories.md +193 -0
- package/docs/multi-shot-optimization.md +122 -0
- package/docs/wire-protocol.md +199 -0
- package/package.json +21 -14
package/dist/index.js
CHANGED
|
@@ -2252,8 +2252,6 @@ async function finish(emitter, result) {
|
|
|
2252
2252
|
}
|
|
2253
2253
|
|
|
2254
2254
|
// src/feedback-trajectory.ts
|
|
2255
|
-
import { appendFile, mkdir, readFile } from "fs/promises";
|
|
2256
|
-
import { join } from "path";
|
|
2257
2255
|
var DEFAULT_SPLIT_POLICY = {
|
|
2258
2256
|
trainPct: 70,
|
|
2259
2257
|
devPct: 15,
|
|
@@ -2330,12 +2328,16 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
2330
2328
|
return next;
|
|
2331
2329
|
}
|
|
2332
2330
|
async append(record) {
|
|
2331
|
+
const { appendFile, mkdir } = await import("fs/promises");
|
|
2332
|
+
const { join: join3 } = await import("path");
|
|
2333
2333
|
await mkdir(this.dir, { recursive: true });
|
|
2334
|
-
await appendFile(
|
|
2334
|
+
await appendFile(join3(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
|
|
2335
2335
|
}
|
|
2336
2336
|
async load() {
|
|
2337
2337
|
if (this.loaded) return;
|
|
2338
|
-
const
|
|
2338
|
+
const { readFile } = await import("fs/promises");
|
|
2339
|
+
const { join: join3 } = await import("path");
|
|
2340
|
+
const file = join3(this.dir, "feedback-trajectories.ndjson");
|
|
2339
2341
|
try {
|
|
2340
2342
|
const raw = await readFile(file, "utf8");
|
|
2341
2343
|
for (const line of raw.split("\n")) {
|
|
@@ -2422,6 +2424,44 @@ function feedbackTrajectoryToOptimizerRow(trajectory) {
|
|
|
2422
2424
|
/**
 * Converts a list of feedback trajectories into optimizer rows.
 *
 * Every element is handed to `feedbackTrajectoryToOptimizerRow` through
 * `Array.prototype.map`, so the result holds one row per input trajectory in
 * the original order.
 *
 * @param {Array} trajectories - Trajectories to convert.
 * @returns {Array} The value `feedbackTrajectoryToOptimizerRow` produced for each trajectory.
 */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  const toRow = feedbackTrajectoryToOptimizerRow;
  return trajectories.map(toRow);
}
|
|
2427
|
+
/**
 * Replays a single feedback trajectory through an adapter.
 *
 * On success the adapter's result is returned with `trajectoryId` added.
 * Adapter fields are spread after `trajectoryId`, so a `trajectoryId` key in
 * the adapter result takes precedence.
 *
 * Anything thrown while replaying is converted into a failing result instead
 * of propagating: `pass: false`, one `system` / `reject` label carrying the
 * error message, a zero-score outcome, and `metadata.replayError = true`.
 * The label's `createdAt` and the outcome's `observedAt` share one timestamp.
 *
 * @param {{ id: string }} trajectory - Trajectory to replay; `id` is echoed as `trajectoryId`.
 * @param {{ replay: Function }} adapter2 - Object whose `replay(trajectory)` result is awaited.
 * @returns {Promise<object>} The replay result, or the synthesized failure record.
 */
async function replayFeedbackTrajectory(trajectory, adapter2) {
  try {
    const replayed = await adapter2.replay(trajectory);
    return { trajectoryId: trajectory.id, ...replayed };
  } catch (failure) {
    const failedAt = (/* @__PURE__ */ new Date()).toISOString();
    const reason = failure instanceof Error ? failure.message : String(failure);
    const rejection = {
      source: "system",
      kind: "reject",
      value: false,
      reason,
      severity: "error",
      createdAt: failedAt
    };
    const outcome = {
      success: false,
      score: 0,
      detail: reason,
      observedAt: failedAt
    };
    return {
      trajectoryId: trajectory.id,
      pass: false,
      labels: [rejection],
      outcome,
      metadata: { replayError: true }
    };
  }
}
|
|
2458
|
+
/**
 * Replays several trajectories one after another.
 *
 * Each replay is awaited before the next one starts, so the adapter never
 * sees concurrent calls from this function, and results come back in input
 * order.
 *
 * @param {Iterable<object>} trajectories - Trajectories to replay.
 * @param {object} adapter2 - Adapter forwarded unchanged to `replayFeedbackTrajectory`.
 * @returns {Promise<object[]>} One replay result per trajectory.
 */
async function replayFeedbackTrajectories(trajectories, adapter2) {
  const replayed = [];
  for (const item of trajectories) {
    const outcome = await replayFeedbackTrajectory(item, adapter2);
    replayed.push(outcome);
  }
  return replayed;
}
|
|
2425
2465
|
function summarizePreferenceMemory(trajectories, options = {}) {
|
|
2426
2466
|
const maxEntries = options.maxEntries ?? 20;
|
|
2427
2467
|
const entries = [];
|
|
@@ -2585,6 +2625,69 @@ function canonicalize(value) {
|
|
|
2585
2625
|
return out;
|
|
2586
2626
|
}
|
|
2587
2627
|
|
|
2628
|
+
// src/action-policy.ts
|
|
2629
|
+
/**
 * Evaluates a proposed action against a policy and reports whether it is
 * allowed, blocked, or needs approval.
 *
 * Checks run in a fixed order and every triggered rule appends a reason:
 * allow-list, block-list, always-approve types, external side effects,
 * cost validity, approval cost threshold, per-action cost cap, remaining
 * budget, required `metadata.expectedOutcome`, required
 * `metadata.killCriteria`, and finally an informational note when an
 * auto-approve type still needs approval.
 *
 * A missing `costUsd` is treated as 0. A `costUsd` that is not a number
 * (for example `NaN`) fails closed: it blocks the action when a cost cap or
 * remaining budget is configured, and otherwise requires approval when only
 * an approval threshold is configured. Without this, every `>` comparison
 * against NaN is false and the action would slip past all cost limits.
 *
 * @param {object} action - Action under evaluation (`type`, optional `costUsd`,
 *   `requiresApproval`, `externalSideEffect`, `metadata`).
 * @param {object} [policy={}] - Policy rules; absent rules are not applied.
 * @param {{ createdAt?: string }} [options={}] - `createdAt` overrides the label timestamp.
 * @returns {{ allowed: boolean, blocked: boolean, requiresApproval: boolean,
 *   reasons: string[], label: (object|undefined) }} `requiresApproval` is only
 *   true when the action is not blocked; `label` is set only when the action
 *   is blocked or needs approval.
 */
function evaluateActionPolicy(action, policy = {}, options = {}) {
  const reasons = [];
  let blocked = false;
  let requiresApproval = Boolean(action.requiresApproval);
  const block = (reason) => {
    blocked = true;
    reasons.push(reason);
  };
  const escalate = (reason) => {
    requiresApproval = true;
    reasons.push(reason);
  };

  if (policy.allowedTypes?.length && !policy.allowedTypes.includes(action.type)) {
    block(`action type "${action.type}" is not allowed`);
  }
  if (policy.blockedTypes?.includes(action.type)) {
    block(`action type "${action.type}" is blocked`);
  }
  if (policy.alwaysRequireApprovalTypes?.includes(action.type)) {
    escalate(`action type "${action.type}" requires approval`);
  }
  if (policy.requireApprovalForExternalSideEffects && action.externalSideEffect) {
    escalate("external side effect requires approval");
  }

  const cost = action.costUsd ?? 0;
  const hasHardCostLimit = policy.maxActionCostUsd !== void 0 || policy.remainingBudgetUsd !== void 0;
  const hasApprovalThreshold = policy.requireApprovalAboveCostUsd !== void 0;
  if (Number.isNaN(Number(cost))) {
    // Fail closed: a cost that cannot be compared must not bypass cost rules.
    if (hasHardCostLimit) {
      block(`cost ${action.costUsd} is not a valid number`);
    } else if (hasApprovalThreshold) {
      escalate(`cost ${action.costUsd} is not a valid number`);
    }
  }
  if (hasApprovalThreshold && cost > policy.requireApprovalAboveCostUsd) {
    escalate(`cost ${action.costUsd} exceeds approval threshold ${policy.requireApprovalAboveCostUsd}`);
  }
  if (policy.maxActionCostUsd !== void 0 && cost > policy.maxActionCostUsd) {
    block(`cost ${action.costUsd} exceeds max action cost ${policy.maxActionCostUsd}`);
  }
  if (policy.remainingBudgetUsd !== void 0 && cost > policy.remainingBudgetUsd) {
    block(`cost ${action.costUsd} exceeds remaining budget ${policy.remainingBudgetUsd}`);
  }

  if (policy.expectedOutcomeRequired && !action.metadata?.expectedOutcome) {
    block("expected outcome is required");
  }
  if (policy.killCriteriaRequired && !action.metadata?.killCriteria) {
    block("kill criteria are required");
  }
  // Informational only: auto-approval never overrides an approval requirement.
  if (policy.autoApproveTypes?.includes(action.type) && requiresApproval) {
    reasons.push(`action type "${action.type}" is auto-approved only when no approval policy applies`);
  }
  if (!reasons.length) reasons.push(requiresApproval ? "approval required" : "action allowed");

  const label = blocked || requiresApproval ? {
    source: "policy",
    kind: blocked ? "policy_block" : "comment",
    value: { actionType: action.type, blocked, requiresApproval },
    reason: reasons.join("; "),
    severity: blocked ? "critical" : "warning",
    createdAt: options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString(),
    metadata: { action, policy }
  } : void 0;
  return {
    allowed: !blocked,
    blocked,
    requiresApproval: !blocked && requiresApproval,
    reasons,
    label
  };
}
|
|
2690
|
+
|
|
2588
2691
|
// src/prompt-registry.ts
|
|
2589
2692
|
var PromptRegistry = class {
|
|
2590
2693
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -6382,7 +6485,7 @@ function assertNonNegative(n, name) {
|
|
|
6382
6485
|
|
|
6383
6486
|
// src/muffled-gate-scanner.ts
|
|
6384
6487
|
import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
|
|
6385
|
-
import { join
|
|
6488
|
+
import { join } from "path";
|
|
6386
6489
|
/**
 * Strips comment text from a single source line.
 *
 * First removes everything from the first `//` to the end of the line, then
 * blanks the line entirely if what remains starts (after optional
 * whitespace) with `*`. The match is purely textual, so a `//` inside a
 * string literal is cut as well.
 *
 * @param {string} line - One line of source text.
 * @returns {string} The line with the matched comment text removed.
 */
function codeOf(line) {
  const withoutLineComment = line.replace(/\/\/.*$/, "");
  return withoutLineComment.replace(/^\s*\*.*$/, "");
}
|
|
@@ -6486,11 +6589,11 @@ var UNIVERSAL_FINDERS = [
|
|
|
6486
6589
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
6487
6590
|
const matches2 = [];
|
|
6488
6591
|
const walk = (rel) => {
|
|
6489
|
-
const abs =
|
|
6592
|
+
const abs = join(repoRoot, rel);
|
|
6490
6593
|
if (!existsSync2(abs)) return;
|
|
6491
6594
|
for (const entry of readdirSync(abs)) {
|
|
6492
|
-
const sub =
|
|
6493
|
-
const subAbs =
|
|
6595
|
+
const sub = join(rel, entry);
|
|
6596
|
+
const subAbs = join(repoRoot, sub);
|
|
6494
6597
|
let st;
|
|
6495
6598
|
try {
|
|
6496
6599
|
st = statSync(subAbs);
|
|
@@ -6519,7 +6622,7 @@ function scanForMuffledGates(opts) {
|
|
|
6519
6622
|
const findings = [];
|
|
6520
6623
|
const scanned = /* @__PURE__ */ new Set();
|
|
6521
6624
|
for (const file of opts.scanFiles) {
|
|
6522
|
-
const abs =
|
|
6625
|
+
const abs = join(opts.repoRoot, file);
|
|
6523
6626
|
if (!existsSync2(abs)) continue;
|
|
6524
6627
|
const text = readFileSync2(abs, "utf8");
|
|
6525
6628
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
@@ -6534,7 +6637,7 @@ function scanForMuffledGates(opts) {
|
|
|
6534
6637
|
);
|
|
6535
6638
|
for (const file of importers) {
|
|
6536
6639
|
if (scanned.has(file)) continue;
|
|
6537
|
-
const abs =
|
|
6640
|
+
const abs = join(opts.repoRoot, file);
|
|
6538
6641
|
if (!existsSync2(abs)) continue;
|
|
6539
6642
|
const text = readFileSync2(abs, "utf8");
|
|
6540
6643
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
@@ -8522,7 +8625,7 @@ async function commitBisect(options) {
|
|
|
8522
8625
|
}
|
|
8523
8626
|
async function promptBisect(options) {
|
|
8524
8627
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
8525
|
-
const
|
|
8628
|
+
const join3 = (paragraphs) => paragraphs.join("\n\n");
|
|
8526
8629
|
const goodParas = split(options.good);
|
|
8527
8630
|
const badParas = split(options.bad);
|
|
8528
8631
|
if (goodParas.length !== badParas.length) {
|
|
@@ -8540,7 +8643,7 @@ async function promptBisect(options) {
|
|
|
8540
8643
|
const result = await bisect({
|
|
8541
8644
|
good: goodMask,
|
|
8542
8645
|
bad: badMask,
|
|
8543
|
-
runEval: (mask) => options.runEval(
|
|
8646
|
+
runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
|
|
8544
8647
|
maxIterations: options.maxIterations ?? n + 5,
|
|
8545
8648
|
halfway: (g, b) => {
|
|
8546
8649
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -8571,12 +8674,12 @@ async function promptBisect(options) {
|
|
|
8571
8674
|
}
|
|
8572
8675
|
}
|
|
8573
8676
|
const materializedPath = result.path.map((s) => ({
|
|
8574
|
-
state:
|
|
8677
|
+
state: join3(paragraphsFor(s.state)),
|
|
8575
8678
|
score: s.score,
|
|
8576
8679
|
pass: s.pass
|
|
8577
8680
|
}));
|
|
8578
8681
|
return {
|
|
8579
|
-
culprit:
|
|
8682
|
+
culprit: join3(paragraphsFor(culprit)),
|
|
8580
8683
|
path: materializedPath,
|
|
8581
8684
|
converged: result.converged,
|
|
8582
8685
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -9631,7 +9734,7 @@ function mergeSignals(a, b) {
|
|
|
9631
9734
|
// src/command-runner.ts
|
|
9632
9735
|
import { spawnSync } from "child_process";
|
|
9633
9736
|
import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
|
|
9634
|
-
import { join as
|
|
9737
|
+
import { join as join2 } from "path";
|
|
9635
9738
|
var localCommandRunner = {
|
|
9636
9739
|
name: "local",
|
|
9637
9740
|
async run(input) {
|
|
@@ -9678,7 +9781,7 @@ var localCommandRunner = {
|
|
|
9678
9781
|
const out = [];
|
|
9679
9782
|
for (const name of entries) {
|
|
9680
9783
|
try {
|
|
9681
|
-
const st = statSync2(
|
|
9784
|
+
const st = statSync2(join2(path, name));
|
|
9682
9785
|
out.push({
|
|
9683
9786
|
name,
|
|
9684
9787
|
isDirectory: st.isDirectory(),
|
|
@@ -12589,6 +12692,274 @@ function samePopulation(a, b) {
|
|
|
12589
12692
|
return b.every((id) => setA.has(id));
|
|
12590
12693
|
}
|
|
12591
12694
|
|
|
12695
|
+
// src/multi-shot-optimization.ts
|
|
12696
|
+
/**
 * Runs a prompt-evolution search and, when a gate is configured, decides
 * whether the search winner may replace the baseline.
 *
 * Steps:
 *  1. Validate `config` (throws on invalid input).
 *  2. Run `runPromptEvolution`, scoring every trial through `scoreOne` with
 *     the `"search"` split and forwarding mutation requests to
 *     `config.mutateAdapter`.
 *  3. If `config.gate` is set and the winner is not the first seed variant
 *     (the baseline), evaluate the gate. When the gate declines promotion,
 *     the baseline and its aggregate are reported as promoted instead.
 *
 * @param {object} config - Optimization configuration.
 * @returns {Promise<{ evolution: object, searchBestVariant: object,
 *   searchBestAggregate: object, promotedVariant: object,
 *   promotedAggregate: object, gate: (object|null) }>} `gate` is null when no
 *   gate evaluation took place.
 */
async function runMultiShotOptimization(config) {
  validateConfig(config);

  const searchScorer = {
    score: (args) => scoreOne(config, args.variant, args.scenarioId, args.rep, "search")
  };
  const forwardingMutator = {
    // Resolved lazily so the adapter is only touched when a mutation is requested.
    mutate: (args) => config.mutateAdapter.mutate({
      ...args,
      topTrials: args.topTrials,
      bottomTrials: args.bottomTrials
    })
  };
  const evolutionOptions = {
    runId: config.runId,
    target: config.target,
    seedVariants: config.seedVariants,
    scenarioIds: config.searchScenarioIds,
    reps: config.reps,
    generations: config.generations,
    populationSize: config.populationSize,
    scoreConcurrency: config.scoreConcurrency ?? 1,
    scoreAdapter: searchScorer,
    mutateAdapter: forwardingMutator,
    objectives: config.objectives ?? defaultMultiShotObjectives(),
    scalarWeights: config.scalarWeights,
    earlyStopOnNoImprovement: config.earlyStopOnNoImprovement,
    cache: config.cache,
    onProgress: config.onProgress
  };
  const evolution = await runPromptEvolution(evolutionOptions);

  const baseline = config.seedVariants[0];
  const outcome = {
    evolution,
    searchBestVariant: evolution.bestVariant,
    searchBestAggregate: evolution.bestAggregate,
    promotedVariant: evolution.bestVariant,
    promotedAggregate: evolution.bestAggregate,
    gate: null
  };

  const winnerIsBaseline = evolution.bestVariant.id === baseline.id;
  if (!config.gate || winnerIsBaseline) return outcome;

  outcome.gate = await evaluateMultiShotGate(config, baseline, evolution.bestVariant);
  if (!outcome.gate.decision.promote) {
    outcome.promotedVariant = baseline;
    outcome.promotedAggregate = aggregateFor(evolution, baseline.id);
  }
  return outcome;
}
|
|
12744
|
+
/**
 * Builds the objectives used when the caller supplies none: maximize the
 * aggregate's `meanScore` and minimize its `meanCost`.
 *
 * A fresh array with fresh objective objects is returned on every call.
 *
 * @returns {Array<{ name: string, direction: string, value: Function }>}
 *   The score objective followed by the cost objective.
 */
function defaultMultiShotObjectives() {
  const maximizeScore = {
    name: "score",
    direction: "maximize",
    value: (aggregate) => aggregate.meanScore
  };
  const minimizeCost = {
    name: "cost",
    direction: "minimize",
    value: (aggregate) => aggregate.meanCost
  };
  return [maximizeScore, minimizeCost];
}
|
|
12750
|
+
/**
 * Reshapes a multi-shot trial into a trial-trace record.
 *
 * The trace id is `<variantId>/<scenarioId>/r<rep>`. Each ASI item becomes
 * an expectation; items without an `expectationId` get a positional
 * `asi-<index>` id, and a missing `matched` flag defaults to false. When the
 * trial has no `emitted` text, an excerpt is derived from `trial.trace`.
 *
 * @param {object} trial - Trial with `variantId`, `scenarioId`, `rep`,
 *   `score`, and optional `asi`, `emitted`, `trace`, `metrics`.
 * @returns {{ id: string, score: *, inputName: *, expectations: object[],
 *   emitted: *, metrics: * }} The trace record.
 */
function trialTraceFromMultiShotTrial(trial) {
  const findings = trial.asi ?? [];
  const expectations = findings.map((finding, position) => ({
    id: finding.expectationId ?? `asi-${position}`,
    phrase: finding.message,
    matched: finding.matched ?? false
  }));
  return {
    id: `${trial.variantId}/${trial.scenarioId}/r${trial.rep}`,
    score: trial.score,
    inputName: trial.scenarioId,
    expectations,
    emitted: trial.emitted ?? traceExcerpt(trial.trace),
    metrics: trial.metrics
  };
}
|
|
12764
|
+
/**
 * Scores baseline and candidate side by side and asks a `HeldOutGate` for a
 * promotion decision.
 *
 * Search scenarios (the gate's own list when given, otherwise the config's)
 * are scored first with the `"search"` split, then the gate's holdout
 * scenarios with the `"holdout"` split. For every scenario and repetition
 * the baseline is scored before the candidate, sequentially, and both trials
 * are converted into validated run records that share one seed.
 *
 * @param {object} config - Optimization config; `config.gate` must be set.
 * @param {object} baseline - Baseline variant.
 * @param {object} candidate - Candidate variant.
 * @returns {Promise<{ decision: object, candidateRuns: object[],
 *   baselineRuns: object[] }>} The gate decision and the records behind it.
 */
async function evaluateMultiShotGate(config, baseline, candidate) {
  const gateConfig = config.gate;
  const reps = gateConfig.reps ?? config.reps;
  const candidateRuns = [];
  const baselineRuns = [];

  // Scores both variants on every (scenario, rep) pair of one split.
  const scoreSplit = async (scenarioIds, split) => {
    for (const scenarioId of scenarioIds) {
      for (let rep = 0; rep < reps; rep += 1) {
        const seed = seedFor(config, scenarioId, rep);
        const baseTrial = await scoreOne(config, baseline, scenarioId, rep, split);
        const candTrial = await scoreOne(config, candidate, scenarioId, rep, split);
        baselineRuns.push(toValidatedRecord(config, baseline, scenarioId, rep, split, seed, baseTrial));
        candidateRuns.push(toValidatedRecord(config, candidate, scenarioId, rep, split, seed, candTrial));
      }
    }
  };

  await scoreSplit(gateConfig.searchScenarioIds ?? config.searchScenarioIds, "search");
  await scoreSplit(gateConfig.holdoutScenarioIds, "holdout");

  const decision = new HeldOutGate(gateConfig.gate).evaluate(candidateRuns, baselineRuns);
  return { decision, candidateRuns, baselineRuns };
}
|
|
12791
|
+
/**
 * Runs and scores one (variant, scenario, repetition) trial and never
 * rejects on runner or scorer failure.
 *
 * On success the trial carries the clamped score, cost and duration (the
 * scorer's values win over the runner's, defaulting to 0), numeric scorer
 * metrics merged with ASI-derived counters, the run trace, the ASI list and
 * the emitted text (falling back to a trace excerpt).
 *
 * On failure a zero-score trial is returned with `ok: false`, the error
 * message, and a single critical ASI item attributed to `config.target`.
 * If the runner had already completed before the failure (for example the
 * scorer threw), the run's `costUsd` and `durationMs` are kept so that money
 * and time actually spent are not reported as 0; otherwise both are 0.
 *
 * @param {object} config - Optimization config providing `runner`, `scorer`, `target`.
 * @param {{ id: string }} variant - Variant being evaluated.
 * @param {string} scenarioId - Scenario to run.
 * @param {number} rep - Repetition index.
 * @param {string} split - Split label passed to runner and scorer (callers use "search" or "holdout").
 * @returns {Promise<object>} The trial record.
 */
async function scoreOne(config, variant, scenarioId, rep, split) {
  const seed = seedFor(config, scenarioId, rep);
  const input = { variant, scenarioId, rep, split, seed };
  // Declared outside the try block so the failure path can still see a run
  // that finished before the scorer (or result assembly) threw.
  let run;
  try {
    run = await config.runner.run(input);
    const scored = await config.scorer.score({ ...input, run });
    const asi = scored.asi ?? [];
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: scored.ok ?? true,
      score: clamp013(scored.score),
      cost: scored.costUsd ?? run.costUsd ?? 0,
      durationMs: scored.durationMs ?? run.durationMs ?? 0,
      metrics: {
        ...numericMetrics(scored.metrics),
        ...asiMetrics(asi)
      },
      split,
      seed,
      trace: run.trace,
      asi,
      emitted: scored.emitted ?? traceExcerpt(run.trace),
      metadata: scored.metadata
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: false,
      score: 0,
      cost: run?.costUsd ?? 0,
      durationMs: run?.durationMs ?? 0,
      metrics: { error: 1 },
      error: message,
      split,
      seed,
      asi: [{
        severity: "critical",
        message,
        responsibleSurface: config.target
      }],
      emitted: ""
    };
  }
}
|
|
12839
|
+
/**
 * Converts a scored trial into a run record via the gate's `toRunRecord`
 * hook and passes the result through `validateRunRecord`.
 *
 * @param {object} config - Optimization config; `config.gate.toRunRecord` is called.
 * @param {object} variant - Variant the trial belongs to.
 * @param {string} scenarioId - Scenario identifier.
 * @param {number} rep - Repetition index.
 * @param {string} split - Split label of the trial.
 * @param {number} seed - Seed used for the trial.
 * @param {object} trial - Scored trial.
 * @returns {*} Whatever `validateRunRecord` returns for the record.
 */
function toValidatedRecord(config, variant, scenarioId, rep, split, seed, trial) {
  const context = { variant, scenarioId, rep, split, seed, trial };
  const rawRecord = config.gate.toRunRecord(context);
  return validateRunRecord(rawRecord);
}
|
|
12843
|
+
/**
 * Validates a multi-shot optimization config and throws an `Error` on the
 * first violated rule.
 *
 * Rules, in order: non-blank `runId` and `target`; non-empty `seedVariants`
 * and `searchScenarioIds`; positive-integer `reps`, `generations`,
 * `populationSize` and (when given) `scoreConcurrency`; `populationSize` at
 * least the number of seed variants; unique seed variant ids and search
 * scenario ids. When `gate` is present: non-empty and unique holdout ids,
 * positive-integer `gate.reps` when given, unique `gate.searchScenarioIds`
 * when given, no holdout id that also appears in `searchScenarioIds`, and
 * `gate.gate.baselineKey` equal to the first seed variant's id.
 *
 * @param {object} config - Configuration to validate.
 * @returns {void}
 * @throws {Error} Describing the first rule that failed.
 */
function validateConfig(config) {
  if (!config.runId.trim()) throw new Error("runMultiShotOptimization: runId must not be empty");
  if (!config.target.trim()) throw new Error("runMultiShotOptimization: target must not be empty");
  if (config.seedVariants.length === 0) {
    throw new Error("runMultiShotOptimization: seedVariants must not be empty");
  }
  if (config.searchScenarioIds.length === 0) {
    throw new Error("runMultiShotOptimization: searchScenarioIds must not be empty");
  }

  for (const field of ["reps", "generations", "populationSize"]) {
    requirePositiveInteger(config[field], field);
  }
  if (config.scoreConcurrency !== void 0) {
    requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
  }
  if (config.populationSize < config.seedVariants.length) {
    throw new Error("runMultiShotOptimization: populationSize must be >= seedVariants.length");
  }
  assertUnique(config.seedVariants.map((variant) => variant.id), "seedVariants.id");
  assertUnique(config.searchScenarioIds, "searchScenarioIds");

  // Everything below only applies when a promotion gate is configured.
  if (!config.gate) return;

  if (config.gate.holdoutScenarioIds.length === 0) {
    throw new Error("runMultiShotOptimization: gate.holdoutScenarioIds must not be empty");
  }
  if (config.gate.reps !== void 0) requirePositiveInteger(config.gate.reps, "gate.reps");
  assertUnique(config.gate.holdoutScenarioIds, "gate.holdoutScenarioIds");
  if (config.gate.searchScenarioIds) {
    assertUnique(config.gate.searchScenarioIds, "gate.searchScenarioIds");
  }

  const searchIds = new Set(config.searchScenarioIds);
  for (const holdoutId of config.gate.holdoutScenarioIds) {
    if (searchIds.has(holdoutId)) {
      throw new Error(`runMultiShotOptimization: holdout scenario "${holdoutId}" also appears in searchScenarioIds`);
    }
  }

  const baselineId = config.seedVariants[0].id;
  if (config.gate.gate.baselineKey !== baselineId) {
    throw new Error(
      `runMultiShotOptimization: gate.gate.baselineKey must match first seed variant id "${baselineId}"`
    );
  }
}
|
|
12882
|
+
/**
 * Asserts that `value` is an integer greater than zero.
 *
 * @param {*} value - Value to check.
 * @param {string} name - Field name used in the error message.
 * @returns {void}
 * @throws {Error} When `value` is not a positive integer.
 */
function requirePositiveInteger(value, name) {
  const isPositiveInteger = Number.isInteger(value) && value > 0;
  if (isPositiveInteger) return;
  throw new Error(`runMultiShotOptimization: ${name} must be a positive integer`);
}
|
|
12887
|
+
/**
 * Asserts that every value is a non-blank string and that no value repeats.
 *
 * Values are checked in iteration order; the first blank (empty or
 * whitespace-only) value or the first repeated value raises an error.
 *
 * @param {Iterable<string>} values - Values to check.
 * @param {string} name - Field name used in the error messages.
 * @returns {void}
 * @throws {Error} On the first blank or duplicate value.
 */
function assertUnique(values, name) {
  const seen = /* @__PURE__ */ new Set();
  for (const candidate of values) {
    if (!candidate.trim()) {
      throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
    }
    // A Set only grows when the value is new, so an unchanged size means a repeat.
    const sizeBefore = seen.size;
    seen.add(candidate);
    if (seen.size === sizeBefore) {
      throw new Error(`runMultiShotOptimization: duplicate ${name} "${candidate}"`);
    }
  }
}
|
|
12895
|
+
/**
 * Looks up a variant's aggregate in the last generation of an evolution run.
 *
 * Only the final entry of `evolution.generations` is searched; earlier
 * generations are not consulted.
 *
 * @param {{ generations: Array<{ aggregates: object[] }> }} evolution - Evolution result.
 * @param {string} variantId - Variant whose aggregate is wanted.
 * @returns {object} The matching aggregate.
 * @throws {Error} When there is no generation or the last one has no
 *   aggregate for `variantId`.
 */
function aggregateFor(evolution, variantId) {
  const lastGeneration = evolution.generations[evolution.generations.length - 1];
  const match = lastGeneration?.aggregates.find((entry) => entry.variantId === variantId);
  if (match) return match;
  throw new Error(`runMultiShotOptimization: missing aggregate for variant "${variantId}"`);
}
|
|
12903
|
+
function seedFor(config, scenarioId, rep) {
|
|
12904
|
+
const base = config.seedBase ?? 0;
|
|
12905
|
+
return (base + stableHash2(`${scenarioId}${rep}`)) % Number.MAX_SAFE_INTEGER;
|
|
12906
|
+
}
|
|
12907
|
+
/**
 * Hashes a string to an unsigned 32-bit integer.
 *
 * Uses the 32-bit FNV-1a scheme (XOR, then multiply by the FNV prime),
 * applied to the string's UTF-16 code units rather than to bytes.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Hash in the range 0 to 2^32 - 1.
 */
function stableHash2(input) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let hash = FNV_OFFSET_BASIS;
  let index = 0;
  while (index < input.length) {
    // Math.imul keeps the multiplication in 32-bit integer arithmetic.
    hash = Math.imul(hash ^ input.charCodeAt(index), FNV_PRIME);
    index += 1;
  }
  // Reinterpret the signed 32-bit result as unsigned.
  return hash >>> 0;
}
|
|
12915
|
+
/**
 * Clamps a score into the closed interval [0, 1].
 *
 * Anything that is not a finite number (NaN, ±Infinity, non-numbers)
 * becomes 0.
 *
 * @param {*} n - Raw score.
 * @returns {number} A value between 0 and 1 inclusive.
 */
function clamp013(n) {
  if (!Number.isFinite(n)) return 0;
  // `<=` also maps -0 to +0.
  if (n <= 0) return 0;
  return n >= 1 ? 1 : n;
}
|
|
12919
|
+
/**
 * Copies only the finite-number entries of a metrics object.
 *
 * Entries whose value is not a finite number (strings, NaN, ±Infinity,
 * objects, ...) are dropped. A null or undefined input yields an empty
 * object.
 *
 * @param {object|null|undefined} metrics - Raw metrics.
 * @returns {Object<string, number>} A new object with the numeric entries.
 */
function numericMetrics(metrics) {
  const kept = {};
  Object.entries(metrics ?? {}).forEach(([name, value]) => {
    if (Number.isFinite(value)) kept[name] = value;
  });
  return kept;
}
|
|
12926
|
+
/**
 * Summarizes a list of ASI items as numeric counters.
 *
 * `asi` holds the total number of items. For at most the first 1000 items,
 * `asi.<severity>` counts items per normalized severity, and
 * `surface.<segment>` counts items per responsible surface (only for items
 * that name one).
 *
 * @param {Array<{ severity?: string, responsibleSurface?: string }>} asi - ASI items.
 * @returns {Object<string, number>} Counter map.
 */
function asiMetrics(asi) {
  const counts = { asi: asi.length };
  const bump = (key) => {
    counts[key] = (counts[key] ?? 0) + 1;
  };
  // Only the first 1000 items feed the per-severity and per-surface counters.
  for (const finding of asi.slice(0, 1e3)) {
    bump(`asi.${normalizeSeverity(finding.severity)}`);
    if (finding.responsibleSurface) {
      bump(`surface.${metricKeySegment(finding.responsibleSurface)}`);
    }
  }
  return counts;
}
|
|
12938
|
+
/**
 * Maps an arbitrary severity value onto the known severity names.
 *
 * `"info"`, `"warning"`, `"error"` and `"critical"` pass through unchanged;
 * every other value (including undefined) becomes `"error"`.
 *
 * @param {*} severity - Raw severity.
 * @returns {string} One of "info", "warning", "error", "critical".
 */
function normalizeSeverity(severity) {
  switch (severity) {
    case "info":
    case "warning":
    case "error":
    case "critical":
      return severity;
    default:
      return "error";
  }
}
|
|
12944
|
+
/**
 * Turns free text into a segment usable inside a metric key.
 *
 * The text is trimmed, every run of characters outside `a-z A-Z 0-9 . _ -`
 * is collapsed into a single underscore, and the result is cut to 80
 * characters. An empty result becomes `"unknown"`.
 *
 * @param {string} raw - Text to sanitize.
 * @returns {string} The sanitized, non-empty segment.
 */
function metricKeySegment(raw) {
  const trimmed = raw.trim();
  const sanitized = trimmed.replace(/[^a-zA-Z0-9._-]+/g, "_");
  const bounded = sanitized.slice(0, 80);
  return bounded || "unknown";
}
|
|
12947
|
+
/**
 * Extracts a short textual excerpt from a run trace.
 *
 * Preference order: a string `output` (even an empty one), then a truthy
 * `transcript`, then the first 20 `turns` serialized as JSON and cut to
 * 2000 characters, followed by a note saying how many turns were left out.
 * Turns that cannot be serialized yield a fixed placeholder.
 *
 * @param {object|null|undefined} trace - Trace to summarize.
 * @returns {*} The excerpt, or undefined when the trace is missing or has
 *   none of the recognized fields.
 */
function traceExcerpt(trace) {
  if (!trace) return void 0;
  if (typeof trace.output === "string") return trace.output;
  if (trace.transcript) return trace.transcript;
  if (!trace.turns) return void 0;
  try {
    const head = trace.turns.slice(0, 20);
    const omitted = trace.turns.length - head.length;
    const serialized = JSON.stringify(head).slice(0, 2e3);
    const suffix = omitted > 0 ? ` ... ${omitted} more turn(s)` : "";
    return `${serialized}${suffix}`;
  } catch {
    return "[unserializable trace turns]";
  }
}
|
|
12962
|
+
|
|
12592
12963
|
// src/jsonl-trial-cache.ts
|
|
12593
12964
|
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
12594
12965
|
import { dirname as dirname4 } from "path";
|
|
@@ -13708,6 +14079,7 @@ export {
|
|
|
13708
14079
|
decideReferenceReplayPromotion,
|
|
13709
14080
|
decideReferenceReplayRunPromotion,
|
|
13710
14081
|
defaultJudges,
|
|
14082
|
+
defaultMultiShotObjectives,
|
|
13711
14083
|
defaultReferenceReplayMatcher,
|
|
13712
14084
|
deployGateLayer,
|
|
13713
14085
|
distillPlaybook,
|
|
@@ -13715,6 +14087,7 @@ export {
|
|
|
13715
14087
|
estimateCost,
|
|
13716
14088
|
estimateTokens,
|
|
13717
14089
|
euAiActReport,
|
|
14090
|
+
evaluateActionPolicy,
|
|
13718
14091
|
evaluateContract,
|
|
13719
14092
|
evaluateHypothesis,
|
|
13720
14093
|
evaluateOracles,
|
|
@@ -13822,6 +14195,8 @@ export {
|
|
|
13822
14195
|
renderPlaybookMarkdown,
|
|
13823
14196
|
renderPreferenceMemoryMarkdown,
|
|
13824
14197
|
renderSteeringText,
|
|
14198
|
+
replayFeedbackTrajectories,
|
|
14199
|
+
replayFeedbackTrajectory,
|
|
13825
14200
|
replayScorerOverCorpus,
|
|
13826
14201
|
replayTraceThroughJudge,
|
|
13827
14202
|
requiredSampleSize,
|
|
@@ -13842,6 +14217,7 @@ export {
|
|
|
13842
14217
|
runJudgeFleet,
|
|
13843
14218
|
runKeywordCoverageJudge,
|
|
13844
14219
|
runKeywordCoverageJudgeUrl,
|
|
14220
|
+
runMultiShotOptimization,
|
|
13845
14221
|
runPromptEvolution,
|
|
13846
14222
|
runProposeReview,
|
|
13847
14223
|
runProposeReviewAsControlLoop,
|
|
@@ -13885,6 +14261,7 @@ export {
|
|
|
13885
14261
|
toolSpans,
|
|
13886
14262
|
toolSuccessRubric,
|
|
13887
14263
|
toolWasteView,
|
|
14264
|
+
trialTraceFromMultiShotTrial,
|
|
13888
14265
|
typoMutator,
|
|
13889
14266
|
urlContains,
|
|
13890
14267
|
validateRunRecord,
|