@tangle-network/agent-eval 0.40.5 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -355
- package/dist/campaign/index.js +106 -6
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/rl.js
CHANGED
|
@@ -6,24 +6,23 @@ import {
|
|
|
6
6
|
} from "./chunk-YV7J7X5N.js";
|
|
7
7
|
import {
|
|
8
8
|
runEvalCampaign
|
|
9
|
-
} from "./chunk-
|
|
10
|
-
import "./chunk-VXNVVBZO.js";
|
|
9
|
+
} from "./chunk-PD3MH6WU.js";
|
|
11
10
|
import "./chunk-BWZEGTES.js";
|
|
12
11
|
import {
|
|
13
12
|
rubricPredictiveValidity
|
|
14
13
|
} from "./chunk-YRZ4M5GS.js";
|
|
15
14
|
import {
|
|
16
15
|
evaluateInterimReleaseConfidence
|
|
17
|
-
} from "./chunk-
|
|
18
|
-
import "./chunk-EGIPWXHL.js";
|
|
16
|
+
} from "./chunk-MNL6LXGQ.js";
|
|
19
17
|
import {
|
|
20
18
|
benjaminiHochberg,
|
|
21
19
|
wilcoxonSignedRank
|
|
22
20
|
} from "./chunk-WP7SY7AI.js";
|
|
23
21
|
import "./chunk-UBPIXOC4.js";
|
|
24
|
-
import "./chunk-PC4UYEBM.js";
|
|
25
22
|
import "./chunk-TVVP3ZZQ.js";
|
|
26
23
|
import "./chunk-VSMTAMNK.js";
|
|
24
|
+
import "./chunk-VXNVVBZO.js";
|
|
25
|
+
import "./chunk-PC4UYEBM.js";
|
|
27
26
|
import {
|
|
28
27
|
ValidationError
|
|
29
28
|
} from "./chunk-QYJT52YW.js";
|
|
@@ -508,48 +507,44 @@ function scenarioOf(run) {
|
|
|
508
507
|
}
|
|
509
508
|
|
|
510
509
|
// src/rl/run-record-adapters.ts
|
|
511
|
-
function
|
|
510
|
+
function campaignToRunRecords(campaign, ctx) {
|
|
512
511
|
const splitTag = ctx.splitTag ?? "search";
|
|
513
|
-
const
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
}
|
|
545
|
-
function trialsToRunRecords(trials, ctx) {
|
|
546
|
-
return trials.map((t) => trialToRunRecord(t, ctx));
|
|
512
|
+
const candidateId = ctx.candidateId ?? campaign.manifestHash;
|
|
513
|
+
return campaign.cells.map((cell) => {
|
|
514
|
+
const composites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
515
|
+
const score = composites.length > 0 ? composites.reduce((a, b) => a + b, 0) / composites.length : 0;
|
|
516
|
+
const raw = { rep: cell.rep, duration_ms: cell.durationMs };
|
|
517
|
+
for (const judge of Object.values(cell.judgeScores)) {
|
|
518
|
+
for (const [dim, value] of Object.entries(judge.dimensions)) {
|
|
519
|
+
if (Number.isFinite(value)) raw[`dim.${dim}`] = value;
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
if (typeof cell.generation === "number") raw.generation = cell.generation;
|
|
523
|
+
const outcome = { raw };
|
|
524
|
+
if (splitTag === "holdout") outcome.holdoutScore = score;
|
|
525
|
+
else outcome.searchScore = score;
|
|
526
|
+
return {
|
|
527
|
+
runId: cell.cellId,
|
|
528
|
+
experimentId: ctx.experimentId,
|
|
529
|
+
candidateId,
|
|
530
|
+
seed: cell.seed,
|
|
531
|
+
model: ctx.model,
|
|
532
|
+
promptHash: ctx.promptHash,
|
|
533
|
+
configHash: ctx.configHash,
|
|
534
|
+
commitSha: ctx.commitSha,
|
|
535
|
+
wallMs: cell.durationMs,
|
|
536
|
+
costUsd: Number.isFinite(cell.costUsd) ? cell.costUsd : ctx.defaultCostUsd ?? 0,
|
|
537
|
+
tokenUsage: { input: 0, output: 0 },
|
|
538
|
+
outcome,
|
|
539
|
+
failureMode: cell.error ? "cell_error" : void 0,
|
|
540
|
+
splitTag,
|
|
541
|
+
scenarioId: cell.scenarioId
|
|
542
|
+
};
|
|
543
|
+
});
|
|
547
544
|
}
|
|
548
545
|
function verificationReportToRunRecord(report, ctx, opts = {}) {
|
|
549
546
|
const splitTag = ctx.splitTag ?? "search";
|
|
550
547
|
const runId = opts.runId ?? `run-${ctx.candidateId}-${ctx.experimentId}-${report.startedAt}`;
|
|
551
|
-
const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
|
|
552
|
-
const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
|
|
553
548
|
const raw = {
|
|
554
549
|
pass_count: report.passCount,
|
|
555
550
|
fail_count: report.failCount,
|
|
@@ -577,8 +572,8 @@ function verificationReportToRunRecord(report, ctx, opts = {}) {
|
|
|
577
572
|
candidateId: ctx.candidateId,
|
|
578
573
|
seed: 0,
|
|
579
574
|
model: ctx.model,
|
|
580
|
-
promptHash,
|
|
581
|
-
configHash,
|
|
575
|
+
promptHash: ctx.promptHash,
|
|
576
|
+
configHash: ctx.configHash,
|
|
582
577
|
commitSha: ctx.commitSha,
|
|
583
578
|
wallMs: report.durationMs,
|
|
584
579
|
costUsd: ctx.defaultCostUsd ?? 0,
|
|
@@ -589,39 +584,6 @@ function verificationReportToRunRecord(report, ctx, opts = {}) {
|
|
|
589
584
|
scenarioId: ctx.scenarioId
|
|
590
585
|
};
|
|
591
586
|
}
|
|
592
|
-
function variantAggregateToRunRecord(agg, ctx, opts = {}) {
|
|
593
|
-
const splitTag = ctx.splitTag ?? "search";
|
|
594
|
-
const runId = opts.runId ?? `agg-${agg.variantId}-${ctx.experimentId}`;
|
|
595
|
-
const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
|
|
596
|
-
const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
|
|
597
|
-
const raw = {
|
|
598
|
-
...agg.metrics,
|
|
599
|
-
ok_rate: agg.okRate,
|
|
600
|
-
duration_ms: agg.meanDurationMs,
|
|
601
|
-
n_scenarios: agg.scenarios.length
|
|
602
|
-
};
|
|
603
|
-
const outcome = { raw };
|
|
604
|
-
if (splitTag === "holdout") outcome.holdoutScore = agg.meanScore;
|
|
605
|
-
else outcome.searchScore = agg.meanScore;
|
|
606
|
-
return {
|
|
607
|
-
runId,
|
|
608
|
-
experimentId: ctx.experimentId,
|
|
609
|
-
candidateId: agg.variantId,
|
|
610
|
-
seed: 0,
|
|
611
|
-
model: ctx.model,
|
|
612
|
-
promptHash,
|
|
613
|
-
configHash,
|
|
614
|
-
commitSha: ctx.commitSha,
|
|
615
|
-
wallMs: agg.meanDurationMs,
|
|
616
|
-
costUsd: agg.meanCost,
|
|
617
|
-
tokenUsage: { input: 0, output: 0 },
|
|
618
|
-
outcome,
|
|
619
|
-
splitTag
|
|
620
|
-
};
|
|
621
|
-
}
|
|
622
|
-
function defaultRunId(ctx, t) {
|
|
623
|
-
return `run-${ctx.experimentId}-${t.variantId}-${t.scenarioId}-${t.rep}`;
|
|
624
|
-
}
|
|
625
587
|
function failureModeFromLayer(layer) {
|
|
626
588
|
if (layer.status === "error") return `layer_${layer.layer}_error`;
|
|
627
589
|
if (layer.status === "fail") return `layer_${layer.layer}_fail`;
|
|
@@ -1245,120 +1207,6 @@ function defaultReward(run) {
|
|
|
1245
1207
|
return typeof v === "number" && Number.isFinite(v) ? v : null;
|
|
1246
1208
|
}
|
|
1247
1209
|
|
|
1248
|
-
// src/rl/auto-research.ts
|
|
1249
|
-
async function analyzeOptimizationResult(opts) {
|
|
1250
|
-
const trials = extractTrials(opts.result);
|
|
1251
|
-
const runs = trialsToRunRecords(trials, opts.ctx);
|
|
1252
|
-
const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});
|
|
1253
|
-
const preferences = extractPreferences(runs, {
|
|
1254
|
-
strategy: opts.preferences?.strategy ?? "paired-by-scenario-and-seed",
|
|
1255
|
-
minMargin: opts.preferences?.minMargin ?? 0.05,
|
|
1256
|
-
splitTag: opts.preferences?.splitTag ?? opts.ctx.splitTag ?? "search",
|
|
1257
|
-
rewardOf: opts.preferences?.rewardOf
|
|
1258
|
-
});
|
|
1259
|
-
let interimConfidence = null;
|
|
1260
|
-
if (opts.comparator) {
|
|
1261
|
-
const deltaSeries = collectPairedDeltaSeries(runs, opts.comparator);
|
|
1262
|
-
if (deltaSeries.some((s) => s.deltas.length > 0)) {
|
|
1263
|
-
interimConfidence = evaluateInterimReleaseConfidence({
|
|
1264
|
-
deltaSeries,
|
|
1265
|
-
alpha: opts.sequential?.alpha,
|
|
1266
|
-
bound: opts.sequential?.bound,
|
|
1267
|
-
rope: opts.sequential?.rope
|
|
1268
|
-
});
|
|
1269
|
-
}
|
|
1270
|
-
}
|
|
1271
|
-
const rewardHacking = detectRewardHacking({
|
|
1272
|
-
runs,
|
|
1273
|
-
verifiableRewardOptions: opts.verifiableReward
|
|
1274
|
-
});
|
|
1275
|
-
let predictiveValidity = null;
|
|
1276
|
-
if (opts.outcomes) {
|
|
1277
|
-
predictiveValidity = await rubricPredictiveValidity({
|
|
1278
|
-
runs,
|
|
1279
|
-
outcomes: opts.outcomes.store,
|
|
1280
|
-
outcomeMetrics: opts.outcomes.metrics
|
|
1281
|
-
});
|
|
1282
|
-
}
|
|
1283
|
-
const trainerRows = {};
|
|
1284
|
-
if (opts.trainerExport?.dpo) {
|
|
1285
|
-
trainerRows.dpo = await toDpoRows(preferences.pairs, opts.trainerExport.dpo);
|
|
1286
|
-
}
|
|
1287
|
-
if (opts.trainerExport?.grpo) {
|
|
1288
|
-
trainerRows.grpo = await toGrpoRows(runs, opts.trainerExport.grpo);
|
|
1289
|
-
}
|
|
1290
|
-
const summary = buildSummary({
|
|
1291
|
-
runs,
|
|
1292
|
-
preferences,
|
|
1293
|
-
interimConfidence,
|
|
1294
|
-
rewardHacking,
|
|
1295
|
-
predictiveValidity
|
|
1296
|
-
});
|
|
1297
|
-
return {
|
|
1298
|
-
runs,
|
|
1299
|
-
rewardSignals,
|
|
1300
|
-
preferences,
|
|
1301
|
-
interimConfidence,
|
|
1302
|
-
rewardHacking,
|
|
1303
|
-
predictiveValidity,
|
|
1304
|
-
trainerRows,
|
|
1305
|
-
summary
|
|
1306
|
-
};
|
|
1307
|
-
}
|
|
1308
|
-
function extractTrials(result) {
|
|
1309
|
-
if ("evolution" in result) {
|
|
1310
|
-
return collectFromEvolution(result.evolution);
|
|
1311
|
-
}
|
|
1312
|
-
return collectFromEvolution(result);
|
|
1313
|
-
}
|
|
1314
|
-
function collectFromEvolution(evolution) {
|
|
1315
|
-
const trials = [];
|
|
1316
|
-
for (const gen of evolution.generations) {
|
|
1317
|
-
for (const t of gen.trials ?? []) trials.push(t);
|
|
1318
|
-
}
|
|
1319
|
-
return trials;
|
|
1320
|
-
}
|
|
1321
|
-
function collectPairedDeltaSeries(runs, comparator) {
|
|
1322
|
-
const baseline = /* @__PURE__ */ new Map();
|
|
1323
|
-
for (const r of runs) {
|
|
1324
|
-
if (r.candidateId !== comparator) continue;
|
|
1325
|
-
const sid = r.scenarioId ?? r.experimentId;
|
|
1326
|
-
const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
|
|
1327
|
-
if (typeof score !== "number" || !Number.isFinite(score)) continue;
|
|
1328
|
-
baseline.set(`${sid}::${r.seed}`, score);
|
|
1329
|
-
}
|
|
1330
|
-
const byCandidate = /* @__PURE__ */ new Map();
|
|
1331
|
-
for (const r of runs) {
|
|
1332
|
-
if (r.candidateId === comparator) continue;
|
|
1333
|
-
const sid = r.scenarioId ?? r.experimentId;
|
|
1334
|
-
const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
|
|
1335
|
-
if (typeof score !== "number" || !Number.isFinite(score)) continue;
|
|
1336
|
-
const baseScore = baseline.get(`${sid}::${r.seed}`);
|
|
1337
|
-
if (typeof baseScore !== "number") continue;
|
|
1338
|
-
const arr = byCandidate.get(r.candidateId) ?? [];
|
|
1339
|
-
arr.push(score - baseScore);
|
|
1340
|
-
byCandidate.set(r.candidateId, arr);
|
|
1341
|
-
}
|
|
1342
|
-
return [...byCandidate.entries()].map(([candidateId, deltas]) => ({ candidateId, deltas }));
|
|
1343
|
-
}
|
|
1344
|
-
function buildSummary(args) {
|
|
1345
|
-
const lines = [
|
|
1346
|
-
`${args.runs.length} runs analysed`,
|
|
1347
|
-
`${args.preferences.pairs.length} preference pairs (${args.preferences.strategy})`,
|
|
1348
|
-
`reward-hacking verdict: ${args.rewardHacking.verdict}`
|
|
1349
|
-
];
|
|
1350
|
-
if (args.interimConfidence) {
|
|
1351
|
-
lines.push(
|
|
1352
|
-
`sequential: ${args.interimConfidence.recommendation.decision}` + (args.interimConfidence.recommendation.candidateId ? ` ${args.interimConfidence.recommendation.candidateId}` : "")
|
|
1353
|
-
);
|
|
1354
|
-
}
|
|
1355
|
-
if (args.predictiveValidity?.ranked[0]) {
|
|
1356
|
-
const top = args.predictiveValidity.ranked[0];
|
|
1357
|
-
lines.push(`top-rubric: ${top.rubric} \u03C1=${top.spearman.toFixed(2)}`);
|
|
1358
|
-
}
|
|
1359
|
-
return lines.join(" | ");
|
|
1360
|
-
}
|
|
1361
|
-
|
|
1362
1210
|
// src/rl/predictive-validity-researcher.ts
|
|
1363
1211
|
var PredictiveValidityResearcher = class {
|
|
1364
1212
|
opts;
|
|
@@ -1640,7 +1488,7 @@ async function runRLCampaign(opts) {
|
|
|
1640
1488
|
let interimConfidence = null;
|
|
1641
1489
|
if (opts.report?.comparator) {
|
|
1642
1490
|
const comparator = opts.report.comparator;
|
|
1643
|
-
const deltaSeries = collectPairedDeltaSeries2(campaign.runs, comparator);
|
|
1491
|
+
const deltaSeries = collectPairedDeltaSeries(campaign.runs, comparator);
|
|
1644
1492
|
if (deltaSeries.some((s) => s.deltas.length > 0)) {
|
|
1645
1493
|
interimConfidence = evaluateInterimReleaseConfidence({
|
|
1646
1494
|
deltaSeries,
|
|
@@ -1672,7 +1520,7 @@ async function runRLCampaign(opts) {
|
|
|
1672
1520
|
if (opts.trainerExport?.sft) {
|
|
1673
1521
|
trainerRows.sft = await toSftRows(campaign.runs, opts.trainerExport.sft);
|
|
1674
1522
|
}
|
|
1675
|
-
const summary = buildSummary2({
|
|
1523
|
+
const summary = buildSummary({
|
|
1676
1524
|
campaign,
|
|
1677
1525
|
preferences,
|
|
1678
1526
|
interimConfidence,
|
|
@@ -1691,7 +1539,7 @@ async function runRLCampaign(opts) {
|
|
|
1691
1539
|
kind: "agent-eval-rl-campaign"
|
|
1692
1540
|
};
|
|
1693
1541
|
}
|
|
1694
|
-
function collectPairedDeltaSeries2(runs, comparator) {
|
|
1542
|
+
function collectPairedDeltaSeries(runs, comparator) {
|
|
1695
1543
|
const baseline = /* @__PURE__ */ new Map();
|
|
1696
1544
|
for (const r of runs) {
|
|
1697
1545
|
if (r.candidateId !== comparator) continue;
|
|
@@ -1714,7 +1562,7 @@ function collectPairedDeltaSeries2(runs, comparator) {
|
|
|
1714
1562
|
}
|
|
1715
1563
|
return [...byCandidate.entries()].map(([candidateId, deltas]) => ({ candidateId, deltas }));
|
|
1716
1564
|
}
|
|
1717
|
-
function buildSummary2(args) {
|
|
1565
|
+
function buildSummary(args) {
|
|
1718
1566
|
const c = args.campaign;
|
|
1719
1567
|
const lines = [
|
|
1720
1568
|
`${c.campaignId}: ${c.runs.length} successful runs / ${c.failedRuns.length} failed (fingerprint ${c.campaignFingerprint.slice(0, 12)}\u2026)`,
|
|
@@ -1739,10 +1587,10 @@ function buildSummary2(args) {
|
|
|
1739
1587
|
export {
|
|
1740
1588
|
PredictiveValidityResearcher,
|
|
1741
1589
|
adversarialScenarioSearch,
|
|
1742
|
-
analyzeOptimizationResult,
|
|
1743
1590
|
applyEloUpdate,
|
|
1744
1591
|
bestOfN,
|
|
1745
1592
|
buildPairwiseFromCampaign,
|
|
1593
|
+
campaignToRunRecords,
|
|
1746
1594
|
compareAdaptationCurves,
|
|
1747
1595
|
detectRewardHacking,
|
|
1748
1596
|
doublyRobust,
|
|
@@ -1781,10 +1629,7 @@ export {
|
|
|
1781
1629
|
toSftJsonl,
|
|
1782
1630
|
toSftRows,
|
|
1783
1631
|
toTRLFormat,
|
|
1784
|
-
trialToRunRecord,
|
|
1785
|
-
trialsToRunRecords,
|
|
1786
1632
|
varianceBasedCurriculum,
|
|
1787
|
-
variantAggregateToRunRecord,
|
|
1788
1633
|
verificationReportToRunRecord
|
|
1789
1634
|
};
|
|
1790
1635
|
//# sourceMappingURL=rl.js.map
|