@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/dist/campaign/index.d.ts +48 -355
  2. package/dist/campaign/index.js +106 -6
  3. package/dist/campaign/index.js.map +1 -1
  4. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  5. package/dist/chunk-H4TOS272.js.map +1 -0
  6. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  7. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  8. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  9. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  10. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  11. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  12. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  13. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  14. package/dist/control.d.ts +2 -2
  15. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  16. package/dist/index.d.ts +227 -687
  17. package/dist/index.js +753 -1237
  18. package/dist/index.js.map +1 -1
  19. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  20. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +67 -3
  23. package/dist/pipelines/index.js.map +1 -1
  24. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  25. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  26. package/dist/reporting.d.ts +2 -3
  27. package/dist/reporting.js +4 -8
  28. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  29. package/dist/rl.d.ts +103 -221
  30. package/dist/rl.js +44 -199
  31. package/dist/rl.js.map +1 -1
  32. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  33. package/dist/traces.d.ts +3 -2
  34. package/dist/traces.js +5 -5
  35. package/dist/types-BLbRTxoc.d.ts +367 -0
  36. package/dist/wire/index.d.ts +1 -1
  37. package/package.json +1 -6
  38. package/dist/chunk-5U2DOJU4.js.map +0 -1
  39. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  40. package/dist/chunk-DMW5VENN.js +0 -1412
  41. package/dist/chunk-DMW5VENN.js.map +0 -1
  42. package/dist/chunk-EGIPWXHL.js.map +0 -1
  43. package/dist/chunk-MAZ26DC7.js +0 -99
  44. package/dist/chunk-MAZ26DC7.js.map +0 -1
  45. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  46. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  47. package/dist/optimization.d.ts +0 -11
  48. package/dist/optimization.js +0 -71
  49. package/dist/optimization.js.map +0 -1
  50. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  51. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  52. package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/rl.js CHANGED
@@ -6,24 +6,23 @@ import {
6
6
  } from "./chunk-YV7J7X5N.js";
7
7
  import {
8
8
  runEvalCampaign
9
- } from "./chunk-LCIDRYGP.js";
10
- import "./chunk-VXNVVBZO.js";
9
+ } from "./chunk-PD3MH6WU.js";
11
10
  import "./chunk-BWZEGTES.js";
12
11
  import {
13
12
  rubricPredictiveValidity
14
13
  } from "./chunk-YRZ4M5GS.js";
15
14
  import {
16
15
  evaluateInterimReleaseConfidence
17
- } from "./chunk-MAZ26DC7.js";
18
- import "./chunk-EGIPWXHL.js";
16
+ } from "./chunk-MNL6LXGQ.js";
19
17
  import {
20
18
  benjaminiHochberg,
21
19
  wilcoxonSignedRank
22
20
  } from "./chunk-WP7SY7AI.js";
23
21
  import "./chunk-UBPIXOC4.js";
24
- import "./chunk-PC4UYEBM.js";
25
22
  import "./chunk-TVVP3ZZQ.js";
26
23
  import "./chunk-VSMTAMNK.js";
24
+ import "./chunk-VXNVVBZO.js";
25
+ import "./chunk-PC4UYEBM.js";
27
26
  import {
28
27
  ValidationError
29
28
  } from "./chunk-QYJT52YW.js";
@@ -508,48 +507,44 @@ function scenarioOf(run) {
508
507
  }
509
508
 
510
509
  // src/rl/run-record-adapters.ts
511
- function trialToRunRecord(trial, ctx, opts = {}) {
510
+ function campaignToRunRecords(campaign, ctx) {
512
511
  const splitTag = ctx.splitTag ?? "search";
513
- const promptHash = typeof ctx.promptHash === "function" ? ctx.promptHash(trial) : ctx.promptHash;
514
- const configHash = typeof ctx.configHash === "function" ? ctx.configHash(trial) : ctx.configHash;
515
- const runId = opts.runId ?? defaultRunId(ctx, trial);
516
- const experimentId = opts.experimentIdPerTrial?.(trial) ?? ctx.experimentId;
517
- const costRecorded = typeof trial.cost === "number" && Number.isFinite(trial.cost);
518
- const costUsd = costRecorded ? trial.cost : ctx.defaultCostUsd ?? 0;
519
- const raw = { ...trial.metrics ?? {} };
520
- if (!costRecorded) raw.cost_unknown = 1;
521
- if (typeof trial.durationMs === "number") raw.duration_ms = trial.durationMs;
522
- raw.rep = trial.rep;
523
- const score = Number.isFinite(trial.score) ? trial.score : 0;
524
- const outcome = { raw };
525
- if (splitTag === "holdout") outcome.holdoutScore = score;
526
- else outcome.searchScore = score;
527
- return {
528
- runId,
529
- experimentId,
530
- candidateId: trial.variantId,
531
- seed: trial.rep,
532
- model: ctx.model,
533
- promptHash,
534
- configHash,
535
- commitSha: ctx.commitSha,
536
- wallMs: trial.durationMs ?? 0,
537
- costUsd,
538
- tokenUsage: { input: 0, output: 0 },
539
- outcome,
540
- failureMode: trial.ok ? void 0 : trial.error ? "optimizer_trial_error" : "optimizer_trial_failed",
541
- splitTag,
542
- scenarioId: trial.scenarioId
543
- };
544
- }
545
- function trialsToRunRecords(trials, ctx) {
546
- return trials.map((t) => trialToRunRecord(t, ctx));
512
+ const candidateId = ctx.candidateId ?? campaign.manifestHash;
513
+ return campaign.cells.map((cell) => {
514
+ const composites = Object.values(cell.judgeScores).map((s) => s.composite);
515
+ const score = composites.length > 0 ? composites.reduce((a, b) => a + b, 0) / composites.length : 0;
516
+ const raw = { rep: cell.rep, duration_ms: cell.durationMs };
517
+ for (const judge of Object.values(cell.judgeScores)) {
518
+ for (const [dim, value] of Object.entries(judge.dimensions)) {
519
+ if (Number.isFinite(value)) raw[`dim.${dim}`] = value;
520
+ }
521
+ }
522
+ if (typeof cell.generation === "number") raw.generation = cell.generation;
523
+ const outcome = { raw };
524
+ if (splitTag === "holdout") outcome.holdoutScore = score;
525
+ else outcome.searchScore = score;
526
+ return {
527
+ runId: cell.cellId,
528
+ experimentId: ctx.experimentId,
529
+ candidateId,
530
+ seed: cell.seed,
531
+ model: ctx.model,
532
+ promptHash: ctx.promptHash,
533
+ configHash: ctx.configHash,
534
+ commitSha: ctx.commitSha,
535
+ wallMs: cell.durationMs,
536
+ costUsd: Number.isFinite(cell.costUsd) ? cell.costUsd : ctx.defaultCostUsd ?? 0,
537
+ tokenUsage: { input: 0, output: 0 },
538
+ outcome,
539
+ failureMode: cell.error ? "cell_error" : void 0,
540
+ splitTag,
541
+ scenarioId: cell.scenarioId
542
+ };
543
+ });
547
544
  }
548
545
  function verificationReportToRunRecord(report, ctx, opts = {}) {
549
546
  const splitTag = ctx.splitTag ?? "search";
550
547
  const runId = opts.runId ?? `run-${ctx.candidateId}-${ctx.experimentId}-${report.startedAt}`;
551
- const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
552
- const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
553
548
  const raw = {
554
549
  pass_count: report.passCount,
555
550
  fail_count: report.failCount,
@@ -577,8 +572,8 @@ function verificationReportToRunRecord(report, ctx, opts = {}) {
577
572
  candidateId: ctx.candidateId,
578
573
  seed: 0,
579
574
  model: ctx.model,
580
- promptHash,
581
- configHash,
575
+ promptHash: ctx.promptHash,
576
+ configHash: ctx.configHash,
582
577
  commitSha: ctx.commitSha,
583
578
  wallMs: report.durationMs,
584
579
  costUsd: ctx.defaultCostUsd ?? 0,
@@ -589,39 +584,6 @@ function verificationReportToRunRecord(report, ctx, opts = {}) {
589
584
  scenarioId: ctx.scenarioId
590
585
  };
591
586
  }
592
- function variantAggregateToRunRecord(agg, ctx, opts = {}) {
593
- const splitTag = ctx.splitTag ?? "search";
594
- const runId = opts.runId ?? `agg-${agg.variantId}-${ctx.experimentId}`;
595
- const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
596
- const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
597
- const raw = {
598
- ...agg.metrics,
599
- ok_rate: agg.okRate,
600
- duration_ms: agg.meanDurationMs,
601
- n_scenarios: agg.scenarios.length
602
- };
603
- const outcome = { raw };
604
- if (splitTag === "holdout") outcome.holdoutScore = agg.meanScore;
605
- else outcome.searchScore = agg.meanScore;
606
- return {
607
- runId,
608
- experimentId: ctx.experimentId,
609
- candidateId: agg.variantId,
610
- seed: 0,
611
- model: ctx.model,
612
- promptHash,
613
- configHash,
614
- commitSha: ctx.commitSha,
615
- wallMs: agg.meanDurationMs,
616
- costUsd: agg.meanCost,
617
- tokenUsage: { input: 0, output: 0 },
618
- outcome,
619
- splitTag
620
- };
621
- }
622
- function defaultRunId(ctx, t) {
623
- return `run-${ctx.experimentId}-${t.variantId}-${t.scenarioId}-${t.rep}`;
624
- }
625
587
  function failureModeFromLayer(layer) {
626
588
  if (layer.status === "error") return `layer_${layer.layer}_error`;
627
589
  if (layer.status === "fail") return `layer_${layer.layer}_fail`;
@@ -1245,120 +1207,6 @@ function defaultReward(run) {
1245
1207
  return typeof v === "number" && Number.isFinite(v) ? v : null;
1246
1208
  }
1247
1209
 
1248
- // src/rl/auto-research.ts
1249
- async function analyzeOptimizationResult(opts) {
1250
- const trials = extractTrials(opts.result);
1251
- const runs = trialsToRunRecords(trials, opts.ctx);
1252
- const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});
1253
- const preferences = extractPreferences(runs, {
1254
- strategy: opts.preferences?.strategy ?? "paired-by-scenario-and-seed",
1255
- minMargin: opts.preferences?.minMargin ?? 0.05,
1256
- splitTag: opts.preferences?.splitTag ?? opts.ctx.splitTag ?? "search",
1257
- rewardOf: opts.preferences?.rewardOf
1258
- });
1259
- let interimConfidence = null;
1260
- if (opts.comparator) {
1261
- const deltaSeries = collectPairedDeltaSeries(runs, opts.comparator);
1262
- if (deltaSeries.some((s) => s.deltas.length > 0)) {
1263
- interimConfidence = evaluateInterimReleaseConfidence({
1264
- deltaSeries,
1265
- alpha: opts.sequential?.alpha,
1266
- bound: opts.sequential?.bound,
1267
- rope: opts.sequential?.rope
1268
- });
1269
- }
1270
- }
1271
- const rewardHacking = detectRewardHacking({
1272
- runs,
1273
- verifiableRewardOptions: opts.verifiableReward
1274
- });
1275
- let predictiveValidity = null;
1276
- if (opts.outcomes) {
1277
- predictiveValidity = await rubricPredictiveValidity({
1278
- runs,
1279
- outcomes: opts.outcomes.store,
1280
- outcomeMetrics: opts.outcomes.metrics
1281
- });
1282
- }
1283
- const trainerRows = {};
1284
- if (opts.trainerExport?.dpo) {
1285
- trainerRows.dpo = await toDpoRows(preferences.pairs, opts.trainerExport.dpo);
1286
- }
1287
- if (opts.trainerExport?.grpo) {
1288
- trainerRows.grpo = await toGrpoRows(runs, opts.trainerExport.grpo);
1289
- }
1290
- const summary = buildSummary({
1291
- runs,
1292
- preferences,
1293
- interimConfidence,
1294
- rewardHacking,
1295
- predictiveValidity
1296
- });
1297
- return {
1298
- runs,
1299
- rewardSignals,
1300
- preferences,
1301
- interimConfidence,
1302
- rewardHacking,
1303
- predictiveValidity,
1304
- trainerRows,
1305
- summary
1306
- };
1307
- }
1308
- function extractTrials(result) {
1309
- if ("evolution" in result) {
1310
- return collectFromEvolution(result.evolution);
1311
- }
1312
- return collectFromEvolution(result);
1313
- }
1314
- function collectFromEvolution(evolution) {
1315
- const trials = [];
1316
- for (const gen of evolution.generations) {
1317
- for (const t of gen.trials ?? []) trials.push(t);
1318
- }
1319
- return trials;
1320
- }
1321
- function collectPairedDeltaSeries(runs, comparator) {
1322
- const baseline = /* @__PURE__ */ new Map();
1323
- for (const r of runs) {
1324
- if (r.candidateId !== comparator) continue;
1325
- const sid = r.scenarioId ?? r.experimentId;
1326
- const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
1327
- if (typeof score !== "number" || !Number.isFinite(score)) continue;
1328
- baseline.set(`${sid}::${r.seed}`, score);
1329
- }
1330
- const byCandidate = /* @__PURE__ */ new Map();
1331
- for (const r of runs) {
1332
- if (r.candidateId === comparator) continue;
1333
- const sid = r.scenarioId ?? r.experimentId;
1334
- const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
1335
- if (typeof score !== "number" || !Number.isFinite(score)) continue;
1336
- const baseScore = baseline.get(`${sid}::${r.seed}`);
1337
- if (typeof baseScore !== "number") continue;
1338
- const arr = byCandidate.get(r.candidateId) ?? [];
1339
- arr.push(score - baseScore);
1340
- byCandidate.set(r.candidateId, arr);
1341
- }
1342
- return [...byCandidate.entries()].map(([candidateId, deltas]) => ({ candidateId, deltas }));
1343
- }
1344
- function buildSummary(args) {
1345
- const lines = [
1346
- `${args.runs.length} runs analysed`,
1347
- `${args.preferences.pairs.length} preference pairs (${args.preferences.strategy})`,
1348
- `reward-hacking verdict: ${args.rewardHacking.verdict}`
1349
- ];
1350
- if (args.interimConfidence) {
1351
- lines.push(
1352
- `sequential: ${args.interimConfidence.recommendation.decision}` + (args.interimConfidence.recommendation.candidateId ? ` ${args.interimConfidence.recommendation.candidateId}` : "")
1353
- );
1354
- }
1355
- if (args.predictiveValidity?.ranked[0]) {
1356
- const top = args.predictiveValidity.ranked[0];
1357
- lines.push(`top-rubric: ${top.rubric} \u03C1=${top.spearman.toFixed(2)}`);
1358
- }
1359
- return lines.join(" | ");
1360
- }
1361
-
1362
1210
  // src/rl/predictive-validity-researcher.ts
1363
1211
  var PredictiveValidityResearcher = class {
1364
1212
  opts;
@@ -1640,7 +1488,7 @@ async function runRLCampaign(opts) {
1640
1488
  let interimConfidence = null;
1641
1489
  if (opts.report?.comparator) {
1642
1490
  const comparator = opts.report.comparator;
1643
- const deltaSeries = collectPairedDeltaSeries2(campaign.runs, comparator);
1491
+ const deltaSeries = collectPairedDeltaSeries(campaign.runs, comparator);
1644
1492
  if (deltaSeries.some((s) => s.deltas.length > 0)) {
1645
1493
  interimConfidence = evaluateInterimReleaseConfidence({
1646
1494
  deltaSeries,
@@ -1672,7 +1520,7 @@ async function runRLCampaign(opts) {
1672
1520
  if (opts.trainerExport?.sft) {
1673
1521
  trainerRows.sft = await toSftRows(campaign.runs, opts.trainerExport.sft);
1674
1522
  }
1675
- const summary = buildSummary2({
1523
+ const summary = buildSummary({
1676
1524
  campaign,
1677
1525
  preferences,
1678
1526
  interimConfidence,
@@ -1691,7 +1539,7 @@ async function runRLCampaign(opts) {
1691
1539
  kind: "agent-eval-rl-campaign"
1692
1540
  };
1693
1541
  }
1694
- function collectPairedDeltaSeries2(runs, comparator) {
1542
+ function collectPairedDeltaSeries(runs, comparator) {
1695
1543
  const baseline = /* @__PURE__ */ new Map();
1696
1544
  for (const r of runs) {
1697
1545
  if (r.candidateId !== comparator) continue;
@@ -1714,7 +1562,7 @@ function collectPairedDeltaSeries2(runs, comparator) {
1714
1562
  }
1715
1563
  return [...byCandidate.entries()].map(([candidateId, deltas]) => ({ candidateId, deltas }));
1716
1564
  }
1717
- function buildSummary2(args) {
1565
+ function buildSummary(args) {
1718
1566
  const c = args.campaign;
1719
1567
  const lines = [
1720
1568
  `${c.campaignId}: ${c.runs.length} successful runs / ${c.failedRuns.length} failed (fingerprint ${c.campaignFingerprint.slice(0, 12)}\u2026)`,
@@ -1739,10 +1587,10 @@ function buildSummary2(args) {
1739
1587
  export {
1740
1588
  PredictiveValidityResearcher,
1741
1589
  adversarialScenarioSearch,
1742
- analyzeOptimizationResult,
1743
1590
  applyEloUpdate,
1744
1591
  bestOfN,
1745
1592
  buildPairwiseFromCampaign,
1593
+ campaignToRunRecords,
1746
1594
  compareAdaptationCurves,
1747
1595
  detectRewardHacking,
1748
1596
  doublyRobust,
@@ -1781,10 +1629,7 @@ export {
1781
1629
  toSftJsonl,
1782
1630
  toSftRows,
1783
1631
  toTRLFormat,
1784
- trialToRunRecord,
1785
- trialsToRunRecords,
1786
1632
  varianceBasedCurriculum,
1787
- variantAggregateToRunRecord,
1788
1633
  verificationReportToRunRecord
1789
1634
  };
1790
1635
  //# sourceMappingURL=rl.js.map