@tangle-network/agent-eval 0.70.0 → 0.72.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/CHANGELOG.md +24 -0
  2. package/dist/adapters/http.js +1 -1
  3. package/dist/adapters/http.js.map +1 -1
  4. package/dist/campaign/index.d.ts +10 -0
  5. package/dist/campaign/index.js +48 -11
  6. package/dist/campaign/index.js.map +1 -1
  7. package/dist/{chunk-ZZCQQHW7.js → chunk-4QJN7RDX.js} +4 -4
  8. package/dist/chunk-4QJN7RDX.js.map +1 -0
  9. package/dist/{chunk-3B7Y5AUR.js → chunk-GWGO2K6Y.js} +3 -2
  10. package/dist/chunk-GWGO2K6Y.js.map +1 -0
  11. package/dist/{chunk-Z4ZCBC7M.js → chunk-ODGETRTM.js} +4 -3
  12. package/dist/chunk-ODGETRTM.js.map +1 -0
  13. package/dist/chunk-SL55X4VN.js +186 -0
  14. package/dist/chunk-SL55X4VN.js.map +1 -0
  15. package/dist/{chunk-GYELOWB6.js → chunk-UD6EF73X.js} +3 -3
  16. package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
  17. package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
  18. package/dist/contract/index.js +3 -3
  19. package/dist/index.js +31 -171
  20. package/dist/index.js.map +1 -1
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +1 -1
  23. package/dist/rl.d.ts +155 -1
  24. package/dist/rl.js +195 -6
  25. package/dist/rl.js.map +1 -1
  26. package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
  27. package/dist/traces.js +1 -1
  28. package/package.json +1 -1
  29. package/dist/chunk-3B7Y5AUR.js.map +0 -1
  30. package/dist/chunk-PQV2TKC3.js +0 -27
  31. package/dist/chunk-PQV2TKC3.js.map +0 -1
  32. package/dist/chunk-Z4ZCBC7M.js.map +0 -1
  33. package/dist/chunk-ZZCQQHW7.js.map +0 -1
  34. /package/dist/{chunk-GYELOWB6.js.map → chunk-UD6EF73X.js.map} +0 -0
  35. /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.70.0",
5
+ "version": "0.72.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -3,7 +3,7 @@ import {
3
3
  classifyFailure,
4
4
  compareToBaseline,
5
5
  computeToolUseMetrics
6
- } from "../chunk-3B7Y5AUR.js";
6
+ } from "../chunk-GWGO2K6Y.js";
7
7
  import {
8
8
  buildTrajectory
9
9
  } from "../chunk-RZTMDUO7.js";
package/dist/rl.d.ts CHANGED
@@ -1380,6 +1380,160 @@ interface StepRewardJsonlRow {
1380
1380
  }
1381
1381
  declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
1382
1382
 
1383
+ /**
1384
+ * RL dataset packaging + datasheet — the publishable, sellable bundle.
1385
+ *
1386
+ * The format exporters (`toGrpoRows` / `toSftRows` / `toDpoRows`) already
1387
+ * produce trainer-ready shapes (prime-rl GRPO, TRL DPO, conversational SFT).
1388
+ * What turns that into a dataset someone can PUBLISH or BUY is the provenance
1389
+ * + a datasheet: which models produced it, which prompt/agent versions, how the
1390
+ * reward was derived (deterministic verifiable vs probabilistic judge — the
1391
+ * credibility axis a buyer checks first), the split discipline, the reward
1392
+ * distribution, the quality gates, the license, and the intended/out-of-scope
1393
+ * uses. This module computes those facts from the `RunRecord[]` and renders a
1394
+ * "Datasheet for Datasets" (Gebru et al. 2018) card alongside the format files.
1395
+ *
1396
+ * It composes the existing `rl/exporters` — it does not reimplement any trainer
1397
+ * format. The `renderers` token-identity step (DeepSeek/Kimi/Qwen tokenization
1398
+ * with per-token loss masks) is a downstream Python stage that consumes the
1399
+ * `messages`/`completions` this bundle emits.
1400
+ */
1401
+
1402
+ type RewardKind = 'deterministic' | 'probabilistic' | 'mixed';
1403
+ type DatasetFormat = 'grpo' | 'sft' | 'dpo';
1404
+ /** Caller-declared context — the qualitative half of the datasheet that can't
1405
+ * be computed from records. */
1406
+ interface RlDatasetConfig {
1407
+ name: string;
1408
+ version: string;
1409
+ /** Product/task domain, e.g. 'legal-m&a', 'tax-1040'. */
1410
+ domain: string;
1411
+ /** SPDX id or a named commercial license. Required — an unlicensed dataset
1412
+ * cannot be published or sold. */
1413
+ license: string;
1414
+ /** How the reward was produced. `kind: 'deterministic'` (a test/schema/XPath
1415
+ * decided it) is the credibility signal; 'probabilistic' = LLM-judge. */
1416
+ reward: {
1417
+ kind: RewardKind;
1418
+ source: string;
1419
+ description: string;
1420
+ };
1421
+ intendedUse: string;
1422
+ outOfScope: string;
1423
+ limitations: string;
1424
+ /** ISO timestamp — passed in (the substrate forbids Date.now()). */
1425
+ createdAtIso: string;
1426
+ /** Default: ['grpo', 'sft']. */
1427
+ formats?: DatasetFormat[];
1428
+ /** Quality gates already run, recorded on the card for the buyer. */
1429
+ qualityGates?: {
1430
+ contaminationProbe?: 'passed' | 'failed' | 'not-run';
1431
+ dedup?: boolean;
1432
+ verifiableRewardFilter?: boolean;
1433
+ };
1434
+ }
1435
+ interface RewardStats {
1436
+ n: number;
1437
+ mean: number;
1438
+ median: number;
1439
+ min: number;
1440
+ max: number;
1441
+ std: number;
1442
+ }
1443
+ interface RlDatasetStats {
1444
+ records: number;
1445
+ /** Record count per split — a publishable dataset must declare its holdout. */
1446
+ splits: Record<RunSplitTag, number>;
1447
+ reward: RewardStats;
1448
+ /** Distinct snapshot-pinned models that produced the trajectories. */
1449
+ models: string[];
1450
+ /** Distinct effective-prompt hashes (the agent profile/prompt versions). */
1451
+ promptHashes: string[];
1452
+ commitShas: string[];
1453
+ totalTokens: {
1454
+ input: number;
1455
+ output: number;
1456
+ };
1457
+ totalCostUsd: number;
1458
+ }
1459
+ interface RlDatasetManifest extends RlDatasetConfig {
1460
+ formats: DatasetFormat[];
1461
+ rowCounts: Partial<Record<DatasetFormat, number>>;
1462
+ stats: RlDatasetStats;
1463
+ }
1464
+ interface RlDatasetBundle {
1465
+ manifest: RlDatasetManifest;
1466
+ /** Relative filename -> contents. Write these to a directory to publish. */
1467
+ files: Record<string, string>;
1468
+ }
1469
+ /**
1470
+ * Package graded `RunRecord[]` into a publishable RL dataset bundle: the
1471
+ * trainer-format JSONL files + a manifest + a datasheet. DPO requires
1472
+ * pre-extracted preference triples (pass `preferences`); GRPO/SFT derive from
1473
+ * the records directly via the supplied lookups. Throws on an empty corpus —
1474
+ * an empty dataset must never be published.
1475
+ */
1476
+ declare function buildRlDataset(records: RunRecord[], lookups: GrpoLookups & SftLookups, config: RlDatasetConfig, preferences?: {
1477
+ triples: PreferenceTriple[];
1478
+ lookups: DpoLookups;
1479
+ }): Promise<RlDatasetBundle>;
1480
+ /** Render the "Datasheet for Datasets" card — the artifact a buyer reads. */
1481
+ declare function datasheetToMarkdown(m: RlDatasetManifest): string;
1482
+
1483
+ /**
1484
+ * RL corpus — the durable, append-only accumulation of graded RunRecords that
1485
+ * every eval run deposits BY DEFAULT.
1486
+ *
1487
+ * The dataset is the free exhaust of the normal eval process: we run evals
1488
+ * constantly to get an agent production-ready, and those runs already produce
1489
+ * graded trajectories. Instead of writing them to an ephemeral run dir and
1490
+ * throwing them away, `appendToCorpus` accumulates them into a durable corpus;
1491
+ * `buildDatasetFromCorpus` later harvests the whole corpus into a publishable
1492
+ * bundle. No separate data-collection campaign — the data accrues from work we
1493
+ * do anyway. This is the "best things for free by our process" layer.
1494
+ *
1495
+ * Trajectory text rides on the record as top-level `prompt` / `completion`
1496
+ * (what the eval harnesses capture; the RunRecord validator ignores the extra
1497
+ * keys). The harvest reads them directly — no trace store round-trip needed.
1498
+ */
1499
+
1500
+ /** A corpus record is a RunRecord carrying the trajectory text the harness
1501
+ * captured. `prompt`/`completion` are top-level (the validator ignores extras). */
1502
+ type CorpusRecord = RunRecord & {
1503
+ prompt?: string;
1504
+ completion?: string;
1505
+ };
1506
+ interface CorpusAppendResult {
1507
+ appended: number;
1508
+ /** Skipped because a record with the same runId was already in the corpus
1509
+ * (idempotent appends — NOT re-run collapsing; re-runs get fresh runIds). */
1510
+ skipped: number;
1511
+ total: number;
1512
+ }
1513
+ /**
1514
+ * Append graded records to the corpus (append-only JSONL). Deduplicates by
1515
+ * `runId` against what's already on disk so re-running the same harness is
1516
+ * idempotent. Creates the file and parent dir. This is the call every eval
1517
+ * harness makes by default after producing its records.
1518
+ */
1519
+ declare function appendToCorpus(records: CorpusRecord[], corpusPath: string): CorpusAppendResult;
1520
+ /** Read the full corpus. Returns [] if the corpus does not exist yet. */
1521
+ declare function readCorpus(corpusPath: string): CorpusRecord[];
1522
+ interface HarvestOptions {
1523
+ /** Keep only records scoring >= this (rejection-sampling for SFT). */
1524
+ minScore?: number;
1525
+ /** Keep only these splits (e.g. ['holdout'] for an eval-only dataset). */
1526
+ splits?: RunRecord['splitTag'][];
1527
+ }
1528
+ /**
1529
+ * Harvest the accumulated corpus into a publishable RL dataset bundle. Reads
1530
+ * trajectory text from each record's top-level `prompt`/`completion`; records
1531
+ * missing either are excluded (a graded score with no trajectory can't train).
1532
+ * Optionally filters by score / split. Throws (via buildRlDataset) if nothing
1533
+ * survives — an empty dataset must never be published.
1534
+ */
1535
+ declare function buildDatasetFromCorpus(corpusPath: string, config: RlDatasetConfig, opts?: HarvestOptions): Promise<RlDatasetBundle>;
1536
+
1383
1537
  /**
1384
1538
  * `PredictiveValidityResearcher` — concrete `Researcher` implementation
1385
1539
  * that drives selection from outcome-anchored predictive validity.
@@ -1626,4 +1780,4 @@ interface RLCampaignResult<V> {
1626
1780
  }
1627
1781
  declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
1628
1782
 
1629
- export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, OutcomeStore, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, campaignToRunRecords, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, 
firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
1783
+ export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CorpusAppendResult, type CorpusRecord, type CurriculumAllocation, type DatasetFormat, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type HarvestOptions, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, OutcomeStore, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RewardKind, type RewardStats, type RlDatasetBundle, type RlDatasetConfig, type RlDatasetManifest, type RlDatasetStats, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, appendToCorpus, applyEloUpdate, bestOfN, buildDatasetFromCorpus, 
buildPairwiseFromCampaign, buildRlDataset, campaignToRunRecords, compareAdaptationCurves, datasheetToMarkdown, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, readCorpus, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
package/dist/rl.js CHANGED
@@ -361,11 +361,11 @@ function extractPreferences(runs, opts = {}) {
361
361
  const strategy = opts.strategy ?? "paired-by-scenario-and-seed";
362
362
  const minMargin = opts.minMargin ?? 0.05;
363
363
  const splitTag = opts.splitTag ?? SPLIT_TAG_DEFAULT;
364
- const rewardOf = opts.rewardOf ?? DEFAULT_REWARD;
364
+ const rewardOf2 = opts.rewardOf ?? DEFAULT_REWARD;
365
365
  const filtered = runs.filter((r) => r.splitTag === splitTag);
366
366
  const scoredEntries = [];
367
367
  for (const run of filtered) {
368
- const s = rewardOf(run);
368
+ const s = rewardOf2(run);
369
369
  if (s === null) continue;
370
370
  scoredEntries.push({ run, score: s });
371
371
  }
@@ -1060,6 +1060,10 @@ function mulberry32(seed) {
1060
1060
  };
1061
1061
  }
1062
1062
 
1063
+ // src/rl/corpus.ts
1064
+ import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
1065
+ import { dirname } from "path";
1066
+
1063
1067
  // src/rl/exporters.ts
1064
1068
  async function toDpoRows(triples, lookups) {
1065
1069
  const out = [];
@@ -1091,7 +1095,7 @@ function toDpoJsonl(rows) {
1091
1095
  return rows.map((r) => JSON.stringify(r)).join("\n") + (rows.length > 0 ? "\n" : "");
1092
1096
  }
1093
1097
  async function toGrpoRows(runs, lookups) {
1094
- const rewardOf = lookups.rewardOf ?? defaultReward;
1098
+ const rewardOf2 = lookups.rewardOf ?? defaultReward;
1095
1099
  const grouped = /* @__PURE__ */ new Map();
1096
1100
  for (const r of runs) {
1097
1101
  const sid = r.scenarioId ?? r.experimentId;
@@ -1107,11 +1111,11 @@ async function toGrpoRows(runs, lookups) {
1107
1111
  const rewards = [];
1108
1112
  const runIds = [];
1109
1113
  for (const r of group) {
1110
- const reward = rewardOf(r);
1111
- if (reward === null) continue;
1114
+ const reward2 = rewardOf2(r);
1115
+ if (reward2 === null) continue;
1112
1116
  const completion = await Promise.resolve(lookups.completionOf(r.runId));
1113
1117
  completions.push(completion);
1114
- rewards.push(reward);
1118
+ rewards.push(reward2);
1115
1119
  runIds.push(r.runId);
1116
1120
  }
1117
1121
  if (completions.length === 0) continue;
@@ -1212,6 +1216,186 @@ function defaultReward(run) {
1212
1216
  return typeof v === "number" && Number.isFinite(v) ? v : null;
1213
1217
  }
1214
1218
 
1219
+ // src/rl/dataset.ts
1220
/**
 * Scalar reward for a run record used by the dataset statistics.
 *
 * Reads `outcome.holdoutScore`, falling back to `outcome.searchScore` when the
 * holdout score is null/undefined. Returns the value only when it is a finite
 * number; otherwise returns `null` so callers can skip the record.
 */
function reward(r) {
  const candidate = r.outcome.holdoutScore ?? r.outcome.searchScore;
  const usable = typeof candidate === "number" && Number.isFinite(candidate);
  return usable ? candidate : null;
}
1224
/**
 * Return the unique values of `xs` as a new array, ordered by the default
 * `Array.prototype.sort` comparison (i.e. by string representation).
 */
function distinct(xs) {
  const unique = Array.from(new Set(xs));
  unique.sort();
  return unique;
}
1227
/**
 * Summarize a list of numeric rewards.
 *
 * Returns `{ n, mean, median, min, max, std }`. An empty list yields all
 * zeros. `std` is the population standard deviation (variance divided by `n`).
 * The input array is not mutated; a sorted copy is used internally.
 */
function computeRewardStats(values) {
  if (values.length === 0) {
    return { n: 0, mean: 0, median: 0, min: 0, max: 0, std: 0 };
  }
  const ascending = [...values].sort((lo, hi) => lo - hi);
  const n = ascending.length;

  let total = 0;
  for (const value of ascending) total += value;
  const mean = total / n;

  // Even-sized samples average the two middle values.
  const upper = Math.floor(n / 2);
  const median = n % 2 === 1 ? ascending[upper] : (ascending[upper - 1] + ascending[upper]) / 2;

  let squaredDeviation = 0;
  for (const value of ascending) squaredDeviation += (value - mean) ** 2;

  return {
    n,
    mean,
    median,
    min: ascending[0],
    max: ascending[n - 1],
    std: Math.sqrt(squaredDeviation / n)
  };
}
1237
/**
 * Aggregate the computed half of the dataset manifest from the run records.
 *
 * Counts records per `splitTag` (seeded with `search`, `dev`, `holdout` at 0),
 * sums `tokenUsage.input` / `tokenUsage.output` and `costUsd`, collects every
 * non-null `reward(r)` into the reward distribution, and lists the distinct
 * `model`, `promptHash` and `commitSha` values.
 */
function computeStats(records) {
  const splits = { search: 0, dev: 0, holdout: 0 };
  const totalTokens = { input: 0, output: 0 };
  const rewards = [];
  let totalCostUsd = 0;

  for (const record of records) {
    const tag = record.splitTag;
    splits[tag] = (splits[tag] ?? 0) + 1;
    totalTokens.input += record.tokenUsage.input;
    totalTokens.output += record.tokenUsage.output;
    totalCostUsd += record.costUsd;
    const value = reward(record);
    // Records without a usable score still count above, but not in the distribution.
    if (value !== null) rewards.push(value);
  }

  // Key order is preserved because this object is serialized into manifest.json.
  return {
    records: records.length,
    splits,
    reward: computeRewardStats(rewards),
    models: distinct(records.map((record) => record.model)),
    promptHashes: distinct(records.map((record) => record.promptHash)),
    commitShas: distinct(records.map((record) => record.commitSha)),
    totalTokens,
    totalCostUsd
  };
}
1262
/**
 * Package run records into a dataset bundle: one JSONL file per requested
 * format, plus `manifest.json` and `DATASHEET.md`.
 *
 * @param records Run records to package; must be non-empty.
 * @param lookups Lookups handed to the GRPO and SFT exporters.
 * @param config Caller-declared dataset metadata; `config.formats` defaults to
 *   `["grpo", "sft"]`.
 * @param preferences Triples + lookups for the DPO exporter; required only when
 *   `"dpo"` is among the formats.
 * @returns `{ manifest, files }` where `files` maps relative filename to contents.
 * @throws Error when `records` is empty, or when `"dpo"` is requested without
 *   `preferences`.
 */
async function buildRlDataset(records, lookups, config, preferences) {
  if (records.length === 0) {
    throw new Error("buildRlDataset: no records \u2014 refusing to package an empty dataset");
  }
  const formats = config.formats ?? ["grpo", "sft"];
  const wants = (format) => formats.includes(format);
  const files = {};
  const rowCounts = {};

  // Formats are emitted in a fixed order (grpo, sft, dpo) regardless of the
  // order in `formats`, which fixes the key order of `files` and `rowCounts`.
  if (wants("grpo")) {
    const grpoRows = await toGrpoRows(records, lookups);
    files["train.grpo.jsonl"] = toGrpoJsonl(grpoRows);
    rowCounts.grpo = grpoRows.length;
  }
  if (wants("sft")) {
    const sftRows = await toSftRows(records, lookups);
    files["train.sft.jsonl"] = toSftJsonl(sftRows);
    rowCounts.sft = sftRows.length;
  }
  if (wants("dpo")) {
    if (!preferences) {
      throw new Error("buildRlDataset: format 'dpo' requires `preferences` (triples + lookups)");
    }
    const dpoRows = await toDpoRows(preferences.triples, preferences.lookups);
    files["train.dpo.jsonl"] = toDpoJsonl(dpoRows);
    rowCounts.dpo = dpoRows.length;
  }

  // Caller config first, then the computed fields, so `formats` always reflects
  // the resolved default.
  const manifest = { ...config, formats, rowCounts, stats: computeStats(records) };
  files["manifest.json"] = JSON.stringify(manifest, null, 2) + "\n";
  files["DATASHEET.md"] = datasheetToMarkdown(manifest);
  return { manifest, files };
}
1298
/** Format a fraction (e.g. 0.5) as a percentage string with one decimal ("50.0%"). */
function pct(x) {
  const scaled = x * 100;
  return scaled.toFixed(1) + "%";
}
1301
/**
 * Render a dataset manifest as a Markdown datasheet.
 *
 * Sections, in order: title line, domain/created/license line, reward
 * provenance, composition (records, formats with row counts, per-split counts
 * and percentages), reward distribution, provenance, quality gates,
 * recommended uses, out of scope, limitations, and a fixed token-rendering
 * note. Lines are joined with "\n" and the output ends with a trailing newline.
 */
function datasheetToMarkdown(m) {
  const stats = m.stats;
  const dist = stats.reward;
  const gates = m.qualityGates;
  // Avoid dividing by zero when the manifest reports no records.
  const denominator = stats.records || 1;
  const fixed3 = (value) => value.toFixed(3);
  const yesNo = (flag) => (flag ? "yes" : "no");

  const splitBullets = [];
  for (const tag of ["search", "dev", "holdout"]) {
    const count = stats.splits[tag];
    splitBullets.push(` - \`${tag}\`: ${count} (${pct(count / denominator)})`);
  }

  const formatSummary = m.formats
    .map((format) => `${format} (${m.rowCounts[format] ?? 0} rows)`)
    .join(", ");

  // Only deterministic rewards get the credibility badge.
  const kindBadge = m.reward.kind === "deterministic" ? " \u2705 (decidable \u2014 not judge-noise)" : "";

  const header = [
    `# Dataset: ${m.name} \`v${m.version}\``,
    "",
    `**Domain:** ${m.domain} \xB7 **Created:** ${m.createdAtIso} \xB7 **License:** ${m.license}`,
    ""
  ];
  const rewardProvenance = [
    "## Reward provenance",
    `- **Kind:** ${m.reward.kind}${kindBadge}`,
    `- **Source:** ${m.reward.source}`,
    `- **Description:** ${m.reward.description}`,
    ""
  ];
  const composition = [
    "## Composition",
    `- **Records (trajectories):** ${stats.records}`,
    `- **Formats:** ${formatSummary}`,
    "- **Splits:**",
    ...splitBullets,
    ""
  ];
  const rewardDistribution = [
    "## Reward distribution",
    `- n=${dist.n} \xB7 mean=${fixed3(dist.mean)} \xB7 median=${fixed3(dist.median)} \xB7 min=${fixed3(dist.min)} \xB7 max=${fixed3(dist.max)} \xB7 std=${fixed3(dist.std)}`,
    ""
  ];
  const provenance = [
    "## Provenance",
    `- **Models:** ${stats.models.join(", ")}`,
    `- **Prompt/agent versions (sha256):** ${stats.promptHashes.length} distinct`,
    `- **Commits:** ${stats.commitShas.join(", ")}`,
    `- **Tokens:** ${stats.totalTokens.input} in / ${stats.totalTokens.output} out \xB7 **Cost:** $${stats.totalCostUsd.toFixed(2)}`,
    ""
  ];
  const qualityGates = [
    "## Quality gates",
    `- Contamination probe: ${gates?.contaminationProbe ?? "not-run"}`,
    `- Dedup: ${yesNo(gates?.dedup)} \xB7 Verifiable-reward filter: ${yesNo(gates?.verifiableRewardFilter)}`,
    ""
  ];
  const usage = [
    "## Recommended uses",
    m.intendedUse,
    "",
    "## Out of scope",
    m.outOfScope,
    "",
    "## Limitations",
    m.limitations,
    ""
  ];
  const tokenRendering = [
    "## Token rendering",
    "For RL/SFT training, tokenize with the per-model renderer (DeepSeek-V3 / Kimi-K2 / Qwen3) to preserve token identity and per-token loss masks across tool-call turns \u2014 see `renderers` (PrimeIntellect). The `messages` / `completions` here are the renderer input.",
    ""
  ];

  return [
    ...header,
    ...rewardProvenance,
    ...composition,
    ...rewardDistribution,
    ...provenance,
    ...qualityGates,
    ...usage,
    ...tokenRendering
  ].join("\n");
}
1349
+
1350
+ // src/rl/corpus.ts
1351
/**
 * Append records to the JSONL corpus at `corpusPath`, skipping any whose
 * `runId` is already on disk or appeared earlier in the same batch.
 *
 * Creates the parent directory (recursively) and the file when missing.
 *
 * @param records Records to append; each is serialized with `JSON.stringify`.
 * @param corpusPath Path of the JSONL corpus file.
 * @returns `{ appended, skipped, total }` — `total` is the number of records
 *   on disk after the append.
 * @throws SyntaxError (from `JSON.parse`) when an existing corpus line is not
 *   valid JSON.
 */
function appendToCorpus(records, corpusPath) {
  mkdirSync(dirname(corpusPath), { recursive: true });

  // The raw text is needed to detect a missing trailing newline, so the file
  // is read once here and parsed in place.
  const raw = existsSync(corpusPath) ? readFileSync(corpusPath, "utf8") : "";
  const seen = new Set();
  let existingCount = 0;
  for (const line of raw.split("\n")) {
    if (!line.trim()) continue;
    seen.add(JSON.parse(line).runId);
    existingCount++;
  }

  const lines = [];
  let skipped = 0;
  for (const r of records) {
    if (seen.has(r.runId)) {
      skipped++;
      continue;
    }
    // Track within-batch duplicates too, so one call never writes a runId twice.
    seen.add(r.runId);
    lines.push(JSON.stringify(r));
  }

  if (lines.length > 0) {
    // A corpus that does not end in "\n" (truncated write, hand edit) would
    // otherwise have the first new record glued onto its last line, corrupting
    // both records.
    const separator = raw.length > 0 && !raw.endsWith("\n") ? "\n" : "";
    appendFileSync(corpusPath, `${separator}${lines.join("\n")}\n`);
  }
  return { appended: lines.length, skipped, total: existingCount + lines.length };
}
1371
/**
 * Read every record from the JSONL corpus at `corpusPath`.
 *
 * Blank and whitespace-only lines are skipped. Returns `[]` when the file does
 * not exist.
 *
 * @param corpusPath Path of the JSONL corpus file.
 * @returns The parsed records in file order.
 * @throws SyntaxError naming the file and 1-based line number when a line is
 *   not valid JSON.
 */
function readCorpus(corpusPath) {
  if (!existsSync(corpusPath)) return [];
  const out = [];
  const lines = readFileSync(corpusPath, "utf8").split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (!line.trim()) continue;
    try {
      out.push(JSON.parse(line));
    } catch (err) {
      // Keep the SyntaxError type JSON.parse would raise, but say where the
      // bad line is so a corrupted corpus can actually be repaired.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`readCorpus: invalid JSON at ${corpusPath}:${i + 1}: ${detail}`);
    }
  }
  return out;
}
1379
/**
 * Score of a corpus record for harvest filtering.
 *
 * Uses `outcome.holdoutScore`, falling back to `outcome.searchScore` when the
 * holdout score is null/undefined. Anything that is not a finite number is
 * reported as 0.
 */
function rewardOf(r) {
  const { holdoutScore, searchScore } = r.outcome;
  const score = holdoutScore ?? searchScore;
  if (typeof score !== "number") return 0;
  return Number.isFinite(score) ? score : 0;
}
1383
/**
 * Harvest the corpus at `corpusPath` into a dataset bundle.
 *
 * Keeps only records whose top-level `prompt` and `completion` are both
 * strings, then applies the optional filters and delegates to
 * `buildRlDataset` with lookups that serve the text straight from the records.
 *
 * @param corpusPath Path of the JSONL corpus file.
 * @param config Dataset metadata forwarded to `buildRlDataset`.
 * @param opts Optional filters: `splits` keeps only records whose `splitTag`
 *   is listed; `minScore` keeps only records with a finite numeric score
 *   (`outcome.holdoutScore`, else `outcome.searchScore`) that is >= `minScore`.
 * @returns The bundle produced by `buildRlDataset`.
 */
async function buildDatasetFromCorpus(corpusPath, config, opts = {}) {
  // Null-aware score: an unscored record has no score, it does not score 0.
  const scoreOf = (r) => {
    const v = r.outcome.holdoutScore ?? r.outcome.searchScore;
    return typeof v === "number" && Number.isFinite(v) ? v : null;
  };

  let records = readCorpus(corpusPath).filter(
    (r) => typeof r.prompt === "string" && typeof r.completion === "string"
  );
  if (opts.splits) records = records.filter((r) => opts.splits.includes(r.splitTag));
  if (opts.minScore != null) {
    // Treating a missing score as 0 would let unscored records through any
    // `minScore <= 0`; reject them explicitly.
    records = records.filter((r) => {
      const score = scoreOf(r);
      return score !== null && score >= opts.minScore;
    });
  }

  const text = new Map(
    records.map((r) => [r.runId, { prompt: r.prompt, completion: r.completion }])
  );
  const lookups = {
    promptOf: (id) => text.get(id)?.prompt ?? "",
    completionOf: (id) => text.get(id)?.completion ?? ""
  };
  return buildRlDataset(records, lookups, config);
}
1398
+
1215
1399
  // src/rl/predictive-validity-researcher.ts
1216
1400
  var PredictiveValidityResearcher = class {
1217
1401
  opts;
@@ -1596,11 +1780,15 @@ export {
1596
1780
  InMemoryOutcomeStore,
1597
1781
  PredictiveValidityResearcher,
1598
1782
  adversarialScenarioSearch,
1783
+ appendToCorpus,
1599
1784
  applyEloUpdate,
1600
1785
  bestOfN,
1786
+ buildDatasetFromCorpus,
1601
1787
  buildPairwiseFromCampaign,
1788
+ buildRlDataset,
1602
1789
  campaignToRunRecords,
1603
1790
  compareAdaptationCurves,
1791
+ datasheetToMarkdown,
1604
1792
  detectRewardHacking,
1605
1793
  doublyRobust,
1606
1794
  extractPreferences,
@@ -1616,6 +1804,7 @@ export {
1616
1804
  offPolicyEstimateAll,
1617
1805
  paretoFrontier,
1618
1806
  prmTrainingPairs,
1807
+ readCorpus,
1619
1808
  renameVariables,
1620
1809
  runAdaptationCurve,
1621
1810
  runComputeCurve,