@tangle-network/agent-eval 0.70.0 → 0.71.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/http.js.map +1 -1
- package/dist/campaign/index.d.ts +10 -0
- package/dist/campaign/index.js +29 -5
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-GYELOWB6.js → chunk-6QZUCFKM.js} +2 -2
- package/dist/{chunk-3B7Y5AUR.js → chunk-GWGO2K6Y.js} +3 -2
- package/dist/chunk-GWGO2K6Y.js.map +1 -0
- package/dist/{chunk-Z4ZCBC7M.js → chunk-ODGETRTM.js} +4 -3
- package/dist/chunk-ODGETRTM.js.map +1 -0
- package/dist/{chunk-ZZCQQHW7.js → chunk-VMAYE3LM.js} +2 -2
- package/dist/chunk-VMAYE3LM.js.map +1 -0
- package/dist/contract/index.js +2 -2
- package/dist/index.js +21 -16
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +1 -1
- package/dist/rl.d.ts +155 -1
- package/dist/rl.js +195 -6
- package/dist/rl.js.map +1 -1
- package/dist/traces.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-3B7Y5AUR.js.map +0 -1
- package/dist/chunk-Z4ZCBC7M.js.map +0 -1
- package/dist/chunk-ZZCQQHW7.js.map +0 -1
- /package/dist/{chunk-GYELOWB6.js.map → chunk-6QZUCFKM.js.map} +0 -0
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.70.0",
|
|
5
|
+
"version": "0.71.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/pipelines/index.js
CHANGED
package/dist/rl.d.ts
CHANGED
|
@@ -1380,6 +1380,160 @@ interface StepRewardJsonlRow {
|
|
|
1380
1380
|
}
|
|
1381
1381
|
declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
|
|
1382
1382
|
|
|
1383
|
+
/**
|
|
1384
|
+
* RL dataset packaging + datasheet — the publishable, sellable bundle.
|
|
1385
|
+
*
|
|
1386
|
+
* The format exporters (`toGrpoRows` / `toSftRows` / `toDpoRows`) already
|
|
1387
|
+
* produce trainer-ready shapes (prime-rl GRPO, TRL DPO, conversational SFT).
|
|
1388
|
+
* What turns that into a dataset someone can PUBLISH or BUY is the provenance
|
|
1389
|
+
* + a datasheet: which models produced it, which prompt/agent versions, how the
|
|
1390
|
+
* reward was derived (deterministic verifiable vs probabilistic judge — the
|
|
1391
|
+
* credibility axis a buyer checks first), the split discipline, the reward
|
|
1392
|
+
* distribution, the quality gates, the license, and the intended/out-of-scope
|
|
1393
|
+
* uses. This module computes those facts from the `RunRecord[]` and renders a
|
|
1394
|
+
* "Datasheet for Datasets" (Gebru et al. 2018) card alongside the format files.
|
|
1395
|
+
*
|
|
1396
|
+
* It composes the existing `rl/exporters` — it does not reimplement any trainer
|
|
1397
|
+
* format. The renderers token-identity step (DeepSeek/Kimi/Qwen tokenization
|
|
1398
|
+
* with per-token loss masks) is a downstream Python stage that consumes the
|
|
1399
|
+
* `messages`/`completions` this bundle emits.
|
|
1400
|
+
*/
|
|
1401
|
+
|
|
1402
|
+
type RewardKind = 'deterministic' | 'probabilistic' | 'mixed';
|
|
1403
|
+
type DatasetFormat = 'grpo' | 'sft' | 'dpo';
|
|
1404
|
+
/** Caller-declared context — the qualitative half of the datasheet that can't
|
|
1405
|
+
* be computed from records. */
|
|
1406
|
+
interface RlDatasetConfig {
|
|
1407
|
+
name: string;
|
|
1408
|
+
version: string;
|
|
1409
|
+
/** Product/task domain, e.g. 'legal-m&a', 'tax-1040'. */
|
|
1410
|
+
domain: string;
|
|
1411
|
+
/** SPDX id or a named commercial license. Required — an unlicensed dataset
|
|
1412
|
+
* cannot be published or sold. */
|
|
1413
|
+
license: string;
|
|
1414
|
+
/** How the reward was produced. `kind: 'deterministic'` (a test/schema/XPath
|
|
1415
|
+
* decided it) is the credibility signal; 'probabilistic' = LLM-judge. */
|
|
1416
|
+
reward: {
|
|
1417
|
+
kind: RewardKind;
|
|
1418
|
+
source: string;
|
|
1419
|
+
description: string;
|
|
1420
|
+
};
|
|
1421
|
+
intendedUse: string;
|
|
1422
|
+
outOfScope: string;
|
|
1423
|
+
limitations: string;
|
|
1424
|
+
/** ISO timestamp — passed in (the substrate forbids Date.now()). */
|
|
1425
|
+
createdAtIso: string;
|
|
1426
|
+
/** Default: ['grpo', 'sft']. */
|
|
1427
|
+
formats?: DatasetFormat[];
|
|
1428
|
+
/** Quality gates already run, recorded on the card for the buyer. */
|
|
1429
|
+
qualityGates?: {
|
|
1430
|
+
contaminationProbe?: 'passed' | 'failed' | 'not-run';
|
|
1431
|
+
dedup?: boolean;
|
|
1432
|
+
verifiableRewardFilter?: boolean;
|
|
1433
|
+
};
|
|
1434
|
+
}
|
|
1435
|
+
interface RewardStats {
|
|
1436
|
+
n: number;
|
|
1437
|
+
mean: number;
|
|
1438
|
+
median: number;
|
|
1439
|
+
min: number;
|
|
1440
|
+
max: number;
|
|
1441
|
+
std: number;
|
|
1442
|
+
}
|
|
1443
|
+
interface RlDatasetStats {
|
|
1444
|
+
records: number;
|
|
1445
|
+
/** Record count per split — a publishable dataset must declare its holdout. */
|
|
1446
|
+
splits: Record<RunSplitTag, number>;
|
|
1447
|
+
reward: RewardStats;
|
|
1448
|
+
/** Distinct snapshot-pinned models that produced the trajectories. */
|
|
1449
|
+
models: string[];
|
|
1450
|
+
/** Distinct effective-prompt hashes (the agent profile/prompt versions). */
|
|
1451
|
+
promptHashes: string[];
|
|
1452
|
+
commitShas: string[];
|
|
1453
|
+
totalTokens: {
|
|
1454
|
+
input: number;
|
|
1455
|
+
output: number;
|
|
1456
|
+
};
|
|
1457
|
+
totalCostUsd: number;
|
|
1458
|
+
}
|
|
1459
|
+
interface RlDatasetManifest extends RlDatasetConfig {
|
|
1460
|
+
formats: DatasetFormat[];
|
|
1461
|
+
rowCounts: Partial<Record<DatasetFormat, number>>;
|
|
1462
|
+
stats: RlDatasetStats;
|
|
1463
|
+
}
|
|
1464
|
+
interface RlDatasetBundle {
|
|
1465
|
+
manifest: RlDatasetManifest;
|
|
1466
|
+
/** Relative filename -> contents. Write these to a directory to publish. */
|
|
1467
|
+
files: Record<string, string>;
|
|
1468
|
+
}
|
|
1469
|
+
/**
|
|
1470
|
+
* Package graded `RunRecord[]` into a publishable RL dataset bundle: the
|
|
1471
|
+
* trainer-format JSONL files + a manifest + a datasheet. DPO requires
|
|
1472
|
+
* pre-extracted preference triples (pass `preferences`); GRPO/SFT derive from
|
|
1473
|
+
* the records directly via the supplied lookups. Throws on an empty corpus —
|
|
1474
|
+
* an empty dataset must never be published.
|
|
1475
|
+
*/
|
|
1476
|
+
declare function buildRlDataset(records: RunRecord[], lookups: GrpoLookups & SftLookups, config: RlDatasetConfig, preferences?: {
|
|
1477
|
+
triples: PreferenceTriple[];
|
|
1478
|
+
lookups: DpoLookups;
|
|
1479
|
+
}): Promise<RlDatasetBundle>;
|
|
1480
|
+
/** Render the "Datasheet for Datasets" card — the artifact a buyer reads. */
|
|
1481
|
+
declare function datasheetToMarkdown(m: RlDatasetManifest): string;
|
|
1482
|
+
|
|
1483
|
+
/**
|
|
1484
|
+
* RL corpus — the durable, append-only accumulation of graded RunRecords that
|
|
1485
|
+
* every eval run deposits BY DEFAULT.
|
|
1486
|
+
*
|
|
1487
|
+
* The dataset is the free exhaust of the normal eval process: we run evals
|
|
1488
|
+
* constantly to get an agent production-ready, and those runs already produce
|
|
1489
|
+
* graded trajectories. Instead of writing them to an ephemeral run dir and
|
|
1490
|
+
* throwing them away, `appendToCorpus` accumulates them into a durable corpus;
|
|
1491
|
+
* `buildDatasetFromCorpus` later harvests the whole corpus into a publishable
|
|
1492
|
+
* bundle. No separate data-collection campaign — the data accrues from work we
|
|
1493
|
+
* do anyway. This is the "best things for free by our process" layer.
|
|
1494
|
+
*
|
|
1495
|
+
* Trajectory text rides on the record as top-level `prompt` / `completion`
|
|
1496
|
+
* (what the eval harnesses capture; the RunRecord validator ignores the extra
|
|
1497
|
+
* keys). The harvest reads them directly — no trace store round-trip needed.
|
|
1498
|
+
*/
|
|
1499
|
+
|
|
1500
|
+
/** A corpus record is a RunRecord carrying the trajectory text the harness
|
|
1501
|
+
* captured. `prompt`/`completion` are top-level (the validator ignores extras). */
|
|
1502
|
+
type CorpusRecord = RunRecord & {
|
|
1503
|
+
prompt?: string;
|
|
1504
|
+
completion?: string;
|
|
1505
|
+
};
|
|
1506
|
+
interface CorpusAppendResult {
|
|
1507
|
+
appended: number;
|
|
1508
|
+
/** Skipped because a record with the same runId was already in the corpus
|
|
1509
|
+
* (idempotent appends — NOT re-run collapsing; re-runs get fresh runIds). */
|
|
1510
|
+
skipped: number;
|
|
1511
|
+
total: number;
|
|
1512
|
+
}
|
|
1513
|
+
/**
|
|
1514
|
+
* Append graded records to the corpus (append-only JSONL). Deduplicates by
|
|
1515
|
+
* `runId` against what's already on disk so re-running the same harness is
|
|
1516
|
+
* idempotent. Creates the file and parent dir. This is the call every eval
|
|
1517
|
+
* harness makes by default after producing its records.
|
|
1518
|
+
*/
|
|
1519
|
+
declare function appendToCorpus(records: CorpusRecord[], corpusPath: string): CorpusAppendResult;
|
|
1520
|
+
/** Read the full corpus. Returns [] if the corpus does not exist yet. */
|
|
1521
|
+
declare function readCorpus(corpusPath: string): CorpusRecord[];
|
|
1522
|
+
interface HarvestOptions {
|
|
1523
|
+
/** Keep only records scoring >= this (rejection-sampling for SFT). */
|
|
1524
|
+
minScore?: number;
|
|
1525
|
+
/** Keep only these splits (e.g. ['holdout'] for an eval-only dataset). */
|
|
1526
|
+
splits?: RunRecord['splitTag'][];
|
|
1527
|
+
}
|
|
1528
|
+
/**
|
|
1529
|
+
* Harvest the accumulated corpus into a publishable RL dataset bundle. Reads
|
|
1530
|
+
* trajectory text from each record's top-level `prompt`/`completion`; records
|
|
1531
|
+
* missing either are excluded (a graded score with no trajectory can't train).
|
|
1532
|
+
* Optionally filters by score / split. Throws (via buildRlDataset) if nothing
|
|
1533
|
+
* survives — an empty dataset must never be published.
|
|
1534
|
+
*/
|
|
1535
|
+
declare function buildDatasetFromCorpus(corpusPath: string, config: RlDatasetConfig, opts?: HarvestOptions): Promise<RlDatasetBundle>;
|
|
1536
|
+
|
|
1383
1537
|
/**
|
|
1384
1538
|
* `PredictiveValidityResearcher` — concrete `Researcher` implementation
|
|
1385
1539
|
* that drives selection from outcome-anchored predictive validity.
|
|
@@ -1626,4 +1780,4 @@ interface RLCampaignResult<V> {
|
|
|
1626
1780
|
}
|
|
1627
1781
|
declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
|
|
1628
1782
|
|
|
1629
|
-
export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, OutcomeStore, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, campaignToRunRecords, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, 
firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
|
|
1783
|
+
export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CorpusAppendResult, type CorpusRecord, type CurriculumAllocation, type DatasetFormat, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type HarvestOptions, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, OutcomeStore, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RewardKind, type RewardStats, type RlDatasetBundle, type RlDatasetConfig, type RlDatasetManifest, type RlDatasetStats, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, appendToCorpus, applyEloUpdate, bestOfN, buildDatasetFromCorpus, 
buildPairwiseFromCampaign, buildRlDataset, campaignToRunRecords, compareAdaptationCurves, datasheetToMarkdown, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, readCorpus, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
|
package/dist/rl.js
CHANGED
|
@@ -361,11 +361,11 @@ function extractPreferences(runs, opts = {}) {
|
|
|
361
361
|
const strategy = opts.strategy ?? "paired-by-scenario-and-seed";
|
|
362
362
|
const minMargin = opts.minMargin ?? 0.05;
|
|
363
363
|
const splitTag = opts.splitTag ?? SPLIT_TAG_DEFAULT;
|
|
364
|
-
const rewardOf = opts.rewardOf ?? DEFAULT_REWARD;
|
|
364
|
+
const rewardOf2 = opts.rewardOf ?? DEFAULT_REWARD;
|
|
365
365
|
const filtered = runs.filter((r) => r.splitTag === splitTag);
|
|
366
366
|
const scoredEntries = [];
|
|
367
367
|
for (const run of filtered) {
|
|
368
|
-
const s = rewardOf(run);
|
|
368
|
+
const s = rewardOf2(run);
|
|
369
369
|
if (s === null) continue;
|
|
370
370
|
scoredEntries.push({ run, score: s });
|
|
371
371
|
}
|
|
@@ -1060,6 +1060,10 @@ function mulberry32(seed) {
|
|
|
1060
1060
|
};
|
|
1061
1061
|
}
|
|
1062
1062
|
|
|
1063
|
+
// src/rl/corpus.ts
|
|
1064
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
|
|
1065
|
+
import { dirname } from "path";
|
|
1066
|
+
|
|
1063
1067
|
// src/rl/exporters.ts
|
|
1064
1068
|
async function toDpoRows(triples, lookups) {
|
|
1065
1069
|
const out = [];
|
|
@@ -1091,7 +1095,7 @@ function toDpoJsonl(rows) {
|
|
|
1091
1095
|
return rows.map((r) => JSON.stringify(r)).join("\n") + (rows.length > 0 ? "\n" : "");
|
|
1092
1096
|
}
|
|
1093
1097
|
async function toGrpoRows(runs, lookups) {
|
|
1094
|
-
const rewardOf = lookups.rewardOf ?? defaultReward;
|
|
1098
|
+
const rewardOf2 = lookups.rewardOf ?? defaultReward;
|
|
1095
1099
|
const grouped = /* @__PURE__ */ new Map();
|
|
1096
1100
|
for (const r of runs) {
|
|
1097
1101
|
const sid = r.scenarioId ?? r.experimentId;
|
|
@@ -1107,11 +1111,11 @@ async function toGrpoRows(runs, lookups) {
|
|
|
1107
1111
|
const rewards = [];
|
|
1108
1112
|
const runIds = [];
|
|
1109
1113
|
for (const r of group) {
|
|
1110
|
-
const reward = rewardOf(r);
|
|
1111
|
-
if (reward === null) continue;
|
|
1114
|
+
const reward2 = rewardOf2(r);
|
|
1115
|
+
if (reward2 === null) continue;
|
|
1112
1116
|
const completion = await Promise.resolve(lookups.completionOf(r.runId));
|
|
1113
1117
|
completions.push(completion);
|
|
1114
|
-
rewards.push(reward);
|
|
1118
|
+
rewards.push(reward2);
|
|
1115
1119
|
runIds.push(r.runId);
|
|
1116
1120
|
}
|
|
1117
1121
|
if (completions.length === 0) continue;
|
|
@@ -1212,6 +1216,186 @@ function defaultReward(run) {
|
|
|
1212
1216
|
return typeof v === "number" && Number.isFinite(v) ? v : null;
|
|
1213
1217
|
}
|
|
1214
1218
|
|
|
1219
|
+
// src/rl/dataset.ts
|
|
1220
|
+
/**
 * Resolve the scalar reward of a run record.
 *
 * The holdout score wins when it is present (not null/undefined); otherwise
 * the search score is used.
 *
 * @param r Run record exposing an `outcome` object.
 * @returns The chosen score when it is a finite number, otherwise `null`.
 */
function reward(r) {
  const outcome = r.outcome;
  const score = outcome.holdoutScore ?? outcome.searchScore;
  if (typeof score !== "number") return null;
  // NaN and +/-Infinity are not usable rewards.
  return Number.isFinite(score) ? score : null;
}
|
|
1224
|
+
/**
 * Deduplicate a list and return the unique values in default sort order.
 *
 * @param xs Values to deduplicate (the input is not mutated).
 * @returns A new array of the unique values, sorted with the default comparator.
 */
function distinct(xs) {
  const unique = Array.from(new Set(xs));
  unique.sort();
  return unique;
}
|
|
1227
|
+
/**
 * Summarize a list of reward values.
 *
 * @param values Numeric rewards (the input is not mutated).
 * @returns `{ n, mean, median, min, max, std }`, where `std` is the population
 *   standard deviation (divides by `n`). An empty input yields all zeros.
 */
function computeRewardStats(values) {
  if (values.length === 0) {
    return { n: 0, mean: 0, median: 0, min: 0, max: 0, std: 0 };
  }
  const ascending = Array.from(values);
  ascending.sort((lo, hi) => lo - hi);
  const n = ascending.length;

  let sum = 0;
  for (const x of ascending) sum += x;
  const mean = sum / n;

  // Even count: average the two middle values; odd count: take the middle one.
  const upper = Math.floor(n / 2);
  let median;
  if (n % 2 === 0) {
    median = (ascending[upper - 1] + ascending[upper]) / 2;
  } else {
    median = ascending[upper];
  }

  let squaredDeviation = 0;
  for (const x of ascending) squaredDeviation += (x - mean) ** 2;
  const variance = squaredDeviation / n;

  return { n, mean, median, min: ascending[0], max: ascending[n - 1], std: Math.sqrt(variance) };
}
|
|
1237
|
+
/**
 * Compute the quantitative half of the dataset manifest from the records.
 *
 * Tallies records per split, sums token usage and cost, collects every
 * non-null reward (via `reward`), and lists the distinct models, prompt
 * hashes and commit SHAs.
 *
 * @param records Run records to summarize.
 * @returns Stats object: `records`, `splits`, `reward`, `models`,
 *   `promptHashes`, `commitShas`, `totalTokens`, `totalCostUsd`.
 */
function computeStats(records) {
  const splits = { search: 0, dev: 0, holdout: 0 };
  const totalTokens = { input: 0, output: 0 };
  const rewards = [];
  let totalCostUsd = 0;

  for (const record of records) {
    const tag = record.splitTag;
    // `?? 0` lets a split tag outside the three seeded keys start its own tally.
    splits[tag] = (splits[tag] ?? 0) + 1;
    totalTokens.input += record.tokenUsage.input;
    totalTokens.output += record.tokenUsage.output;
    totalCostUsd += record.costUsd;
    const score = reward(record);
    if (score !== null) rewards.push(score);
  }

  return {
    records: records.length,
    splits,
    reward: computeRewardStats(rewards),
    models: distinct(records.map((record) => record.model)),
    promptHashes: distinct(records.map((record) => record.promptHash)),
    commitShas: distinct(records.map((record) => record.commitSha)),
    totalTokens,
    totalCostUsd
  };
}
|
|
1262
|
+
/**
 * Package run records into a dataset bundle: one JSONL file per requested
 * trainer format, plus `manifest.json` and `DATASHEET.md`.
 *
 * @param records Run records to package; must be non-empty.
 * @param lookups Lookups handed to the GRPO and SFT exporters.
 * @param config Caller-declared dataset config; `formats` defaults to
 *   `["grpo", "sft"]`.
 * @param preferences Triples + lookups for the DPO exporter; required only
 *   when `formats` includes `"dpo"`.
 * @returns `{ manifest, files }`, where `files` maps relative filename to contents.
 * @throws Error when `records` is empty, or when `"dpo"` is requested without
 *   `preferences`.
 */
async function buildRlDataset(records, lookups, config, preferences) {
  if (records.length === 0) {
    throw new Error("buildRlDataset: no records \u2014 refusing to package an empty dataset");
  }

  const formats = config.formats ?? ["grpo", "sft"];
  const wants = (format) => formats.includes(format);
  const files = {};
  const rowCounts = {};

  if (wants("grpo")) {
    const grpoRows = await toGrpoRows(records, lookups);
    files["train.grpo.jsonl"] = toGrpoJsonl(grpoRows);
    rowCounts.grpo = grpoRows.length;
  }

  if (wants("sft")) {
    const sftRows = await toSftRows(records, lookups);
    files["train.sft.jsonl"] = toSftJsonl(sftRows);
    rowCounts.sft = sftRows.length;
  }

  if (wants("dpo")) {
    if (!preferences) {
      throw new Error("buildRlDataset: format 'dpo' requires `preferences` (triples + lookups)");
    }
    const dpoRows = await toDpoRows(preferences.triples, preferences.lookups);
    files["train.dpo.jsonl"] = toDpoJsonl(dpoRows);
    rowCounts.dpo = dpoRows.length;
  }

  // The manifest is the caller's config plus the resolved formats, row counts
  // and computed stats.
  const manifest = { ...config, formats, rowCounts, stats: computeStats(records) };
  files["manifest.json"] = JSON.stringify(manifest, null, 2) + "\n";
  files["DATASHEET.md"] = datasheetToMarkdown(manifest);
  return { manifest, files };
}
|
|
1298
|
+
/**
 * Format a fraction as a percentage string with one decimal place.
 *
 * @param x Fraction, e.g. `0.25`.
 * @returns Percentage text, e.g. `"25.0%"`.
 */
function pct(x) {
  const percent = x * 100;
  return percent.toFixed(1) + "%";
}
|
|
1301
|
+
/**
 * Render a dataset manifest as a Markdown datasheet.
 *
 * Sections, in order: title, reward provenance, composition, reward
 * distribution, provenance, quality gates, recommended uses, out of scope,
 * limitations, token rendering.
 *
 * @param m Dataset manifest (config fields plus `formats`, `rowCounts`, `stats`).
 * @returns The datasheet as a single newline-joined Markdown string.
 */
function datasheetToMarkdown(m) {
  const stats = m.stats;
  // Guard the percentage denominator so zero records does not divide by zero.
  const denominator = stats.records || 1;
  const splitBullets = [];
  for (const tag of ["search", "dev", "holdout"]) {
    splitBullets.push(` - \`${tag}\`: ${stats.splits[tag]} (${pct(stats.splits[tag] / denominator)})`);
  }
  const isDeterministic = m.reward.kind === "deterministic";
  const yesNo = (flag) => (flag ? "yes" : "no");
  const fixed3 = (value) => value.toFixed(3);

  const lines = [];
  lines.push(
    `# Dataset: ${m.name} \`v${m.version}\``,
    "",
    `**Domain:** ${m.domain} \xB7 **Created:** ${m.createdAtIso} \xB7 **License:** ${m.license}`,
    ""
  );
  lines.push(
    "## Reward provenance",
    `- **Kind:** ${m.reward.kind}${isDeterministic ? " \u2705 (decidable \u2014 not judge-noise)" : ""}`,
    `- **Source:** ${m.reward.source}`,
    `- **Description:** ${m.reward.description}`,
    ""
  );
  lines.push(
    "## Composition",
    `- **Records (trajectories):** ${stats.records}`,
    `- **Formats:** ${m.formats.map((f) => `${f} (${m.rowCounts[f] ?? 0} rows)`).join(", ")}`,
    "- **Splits:**",
    splitBullets.join("\n"),
    ""
  );
  lines.push(
    "## Reward distribution",
    `- n=${stats.reward.n} \xB7 mean=${fixed3(stats.reward.mean)} \xB7 median=${fixed3(stats.reward.median)} \xB7 min=${fixed3(stats.reward.min)} \xB7 max=${fixed3(stats.reward.max)} \xB7 std=${fixed3(stats.reward.std)}`,
    ""
  );
  lines.push(
    "## Provenance",
    `- **Models:** ${stats.models.join(", ")}`,
    `- **Prompt/agent versions (sha256):** ${stats.promptHashes.length} distinct`,
    `- **Commits:** ${stats.commitShas.join(", ")}`,
    `- **Tokens:** ${stats.totalTokens.input} in / ${stats.totalTokens.output} out \xB7 **Cost:** $${stats.totalCostUsd.toFixed(2)}`,
    ""
  );
  lines.push(
    "## Quality gates",
    `- Contamination probe: ${m.qualityGates?.contaminationProbe ?? "not-run"}`,
    `- Dedup: ${yesNo(m.qualityGates?.dedup)} \xB7 Verifiable-reward filter: ${yesNo(m.qualityGates?.verifiableRewardFilter)}`,
    ""
  );
  // The caller-supplied free text is pushed as-is so join() handles it exactly
  // as an array element (null/undefined render as an empty line).
  lines.push("## Recommended uses", m.intendedUse, "");
  lines.push("## Out of scope", m.outOfScope, "");
  lines.push("## Limitations", m.limitations, "");
  lines.push(
    "## Token rendering",
    "For RL/SFT training, tokenize with the per-model renderer (DeepSeek-V3 / Kimi-K2 / Qwen3) to preserve token identity and per-token loss masks across tool-call turns \u2014 see `renderers` (PrimeIntellect). The `messages` / `completions` here are the renderer input.",
    ""
  );
  return lines.join("\n");
}
|
|
1349
|
+
|
|
1350
|
+
// src/rl/corpus.ts
|
|
1351
|
+
/**
 * Parse JSONL corpus text into records, skipping blank lines.
 *
 * @param text Raw file contents.
 * @param corpusPath Path the text came from; used only in error messages.
 * @returns The parsed records, in file order.
 * @throws SyntaxError naming the file and 1-based line number of a malformed line.
 */
function parseCorpusText(text, corpusPath) {
  const out = [];
  const lines = text.split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (!line.trim()) continue;
    try {
      out.push(JSON.parse(line));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      // Same exception type as a bare JSON.parse, with the location added.
      throw new SyntaxError(`readCorpus: invalid JSON at ${corpusPath}:${i + 1}: ${reason}`);
    }
  }
  return out;
}

/**
 * Append records to the JSONL corpus at `corpusPath`, skipping any whose
 * `runId` is already on disk or appeared earlier in this batch.
 *
 * Creates the parent directory (and the file, on first write) as needed.
 *
 * @param records Records to append; each is written as one JSON line.
 * @param corpusPath Path of the corpus file.
 * @returns `{ appended, skipped, total }`, where `total` is the number of
 *   records in the corpus after the append.
 * @throws SyntaxError when the existing corpus contains a malformed line.
 */
function appendToCorpus(records, corpusPath) {
  mkdirSync(dirname(corpusPath), { recursive: true });
  // Read the raw text once: it supplies both the existing runIds and the
  // trailing-newline state needed to append safely.
  const raw = existsSync(corpusPath) ? readFileSync(corpusPath, "utf8") : "";
  const existing = parseCorpusText(raw, corpusPath);
  const seen = new Set(existing.map((r) => r.runId));
  const lines = [];
  let appended = 0;
  let skipped = 0;
  for (const r of records) {
    if (seen.has(r.runId)) {
      skipped++;
      continue;
    }
    seen.add(r.runId);
    lines.push(JSON.stringify(r));
    appended++;
  }
  if (lines.length > 0) {
    // If the file's last line is unterminated (hand edit, interrupted write),
    // start on a fresh line so the first new record is not glued onto it.
    const separator = raw.length > 0 && !raw.endsWith("\n") ? "\n" : "";
    appendFileSync(corpusPath, `${separator}${lines.join("\n")}\n`);
  }
  return { appended, skipped, total: existing.length + appended };
}

/**
 * Read every record from the JSONL corpus at `corpusPath`.
 *
 * @param corpusPath Path of the corpus file.
 * @returns The parsed records, or `[]` when the file does not exist.
 * @throws SyntaxError naming the file and line number of a malformed line.
 */
function readCorpus(corpusPath) {
  if (!existsSync(corpusPath)) return [];
  return parseCorpusText(readFileSync(corpusPath, "utf8"), corpusPath);
}
|
|
1379
|
+
/**
 * Resolve the score used for `minScore` filtering: the holdout score when
 * present, otherwise the search score.
 *
 * @param r Corpus record exposing an `outcome` object.
 * @returns The score when it is a finite number, otherwise `null` (ungraded).
 */
function rewardOf(r) {
  const v = r.outcome.holdoutScore ?? r.outcome.searchScore;
  return typeof v === "number" && Number.isFinite(v) ? v : null;
}

/**
 * Build a dataset bundle from the corpus file at `corpusPath`.
 *
 * Only records whose top-level `prompt` and `completion` are both strings are
 * kept. `opts.splits` and `opts.minScore` narrow the set further. The
 * survivors are handed to `buildRlDataset` with lookups that serve the
 * prompt/completion text by `runId`.
 *
 * @param corpusPath Path of the JSONL corpus.
 * @param config Dataset config forwarded to `buildRlDataset`.
 * @param opts Optional filters: `splits` (allowed split tags) and `minScore`
 *   (keep records scoring at least this; ungraded records never qualify).
 * @returns The bundle produced by `buildRlDataset`.
 */
async function buildDatasetFromCorpus(corpusPath, config, opts = {}) {
  let records = readCorpus(corpusPath).filter(
    (r) => typeof r.prompt === "string" && typeof r.completion === "string"
  );
  const { splits, minScore } = opts;
  if (splits) records = records.filter((r) => splits.includes(r.splitTag));
  if (minScore != null) {
    records = records.filter((r) => {
      const score = rewardOf(r);
      // The explicit null check is required: `null >= 0` is true in JavaScript,
      // so an ungraded record would otherwise pass any threshold <= 0.
      return score !== null && score >= minScore;
    });
  }
  const text = new Map(
    records.map((r) => [r.runId, { prompt: r.prompt, completion: r.completion }])
  );
  const lookups = {
    promptOf: (id) => text.get(id)?.prompt ?? "",
    completionOf: (id) => text.get(id)?.completion ?? ""
  };
  return buildRlDataset(records, lookups, config);
}
|
|
1398
|
+
|
|
1215
1399
|
// src/rl/predictive-validity-researcher.ts
|
|
1216
1400
|
var PredictiveValidityResearcher = class {
|
|
1217
1401
|
opts;
|
|
@@ -1596,11 +1780,15 @@ export {
|
|
|
1596
1780
|
InMemoryOutcomeStore,
|
|
1597
1781
|
PredictiveValidityResearcher,
|
|
1598
1782
|
adversarialScenarioSearch,
|
|
1783
|
+
appendToCorpus,
|
|
1599
1784
|
applyEloUpdate,
|
|
1600
1785
|
bestOfN,
|
|
1786
|
+
buildDatasetFromCorpus,
|
|
1601
1787
|
buildPairwiseFromCampaign,
|
|
1788
|
+
buildRlDataset,
|
|
1602
1789
|
campaignToRunRecords,
|
|
1603
1790
|
compareAdaptationCurves,
|
|
1791
|
+
datasheetToMarkdown,
|
|
1604
1792
|
detectRewardHacking,
|
|
1605
1793
|
doublyRobust,
|
|
1606
1794
|
extractPreferences,
|
|
@@ -1616,6 +1804,7 @@ export {
|
|
|
1616
1804
|
offPolicyEstimateAll,
|
|
1617
1805
|
paretoFrontier,
|
|
1618
1806
|
prmTrainingPairs,
|
|
1807
|
+
readCorpus,
|
|
1619
1808
|
renameVariables,
|
|
1620
1809
|
runAdaptationCurve,
|
|
1621
1810
|
runComputeCurve,
|