@tangle-network/agent-eval 0.40.5 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -355
- package/dist/campaign/index.js +106 -6
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/rl.d.ts
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
2
|
+
import { C as CampaignResult } from './types-BLbRTxoc.js';
|
|
3
|
+
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
|
|
4
|
+
export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
|
|
5
|
+
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
4
6
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
5
7
|
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
|
|
6
|
-
import { I as InterimReleaseConfidence } from './sequential-
|
|
7
|
-
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
8
|
-
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-DeZ_EArp.js';
|
|
9
|
-
export { r as runEvalCampaign } from './researcher-DeZ_EArp.js';
|
|
8
|
+
import { I as InterimReleaseConfidence } from './sequential-DdV5ShjT.js';
|
|
10
9
|
import './errors-mje_cKOs.js';
|
|
11
|
-
import './
|
|
12
|
-
import './
|
|
10
|
+
import './llm-client-BXVRUZyX.js';
|
|
11
|
+
import './raw-provider-sink-C46HDghv.js';
|
|
13
12
|
import './emitter-DP_cSSiw.js';
|
|
13
|
+
import './integrity-CTDhR1Sg.js';
|
|
14
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
14
15
|
|
|
15
16
|
/**
|
|
16
17
|
* Test-time compute scaling curves.
|
|
@@ -529,17 +530,17 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
|
|
|
529
530
|
}>;
|
|
530
531
|
|
|
531
532
|
/**
|
|
532
|
-
* Adapters: convert `
|
|
533
|
-
*
|
|
534
|
-
* `
|
|
535
|
-
*
|
|
536
|
-
*
|
|
537
|
-
*
|
|
538
|
-
*
|
|
539
|
-
*
|
|
540
|
-
*
|
|
541
|
-
*
|
|
542
|
-
* — the caller
|
|
533
|
+
* Adapters: convert measurement outputs into the canonical `RunRecord[]`
|
|
534
|
+
* artifact that `replayCache`, `pairedEvalueSequence`, and
|
|
535
|
+
* `rubricPredictiveValidity` consume. Two sources:
|
|
536
|
+
* - `campaignToRunRecords` — the campaign substrate's per-cell results
|
|
537
|
+
* (the modern path: `runCampaign` / `runImprovementLoop` → records).
|
|
538
|
+
* - `verificationReportToRunRecord` — a `MultiLayerVerifier` report.
|
|
539
|
+
*
|
|
540
|
+
* Adapters are thin and explicit — every mandatory `RunRecord` field comes
|
|
541
|
+
* from a caller-supplied context (`commitSha`, `model`, `promptHash`,
|
|
542
|
+
* `configHash`) plus the cell's runtime data. The validator still rejects
|
|
543
|
+
* bare-alias model strings — the caller snapshot-pins.
|
|
543
544
|
*/
|
|
544
545
|
|
|
545
546
|
interface AdapterContext {
|
|
@@ -550,41 +551,30 @@ interface AdapterContext {
|
|
|
550
551
|
/** Git SHA the harness was run from. */
|
|
551
552
|
commitSha: string;
|
|
552
553
|
/** Hash of the effective prompt sent to the model. */
|
|
553
|
-
promptHash: string
|
|
554
|
+
promptHash: string;
|
|
554
555
|
/** Hash of the effective config (model, temperature, tools, judges, splits). */
|
|
555
|
-
configHash: string
|
|
556
|
-
/** Default split tag. Default `'search'
|
|
556
|
+
configHash: string;
|
|
557
|
+
/** Default split tag. Default `'search'`. */
|
|
557
558
|
splitTag?: RunSplitTag;
|
|
558
|
-
/** Default cost in USD when the
|
|
559
|
+
/** Default cost in USD when the source doesn't record one. Default `0`. */
|
|
559
560
|
defaultCostUsd?: number;
|
|
560
561
|
}
|
|
561
562
|
/**
|
|
562
|
-
* Convert
|
|
563
|
-
*
|
|
564
|
-
*
|
|
565
|
-
*
|
|
566
|
-
*
|
|
567
|
-
*
|
|
568
|
-
* filters can distinguish "free" from "untracked"). This preserves the
|
|
569
|
-
* paper-grade contract: a record without a cost number is unbounded by
|
|
570
|
-
* definition, but we don't drop the record.
|
|
563
|
+
* Convert a `CampaignResult` into canonical `RunRecord[]` — one record per
|
|
564
|
+
* scored cell. The cell's mean judge composite becomes the split score; every
|
|
565
|
+
* judge dimension is carried through to `outcome.raw`. A cell that errored
|
|
566
|
+
* becomes a record with `failureMode: 'cell_error'` (kept, not dropped — an
|
|
567
|
+
* unscored cell is signal). `candidateId` identifies the measured surface
|
|
568
|
+
* (defaults to the campaign manifest hash).
|
|
571
569
|
*/
|
|
572
|
-
declare function
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
}): RunRecord;
|
|
576
|
-
/** Convenience: convert an array of `TrialResult` in one go. */
|
|
577
|
-
declare function trialsToRunRecords(trials: TrialResult[], ctx: AdapterContext): RunRecord[];
|
|
570
|
+
declare function campaignToRunRecords(campaign: CampaignResult, ctx: AdapterContext & {
|
|
571
|
+
candidateId?: string;
|
|
572
|
+
}): RunRecord[];
|
|
578
573
|
/**
|
|
579
574
|
* Convert a `MultiLayerVerifier` `VerificationReport` into a `RunRecord`.
|
|
580
|
-
*
|
|
581
|
-
*
|
|
582
|
-
*
|
|
583
|
-
* - `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`
|
|
584
|
-
* - `outcome.raw` carries every layer's score keyed `layer.<name>`
|
|
585
|
-
* plus a `layer_<name>_pass` 1/0 indicator
|
|
586
|
-
* - `failureMode` is taken from the first failing layer's `reason`
|
|
587
|
-
* - `wallMs` is `report.durationMs`
|
|
575
|
+
* `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`;
|
|
576
|
+
* `outcome.raw` carries every layer's score + a pass indicator; `failureMode`
|
|
577
|
+
* is the first failing layer's reason.
|
|
588
578
|
*/
|
|
589
579
|
declare function verificationReportToRunRecord(report: VerificationReport, ctx: AdapterContext & {
|
|
590
580
|
candidateId: string;
|
|
@@ -592,15 +582,6 @@ declare function verificationReportToRunRecord(report: VerificationReport, ctx:
|
|
|
592
582
|
}, opts?: {
|
|
593
583
|
runId?: string;
|
|
594
584
|
}): RunRecord;
|
|
595
|
-
/**
|
|
596
|
-
* Convert a `VariantAggregate` (per-variant rollup from `prompt-evolution`)
|
|
597
|
-
* into a synthetic `RunRecord` representing the aggregate. Useful when the
|
|
598
|
-
* downstream consumer wants per-variant entries for a `researchReport`
|
|
599
|
-
* rather than per-(variant, scenario, rep) trial entries.
|
|
600
|
-
*/
|
|
601
|
-
declare function variantAggregateToRunRecord(agg: VariantAggregate, ctx: AdapterContext, opts?: {
|
|
602
|
-
runId?: string;
|
|
603
|
-
}): RunRecord;
|
|
604
585
|
|
|
605
586
|
/**
|
|
606
587
|
* Bradley-Terry / Elo tournament evaluation.
|
|
@@ -1396,6 +1377,72 @@ interface StepRewardJsonlRow {
|
|
|
1396
1377
|
}
|
|
1397
1378
|
declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
|
|
1398
1379
|
|
|
1380
|
+
/**
|
|
1381
|
+
* `PredictiveValidityResearcher` — concrete `Researcher` implementation
|
|
1382
|
+
* that drives selection from outcome-anchored predictive validity.
|
|
1383
|
+
*
|
|
1384
|
+
* Each method:
|
|
1385
|
+
*
|
|
1386
|
+
* - `inspectFailures(runs)` — synthesizes failure modes from the
|
|
1387
|
+
* bottom-quartile of `RunRecord`s on the configured proxy reward.
|
|
1388
|
+
* - `proposeChange(failures)` — proposes steering changes that target
|
|
1389
|
+
* the rubrics with the lowest predictive validity (decorative ones).
|
|
1390
|
+
* Either reduce their weight in the composite, or recalibrate them.
|
|
1391
|
+
* - `applyChange(changes, baseline)` — merges the proposed steering
|
|
1392
|
+
* into the experiment plan.
|
|
1393
|
+
* - `evaluateChange(plan)` — re-runs the predictive-validity check on
|
|
1394
|
+
* the post-change runs and reports the delta.
|
|
1395
|
+
*
|
|
1396
|
+
* The result is a closed loop: the rubric weights drift toward the ones
|
|
1397
|
+
* that actually predict deployment outcomes, automatically. Pair with
|
|
1398
|
+
* `runRLCampaign` for the full auto-research story.
|
|
1399
|
+
*/
|
|
1400
|
+
|
|
1401
|
+
interface PredictiveValidityResearcherOptions {
|
|
1402
|
+
outcomes: OutcomeStore;
|
|
1403
|
+
outcomeMetrics: string[];
|
|
1404
|
+
/** Score threshold below which a run counts as a "failure." Default 0.5. */
|
|
1405
|
+
failureThreshold?: number;
|
|
1406
|
+
/** Spearman bucket below which a rubric is "decorative." Default 0.4. */
|
|
1407
|
+
decorativeThreshold?: number;
|
|
1408
|
+
/** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
|
|
1409
|
+
steeringNamespace?: string;
|
|
1410
|
+
/** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
|
|
1411
|
+
rubrics?: string[];
|
|
1412
|
+
/**
|
|
1413
|
+
* Snapshot stash hook — called with the most recent predictive-validity
|
|
1414
|
+
* report. Useful when a downstream system wants to log rubric drift over
|
|
1415
|
+
* time. Default no-op.
|
|
1416
|
+
*/
|
|
1417
|
+
onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
|
|
1418
|
+
}
|
|
1419
|
+
/**
|
|
1420
|
+
* Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
|
|
1421
|
+
* rubrics that don't predict deployment outcomes don't earn weight.
|
|
1422
|
+
*/
|
|
1423
|
+
declare class PredictiveValidityResearcher implements Researcher {
|
|
1424
|
+
private opts;
|
|
1425
|
+
private lastReport;
|
|
1426
|
+
constructor(opts: PredictiveValidityResearcherOptions);
|
|
1427
|
+
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
1428
|
+
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
1429
|
+
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
1430
|
+
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
1431
|
+
/**
|
|
1432
|
+
* Run the predictive-validity check explicitly against a fresh RunRecord
|
|
1433
|
+
* set. Updates the researcher's cached report so subsequent
|
|
1434
|
+
* `proposeChange` calls have evidence to draw from.
|
|
1435
|
+
*/
|
|
1436
|
+
runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
|
|
1437
|
+
/**
|
|
1438
|
+
* Force-feed a predictive-validity report into the researcher state —
|
|
1439
|
+
* useful when the consumer ran the report out-of-band and wants the
|
|
1440
|
+
* researcher's later proposals informed by it.
|
|
1441
|
+
*/
|
|
1442
|
+
setReport(report: RubricPredictiveValidityReport): void;
|
|
1443
|
+
getLastReport(): RubricPredictiveValidityReport | null;
|
|
1444
|
+
}
|
|
1445
|
+
|
|
1399
1446
|
/**
|
|
1400
1447
|
* Reward hacking / Goodhart detection.
|
|
1401
1448
|
*
|
|
@@ -1499,171 +1546,6 @@ interface DetectRewardHackingInput {
|
|
|
1499
1546
|
}
|
|
1500
1547
|
declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
|
|
1501
1548
|
|
|
1502
|
-
/**
|
|
1503
|
-
* `analyzeOptimizationResult` — unifies the auto-research stack
|
|
1504
|
-
* (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
|
|
1505
|
-
* Ax/AxRLM trace analyst) with the RL bridge in a single call.
|
|
1506
|
-
*
|
|
1507
|
-
* The optimization primitives produce `TrialResult[]`; the RL bridge
|
|
1508
|
-
* consumes `RunRecord[]`. Trace-analyst is independent of both. This
|
|
1509
|
-
* function does the wiring once so consumers don't have to:
|
|
1510
|
-
*
|
|
1511
|
-
* Optimization (existing primitives) RL bridge
|
|
1512
|
-
* ────────────────────────────────── ────────
|
|
1513
|
-
* runPromptEvolution → TrialResult[] →
|
|
1514
|
-
* runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
|
|
1515
|
-
* reflective-mutation → mutations.jsonl → ↓
|
|
1516
|
-
* │
|
|
1517
|
-
* ↓ (per-generation inputs flow back) │
|
|
1518
|
-
* PredictiveValidityResearcher.proposeChange ←───────────────────── │
|
|
1519
|
-
* │
|
|
1520
|
-
* ↓ │
|
|
1521
|
-
* TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
|
|
1522
|
-
*
|
|
1523
|
-
* The output is the canonical RL artifact set: `RunRecord[]` (so every
|
|
1524
|
-
* other RL primitive composes), preference triples, verifiable reward
|
|
1525
|
-
* signals, reward-hacking diagnosis, sequential interim verdict, and
|
|
1526
|
-
* (when wired) trace-analyst summary.
|
|
1527
|
-
*
|
|
1528
|
-
* What this primitive does NOT do: it does not modify the optimization
|
|
1529
|
-
* primitives' internals. They keep producing `TrialResult` and emitting
|
|
1530
|
-
* `onProgress` events; this function bridges *after* the sweep completes.
|
|
1531
|
-
* Per-step capture-integrity (raw HTTP events from inside the score
|
|
1532
|
-
* adapter) requires the consumer to wire `RawProviderSink` into their
|
|
1533
|
-
* own `ScoreAdapter` — that's a per-consumer integration point.
|
|
1534
|
-
*/
|
|
1535
|
-
|
|
1536
|
-
interface AnalyzeOptimizationResultOptions {
|
|
1537
|
-
/**
|
|
1538
|
-
* The optimization output. Either a `PromptEvolutionResult` or a
|
|
1539
|
-
* `MultiShotOptimizationResult`. The function detects which by
|
|
1540
|
-
* structural typing and produces canonical `RunRecord[]` from either.
|
|
1541
|
-
*/
|
|
1542
|
-
result: PromptEvolutionResult | MultiShotOptimizationResult;
|
|
1543
|
-
/** Adapter context — `commitSha`, `model`, `promptHash`, `configHash`. */
|
|
1544
|
-
ctx: AdapterContext;
|
|
1545
|
-
/** Optional comparator candidate id for paired analyses. */
|
|
1546
|
-
comparator?: string;
|
|
1547
|
-
/** Verifiable-reward extraction options. */
|
|
1548
|
-
verifiableReward?: VerifiableRewardExtractionOptions;
|
|
1549
|
-
/** Preference extraction options. */
|
|
1550
|
-
preferences?: ExtractPreferencesOptions;
|
|
1551
|
-
/** Sequential interim-confidence options. */
|
|
1552
|
-
sequential?: {
|
|
1553
|
-
alpha?: number;
|
|
1554
|
-
bound?: number;
|
|
1555
|
-
rope?: {
|
|
1556
|
-
low: number;
|
|
1557
|
-
high: number;
|
|
1558
|
-
};
|
|
1559
|
-
};
|
|
1560
|
-
/** Outcome calibration store + metrics. */
|
|
1561
|
-
outcomes?: {
|
|
1562
|
-
store: OutcomeStore;
|
|
1563
|
-
metrics: string[];
|
|
1564
|
-
};
|
|
1565
|
-
/** Trainer-format export — DPO + GRPO lookups. */
|
|
1566
|
-
trainerExport?: {
|
|
1567
|
-
dpo?: DpoLookups;
|
|
1568
|
-
grpo?: GrpoLookups;
|
|
1569
|
-
};
|
|
1570
|
-
}
|
|
1571
|
-
interface AnalyzeOptimizationResultReport {
|
|
1572
|
-
/** All trials promoted to canonical `RunRecord` shape. */
|
|
1573
|
-
runs: RunRecord[];
|
|
1574
|
-
/** Per-run verifiable reward signal. */
|
|
1575
|
-
rewardSignals: Array<{
|
|
1576
|
-
runId: string;
|
|
1577
|
-
reward: VerifiableReward | null;
|
|
1578
|
-
}>;
|
|
1579
|
-
/** Preference triples ready for DPO/PPO/KTO training. */
|
|
1580
|
-
preferences: PreferenceExtractionReport;
|
|
1581
|
-
/** Anytime-valid sequential verdict, when a comparator is supplied. */
|
|
1582
|
-
interimConfidence: InterimReleaseConfidence | null;
|
|
1583
|
-
/** Standing reward-hacking hygiene check. */
|
|
1584
|
-
rewardHacking: RewardHackingReport;
|
|
1585
|
-
/** Predictive validity, when an outcome store is supplied. */
|
|
1586
|
-
predictiveValidity: RubricPredictiveValidityReport | null;
|
|
1587
|
-
/** Trainer-export rows, populated only for the formats requested. */
|
|
1588
|
-
trainerRows: {
|
|
1589
|
-
dpo?: DpoExportRow[];
|
|
1590
|
-
grpo?: GrpoExportRow[];
|
|
1591
|
-
};
|
|
1592
|
-
/** One-line summary suitable for logs. */
|
|
1593
|
-
summary: string;
|
|
1594
|
-
}
|
|
1595
|
-
/**
|
|
1596
|
-
* Convert an optimization sweep output into a fully-analysed RL artifact
|
|
1597
|
-
* set. Idempotent and read-only with respect to the optimization result.
|
|
1598
|
-
*/
|
|
1599
|
-
declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOptions): Promise<AnalyzeOptimizationResultReport>;
|
|
1600
|
-
|
|
1601
|
-
/**
|
|
1602
|
-
* `PredictiveValidityResearcher` — concrete `Researcher` implementation
|
|
1603
|
-
* that drives selection from outcome-anchored predictive validity.
|
|
1604
|
-
*
|
|
1605
|
-
* Each method:
|
|
1606
|
-
*
|
|
1607
|
-
* - `inspectFailures(runs)` — synthesizes failure modes from the
|
|
1608
|
-
* bottom-quartile of `RunRecord`s on the configured proxy reward.
|
|
1609
|
-
* - `proposeChange(failures)` — proposes steering changes that target
|
|
1610
|
-
* the rubrics with the lowest predictive validity (decorative ones).
|
|
1611
|
-
* Either reduce their weight in the composite, or recalibrate them.
|
|
1612
|
-
* - `applyChange(changes, baseline)` — merges the proposed steering
|
|
1613
|
-
* into the experiment plan.
|
|
1614
|
-
* - `evaluateChange(plan)` — re-runs the predictive-validity check on
|
|
1615
|
-
* the post-change runs and reports the delta.
|
|
1616
|
-
*
|
|
1617
|
-
* The result is a closed loop: the rubric weights drift toward the ones
|
|
1618
|
-
* that actually predict deployment outcomes, automatically. Pair with
|
|
1619
|
-
* `runRLCampaign` for the full auto-research story.
|
|
1620
|
-
*/
|
|
1621
|
-
|
|
1622
|
-
interface PredictiveValidityResearcherOptions {
|
|
1623
|
-
outcomes: OutcomeStore;
|
|
1624
|
-
outcomeMetrics: string[];
|
|
1625
|
-
/** Score threshold below which a run counts as a "failure." Default 0.5. */
|
|
1626
|
-
failureThreshold?: number;
|
|
1627
|
-
/** Spearman bucket below which a rubric is "decorative." Default 0.4. */
|
|
1628
|
-
decorativeThreshold?: number;
|
|
1629
|
-
/** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
|
|
1630
|
-
steeringNamespace?: string;
|
|
1631
|
-
/** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
|
|
1632
|
-
rubrics?: string[];
|
|
1633
|
-
/**
|
|
1634
|
-
* Snapshot stash hook — called with the most recent predictive-validity
|
|
1635
|
-
* report. Useful when a downstream system wants to log rubric drift over
|
|
1636
|
-
* time. Default no-op.
|
|
1637
|
-
*/
|
|
1638
|
-
onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
|
|
1639
|
-
}
|
|
1640
|
-
/**
|
|
1641
|
-
* Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
|
|
1642
|
-
* rubrics that don't predict deployment outcomes don't earn weight.
|
|
1643
|
-
*/
|
|
1644
|
-
declare class PredictiveValidityResearcher implements Researcher {
|
|
1645
|
-
private opts;
|
|
1646
|
-
private lastReport;
|
|
1647
|
-
constructor(opts: PredictiveValidityResearcherOptions);
|
|
1648
|
-
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
1649
|
-
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
1650
|
-
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
1651
|
-
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
1652
|
-
/**
|
|
1653
|
-
* Run the predictive-validity check explicitly against a fresh RunRecord
|
|
1654
|
-
* set. Updates the researcher's cached report so subsequent
|
|
1655
|
-
* `proposeChange` calls have evidence to draw from.
|
|
1656
|
-
*/
|
|
1657
|
-
runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
|
|
1658
|
-
/**
|
|
1659
|
-
* Force-feed a predictive-validity report into the researcher state —
|
|
1660
|
-
* useful when the consumer ran the report out-of-band and wants the
|
|
1661
|
-
* researcher's later proposals informed by it.
|
|
1662
|
-
*/
|
|
1663
|
-
setReport(report: RubricPredictiveValidityReport): void;
|
|
1664
|
-
getLastReport(): RubricPredictiveValidityReport | null;
|
|
1665
|
-
}
|
|
1666
|
-
|
|
1667
1549
|
/**
|
|
1668
1550
|
* `runRLCampaign` — top-level orchestrator that runs the matrix and
|
|
1669
1551
|
* produces every RL-ready artifact in one call.
|
|
@@ -1741,4 +1623,4 @@ interface RLCampaignResult<V> {
|
|
|
1741
1623
|
}
|
|
1742
1624
|
declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
|
|
1743
1625
|
|
|
1744
|
-
export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type
|
|
1626
|
+
export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, campaignToRunRecords, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
|