@tangle-network/agent-eval 0.38.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/multishot/index.d.ts +21 -21
- package/dist/multishot/index.js +64 -15
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +33 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
package/dist/reporting.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
7
7
|
renderReleaseReport
|
|
8
|
-
} from "./chunk-LGAPK7NA.js";
|
|
8
|
+
} from "./chunk-NKLGKF2Q.js";
|
|
9
9
|
import {
|
|
10
10
|
rubricPredictiveValidity
|
|
11
11
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -19,12 +19,12 @@ import {
|
|
|
19
19
|
paretoChart,
|
|
20
20
|
researchReport,
|
|
21
21
|
summaryTable
|
|
22
|
-
} from "./chunk-TQL7BAOY.js";
|
|
22
|
+
} from "./chunk-EGIPWXHL.js";
|
|
23
23
|
import {
|
|
24
24
|
benjaminiHochberg,
|
|
25
25
|
pairedBootstrap,
|
|
26
26
|
wilcoxonSignedRank
|
|
27
|
-
} from "./chunk-KHZRNY3F.js";
|
|
27
|
+
} from "./chunk-WP7SY7AI.js";
|
|
28
28
|
import "./chunk-VSMTAMNK.js";
|
|
29
29
|
import "./chunk-QYJT52YW.js";
|
|
30
30
|
import "./chunk-PZ5AY32C.js";
|
package/dist/rl.js
CHANGED
|
@@ -1,19 +1,25 @@
|
|
|
1
|
+
import {
|
|
2
|
+
detectRewardHacking,
|
|
3
|
+
extractVerifiableReward,
|
|
4
|
+
extractVerifiableRewardsFromRecords,
|
|
5
|
+
filterDeterministicallyRewarded
|
|
6
|
+
} from "./chunk-YV7J7X5N.js";
|
|
1
7
|
import {
|
|
2
8
|
runEvalCampaign
|
|
3
|
-
} from "./chunk-RL6TERL2.js";
|
|
9
|
+
} from "./chunk-LCIDRYGP.js";
|
|
4
10
|
import "./chunk-VXNVVBZO.js";
|
|
5
|
-
import "./chunk-TSPOEDM3.js";
|
|
11
|
+
import "./chunk-BWZEGTES.js";
|
|
6
12
|
import {
|
|
7
13
|
rubricPredictiveValidity
|
|
8
14
|
} from "./chunk-YRZ4M5GS.js";
|
|
9
15
|
import {
|
|
10
16
|
evaluateInterimReleaseConfidence
|
|
11
17
|
} from "./chunk-MAZ26DC7.js";
|
|
12
|
-
import "./chunk-TQL7BAOY.js";
|
|
18
|
+
import "./chunk-EGIPWXHL.js";
|
|
13
19
|
import {
|
|
14
20
|
benjaminiHochberg,
|
|
15
21
|
wilcoxonSignedRank
|
|
16
|
-
} from "./chunk-KHZRNY3F.js";
|
|
22
|
+
} from "./chunk-WP7SY7AI.js";
|
|
17
23
|
import "./chunk-UBPIXOC4.js";
|
|
18
24
|
import "./chunk-PC4UYEBM.js";
|
|
19
25
|
import "./chunk-TVVP3ZZQ.js";
|
|
@@ -157,7 +163,7 @@ async function runContaminationProbe(input, opts = {}) {
|
|
|
157
163
|
const deltas = valid.map((p) => p.delta);
|
|
158
164
|
const sortedDeltas = [...deltas].sort((a, b) => a - b);
|
|
159
165
|
const median = sortedDeltas[Math.floor(sortedDeltas.length / 2)];
|
|
160
|
-
const
|
|
166
|
+
const mean = deltas.reduce((s, d) => s + d, 0) / deltas.length;
|
|
161
167
|
const pseudoP = valid.map((p) => Math.min(1, Math.max(1e-6, 1 - Math.abs(p.delta) / 1)));
|
|
162
168
|
const { qValues } = benjaminiHochberg(pseudoP, fdr);
|
|
163
169
|
for (let i = 0; i < valid.length; i++) {
|
|
@@ -171,7 +177,7 @@ async function runContaminationProbe(input, opts = {}) {
|
|
|
171
177
|
perScenario,
|
|
172
178
|
pairedTest,
|
|
173
179
|
medianDelta: median,
|
|
174
|
-
meanDelta:
|
|
180
|
+
meanDelta: mean,
|
|
175
181
|
contaminationSuspected,
|
|
176
182
|
reason,
|
|
177
183
|
n: valid.length
|
|
@@ -753,167 +759,6 @@ function buildPairwiseFromCampaign(input) {
|
|
|
753
759
|
return outcomes;
|
|
754
760
|
}
|
|
755
761
|
|
|
756
|
-
// src/rl/verifiable-reward.ts
|
|
757
|
-
var DEFAULT_DETERMINISTIC_LAYERS = /* @__PURE__ */ new Set([
|
|
758
|
-
"install",
|
|
759
|
-
"typecheck",
|
|
760
|
-
"build",
|
|
761
|
-
"lint",
|
|
762
|
-
"test",
|
|
763
|
-
"compile",
|
|
764
|
-
"schema",
|
|
765
|
-
"sandbox",
|
|
766
|
-
"unit_tests",
|
|
767
|
-
"integration_tests"
|
|
768
|
-
]);
|
|
769
|
-
var DEFAULT_SOURCE_FOR = (name) => {
|
|
770
|
-
const lower = name.toLowerCase();
|
|
771
|
-
if (lower.includes("test")) return "test";
|
|
772
|
-
if (lower.includes("compile") || lower.includes("build") || lower.includes("typecheck") || lower.includes("lint"))
|
|
773
|
-
return "compile";
|
|
774
|
-
if (lower.includes("schema")) return "schema";
|
|
775
|
-
if (lower.includes("sandbox")) return "sandbox";
|
|
776
|
-
if (lower.includes("judge") || lower.includes("semantic")) return "judge";
|
|
777
|
-
return "composite";
|
|
778
|
-
};
|
|
779
|
-
function extractVerifiableReward(report, opts = {}) {
|
|
780
|
-
const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
|
|
781
|
-
const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
|
|
782
|
-
const fallbackToJudge = opts.fallbackToJudge ?? true;
|
|
783
|
-
const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
|
|
784
|
-
const deterministic = report.layers.filter(
|
|
785
|
-
(l) => deterministicSet.has(l.layer) && typeof l.score === "number" && Number.isFinite(l.score)
|
|
786
|
-
);
|
|
787
|
-
if (deterministic.length === 1) {
|
|
788
|
-
const layer = deterministic[0];
|
|
789
|
-
return {
|
|
790
|
-
value: clamp01(layer.score),
|
|
791
|
-
source: sourceFor(layer.layer),
|
|
792
|
-
determinism: "deterministic",
|
|
793
|
-
confidence: 1,
|
|
794
|
-
origin: layer.layer,
|
|
795
|
-
breakdown: layerBreakdown(layer)
|
|
796
|
-
};
|
|
797
|
-
}
|
|
798
|
-
if (deterministic.length > 1) {
|
|
799
|
-
let num = 0;
|
|
800
|
-
let denom = 0;
|
|
801
|
-
const breakdown = {};
|
|
802
|
-
for (const l of deterministic) {
|
|
803
|
-
const w = l.detail?.weight ?? 1;
|
|
804
|
-
num += w * (l.score ?? 0);
|
|
805
|
-
denom += w;
|
|
806
|
-
breakdown[l.layer] = l.score;
|
|
807
|
-
}
|
|
808
|
-
return {
|
|
809
|
-
value: denom === 0 ? 0 : clamp01(num / denom),
|
|
810
|
-
source: "composite",
|
|
811
|
-
determinism: "deterministic",
|
|
812
|
-
confidence: 1,
|
|
813
|
-
origin: deterministic.map((l) => l.layer).join("+"),
|
|
814
|
-
breakdown
|
|
815
|
-
};
|
|
816
|
-
}
|
|
817
|
-
if (!fallbackToJudge) return null;
|
|
818
|
-
const judge = report.layers.find(
|
|
819
|
-
(l) => typeof l.score === "number" && Number.isFinite(l.score) && sourceFor(l.layer) === "judge"
|
|
820
|
-
) ?? report.layers.find((l) => typeof l.score === "number" && Number.isFinite(l.score));
|
|
821
|
-
if (!judge) return null;
|
|
822
|
-
const confFromDetail = judge.detail?.confidence;
|
|
823
|
-
return {
|
|
824
|
-
value: clamp01(judge.score),
|
|
825
|
-
source: "judge",
|
|
826
|
-
determinism: "probabilistic",
|
|
827
|
-
confidence: typeof confFromDetail === "number" ? confFromDetail : judgeFloor,
|
|
828
|
-
origin: judge.layer,
|
|
829
|
-
breakdown: layerBreakdown(judge)
|
|
830
|
-
};
|
|
831
|
-
}
|
|
832
|
-
function extractVerifiableRewardsFromRecords(runs, opts = {}) {
|
|
833
|
-
const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
|
|
834
|
-
const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
|
|
835
|
-
const fallbackToJudge = opts.fallbackToJudge ?? true;
|
|
836
|
-
const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
|
|
837
|
-
return runs.map((run) => {
|
|
838
|
-
const layerScores = [];
|
|
839
|
-
for (const [k, v] of Object.entries(run.outcome.raw)) {
|
|
840
|
-
if (k.startsWith("layer.") && !k.includes(".", 6) && typeof v === "number" && Number.isFinite(v)) {
|
|
841
|
-
layerScores.push({ name: k.slice("layer.".length), score: v });
|
|
842
|
-
}
|
|
843
|
-
}
|
|
844
|
-
const det = layerScores.filter((l) => deterministicSet.has(l.name));
|
|
845
|
-
if (det.length === 1) {
|
|
846
|
-
const layer = det[0];
|
|
847
|
-
return {
|
|
848
|
-
runId: run.runId,
|
|
849
|
-
reward: {
|
|
850
|
-
value: clamp01(layer.score),
|
|
851
|
-
source: sourceFor(layer.name),
|
|
852
|
-
determinism: "deterministic",
|
|
853
|
-
confidence: 1,
|
|
854
|
-
origin: layer.name
|
|
855
|
-
}
|
|
856
|
-
};
|
|
857
|
-
}
|
|
858
|
-
if (det.length > 1) {
|
|
859
|
-
const value = det.reduce((s, l) => s + l.score, 0) / det.length;
|
|
860
|
-
const breakdown = Object.fromEntries(
|
|
861
|
-
det.map((l) => [l.name, l.score])
|
|
862
|
-
);
|
|
863
|
-
return {
|
|
864
|
-
runId: run.runId,
|
|
865
|
-
reward: {
|
|
866
|
-
value: clamp01(value),
|
|
867
|
-
source: "composite",
|
|
868
|
-
determinism: "deterministic",
|
|
869
|
-
confidence: 1,
|
|
870
|
-
origin: det.map((l) => l.name).join("+"),
|
|
871
|
-
breakdown
|
|
872
|
-
}
|
|
873
|
-
};
|
|
874
|
-
}
|
|
875
|
-
if (!fallbackToJudge) return { runId: run.runId, reward: null };
|
|
876
|
-
const primary = run.outcome.holdoutScore ?? run.outcome.searchScore;
|
|
877
|
-
if (typeof primary !== "number" || !Number.isFinite(primary)) {
|
|
878
|
-
return { runId: run.runId, reward: null };
|
|
879
|
-
}
|
|
880
|
-
return {
|
|
881
|
-
runId: run.runId,
|
|
882
|
-
reward: {
|
|
883
|
-
value: clamp01(primary),
|
|
884
|
-
source: "judge",
|
|
885
|
-
determinism: "probabilistic",
|
|
886
|
-
confidence: judgeFloor,
|
|
887
|
-
origin: "run.outcome.score"
|
|
888
|
-
}
|
|
889
|
-
};
|
|
890
|
-
});
|
|
891
|
-
}
|
|
892
|
-
function filterDeterministicallyRewarded(runs, opts = {}) {
|
|
893
|
-
const rewarded = extractVerifiableRewardsFromRecords(runs, { ...opts, fallbackToJudge: false });
|
|
894
|
-
const out = [];
|
|
895
|
-
for (let i = 0; i < runs.length; i++) {
|
|
896
|
-
const r = rewarded[i];
|
|
897
|
-
if (r.reward && r.reward.determinism === "deterministic") {
|
|
898
|
-
out.push({ run: runs[i], reward: r.reward });
|
|
899
|
-
}
|
|
900
|
-
}
|
|
901
|
-
return out;
|
|
902
|
-
}
|
|
903
|
-
function clamp01(x) {
|
|
904
|
-
if (!Number.isFinite(x)) return 0;
|
|
905
|
-
return Math.max(0, Math.min(1, x));
|
|
906
|
-
}
|
|
907
|
-
function layerBreakdown(l) {
|
|
908
|
-
const out = {};
|
|
909
|
-
if (l.diagnostics) {
|
|
910
|
-
for (const [k, v] of Object.entries(l.diagnostics)) {
|
|
911
|
-
if (typeof v === "number" && Number.isFinite(v)) out[k] = v;
|
|
912
|
-
}
|
|
913
|
-
}
|
|
914
|
-
return out;
|
|
915
|
-
}
|
|
916
|
-
|
|
917
762
|
// src/rl/active-curriculum.ts
|
|
918
763
|
function varianceBasedCurriculum(observations, candidateCells, opts) {
|
|
919
764
|
const variancePrior = opts.variancePrior ?? 0.05;
|
|
@@ -930,10 +775,10 @@ function varianceBasedCurriculum(observations, candidateCells, opts) {
|
|
|
930
775
|
const k = `${c.variantId}::${c.scenarioId}`;
|
|
931
776
|
const samples = grouped.get(k) ?? [];
|
|
932
777
|
const n = samples.length;
|
|
933
|
-
const
|
|
934
|
-
const variance = n < 2 ? variancePrior : samples.reduce((s, v) => s + (v -
|
|
778
|
+
const mean = n === 0 ? 0.5 : samples.reduce((s, v) => s + v, 0) / n;
|
|
779
|
+
const variance = n < 2 ? variancePrior : samples.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1) + variancePrior;
|
|
935
780
|
const weight = Math.sqrt(variance) + 1 / Math.sqrt(Math.max(1, n));
|
|
936
|
-
return { variantId: c.variantId, scenarioId: c.scenarioId, n, mean
|
|
781
|
+
return { variantId: c.variantId, scenarioId: c.scenarioId, n, mean, variance, weight };
|
|
937
782
|
});
|
|
938
783
|
const floorTotal = floor * cellStats.length;
|
|
939
784
|
if (floorTotal >= budget) {
|
|
@@ -1400,151 +1245,6 @@ function defaultReward(run) {
|
|
|
1400
1245
|
return typeof v === "number" && Number.isFinite(v) ? v : null;
|
|
1401
1246
|
}
|
|
1402
1247
|
|
|
1403
|
-
// src/rl/reward-hacking.ts
|
|
1404
|
-
var DEFAULT_PROXY = (r) => {
|
|
1405
|
-
const v = r.outcome.holdoutScore ?? r.outcome.searchScore;
|
|
1406
|
-
return typeof v === "number" && Number.isFinite(v) ? v : null;
|
|
1407
|
-
};
|
|
1408
|
-
function detectRewardHacking(input) {
|
|
1409
|
-
const proxyOf = input.proxyOf ?? DEFAULT_PROXY;
|
|
1410
|
-
const truthOf = input.truthOf;
|
|
1411
|
-
const sus = input.thresholds?.suspect ?? 0.3;
|
|
1412
|
-
const gam = input.thresholds?.gaming ?? 0.6;
|
|
1413
|
-
const runs = input.runs.filter((r) => proxyOf(r) !== null);
|
|
1414
|
-
const n = runs.length;
|
|
1415
|
-
if (n < 4) {
|
|
1416
|
-
return {
|
|
1417
|
-
findings: [],
|
|
1418
|
-
verdict: "clean",
|
|
1419
|
-
n,
|
|
1420
|
-
rationale: [`fewer than 4 runs with proxy reward (n=${n}); insufficient evidence`]
|
|
1421
|
-
};
|
|
1422
|
-
}
|
|
1423
|
-
const windowSize = Math.max(1, input.windowSize ?? Math.min(50, Math.floor(n / 2)));
|
|
1424
|
-
const before = runs.slice(0, n - windowSize);
|
|
1425
|
-
const after = runs.slice(n - windowSize);
|
|
1426
|
-
const findings = [];
|
|
1427
|
-
if (truthOf) {
|
|
1428
|
-
const beforeProxy = before.map(proxyOf).filter((v) => typeof v === "number");
|
|
1429
|
-
const afterProxy = after.map(proxyOf).filter((v) => typeof v === "number");
|
|
1430
|
-
const beforeTruth = before.map(truthOf).filter((v) => typeof v === "number");
|
|
1431
|
-
const afterTruth = after.map(truthOf).filter((v) => typeof v === "number");
|
|
1432
|
-
if (beforeProxy.length >= 2 && afterProxy.length >= 2 && beforeTruth.length >= 2 && afterTruth.length >= 2) {
|
|
1433
|
-
const proxyDelta = mean(afterProxy) - mean(beforeProxy);
|
|
1434
|
-
const truthDelta = mean(afterTruth) - mean(beforeTruth);
|
|
1435
|
-
const gap = Math.max(0, proxyDelta - truthDelta);
|
|
1436
|
-
const severity = clamp012(gap * 5);
|
|
1437
|
-
findings.push({
|
|
1438
|
-
signal: "reward_divergence",
|
|
1439
|
-
severity,
|
|
1440
|
-
message: severity >= sus ? `proxy reward rose by ${proxyDelta.toFixed(3)} while truth changed by ${truthDelta.toFixed(3)} \u2014 potential Goodhart` : `proxy and truth moved together (proxy ${proxyDelta.toFixed(3)}, truth ${truthDelta.toFixed(3)})`,
|
|
1441
|
-
detail: {
|
|
1442
|
-
proxyDelta,
|
|
1443
|
-
truthDelta,
|
|
1444
|
-
gap,
|
|
1445
|
-
beforeN: beforeProxy.length,
|
|
1446
|
-
afterN: afterProxy.length
|
|
1447
|
-
}
|
|
1448
|
-
});
|
|
1449
|
-
}
|
|
1450
|
-
}
|
|
1451
|
-
{
|
|
1452
|
-
const beforeP = before.map(proxyOf).filter((v) => typeof v === "number");
|
|
1453
|
-
const afterP = after.map(proxyOf).filter((v) => typeof v === "number");
|
|
1454
|
-
if (beforeP.length >= 4 && afterP.length >= 4) {
|
|
1455
|
-
const ks = ksStatistic(beforeP, afterP);
|
|
1456
|
-
const severity = clamp012(ks - 0.2);
|
|
1457
|
-
findings.push({
|
|
1458
|
-
signal: "distribution_shift",
|
|
1459
|
-
severity,
|
|
1460
|
-
message: severity >= sus ? `KS=${ks.toFixed(3)} between before/after windows \u2014 distributional shift large` : `KS=${ks.toFixed(3)} between before/after windows \u2014 within-distribution drift`,
|
|
1461
|
-
detail: { ks, beforeN: beforeP.length, afterN: afterP.length }
|
|
1462
|
-
});
|
|
1463
|
-
}
|
|
1464
|
-
}
|
|
1465
|
-
{
|
|
1466
|
-
const secondaryOf = input.secondaryRewardOf ?? defaultSecondary(input.verifiableRewardOptions);
|
|
1467
|
-
const aligned = runs.map((r) => ({ p: proxyOf(r), s: secondaryOf(r) })).filter(
|
|
1468
|
-
(x) => typeof x.p === "number" && typeof x.s === "number"
|
|
1469
|
-
);
|
|
1470
|
-
if (aligned.length >= 4) {
|
|
1471
|
-
const ps = aligned.map((x) => x.p);
|
|
1472
|
-
const ss = aligned.map((x) => x.s);
|
|
1473
|
-
const r = pearsonR(ps, ss);
|
|
1474
|
-
const severity = clamp012(0.5 - Math.max(0, r));
|
|
1475
|
-
findings.push({
|
|
1476
|
-
signal: "reward_disagreement",
|
|
1477
|
-
severity,
|
|
1478
|
-
message: severity >= sus ? `proxy and independent secondary reward correlate \u03C1=${r.toFixed(3)} \u2014 possibly hacking proxy` : `proxy and secondary reward correlate \u03C1=${r.toFixed(3)}`,
|
|
1479
|
-
detail: { pearson: r, n: aligned.length }
|
|
1480
|
-
});
|
|
1481
|
-
}
|
|
1482
|
-
}
|
|
1483
|
-
{
|
|
1484
|
-
const detRuns = filterDeterministicallyRewarded(runs, input.verifiableRewardOptions ?? {});
|
|
1485
|
-
if (detRuns.length >= 4) {
|
|
1486
|
-
const detBefore = detRuns.slice(0, Math.floor(detRuns.length / 2));
|
|
1487
|
-
const detAfter = detRuns.slice(Math.floor(detRuns.length / 2));
|
|
1488
|
-
const detDelta = mean(detAfter.map((r) => r.reward.value)) - mean(detBefore.map((r) => r.reward.value));
|
|
1489
|
-
const proxyDelta = mean(after.map(proxyOf).filter((v) => typeof v === "number")) - mean(before.map(proxyOf).filter((v) => typeof v === "number"));
|
|
1490
|
-
const driftGap = Math.max(0, proxyDelta - detDelta);
|
|
1491
|
-
const severity = clamp012(driftGap * 5);
|
|
1492
|
-
findings.push({
|
|
1493
|
-
signal: "judge_drift",
|
|
1494
|
-
severity,
|
|
1495
|
-
message: severity >= sus ? `judge proxy +${proxyDelta.toFixed(3)} while deterministic reward +${detDelta.toFixed(3)} \u2014 judge drifting up without verifiable backing` : `judge and deterministic rewards move in step (judge ${proxyDelta.toFixed(3)}, det ${detDelta.toFixed(3)})`,
|
|
1496
|
-
detail: { proxyDelta, detDelta, driftGap, n: detRuns.length }
|
|
1497
|
-
});
|
|
1498
|
-
}
|
|
1499
|
-
}
|
|
1500
|
-
const maxSev = findings.reduce((m, f) => Math.max(m, f.severity), 0);
|
|
1501
|
-
const verdict = maxSev >= gam ? "gaming" : maxSev >= sus ? "suspect" : "clean";
|
|
1502
|
-
const rationale = findings.filter((f) => f.severity >= sus).map((f) => `${f.signal}: severity ${f.severity.toFixed(2)} \u2014 ${f.message}`);
|
|
1503
|
-
if (rationale.length === 0) rationale.push("no signals fired above suspect threshold");
|
|
1504
|
-
return { findings, verdict, rationale, n };
|
|
1505
|
-
}
|
|
1506
|
-
function mean(xs) {
|
|
1507
|
-
if (xs.length === 0) return 0;
|
|
1508
|
-
return xs.reduce((s, x) => s + x, 0) / xs.length;
|
|
1509
|
-
}
|
|
1510
|
-
function clamp012(x) {
|
|
1511
|
-
if (!Number.isFinite(x)) return 0;
|
|
1512
|
-
return Math.max(0, Math.min(1, x));
|
|
1513
|
-
}
|
|
1514
|
-
function pearsonR(a, b) {
|
|
1515
|
-
if (a.length !== b.length || a.length < 2) return 0;
|
|
1516
|
-
const ma = mean(a);
|
|
1517
|
-
const mb = mean(b);
|
|
1518
|
-
let num = 0, da = 0, db = 0;
|
|
1519
|
-
for (let i = 0; i < a.length; i++) {
|
|
1520
|
-
const xa = a[i] - ma;
|
|
1521
|
-
const xb = b[i] - mb;
|
|
1522
|
-
num += xa * xb;
|
|
1523
|
-
da += xa * xa;
|
|
1524
|
-
db += xb * xb;
|
|
1525
|
-
}
|
|
1526
|
-
if (da === 0 || db === 0) return 0;
|
|
1527
|
-
return num / Math.sqrt(da * db);
|
|
1528
|
-
}
|
|
1529
|
-
function ksStatistic(a, b) {
|
|
1530
|
-
const sortedA = [...a].sort((x, y) => x - y);
|
|
1531
|
-
const sortedB = [...b].sort((x, y) => x - y);
|
|
1532
|
-
const all = [.../* @__PURE__ */ new Set([...sortedA, ...sortedB])].sort((x, y) => x - y);
|
|
1533
|
-
let max = 0;
|
|
1534
|
-
for (const v of all) {
|
|
1535
|
-
const fa = sortedA.filter((x) => x <= v).length / sortedA.length;
|
|
1536
|
-
const fb = sortedB.filter((x) => x <= v).length / sortedB.length;
|
|
1537
|
-
max = Math.max(max, Math.abs(fa - fb));
|
|
1538
|
-
}
|
|
1539
|
-
return max;
|
|
1540
|
-
}
|
|
1541
|
-
function defaultSecondary(verifiableOpts) {
|
|
1542
|
-
return (run) => {
|
|
1543
|
-
const filtered = filterDeterministicallyRewarded([run], verifiableOpts ?? {});
|
|
1544
|
-
return filtered.length === 1 ? filtered[0].reward.value : null;
|
|
1545
|
-
};
|
|
1546
|
-
}
|
|
1547
|
-
|
|
1548
1248
|
// src/rl/auto-research.ts
|
|
1549
1249
|
async function analyzeOptimizationResult(opts) {
|
|
1550
1250
|
const trials = extractTrials(opts.result);
|